diff --git a/THIRDPARTY.md b/THIRDPARTY.md
index 89d1ce3de3d97..412b84b688758 100644
--- a/THIRDPARTY.md
+++ b/THIRDPARTY.md
@@ -6,7 +6,6 @@ for exceptions.
 - [crc32c.c](https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software) (CRC-32c checksum code by Mark Adler) [[ZLib](https://opensource.org/licenses/Zlib)].
 - [LDC](https://github.com/ldc-developers/ldc/blob/master/LICENSE) (for ccall/cfunction ABI definitions) [BSD-3]. The portion of code that Julia uses from LDC is [BSD-3] licensed.
 - [LLVM](https://releases.llvm.org/3.9.0/LICENSE.TXT) (for parts of src/disasm.cpp) [UIUC]
-- [MINGW](https://sourceforge.net/p/mingw/mingw-org-wsl/ci/legacy/tree/mingwrt/mingwex/dirname.c) (for dirname implementation on Windows) [MIT]
 - [NetBSD](https://www.netbsd.org/about/redistribution.html) (for setjmp, longjmp, and strptime implementations on Windows) [BSD-3]
 - [Python](https://docs.python.org/3/license.html) (for strtod implementation on Windows) [PSF]
 - [FEMTOLISP](https://github.com/JeffBezanson/femtolisp) [BSD-3]
diff --git a/base/abstractarray.jl b/base/abstractarray.jl
index 45fff8dae0d24..be78c308afa53 100644
--- a/base/abstractarray.jl
+++ b/base/abstractarray.jl
@@ -3654,7 +3654,31 @@ function _keepat!(a::AbstractVector, m::AbstractVector{Bool})
     deleteat!(a, j:lastindex(a))
 end
 
-## 1-d circshift ##
+"""
+    circshift!(a::AbstractVector, shift::Integer)
+
+Circularly shift, or rotate, the data in vector `a` by `shift` positions.
+
+# Examples
+
+```jldoctest
+julia> circshift!([1, 2, 3, 4, 5], 2)
+5-element Vector{Int64}:
+ 4
+ 5
+ 1
+ 2
+ 3
+
+julia> circshift!([1, 2, 3, 4, 5], -2)
+5-element Vector{Int64}:
+ 3
+ 4
+ 5
+ 1
+ 2
+```
+"""
 function circshift!(a::AbstractVector, shift::Integer)
     n = length(a)
     n == 0 && return a
diff --git a/base/compiler/effects.jl b/base/compiler/effects.jl
index ece549eda7a6d..166df78f3130c 100644
--- a/base/compiler/effects.jl
+++ b/base/compiler/effects.jl
@@ -329,7 +329,6 @@ is_inaccessiblemem_or_argmemonly(effects::Effects) = effects.inaccessiblememonly
 
 is_consistent_overlay(effects::Effects) = effects.nonoverlayed === CONSISTENT_OVERLAY
 
-# (sync this with codegen.cpp and staticdata.c effects_foldable functions)
 function encode_effects(e::Effects)
     return ((e.consistent % UInt32) << 0) |
            ((e.effect_free % UInt32) << 3) |
diff --git a/base/math.jl b/base/math.jl
index 5266cff8d47fc..9c02b43140aaf 100644
--- a/base/math.jl
+++ b/base/math.jl
@@ -1140,6 +1140,10 @@ function modf(x::T) where T<:IEEEFloat
     return (rx, ix)
 end
 
+@inline function use_power_by_squaring(n::Integer)
+    -2^12 <= n <= 3 * 2^13
+end
+
 # @constprop aggressive to help the compiler see the switch between the integer and float
 # variants for callers with constant `y`
 @constprop :aggressive function ^(x::Float64, y::Float64)
@@ -1152,24 +1156,33 @@ end
         y = sign(y)*0x1.8p62
     end
     yint = unsafe_trunc(Int64, y) # This is actually safe since julia freezes the result
-    y == yint && return @noinline x^yint
-    2*xu==0 && return abs(y)*Inf*(!(y>0)) # if x==0
-    x<0 && throw_exp_domainerror(x) # |y| is small enough that y isn't an integer
-    !isfinite(x) && return x*(y>0 || isnan(x)) # x is inf or NaN
+    yisint = y == yint
+    if yisint
+        yint == 0 && return 1.0
+        use_power_by_squaring(yint) && return @noinline pow_body(x, yint)
+    end
+    2*xu==0 && return abs(y)*Inf*(!(y>0)) # if x === +0.0 or -0.0 (Inf * false === 0.0)
+    s = 1
+    if x < 0
+        !yisint && throw_exp_domainerror(x) # y isn't an integer
+        s = ifelse(isodd(yint), -1, 1)
+    end
+    !isfinite(x) && return copysign(x,s)*(y>0 || isnan(x)) # x is inf or NaN
+    return copysign(pow_body(abs(x), y), s)
+end
+
+@assume_effects :foldable @noinline function pow_body(x::Float64, y::Float64)
+    xu = reinterpret(UInt64, x)
     if xu < (UInt64(1)<<52) # x is subnormal
         xu = reinterpret(UInt64, x * 0x1p52) # normalize x
         xu &= ~sign_mask(Float64)
         xu -= UInt64(52) << 52 # mess with the exponent
     end
-    return pow_body(xu, y)
-end
-
-@inline function pow_body(xu::UInt64, y::Float64)
     logxhi,logxlo = _log_ext(xu)
     xyhi, xylo = two_mul(logxhi,y)
     xylo = muladd(logxlo, y, xylo)
     hi = xyhi+xylo
-    return Base.Math.exp_impl(hi, xylo-(hi-xyhi), Val(:ℯ))
+    return @inline Base.Math.exp_impl(hi, xylo-(hi-xyhi), Val(:ℯ))
 end
 
 @constprop :aggressive function ^(x::T, y::T) where T <: Union{Float16, Float32}
@@ -1193,12 +1206,29 @@ end
     return T(exp2(log2(abs(widen(x))) * y))
 end
 
-# compensated power by squaring
 @constprop :aggressive @inline function ^(x::Float64, n::Integer)
+    x^clamp(n, Int64)
+end
+@constprop :aggressive @inline function ^(x::Float64, n::Int64)
     n == 0 && return one(x)
-    return pow_body(x, n)
+    if use_power_by_squaring(n)
+        return pow_body(x, n)
+    else
+        s = ifelse(x < 0 && isodd(n), -1.0, 1.0)
+        x = abs(x)
+        y = float(n)
+        if y == n
+            return copysign(pow_body(x, y), s)
+        else
+            n2 = n % 1024
+            y = float(n - n2)
+            return pow_body(x, y) * copysign(pow_body(x, n2), s)
+        end
+    end
 end
 
+# compensated power by squaring
+# this method is only reliable for -2^20 < n < 2^20 (cf. #53881 #53886)
 @assume_effects :terminates_locally @noinline function pow_body(x::Float64, n::Integer)
     y = 1.0
     xnlo = ynlo = 0.0
diff --git a/base/precompilation.jl b/base/precompilation.jl
index 254bab05a07dc..b33b5a14728e4 100644
--- a/base/precompilation.jl
+++ b/base/precompilation.jl
@@ -141,15 +141,16 @@ function ExplicitEnv(envpath::String=Base.active_project())
 
         # Extensions
         deps_pkg = get(Dict{String, Any}, pkg_info, "extensions")::Dict{String, Any}
+        deps_pkg_concrete = Dict{String, Vector{String}}()
         for (ext, triggers) in deps_pkg
             if triggers isa String
                 triggers = [triggers]
            else
                 triggers = triggers::Vector{String}
             end
-            deps_pkg[ext] = triggers
+            deps_pkg_concrete[ext] = triggers
         end
-        extensions[m_uuid] = deps_pkg
+        extensions[m_uuid] = deps_pkg_concrete
 
         # Determine strategy to find package
         lookup_strat = begin
diff --git a/base/regex.jl b/base/regex.jl
index 9d5c146a6e840..2b2717a74efc0 100644
--- a/base/regex.jl
+++ b/base/regex.jl
@@ -39,7 +39,13 @@ mutable struct Regex <: AbstractPattern
         end
         re = compile(new(pattern, compile_options, match_options, C_NULL))
         finalizer(re) do re
-            re.regex == C_NULL || PCRE.free_re(re.regex)
+            # don't free during exit because tasks may still be running and
+            # using it. Issue #57817. During sysimage creation _atexit_hooks_finished
+            # is not defined but threads aren't running so just always run
+            during_exit = @isdefined(_atexit_hooks_finished) && _atexit_hooks_finished
+            if re.regex != C_NULL && !during_exit
+                PCRE.free_re(re.regex)
+            end
         end
         re
     end
diff --git a/base/special/exp.jl b/base/special/exp.jl
index 32de6b9be296d..38d7509807aed 100644
--- a/base/special/exp.jl
+++ b/base/special/exp.jl
@@ -252,7 +252,7 @@ end
             twopk = (k + UInt64(53)) << 52
             return reinterpret(T, twopk + reinterpret(UInt64, small_part))*0x1p-53
         end
-        #k == 1024 && return (small_part * 2.0) * 2.0^1023
+        k == 1024 && return (small_part * 2.0) * 2.0^1023
     end
     twopk = Int64(k) << 52
     return reinterpret(T, twopk + reinterpret(Int64, small_part))
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 2d5f0cea26e36..438789758cfe0 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -512,11 +512,11 @@ prevind(s::AbstractString, i::Int) = prevind(s, i, 1)
 
 function prevind(s::AbstractString, i::Int, n::Int)
     n < 0 && throw(ArgumentError("n cannot be negative: $n"))
-    z = ncodeunits(s) + 1
+    z = ncodeunits(s)::Int + 1
     @boundscheck 0 < i ≤ z || throw(BoundsError(s, i))
-    n == 0 && return thisind(s, i) == i ? i : string_index_err(s, i)
+    n == 0 && return thisind(s, i)::Int == i ? i : string_index_err(s, i)
     while n > 0 && 1 < i
-        @inbounds n -= isvalid(s, i -= 1)
+        @inbounds n -= isvalid(s, i -= 1)::Bool
     end
     return i - n
 end
@@ -571,11 +571,11 @@ nextind(s::AbstractString, i::Int) = nextind(s, i, 1)
 
 function nextind(s::AbstractString, i::Int, n::Int)
     n < 0 && throw(ArgumentError("n cannot be negative: $n"))
-    z = ncodeunits(s)
+    z = ncodeunits(s)::Int
     @boundscheck 0 ≤ i ≤ z || throw(BoundsError(s, i))
-    n == 0 && return thisind(s, i) == i ? i : string_index_err(s, i)
+    n == 0 && return thisind(s, i)::Int == i ? i : string_index_err(s, i)
    while n > 0 && i < z
-        @inbounds n -= isvalid(s, i += 1)
+        @inbounds n -= isvalid(s, i += 1)::Bool
     end
     return i + n
 end
diff --git a/base/summarysize.jl b/base/summarysize.jl
index 4f2646c7641b7..62b0ad0849778 100644
--- a/base/summarysize.jl
+++ b/base/summarysize.jl
@@ -149,13 +149,8 @@ function (ss::SummarySize)(obj::GenericMemory)
     datakey = unsafe_convert(Ptr{Cvoid}, obj)
     if !haskey(ss.seen, datakey)
         ss.seen[datakey] = true
-        dsize = sizeof(obj)
+        size += sizeof(obj)
         T = eltype(obj)
-        if isbitsunion(T)
-            # add 1 union selector byte for each element
-            dsize += length(obj)
-        end
-        size += dsize
         if !isempty(obj) && T !== Symbol && (!Base.allocatedinline(T) || (T isa DataType && !Base.datatype_pointerfree(T)))
             push!(ss.frontier_x, obj)
             push!(ss.frontier_i, 1)
diff --git a/doc/src/manual/calling-c-and-fortran-code.md b/doc/src/manual/calling-c-and-fortran-code.md
index 6f4d69b16bc81..2c20c22b33930 100644
--- a/doc/src/manual/calling-c-and-fortran-code.md
+++ b/doc/src/manual/calling-c-and-fortran-code.md
@@ -547,15 +547,14 @@ is not valid, since the type layout of `T` is not known statically.
 
 ### SIMD Values
 
-Note: This feature is currently implemented on 64-bit x86 and AArch64 platforms only.
-
 If a C/C++ routine has an argument or return value that is a native SIMD type, the corresponding
 Julia type is a homogeneous tuple of `VecElement` that naturally maps to the SIMD type. Specifically:
 
-> * The tuple must be the same size as the SIMD type. For example, a tuple representing an `__m128`
->   on x86 must have a size of 16 bytes.
-> * The element type of the tuple must be an instance of `VecElement{T}` where `T` is a primitive type that
->   is 1, 2, 4 or 8 bytes.
+> * The tuple must be the same size and element type as the SIMD type. For example, a tuple
+>   representing an `__m128` on x86 must have a size of 16 bytes and `Float32` elements.
+> * The element type of the tuple must be an instance of `VecElement{T}` where `T` is a
+>   primitive type with a power-of-two number of bytes (e.g. 1, 2, 4, 8, or 16) such as
+>   `Int8` or `Float64`.
 
 For instance, consider this C routine that uses AVX intrinsics:
 
@@ -628,6 +627,10 @@ For translating a C argument list to Julia:
   * `T`, where `T` is a Julia leaf type
   * argument value will be copied (passed by value)
+  * `vector T` (or `__attribute__ vector_size`, or a typedef such as `__m128`)
+
+    * `NTuple{N, VecElement{T}}`, where `T` is a primitive Julia type of the correct size
+      and `N` is the number of elements in the vector (equal to `vector_size / sizeof T`).
 * `void*`
 
   * depends on how this parameter is used, first translate this to the intended pointer type, then
@@ -674,13 +677,16 @@ For translating a C return type to Julia:
   * `T`, where `T` is one of the primitive types: `char`, `int`, `long`, `short`, `float`, `double`,
     `complex`, `enum` or any of their `typedef` equivalents
 
-  * `T`, where `T` is an equivalent Julia Bits Type (per the table above)
-  * if `T` is an `enum`, the argument type should be equivalent to `Cint` or `Cuint`
+  * same as C argument list
   * argument value will be copied (returned by-value)
 * `struct T` (including typedef to a struct)
 
-  * `T`, where `T` is a Julia Leaf Type
+  * same as C argument list
   * argument value will be copied (returned by-value)
+
+* `vector T`
+
+  * same as C argument list
 * `void*`
 
   * depends on how this parameter is used, first translate this to the intended pointer type, then
diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index d049327c2bf36..a3eb2df3c7574 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -1971,7 +1971,7 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j
     else if (!alignment)
         alignment = julia_alignment(jltype);
     if (intcast && Order == AtomicOrdering::NotAtomic) {
-        emit_memcpy(ctx, intcast, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), data, jl_aliasinfo_t::fromTBAA(ctx, tbaa), nb, Align(alignment), intcast->getAlign());
+        emit_memcpy(ctx, intcast, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), data, jl_aliasinfo_t::fromTBAA(ctx, tbaa), nb, intcast->getAlign(), Align(alignment));
     }
     else {
         if (!isboxed && jl_is_genericmemoryref_type(jltype)) {
@@ -3214,7 +3214,7 @@ static void union_alloca_type(jl_uniontype_t *ut,
             [&](unsigned idx, jl_datatype_t *jt) {
                 if (!jl_is_datatype_singleton(jt)) {
                     size_t nb1 = jl_datatype_size(jt);
-                    size_t align1 = jl_datatype_align(jt);
+                    size_t align1 = julia_alignment((jl_value_t*)jt);
                     if (nb1 > nbytes)
                         nbytes = nb1;
                     if (align1 > align)
@@ -3796,9 +3796,10 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg
 
     // whether we should perform the initialization with the struct as a IR value
     // or instead initialize the stack buffer with stores
+    // although we do the former if it is a vector or could be a vector element
     auto tracked = CountTrackedPointers(lt);
     bool init_as_value = false;
-    if (lt->isVectorTy() || jl_is_vecelement_type(ty)) { // maybe also check the size ?
+    if (lt->isVectorTy() || jl_special_vector_alignment(1, ty) != 0) {
         init_as_value = true;
     }
     else if (tracked.count) {
diff --git a/src/codegen.cpp b/src/codegen.cpp
index c75c2ad27e384..b46a406edd2be 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -8552,6 +8552,8 @@ static jl_llvm_functions_t
             Type *RT = Arg->getParamStructRetType();
             TypeSize sz = DL.getTypeAllocSize(RT);
             Align al = DL.getPrefTypeAlign(RT);
+            if (al > MAX_ALIGN)
+                al = Align(MAX_ALIGN);
             param.addAttribute(Attribute::NonNull);
             // The `dereferenceable` below does not imply `nonnull` for non addrspace(0) pointers.
             param.addDereferenceableAttr(sz);
@@ -9677,10 +9679,10 @@ jl_llvm_functions_t jl_emit_codeinst(
             // Julia-level optimization will never need to see it
             else if (jl_is_method(def) && // don't delete toplevel code
                      inferred != jl_nothing && // and there is something to delete (test this before calling jl_ir_inlining_cost)
-                     ((!effects_foldable(codeinst->ipo_purity_bits) && // don't delete code we may want for irinterp
-                       (jl_ir_inlining_cost(inferred) == UINT16_MAX) && // don't delete inlineable code
-                       !jl_generating_output()) || // don't delete code when generating a precompile file, trading memory in the short term for avoiding likely duplicating inference work for aotcompile
-                      jl_atomic_load_relaxed(&codeinst->invoke) == jl_fptr_const_return_addr)) { // unless it is constant (although this shouldn't have had code in the first place)
+                     !effects_foldable(codeinst->ipo_purity_bits) && // don't delete code we may want for irinterp
+                     ((jl_ir_inlining_cost(inferred) == UINT16_MAX) || // don't delete inlineable code
+                      jl_atomic_load_relaxed(&codeinst->invoke) == jl_fptr_const_return_addr) && // unless it is constant
+                     !(params.imaging_mode || jl_options.incremental)) { // don't delete code when generating a precompile file
                 jl_atomic_store_release(&codeinst->inferred, jl_nothing);
             }
         }
diff --git a/src/datatype.c b/src/datatype.c
index 8de401f4dd0f7..bb33aa9e397bc 100644
--- a/src/datatype.c
+++ b/src/datatype.c
@@ -298,9 +298,10 @@ static jl_datatype_layout_t *jl_get_layout(uint32_t sz,
 }
 
 // Determine if homogeneous tuple with fields of type t will have
-// a special alignment beyond normal Julia rules.
+// a special alignment and vector-ABI beyond normal rules for aggregates.
 // Return special alignment if one exists, 0 if normal alignment rules hold.
 // A non-zero result *must* match the LLVM rules for a vector type .
+// This matches the compiler's `__attribute__ vector_size` behavior.
 // For sake of Ahead-Of-Time (AOT) compilation, this routine has to work
 // without LLVM being available.
 unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t)
@@ -315,8 +316,12 @@ unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t)
         // motivating use case comes up for Julia, we reject pointers.
         return 0;
     size_t elsz = jl_datatype_size(ty);
-    if (elsz != 1 && elsz != 2 && elsz != 4 && elsz != 8)
-        // Only handle power-of-two-sized elements (for now)
+    if (next_power_of_two(elsz) != elsz)
+        // Only handle power-of-two-sized elements (for now), since other
+        // lengths may be packed into very complicated arrangements (llvm pads
+        // extra bits on most platforms when computing alignment but not when
+        // computing type size, but adds no extra bytes for each element, so
+        // their effect on offsets is never what you may naturally expect).
         return 0;
     size_t size = nfields * elsz;
     // Use natural alignment for this vector: this matches LLVM and clang.
@@ -707,9 +712,9 @@ void jl_compute_field_offsets(jl_datatype_t *st)
             }
             else {
                 fsz = sizeof(void*);
-                if (fsz > MAX_ALIGN)
-                    fsz = MAX_ALIGN;
                 al = fsz;
+                if (al > MAX_ALIGN)
+                    al = MAX_ALIGN;
                 desc[i].isptr = 1;
                 zeroinit = 1;
                 npointers++;
@@ -929,6 +934,18 @@ JL_DLLEXPORT jl_datatype_t *jl_new_primitivetype(jl_value_t *name, jl_module_t *
                                jl_emptysvec, jl_emptysvec, jl_emptysvec, 0, 0, 0);
     uint32_t nbytes = (nbits + 7) / 8;
     uint32_t alignm = next_power_of_two(nbytes);
+# if defined(_CPU_X86_) && !defined(_OS_WINDOWS_)
+    // datalayout strings are often weird: on 64-bit they usually follow fairly simple rules,
+    // but on x86 32 bit platforms, sometimes 5 to 8 byte types are
+    // 32-bit aligned even though the MAX_ALIGN (for types 9+ bytes) is 16
+    // (except for f80 which is align 4 on Mingw, Linux, and BSDs--but align 16 on MSVC and Darwin)
+    // https://llvm.org/doxygen/ARMTargetMachine_8cpp.html#adb29b487708f0dc2a940345b68649270
+    // https://llvm.org/doxygen/AArch64TargetMachine_8cpp.html#a003a58caf135efbf7273c5ed84e700d7
+    // https://llvm.org/doxygen/X86TargetMachine_8cpp.html#aefdbcd6131ef195da070cef7fdaf0532
+    // 32-bit alignment is weird
+    if (alignm == 8)
+        alignm = 4;
+# endif
     if (alignm > MAX_ALIGN)
         alignm = MAX_ALIGN;
     // memoize isprimitivetype, since it is much easier than checking
diff --git a/src/gc-debug.c b/src/gc-debug.c
index 124b7da74dee1..3aa1612572bf6 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -1100,13 +1100,14 @@ void gc_stats_big_obj(void)
             v = v->next;
         }
 
-        mallocarray_t *ma = ptls2->heap.mallocarrays;
-        while (ma != NULL) {
-            if (gc_marked(jl_astaggedvalue(ma->a)->bits.gc)) {
+        void **lst = ptls2->heap.mallocarrays.items;
+        for (size_t i = 0, l = ptls2->heap.mallocarrays.len; i < l; i++) {
+            jl_genericmemory_t *m = (jl_genericmemory_t*)((uintptr_t)lst[i] & ~(uintptr_t)1);
+            uint8_t bits = jl_astaggedvalue(m)->bits.gc;
+            if (gc_marked(bits)) {
                 nused++;
-                nbytes += jl_genericmemory_nbytes((jl_genericmemory_t*)ma->a);
+                nbytes += jl_genericmemory_nbytes(m);
             }
-            ma = ma->next;
         }
     }
 
diff --git a/src/gc.c b/src/gc.c
index e89e16ff187c0..c4c83861f5a52 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -6,7 +6,11 @@
 #include "julia_atomics.h"
 #include "julia_gcext.h"
 #include "julia_assert.h"
-#ifdef __GLIBC__
+#include <stdlib.h>
+
+#if defined(_OS_DARWIN_)
+#include <malloc/malloc.h>
+#else
 #include <malloc.h> // for malloc_trim
 #endif
 
@@ -1121,17 +1125,8 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
 
 void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){
     // This is **NOT** a GC safe point.
-    mallocarray_t *ma;
-    if (ptls->heap.mafreelist == NULL) {
-        ma = (mallocarray_t*)malloc_s(sizeof(mallocarray_t));
-    }
-    else {
-        ma = ptls->heap.mafreelist;
-        ptls->heap.mafreelist = ma->next;
-    }
-    ma->a = (jl_value_t*)((uintptr_t)m | !!isaligned);
-    ma->next = ptls->heap.mallocarrays;
-    ptls->heap.mallocarrays = ma;
+    void *a = (void*)((uintptr_t)m | !!isaligned);
+    small_arraylist_push(&ptls->heap.mallocarrays, a);
 }
 
 
@@ -1143,10 +1138,6 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
     jl_batch_accum_heap_size(ptls, sz);
 }
 
-void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT
-{
-    jl_batch_accum_free_size(jl_current_task->ptls, sz);
-}
 
 // Only safe to update the heap inside the GC
 static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT
@@ -1222,19 +1213,21 @@ size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT
 }
 
 
-static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT
+static void jl_gc_free_memory(jl_genericmemory_t *v, int isaligned) JL_NOTSAFEPOINT
 {
     assert(jl_is_genericmemory(v));
     jl_genericmemory_t *m = (jl_genericmemory_t*)v;
     assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2);
     char *d = (char*)m->ptr;
+    size_t freed_bytes = memory_block_usable_size(d, isaligned);
+    assert(freed_bytes != 0);
     if (isaligned)
         jl_free_aligned(d);
     else
         free(d);
     jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
-        jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_genericmemory_nbytes(m));
-    gc_num.freed += jl_genericmemory_nbytes(m);
+        jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - freed_bytes);
+    gc_num.freed += freed_bytes;
     gc_num.freecall++;
 }
 
@@ -1245,24 +1238,23 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT
     for (int t_i = 0; t_i < gc_n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
         if (ptls2 != NULL) {
-            mallocarray_t *ma = ptls2->heap.mallocarrays;
-            mallocarray_t **pma = &ptls2->heap.mallocarrays;
-            while (ma != NULL) {
-                mallocarray_t *nxt = ma->next;
-                jl_value_t *a = (jl_value_t*)((uintptr_t)ma->a & ~1);
-                int bits = jl_astaggedvalue(a)->bits.gc;
-                if (gc_marked(bits)) {
-                    pma = &ma->next;
+            size_t n = 0;
+            size_t l = ptls2->heap.mallocarrays.len;
+            void **lst = ptls2->heap.mallocarrays.items;
+            // filter without preserving order
+            while (n < l) {
+                jl_genericmemory_t *m = (jl_genericmemory_t*)((uintptr_t)lst[n] & ~1);
+                if (gc_marked(jl_astaggedvalue(m)->bits.gc)) {
+                    n++;
                 }
                 else {
-                    *pma = nxt;
-                    int isaligned = (uintptr_t)ma->a & 1;
-                    jl_gc_free_memory(a, isaligned);
-                    free(ma);
+                    int isaligned = (uintptr_t)lst[n] & 1;
+                    jl_gc_free_memory(m, isaligned);
+                    l--;
+                    lst[n] = lst[l];
                 }
-                gc_time_count_mallocd_memory(bits);
-                ma = nxt;
             }
+            ptls2->heap.mallocarrays.len = l;
         }
     }
     gc_time_mallocd_memory_end();
@@ -3968,8 +3960,7 @@ void jl_init_thread_heap(jl_ptls_t ptls)
     small_arraylist_new(&heap->live_tasks, 0);
     for (int i = 0; i < JL_N_STACK_POOLS; i++)
         small_arraylist_new(&heap->free_stacks[i], 0);
-    heap->mallocarrays = NULL;
-    heap->mafreelist = NULL;
+    small_arraylist_new(&heap->mallocarrays, 0);
    heap->big_objects = NULL;
    heap->remset = &heap->_remset[0];
    heap->last_remset = &heap->_remset[1];
@@ -4069,58 +4060,44 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void)
     jl_throw(jl_memory_exception);
 }
 
-// allocation wrappers that track allocation and let collection run
+// allocation wrappers that add to gc pressure
 
-JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
+JL_DLLEXPORT void *jl_malloc(size_t sz)
 {
-    jl_gcframe_t **pgcstack = jl_get_pgcstack();
-    jl_task_t *ct = jl_current_task;
-    void *data = malloc(sz);
-    if (data != NULL && pgcstack != NULL && ct->world_age) {
-        jl_ptls_t ptls = ct->ptls;
-        maybe_collect(ptls);
-        jl_atomic_store_relaxed(&ptls->gc_num.allocd,
-            jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
-        jl_atomic_store_relaxed(&ptls->gc_num.malloc,
-            jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
-        jl_batch_accum_heap_size(ptls, sz);
-    }
-    return data;
+    return jl_gc_counted_malloc(sz);
 }
 
-JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
+//_unchecked_calloc does not check for potential overflow of nm*sz
+STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) {
+    size_t nmsz = nm*sz;
+    return jl_gc_counted_calloc(nmsz, 1);
+}
+
+JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
 {
-    jl_gcframe_t **pgcstack = jl_get_pgcstack();
-    jl_task_t *ct = jl_current_task;
-    void *data = calloc(nm, sz);
-    if (data != NULL && pgcstack != NULL && ct->world_age) {
-        jl_ptls_t ptls = ct->ptls;
-        maybe_collect(ptls);
-        jl_atomic_store_relaxed(&ptls->gc_num.allocd,
-            jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz);
-        jl_atomic_store_relaxed(&ptls->gc_num.malloc,
-            jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
-        jl_batch_accum_heap_size(ptls, sz * nm);
-    }
-    return data;
+    if (nm > SSIZE_MAX/sz)
+        return NULL;
+    return _unchecked_calloc(nm, sz);
 }
 
-JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
+JL_DLLEXPORT void jl_free(void *p)
 {
-    jl_gcframe_t **pgcstack = jl_get_pgcstack();
-    jl_task_t *ct = jl_current_task;
-    free(p);
-    if (pgcstack != NULL && ct->world_age) {
-        jl_batch_accum_free_size(ct->ptls, sz);
+    if (p != NULL) {
+        size_t sz = memory_block_usable_size(p, 0);
+        free(p);
+        jl_task_t *ct = jl_get_current_task();
+        if (ct != NULL)
+            jl_batch_accum_free_size(ct->ptls, sz);
     }
 }
 
-JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz)
+JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
 {
-    jl_gcframe_t **pgcstack = jl_get_pgcstack();
-    jl_task_t *ct = jl_current_task;
+    size_t old = p ? memory_block_usable_size(p, 0) : 0;
     void *data = realloc(p, sz);
-    if (data != NULL && pgcstack != NULL && ct->world_age) {
+    jl_task_t *ct = jl_get_current_task();
+    if (data != NULL && ct != NULL) {
+        sz = memory_block_usable_size(data, 0);
         jl_ptls_t ptls = ct->ptls;
         maybe_collect(ptls);
         if (!(sz < old))
@@ -4140,63 +4117,80 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
     return data;
 }
 
-// allocation wrappers that save the size of allocations, to allow using
-// jl_gc_counted_* functions with a libc-compatible API.
-
-JL_DLLEXPORT void *jl_malloc(size_t sz)
+JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
-    int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT);
-    if (p == NULL)
-        return NULL;
-    p[0] = sz;
-    return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
+    jl_task_t *ct = jl_current_task;
+    void *data = malloc(sz);
+    if (data != NULL && ct != NULL && ct->world_age) {
+        sz = memory_block_usable_size(data, 0);
+        jl_ptls_t ptls = ct->ptls;
+        maybe_collect(ptls);
+        jl_atomic_store_relaxed(&ptls->gc_num.allocd,
+            jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
+        jl_atomic_store_relaxed(&ptls->gc_num.malloc,
+            jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
+        jl_batch_accum_heap_size(ptls, sz);
+    }
+    return data;
 }
 
-//_unchecked_calloc does not check for potential overflow of nm*sz
-STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) {
-    size_t nmsz = nm*sz;
-    int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1);
-    if (p == NULL)
-        return NULL;
-    p[0] = nmsz;
-    return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
+JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
+{
+    jl_task_t *ct = jl_current_task;
+    void *data = calloc(nm, sz);
+    if (data != NULL && ct != NULL && ct->world_age) {
+        sz = memory_block_usable_size(data, 0);
+        jl_ptls_t ptls = ct->ptls;
+        maybe_collect(ptls);
+        jl_atomic_store_relaxed(&ptls->gc_num.allocd,
+            jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
+        jl_atomic_store_relaxed(&ptls->gc_num.malloc,
+            jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
+        jl_batch_accum_heap_size(ptls, sz);
+    }
+    return data;
 }
 
-JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
+JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
 {
-    if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT)
-        return NULL;
-    return _unchecked_calloc(nm, sz);
+    jl_free(p);
 }
 
-JL_DLLEXPORT void jl_free(void *p)
+JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz)
 {
-    if (p != NULL) {
-        int64_t *pp = (int64_t *)p - 2;
-        size_t sz = pp[0];
-        jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT);
-    }
+    return jl_realloc(p, sz);
 }
 
-JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
+// =========================================================================== //
+// malloc wrappers, aligned allocation
+// =========================================================================== //
+
+#if defined(_OS_WINDOWS_)
+// helper function based partly on wine msvcrt80+ heap.c
+// but with several fixes to improve the correctness of the computation and remove unnecessary parameters
+#define SAVED_PTR(x) ((void *)((DWORD_PTR)((char *)x - sizeof(void *)) & \
+                               ~(sizeof(void *) - 1)))
+static size_t _aligned_msize(void *p)
 {
-    int64_t *pp;
-    size_t szold;
-    if (p == NULL) {
-        pp = NULL;
-        szold = 0;
-    }
-    else {
-        pp = (int64_t *)p - 2;
-        szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT;
-    }
-    int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT);
-    if (pnew == NULL)
-        return NULL;
-    pnew[0] = sz;
-    return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
+    void *alloc_ptr = *(void**)SAVED_PTR(p);
+    return _msize(alloc_ptr) - ((char*)p - (char*)alloc_ptr);
 }
+#undef SAVED_PTR
+#endif
 
+size_t memory_block_usable_size(void *p, int isaligned) JL_NOTSAFEPOINT
+{
+#if defined(_OS_WINDOWS_)
+    if (isaligned)
+        return _aligned_msize(p);
+    else
+        return _msize(p);
+#elif defined(_OS_DARWIN_)
+    return malloc_size(p);
+#else
+    return malloc_usable_size(p);
+#endif
+}
 
 // allocating blocks for Arrays and Strings
 
 JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
@@ -4214,12 +4208,13 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
     void *b = malloc_cache_align(allocsz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
-
+    size_t allocated_bytes = memory_block_usable_size(b, 1);
+    assert(allocated_bytes >= allocsz);
     jl_atomic_store_relaxed(&ptls->gc_num.allocd,
-        jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz);
+        jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocated_bytes);
     jl_atomic_store_relaxed(&ptls->gc_num.malloc,
        jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
-    jl_batch_accum_heap_size(ptls, allocsz);
+    jl_batch_accum_heap_size(ptls, allocated_bytes);
 #ifdef _OS_WINDOWS_
     SetLastError(last_error);
 #endif
diff --git a/src/gc.h b/src/gc.h
index 01d8745b2899e..7e4da2bd1900f 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -143,11 +143,6 @@ JL_EXTENSION typedef struct _bigval_t {
 
 // data structure for tracking malloc'd arrays and genericmemory.
 
-typedef struct _mallocarray_t {
-    jl_value_t *a;
-    struct _mallocarray_t *next;
-} mallocarray_t;
-
 // pool page metadata
 typedef struct _jl_gc_pagemeta_t {
     // next metadata structure in per-thread list
diff --git a/src/genericmemory.c b/src/genericmemory.c
index b36852d53f9c8..02293867da4df 100644
--- a/src/genericmemory.c
+++ b/src/genericmemory.c
@@ -165,7 +165,8 @@ JL_DLLEXPORT jl_genericmemory_t *jl_ptr_to_genericmemory(jl_value_t *mtype, void
     if (own_buffer) {
         int isaligned = 0;  // TODO: allow passing memalign'd buffers
         jl_gc_track_malloced_genericmemory(ct->ptls, m, isaligned);
-        jl_gc_count_allocd(nel*elsz);
+        size_t allocated_bytes = memory_block_usable_size(data, isaligned);
+        jl_gc_count_allocd(allocated_bytes);
     }
     return m;
 }
@@ -208,8 +209,6 @@ JL_DLLEXPORT jl_value_t *jl_genericmemory_to_string(jl_genericmemory_t *m, size_
     JL_GC_PUSH1(&o);
     jl_value_t *str = jl_pchar_to_string((const char*)m->ptr, len);
     JL_GC_POP();
-    if (how == 1) // TODO: we might like to early-call jl_gc_free_memory here instead actually, but hopefully `m` will die soon
-        jl_gc_count_freed(mlength);
     return str;
 }
 // n.b. how == 0 is always pool-allocated, so the freed bytes are computed from the pool not the object
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 1c2d071d1a6cd..05a2f1e677d60 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -608,6 +608,7 @@ jl_svec_t *jl_perm_symsvec(size_t n, ...);
 #endif
 
 jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz);
+JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz);
 JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz);
 
 JL_DLLEXPORT void JL_NORETURN jl_throw_out_of_memory_error(void);
@@ -618,6 +619,7 @@ JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT;
 void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT;
 void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned) JL_NOTSAFEPOINT;
 size_t jl_genericmemory_nbytes(jl_genericmemory_t *a) JL_NOTSAFEPOINT;
+size_t memory_block_usable_size(void *mem, int isaligned) JL_NOTSAFEPOINT;
 void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT;
 void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT;
 void jl_gc_run_all_finalizers(jl_task_t *ct);
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 3a0f7f12bffe5..0ca47cc553c88 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -130,8 +130,7 @@ typedef struct {
     small_arraylist_t live_tasks;
 
     // variables for tracking malloc'd arrays
-    struct _mallocarray_t *mallocarrays;
-    struct _mallocarray_t *mafreelist;
+    small_arraylist_t mallocarrays;
 
     // variables for tracking big objects
     struct _bigval_t *big_objects;
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 0fb5b9bb18805..8ba321c75b239 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2331,8 +2331,10 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) {
         // strip all constant alias information, as it might depend on the gc having
         // preserved a gc root, which stops being true after this pass (#32215)
         // similar to RewriteStatepointsForGC::stripNonValidData, but less aggressive
-        if (I->getMetadata(LLVMContext::MD_invariant_load))
-            I->setMetadata(LLVMContext::MD_invariant_load, NULL);
+        if (auto *LI = dyn_cast<LoadInst>(I)) {
+            if (isSpecialPtr(LI->getPointerOperand()->getType()) && LI->getMetadata(LLVMContext::MD_invariant_load))
+                LI->setMetadata(LLVMContext::MD_invariant_load, NULL);
+        }
         if (MDNode *TBAA = I->getMetadata(LLVMContext::MD_tbaa)) {
             if (TBAA->getNumOperands() == 4 && isTBAA(TBAA, {"jtbaa_const", "jtbaa_memoryptr", "jtbaa_memorylen", "tbaa_memoryown"})) {
                 MDNode *MutableTBAA = createMutableTBAAAccessTag(TBAA);
diff --git a/src/mtarraylist.c b/src/mtarraylist.c
index 8bad44797dab4..1bd6810cda8a6 100644
--- a/src/mtarraylist.c
+++ b/src/mtarraylist.c
@@ -14,8 +14,8 @@ extern "C" {
 // but there can be any number of observers
 
 typedef struct {
-    _Atomic(uint32_t) len;
-    uint32_t max;
+    _Atomic(size_t) len;
+    size_t max;
     _Atomic(_Atomic(void*)*) items;
     _Atomic(void*) _space[SMALL_AL_N_INLINE];
 } small_mtarraylist_t;
diff --git a/src/pipeline.cpp b/src/pipeline.cpp
index 5c12e3dad0dd7..2df9d0dfd5a31 100644
--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -490,6 +490,13 @@ static void buildScalarOptimizerPipeline(FunctionPassManager &FPM, PassBuilder *
         FPM.addPass(IRCEPass());
         FPM.addPass(InstCombinePass());
         FPM.addPass(JumpThreadingPass());
+    } else if (O.getSpeedupLevel() >= 1) {
+        JULIA_PASS(FPM.addPass(AllocOptPass()));
+        FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
+        FPM.addPass(MemCpyOptPass());
+        FPM.addPass(SCCPPass());
+        FPM.addPass(InstCombinePass());
+        FPM.addPass(ADCEPass());
     }
     if (O.getSpeedupLevel() >= 3) {
         FPM.addPass(GVNPass());
diff --git a/src/staticdata.c b/src/staticdata.c
index 9d4c60a137058..76bb488731a92 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -725,16 +725,6 @@ static uintptr_t jl_fptr_id(void *fptr)
         return *(uintptr_t*)pbp;
 }
 
-static int effects_foldable(uint32_t effects)
-{
-    // N.B.: This needs to be kept in sync with Core.Compiler.is_foldable(effects, true)
-    return ((effects & 0x7) == 0) && // is_consistent(effects)
-           (((effects >> 10) & 0x03) == 0) && // is_noub(effects)
-           (((effects >> 3) & 0x03) == 0) && // is_effect_free(effects)
-           ((effects >> 6) & 0x01); // is_terminates(effects)
-}
-
-
 // `jl_queue_for_serialization` adds items to `serialization_order`
 #define jl_queue_for_serialization(s, v) jl_queue_for_serialization_((s), (jl_value_t*)(v), 1, 0)
 static void jl_queue_for_serialization_(jl_serializer_state *s, jl_value_t *v, int recursive, int immediate) JL_GC_DISABLED;
@@ -848,25 +838,8 @@ static void jl_insert_into_serialization_queue(jl_serializer_state *s, jl_value_
             // TODO: if (ci in ci->defs->cache)
                 record_field_change((jl_value_t**)&ci->next, NULL);
         }
-        jl_value_t *inferred = jl_atomic_load_relaxed(&ci->inferred);
-        if (inferred && inferred != jl_nothing) { // disregard if there is nothing here to delete (e.g. builtins, unspecialized)
-            if (!is_relocatable_ci(&relocatable_ext_cis, ci))
-                record_field_change((jl_value_t**)&ci->inferred, jl_nothing);
-            else if (jl_is_method(ci->def->def.method) && // don't delete toplevel code
-                     ci->def->def.method->source) { // don't delete code from optimized opaque closures that can't be reconstructed (and builtins)
-                if (jl_atomic_load_relaxed(&ci->max_world) != ~(size_t)0 || // delete all code that cannot run
-                    jl_atomic_load_relaxed(&ci->invoke) == jl_fptr_const_return) { // delete all code that just returns a constant
-                    record_field_change((jl_value_t**)&ci->inferred, jl_nothing);
-                }
-                else if (native_functions && // don't delete any code if making a ji file
-                         (ci->owner == jl_nothing) && // don't delete code for external interpreters
-                         !effects_foldable(ci->ipo_purity_bits) && // don't delete code we may want for irinterp
-                         jl_ir_inlining_cost(inferred) == UINT16_MAX) { // don't delete inlineable code
-                    // delete the code now: if we thought it was worth keeping, it would have been converted to object code
-                    record_field_change((jl_value_t**)&ci->inferred, jl_nothing);
-                }
-            }
-        }
+        if (jl_atomic_load_relaxed(&ci->inferred) && !is_relocatable_ci(&relocatable_ext_cis, ci))
+            record_field_change((jl_value_t**)&ci->inferred, jl_nothing);
     }
 
     if (immediate) // must be things that can be recursively handled, and valid as type parameters
diff --git a/src/subtype.c b/src/subtype.c
index 2d1221904d149..8bb7a2070c23c 100644
--- a/src/subtype.c
+++ b/src/subtype.c
@@ -2667,31 +2667,22 @@ static void set_bound(jl_value_t **bound, jl_value_t *val, jl_tvar_t *v, jl_sten
 // subtype, treating all vars as existential
 static int subtype_in_env_existential(jl_value_t *x, jl_value_t *y, jl_stenv_t *e)
 {
-    jl_varbinding_t *v = e->vars;
-    int len = 0;
     if (x == jl_bottom_type || y == (jl_value_t*)jl_any_type)
         return 1;
-    while (v != NULL) {
-        len++;
-        v = v->prev;
-    }
-    int8_t *rs = (int8_t*)malloc_s(len);
+    int8_t *rs = (int8_t*)alloca(current_env_length(e));
+    jl_varbinding_t *v = e->vars;
     int n = 0;
-    v = e->vars;
-    while (n < len) {
-        assert(v != NULL);
+    while (v != NULL) {
         rs[n++] = v->right;
         v->right = 1;
         v = v->prev;
     }
     int issub = subtype_in_env(x, y, e);
     n = 0;
     v = e->vars;
-    while (n < len) {
-        assert(v != NULL);
+    while (v != NULL) {
         v->right = rs[n++];
         v = v->prev;
     }
-    free(rs);
     return issub;
 }
 
@@ -2739,6 +2730,8 @@ static int check_unsat_bound(jl_value_t *t, jl_tvar_t *v, jl_stenv_t *e) JL_NOTS
 }
 
 
+static int intersect_var_ccheck_in_env(jl_value_t *xlb, jl_value_t *xub, jl_value_t *ylb, jl_value_t *yub, jl_stenv_t *e, int flip);
+
 static jl_value_t *intersect_var(jl_tvar_t *b, jl_value_t *a, jl_stenv_t *e, int8_t R, int param)
 {
     jl_varbinding_t *bb = lookup(e, b);
@@ -2750,20 +2743,14 @@ static jl_value_t *intersect_var(jl_tvar_t *b, jl_value_t *a, jl_stenv_t *e, int
         return R ? intersect(a, bb->lb, e, param) : intersect(bb->lb, a, e, param);
     if (!jl_is_type(a) && !jl_is_typevar(a))
         return set_var_to_const(bb, a, e, R);
-    jl_savedenv_t se;
     if (param == 2) {
         jl_value_t *ub = NULL;
         JL_GC_PUSH1(&ub);
         if (!jl_has_free_typevars(a)) {
-            save_env(e, &se, 1);
-            int issub = subtype_in_env_existential(bb->lb, a, e);
-            restore_env(e, &se, 1);
-            if (issub) {
-                issub = subtype_in_env_existential(a, bb->ub, e);
-                restore_env(e, &se, 1);
-            }
-            free_env(&se);
-            if (!issub) {
+            if (R) flip_offset(e);
+            int ccheck = intersect_var_ccheck_in_env(bb->lb, bb->ub, a, a, e, !R);
+            if (R) flip_offset(e);
+            if (!ccheck) {
                 JL_GC_POP();
                 return jl_bottom_type;
             }
@@ -2773,6 +2760,7 @@ static jl_value_t *intersect_var(jl_tvar_t *b, jl_value_t *a, jl_stenv_t *e, int
         e->triangular++;
         ub = R ? intersect_aside(a, bb->ub, e, bb->depth0) : intersect_aside(bb->ub, a, e, bb->depth0);
         e->triangular--;
+        jl_savedenv_t se;
         save_env(e, &se, 1);
         int issub = subtype_in_env_existential(bb->lb, ub, e);
         restore_env(e, &se, 1);
@@ -3845,6 +3833,89 @@ static int subtype_by_bounds(jl_value_t *x, jl_value_t *y, jl_stenv_t *e) JL_NOT
     return compareto_var(x, (jl_tvar_t*)y, e, -1) || compareto_var(y, (jl_tvar_t*)x, e, 1);
 }
 
+static int intersect_var_ccheck_in_env(jl_value_t *xlb, jl_value_t *xub, jl_value_t *ylb, jl_value_t *yub, jl_stenv_t *e, int flip)
+{
+    int easy_check1 = xlb == jl_bottom_type ||
+                      yub == (jl_value_t *)jl_any_type ||
+                      (e->Loffset == 0 && obviously_in_union(yub, xlb));
+    int easy_check2 = ylb == jl_bottom_type ||
+                      xub == (jl_value_t *)jl_any_type ||
+                      (e->Loffset == 0 && obviously_in_union(xub, ylb));
+    int nofree1 = 0, nofree2 = 0;
+    if (!easy_check1) {
+        nofree1 = !jl_has_free_typevars(xlb) && !jl_has_free_typevars(yub);
+        if (nofree1 && e->Loffset == 0) {
+            easy_check1 = jl_subtype(xlb, yub);
+            if (!easy_check1)
+                return 0;
+        }
+    }
+    if (!easy_check2) {
+        nofree2 = !jl_has_free_typevars(ylb) && !jl_has_free_typevars(xub);
+        if (nofree2 && e->Loffset == 0) {
+            easy_check2 = jl_subtype(ylb, xub);
+            if (!easy_check2)
+                return 0;
+        }
+    }
+    if (easy_check1 && easy_check2)
+        return 1;
+    int ccheck = 0;
+    if ((easy_check1 || nofree1) && (easy_check2 || nofree2)) {
+        jl_varbinding_t *vars = e->vars;
+        e->vars = NULL;
+        ccheck = easy_check1 || subtype_in_env(xlb, yub, e);
+        if (ccheck && !easy_check2) {
+            flip_offset(e);
+            ccheck = subtype_in_env(ylb, xub, e);
+            flip_offset(e);
+        }
+        e->vars = vars;
+        return ccheck;
+    }
+    jl_savedenv_t se;
+    save_env(e, &se, 1);
+    // first try normal flip.
+    if (flip) flip_vars(e);
+    ccheck = easy_check1 || subtype_in_env(xlb, yub, e);
+    if (ccheck && !easy_check2) {
+        flip_offset(e);
+        ccheck = subtype_in_env(ylb, xub, e);
+        flip_offset(e);
+    }
+    if (flip) flip_vars(e);
+    if (!ccheck) {
+        // then try reverse flip.
+        restore_env(e, &se, 1);
+        if (!flip) flip_vars(e);
+        ccheck = easy_check1 || subtype_in_env(xlb, yub, e);
+        if (ccheck && !easy_check2) {
+            flip_offset(e);
+            ccheck = subtype_in_env(ylb, xub, e);
+            flip_offset(e);
+        }
+        if (!flip) flip_vars(e);
+    }
+    if (!ccheck) {
+        // then try existential.
+        restore_env(e, &se, 1);
+        if (easy_check1)
+            ccheck = 1;
+        else {
+            ccheck = subtype_in_env_existential(xlb, yub, e);
+            restore_env(e, &se, 1);
+        }
+        if (ccheck && !easy_check2) {
+            flip_offset(e);
+            ccheck = subtype_in_env_existential(ylb, xub, e);
+            flip_offset(e);
+            restore_env(e, &se, 1);
+        }
+    }
+    free_env(&se);
+    return ccheck;
+}
+
 static int has_typevar_via_env(jl_value_t *x, jl_tvar_t *t, jl_stenv_t *e)
 {
     if (e->Loffset == 0) {
@@ -3977,14 +4048,8 @@ static jl_value_t *intersect(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int pa
                     ccheck = 1;
                 }
                 else {
-                    if (R) flip_vars(e);
-                    ccheck = subtype_in_env(xlb, yub, e);
-                    if (ccheck) {
-                        flip_offset(e);
-                        ccheck = subtype_in_env(ylb, xub, e);
-                        flip_offset(e);
-                    }
-                    if (R) flip_vars(e);
+                    // try many subtype check to avoid false `Union{}`
+                    ccheck = intersect_var_ccheck_in_env(xlb, xub, ylb, yub, e, R);
                 }
                 if (R) flip_offset(e);
                 if (!ccheck)
diff --git a/src/support/arraylist.h b/src/support/arraylist.h
index 6ad2f0e2f28c9..edad2880dbed2 100644
--- a/src/support/arraylist.h
+++ b/src/support/arraylist.h
@@ -5,7 +5,7 @@
 
 #define AL_N_INLINE 29
 
-#define SMALL_AL_N_INLINE 6
+#define SMALL_AL_N_INLINE 5
 
 #ifdef __cplusplus
 extern "C" {
@@ -13,7 +13,7 @@ extern "C" {
 
 #include "analyzer_annotations.h"
 
-typedef struct {
+typedef struct { // 32 words
     size_t len;
     size_t max;
     void **items;
@@ -27,9 +27,9 @@ void arraylist_push(arraylist_t *a, void *elt) JL_NOTSAFEPOINT;
 void *arraylist_pop(arraylist_t *a) JL_NOTSAFEPOINT;
 JL_DLLEXPORT void arraylist_grow(arraylist_t *a, size_t n) JL_NOTSAFEPOINT;
 
-typedef struct {
-    uint32_t len;
-    uint32_t max;
+typedef struct { // 8 words
+    size_t len;
+    size_t max;
     void **items;
     void *_space[SMALL_AL_N_INLINE];
 } small_arraylist_t;
diff --git a/stdlib/LinearAlgebra/src/bunchkaufman.jl b/stdlib/LinearAlgebra/src/bunchkaufman.jl
index 8d1ded9bf8111..db0d44a1e25a6 100644
--- a/stdlib/LinearAlgebra/src/bunchkaufman.jl
+++ b/stdlib/LinearAlgebra/src/bunchkaufman.jl
@@ -127,6 +127,9 @@ function bunchkaufman!(A::StridedMatrix{<:BlasFloat}, rook::Bool = false; check:
     end
 end
 
+bkcopy_oftype(A, S) = eigencopy_oftype(A, S)
+bkcopy_oftype(A::Symmetric{<:Complex}, S) = Symmetric(copytrito!(similar(parent(A), S, size(A)), A.data, A.uplo), sym_uplo(A.uplo))
+
 """
     bunchkaufman(A, rook::Bool=false; check = true) -> S::BunchKaufman
 
@@ -206,7 +209,7 @@ julia> S.L*S.D*S.L' - A[S.p, S.p]
 ```
 """
 bunchkaufman(A::AbstractMatrix{T}, rook::Bool=false; check::Bool = true) where {T} =
-    bunchkaufman!(eigencopy_oftype(A, typeof(sqrt(oneunit(T)))), rook; check = check)
+    bunchkaufman!(bkcopy_oftype(A, typeof(sqrt(oneunit(T)))), rook; check = check)
 
 BunchKaufman{T}(B::BunchKaufman) where {T} =
     BunchKaufman(convert(Matrix{T}, B.LD), B.ipiv, B.uplo, B.symmetric, B.rook, B.info)
@@ -1529,7 +1532,7 @@ function bunchkaufman(A::AbstractMatrix{TS},
     rook::Bool = false;
     check::Bool = true
     ) where TS <: ClosedScalar{TR} where TR <: ClosedReal
-    return bunchkaufman!(eigencopy_oftype(A, TS), rook; check)
+    return bunchkaufman!(bkcopy_oftype(A, TS), rook; check)
 end
 
 function bunchkaufman(A::AbstractMatrix{TS},
@@ -1551,15 +1554,15 @@ function bunchkaufman(A::AbstractMatrix{TS},
 
     # We promote input to BigInt to avoid overflow problems
     if TA == Nothing
         if TS <: Integer
-            M = Rational{BigInt}.(eigencopy_oftype(A, TS))
+            M = Rational{BigInt}.(bkcopy_oftype(A, TS))
         else
-            M = Complex{Rational{BigInt}}.(eigencopy_oftype(A, TS))
+            M = Complex{Rational{BigInt}}.(bkcopy_oftype(A, TS))
         end
     else
         if TS <: Integer
-            M = TA(Rational{BigInt}.(eigencopy_oftype(A, TS)), Symbol(A.uplo))
+            M = TA(Rational{BigInt}.(bkcopy_oftype(A, TS)), Symbol(A.uplo))
         else
-            M = TA(Complex{Rational{BigInt}}.(eigencopy_oftype(A, TS)),
+            M = TA(Complex{Rational{BigInt}}.(bkcopy_oftype(A, TS)),
                 Symbol(A.uplo))
         end
     end
diff --git a/stdlib/LinearAlgebra/src/symmetriceigen.jl b/stdlib/LinearAlgebra/src/symmetriceigen.jl
index 0c86383685807..e9f43ef640392 100644
--- a/stdlib/LinearAlgebra/src/symmetriceigen.jl
+++ b/stdlib/LinearAlgebra/src/symmetriceigen.jl
@@ -3,6 +3,7 @@
 # preserve HermOrSym wrapper
 eigencopy_oftype(A::Hermitian, S) = Hermitian(copy_similar(A, S), sym_uplo(A.uplo))
 eigencopy_oftype(A::Symmetric, S) = Symmetric(copy_similar(A, S), sym_uplo(A.uplo))
+eigencopy_oftype(A::Symmetric{<:Complex}, S) = copyto!(similar(parent(A), S), A)
 
 # Eigensolvers for symmetric and Hermitian matrices
 eigen!(A::RealHermSymComplexHerm{<:BlasReal,<:StridedMatrix}; sortby::Union{Function,Nothing}=nothing) =
diff --git a/stdlib/LinearAlgebra/test/hessenberg.jl b/stdlib/LinearAlgebra/test/hessenberg.jl
index 39ae7ec83a5c3..136f741fa5c0e 100644
--- a/stdlib/LinearAlgebra/test/hessenberg.jl
+++ b/stdlib/LinearAlgebra/test/hessenberg.jl
@@ -250,4 +250,11 @@ end
     @test axes(S) === (r,r)
 end
 
+@testset "complex Symmetric" begin
+    D = diagm(0=>ComplexF64[1,2])
+    S = Symmetric(D)
+    H = hessenberg(S)
+    @test H.H == D
+end
+
 end # module TestHessenberg
diff --git a/stdlib/LinearAlgebra/test/symmetriceigen.jl b/stdlib/LinearAlgebra/test/symmetriceigen.jl
index b3a5472c511f4..258ccbd2d4591 100644
--- a/stdlib/LinearAlgebra/test/symmetriceigen.jl
+++ b/stdlib/LinearAlgebra/test/symmetriceigen.jl
@@ -151,4 +151,10 @@ end
     @test HT * V ≈ V * Diagonal(λ)
 end
 
+@testset "complex Symmetric" begin
+    S = Symmetric(rand(ComplexF64,2,2))
+    λ, v = eigen(S)
+    @test S * v ≈ v * Diagonal(λ)
+end
+
 end # module TestSymmetricEigen
diff --git a/stdlib/REPL/docs/src/index.md b/stdlib/REPL/docs/src/index.md
index d2a17e3a6b4a3..fdc1be5c5f357 100644
--- a/stdlib/REPL/docs/src/index.md
+++ b/stdlib/REPL/docs/src/index.md
@@ -341,7 +341,15 @@ mapfoldl mapfoldr
 
 When a single complete tab-complete result is available at the end of an input line
 and 2 or more characters have been typed, a hint of the completion will show in a lighter color.
-This can be disabled via `Base.active_repl.options.hint_tab_completes = false`.
+This can be disabled via `Base.active_repl.options.hint_tab_completes = false` or by adding
+```
+atreplinit() do repl
+    if VERSION >= v"1.11.0-0"
+        repl.options.hint_tab_completes = false
+    end
+end
+```
+to your `~/.julia/config/startup.jl`.
 
 !!! compat "Julia 1.11"
     Tab-complete hinting was added in Julia 1.11
diff --git a/stdlib/TOML/src/print.jl b/stdlib/TOML/src/print.jl
index 63f65b017d393..c6c046b9b40c6 100644
--- a/stdlib/TOML/src/print.jl
+++ b/stdlib/TOML/src/print.jl
@@ -77,7 +77,7 @@ end
 # Fallback
 function printvalue(f::MbyFunc, io::IO, value, sorted::Bool)
     toml_value = to_toml_value(f, value)
-    @invokelatest printvalue(f, io, toml_value)
+    @invokelatest printvalue(f, io, toml_value, sorted)
 end
 
 function printvalue(f::MbyFunc, io::IO, value::AbstractVector, sorted::Bool)
@@ -156,7 +156,7 @@ function print_table(f::MbyFunc, io::IO, a::AbstractDict,
     )
     if a in inline_tables
-        @invokelatest print_inline_table(f, io, a)
+        @invokelatest print_inline_table(f, io, a, sorted)
         return
     end
diff --git a/stdlib/TOML/test/print.jl b/stdlib/TOML/test/print.jl
index 8fba1b1c1df10..e8a6431cb34a7 100644
--- a/stdlib/TOML/test/print.jl
+++ b/stdlib/TOML/test/print.jl
@@ -94,6 +94,14 @@ loaders = ["gzip", { driver = "csv", args = {delim = "\t"}}]
     a = 222
     d = 333
     """
+
+    # https://github.com/JuliaLang/julia/pull/57584
+    d = Dict("b" => [MyStruct(1), MyStruct(2)])
+    @test toml_str(d) do x
+        x isa MyStruct && return Dict("a" => x.a)
+    end == """
+    b = [{a = 1}, {a = 2}]
+    """
 end
 
 @testset "unsigned integers" for (x, s) in [
@@ -196,6 +204,14 @@ LocalPkg = {path = "LocalPkg"}
 @test toml_str(d; sorted=true, inline_tables) == s
 @test roundtrip(s)
 
+
+# https://github.com/JuliaLang/julia/pull/57584
+d = Dict("a" => 1, "b" => 2)
+inline_tables = IdSet{Dict}([d])
+s = "{a = 1, b = 2}"
+@test toml_str(d; sorted=true, inline_tables) == s
+
+
 # multiline strings (#55083)
 s = """
 a = \"\"\"lorem ipsum
diff --git a/test/compiler/codegen.jl b/test/compiler/codegen.jl
index e434899be6e31..10b66fda33667 100644
--- a/test/compiler/codegen.jl
+++ b/test/compiler/codegen.jl
@@ -866,7 +866,7 @@ if Sys.ARCH === :x86_64
     foo52079() = Core.Intrinsics.have_fma(Float64)
     if foo52079() == true
         let io = IOBuffer()
-            code_native(io,^,(Float64,Float64), dump_module=false)
+            code_native(io,Base.Math.exp_impl,(Float64,Float64,Val{:ℯ}), dump_module=false)
             str = String(take!(io))
             @test !occursin("fma_emulated", str)
             @test occursin("vfmadd", str)
@@ -933,3 +933,8 @@ let
     end
     nothing
 end
+
+struct Vec56937 x::NTuple{8, VecElement{Int}} end
+
+x56937 = Ref(Vec56937(ntuple(_->VecElement(1),8)))
+@test x56937[].x[1] == VecElement{Int}(1) # shouldn't crash
diff --git a/test/core.jl b/test/core.jl
index 9750cc519a746..f525a3fc39e12 100644
--- a/test/core.jl
+++ b/test/core.jl
@@ -5668,6 +5668,13 @@ let ni128 = sizeof(FP128test) ÷ sizeof(Int),
     @test reinterpret(UInt128, arr[2].fp) == expected
 end
 
+# make sure VecElement Tuple has the C alignment and ABI for supported types
+primitive type Int24 24 end
+@test Base.datatype_alignment(NTuple{10,VecElement{Int16}}) == 32
+@test Base.datatype_alignment(NTuple{10,VecElement{Int24}}) == 4
+@test Base.datatype_alignment(NTuple{10,VecElement{Int64}}) == 128
+@test Base.datatype_alignment(NTuple{10,VecElement{Int128}}) == 256
+
 # issue #21516
 struct T21516
     x::Vector{Float64}
diff --git a/test/llvmpasses/late-lower-gc.ll b/test/llvmpasses/late-lower-gc.ll
index 6dee18da5975f..8ca588f717ee2 100644
--- a/test/llvmpasses/late-lower-gc.ll
+++ b/test/llvmpasses/late-lower-gc.ll
@@ -125,6 +125,20 @@ top:
     ret void
 }
 
+; Confirm that `invariant.load` on other loads survive
+define void @gc_keep_invariant(float addrspace(1)* %0) {
+top:
+; CHECK-LABEL: @gc_keep_invariant
+    %pgcstack = call {}*** @julia.get_pgcstack()
+    %1 = bitcast {}*** %pgcstack to {}**
+    %current_task = getelementptr inbounds {}*, {}** %1, i64 -12
+
+; CHECK: %current_task = getelementptr inbounds ptr, ptr %1, i64 -12
+    %2 = load float, ptr addrspace(1) %0, align 4, !invariant.load !1
+; CHECK-NEXT: %2 = load float, ptr addrspace(1) %0, align 4, !invariant.load
+    ret void
+}
+
 define i32 @callee_root({} addrspace(10)* %v0, {} addrspace(10)* %v1) {
 top:
 ; CHECK-LABEL: @callee_root
diff --git a/test/math.jl b/test/math.jl
index c48a0c7f56323..d73c13530f166 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1464,6 +1464,25 @@ end
     # two cases where we have observed > 1 ULP in the past
     @test 0.0013653274095082324^-97.60372292227069 == 4.088393948750035e279
     @test 8.758520413376658e-5^70.55863059215994 == 5.052076767078296e-287
+
+    # issue #53881
+    c53881 = 2.2844135865398217e222 # check correctness within 2 ULPs
+    @test prevfloat(1.0) ^ -Int64(2)^62 ≈ c53881 atol=2eps(c53881)
+    @test 2.0 ^ typemin(Int) == 0.0
+    @test (-1.0) ^ typemin(Int) == 1.0
+    Z = Int64(2)
+    E = prevfloat(1.0)
+    @test E ^ (-Z^54) ≈ 7.38905609893065
+    @test E ^ (-Z^62) ≈ 2.2844135865231613e222
+    @test E ^ (-Z^63) == Inf
+    @test abs(E ^ (Z^62-1) * E ^ (-Z^62+1) - 1) <= eps(1.0)
+    n, x = -1065564664, 0.9999997040311492
+    @test abs(x^n - Float64(big(x)^n)) / eps(x^n) == 0 # ULPs
+    @test E ^ (big(2)^100 + 1) == 0
+    @test E ^ 6705320061009595392 == nextfloat(0.0)
+    n = Int64(1024 / log2(E))
+    @test E^n == Inf
+    @test E^float(n) == Inf
 end
 
 # Test that sqrt behaves correctly and doesn't exhibit fp80 double rounding.
diff --git a/test/misc.jl b/test/misc.jl
index 3907354e9410b..a66c868aa1e17 100644
--- a/test/misc.jl
+++ b/test/misc.jl
@@ -598,6 +598,11 @@ let z = Z53061[Z53061(S53061(rand(), (rand(),rand())), 0) for _ in 1:10^4]
     @test abs(summarysize(z) - 640000)/640000 <= 0.01 broken = Sys.WORD_SIZE == 32 && Sys.islinux()
 end
 
+# issue #57506
+let len = 100, m1 = Memory{UInt8}(1:len), m2 = Memory{Union{Nothing,UInt8}}(1:len)
+    @test summarysize(m2) == summarysize(m1) + len
+end
+
 ## test conversion from UTF-8 to UTF-16 (for Windows APIs)
 
 # empty arrays
diff --git a/test/regex.jl b/test/regex.jl
index e5f1428527512..ca411b26bbacc 100644
--- a/test/regex.jl
+++ b/test/regex.jl
@@ -245,3 +245,11 @@ end
         @test match(re, "ababc").match === SubString("ababc", 3:5)
     end
 end
+
+@testset "#57817: Don't free Regex during exit finalizer calls" begin
+    # this shouldn't segfault
+    cmd = `$(Base.julia_cmd()) -t2 --startup-file=no -e 're = Regex(""); Threads.@spawn match(re, "", 1, UInt32(0))'`
+    for i in 1:10
+        @test success(pipeline(cmd, stderr=stderr))
+    end
+end
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index 87d812c5bf201..955da2d7c4564 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -878,6 +878,11 @@ end
             end
         end
     end
+
+    @testset "return type infers to `Int`" begin
+        @test Int === Base.infer_return_type(prevind, Tuple{AbstractString, Vararg})
+        @test Int === Base.infer_return_type(nextind, Tuple{AbstractString, Vararg})
+    end
 end
 
 @testset "first and last" begin
diff --git a/test/subtype.jl b/test/subtype.jl
index ba7f86bb86a14..979746bd626dc 100644
--- a/test/subtype.jl
+++ b/test/subtype.jl
@@ -1691,9 +1691,7 @@ CovType{T} = Union{AbstractArray{T,2},
 # issue #31703
 @testintersect(Pair{<:Any, Ref{Tuple{Ref{Ref{Tuple{Int}}},Ref{Float64}}}},
                Pair{T, S} where S<:(Ref{A} where A<:(Tuple{C,Ref{T}} where C<:(Ref{D} where D<:(Ref{E} where E<:Tuple{FF}) where FF<:B)) where B) where T,
-               Pair{T, Ref{Tuple{Ref{Ref{Tuple{Int}}},Ref{Float64}}}} where T)
-# TODO: should be able to get this result
-# Pair{Float64, Ref{Tuple{Ref{Ref{Tuple{Int}}},Ref{Float64}}}}
+               Pair{Float64, Ref{Tuple{Ref{Ref{Tuple{Int}}},Ref{Float64}}}})
 
 module I31703
 using Test, LinearAlgebra
@@ -1745,8 +1743,7 @@ end
                Tuple{Type{SA{2, L}}, Type{SA{2, L}}} where L)
 @testintersect(Tuple{Type{SA{2, L}}, Type{SA{2, 16}}} where L,
                Tuple{Type{<:SA{N, L}}, Type{<:SA{N, L}}} where {N,L},
-               # TODO: this could be narrower
-               Tuple{Type{SA{2, L}}, Type{SA{2, 16}}} where L)
+               Tuple{Type{SA{2, 16}}, Type{SA{2, 16}}})
 
 # issue #31993
 @testintersect(Tuple{Type{<:AbstractVector{T}}, Int} where T,
@@ -1851,9 +1848,9 @@ c32703(::Type{<:Str{C}}, str::Str{C}) where {C<:CSE} = str
               Tuple{Type{<:Str{C}}, Str{C}} where {C<:CSE},
               Union{})
 @test c32703(UTF16Str, ASCIIStr()) == 42
-@test_broken typeintersect(Tuple{Vector{Vector{Float32}},Matrix,Matrix},
-                           Tuple{Vector{V},Matrix{Int},Matrix{S}} where {S, V<:AbstractVector{S}}) ==
-    Tuple{Array{Array{Float32,1},1},Array{Int,2},Array{Float32,2}}
+@testintersect(Tuple{Vector{Vector{Float32}},Matrix,Matrix},
+               Tuple{Vector{V},Matrix{Int},Matrix{S}} where {S, V<:AbstractVector{S}},
+               Tuple{Array{Array{Float32,1},1},Array{Int,2},Array{Float32,2}})
 
 @testintersect(Tuple{Pair{Int, DataType}, Any},
                Tuple{Pair{A, B} where B<:Type, Int} where A,
@@ -2469,6 +2466,11 @@ end
 abstract type P47654{A} end
 @test Wrapper47654{P47654, Vector{Union{P47654,Nothing}}} <: Wrapper47654
 
+#issue 41561
+@testintersect(Tuple{Vector{VT}, Vector{VT}} where {N1, VT<:AbstractVector{N1}},
+               Tuple{Vector{VN} where {N, VN<:AbstractVector{N}}, Vector{Vector{Float64}}},
+               Tuple{Vector{Vector{Float64}}, Vector{Vector{Float64}}})
+
 @testset "known subtype/intersect issue" begin
     #issue 45874
     let S = Pair{Val{P}, AbstractVector{<:Union{P,<:AbstractMatrix{P}}}} where P,
@@ -2476,9 +2478,6 @@ abstract type P47654{A} end
         @test S <: T
     end
 
-    #issue 41561
-    @test_broken typeintersect(Tuple{Vector{VT}, Vector{VT}} where {N1, VT<:AbstractVector{N1}},
-                               Tuple{Vector{VN} where {N, VN<:AbstractVector{N}}, Vector{Vector{Float64}}}) !== Union{}
     #issue 40865
     @test Tuple{Set{Ref{Int}}, Set{Ref{Int}}} <: Tuple{Set{KV}, Set{K}} where {K,KV<:Union{K,Ref{K}}}
     @test Tuple{Set{Val{Int}}, Set{Val{Int}}} <: Tuple{Set{KV}, Set{K}} where {K,KV<:Union{K,Val{K}}}
@@ -2746,3 +2745,15 @@ end
     Val{Tuple{T,R,S}} where {T,R<:Vector{T},S<:Vector{R}},
     Val{Tuple{Int, Vector{Int}, T}} where T<:Vector{Vector{Int}},
 )
+
+#issue 57429
+@testintersect(
+    Pair{<:Any, <:Tuple{Int}},
+    Pair{N, S} where {N, NTuple{N,Int}<:S<:NTuple{M,Int} where {M}},
+    !Union{}
+)
+@testintersect(
+    Pair{N, T} where {N,NTuple{N,Int}<:T<:NTuple{N,Int}},
+    Pair{N, T} where {N,NTuple{N,Int}<:T<:Tuple{Int,Vararg{Int}}},
+    !Union{}
+)
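
For reviewers of the `base/math.jl` hunks: the sketch below is a minimal, self-contained Julia rendering of the new `^(::Float64, ::Int64)` large-exponent fallback, not Base's implementation. `pow_by_squaring_ok`, `fpow`, and `pow_sketch` are illustrative names introduced here; `fpow` crudely stands in for the internal `pow_body(::Float64, ::Float64)` kernel, which in Base uses an extended-precision log/exp rather than a plain `exp(y * log(x))`. Only the control flow, the clamping, the sign handling, and the `n = (n - n2) + n2` split, is taken from the diff.

```julia
# Illustrative sketch (not Base's implementation) of the large-exponent path.
pow_by_squaring_ok(n::Integer) = -2^12 <= n <= 3 * 2^13  # same range as use_power_by_squaring

fpow(x::Float64, y::Float64) = exp(y * log(x))  # crude stand-in for pow_body; assumes x > 0

function pow_sketch(x::Float64, n::Integer)
    n = clamp(n, Int64)        # mirrors ^(x::Float64, n::Integer) = x^clamp(n, Int64)
    n == 0 && return one(x)
    # small |n|: Base uses compensated power by squaring (pow_body(x, n))
    pow_by_squaring_ok(n) && return x^Int64(n)
    s = ifelse(x < 0 && isodd(n), -1.0, 1.0)  # result sign for a negative base
    x = abs(x)
    y = float(n)
    if y == n                  # n is exactly representable as a Float64
        return copysign(fpow(x, y), s)
    else                       # split n = (n - n2) + n2 so each part converts exactly
        n2 = n % 1024
        y = float(n - n2)
        return fpow(x, y) * copysign(fpow(x, float(n2)), s)
    end
end
```

The `% 1024` split works because `n - n2` is then a multiple of 1024, so after its trailing zero bits it carries at most 53 significant bits and converts to `Float64` without rounding even when `n` itself does not. For example, `pow_sketch(prevfloat(1.0), Int64(2)^62 - 1)` exercises the split branch; the `#53881` tests above check the Base version of this path to within 2 ulps, while this sketch is less accurate because of the crude `fpow`.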