CI: Store Passes in results.json + various improvements. (#57686)

IanButterworth · web-flow · commit d9711317e8fe · 2025-03-19T23:26:06.000-04:00
- Adds option to enable saving Test.Pass results to DefaultTestSet (started as #57690 but more complete here) and enable it when Base.runtests is called when CI=true, to match when reports are saved - Stops attributing testset duration to tests from that testset, it doesn't make sense. - Make the test name resemble (as far as possible) the original test call. Making the name independent of test outcome should make it easier to group tests and identify flaky ones. - store repeat counts as tags, rather than in the name - moves the save to inside the test workers, where all test info is available, and the save can be done in parallel to save time , then jsons are made at the end and uploaded as an artifact in a tarball - speed improvements to make saving time acceptable - fixes a couple of tests that seem to get a little flakier with this Note on the WeakRef test fix: If a test like `@test x == y` passes, the `Pass` object retains `x`. Previously to this the Pass object was still made, but must have been GC-ed immediately. This PR remakes the Pass object when appending it to DefaultTestSet without the captured data. I think that just delayed the GC a little to make the test flaky, so I moved the `@test` to outside the function to avoid capturing `x`. Also buildkite had to fix an issue to make tests show up properly on our jobs: https://forum.buildkite.community/t/test-failures-showing-up-in-test-digest-but-not-on-jobs/4306
diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,8 @@
 /julia-*
 /source-dist.tmp
 /source-dist.tmp1
+/test/results_*.json
+/test/results_*.dat
 
 *.expmap
 *.exe
diff --git a/stdlib/Mmap/test/runtests.jl b/stdlib/Mmap/test/runtests.jl
@@ -47,6 +47,7 @@ close(s)
 @test_throws ErrorException mmap(file, Vector{Ref}) # must be bit-type
 GC.gc(); GC.gc()
 
+file = tempname() # new name to reduce chance of issues due slow windows fs
 s = open(f->f,file,"w")
 @test mmap(file) == Vector{UInt8}() # requested len=0 on empty file
 @test mmap(file,Vector{UInt8},0) == Vector{UInt8}()
@@ -191,6 +192,7 @@ m = mmap(file,Vector{UInt8},2,6)
 @test_throws BoundsError m[3]
 finalize(m); m = nothing; GC.gc()
 
+file = tempname() # new name to reduce chance of issues due slow windows fs
 s = open(file, "w")
 write(s, [0xffffffffffffffff,
           0xffffffffffffffff,
diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl
@@ -43,6 +43,10 @@ const DISPLAY_FAILED = (
 
 const FAIL_FAST = Ref{Bool}(false)
 
+const record_passes = OncePerProcess{Bool}() do
+    return Base.get_bool_env("JULIA_TEST_RECORD_PASSES", false)
+end
+
 #-----------------------------------------------------------------------
 
 # Backtrace utility functions
@@ -1100,8 +1104,18 @@ struct FailFastError <: Exception end
 
 # For a broken result, simply store the result
 record(ts::DefaultTestSet, t::Broken) = (push!(ts.results, t); t)
-# For a passed result, do not store the result since it uses a lot of memory
-record(ts::DefaultTestSet, t::Pass) = (ts.n_passed += 1; t)
+# For a passed result, do not store the result since it uses a lot of memory, unless
+# `record_passes()` is true. i.e. set env var `JULIA_TEST_RECORD_PASSES=true` before running any testsets
+function record(ts::DefaultTestSet, t::Pass)
+    ts.n_passed += 1
+    if record_passes()
+        # throw away the captured data so it can be GC-ed
+        t_nodata = Pass(t.test_type, t.orig_expr, nothing, t.value, t.source, t.message_only)
+        push!(ts.results, t_nodata)
+        return t_nodata
+    end
+    return t
+end
 
 # For the other result types, immediately print the error message
 # but do not terminate. Print a backtrace.
diff --git a/test/buildkitetestjson.jl b/test/buildkitetestjson.jl
@@ -7,8 +7,9 @@ module BuildkiteTestJSON
 
 using Test
 using Dates
+using Serialization
 
-export write_testset_json_files
+export serialize_testset_result_file, write_testset_json_files
 
 # Bootleg JSON writer
 
@@ -64,14 +65,12 @@ function result_dict(testset::Test.DefaultTestSet, prefix::String="")
         "id" => Base.UUID(rand(UInt128)),
         "scope" => scope,
         "tags" => Dict{String,String}(
-            "job_label" => get(ENV, "BUILDKITE_LABEL", "unknown"),
-            "job_id" => get(ENV, "BUILDKITE_JOB_ID", "unknown"),
-            "job_group" => get(ENV, "BUILDKITE_GROUP_LABEL", "unknown"),
             "os" => string(Sys.KERNEL),
             "arch" => string(Sys.ARCH),
             "julia_version" => string(VERSION),
             "testset" => testset.description,
         ),
+        # note we drop some of this from common_data before merging into individual results
         "history" => if !isnothing(testset.time_end)
             Dict{String,Any}(
                 "start_at" => testset.time_start,
@@ -86,14 +85,33 @@ end
 # Test paths on runners are often in deep directories, so just make them contain enough information
 # to be able to identify the file. Also convert Windows-style paths to Unix-style paths so tests can
 # be grouped by file.
+const generalize_file_paths_cache = Dict{AbstractString,AbstractString}()
+const norm_build_root_path = normpath(Sys.BUILD_ROOT_PATH)
+const bindir_dir = dirname(Sys.BINDIR)
+const pathsep = Sys.iswindows() ? '\\' : '/'
 function generalize_file_paths(path::AbstractString)
-    pathsep = Sys.iswindows() ? '\\' : '/'
-    path = replace(path,
-        string(Sys.STDLIB, pathsep) => "",
-        string(normpath(Sys.BUILD_ROOT_PATH), pathsep) => "",
-        string(dirname(Sys.BINDIR), pathsep) => ""
-    )
-    return Sys.iswindows() ? replace(path, "\\" => "/") : path
+    return get!(generalize_file_paths_cache, path) do
+        path = replace(path,
+            Sys.STDLIB => "stdlib",
+            string(norm_build_root_path, pathsep) => "",
+            string(bindir_dir, pathsep) => ""
+        )
+        @static if Sys.iswindows()
+            return replace(path, "\\" => "/")
+        else
+            return path
+        end
+    end
+end
+
+# raw_file_path,line => file,location
+const location_cache = Dict{Tuple{Symbol,Int},Tuple{String,String}}()
+function get_location(file::Symbol, line::Int)
+    return get!(location_cache, (file, line)) do
+        _file = generalize_file_paths(string(file))
+        _location = string(_file, ":", line)
+        return _file, _location
+    end
 end
 
 # passed, failed, skipped, or unknown
@@ -111,28 +129,65 @@ function get_status(result)
     end
 end
 
-function result_dict(result::Test.Result)
+# An attempt to reconstruct the test call.
+# Note we can't know if broken or skip was via the broken/skip macros or kwargs.
+const TEST_TYPE_MAP = Dict(
+    :test => "@test",
+    :test_nonbool => "@test",
+    :test_error => "@test",
+    :test_interrupted => "@test",
+    :test_unbroken => "@test_broken",
+    :skipped => "@test_skip",
+    :test_throws => "@test_throws",
+    :test_throws_wrong => "@test_throws",
+    :test_throws_nothing => "@test_throws"
+)
+function get_test_call_str(result)
+    prefix = get(TEST_TYPE_MAP, result.test_type, nothing)
+    prefix === nothing && return error("Unknown test type $(repr(result.test_type))")
+    return prefix == "@test_throws" ? "@test_throws $(result.data) $(result.orig_expr)" : "$prefix $(result.orig_expr)"
+end
+
+get_rid(rdata) = (rdata["location"], rdata["result"], haskey(rdata, "failure_expanded") ? hash(rdata["failure_expanded"]) : UInt64(0))
+
+const ResultCountDict = Dict{Tuple{String,String,UInt64},Int}
+
+function is_duplicate_pass(result::Test.Pass, location, status, result_counts::ResultCountDict)
+    rid = (location, status, UInt64(0))
+    count = get(result_counts, rid, nothing)
+    if count !== nothing
+        result_counts[rid] = count + 1
+        return true
+    end
+    return false
+end
+is_duplicate_pass(result::Test.Result, location, status, result_counts::ResultCountDict) = false
+
+function result_dict(result::Test.Result, result_counts::ResultCountDict)
     file, line = if !hasproperty(result, :source) || isnothing(result.source)
-        "unknown", 0
+        :unknown, 0
     else
-        something(result.source.file, "unknown"), result.source.line
+        something(result.source.file, :unknown), result.source.line
     end
-    file = generalize_file_paths(string(file))
-
+    file, location = get_location(file, line)
     status = get_status(result)
 
-    result_show = sprint(show, result; context=:color => false)
-    firstline = split(result_show, '\n')[1]
-    primary_reason = split(firstline, " at ")[1]
+    # Early exit for passed tests before more expensive operations
+    if is_duplicate_pass(result, location, status, result_counts)
+        return nothing
+    end
 
     data = Dict{String,Any}(
-        "name" => "$(primary_reason). Expression: $(result.orig_expr)",
-        "location" => string(file, ':', line),
-        "file_name" => file,
-        "result" => status)
+        "location" => location,
+        "result" => status,
+        "name" => get_test_call_str(result),
+        "file_name" => file)
 
-    job_label = replace(get(ENV, "BUILDKITE_LABEL", "job label not found"), r":\w+:\s*" => "")
     if result isa Test.Fail || result isa Test.Error
+        job_label = replace(get(ENV, "BUILDKITE_LABEL", "job label not found"), r":\w+:\s*" => "")
+        result_show = sprint(show, result; context=:color => false)
+        firstline = split(result_show, '\n')[1]
+        # put the job label at the end here because of the way buildkite UI is laid out
         data["failure_reason"] = generalize_file_paths(firstline) * " | $job_label"
         err_trace = split(result_show, "\nStacktrace:\n", limit=2)
         if length(err_trace) == 2
@@ -142,49 +197,87 @@ function result_dict(result::Test.Result)
             data["failure_expanded"] = [Dict{String,Any}("expanded" => split(result_show, '\n'), "backtrace" => [])]
         end
     end
-    return data
+
+    rid = get_rid(data)
+    duplicate = haskey(result_counts, rid)
+
+    if duplicate
+        result_counts[rid] += 1
+        return nothing
+    else
+        result_counts[rid] = 1
+        return data
+    end
+end
+
+function collect_results!(results::Vector{Dict{String,Any}}, result::Test.Result, common_data::Dict{String,Any}, result_counts::ResultCountDict)
+    rdata = result_dict(result, result_counts)
+    if rdata !== nothing # nothing if it's a duplicate that's been counted
+        push!(results, merge(common_data, rdata))
+    end
+end
+function collect_results!(results::Vector{Dict{String,Any}}, result::Test.DefaultTestSet, common_data::Dict{String,Any}, result_counts::ResultCountDict)
+    collect_results!(results, result, common_data["scope"])
+end
+function collect_results!(results::Vector{Dict{String,Any}}, result, common_data::Dict{String,Any}, result_counts::ResultCountDict)
+    return nothing
 end
 
 function collect_results!(results::Vector{Dict{String,Any}}, testset::Test.DefaultTestSet, prefix::String="")
     common_data = result_dict(testset, prefix)
+    # testset duration is not relevant for individual test results
+    common_data["history"]["duration"] = 0.0 # required field
+    delete!(common_data["history"], "end_at")
     result_offset = length(results) + 1
-    result_counts = Dict{Tuple{String,String},Int}()
-    get_rid(rdata) = (rdata["location"], rdata["result"])
-    for (i, result) in enumerate(testset.results)
-        if result isa Test.Result
-            rdata = result_dict(result)
-            rid = get_rid(rdata)
-            if haskey(result_counts, rid)
-                result_counts[rid] += 1
-            else
-                result_counts[rid] = 1
-                push!(results, merge(common_data, rdata))
-            end
-        elseif result isa Test.DefaultTestSet
-            collect_results!(results, result, common_data["scope"])
-        end
+    result_counts = ResultCountDict()
+
+    for result in testset.results
+        collect_results!(results, result, common_data, result_counts)
     end
-    # Modify names to hold `result_counts`
-    for i in result_offset:length(results)
-        result = results[i]
+    # Add a tag for count of each result
+    for result in results[result_offset:end]
         rid = get_rid(result)
-        if get(result_counts, rid, 0) > 1
-            result["name"] = replace(result["name"], r"^([^:]):" =>
-                SubstitutionString("\\1 (x$(result_counts[rid])):"))
-        end
+        result["tags"]["count"] = string(get(result_counts, rid, 1))
     end
     return results
 end
 
-function write_testset_json_files(dir::String, testset::Test.DefaultTestSet)
+function serialize_testset_result_file(dir::String, testset::Test.DefaultTestSet)
+    data = Dict{String,Any}[]
+    t = @elapsed collect_results!(data, testset)
+    if t > 20 # most are << 5s
+        @warn "Collating test result data was slow: $t seconds" collated_results=length(data)
+    end
+    name = replace(testset.description, r"[^a-zA-Z0-9]" => "_")
+    res_file = joinpath(dir, "results_$(name).dat")
+    t = @elapsed Serialization.serialize(res_file, data)
+    if t > 10
+        @warn "Serializing test result data was slow: $t seconds" file = res_file size = Base.format_bytes(filesize(res_file))
+    end
+    return res_file
+end
+
+# deserilalizes the results files and writes them to collated JSON files of 5000 max results
+function write_testset_json_files(dir::String)
     data = Dict{String,Any}[]
-    collect_results!(data, testset)
+    read_files = String[]
+    for res_dat in filter!(x -> occursin(r"^results.*\.dat$", x), readdir(dir))
+        res_file = joinpath(dir, res_dat)
+        append!(data, Serialization.deserialize(res_file))
+        @debug "Loaded $(basename(res_file)) ($(Base.format_bytes(filesize(res_file))))"
+        push!(read_files, res_file)
+    end
     files = String[]
     # Buildkite is limited to 5000 results per file https://buildkite.com/docs/test-analytics/importing-json
     for (i, chunk) in enumerate(Iterators.partition(data, 5000))
-        res_file = joinpath(dir, "results_$i.json")
+        res_file = joinpath(dir, "results_$(lpad(i, 3, '0')).json")
         open(io -> json_repr(io, chunk), res_file, "w")
         push!(files, res_file)
+        @debug "Saved $(basename(res_file)) ($(length(chunk)) results, $(Base.format_bytes(filesize(res_file))))"
+    end
+    for res_file in read_files
+        rm(res_file)
+        @debug "Deleted $(basename(res_file))"
     end
     return files
 end
diff --git a/test/core.jl b/test/core.jl
@@ -2852,15 +2852,15 @@ mutable struct Obj; x; end
         push!(wr, WeakRef(x))
         nothing
     end
-    @noinline test_wr(r, wr) = @test r[1] == wr[1].value
+    @noinline test_wr(r, wr) = r[1] == wr[1].value
     function test_wr()
         # we need to be very careful here that we never
         # use the value directly in this function, so we aren't dependent
         # on optimizations deleting the root for it before reaching the test
         ref = []
         wref = []
         mk_wr(ref, wref)
-        test_wr(ref, wref)
+        @test test_wr(ref, wref)
         GC.gc()
         test_wr(ref, wref)
         empty!(ref)
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -13,8 +13,6 @@ include("choosetests.jl")
 include("testenv.jl")
 include("buildkitetestjson.jl")
 
-using .BuildkiteTestJSON
-
 (; tests, net_on, exit_on_error, use_revise, seed) = choosetests(ARGS)
 tests = unique(tests)
 
@@ -344,6 +342,8 @@ cd(@__DIR__) do
         end
     end
 
+    BuildkiteTestJSON.write_testset_json_files(@__DIR__)
+
     #=
 `   Construct a testset on the master node which will hold results from all the
     test files run on workers and on node1. The loop goes through the results,
@@ -418,11 +418,6 @@ cd(@__DIR__) do
         Test.pop_testset()
     end
 
-    if Base.get_bool_env("CI", false)
-        @info "Writing test result data to $(@__DIR__)"
-        write_testset_json_files(@__DIR__, o_ts)
-    end
-
     Test.TESTSET_PRINT_ENABLE[] = true
     println()
     # o_ts.verbose = true # set to true to show all timings when successful
diff --git a/test/testdefs.jl b/test/testdefs.jl