1 files changed, 65 insertions, 15 deletions
diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command
index ce3111f3c..b0396bec7 100644
--- a/.buildkite/hooks/post-command
+++ b/.buildkite/hooks/post-command
@@ -1,24 +1,74 @@
 # Upload test logs on failure, if there are any.
-if [[ "${BUILDKITE_COMMAND_EXIT_STATUS}" -ne "0" ]]; then
-  declare log_count=0
-  for log in $(make testlogs 2>/dev/null | sort | uniq); do
-    buildkite-agent artifact upload "${log}"
-    log_count=$((${log_count}+1))
-    # N.B. If *all* tests fail due to some common cause, then we will
-    # end up spending way too much time uploading logs. Instead, we just
-    # upload the first 100 and stop. That is hopefully enough to debug.
-    if [[ "${log_count}" -ge 100 ]]; then
-      echo "Only uploaded first 100 failures; skipping the rest."
-      break
-    fi
-  done
+if test "${BUILDKITE_COMMAND_EXIT_STATUS}" -ne "0"; then
+  # Generate a metafile that ends with .output, and contains all the
+  # test failures that have been uploaded. These will all be sorted and
+  # aggregated by a failure stage in the build pipeline.
+  declare output=$(mktemp "${BUILDKITE_JOB_ID}".XXXXXX.output)
+  # Keep only lines that contain "//"; each is read as "<target> <log>".
+  make -s testlogs 2>/dev/null | grep // | sort | uniq | (
+    declare log_count=0
+    # -r keeps any backslashes in the log path intact.
+    while read -r target log; do
+      test -n "${target}" || continue
+
+      # N.B. If *all* tests fail due to some common cause, then we will
+      # end up spending way too much time uploading logs. Instead, we just
+      # upload the first 10 and stop. That is hopefully enough to debug.
+      #
+      # We include this test in the metadata, but note that we cannot
+      # upload the actual test logs. The user should rerun locally.
+      log_count=$((log_count + 1))
+      if test "${log_count}" -gt 10; then
+        echo " * ${target} (no upload)" | tee -a "${output}"
+      else
+        buildkite-agent artifact upload "${log}"
+        echo " * [${target}](artifact://${log#/})" | tee -a "${output}"
+      fi
+    done
+  )
+
+  # Upload if we had outputs.
+  if test -s "${output}"; then
+    buildkite-agent artifact upload "${output}"
+  fi
+  rm -rf "${output}"
+
   # Attempt to clear the cache and shut down.
   make clean || echo "make clean failed with code $?"
   make bazel-shutdown || echo "make bazel-shutdown failed with code $?"
 fi
 
+# Upload all profiles, and include in an annotation.
+if test -d /tmp/profile; then
+  # Metafile listing every uploaded profile; uploaded below if non-empty.
+  declare profile_output=$(mktemp "${BUILDKITE_JOB_ID}".XXXXXX.profile_output)
+  find /tmp/profile -name \*.pprof -print 2>/dev/null | sort | while IFS= read -r file; do
+    # Paths are read one per line, so names containing spaces stay intact.
+    # Generate a link to speedscope, with a URL-encoded link to the BuildKite
+    # artifact location. The URL layout is fixed; if the storage location
+    # changes, these links may stop working. The uploaded artifacts will
+    # still work. Upload reads /dev/null so it cannot eat the path list.
+    profile_name="${file#/tmp/profile/}"
+    public_url="https://storage.googleapis.com/gvisor-buildkite/${BUILDKITE_BUILD_ID}/${BUILDKITE_JOB_ID}/${file#/}"
+    encoded_url=$(jq -rn --arg x "${public_url}" '$x|@uri')
+    encoded_title=$(jq -rn --arg x "${profile_name}" '$x|@uri')
+    profile_url="https://speedscope.app/#profileURL=${encoded_url}&title=${encoded_title}"
+    buildkite-agent artifact upload "${file}" </dev/null
+    echo " * [${profile_name}](${profile_url}) ([pprof](artifact://${file#/}))" | tee -a "${profile_output}"
+  done
+
+  # Upload if we had outputs.
+  if test -s "${profile_output}"; then
+    buildkite-agent artifact upload "${profile_output}"
+  fi
+  rm -rf "${profile_output}"
+
+  # Remove stale profiles, which may be owned by root.
+  sudo rm -rf /tmp/profile
+fi
+
 # Kill any running containers (clear state).
 CONTAINERS="$(docker ps -q)"
-if ! [[ -z "${CONTAINERS}" ]]; then
+if test -n "${CONTAINERS}"; then
   docker container kill ${CONTAINERS} 2>/dev/null || true
-fi
\ No newline at end of file
+fi