From 16561e461e82f8d846ef1f3ada990270ef39ccc6 Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Thu, 6 Feb 2020 15:59:44 -0800 Subject: Add logic to run from baked images. Change adds the following: - logic to run from "baked images". See [GVISOR_DIR]/tools/images - installers which install modified files from a workspace. This allows users to run benchmarks while modifying runsc. - removes the --preemptible tag from built GCE instances. Preemptible instances are much more likely to be preempted on startup, which manifests for the user as a failed benchmark. I don't currently have a way to detect if a VM has been preempted that will work for this change. https://cloud.google.com/compute/docs/instances/preemptible#preemption_process https://cloud.google.com/compute/docs/instances/preemptible#preemption_selection PiperOrigin-RevId: 293697949 --- benchmarks/BUILD | 8 ++ benchmarks/README.md | 21 +++- benchmarks/harness/BUILD | 21 ++++ benchmarks/harness/__init__.py | 36 ++++++- benchmarks/harness/machine.py | 43 +++++++- benchmarks/harness/machine_producers/BUILD | 1 + .../harness/machine_producers/gcloud_producer.py | 114 ++++++++------------- benchmarks/harness/ssh_connection.py | 25 +++-- benchmarks/runner/__init__.py | 75 ++++---------- benchmarks/runner/commands.py | 70 ++++++------- 10 files changed, 239 insertions(+), 175 deletions(-) (limited to 'benchmarks') diff --git a/benchmarks/BUILD b/benchmarks/BUILD index 1455c6c5b..43614cf5d 100644 --- a/benchmarks/BUILD +++ b/benchmarks/BUILD @@ -3,8 +3,16 @@ package(licenses = ["notice"]) py_binary( name = "benchmarks", srcs = ["run.py"], + data = [ + "//tools/images:ubuntu1604", + "//tools/images:zone", + ], main = "run.py", python_version = "PY3", srcs_version = "PY3", + tags = [ + "local", + "manual", + ], deps = ["//benchmarks/runner"], ) diff --git a/benchmarks/README.md b/benchmarks/README.md index ff21614c5..975321c99 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -26,6 +26,8 @@ For configuring the environment manually, consult the ## Running benchmarks +### Locally + Run the following from the benchmarks directory: ```bash @@ -44,7 +46,7 @@ runtime, runc. Running on another installed runtime, like say runsc, is as simple as: ```bash -bazel run :benchmakrs -- run-local startup --runtime=runsc +bazel run :benchmarks -- run-local startup --runtime=runsc ``` There is help: ``bash bash bazel run :benchmarks -- --help bazel @@ -104,6 +106,23 @@ Or with different parameters: bazel run :benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu ``` +### On Google Compute Engine (GCE) + +Benchmarks may be run on GCE in an automated way. The default project configured +for `gcloud` will be used. + +An additional parameter `installers` may be provided to ensure that the latest +runtime is installed from the workspace. See the files in `tools/installers` for +supported install targets. + +```bash +bazel run :benchmarks -- run-gcp --installers=head --runtime=runsc sysbench.cpu +``` + +When running on GCE, the scripts generate a per run SSH key, which is added to +your project. The key is set to expire in GCE after 60 minutes and is stored in +a temporary directory on the local machine running the scripts. + ## Writing benchmarks To write new benchmarks, you should familiarize yourself with the structure of diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD index 52d4e42f8..4d03e3a06 100644 --- a/benchmarks/harness/BUILD +++ b/benchmarks/harness/BUILD @@ -1,3 +1,4 @@ +load("//tools:defs.bzl", "pkg_tar") load("//tools:defs.bzl", "py_library", "py_requirement") package( @@ -5,9 +6,29 @@ package( licenses = ["notice"], ) +pkg_tar( + name = "installers", + srcs = [ + "//tools/installers:head", + "//tools/installers:master", + "//tools/installers:runsc", + ], + mode = "0755", +) + +filegroup( + name = "files", + srcs = [ + ":installers", + ], +) + py_library( name = "harness", srcs = ["__init__.py"], + data = [ + ":files", + ], ) py_library( diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py index 61fd25f73..15aa2a69a 100644 --- a/benchmarks/harness/__init__.py +++ b/benchmarks/harness/__init__.py @@ -1,5 +1,5 @@ # python3 -# Copyright 2019 Google LLC +# Copyright 2019 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,18 +15,48 @@ import getpass import os +import subprocess +import tempfile # LOCAL_WORKLOADS_PATH defines the path to use for local workloads. This is a # format string that accepts a single string parameter. -LOCAL_WORKLOADS_PATH = os.path.join( - os.path.dirname(__file__), "../workloads/{}/tar.tar") +LOCAL_WORKLOADS_PATH = os.path.dirname(__file__) + "/../workloads/{}/tar.tar" # REMOTE_WORKLOADS_PATH defines the path to use for storing the workloads on the # remote host. This is a format string that accepts a single string parameter. REMOTE_WORKLOADS_PATH = "workloads/{}" +# INSTALLER_ROOT is the set of files that needs to be copied. +INSTALLER_ARCHIVE = os.readlink(os.path.join( + os.path.dirname(__file__), "installers.tar")) + +# SSH_KEY_DIR holds SSH_PRIVATE_KEY for this run. bm-tools paramiko requires +# keys generated with the '-t rsa -m PEM' options from ssh-keygen. This is +# abstracted away from the user. +SSH_KEY_DIR = tempfile.TemporaryDirectory() +SSH_PRIVATE_KEY = "key" + # DEFAULT_USER is the default user running this script. DEFAULT_USER = getpass.getuser() # DEFAULT_USER_HOME is the home directory of the user running the script. DEFAULT_USER_HOME = os.environ["HOME"] if "HOME" in os.environ else "" + +# Default directory to remotely installer "installer" targets. +REMOTE_INSTALLERS_PATH = "installers" + + +def make_key(): + """Wraps a valid ssh key in a temporary directory.""" + path = os.path.join(SSH_KEY_DIR.name, SSH_PRIVATE_KEY) + if not os.path.exists(path): + cmd = "ssh-keygen -t rsa -m PEM -b 4096 -f {key} -q -N".format( + key=path).split(" ") + cmd.append("") + subprocess.run(cmd, check=True) + return path + + +def delete_key(): + """Deletes temporary directory containing private key.""" + SSH_KEY_DIR.cleanup() diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py index 2df4c9e31..3d32d3dda 100644 --- a/benchmarks/harness/machine.py +++ b/benchmarks/harness/machine.py @@ -1,5 +1,5 @@ # python3 -# Copyright 2019 Google LLC +# Copyright 2019 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,10 +29,11 @@ to run contianers. """ import logging +import os import re import subprocess import time -from typing import Tuple +from typing import List, Tuple import docker @@ -201,6 +202,7 @@ class RemoteMachine(Machine): self._tunnel = tunnel_dispatcher.Tunnel(name, **kwargs) self._tunnel.connect() self._docker_client = self._tunnel.get_docker_client() + self._has_installers = False def run(self, cmd: str) -> Tuple[str, str]: return self._ssh_connection.run(cmd) @@ -210,14 +212,45 @@ class RemoteMachine(Machine): stdout, stderr = self._ssh_connection.run("cat '{}'".format(path)) return stdout + stderr + def install(self, + installer: str, + results: List[bool] = None, + index: int = -1): + """Method unique to RemoteMachine to handle installation of installers. + + Handles installers, which install things that may change between runs (e.g. + runsc). Usually called from gcloud_producer, which expects this method to + to store results. + + Args: + installer: the installer target to run. + results: Passed by the caller of where to store success. + index: Index for this method to store the result in the passed results + list. + """ + # This generates a tarball of the full installer root (which will generate + # be the full bazel root directory) and sends it over. + if not self._has_installers: + archive = self._ssh_connection.send_installers() + self.run("tar -xvf {archive} -C {dir}".format( + archive=archive, dir=harness.REMOTE_INSTALLERS_PATH)) + self._has_installers = True + + # Execute the remote installer. + self.run("sudo {dir}/{file}".format( + dir=harness.REMOTE_INSTALLERS_PATH, file=installer)) + if results: + results[index] = True + def pull(self, workload: str) -> str: # Push to the remote machine and build. logging.info("Building %s@%s remotely...", workload, self._name) remote_path = self._ssh_connection.send_workload(workload) + remote_dir = os.path.dirname(remote_path) # Workloads are all tarballs. - self.run("tar -xvf {remote_path}/tar.tar -C {remote_path}".format( - remote_path=remote_path)) - self.run("docker build --tag={} {}".format(workload, remote_path)) + self.run("tar -xvf {remote_path} -C {remote_dir}".format( + remote_path=remote_path, remote_dir=remote_dir)) + self.run("docker build --tag={} {}".format(workload, remote_dir)) return workload # Workload is the tag. def container(self, image: str, **kwargs) -> container.Container: diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD index 48ea0ef39..3711a397f 100644 --- a/benchmarks/harness/machine_producers/BUILD +++ b/benchmarks/harness/machine_producers/BUILD @@ -76,5 +76,6 @@ py_test( python_version = "PY3", tags = [ "local", + "manual", ], ) diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py index e0b77d52b..513d16e4f 100644 --- a/benchmarks/harness/machine_producers/gcloud_producer.py +++ b/benchmarks/harness/machine_producers/gcloud_producer.py @@ -1,5 +1,5 @@ # python3 -# Copyright 2019 Google LLC +# Copyright 2019 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,12 +46,11 @@ class GCloudProducer(machine_producer.MachineProducer): Produces Machine objects backed by GCP instances. Attributes: - project: The GCP project name under which to create the machines. - ssh_key_file: path to a valid ssh private key. See README on vaild ssh keys. image: image name as a string. - image_project: image project as a string. - machine_type: type of GCP to create. e.g. n1-standard-4 zone: string to a valid GCP zone. + machine_type: type of GCP to create (e.g. n1-standard-4). + installers: list of installers post-boot. + ssh_key_file: path to a valid ssh private key. See README on vaild ssh keys. ssh_user: string of user name for ssh_key ssh_password: string of password for ssh key mock: a mock printer which will print mock data if required. Mock data is @@ -60,21 +59,19 @@ class GCloudProducer(machine_producer.MachineProducer): """ def __init__(self, - project: str, - ssh_key_file: str, image: str, - image_project: str, - machine_type: str, zone: str, + machine_type: str, + installers: List[str], + ssh_key_file: str, ssh_user: str, ssh_password: str, mock: gcloud_mock_recorder.MockPrinter = None): - self.project = project - self.ssh_key_file = ssh_key_file self.image = image - self.image_project = image_project - self.machine_type = machine_type self.zone = zone + self.machine_type = machine_type + self.installers = installers + self.ssh_key_file = ssh_key_file self.ssh_user = ssh_user self.ssh_password = ssh_password self.mock = mock @@ -87,10 +84,34 @@ class GCloudProducer(machine_producer.MachineProducer): "Cannot ask for {num} machines!".format(num=num_machines)) with self.condition: names = self._get_unique_names(num_machines) - self._build_instances(names) - instances = self._start_command(names) + instances = self._build_instances(names) self._add_ssh_key_to_instances(names) - return self._machines_from_instances(instances) + machines = self._machines_from_instances(instances) + + # Install all bits in lock-step. + # + # This will perform paralell installations for however many machines we + # have, but it's easy to track errors because if installing (a, b, c), we + # won't install "c" until "b" is installed on all machines. + for installer in self.installers: + threads = [None] * len(machines) + results = [False] * len(machines) + for i in range(len(machines)): + threads[i] = threading.Thread( + target=machines[i].install, args=(installer, results, i)) + threads[i].start() + for thread in threads: + thread.join() + for result in results: + if not result: + raise NotImplementedError( + "Installers failed on at least one machine!") + + # Add this user to each machine's docker group. + for m in machines: + m.run("sudo setfacl -m user:$USER:rw /var/run/docker.sock") + + return machines def release_machines(self, machine_list: List[machine.Machine]): """Releases the requested number of machines, deleting the instances.""" @@ -123,15 +144,7 @@ class GCloudProducer(machine_producer.MachineProducer): def _get_unique_names(self, num_names) -> List[str]: """Returns num_names unique names based on data from the GCP project.""" - curr_machines = self._list_machines() - curr_names = set([machine["name"] for machine in curr_machines]) - ret = [] - while len(ret) < num_names: - new_name = "machine-" + str(uuid.uuid4()) - if new_name not in curr_names: - ret.append(new_name) - curr_names.update(new_name) - return ret + return ["machine-" + str(uuid.uuid4()) for _ in range(0, num_names)] def _build_instances(self, names: List[str]) -> List[Dict[str, Any]]: """Creates instances using gcloud command. @@ -151,34 +164,9 @@ class GCloudProducer(machine_producer.MachineProducer): "_build_instances cannot create instances without names.") cmd = "gcloud compute instances create".split(" ") cmd.extend(names) - cmd.extend( - "--preemptible --image={image} --zone={zone} --machine-type={machine_type}" - .format( - image=self.image, zone=self.zone, - machine_type=self.machine_type).split(" ")) - if self.image_project: - cmd.append("--image-project={project}".format(project=self.image_project)) - res = self._run_command(cmd) - return json.loads(res.stdout) - - def _start_command(self, names): - """Starts instances using gcloud command. - - Runs the command `gcloud compute instances start` on list of instances by - name and returns json data on started instances on success. - - Args: - names: list of names of instances to start. - - Returns: - List of json data describing started machines. - """ - if not names: - raise ValueError("_start_command cannot start empty instance list.") - cmd = "gcloud compute instances start".split(" ") - cmd.extend(names) - cmd.append("--zone={zone}".format(zone=self.zone)) - cmd.append("--project={project}".format(project=self.project)) + cmd.append("--image=" + self.image) + cmd.append("--zone=" + self.zone) + cmd.append("--machine-type=" + self.machine_type) res = self._run_command(cmd) return json.loads(res.stdout) @@ -186,7 +174,7 @@ class GCloudProducer(machine_producer.MachineProducer): """Adds ssh key to instances by calling gcloud ssh command. Runs the command `gcloud compute ssh instance_name` on list of images by - name. Tries to ssh into given instance + name. Tries to ssh into given instance. Args: names: list of machine names to which to add the ssh-key @@ -202,30 +190,18 @@ class GCloudProducer(machine_producer.MachineProducer): cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_file)) cmd.append("--zone={zone}".format(zone=self.zone)) cmd.append("--command=uname") + cmd.append("--ssh-key-expire-after=60m") timeout = datetime.timedelta(seconds=5 * 60) start = datetime.datetime.now() while datetime.datetime.now() <= timeout + start: try: self._run_command(cmd) break - except subprocess.CalledProcessError as e: + except subprocess.CalledProcessError: if datetime.datetime.now() > timeout + start: raise TimeoutError( "Could not SSH into instance after 5 min: {name}".format( name=name)) - # 255 is the returncode for ssh connection refused. - elif e.returncode == 255: - - continue - else: - raise e - - def _list_machines(self) -> List[Dict[str, Any]]: - """Runs `list` gcloud command and returns list of Machine data.""" - cmd = "gcloud compute instances list --project {project}".format( - project=self.project).split(" ") - res = self._run_command(cmd) - return json.loads(res.stdout) def _run_command(self, cmd: List[str], @@ -261,7 +237,7 @@ class GCloudProducer(machine_producer.MachineProducer): self.mock.record(res) if res.returncode != 0: raise subprocess.CalledProcessError( - cmd=res.args, + cmd=" ".join(res.args), output=res.stdout, stderr=res.stderr, returncode=res.returncode) diff --git a/benchmarks/harness/ssh_connection.py b/benchmarks/harness/ssh_connection.py index e0bf258f1..a50e34293 100644 --- a/benchmarks/harness/ssh_connection.py +++ b/benchmarks/harness/ssh_connection.py @@ -1,5 +1,5 @@ # python3 -# Copyright 2019 Google LLC +# Copyright 2019 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ # limitations under the License. """SSHConnection handles the details of SSH connections.""" + import os import warnings @@ -24,18 +25,24 @@ from benchmarks import harness warnings.filterwarnings(action="ignore", module=".*paramiko.*") -def send_one_file(client: paramiko.SSHClient, path: str, remote_dir: str): +def send_one_file(client: paramiko.SSHClient, path: str, + remote_dir: str) -> str: """Sends a single file via an SSH client. Args: client: The existing SSH client. path: The local path. remote_dir: The remote directory. + + Returns: + :return: The remote path as a string. """ filename = path.split("/").pop() - client.exec_command("mkdir -p " + remote_dir) + if remote_dir != ".": + client.exec_command("mkdir -p " + remote_dir) with client.open_sftp() as ftp_client: ftp_client.put(path, os.path.join(remote_dir, filename)) + return os.path.join(remote_dir, filename) class SSHConnection: @@ -103,6 +110,12 @@ class SSHConnection: The remote path. """ with self._client() as client: - send_one_file(client, harness.LOCAL_WORKLOADS_PATH.format(name), - harness.REMOTE_WORKLOADS_PATH.format(name)) - return harness.REMOTE_WORKLOADS_PATH.format(name) + return send_one_file(client, harness.LOCAL_WORKLOADS_PATH.format(name), + harness.REMOTE_WORKLOADS_PATH.format(name)) + + def send_installers(self) -> str: + with self._client() as client: + return send_one_file( + client, + path=harness.INSTALLER_ARCHIVE, + remote_dir=harness.REMOTE_INSTALLERS_PATH) diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py index ba80d83d7..ba27dc69f 100644 --- a/benchmarks/runner/__init__.py +++ b/benchmarks/runner/__init__.py @@ -1,5 +1,5 @@ # python3 -# Copyright 2019 Google LLC +# Copyright 2019 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,13 +15,10 @@ import copy import csv -import json import logging -import os import pkgutil import pydoc import re -import subprocess import sys import types from typing import List @@ -123,57 +120,29 @@ def run_mock(ctx, **kwargs): @runner.command("run-gcp", commands.GCPCommand) @click.pass_context -def run_gcp(ctx, project: str, ssh_key_file: str, image: str, - image_project: str, machine_type: str, zone: str, ssh_user: str, - ssh_password: str, **kwargs): +def run_gcp(ctx, image_file: str, zone_file: str, machine_type: str, + installers: List[str], **kwargs): """Runs all benchmarks on GCP instances.""" - if not ssh_user: - ssh_user = harness.DEFAULT_USER - - # Get the default project if one was not provided. - if not project: - sub = subprocess.run( - "gcloud config get-value project".split(" "), stdout=subprocess.PIPE) - if sub.returncode: - raise ValueError( - "Cannot get default project from gcloud. Is it configured>") - project = sub.stdout.decode("utf-8").strip("\n") - - if not image_project: - image_project = project - - # Check that the ssh-key exists and is readable. - if not os.access(ssh_key_file, os.R_OK): - raise ValueError( - "ssh key given `{ssh_key}` is does not exist or is not readable." - .format(ssh_key=ssh_key_file)) - - # Check that the image exists. - sub = subprocess.run( - "gcloud compute images describe {image} --project {image_project} --format=json" - .format(image=image, image_project=image_project).split(" "), - stdout=subprocess.PIPE) - if sub.returncode or "READY" not in json.loads(sub.stdout)["status"]: - raise ValueError( - "given image was not found or is not ready: {image} {image_project}." - .format(image=image, image_project=image_project)) - - # Check and set zone to default. - if not zone: - sub = subprocess.run( - "gcloud config get-value compute/zone".split(" "), - stdout=subprocess.PIPE) - if sub.returncode: - raise ValueError( - "Default zone is not set in gcloud. Set one or pass a zone with the --zone flag." - ) - zone = sub.stdout.decode("utf-8").strip("\n") - - producer = gcloud_producer.GCloudProducer(project, ssh_key_file, image, - image_project, machine_type, zone, - ssh_user, ssh_password) - run(ctx, producer, **kwargs) + # Resolve all files. + image = open(image_file).read().rstrip() + zone = open(zone_file).read().rstrip() + + key_file = harness.make_key() + + producer = gcloud_producer.GCloudProducer( + image, + zone, + machine_type, + installers, + ssh_key_file=key_file, + ssh_user=harness.DEFAULT_USER, + ssh_password="") + + try: + run(ctx, producer, **kwargs) + finally: + harness.delete_key() def run(ctx, producer: machine_producer.MachineProducer, method: str, runs: int, diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py index 7ab12fac6..0fccb2fad 100644 --- a/benchmarks/runner/commands.py +++ b/benchmarks/runner/commands.py @@ -1,5 +1,5 @@ # python3 -# Copyright 2019 Google LLC +# Copyright 2019 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,9 +22,9 @@ def run_mock(**kwargs): # mock implementation """ -import click +import os -from benchmarks import harness +import click class RunCommand(click.core.Command): @@ -90,46 +90,40 @@ class GCPCommand(RunCommand): """GCPCommand inherits all flags from RunCommand and adds flags for run_gcp method. Attributes: - project: GCP project - ssh_key_path: path to the ssh-key to use for the run - image: name of the image to build machines from - image_project: GCP project under which to find image - zone: a GCP zone (e.g. us-west1-b) - ssh_user: username to use for the ssh-key - ssh_password: password to use for the ssh-key + image_file: name of the image to build machines from + zone_file: a GCP zone (e.g. us-west1-b) + installers: named installers for post-create + machine_type: type of machine to create (e.g. n1-standard-4) """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - project = click.core.Option( - ("--project",), - help="Project to run on if not default value given by 'gcloud config get-value project'." + image_file = click.core.Option( + ("--image_file",), + help="The file containing the image for VMs.", + default=os.path.join( + os.path.dirname(__file__), "../../tools/images/ubuntu1604.txt"), + ) + zone_file = click.core.Option( + ("--zone_file",), + help="The file containing the GCP zone.", + default=os.path.join( + os.path.dirname(__file__), "../../tools/images/zone.txt"), + ) + installers = click.core.Option( + ("--installers",), + help="The set of installers to use.", + multiple=True, + ) + machine_type = click.core.Option( + ("--machine_type",), + help="Type to make all machines.", + default="n1-standard-4", ) - ssh_key_path = click.core.Option( - ("--ssh-key-file",), - help="Path to a valid ssh private key to use. See README on generating a valid ssh key. Set to ~/.ssh/benchmark-tools by default.", - default=harness.DEFAULT_USER_HOME + "/.ssh/benchmark-tools") - image = click.core.Option(("--image",), - help="The image on which to build VMs.", - default="bm-tools-testing") - image_project = click.core.Option( - ("--image_project",), - help="The project under which the image to be used is listed.", - default="") - machine_type = click.core.Option(("--machine_type",), - help="Type to make all machines.", - default="n1-standard-4") - zone = click.core.Option(("--zone",), - help="The GCP zone to run on.", - default="") - ssh_user = click.core.Option(("--ssh-user",), - help="User for the ssh key.", - default=harness.DEFAULT_USER) - ssh_password = click.core.Option(("--ssh-password",), - help="Password for the ssh key.", - default="") self.params.extend([ - project, ssh_key_path, image, image_project, machine_type, zone, - ssh_user, ssh_password + image_file, + zone_file, + machine_type, + installers, ]) -- cgit v1.2.3