3 files changed, 707 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
new file mode 100644
index 000000000..1656ad126
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -0,0 +1,62 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_template_instance(
+    name = "waiter_list",
+    out = "waiter_list.go",
+    package = "semaphore",
+    prefix = "waiter",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Linker": "*waiter",
+    },
+)
+
+go_stateify(
+    name = "semaphore_state",
+    srcs = [
+        "semaphore.go",
+        "waiter_list.go",
+    ],
+    out = "semaphore_autogen_state.go",
+    package = "semaphore",
+)
+
+go_library(
+    name = "semaphore",
+    srcs = [
+        "semaphore.go",
+        "semaphore_autogen_state.go",
+        "waiter_list.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/sentry/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/state",
+        "//pkg/state/statefile",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "semaphore_test",
+    size = "small",
+    srcs = ["semaphore_test.go"],
+    embed = [":semaphore"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
new file mode 100644
index 000000000..19ad5d537
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -0,0 +1,473 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package semaphore implements System V semaphores.
+package semaphore
+
+import (
+	"sync"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+	valueMax = 32767 // SEMVMX
+
+	// semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL).
+	semaphoresMax = 32000
+
+	// setMax is "system-wide limit on the number of semaphore sets" (SEMMNI).
+	setsMax = 32000
+
+	// semaphoresTotalMax is "system-wide limit on the number of semaphores"
+	// (SEMMNS = SEMMNI*SEMMSL).
+	semaphoresTotalMax = 1024000000
+)
+
+// Registry maintains a set of semaphores that can be found by key or ID.
+type Registry struct {
+	// mu protects all fields below.
+	mu         sync.Mutex `state:"nosave"`
+	semaphores map[int32]*Set
+	lastIDUsed int32
+}
+
+// Set represents a set of semaphores that can be operated atomically.
+type Set struct {
+	// Id is a handle that identifies the set.
+	ID int32
+
+	// key is an user provided key that can be shared between processes.
+	key int32
+
+	// creator is the user that created the set. Immutable.
+	creator fs.FileOwner
+
+	// mu protects all fields below.
+	mu         sync.Mutex `state:"nosave"`
+	owner      fs.FileOwner
+	perms      fs.FilePermissions
+	opTime     ktime.Time
+	changeTime ktime.Time
+	sems       []sem
+
+	// dead is set to true when the set is removed and can't be reached anymore.
+	// All waiters must wake up and fail when set is dead.
+	dead bool
+}
+
+// sem represents a single semanphore from a set.
+type sem struct {
+	value   int16
+	waiters waiterList `state:"zerovalue"`
+}
+
+// waiter represents a caller that is waiting for the semaphore value to
+// become positive or zero.
+type waiter struct {
+	waiterEntry
+
+	// value represents how much resource the waiter needs to wake up.
+	value int16
+	ch    chan struct{}
+}
+
+// NewRegistry creates a new semaphore set registry.
+func NewRegistry() *Registry {
+	return &Registry{semaphores: make(map[int32]*Set)}
+}
+
+// FindOrCreate searches for a semaphore set that matches 'key'. If not found,
+// it may create a new one if requested. If private is true, key is ignored and
+// a new set is always created. If create is false, it fails if a set cannot
+// be found. If exclusive is true, it fails if a set with the same key already
+// exists.
+func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
+	if nsems < 0 || nsems > semaphoresMax {
+		return nil, syserror.EINVAL
+	}
+
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if !private {
+		// Look up an existing semaphore.
+		if set := r.findByKey(key); set != nil {
+			// Check that caller can access semaphore set.
+			creds := auth.CredentialsFromContext(ctx)
+			if !set.checkPerms(creds, fs.PermsFromMode(mode)) {
+				return nil, syserror.EACCES
+			}
+
+			// Validate parameters.
+			if nsems > int32(set.size()) {
+				return nil, syserror.EINVAL
+			}
+			if create && exclusive {
+				return nil, syserror.EEXIST
+			}
+			return set, nil
+		}
+
+		if !create {
+			// Semaphore not found and should not be created.
+			return nil, syserror.ENOENT
+		}
+	}
+
+	// Zero is only valid if an existing set is found.
+	if nsems == 0 {
+		return nil, syserror.EINVAL
+	}
+
+	// Apply system limits.
+	if len(r.semaphores) >= setsMax {
+		return nil, syserror.EINVAL
+	}
+	if r.totalSems() > int(semaphoresTotalMax-nsems) {
+		return nil, syserror.EINVAL
+	}
+
+	// Finally create a new set.
+	owner := fs.FileOwnerFromContext(ctx)
+	perms := fs.FilePermsFromMode(mode)
+	return r.newSet(ctx, key, owner, owner, perms, nsems)
+}
+
+// RemoveID removes set with give 'id' from the registry and marks the set as
+// dead. All waiters will be awakened and fail.
+func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	set := r.semaphores[id]
+	if set == nil {
+		return syserror.EINVAL
+	}
+
+	// "The effective user ID of the calling process must match the creator or
+	// owner of the semaphore set, or the caller must be privileged."
+	if !set.checkCredentials(creds) && !set.checkCapability(creds) {
+		return syserror.EACCES
+	}
+
+	delete(r.semaphores, set.ID)
+	set.destroy()
+	return nil
+}
+
+func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) {
+	set := &Set{
+		key:        key,
+		owner:      owner,
+		creator:    owner,
+		perms:      perms,
+		changeTime: ktime.NowFromContext(ctx),
+		sems:       make([]sem, nsems),
+	}
+
+	// Find the next available ID.
+	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
+		// Handle wrap around.
+		if id < 0 {
+			id = 0
+			continue
+		}
+		if r.semaphores[id] == nil {
+			r.lastIDUsed = id
+			r.semaphores[id] = set
+			set.ID = id
+			return set, nil
+		}
+	}
+
+	log.Warningf("Semaphore map is full, they must be leaking")
+	return nil, syserror.ENOMEM
+}
+
+// FindByID looks up a set given an ID.
+func (r *Registry) FindByID(id int32) *Set {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	return r.semaphores[id]
+}
+
+func (r *Registry) findByKey(key int32) *Set {
+	for _, v := range r.semaphores {
+		if v.key == key {
+			return v
+		}
+	}
+	return nil
+}
+
+func (r *Registry) totalSems() int {
+	totalSems := 0
+	for _, v := range r.semaphores {
+		totalSems += v.size()
+	}
+	return totalSems
+}
+
+func (s *Set) findSem(num int32) *sem {
+	if num < 0 || int(num) >= s.size() {
+		return nil
+	}
+	return &s.sems[num]
+}
+
+func (s *Set) size() int {
+	return len(s.sems)
+}
+
+// Change changes some fields from the set atomically.
+func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// "The effective UID of the calling process must match the owner or creator
+	// of the semaphore set, or the caller must be privileged."
+	if !s.checkCredentials(creds) && !s.checkCapability(creds) {
+		return syserror.EACCES
+	}
+
+	s.owner = owner
+	s.perms = perms
+	s.changeTime = ktime.NowFromContext(ctx)
+	return nil
+}
+
+// SetVal overrides a semaphore value, waking up waiters as needed.
+func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials) error {
+	if val < 0 || val > valueMax {
+		return syserror.ERANGE
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// "The calling process must have alter permission on the semaphore set."
+	if !s.checkPerms(creds, fs.PermMask{Write: true}) {
+		return syserror.EACCES
+	}
+
+	sem := s.findSem(num)
+	if sem == nil {
+		return syserror.ERANGE
+	}
+
+	// TODO: Clear undo entries in all processes
+	sem.value = val
+	s.changeTime = ktime.NowFromContext(ctx)
+	sem.wakeWaiters()
+	return nil
+}
+
+// GetVal returns a semaphore value.
+func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// "The calling process must have read permission on the semaphore set."
+	if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+		return 0, syserror.EACCES
+	}
+
+	sem := s.findSem(num)
+	if sem == nil {
+		return 0, syserror.ERANGE
+	}
+	return sem.value, nil
+}
+
+// ExecuteOps attempts to execute a list of operations to the set. It only
+// suceeds when all operations can be applied. No changes are made if it fails.
+//
+// On failure, it may return an error (retries are hopeless) or it may return
+// a channel that can be waited on before attempting again.
+func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials) (chan struct{}, int32, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Did it race with a removal operation?
+	if s.dead {
+		return nil, 0, syserror.EIDRM
+	}
+
+	// Validate the operations.
+	readOnly := true
+	for _, op := range ops {
+		if s.findSem(int32(op.SemNum)) == nil {
+			return nil, 0, syserror.EFBIG
+		}
+		if op.SemOp != 0 {
+			readOnly = false
+		}
+	}
+
+	if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) {
+		return nil, 0, syserror.EACCES
+	}
+
+	ch, num, err := s.executeOps(ctx, ops)
+	if err != nil {
+		return nil, 0, err
+	}
+	return ch, num, nil
+}
+
+func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf) (chan struct{}, int32, error) {
+	// Changes to semaphores go to this slice temporarily until they all succeed.
+	tmpVals := make([]int16, len(s.sems))
+	for i := range s.sems {
+		tmpVals[i] = s.sems[i].value
+	}
+
+	for _, op := range ops {
+		sem := &s.sems[op.SemNum]
+		if op.SemOp == 0 {
+			// Handle 'wait for zero' operation.
+			if tmpVals[op.SemNum] != 0 {
+				// Semaphore isn't 0, must wait.
+				if op.SemFlg&linux.IPC_NOWAIT != 0 {
+					return nil, 0, syserror.ErrWouldBlock
+				}
+
+				w := newWaiter(op.SemOp)
+				sem.waiters.PushBack(w)
+				return w.ch, int32(op.SemNum), nil
+			}
+		} else {
+			if op.SemOp < 0 {
+				// Handle 'wait' operation.
+				if -op.SemOp > valueMax {
+					return nil, 0, syserror.ERANGE
+				}
+				if -op.SemOp > tmpVals[op.SemNum] {
+					// Not enough resources, must wait.
+					if op.SemFlg&linux.IPC_NOWAIT != 0 {
+						return nil, 0, syserror.ErrWouldBlock
+					}
+
+					w := newWaiter(op.SemOp)
+					sem.waiters.PushBack(w)
+					return w.ch, int32(op.SemNum), nil
+				}
+			} else {
+				// op.SemOp > 0: Handle 'signal' operation.
+				if tmpVals[op.SemNum] > valueMax-op.SemOp {
+					return nil, 0, syserror.ERANGE
+				}
+			}
+
+			tmpVals[op.SemNum] += op.SemOp
+		}
+	}
+
+	// All operations succeeded, apply them.
+	// TODO: handle undo operations.
+	for i, v := range tmpVals {
+		s.sems[i].value = v
+		s.sems[i].wakeWaiters()
+	}
+	s.opTime = ktime.NowFromContext(ctx)
+	return nil, 0, nil
+}
+
+// AbortWait notifies that a waiter is giving up and will not wait on the
+// channel anymore.
+func (s *Set) AbortWait(num int32, ch chan struct{}) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	sem := &s.sems[num]
+	for w := sem.waiters.Front(); w != nil; w = w.Next() {
+		if w.ch == ch {
+			sem.waiters.Remove(w)
+			return
+		}
+	}
+	// Waiter may not be found in case it raced with wakeWaiters().
+}
+
+func (s *Set) checkCredentials(creds *auth.Credentials) bool {
+	return s.owner.UID == creds.EffectiveKUID ||
+		s.owner.GID == creds.EffectiveKGID ||
+		s.creator.UID == creds.EffectiveKUID ||
+		s.creator.GID == creds.EffectiveKGID
+}
+
+func (s *Set) checkCapability(creds *auth.Credentials) bool {
+	return creds.HasCapability(linux.CAP_IPC_OWNER) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok()
+}
+
+func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool {
+	// Are we owner, or in group, or other?
+	p := s.perms.Other
+	if s.owner.UID == creds.EffectiveKUID {
+		p = s.perms.User
+	} else if creds.InGroup(s.owner.GID) {
+		p = s.perms.Group
+	}
+
+	// Are permissions satisfied without capability checks?
+	if p.SupersetOf(reqPerms) {
+		return true
+	}
+
+	return s.checkCapability(creds)
+}
+
+func (s *Set) destroy() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Notify all waiters. Tney will fail on the next attempt to execute
+	// operations and return error.
+	s.dead = true
+	for _, s := range s.sems {
+		for w := s.waiters.Front(); w != nil; w = w.Next() {
+			w.ch <- struct{}{}
+		}
+		s.waiters.Reset()
+	}
+}
+
+// wakeWaiters goes over all waiters and checks which of them can be notified.
+func (s *sem) wakeWaiters() {
+	// Note that this will release all waiters waiting for 0 too.
+	for w := s.waiters.Front(); w != nil; {
+		if s.value < w.value {
+			// Still blocked, skip it.
+			continue
+		}
+		w.ch <- struct{}{}
+		old := w
+		w = w.Next()
+		s.waiters.Remove(old)
+	}
+}
+
+func newWaiter(val int16) *waiter {
+	return &waiter{
+		value: val,
+		ch:    make(chan struct{}, 1),
+	}
+}
diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go
new file mode 100644
index 000000000..0386586ab
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/semaphore_test.go
@@ -0,0 +1,172 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package semaphore
+
+import (
+	"testing"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} {
+	ch, _, err := set.executeOps(ctx, ops)
+	if err != nil {
+		t.Fatalf("ExecuteOps(ops) failed, err: %v, ops: %+v", err, ops)
+	}
+	if block {
+		if ch == nil {
+			t.Fatalf("ExecuteOps(ops) got: nil, expected: !nil, ops: %+v", ops)
+		}
+		if signalled(ch) {
+			t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops)
+		}
+	} else {
+		if ch != nil {
+			t.Fatalf("ExecuteOps(ops) got: %v, expected: nil, ops: %+v", ch, ops)
+		}
+	}
+	return ch
+}
+
+func signalled(ch chan struct{}) bool {
+	select {
+	case <-ch:
+		return true
+	default:
+		return false
+	}
+}
+
+func TestBasic(t *testing.T) {
+	ctx := contexttest.Context(t)
+	set := &Set{ID: 123, sems: make([]sem, 1)}
+	ops := []linux.Sembuf{
+		linux.Sembuf{SemOp: 1},
+	}
+	executeOps(ctx, t, set, ops, false)
+
+	ops[0].SemOp = -1
+	executeOps(ctx, t, set, ops, false)
+
+	ops[0].SemOp = -1
+	ch1 := executeOps(ctx, t, set, ops, true)
+
+	ops[0].SemOp = 1
+	executeOps(ctx, t, set, ops, false)
+	if !signalled(ch1) {
+		t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops)
+	}
+}
+
+func TestWaitForZero(t *testing.T) {
+	ctx := contexttest.Context(t)
+	set := &Set{ID: 123, sems: make([]sem, 1)}
+	ops := []linux.Sembuf{
+		linux.Sembuf{SemOp: 0},
+	}
+	executeOps(ctx, t, set, ops, false)
+
+	ops[0].SemOp = -2
+	ch1 := executeOps(ctx, t, set, ops, true)
+
+	ops[0].SemOp = 0
+	executeOps(ctx, t, set, ops, false)
+
+	ops[0].SemOp = 1
+	executeOps(ctx, t, set, ops, false)
+
+	ops[0].SemOp = 0
+	chZero1 := executeOps(ctx, t, set, ops, true)
+
+	ops[0].SemOp = 0
+	chZero2 := executeOps(ctx, t, set, ops, true)
+
+	ops[0].SemOp = 1
+	executeOps(ctx, t, set, ops, false)
+	if !signalled(ch1) {
+		t.Fatalf("ExecuteOps(ops) channel should have been signalled, ops: %+v, set: %+v", ops, set)
+	}
+
+	ops[0].SemOp = -2
+	executeOps(ctx, t, set, ops, false)
+	if !signalled(chZero1) {
+		t.Fatalf("ExecuteOps(ops) channel zero 1 should have been signalled, ops: %+v, set: %+v", ops, set)
+	}
+	if !signalled(chZero2) {
+		t.Fatalf("ExecuteOps(ops) channel zero 2 should have been signalled, ops: %+v, set: %+v", ops, set)
+	}
+}
+
+func TestNoWait(t *testing.T) {
+	ctx := contexttest.Context(t)
+	set := &Set{ID: 123, sems: make([]sem, 1)}
+	ops := []linux.Sembuf{
+		linux.Sembuf{SemOp: 1},
+	}
+	executeOps(ctx, t, set, ops, false)
+
+	ops[0].SemOp = -2
+	ops[0].SemFlg = linux.IPC_NOWAIT
+	if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock {
+		t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock)
+	}
+
+	ops[0].SemOp = 0
+	ops[0].SemFlg = linux.IPC_NOWAIT
+	if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock {
+		t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock)
+	}
+}
+
+func TestUnregister(t *testing.T) {
+	ctx := contexttest.Context(t)
+	r := NewRegistry()
+	set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true)
+	if err != nil {
+		t.Fatalf("FindOrCreate() failed, err: %v", err)
+	}
+	if got := r.FindByID(set.ID); got.ID != set.ID {
+		t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set)
+	}
+
+	ops := []linux.Sembuf{
+		linux.Sembuf{SemOp: -1},
+	}
+	chs := make([]chan struct{}, 0, 5)
+	for i := 0; i < 5; i++ {
+		ch := executeOps(ctx, t, set, ops, true)
+		chs = append(chs, ch)
+	}
+
+	creds := auth.CredentialsFromContext(ctx)
+	if err := r.RemoveID(set.ID, creds); err != nil {
+		t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err)
+	}
+	if !set.dead {
+		t.Fatalf("set is not dead: %+v", set)
+	}
+	if got := r.FindByID(set.ID); got != nil {
+		t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.ID, got)
+	}
+	for i, ch := range chs {
+		if !signalled(ch) {
+			t.Fatalf("channel %d should have been signalled", i)
+		}
+	}
+}