From b3f104507d7a04c0ca058cbcacc5ff78d853f4ba Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 6 Jun 2019 16:27:09 -0700
Subject: "Implement" mbind(2).

We still only advertise a single NUMA node, and ignore mempolicy
accordingly, but mbind() at least now succeeds and has effects reflected
by get_mempolicy().

Also fix handling of nodemasks: round sizes to unsigned long (as
documented and done by Linux), and zero trailing bits when copying them
out.

PiperOrigin-RevId: 251950859
---
 pkg/sentry/kernel/task.go                  |   7 +-
 pkg/sentry/kernel/task_sched.go            |   4 +-
 pkg/sentry/mm/mm.go                        |   6 +
 pkg/sentry/mm/syscalls.go                  |  53 +++++
 pkg/sentry/mm/vma.go                       |   3 +
 pkg/sentry/syscalls/linux/BUILD            |   1 +
 pkg/sentry/syscalls/linux/linux64.go       |   3 +-
 pkg/sentry/syscalls/linux/sys_mempolicy.go | 312 +++++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/sys_mmap.go      | 145 --------------
 9 files changed, 382 insertions(+), 152 deletions(-)
 create mode 100644 pkg/sentry/syscalls/linux/sys_mempolicy.go

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index f9378c2de..4d889422f 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -455,12 +455,13 @@ type Task struct {
 	// single numa node, all policies are no-ops. We only track this information
 	// so that we can return reasonable values if the application calls
 	// get_mempolicy(2) after setting a non-default policy. Note that in the
-	// real syscall, nodemask can be longer than 4 bytes, but we always report a
-	// single node so never need to save more than a single bit.
+	// real syscall, nodemask can be longer than a single unsigned long, but we
+	// always report a single node so never need to save more than a single
+	// bit.
 	//
 	// numaPolicy and numaNodeMask are protected by mu.
 	numaPolicy   int32
-	numaNodeMask uint32
+	numaNodeMask uint64
 
 	// If netns is true, the task is in a non-root network namespace. Network
 	// namespaces aren't currently implemented in full; being in a network
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 5455f6ea9..1c94ab11b 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -622,14 +622,14 @@ func (t *Task) SetNiceness(n int) {
 }
 
 // NumaPolicy returns t's current numa policy.
-func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) {
+func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 	return t.numaPolicy, t.numaNodeMask
 }
 
 // SetNumaPolicy sets t's numa policy.
-func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) {
+func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 	t.numaPolicy = policy
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 0a026ff8c..604866d04 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -276,6 +276,12 @@ type vma struct {
 
 	mlockMode memmap.MLockMode
 
+	// numaPolicy is the NUMA policy for this vma set by mbind().
+	numaPolicy int32
+
+	// numaNodemask is the NUMA nodemask for this vma set by mbind().
+	numaNodemask uint64
+
 	// If id is not nil, it controls the lifecycle of mappable and provides vma
 	// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
 	id memmap.MappingIdentity
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index af1e53f5d..9cf136532 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -973,6 +973,59 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error
 	return nil
 }
 
+// NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
+func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) {
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	vseg := mm.vmas.FindSegment(addr)
+	if !vseg.Ok() {
+		return 0, 0, syserror.EFAULT
+	}
+	vma := vseg.ValuePtr()
+	return vma.numaPolicy, vma.numaNodemask, nil
+}
+
+// SetNumaPolicy implements the semantics of Linux's mbind().
+func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy int32, nodemask uint64) error {
+	if !addr.IsPageAligned() {
+		return syserror.EINVAL
+	}
+	// Linux allows this to overflow.
+	la, _ := usermem.Addr(length).RoundUp()
+	ar, ok := addr.ToRange(uint64(la))
+	if !ok {
+		return syserror.EINVAL
+	}
+	if ar.Length() == 0 {
+		return nil
+	}
+
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	defer func() {
+		mm.vmas.MergeRange(ar)
+		mm.vmas.MergeAdjacent(ar)
+	}()
+	vseg := mm.vmas.LowerBoundSegment(ar.Start)
+	lastEnd := ar.Start
+	for {
+		if !vseg.Ok() || lastEnd < vseg.Start() {
+			// "EFAULT: ... there was an unmapped hole in the specified memory
+			// range specified [sic] by addr and len." - mbind(2)
+			return syserror.EFAULT
+		}
+		vseg = mm.vmas.Isolate(vseg, ar)
+		vma := vseg.ValuePtr()
+		vma.numaPolicy = policy
+		vma.numaNodemask = nodemask
+		lastEnd = vseg.End()
+		if ar.End <= lastEnd {
+			return nil
+		}
+		vseg, _ = vseg.NextNonEmpty()
+	}
+}
+
 // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
 func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
 	ar, ok := addr.ToRange(length)
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 02203f79f..0af8de5b0 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -107,6 +107,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp
 		private:        opts.Private,
 		growsDown:      opts.GrowsDown,
 		mlockMode:      opts.MLockMode,
+		numaPolicy:     linux.MPOL_DEFAULT,
 		id:             opts.MappingIdentity,
 		hint:           opts.Hint,
 	}
@@ -436,6 +437,8 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa
 		vma1.private != vma2.private ||
 		vma1.growsDown != vma2.growsDown ||
 		vma1.mlockMode != vma2.mlockMode ||
+		vma1.numaPolicy != vma2.numaPolicy ||
+		vma1.numaNodemask != vma2.numaNodemask ||
 		vma1.id != vma2.id ||
 		vma1.hint != vma2.hint {
 		return vma{}, false
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index f76989ae2..1c057526b 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -19,6 +19,7 @@ go_library(
         "sys_identity.go",
         "sys_inotify.go",
         "sys_lseek.go",
+        "sys_mempolicy.go",
         "sys_mmap.go",
         "sys_mount.go",
         "sys_pipe.go",
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 3e4d312af..ad88b1391 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -360,8 +360,7 @@ var AMD64 = &kernel.SyscallTable{
 		235: Utimes,
 		// @Syscall(Vserver, note:Not implemented by Linux)
 		236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux
-		// @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO(b/117792295)
-		237: syscalls.CapError(linux.CAP_SYS_NICE), // may require cap_sys_nice
+		237: Mbind,
 		238: SetMempolicy,
 		239: GetMempolicy,
 		//     240: @Syscall(MqOpen), TODO(b/29354921)
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go
new file mode 100644
index 000000000..652b2c206
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go
@@ -0,0 +1,312 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"fmt"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// We unconditionally report a single NUMA node. This also means that our
+// "nodemask_t" is a single unsigned long (uint64).
+const (
+	maxNodes        = 1
+	allowedNodemask = (1 << maxNodes) - 1
+)
+
+func copyInNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32) (uint64, error) {
+	// "nodemask points to a bit mask of node IDs that contains up to maxnode
+	// bits. The bit mask size is rounded to the next multiple of
+	// sizeof(unsigned long), but the kernel will use bits only up to maxnode.
+	// A NULL value of nodemask or a maxnode value of zero specifies the empty
+	// set of nodes. If the value of maxnode is zero, the nodemask argument is
+	// ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate
+	// because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses
+	// maxnode-1, not maxnode, as the number of bits.
+	bits := maxnode - 1
+	if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
+		return 0, syserror.EINVAL
+	}
+	if bits == 0 {
+		return 0, nil
+	}
+	// Copy in the whole nodemask.
+	numUint64 := (bits + 63) / 64
+	buf := t.CopyScratchBuffer(int(numUint64) * 8)
+	if _, err := t.CopyInBytes(addr, buf); err != nil {
+		return 0, err
+	}
+	val := usermem.ByteOrder.Uint64(buf)
+	// Check that only allowed bits in the first unsigned long in the nodemask
+	// are set.
+	if val&^allowedNodemask != 0 {
+		return 0, syserror.EINVAL
+	}
+	// Check that all remaining bits in the nodemask are 0.
+	for i := 8; i < len(buf); i++ {
+		if buf[i] != 0 {
+			return 0, syserror.EINVAL
+		}
+	}
+	return val, nil
+}
+
+func copyOutNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32, val uint64) error {
+	// mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of
+	// bits.
+	bits := maxnode - 1
+	if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
+		return syserror.EINVAL
+	}
+	if bits == 0 {
+		return nil
+	}
+	// Copy out the first unsigned long in the nodemask.
+	buf := t.CopyScratchBuffer(8)
+	usermem.ByteOrder.PutUint64(buf, val)
+	if _, err := t.CopyOutBytes(addr, buf); err != nil {
+		return err
+	}
+	// Zero out remaining unsigned longs in the nodemask.
+	if bits > 64 {
+		remAddr, ok := addr.AddLength(8)
+		if !ok {
+			return syserror.EFAULT
+		}
+		remUint64 := (bits - 1) / 64
+		if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// GetMempolicy implements the syscall get_mempolicy(2).
+func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	mode := args[0].Pointer()
+	nodemask := args[1].Pointer()
+	maxnode := args[2].Uint()
+	addr := args[3].Pointer()
+	flags := args[4].Uint()
+
+	if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	nodeFlag := flags&linux.MPOL_F_NODE != 0
+	addrFlag := flags&linux.MPOL_F_ADDR != 0
+	memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
+
+	// "EINVAL: The value specified by maxnode is less than the number of node
+	// IDs supported by the system." - get_mempolicy(2)
+	if nodemask != 0 && maxnode < maxNodes {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is
+	// ignored and the set of nodes (memories) that the thread is allowed to
+	// specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the
+	// absence of any mode flags) is returned in nodemask."
+	if memsAllowed {
+		// "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either
+		// MPOL_F_ADDR or MPOL_F_NODE."
+		if nodeFlag || addrFlag {
+			return 0, nil, syserror.EINVAL
+		}
+		if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, nil
+	}
+
+	// "If flags specifies MPOL_F_ADDR, then information is returned about the
+	// policy governing the memory address given in addr. ... If the mode
+	// argument is not NULL, then get_mempolicy() will store the policy mode
+	// and any optional mode flags of the requested NUMA policy in the location
+	// pointed to by this argument. If nodemask is not NULL, then the nodemask
+	// associated with the policy will be stored in the location pointed to by
+	// this argument."
+	if addrFlag {
+		policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if nodeFlag {
+			// "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR,
+			// get_mempolicy() will return the node ID of the node on which the
+			// address addr is allocated into the location pointed to by mode.
+			// If no page has yet been allocated for the specified address,
+			// get_mempolicy() will allocate a page as if the thread had
+			// performed a read (load) access to that address, and return the
+			// ID of the node where that page was allocated."
+			buf := t.CopyScratchBuffer(1)
+			_, err := t.CopyInBytes(addr, buf)
+			if err != nil {
+				return 0, nil, err
+			}
+			policy = 0 // maxNodes == 1
+		}
+		if mode != 0 {
+			if _, err := t.CopyOut(mode, policy); err != nil {
+				return 0, nil, err
+			}
+		}
+		if nodemask != 0 {
+			if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+				return 0, nil, err
+			}
+		}
+		return 0, nil, nil
+	}
+
+	// "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did
+	// not specify MPOL_F_ADDR and addr is not NULL." This is partially
+	// inaccurate: if flags specifies MPOL_F_ADDR,
+	// mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will
+	// just (usually) fail to find a VMA at address 0 and return EFAULT.
+	if addr != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "If flags is specified as 0, then information about the calling thread's
+	// default policy (as set by set_mempolicy(2)) is returned, in the buffers
+	// pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but
+	// not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE,
+	// then get_mempolicy() will return in the location pointed to by a
+	// non-NULL mode argument, the node ID of the next node that will be used
+	// for interleaving of internal kernel pages allocated on behalf of the
+	// thread."
+	policy, nodemaskVal := t.NumaPolicy()
+	if nodeFlag {
+		if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE {
+			return 0, nil, syserror.EINVAL
+		}
+		policy = 0 // maxNodes == 1
+	}
+	if mode != 0 {
+		if _, err := t.CopyOut(mode, policy); err != nil {
+			return 0, nil, err
+		}
+	}
+	if nodemask != 0 {
+		if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// SetMempolicy implements the syscall set_mempolicy(2).
+func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	modeWithFlags := args[0].Int()
+	nodemask := args[1].Pointer()
+	maxnode := args[2].Uint()
+
+	modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	t.SetNumaPolicy(modeWithFlags, nodemaskVal)
+	return 0, nil, nil
+}
+
+// Mbind implements the syscall mbind(2).
+func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Uint64()
+	mode := args[2].Int()
+	nodemask := args[3].Pointer()
+	maxnode := args[4].Uint()
+	flags := args[5].Uint()
+
+	if flags&^linux.MPOL_MF_VALID != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	// "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be
+	// privileged (CAP_SYS_NICE) to use this flag." - mbind(2)
+	if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) {
+		return 0, nil, syserror.EPERM
+	}
+
+	mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Since we claim to have only a single node, all flags can be ignored
+	// (since all pages must already be on that single node).
+	err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal)
+	return 0, nil, err
+}
+
+func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags int32, nodemask usermem.Addr, maxnode uint32) (int32, uint64, error) {
+	flags := modeWithFlags & linux.MPOL_MODE_FLAGS
+	mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
+	if flags == linux.MPOL_MODE_FLAGS {
+		// Can't specify both mode flags simultaneously.
+		return 0, 0, syserror.EINVAL
+	}
+	if mode < 0 || mode >= linux.MPOL_MAX {
+		// Must specify a valid mode.
+		return 0, 0, syserror.EINVAL
+	}
+
+	var nodemaskVal uint64
+	if nodemask != 0 {
+		var err error
+		nodemaskVal, err = copyInNodemask(t, nodemask, maxnode)
+		if err != nil {
+			return 0, 0, err
+		}
+	}
+
+	switch mode {
+	case linux.MPOL_DEFAULT:
+		// "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate;
+		// Linux allows a nodemask to be specified, as long as it is empty.
+		if nodemaskVal != 0 {
+			return 0, 0, syserror.EINVAL
+		}
+	case linux.MPOL_BIND, linux.MPOL_INTERLEAVE:
+		// These require a non-empty nodemask.
+		if nodemaskVal == 0 {
+			return 0, 0, syserror.EINVAL
+		}
+	case linux.MPOL_PREFERRED:
+		// This permits an empty nodemask, as long as no flags are set.
+		if nodemaskVal == 0 && flags != 0 {
+			return 0, 0, syserror.EINVAL
+		}
+	case linux.MPOL_LOCAL:
+		// This requires an empty nodemask and no flags set ...
+		if nodemaskVal != 0 || flags != 0 {
+			return 0, 0, syserror.EINVAL
+		}
+		// ... and is implemented as MPOL_PREFERRED.
+		mode = linux.MPOL_PREFERRED
+	default:
+		// Unknown mode, which we should have rejected above.
+		panic(fmt.Sprintf("unknown mode: %v", mode))
+	}
+
+	return mode | flags, nodemaskVal, nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 64a6e639c..9926f0ac5 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -204,151 +204,6 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 }
 
-func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) {
-	if ptr != 0 {
-		return t.CopyOut(ptr, val)
-	}
-	return 0, nil
-}
-
-// GetMempolicy implements the syscall get_mempolicy(2).
-func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	mode := args[0].Pointer()
-	nodemask := args[1].Pointer()
-	maxnode := args[2].Uint()
-	addr := args[3].Pointer()
-	flags := args[4].Uint()
-
-	memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
-	nodeFlag := flags&linux.MPOL_F_NODE != 0
-	addrFlag := flags&linux.MPOL_F_ADDR != 0
-
-	// TODO(rahat): Once sysfs is implemented, report a single numa node in
-	// /sys/devices/system/node.
-	if nodemask != 0 && maxnode < 1 {
-		return 0, nil, syserror.EINVAL
-	}
-
-	// 'addr' provided iff 'addrFlag' set.
-	if addrFlag == (addr == 0) {
-		return 0, nil, syserror.EINVAL
-	}
-
-	// Default policy for the thread.
-	if flags == 0 {
-		policy, nodemaskVal := t.NumaPolicy()
-		if _, err := copyOutIfNotNull(t, mode, policy); err != nil {
-			return 0, nil, syserror.EFAULT
-		}
-		if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil {
-			return 0, nil, syserror.EFAULT
-		}
-		return 0, nil, nil
-	}
-
-	// Report all nodes available to caller.
-	if memsAllowed {
-		// MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED.
-		if nodeFlag || addrFlag {
-			return 0, nil, syserror.EINVAL
-		}
-
-		// Report a single numa node.
-		if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil {
-			return 0, nil, syserror.EFAULT
-		}
-		return 0, nil, nil
-	}
-
-	if addrFlag {
-		if nodeFlag {
-			// Return the id for the node where 'addr' resides, via 'mode'.
-			//
-			// The real get_mempolicy(2) allocates the page referenced by 'addr'
-			// by simulating a read, if it is unallocated before the call. It
-			// then returns the node the page is allocated on through the mode
-			// pointer.
-			b := t.CopyScratchBuffer(1)
-			_, err := t.CopyInBytes(addr, b)
-			if err != nil {
-				return 0, nil, syserror.EFAULT
-			}
-			if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil {
-				return 0, nil, syserror.EFAULT
-			}
-		} else {
-			storedPolicy, _ := t.NumaPolicy()
-			// Return the policy governing the memory referenced by 'addr'.
-			if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil {
-				return 0, nil, syserror.EFAULT
-			}
-		}
-		return 0, nil, nil
-	}
-
-	storedPolicy, _ := t.NumaPolicy()
-	if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) {
-		// Policy for current thread is to interleave memory between
-		// nodes. Return the next node we'll allocate on. Since we only have a
-		// single node, this is always node 0.
-		if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil {
-			return 0, nil, syserror.EFAULT
-		}
-		return 0, nil, nil
-	}
-
-	return 0, nil, syserror.EINVAL
-}
-
-func allowedNodesMask() uint32 {
-	const maxNodes = 1
-	return ^uint32((1 << maxNodes) - 1)
-}
-
-// SetMempolicy implements the syscall set_mempolicy(2).
-func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	modeWithFlags := args[0].Int()
-	nodemask := args[1].Pointer()
-	maxnode := args[2].Uint()
-
-	if nodemask != 0 && maxnode < 1 {
-		return 0, nil, syserror.EINVAL
-	}
-
-	if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS {
-		// Can't specify multiple modes simultaneously.
-		return 0, nil, syserror.EINVAL
-	}
-
-	mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
-	if mode < 0 || mode >= linux.MPOL_MAX {
-		// Must specify a valid mode.
-		return 0, nil, syserror.EINVAL
-	}
-
-	var nodemaskVal uint32
-	// Nodemask may be empty for some policy modes.
-	if nodemask != 0 && maxnode > 0 {
-		if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil {
-			return 0, nil, syserror.EFAULT
-		}
-	}
-
-	if (mode == linux.MPOL_INTERLEAVE || mode == linux.MPOL_BIND) && nodemaskVal == 0 {
-		// Mode requires a non-empty nodemask, but got an empty nodemask.
-		return 0, nil, syserror.EINVAL
-	}
-
-	if nodemaskVal&allowedNodesMask() != 0 {
-		// Invalid node specified.
-		return 0, nil, syserror.EINVAL
-	}
-
-	t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal)
-
-	return 0, nil, nil
-}
-
 // Mincore implements the syscall mincore(2).
 func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
-- 
cgit v1.2.3