summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/arch/arch_amd64.go
blob: 3b3a0a27255a7ff5913f3834fe232828fad33a7a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package arch

import (
	"bytes"
	"fmt"
	"math/rand"
	"syscall"

	"gvisor.dev/gvisor/pkg/cpuid"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/usermem"
)

// Host specifies the host architecture.
const Host = AMD64

// These constants come directly from Linux.
const (
	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
	// for a 64-bit process.
	maxAddr64 usermem.Addr = (1 << 47) - usermem.PageSize

	// maxStackRand64 is the maximum randomization to apply to the stack.
	// It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux.
	maxStackRand64 = 16 << 30 // 16 GB

	// maxMmapRand64 is the maximum randomization to apply to the mmap
	// layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux.
	maxMmapRand64 = (1 << 28) * usermem.PageSize

	// minGap64 is the minimum gap to leave at the top of the address space
	// for the stack. It is defined by arch/x86/mm/mmap.c:MIN_GAP in Linux.
	minGap64 = (128 << 20) + maxStackRand64

	// preferredPIELoadAddr is the standard Linux position-independent
	// executable base load address. It is ELF_ET_DYN_BASE in Linux.
	//
	// The Platform {Min,Max}UserAddress() may preclude loading at this
	// address. See other preferredFoo comments below.
	preferredPIELoadAddr usermem.Addr = maxAddr64 / 3 * 2
)

// These constants are selected as heuristics to help make the Platform's
// potentially limited address space conform as closely to Linux as possible.
const (
	// Select a preferred minimum TopDownBase address.
	//
	// Some applications (TSAN and other *SANs) are very particular about
	// the way the Linux mmap allocator layouts out the address space.
	//
	// TSAN in particular expects top down allocations to be made in the
	// range [0x7e8000000000, 0x800000000000).
	//
	// The minimum TopDownBase on Linux would be:
	// 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000.
	//
	// (minGap64 because TSAN uses a small RLIMIT_STACK.)
	//
	// 0x7e8000000000 is selected arbitrarily by TSAN to leave room for
	// allocations below TopDownBase.
	//
	// N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all
	// the way down to 0x10007fff8000, and MSAN down to 0x700000000000.
	//
	// Of course, there is no hard minimum to allocation; an allocator can
	// search all the way from TopDownBase to Min. However, TSAN declared
	// their range "good enough".
	//
	// We would like to pick a TopDownBase such that it is unlikely that an
	// allocator will select an address below TSAN's minimum. We achieve
	// this by trying to leave a sizable gap below TopDownBase.
	//
	// This is all "preferred" because the layout min/max address may not
	// allow us to select such a TopDownBase, in which case we have to fall
	// back to a layout that TSAN may not be happy with.
	preferredTopDownAllocMin usermem.Addr = 0x7e8000000000
	preferredAllocationGap                = 128 << 30 // 128 GB
	preferredTopDownBaseMin               = preferredTopDownAllocMin + preferredAllocationGap

	// minMmapRand64 is the smallest we are willing to make the
	// randomization to stay above preferredTopDownBaseMin.
	minMmapRand64 = (1 << 26) * usermem.PageSize
)

// context64 represents an AMD64 context.
//
// +stateify savable
type context64 struct {
	State
	sigFPState []x86FPState // fpstate to be restored on sigreturn.
}

// Arch implements Context.Arch.
func (c *context64) Arch() Arch {
	return AMD64
}

func (c *context64) copySigFPState() []x86FPState {
	var sigfps []x86FPState
	for _, s := range c.sigFPState {
		sigfps = append(sigfps, s.fork())
	}
	return sigfps
}

// Fork returns an exact copy of this context.
func (c *context64) Fork() Context {
	return &context64{
		State:      c.State.Fork(),
		sigFPState: c.copySigFPState(),
	}
}

// Return returns the current syscall return value.
func (c *context64) Return() uintptr {
	return uintptr(c.Regs.Rax)
}

// SetReturn sets the syscall return value.
func (c *context64) SetReturn(value uintptr) {
	c.Regs.Rax = uint64(value)
}

// IP returns the current instruction pointer.
func (c *context64) IP() uintptr {
	return uintptr(c.Regs.Rip)
}

// SetIP sets the current instruction pointer.
func (c *context64) SetIP(value uintptr) {
	c.Regs.Rip = uint64(value)
}

// Stack returns the current stack pointer.
func (c *context64) Stack() uintptr {
	return uintptr(c.Regs.Rsp)
}

// SetStack sets the current stack pointer.
func (c *context64) SetStack(value uintptr) {
	c.Regs.Rsp = uint64(value)
}

// TLS returns the current TLS pointer.
func (c *context64) TLS() uintptr {
	return uintptr(c.Regs.Fs_base)
}

// SetTLS sets the current TLS pointer. Returns false if value is invalid.
func (c *context64) SetTLS(value uintptr) bool {
	if !isValidSegmentBase(uint64(value)) {
		return false
	}

	c.Regs.Fs = 0
	c.Regs.Fs_base = uint64(value)
	return true
}

// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
	c.Regs.R10 = uint64(value)
}

// Native returns the native type for the given val.
func (c *context64) Native(val uintptr) interface{} {
	v := uint64(val)
	return &v
}

// Value returns the generic val for the given native type.
func (c *context64) Value(val interface{}) uintptr {
	return uintptr(*val.(*uint64))
}

// Width returns the byte width of this architecture.
func (c *context64) Width() uint {
	return 8
}

// FeatureSet returns the FeatureSet in use.
func (c *context64) FeatureSet() *cpuid.FeatureSet {
	return c.State.FeatureSet
}

// mmapRand returns a random adjustment for randomizing an mmap layout.
func mmapRand(max uint64) usermem.Addr {
	return usermem.Addr(rand.Int63n(int64(max))).RoundDown()
}

// NewMmapLayout implements Context.NewMmapLayout consistently with Linux.
func (c *context64) NewMmapLayout(min, max usermem.Addr, r *limits.LimitSet) (MmapLayout, error) {
	min, ok := min.RoundUp()
	if !ok {
		return MmapLayout{}, syscall.EINVAL
	}
	if max > maxAddr64 {
		max = maxAddr64
	}
	max = max.RoundDown()

	if min > max {
		return MmapLayout{}, syscall.EINVAL
	}

	stackSize := r.Get(limits.Stack)

	// MAX_GAP in Linux.
	maxGap := (max / 6) * 5
	gap := usermem.Addr(stackSize.Cur)
	if gap < minGap64 {
		gap = minGap64
	}
	if gap > maxGap {
		gap = maxGap
	}
	defaultDir := MmapTopDown
	if stackSize.Cur == limits.Infinity {
		defaultDir = MmapBottomUp
	}

	topDownMin := max - gap - maxMmapRand64
	maxRand := usermem.Addr(maxMmapRand64)
	if topDownMin < preferredTopDownBaseMin {
		// Try to keep TopDownBase above preferredTopDownBaseMin by
		// shrinking maxRand.
		maxAdjust := maxRand - minMmapRand64
		needAdjust := preferredTopDownBaseMin - topDownMin
		if needAdjust <= maxAdjust {
			maxRand -= needAdjust
		}
	}

	rnd := mmapRand(uint64(maxRand))
	l := MmapLayout{
		MinAddr: min,
		MaxAddr: max,
		// TASK_UNMAPPED_BASE in Linux.
		BottomUpBase:     (max/3 + rnd).RoundDown(),
		TopDownBase:      (max - gap - rnd).RoundDown(),
		DefaultDirection: defaultDir,
		// We may have reduced the maximum randomization to keep
		// TopDownBase above preferredTopDownBaseMin while maintaining
		// our stack gap. Stack allocations must use that max
		// randomization to avoiding eating into the gap.
		MaxStackRand: uint64(maxRand),
	}

	// Final sanity check on the layout.
	if !l.Valid() {
		panic(fmt.Sprintf("Invalid MmapLayout: %+v", l))
	}

	return l, nil
}

// PIELoadAddress implements Context.PIELoadAddress.
func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
	base := preferredPIELoadAddr
	max, ok := base.AddLength(maxMmapRand64)
	if !ok {
		panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base))
	}

	if max > l.MaxAddr {
		// preferredPIELoadAddr won't fit; fall back to the standard
		// Linux behavior of 2/3 of TopDownBase. TSAN won't like this.
		//
		// Don't bother trying to shrink the randomization for now.
		base = l.TopDownBase / 3 * 2
	}

	return base + mmapRand(maxMmapRand64)
}

// userStructSize is the size in bytes of Linux's struct user on amd64.
const userStructSize = 928

// PtracePeekUser implements Context.PtracePeekUser.
func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
	if addr&7 != 0 || addr >= userStructSize {
		return nil, syscall.EIO
	}
	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
	// u_debugreg, returning 0 or silently no-oping for other fields
	// respectively.
	if addr < uintptr(registersSize) {
		regs := c.ptraceGetRegs()
		buf := make([]byte, regs.SizeBytes())
		regs.MarshalUnsafe(buf)
		return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil
	}
	// Note: x86 debug registers are missing.
	return c.Native(0), nil
}

// PtracePokeUser implements Context.PtracePokeUser.
func (c *context64) PtracePokeUser(addr, data uintptr) error {
	if addr&7 != 0 || addr >= userStructSize {
		return syscall.EIO
	}
	if addr < uintptr(registersSize) {
		regs := c.ptraceGetRegs()
		buf := make([]byte, regs.SizeBytes())
		regs.MarshalUnsafe(buf)
		usermem.ByteOrder.PutUint64(buf[addr:], uint64(data))
		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
		return err
	}
	// Note: x86 debug registers are missing.
	return nil
}