1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
|
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package host implements an fs.Filesystem for files backed by host
// file descriptors.
package host
import (
"fmt"
"path"
"path/filepath"
"strconv"
"strings"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
)
// FilesystemName is the name under which Filesystem is registered.
const FilesystemName = "whitelistfs"
const (
// whitelistKey is the mount option containing the list of host paths
// to whitelist. Entries are separated by '|' (Mount splits the value
// on that character); ',' is already used to separate the mount
// options themselves.
whitelistKey = "whitelist"
// rootPathKey is the mount option containing the root path of the
// mount. Mount defaults to "/" when it is absent.
rootPathKey = "root"
// dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership.
// Its value is parsed by Mount with strconv.ParseBool.
dontTranslateOwnershipKey = "dont_translate_ownership"
)
// maxTraversals bounds the number of link traversals allowed for each path
// lookup performed while building the whitelist.
const maxTraversals = 10
// Filesystem is a pseudo file system that is only available during the setup
// to lock down the configurations. This filesystem should only be mounted at root.
//
// Think twice before exposing this to applications.
//
// +stateify savable
type Filesystem struct {
// paths is the set of host paths to whitelist. It is populated by Mount
// from the "whitelist" mount option and consumed by InstallWhitelist.
paths []string
}
// Compile-time check that *Filesystem implements fs.Filesystem.
var _ fs.Filesystem = (*Filesystem)(nil)
// Name returns FilesystemName, the identifier under which this file system
// is registered.
func (*Filesystem) Name() string {
	const name = FilesystemName
	return name
}
// AllowUserMount always reports false: users may not use mount(2) with this
// file system.
func (*Filesystem) AllowUserMount() bool {
	const userMountable = false
	return userMountable
}
// AllowUserList always reports true so that this file system may be listed
// in /proc/filesystems.
func (*Filesystem) AllowUserList() bool {
	const listable = true
	return listable
}
// Flags returns the zero flag set; there is nothing special about this file
// system.
func (*Filesystem) Flags() fs.FilesystemFlags {
	return fs.FilesystemFlags(0)
}
// Mount returns an fs.Inode exposing the host file system. It is intended to
// be locked down afterwards via InstallWhitelist.
//
// data is parsed as generic comma-separated key=value mount options. The
// recognized keys are:
//
//   - whitelist: '|'-separated host paths, recorded on f for InstallWhitelist.
//   - root: host path used as the root of the mount (default "/"). Every
//     whitelisted path must be located at or beneath it and is rewritten
//     relative to it.
//   - dont_translate_ownership: boolean, parsed with strconv.ParseBool.
//
// Any other option, a whitelisted path outside of root, or a failure to open
// the root results in an error. All options are validated before the host
// root is opened so that no host file descriptor is left open when an option
// is rejected.
func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) {
	// Parse generic comma-separated key=value options.
	options := fs.GenericMountSourceOptions(data)

	// Grab the whitelist if one was specified.
	// TODO(edahlgren/mpratt/hzy): require another option "testonly" in order to allow
	// no whitelist.
	if wl, ok := options[whitelistKey]; ok {
		f.paths = strings.Split(wl, "|")
		delete(options, whitelistKey)
	}

	// If the rootPath was set, use it. Otherwise default to the root of the
	// host fs.
	rootPath := "/"
	if rp, ok := options[rootPathKey]; ok {
		rootPath = rp
		delete(options, rootPathKey)

		// We must relativize the whitelisted paths to the new root.
		for i, p := range f.paths {
			rel, err := filepath.Rel(rootPath, p)
			// filepath.Rel succeeds for paths outside of rootPath by
			// returning a result that climbs out through "..". Joining
			// such a result onto "/" would silently whitelist a different
			// host path, so reject it explicitly.
			if err != nil || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
				return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath)
			}
			f.paths[i] = path.Join("/", rel)
		}
	}

	var dontTranslateOwnership bool
	if v, ok := options[dontTranslateOwnershipKey]; ok {
		b, err := strconv.ParseBool(v)
		if err != nil {
			return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err)
		}
		dontTranslateOwnership = b
		delete(options, dontTranslateOwnershipKey)
	}

	// Fail if the caller passed us more options than we know about.
	if len(options) > 0 {
		return nil, fmt.Errorf("unsupported mount options: %v", options)
	}

	// Open the root only once every option has been accepted; opening it
	// earlier would leak the descriptor on the error returns above.
	fd, err := open(nil, rootPath)
	if err != nil {
		return nil, fmt.Errorf("failed to find root: %v", err)
	}

	// The mounting EUID/EGID will be cached by this file system. This will
	// be used to assign ownership to files that we own.
	owner := fs.FileOwnerFromContext(ctx)

	// Construct the host file system mount and inode.
	msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership)
	return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */)
}
// InstallWhitelist restricts the MountNamespace m to the Dirents that are
// currently installed plus the paths recorded on f. The work is delegated to
// installWhitelist.
func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error {
	whitelist := f.paths
	return installWhitelist(ctx, m, whitelist)
}
// installWhitelist looks up every entry of paths (and every intermediate
// prefix of each entry) in m, then freezes m. Directories named by an entry
// have their children queued as further entries, and symlinks named by an
// entry have their target queued. An empty whitelist is a no-op and leaves m
// unfrozen. Lookup and readlink failures are logged and skipped; the only
// error returned is for an entry that is not an absolute path.
func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error {
	if n := len(paths); n == 0 || (n == 1 && paths[0] == "") {
		// Warning will be logged during filter installation if the empty
		// whitelist matters (allows for host file access).
		return nil
	}

	// seen records every prefix that has already been looked up.
	seen := make(map[string]bool)

	root := m.Root()
	defer root.DecRef()

	// paths grows while it is being walked, so iterate by index and
	// re-evaluate its length on every pass.
	for idx := 0; idx < len(paths); idx++ {
		entry := paths[idx]

		// Make sure the path is absolute. This is a sanity check.
		if !path.IsAbs(entry) {
			return fmt.Errorf("path %q is not absolute", entry)
		}

		// Visit every intermediate prefix as well, in case one of them is
		// a symlink that needs to be resolved.
		for end := 1; end <= len(entry); end++ {
			atBoundary := end == len(entry) || entry[end] == '/'
			if !atBoundary {
				continue
			}
			prefix := entry[:end]

			// Look up this prefix in the tree with a fresh traversal
			// budget.
			budget := uint(maxTraversals)
			dirent, err := m.FindLink(ctx, root, nil, prefix, &budget)
			if err != nil {
				log.Warningf("populate failed for %q: %v", prefix, err)
				continue
			}
			// The reference must be held until after the freeze at the
			// end of this function, so that the dentry is in place to
			// be frozen. Hence the deferred (function-scoped) DecRef
			// inside the loop.
			defer dirent.DecRef()

			// Only the full entry (not its intermediate prefixes) is
			// expanded.
			if prefix == entry {
				attr := dirent.Inode.StableAttr

				if fs.IsDir(attr) {
					for child := range childDentAttrs(ctx, dirent) {
						paths = append(paths, path.Join(prefix, child))
					}
				}

				if fs.IsSymlink(attr) {
					// Expand a symlink at most once: the tree may
					// contain recursive symlinks and expansion must
					// terminate. This is safe because it is the last
					// component; anything a later path needs beneath
					// this symlink is still handled by FindLink.
					if seen[prefix] {
						continue
					}
					link, err := dirent.Inode.Readlink(ctx)
					if err != nil {
						log.Warningf("readlink failed for %q: %v", prefix, err)
						continue
					}
					// Relative targets are resolved against the
					// directory containing the symlink.
					if !path.IsAbs(link) {
						link = path.Join(path.Dir(prefix), link)
					}
					paths = append(paths, link)
				}
			}

			// Report each prefix only once even though it may be looked
			// up several times. When whitelisting /a/b,/a the prefix /a
			// is already seen by the time /a itself is processed, yet
			// its contents still had to be expanded above.
			if !seen[prefix] {
				log.Debugf("whitelisted: %s", prefix)
			}
			seen[prefix] = true
		}
	}

	// Freeze the mount tree in place. This prevents any new paths from
	// being opened and any old ones from being removed. If we do provide
	// tmpfs mounts, we'll want to freeze/thaw those separately.
	m.Freeze()
	return nil
}
// childDentAttrs returns the directory entries of d keyed by entry name, with
// the "." and ".." entries removed. If the directory cannot be opened or
// read, a warning is logged and nil is returned.
func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr {
	// The name is only used for log messages.
	dirname, _ := d.FullName(nil /* root */)

	dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
	if err != nil {
		log.Warningf("failed to open directory %q: %v", dirname, err)
		return nil
	}
	// Keep our reference until Readdir has finished; dropping it first
	// would mean reading from a file that may already have been released.
	defer dir.DecRef()

	var stubSerializer fs.CollectEntriesSerializer
	if err := dir.Readdir(ctx, &stubSerializer); err != nil {
		log.Warningf("failed to iterate on host directory %q: %v", dirname, err)
		return nil
	}

	delete(stubSerializer.Entries, ".")
	delete(stubSerializer.Entries, "..")
	return stubSerializer.Entries
}
// newMountSource builds a host fs.MountSource whose operations are rooted at
// root, which should match the mount point. The inode mapping table starts
// out empty.
func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource {
	ops := &superOperations{
		root:                   root,
		inodeMappings:          map[uint64]string{},
		mounter:                mounter,
		dontTranslateOwnership: dontTranslateOwnership,
	}
	return fs.NewMountSource(ctx, ops, filesystem, flags)
}
// superOperations implements fs.MountSourceOperations.
//
// +stateify savable
type superOperations struct {
fs.SimpleMountSourceOperations
// root is the path of the mount point. All inode mappings
// are relative to this root.
root string
// inodeMappings contains mappings of fs.Inodes associated
// with this MountSource to paths under root. It is keyed by
// inode ID (see SaveInodeMapping).
inodeMappings map[uint64]string
// mounter is the cached EUID/EGID that mounted this file system.
mounter fs.FileOwner
// dontTranslateOwnership indicates whether to not translate file
// ownership.
//
// By default, files/directories owned by the sandbox use the UID/GID
// of the mounter. For files/directories that are not owned by the
// sandbox, the file UID/GID is translated to a UID/GID which cannot
// be mapped in the sandboxed application's user namespace. The
// UID/GID will look like the nobody UID/GID (65534) but is not
// strictly owned by the user "nobody".
//
// If whitelistfs is a lower filesystem in an overlay, set
// dont_translate_ownership=true in mount options.
dontTranslateOwnership bool
}
// Compile-time check that *superOperations implements fs.MountSourceOperations.
var _ fs.MountSourceOperations = (*superOperations)(nil)
// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings.
// It discards every saved mapping by installing a fresh, empty table.
func (m *superOperations) ResetInodeMappings() {
	m.inodeMappings = map[uint64]string{}
}
// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping. It
// records path under the inode ID taken from the host file state of inode.
func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) {
	// Counter-intuitively, the inode's own StableAttrs *CANNOT* be trusted
	// here: overlay copyUp may have swapped them out from under us. Use the
	// attributes cached in the host inodeOperations' file state instead.
	hostOps := inode.InodeOperations.(*inodeOperations)
	id := hostOps.fileState.sattr.InodeID
	m.inodeMappings[id] = path
}
// Keep implements fs.MountSourceOperations.Keep. It always reports false, so
// host fs Dirents are never retained in the dirent LRU cache.
//
// TODO(b/72455313,b/77596690): The permissions of a host file can change
// while it sits in the dirent cache (say from RO to RW), yet the file cannot
// be re-opened with the more relaxed permissions because the host FD is
// already open and stored in the inode.
//
// Using the dirent LRU cache makes that bug more likely to be hit. Host file
// access is relatively fast anyway, so the LRU cache is disabled for host fs
// files. Once permission changes and re-opening of host files are handled
// properly, the use of the LRU cache should be revisited.
func (*superOperations) Keep(*fs.Dirent) bool {
	const cacheable = false
	return cacheable
}
// init registers the whitelist file system with the fs package.
func init() {
	fs.RegisterFilesystem(new(Filesystem))
}
|