1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
|
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fcntl.h>
#include <linux/aio_abi.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <algorithm>
#include <string>
#include "gtest/gtest.h"
#include "test/syscalls/linux/file_base.h"
#include "test/util/cleanup.h"
#include "test/util/file_descriptor.h"
#include "test/util/fs_util.h"
#include "test/util/memory_util.h"
#include "test/util/posix_error.h"
#include "test/util/proc_util.h"
#include "test/util/temp_path.h"
#include "test/util/test_util.h"
using ::testing::_;
namespace gvisor {
namespace testing {
namespace {
// Returns the size of the VMA containing the given address.
PosixErrorOr<size_t> VmaSizeAt(uintptr_t addr) {
ASSIGN_OR_RETURN_ERRNO(std::string proc_self_maps,
GetContents("/proc/self/maps"));
ASSIGN_OR_RETURN_ERRNO(auto entries, ParseProcMaps(proc_self_maps));
// Use binary search to find the first VMA that might contain addr.
ProcMapsEntry target = {};
target.end = addr;
auto it =
std::upper_bound(entries.begin(), entries.end(), target,
[](const ProcMapsEntry& x, const ProcMapsEntry& y) {
return x.end < y.end;
});
// Check that it actually contains addr.
if (it == entries.end() || addr < it->start) {
return PosixError(ENOENT, absl::StrCat("no VMA contains address ", addr));
}
return it->end - it->start;
}
constexpr char kData[] = "hello world!";
int SubmitCtx(aio_context_t ctx, long nr, struct iocb** iocbpp) {
return syscall(__NR_io_submit, ctx, nr, iocbpp);
}
class AIOTest : public FileTest {
public:
AIOTest() : ctx_(0) {}
int SetupContext(unsigned int nr) {
return syscall(__NR_io_setup, nr, &ctx_);
}
int Submit(long nr, struct iocb** iocbpp) {
return SubmitCtx(ctx_, nr, iocbpp);
}
int GetEvents(long min, long max, struct io_event* events,
struct timespec* timeout) {
return RetryEINTR(syscall)(__NR_io_getevents, ctx_, min, max, events,
timeout);
}
int DestroyContext() { return syscall(__NR_io_destroy, ctx_); }
void TearDown() override {
FileTest::TearDown();
if (ctx_ != 0) {
ASSERT_THAT(DestroyContext(), SyscallSucceeds());
}
}
struct iocb CreateCallback() {
struct iocb cb = {};
cb.aio_data = 0x123;
cb.aio_fildes = test_file_fd_.get();
cb.aio_lio_opcode = IOCB_CMD_PWRITE;
cb.aio_buf = reinterpret_cast<uint64_t>(kData);
cb.aio_offset = 0;
cb.aio_nbytes = strlen(kData);
return cb;
}
protected:
aio_context_t ctx_;
};
TEST_F(AIOTest, BasicWrite) {
// Copied from fs/aio.c.
constexpr unsigned AIO_RING_MAGIC = 0xa10a10a1;
struct aio_ring {
unsigned id;
unsigned nr;
unsigned head;
unsigned tail;
unsigned magic;
unsigned compat_features;
unsigned incompat_features;
unsigned header_length;
struct io_event io_events[0];
};
// Setup a context that is 128 entries deep.
ASSERT_THAT(SetupContext(128), SyscallSucceeds());
// Check that 'ctx_' points to a valid address. libaio uses it to check if
// aio implementation uses aio_ring. gVisor doesn't and returns all zeroes.
// Linux implements aio_ring, so skip the zeroes check.
//
// TODO(gvisor.dev/issue/204): Remove when gVisor implements aio_ring.
auto ring = reinterpret_cast<struct aio_ring*>(ctx_);
auto magic = IsRunningOnGvisor() ? 0 : AIO_RING_MAGIC;
EXPECT_EQ(ring->magic, magic);
struct iocb cb = CreateCallback();
struct iocb* cbs[1] = {&cb};
// Submit the request.
ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1));
// Get the reply.
struct io_event events[1];
ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1));
// Verify that it is as expected.
EXPECT_EQ(events[0].data, 0x123);
EXPECT_EQ(events[0].obj, reinterpret_cast<long>(&cb));
EXPECT_EQ(events[0].res, strlen(kData));
// Verify that the file contains the contents.
char verify_buf[sizeof(kData)] = {};
ASSERT_THAT(read(test_file_fd_.get(), verify_buf, sizeof(kData)),
SyscallSucceedsWithValue(strlen(kData)));
EXPECT_STREQ(verify_buf, kData);
}
TEST_F(AIOTest, BadWrite) {
// Create a pipe and immediately close the read end.
int pipefd[2];
ASSERT_THAT(pipe(pipefd), SyscallSucceeds());
FileDescriptor rfd(pipefd[0]);
FileDescriptor wfd(pipefd[1]);
rfd.reset(); // Close the read end.
// Setup a context that is 128 entries deep.
ASSERT_THAT(SetupContext(128), SyscallSucceeds());
struct iocb cb = CreateCallback();
// Try to write to the read end.
cb.aio_fildes = wfd.get();
struct iocb* cbs[1] = {&cb};
// Submit the request.
ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1));
// Get the reply.
struct io_event events[1];
ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1));
// Verify that it fails with the right error code.
EXPECT_EQ(events[0].data, 0x123);
EXPECT_EQ(events[0].obj, reinterpret_cast<uint64_t>(&cb));
EXPECT_LT(events[0].res, 0);
}
TEST_F(AIOTest, ExitWithPendingIo) {
// Setup a context that is 5 entries deep.
ASSERT_THAT(SetupContext(5), SyscallSucceeds());
struct iocb cb = CreateCallback();
struct iocb* cbs[] = {&cb};
// Submit a request but don't complete it to make it pending.
EXPECT_THAT(Submit(1, cbs), SyscallSucceeds());
}
int Submitter(void* arg) {
auto test = reinterpret_cast<AIOTest*>(arg);
struct iocb cb = test->CreateCallback();
struct iocb* cbs[1] = {&cb};
// Submit the request.
TEST_CHECK(test->Submit(1, cbs) == 1);
return 0;
}
TEST_F(AIOTest, CloneVm) {
// Setup a context that is 128 entries deep.
ASSERT_THAT(SetupContext(128), SyscallSucceeds());
const size_t kStackSize = 5 * kPageSize;
std::unique_ptr<char[]> stack(new char[kStackSize]);
char* bp = stack.get() + kStackSize;
pid_t child;
ASSERT_THAT(child = clone(Submitter, bp, CLONE_VM | SIGCHLD,
reinterpret_cast<void*>(this)),
SyscallSucceeds());
// Get the reply.
struct io_event events[1];
ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1));
// Verify that it is as expected.
EXPECT_EQ(events[0].data, 0x123);
EXPECT_EQ(events[0].res, strlen(kData));
// Verify that the file contains the contents.
char verify_buf[32] = {};
ASSERT_THAT(read(test_file_fd_.get(), &verify_buf[0], strlen(kData)),
SyscallSucceeds());
EXPECT_EQ(strcmp(kData, &verify_buf[0]), 0);
int status;
ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
SyscallSucceedsWithValue(child));
EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
<< " status " << status;
}
// Tests that AIO context can be remapped to a different address.
TEST_F(AIOTest, Mremap) {
// Setup a context that is 128 entries deep.
ASSERT_THAT(SetupContext(128), SyscallSucceeds());
const size_t ctx_size =
ASSERT_NO_ERRNO_AND_VALUE(VmaSizeAt(reinterpret_cast<uintptr_t>(ctx_)));
struct iocb cb = CreateCallback();
struct iocb* cbs[1] = {&cb};
// Reserve address space for the mremap target so we have something safe to
// map over.
Mapping dst =
ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(ctx_size, PROT_READ, MAP_PRIVATE));
// Remap context 'handle' to a different address.
ASSERT_THAT(Mremap(reinterpret_cast<void*>(ctx_), ctx_size, dst.len(),
MREMAP_FIXED | MREMAP_MAYMOVE, dst.ptr()),
IsPosixErrorOkAndHolds(dst.ptr()));
aio_context_t old_ctx = ctx_;
ctx_ = reinterpret_cast<aio_context_t>(dst.addr());
// io_destroy() will unmap dst now.
dst.release();
// Check that submitting the request with the old 'ctx_' fails.
ASSERT_THAT(SubmitCtx(old_ctx, 1, cbs), SyscallFailsWithErrno(EINVAL));
// Submit the request with the new 'ctx_'.
ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1));
// Remap again.
dst = ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(ctx_size, PROT_READ, MAP_PRIVATE));
ASSERT_THAT(Mremap(reinterpret_cast<void*>(ctx_), ctx_size, dst.len(),
MREMAP_FIXED | MREMAP_MAYMOVE, dst.ptr()),
IsPosixErrorOkAndHolds(dst.ptr()));
ctx_ = reinterpret_cast<aio_context_t>(dst.addr());
dst.release();
// Get the reply with yet another 'ctx_' and verify it.
struct io_event events[1];
ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1));
EXPECT_EQ(events[0].data, 0x123);
EXPECT_EQ(events[0].obj, reinterpret_cast<long>(&cb));
EXPECT_EQ(events[0].res, strlen(kData));
// Verify that the file contains the contents.
char verify_buf[sizeof(kData)] = {};
ASSERT_THAT(read(test_file_fd_.get(), verify_buf, sizeof(kData)),
SyscallSucceedsWithValue(strlen(kData)));
EXPECT_STREQ(verify_buf, kData);
}
// Tests that AIO context cannot be expanded with mremap.
TEST_F(AIOTest, MremapExpansion) {
// Setup a context that is 128 entries deep.
ASSERT_THAT(SetupContext(128), SyscallSucceeds());
const size_t ctx_size =
ASSERT_NO_ERRNO_AND_VALUE(VmaSizeAt(reinterpret_cast<uintptr_t>(ctx_)));
// Reserve address space for the mremap target so we have something safe to
// map over.
Mapping dst = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(ctx_size + kPageSize, PROT_NONE, MAP_PRIVATE));
// Test that remapping to a larger address range fails.
ASSERT_THAT(Mremap(reinterpret_cast<void*>(ctx_), ctx_size, dst.len(),
MREMAP_FIXED | MREMAP_MAYMOVE, dst.ptr()),
PosixErrorIs(EFAULT, _));
// mm/mremap.c:sys_mremap() => mremap_to() does do_munmap() of the destination
// before it hits the VM_DONTEXPAND check in vma_to_resize(), so we should no
// longer munmap it (another thread may have created a mapping there).
dst.release();
}
// Tests that AIO calls fail if context's address is inaccessible.
TEST_F(AIOTest, Mprotect) {
// Setup a context that is 128 entries deep.
ASSERT_THAT(SetupContext(128), SyscallSucceeds());
struct iocb cb = CreateCallback();
struct iocb* cbs[1] = {&cb};
ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1));
// Makes the context 'handle' inaccessible and check that all subsequent
// calls fail.
ASSERT_THAT(mprotect(reinterpret_cast<void*>(ctx_), kPageSize, PROT_NONE),
SyscallSucceeds());
struct io_event events[1];
EXPECT_THAT(GetEvents(1, 1, events, nullptr), SyscallFailsWithErrno(EINVAL));
ASSERT_THAT(Submit(1, cbs), SyscallFailsWithErrno(EINVAL));
EXPECT_THAT(DestroyContext(), SyscallFailsWithErrno(EINVAL));
// Prevent TearDown from attempting to destroy the context and fail.
ctx_ = 0;
}
TEST_F(AIOTest, Timeout) {
// Setup a context that is 128 entries deep.
ASSERT_THAT(SetupContext(128), SyscallSucceeds());
struct timespec timeout;
timeout.tv_sec = 0;
timeout.tv_nsec = 10;
struct io_event events[1];
ASSERT_THAT(GetEvents(1, 1, events, &timeout), SyscallSucceedsWithValue(0));
}
class AIOReadWriteParamTest : public AIOTest,
public ::testing::WithParamInterface<int> {};
TEST_P(AIOReadWriteParamTest, BadOffset) {
// Setup a context that is 128 entries deep.
ASSERT_THAT(SetupContext(128), SyscallSucceeds());
struct iocb cb = CreateCallback();
struct iocb* cbs[1] = {&cb};
// Create a buffer that we can write to.
char buf[] = "hello world!";
cb.aio_buf = reinterpret_cast<uint64_t>(buf);
// Set the operation on the callback and give a negative offset.
const int opcode = GetParam();
cb.aio_lio_opcode = opcode;
iovec iov = {};
if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) {
// Create a valid iovec and set it in the callback.
iov.iov_base = reinterpret_cast<void*>(buf);
iov.iov_len = 1;
cb.aio_buf = reinterpret_cast<uint64_t>(&iov);
// aio_nbytes is the number of iovecs.
cb.aio_nbytes = 1;
}
// Pass a negative offset.
cb.aio_offset = -1;
// Should get error on submission.
ASSERT_THAT(Submit(1, cbs), SyscallFailsWithErrno(EINVAL));
}
INSTANTIATE_TEST_SUITE_P(BadOffset, AIOReadWriteParamTest,
::testing::Values(IOCB_CMD_PREAD, IOCB_CMD_PWRITE,
IOCB_CMD_PREADV, IOCB_CMD_PWRITEV));
class AIOVectorizedParamTest : public AIOTest,
public ::testing::WithParamInterface<int> {};
TEST_P(AIOVectorizedParamTest, BadIOVecs) {
// Setup a context that is 128 entries deep.
ASSERT_THAT(SetupContext(128), SyscallSucceeds());
struct iocb cb = CreateCallback();
struct iocb* cbs[1] = {&cb};
// Modify the callback to use the operation from the param.
cb.aio_lio_opcode = GetParam();
// Create an iovec with address in kernel range, and pass that as the buffer.
iovec iov = {};
iov.iov_base = reinterpret_cast<void*>(0xFFFFFFFF00000000);
iov.iov_len = 1;
cb.aio_buf = reinterpret_cast<uint64_t>(&iov);
// aio_nbytes is the number of iovecs.
cb.aio_nbytes = 1;
// Should get error on submission.
ASSERT_THAT(Submit(1, cbs), SyscallFailsWithErrno(EFAULT));
}
INSTANTIATE_TEST_SUITE_P(BadIOVecs, AIOVectorizedParamTest,
::testing::Values(IOCB_CMD_PREADV, IOCB_CMD_PWRITEV));
} // namespace
} // namespace testing
} // namespace gvisor
|