xemu/os-posix.c
Claudio Imbrenda c891c24b1a os-posix: asynchronous teardown for shutdown on Linux
This patch adds support for asynchronously tearing down a VM on Linux.

When qemu terminates, either naturally or because of a fatal signal,
the VM is torn down. If the VM is huge, it can take a considerable
amount of time for it to be cleaned up. In case of a protected VM, it
might take even longer than a non-protected VM (this is the case on
s390x, for example).

Some users might want to shut down a VM and restart it immediately,
without having to wait. This is especially true if management
infrastructure like libvirt is used.

This patch implements a simple trick on Linux to allow qemu to return
immediately, with the teardown of the VM being performed
asynchronously.

If the new commandline option -async-teardown is used, a new process is
spawned from qemu at startup, using the clone syscall, in such a way that
it will share its address space with qemu. The new process will have the
name "cleanup/<QEMU_PID>". It will wait until qemu terminates
completely, and then it will exit itself.

This allows qemu to terminate quickly, without having to wait for the
whole address space to be torn down. The cleanup process will exit
after qemu, so it will be the last user of the address space, and
therefore it will take care of the actual teardown. The cleanup
process will share the same cgroups as qemu, so both memory usage and
cpu time will be accounted properly.

If possible, close_range will be used in the cleanup process to close
all open file descriptors. If it is not available or if it fails, /proc
will be used to determine which file descriptors to close.

If the cleanup process is forcefully killed with SIGKILL before the
main qemu process has terminated completely, the mechanism is defeated
and the teardown will not be asynchronous.

This feature can already be used with libvirt by adding the following
to the XML domain definition to pass the parameter to qemu directly:

  <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
  <arg value='-async-teardown'/>
  </commandline>

Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Reviewed-by: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
Tested-by: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
Message-Id: <20220812133453.82671-1-imbrenda@linux.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-10-31 09:46:34 +01:00

337 lines
8.1 KiB
C

/*
* os-posix.c
*
* Copyright (c) 2003-2008 Fabrice Bellard
* Copyright (c) 2010 Red Hat, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "qemu/osdep.h"
#include <sys/wait.h>
#include <pwd.h>
#include <grp.h>
#include <libgen.h>
/* Needed early for CONFIG_BSD etc. */
#include "net/slirp.h"
#include "qemu/qemu-options.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "sysemu/runstate.h"
#include "qemu/cutils.h"
#ifdef CONFIG_LINUX
#include <sys/prctl.h>
#include "qemu/async-teardown.h"
#endif
/*
 * Identity to switch to when dropping privileges (see change_process_uid()).
 * Must set all three of these at once.
 * Legal combinations are:
 *
 *              unset   by name    by uid
 *   user_pwd   NULL    non-NULL   NULL
 *   user_uid   -1      -1         >=0
 *   user_gid   -1      -1         >=0
 */
static struct passwd *user_pwd; /* result of getpwnam(), or NULL */
static uid_t user_uid = (uid_t)-1; /* numeric uid, or -1 when not given */
static gid_t user_gid = (gid_t)-1; /* numeric gid, or -1 when not given */
/* Directory passed to chroot() by change_root(); NULL means no chroot. */
static const char *chroot_dir;
/* Non-zero when the process has been asked to daemonize. */
static int daemonize;
/*
 * Write end of the pipe created in os_daemonize(); os_setup_post() writes
 * a single status byte to it for the waiting original process.
 */
static int daemon_pipe;
void os_setup_early_signal_handling(void)
{
struct sigaction act;
sigfillset(&act.sa_mask);
act.sa_flags = 0;
act.sa_handler = SIG_IGN;
sigaction(SIGPIPE, &act, NULL);
}
/*
 * SA_SIGINFO-style handler installed by os_setup_signal_handling().
 *
 * Hands the signal number and the PID recorded in the siginfo to
 * qemu_system_killed().  The plain signal argument and the context
 * pointer are unused.
 */
static void termsig_handler(int signal, siginfo_t *info, void *c)
{
    const int signo = info->si_signo;
    const pid_t sender = info->si_pid;

    qemu_system_killed(signo, sender);
}
/*
 * Route SIGINT, SIGHUP and SIGTERM to termsig_handler().
 *
 * The same zero-initialised sigaction (with SA_SIGINFO set) is used for
 * all three signals; the results of sigaction() are not checked.
 */
void os_setup_signal_handling(void)
{
    static const int term_signals[] = { SIGINT, SIGHUP, SIGTERM };
    struct sigaction act;
    size_t i;

    memset(&act, 0, sizeof(act));
    act.sa_flags = SA_SIGINFO;
    act.sa_sigaction = termsig_handler;

    for (i = 0; i < sizeof(term_signals) / sizeof(term_signals[0]); i++) {
        sigaction(term_signals[i], &act, NULL);
    }
}
/*
 * Change the process name to s.
 *
 * A NULL name is silently ignored when PR_SET_NAME is available.  The
 * name is copied into a 16-byte buffer with pstrcpy() before it is
 * handed to prctl().  Failure to set the name, or building without
 * PR_SET_NAME, reports an error and exits with status 1.
 */
void os_set_proc_name(const char *s)
{
#if defined(PR_SET_NAME)
    char name[16];

    if (s == NULL) {
        return;
    }

    pstrcpy(name, sizeof(name), s);

    /*
     * argv[0] is left untouched: rewriting it is more involved, and
     * the prctl() name is already what `top' displays.
     */
    if (prctl(PR_SET_NAME, name) != 0) {
        error_report("unable to change process name: %s", strerror(errno));
        exit(1);
    }
#else
    error_report("Change of process name not supported by your OS");
    exit(1);
#endif
}
/*
 * Parse a "<uid>:<gid>" string into user_uid / user_gid.
 *
 * Both numbers are parsed with qemu_strtoul() using base 0.  A value is
 * rejected when it does not survive the round trip through uid_t / gid_t
 * or when it equals the (uid_t)-1 / (gid_t)-1 "unset" marker.
 *
 * Returns true and updates user_pwd (to NULL), user_uid and user_gid on
 * success; returns false and leaves them untouched otherwise.
 */
static bool os_parse_runas_uid_gid(const char *optarg)
{
    const char *sep;
    unsigned long value;
    uid_t uid;
    gid_t gid;

    /* First number; it must be followed by the ':' separator. */
    if (qemu_strtoul(optarg, &sep, 0, &value) != 0 || *sep != ':') {
        return false;
    }
    uid = value; /* out-of-range conversion is implementation-defined */
    if (uid != value || uid == (uid_t)-1) {
        return false;
    }

    /* Second number; no end pointer is requested from qemu_strtoul(). */
    if (qemu_strtoul(sep + 1, NULL, 0, &value) != 0) {
        return false;
    }
    gid = value; /* out-of-range conversion is implementation-defined */
    if (gid != value || gid == (gid_t)-1) {
        return false;
    }

    user_pwd = NULL;
    user_uid = uid;
    user_gid = gid;
    return true;
}
/*
 * Handle the command line options owned by this file.
 *
 * index is a QEMU_OPTION_* value and optarg the option's argument.
 * Returns 0 when the option was handled here and -1 otherwise.  A
 * QEMU_OPTION_runas argument that is neither a known user name nor a
 * valid <uid>:<gid> pair reports an error and exits with status 1.
 */
int os_parse_cmd_args(int index, const char *optarg)
{
    switch (index) {
    case QEMU_OPTION_runas:
        /* Try a user name first, then fall back to <uid>:<gid>. */
        user_pwd = getpwnam(optarg);
        if (user_pwd != NULL) {
            user_uid = -1;
            user_gid = -1;
            return 0;
        }
        if (os_parse_runas_uid_gid(optarg)) {
            return 0;
        }
        error_report("User \"%s\" doesn't exist"
                     " (and is not <uid>:<gid>)",
                     optarg);
        exit(1);

    case QEMU_OPTION_chroot:
        chroot_dir = optarg;
        return 0;

    case QEMU_OPTION_daemonize:
        daemonize = 1;
        return 0;

#if defined(CONFIG_LINUX)
    case QEMU_OPTION_asyncteardown:
        init_async_teardown();
        return 0;
#endif

    default:
        return -1;
    }
}
/*
 * Switch the process to the group and user selected through user_pwd or
 * user_uid / user_gid; does nothing when none of them is set.
 *
 * Order of operations: setgid(), then the supplementary groups
 * (initgroups() for a passwd entry, setgroups() with the single gid
 * otherwise), then setuid().  Finally setuid(0) is attempted and must
 * fail; if it succeeds the privilege drop is treated as failed.
 * Any failure reports an error and exits with status 1.
 */
static void change_process_uid(void)
{
    gid_t intended_gid;
    uid_t intended_uid;

    /* The three identity variables must be in one of the legal states. */
    assert((user_uid == (uid_t)-1) || user_pwd == NULL);
    assert((user_uid == (uid_t)-1) ==
           (user_gid == (gid_t)-1));

    if (user_pwd == NULL && user_uid == (uid_t)-1) {
        return;
    }

    if (user_pwd != NULL) {
        intended_gid = user_pwd->pw_gid;
        intended_uid = user_pwd->pw_uid;
    } else {
        intended_gid = user_gid;
        intended_uid = user_uid;
    }

    if (setgid(intended_gid) < 0) {
        error_report("Failed to setgid(%d)", intended_gid);
        exit(1);
    }

    if (user_pwd != NULL) {
        if (initgroups(user_pwd->pw_name, user_pwd->pw_gid) < 0) {
            error_report("Failed to initgroups(\"%s\", %d)",
                         user_pwd->pw_name, user_pwd->pw_gid);
            exit(1);
        }
    } else if (setgroups(1, &user_gid) < 0) {
        error_report("Failed to setgroups(1, [%d])",
                     user_gid);
        exit(1);
    }

    if (setuid(intended_uid) < 0) {
        error_report("Failed to setuid(%d)", intended_uid);
        exit(1);
    }

    /* Regaining uid 0 must no longer be possible. */
    if (setuid(0) != -1) {
        error_report("Dropping privileges failed");
        exit(1);
    }
}
/*
 * chroot() into chroot_dir and change the working directory to the new
 * root.  Does nothing when chroot_dir is NULL.  Either call failing
 * reports an error and exits with status 1.
 */
static void change_root(void)
{
    if (chroot_dir == NULL) {
        return;
    }

    if (chroot(chroot_dir) < 0) {
        error_report("chroot failed");
        exit(1);
    }

    if (chdir("/") != 0) {
        error_report("not able to chdir to /: %s", strerror(errno));
        exit(1);
    }
}
/*
 * Detach from the caller when daemonizing was requested; otherwise do
 * nothing.
 *
 * The original process forks and then blocks on a pipe until the
 * surviving descendant reports its status (see os_setup_post()); it
 * exits 0 only if exactly one zero byte arrives.  The first child
 * keeps the write end in daemon_pipe, calls setsid() and forks again;
 * the intermediate process exits and the grandchild carries on with
 * umask 027 and the terminal job-control signals ignored.
 */
void os_daemonize(void)
{
    pid_t child;
    int pipe_fds[2];

    if (!daemonize) {
        return;
    }

    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, NULL)) {
        exit(1);
    }

    child = fork();
    if (child < 0) {
        exit(1);
    }
    if (child > 0) {
        /* Original process: wait for the daemon's verdict. */
        uint8_t status;
        ssize_t nread;

        close(pipe_fds[1]);

        do {
            nread = read(pipe_fds[0], &status, 1);
        } while (nread < 0 && errno == EINTR);

        /*
         * Success is reported only when the child really wrote a
         * single zero byte, i.e. its initialisation completed.
         */
        exit(nread == 1 && status == 0 ? 0 : 1);
    }

    /* First child: keep only the write end of the pipe. */
    close(pipe_fds[0]);
    daemon_pipe = pipe_fds[1];

    setsid();

    /* Fork once more; only the grandchild continues past this point. */
    child = fork();
    if (child < 0) {
        exit(1);
    }
    if (child > 0) {
        exit(0);
    }

    umask(027);

    signal(SIGTSTP, SIG_IGN);
    signal(SIGTTOU, SIG_IGN);
    signal(SIGTTIN, SIG_IGN);
}
/*
 * Final host-side setup step: apply the chroot and the privilege drop
 * and, when daemonizing, detach the standard streams and tell the
 * waiting original process that start-up succeeded.
 *
 * When daemonizing:
 *   - the working directory is changed to "/" and /dev/null is opened
 *     before change_root() / change_process_uid() run;
 *   - stdin and stdout are redirected to /dev/null, and stderr too
 *     unless qemu_log_enabled() says logging is active;
 *   - a single zero byte is written to daemon_pipe.
 * Any failure exits with status 1.
 */
void os_setup_post(void)
{
    int fd = -1;

    if (daemonize) {
        if (chdir("/")) {
            error_report("not able to chdir to /: %s", strerror(errno));
            exit(1);
        }
        /* TFR() repeats the open while it is interrupted (EINTR). */
        TFR(fd = qemu_open_old("/dev/null", O_RDWR));
        if (fd == -1) {
            exit(1);
        }
    }

    change_root();
    change_process_uid();

    if (daemonize) {
        uint8_t status = 0;
        ssize_t len;

        dup2(fd, 0);
        dup2(fd, 1);
        /* In case -D is given do not redirect stderr to /dev/null */
        if (!qemu_log_enabled()) {
            dup2(fd, 2);
        }

        /*
         * If a standard descriptor was closed at start-up, /dev/null may
         * itself have been opened as fd 0, 1 or 2.  dup2(fd, fd) is a
         * no-op in that case, so closing fd would close the very stream
         * that was just set up.  Only close a descriptor that is a
         * genuine extra.
         */
        if (fd > 2) {
            close(fd);
        }

        do {
            len = write(daemon_pipe, &status, 1);
        } while (len < 0 && errno == EINTR);
        if (len != 1) {
            exit(1);
        }
    }
}
/*
 * Put stdout into line-buffered mode, leaving buffer allocation to the
 * C library.  The result of setvbuf() is not checked.
 */
void os_set_line_buffering(void)
{
    FILE *out = stdout;

    setvbuf(out, NULL, _IOLBF, 0);
}
/* Report whether the daemonize flag is currently set. */
bool is_daemonized(void)
{
    return daemonize != 0;
}
/*
 * Set or clear the daemonize flag.  Always returns 0.
 */
int os_set_daemonize(bool d)
{
    daemonize = d ? 1 : 0;
    return 0;
}
/*
 * Lock all current and future pages of the process in memory.
 *
 * With HAVE_MLOCKALL this returns the result of mlockall(); a negative
 * result is also reported through error_report().  Without
 * HAVE_MLOCKALL it returns -ENOSYS.
 */
int os_mlock(void)
{
#ifdef HAVE_MLOCKALL
    const int rc = mlockall(MCL_CURRENT | MCL_FUTURE);

    if (rc < 0) {
        error_report("mlockall: %s", strerror(errno));
    }
    return rc;
#else
    return -ENOSYS;
#endif
}