"Host Memory Backends" and "Memory devices" queue ("mem"):
 - Fix NVDIMM error message
 - Add ThreadContext user-creatable object and wire it up for NUMA-aware
   hostmem preallocation
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmNbpHARHGRhdmlkQHJl
 ZGhhdC5jb20ACgkQTd4Q9wD/g1pDpw//bG9cyIlzTzDnU5pbQiXyLm0nF9tW/tli
 npGPSbFFYz/72XD9VJSVLhbNHoQSmFcMK5m/DA4WAMdOc5zF7lP3XdZcj72pDyxu
 31hJRvuRhxNb09jhEdWRfX5+Jg9UyYXuIvtKXHSWgrtaYDtHBdTXq/ojZlvlo/rr
 36v0jaVaTNRs7dKQL2oaN+DSMiPXHxBzA6FABqYmJNNwuMJT0kkX8pfz0OFwkRn+
 iqf9uRhM6b/fNNB0+ReA7FfGL+hzU6Uv8AvAL3orXUqjwPMRe9Fz2gE7HpFnE6DD
 dOP4Xk2iSSJ5XQA8HwtvrQfrGPh4gPYE80ziK/+8boy3alVeGYbYbvWVtdsNju41
 Cq9kM1wDyjZf6SSUIAbjOrNPdbhwyK4GviVBR1zh+/gA3uF5MhrDtZh4h3mWX2if
 ijmT9mfte4NwF3K1MvckAl7IHRb8nxmr7wjjhJ26JwpD+76lfAcmXC2YOlFGHCMi
 028mjvThf3HW7BD2LjlQSX4UkHmM2vUBrgMGQKyeMham1VmMfSK32wzvUNfF7xSz
 o9k0loBh7unGcUsv3EbqUGswV5F6AgjK3vWRkDql8dNrdIoapDfaejPCd58kVM98
 5N/aEoha4bAeJ6NGIKzD+4saiMxUqJ0y2NjSrE8iO4HszXgZW5e1Gbkn4Ae6d37D
 QSSqyfasVHY=
 =bLuc
 -----END PGP SIGNATURE-----

Merge tag 'mem-2022-10-28' of https://github.com/davidhildenbrand/qemu into staging

Hi,

"Host Memory Backends" and "Memory devices" queue ("mem"):
- Fix NVDIMM error message
- Add ThreadContext user-creatable object and wire it up for NUMA-aware
  hostmem preallocation

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmNbpHARHGRhdmlkQHJl
# ZGhhdC5jb20ACgkQTd4Q9wD/g1pDpw//bG9cyIlzTzDnU5pbQiXyLm0nF9tW/tli
# npGPSbFFYz/72XD9VJSVLhbNHoQSmFcMK5m/DA4WAMdOc5zF7lP3XdZcj72pDyxu
# 31hJRvuRhxNb09jhEdWRfX5+Jg9UyYXuIvtKXHSWgrtaYDtHBdTXq/ojZlvlo/rr
# 36v0jaVaTNRs7dKQL2oaN+DSMiPXHxBzA6FABqYmJNNwuMJT0kkX8pfz0OFwkRn+
# iqf9uRhM6b/fNNB0+ReA7FfGL+hzU6Uv8AvAL3orXUqjwPMRe9Fz2gE7HpFnE6DD
# dOP4Xk2iSSJ5XQA8HwtvrQfrGPh4gPYE80ziK/+8boy3alVeGYbYbvWVtdsNju41
# Cq9kM1wDyjZf6SSUIAbjOrNPdbhwyK4GviVBR1zh+/gA3uF5MhrDtZh4h3mWX2if
# ijmT9mfte4NwF3K1MvckAl7IHRb8nxmr7wjjhJ26JwpD+76lfAcmXC2YOlFGHCMi
# 028mjvThf3HW7BD2LjlQSX4UkHmM2vUBrgMGQKyeMham1VmMfSK32wzvUNfF7xSz
# o9k0loBh7unGcUsv3EbqUGswV5F6AgjK3vWRkDql8dNrdIoapDfaejPCd58kVM98
# 5N/aEoha4bAeJ6NGIKzD+4saiMxUqJ0y2NjSrE8iO4HszXgZW5e1Gbkn4Ae6d37D
# QSSqyfasVHY=
# =bLuc
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 28 Oct 2022 05:44:16 EDT
# gpg:                using RSA key 1BD9CAAD735C4C3A460DFCCA4DDE10F700FF835A
# gpg:                issuer "david@redhat.com"
# gpg: Good signature from "David Hildenbrand <david@redhat.com>" [unknown]
# gpg:                 aka "David Hildenbrand <davidhildenbrand@gmail.com>" [full]
# gpg:                 aka "David Hildenbrand <hildenbr@in.tum.de>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 1BD9 CAAD 735C 4C3A 460D  FCCA 4DDE 10F7 00FF 835A

* tag 'mem-2022-10-28' of https://github.com/davidhildenbrand/qemu:
  vl: Allow ThreadContext objects to be created before the sandbox option
  hostmem: Allow for specifying a ThreadContext for preallocation
  util: Make qemu_prealloc_mem() optionally consume a ThreadContext
  util: Add write-only "node-affinity" property for ThreadContext
  util: Introduce ThreadContext user-creatable object
  util: Introduce qemu_thread_set_affinity() and qemu_thread_get_affinity()
  util: Cleanup and rename os_mem_prealloc()
  hw/mem/nvdimm: fix error message for 'unarmed' flag

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Stefan Hajnoczi 2022-10-30 18:31:59 -04:00
commit 7208429223
17 changed files with 642 additions and 31 deletions

View file

@ -232,7 +232,8 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
void *ptr = memory_region_get_ram_ptr(&backend->mr);
uint64_t sz = memory_region_size(&backend->mr);
os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, &local_err);
qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
backend->prealloc_context, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
@ -383,8 +384,9 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
* specified NUMA policy in place.
*/
if (backend->prealloc) {
os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
backend->prealloc_threads, &local_err);
qemu_prealloc_mem(memory_region_get_fd(&backend->mr), ptr, sz,
backend->prealloc_threads,
backend->prealloc_context, &local_err);
if (local_err) {
goto out;
}
@ -492,6 +494,11 @@ host_memory_backend_class_init(ObjectClass *oc, void *data)
NULL, NULL);
object_class_property_set_description(oc, "prealloc-threads",
"Number of CPU threads to use for prealloc");
object_class_property_add_link(oc, "prealloc-context",
TYPE_THREAD_CONTEXT, offsetof(HostMemoryBackend, prealloc_context),
object_property_allow_set_link, OBJ_PROP_LINK_STRONG);
object_class_property_set_description(oc, "prealloc-context",
"Context to use for creating CPU threads for preallocation");
object_class_property_add(oc, "size", "int",
host_memory_backend_get_size,
host_memory_backend_set_size,

View file

@ -149,7 +149,7 @@ static void nvdimm_prepare_memory_region(NVDIMMDevice *nvdimm, Error **errp)
if (!nvdimm->unarmed && memory_region_is_rom(mr)) {
HostMemoryBackend *hostmem = dimm->hostmem;
error_setg(errp, "'unarmed' property must be off since memdev %s "
error_setg(errp, "'unarmed' property must be 'on' since memdev %s "
"is read-only",
object_get_canonical_path_component(OBJECT(hostmem)));
return;

View file

@ -467,7 +467,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;
os_mem_prealloc(fd, area, size, 1, &local_err);
qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
if (local_err) {
static bool warned;

View file

@ -576,8 +576,23 @@ unsigned long qemu_getauxval(unsigned long type);
void qemu_set_tty_echo(int fd, bool echo);
void os_mem_prealloc(int fd, char *area, size_t sz, int smp_cpus,
Error **errp);
typedef struct ThreadContext ThreadContext;
/**
 * qemu_prealloc_mem:
 * @fd: the fd mapped into the area, -1 for anonymous memory
 * @area: start address of the area to preallocate
 * @sz: the size of the area to preallocate
 * @max_threads: maximum number of threads to use
 * @tc: optional ThreadContext used for creating the preallocation threads;
 *      may be NULL
 * @errp: returns an error if this function fails
 *
 * Preallocate memory (populate/prefault page tables writable) for the virtual
 * memory area starting at @area with the size of @sz. After a successful call,
 * each page in the area was faulted in writable at least once, for example,
 * after allocating file blocks for mapped files.
 */
void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp);
/**
* qemu_get_pid_name:

View file

@ -0,0 +1,57 @@
/*
* QEMU Thread Context
*
* Copyright Red Hat Inc., 2022
*
* Authors:
* David Hildenbrand <david@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#ifndef SYSEMU_THREAD_CONTEXT_H
#define SYSEMU_THREAD_CONTEXT_H
#include "qapi/qapi-types-machine.h"
#include "qemu/thread.h"
#include "qom/object.h"
#define TYPE_THREAD_CONTEXT "thread-context"
OBJECT_DECLARE_TYPE(ThreadContext, ThreadContextClass,
THREAD_CONTEXT)
/* Class structure of the "thread-context" type; it only embeds ObjectClass. */
struct ThreadContextClass {
ObjectClass parent_class;
};
/*
 * Instance state of a "thread-context" object: one persistent context thread
 * plus the handshake state used to hand commands to it. All fields are
 * private to the implementation.
 */
struct ThreadContext {
/* private */
Object parent;
/*
 * Thread ID of the context thread. Initialized to -1 (stored in an
 * unsigned field) and replaced by the context thread once it is running.
 */
unsigned int thread_id;
/* The context thread itself. */
QemuThread thread;
/* Semaphore to wait for context thread action. */
QemuSemaphore sem;
/* Semaphore to wait for action in context thread. */
QemuSemaphore sem_thread;
/* Mutex to synchronize requests. */
QemuMutex mutex;
/* Commands for the thread to execute. */
int thread_cmd;
/* Argument of the pending command, if it takes one. */
void *thread_cmd_data;
/*
 * CPU affinity bitmap used for initialization: set when an affinity
 * property is written before the context thread exists.
 */
unsigned long *init_cpu_bitmap;
/* Number of valid bits in init_cpu_bitmap. */
int init_cpu_nbits;
};
/**
 * thread_context_create_thread:
 * @tc: the thread context whose context thread creates the new thread
 * @thread: the QemuThread to create
 * @name: passed on to qemu_thread_create()
 * @start_routine: passed on to qemu_thread_create()
 * @arg: passed on to qemu_thread_create()
 * @mode: passed on to qemu_thread_create()
 *
 * Have the context thread of @tc call qemu_thread_create() with the given
 * arguments and wait until it has done so.
 */
void thread_context_create_thread(ThreadContext *tc, QemuThread *thread,
const char *name,
void *(*start_routine)(void *), void *arg,
int mode);
#endif /* SYSEMU_THREAD_CONTEXT_H */

View file

@ -185,6 +185,10 @@ void qemu_event_destroy(QemuEvent *ev);
void qemu_thread_create(QemuThread *thread, const char *name,
void *(*start_routine)(void *),
void *arg, int mode);
int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus,
unsigned long nbits);
int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus,
unsigned long *nbits);
void *qemu_thread_join(QemuThread *thread);
void qemu_thread_get_self(QemuThread *thread);
bool qemu_thread_is_self(QemuThread *thread);

View file

@ -18,6 +18,7 @@
#include "qom/object.h"
#include "exec/memory.h"
#include "qemu/bitmap.h"
#include "qemu/thread-context.h"
#define TYPE_MEMORY_BACKEND "memory-backend"
OBJECT_DECLARE_TYPE(HostMemoryBackend, HostMemoryBackendClass,
@ -66,6 +67,7 @@ struct HostMemoryBackend {
bool merge, dump, use_canonical_path;
bool prealloc, is_mapped, share, reserve;
uint32_t prealloc_threads;
ThreadContext *prealloc_context;
DECLARE_BITMAP(host_nodes, MAX_NODES + 1);
HostMemPolicy policy;

View file

@ -2130,7 +2130,23 @@ config_host_data.set('CONFIG_PTHREAD_CONDATTR_SETCLOCK', cc.links(gnu_source_pre
pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
return 0;
}''', dependencies: threads))
config_host_data.set('CONFIG_PTHREAD_AFFINITY_NP', cc.links(gnu_source_prefix + '''
#include <pthread.h>
static void *f(void *p) { return NULL; }
int main(void)
{
int setsize = CPU_ALLOC_SIZE(64);
pthread_t thread;
cpu_set_t *cpuset;
pthread_create(&thread, 0, f, 0);
cpuset = CPU_ALLOC(64);
CPU_ZERO_S(setsize, cpuset);
pthread_setaffinity_np(thread, setsize, cpuset);
pthread_getaffinity_np(thread, setsize, cpuset);
CPU_FREE(cpuset);
return 0;
}''', dependencies: threads))
config_host_data.set('CONFIG_SIGNALFD', cc.links(gnu_source_prefix + '''
#include <sys/signalfd.h>
#include <stddef.h>

View file

@ -578,6 +578,9 @@
#
# @prealloc-threads: number of CPU threads to use for prealloc (default: 1)
#
# @prealloc-context: thread context to use for creation of preallocation threads
# (default: none) (since 7.2)
#
# @share: if false, the memory is private to QEMU; if true, it is shared
# (default: false)
#
@ -608,6 +611,7 @@
'*policy': 'HostMemPolicy',
'*prealloc': 'bool',
'*prealloc-threads': 'uint32',
'*prealloc-context': 'str',
'*share': 'bool',
'*reserve': 'bool',
'size': 'size',
@ -830,6 +834,28 @@
'reduced-phys-bits': 'uint32',
'*kernel-hashes': 'bool' } }
##
# @ThreadContextProperties:
#
# Properties for thread context objects.
#
# @cpu-affinity: the list of host CPU numbers used as CPU affinity for all
# threads created in the thread context (default: QEMU main
# thread CPU affinity)
#
# @node-affinity: the list of host node numbers that will be resolved to a
# list of host CPU numbers used as CPU affinity. This is a
# shortcut for specifying the list of host CPU numbers
# belonging to the host nodes manually by setting
# @cpu-affinity. (default: QEMU main thread affinity)
#
# Since: 7.2
##
{ 'struct': 'ThreadContextProperties',
'data': { '*cpu-affinity': ['uint16'],
'*node-affinity': ['uint16'] } }
##
# @ObjectType:
#
@ -882,6 +908,7 @@
{ 'name': 'secret_keyring',
'if': 'CONFIG_SECRET_KEYRING' },
'sev-guest',
'thread-context',
's390-pv-guest',
'throttle-group',
'tls-creds-anon',
@ -948,6 +975,7 @@
'secret_keyring': { 'type': 'SecretKeyringProperties',
'if': 'CONFIG_SECRET_KEYRING' },
'sev-guest': 'SevGuestProperties',
'thread-context': 'ThreadContextProperties',
'throttle-group': 'ThrottleGroupProperties',
'tls-creds-anon': 'TlsCredsAnonProperties',
'tls-creds-psk': 'TlsCredsPskProperties',

View file

@ -354,7 +354,7 @@ static void qemu_init_sigbus(void)
/*
* ALERT: when modifying this, take care that SIGBUS forwarding in
* os_mem_prealloc() will continue working as expected.
* qemu_prealloc_mem() will continue working as expected.
*/
memset(&action, 0, sizeof(action));
action.sa_flags = SA_SIGINFO;

View file

@ -1759,6 +1759,27 @@ static void object_option_parse(const char *optarg)
visit_free(v);
}
/*
 * Tell whether objects of @type have to be created very early, that is,
 * before the sandbox options have been activated.
 *
 * Objects should in general not get initialized "too early" without
 * a reason. If you add one, state the reason in a comment!
 */
static bool object_create_pre_sandbox(const char *type)
{
    /*
     * Reason: -sandbox on,resourcecontrol=deny disallows setting CPU
     * affinity of threads.
     */
    return g_str_equal(type, "thread-context");
}
/*
* Initial object creation happens before all other
* QEMU data types are created. The majority of objects
@ -1773,6 +1794,11 @@ static bool object_create_early(const char *type)
* add one, state the reason in a comment!
*/
/* Reason: already created. */
if (object_create_pre_sandbox(type)) {
return false;
}
/* Reason: property "chardev" */
if (g_str_equal(type, "rng-egd") ||
g_str_equal(type, "qtest")) {
@ -1895,7 +1921,7 @@ static void qemu_create_early_backends(void)
*/
static bool object_create_late(const char *type)
{
return !object_create_early(type);
return !object_create_early(type) && !object_create_pre_sandbox(type);
}
static void qemu_create_late_backends(void)
@ -2351,6 +2377,11 @@ static int process_runstate_actions(void *opaque, QemuOpts *opts, Error **errp)
static void qemu_process_early_options(void)
{
qemu_opts_foreach(qemu_find_opts("name"),
parse_name, NULL, &error_fatal);
object_option_foreach_add(object_create_pre_sandbox);
#ifdef CONFIG_SECCOMP
QemuOptsList *olist = qemu_find_opts_err("sandbox", NULL);
if (olist) {
@ -2358,9 +2389,6 @@ static void qemu_process_early_options(void)
}
#endif
qemu_opts_foreach(qemu_find_opts("name"),
parse_name, NULL, &error_fatal);
if (qemu_opts_foreach(qemu_find_opts("action"),
process_runstate_actions, NULL, &error_fatal)) {
exit(1);

View file

@ -1,4 +1,5 @@
util_ss.add(files('osdep.c', 'cutils.c', 'unicode.c', 'qemu-timer-common.c'))
util_ss.add(files('thread-context.c'), numa)
if not config_host_data.get('CONFIG_ATOMIC64')
util_ss.add(files('atomic64.c'))
endif

View file

@ -42,6 +42,7 @@
#include "qemu/cutils.h"
#include "qemu/compiler.h"
#include "qemu/units.h"
#include "qemu/thread-context.h"
#ifdef CONFIG_LINUX
#include <sys/syscall.h>
@ -329,7 +330,7 @@ static void sigbus_handler(int signal)
return;
}
#endif /* CONFIG_LINUX */
warn_report("os_mem_prealloc: unrelated SIGBUS detected and ignored");
warn_report("qemu_prealloc_mem: unrelated SIGBUS detected and ignored");
}
static void *do_touch_pages(void *arg)
@ -399,13 +400,13 @@ static void *do_madv_populate_write_pages(void *arg)
}
static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
int smp_cpus)
int max_threads)
{
long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
int ret = 1;
if (host_procs > 0) {
ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), max_threads);
}
/* Especially with gigantic pages, don't create more threads than pages. */
@ -418,11 +419,12 @@ static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
}
static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
int smp_cpus, bool use_madv_populate_write)
int max_threads, ThreadContext *tc,
bool use_madv_populate_write)
{
static gsize initialized = 0;
MemsetContext context = {
.num_threads = get_memset_num_threads(hpagesize, numpages, smp_cpus),
.num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
};
size_t numpages_per_thread, leftover;
void *(*touch_fn)(void *);
@ -457,9 +459,16 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
context.threads[i].numpages = numpages_per_thread + (i < leftover);
context.threads[i].hpagesize = hpagesize;
context.threads[i].context = &context;
qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
touch_fn, &context.threads[i],
QEMU_THREAD_JOINABLE);
if (tc) {
thread_context_create_thread(tc, &context.threads[i].pgthread,
"touch_pages",
touch_fn, &context.threads[i],
QEMU_THREAD_JOINABLE);
} else {
qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
touch_fn, &context.threads[i],
QEMU_THREAD_JOINABLE);
}
addr += context.threads[i].numpages * hpagesize;
}
@ -494,13 +503,13 @@ static bool madv_populate_write_possible(char *area, size_t pagesize)
errno != EINVAL;
}
void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
Error **errp)
void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp)
{
static gsize initialized;
int ret;
size_t hpagesize = qemu_fd_getpagesize(fd);
size_t numpages = DIV_ROUND_UP(memory, hpagesize);
size_t numpages = DIV_ROUND_UP(sz, hpagesize);
bool use_madv_populate_write;
struct sigaction act;
@ -530,24 +539,24 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
if (ret) {
qemu_mutex_unlock(&sigbus_mutex);
error_setg_errno(errp, errno,
"os_mem_prealloc: failed to install signal handler");
"qemu_prealloc_mem: failed to install signal handler");
return;
}
}
/* touch pages simultaneously */
ret = touch_all_pages(area, hpagesize, numpages, smp_cpus,
ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc,
use_madv_populate_write);
if (ret) {
error_setg_errno(errp, -ret,
"os_mem_prealloc: preallocating memory failed");
"qemu_prealloc_mem: preallocating memory failed");
}
if (!use_madv_populate_write) {
ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
if (ret) {
/* Terminate QEMU since it can't recover from error */
perror("os_mem_prealloc: failed to reinstall signal handler");
perror("qemu_prealloc_mem: failed to reinstall signal handler");
exit(1);
}
qemu_mutex_unlock(&sigbus_mutex);

View file

@ -268,14 +268,14 @@ int getpagesize(void)
return system_info.dwPageSize;
}
void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
Error **errp)
void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp)
{
int i;
size_t pagesize = qemu_real_host_page_size();
memory = (memory + pagesize - 1) & -pagesize;
for (i = 0; i < memory / pagesize; i++) {
sz = (sz + pagesize - 1) & -pagesize;
for (i = 0; i < sz / pagesize; i++) {
memset(area + pagesize * i, 0, 1);
}
}

View file

@ -16,6 +16,7 @@
#include "qemu/notify.h"
#include "qemu-thread-common.h"
#include "qemu/tsan.h"
#include "qemu/bitmap.h"
static bool name_threads;
@ -552,6 +553,75 @@ void qemu_thread_create(QemuThread *thread, const char *name,
pthread_attr_destroy(&attr);
}
/*
 * Set the CPU affinity of @thread to the host CPUs whose bits are set within
 * the first @nbits bits of @host_cpus.
 *
 * Returns the result of pthread_setaffinity_np(), or -ENOSYS when QEMU was
 * built without CONFIG_PTHREAD_AFFINITY_NP.
 */
int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus,
                             unsigned long nbits)
{
#if defined(CONFIG_PTHREAD_AFFINITY_NP)
    const size_t cpuset_size = CPU_ALLOC_SIZE(nbits);
    cpu_set_t *cpuset = CPU_ALLOC(nbits);
    unsigned long cpu;
    int rc;

    g_assert(cpuset);
    CPU_ZERO_S(cpuset_size, cpuset);

    /* Translate every set bit of the bitmap into the dynamic cpu_set_t. */
    for (cpu = find_first_bit(host_cpus, nbits);
         cpu < nbits;
         cpu = find_next_bit(host_cpus, nbits, cpu + 1)) {
        CPU_SET_S(cpu, cpuset_size, cpuset);
    }

    rc = pthread_setaffinity_np(thread->thread, cpuset_size, cpuset);
    CPU_FREE(cpuset);
    return rc;
#else
    return -ENOSYS;
#endif
}
/*
 * Query the CPU affinity of @thread.
 *
 * On success, *@host_cpus is a newly allocated bitmap with one bit set per
 * host CPU the thread may run on and *@nbits is the number of bits in that
 * bitmap; the caller releases the bitmap with g_free().
 *
 * Returns 0 on success, the (positive) error number reported by
 * pthread_getaffinity_np() on failure, or -ENOSYS when QEMU was built
 * without CONFIG_PTHREAD_AFFINITY_NP.
 */
int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus,
                             unsigned long *nbits)
{
#if defined(CONFIG_PTHREAD_AFFINITY_NP)
    unsigned long tmpbits, i;
    cpu_set_t *cpuset;
    size_t setsize;
    int err;

    tmpbits = CPU_SETSIZE;
    while (true) {
        setsize = CPU_ALLOC_SIZE(tmpbits);
        cpuset = CPU_ALLOC(tmpbits);
        g_assert(cpuset);

        err = pthread_getaffinity_np(thread->thread, setsize, cpuset);
        if (!err) {
            break;
        }

        CPU_FREE(cpuset);
        /*
         * pthread functions return positive error numbers. EINVAL means
         * the set was too small for the kernel's affinity mask: retry
         * with a bigger one. Anything else is a real failure.
         */
        if (err != EINVAL) {
            return err;
        }
        tmpbits *= 2;
    }

    /* Convert the result into a proper bitmap. */
    *nbits = tmpbits;
    *host_cpus = bitmap_new(tmpbits);
    for (i = 0; i < tmpbits; i++) {
        /* The set is dynamically sized, so the _S variant is required. */
        if (CPU_ISSET_S(i, setsize, cpuset)) {
            set_bit(i, *host_cpus);
        }
    }
    CPU_FREE(cpuset);
    return 0;
#else
    return -ENOSYS;
#endif
}
void qemu_thread_get_self(QemuThread *thread)
{
thread->thread = pthread_self();

View file

@ -477,6 +477,18 @@ void qemu_thread_create(QemuThread *thread, const char *name,
thread->data = data;
}
/* Setting the thread CPU affinity is not implemented here: always -ENOSYS. */
int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus,
unsigned long nbits)
{
return -ENOSYS;
}
/*
 * Querying the thread CPU affinity is not implemented here: always -ENOSYS,
 * and neither *host_cpus nor *nbits is written.
 */
int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus,
unsigned long *nbits)
{
return -ENOSYS;
}
void qemu_thread_get_self(QemuThread *thread)
{
thread->data = qemu_thread_data;

362
util/thread-context.c Normal file
View file

@ -0,0 +1,362 @@
/*
* QEMU Thread Context
*
* Copyright Red Hat Inc., 2022
*
* Authors:
* David Hildenbrand <david@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "qemu/thread-context.h"
#include "qapi/error.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/visitor.h"
#include "qemu/config-file.h"
#include "qapi/qapi-builtin-visit.h"
#include "qom/object_interfaces.h"
#include "qemu/module.h"
#include "qemu/bitmap.h"
#ifdef CONFIG_NUMA
#include <numa.h>
#endif
/* Commands handed to the context thread via ThreadContext::thread_cmd. */
enum {
/* Nothing to do. */
TC_CMD_NONE = 0,
/* Make the context thread return from its thread function. */
TC_CMD_STOP,
/* Create a new thread as described by a ThreadContextCmdNew. */
TC_CMD_NEW,
};
/*
 * Argument of TC_CMD_NEW: the parameters the context thread passes on to
 * qemu_thread_create().
 */
typedef struct ThreadContextCmdNew {
QemuThread *thread;
const char *name;
void *(*start_routine)(void *);
void *arg;
int mode;
} ThreadContextCmdNew;
/*
 * Thread function of the context thread.
 *
 * Publishes its thread ID in tc->thread_id and posts tc->sem, then loops:
 * execute the command found in tc->thread_cmd, post tc->sem once a command
 * was handled, and sleep on tc->sem_thread until woken up again. Returns
 * NULL when asked to stop.
 */
static void *thread_context_run(void *opaque)
{
ThreadContext *tc = opaque;
/* Let the creator know that we are up and running. */
tc->thread_id = qemu_get_thread_id();
qemu_sem_post(&tc->sem);
while (true) {
/*
 * Threads inherit the CPU affinity of the creating thread. For this
 * reason, we create new (especially short-lived) threads from our
 * persistent context thread.
 *
 * Especially when QEMU is not allowed to set the affinity itself,
 * management tools can simply set the affinity of the context thread
 * after creating the context, to have new threads created via
 * the context inherit the CPU affinity automatically.
 */
switch (tc->thread_cmd) {
case TC_CMD_NONE:
break;
case TC_CMD_STOP:
/* Acknowledge the request and terminate the context thread. */
tc->thread_cmd = TC_CMD_NONE;
qemu_sem_post(&tc->sem);
return NULL;
case TC_CMD_NEW: {
ThreadContextCmdNew *cmd_new = tc->thread_cmd_data;
qemu_thread_create(cmd_new->thread, cmd_new->name,
cmd_new->start_routine, cmd_new->arg,
cmd_new->mode);
/* Mark the command as done before waking up the requester. */
tc->thread_cmd = TC_CMD_NONE;
tc->thread_cmd_data = NULL;
qemu_sem_post(&tc->sem);
break;
}
default:
g_assert_not_reached();
}
/* Sleep until the next command gets posted. */
qemu_sem_wait(&tc->sem_thread);
}
}
/*
 * Setter of the "cpu-affinity" property: takes a list of host CPU numbers.
 *
 * If the context thread already exists, its affinity is changed right away.
 * Otherwise the resulting bitmap is stashed in the object so it can be
 * applied once the thread has been created.
 */
static void thread_context_set_cpu_affinity(Object *obj, Visitor *v,
                                            const char *name, void *opaque,
                                            Error **errp)
{
    ThreadContext *tc = THREAD_CONTEXT(obj);
    uint16List *cpu_list = NULL;
    unsigned long *cpu_mask = NULL;
    Error *local_err = NULL;
    uint16List *entry;
    int num_bits = 0;

    if (tc->init_cpu_bitmap) {
        error_setg(errp, "Mixing CPU and node affinity not supported");
        return;
    }

    visit_type_uint16List(v, name, &cpu_list, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    if (!cpu_list) {
        error_setg(errp, "CPU list is empty");
        goto cleanup;
    }

    /* First pass: the bitmap has to cover the highest CPU number. */
    for (entry = cpu_list; entry; entry = entry->next) {
        int needed = entry->value + 1;

        if (needed > num_bits) {
            num_bits = needed;
        }
    }

    /* Second pass: mark every requested CPU. */
    cpu_mask = bitmap_new(num_bits);
    for (entry = cpu_list; entry; entry = entry->next) {
        set_bit(entry->value, cpu_mask);
    }

    if (tc->thread_id == -1) {
        /* No context thread yet: hand the bitmap over to the object. */
        tc->init_cpu_bitmap = cpu_mask;
        tc->init_cpu_nbits = num_bits;
        cpu_mask = NULL;
    } else {
        /*
         * Note: we won't be adjusting the affinity of any thread that is still
         * around, but only the affinity of the context thread.
         */
        int rc = qemu_thread_set_affinity(&tc->thread, cpu_mask, num_bits);

        if (rc) {
            error_setg(errp, "Setting CPU affinity failed: %s", strerror(rc));
        }
    }

cleanup:
    g_free(cpu_mask);
    qapi_free_uint16List(cpu_list);
}
/*
 * Getter of the "cpu-affinity" property: reports the current CPU affinity
 * of the context thread as a list of host CPU numbers. Fails while the
 * context thread does not exist yet.
 */
static void thread_context_get_cpu_affinity(Object *obj, Visitor *v,
                                            const char *name, void *opaque,
                                            Error **errp)
{
    ThreadContext *tc = THREAD_CONTEXT(obj);
    unsigned long *cpu_mask, num_bits, cpu;
    uint16List *cpu_list = NULL;
    uint16List **next_slot = &cpu_list;
    int rc;

    if (tc->thread_id == -1) {
        error_setg(errp, "Object not initialized yet");
        return;
    }

    rc = qemu_thread_get_affinity(&tc->thread, &cpu_mask, &num_bits);
    if (rc) {
        error_setg(errp, "Getting CPU affinity failed: %s", strerror(rc));
        return;
    }

    /* Turn every set bit into one list element, in ascending order. */
    for (cpu = find_first_bit(cpu_mask, num_bits);
         cpu < num_bits;
         cpu = find_next_bit(cpu_mask, num_bits, cpu + 1)) {
        QAPI_LIST_APPEND(next_slot, cpu);
    }
    g_free(cpu_mask);

    visit_type_uint16List(v, name, &cpu_list, errp);
    qapi_free_uint16List(cpu_list);
}
/*
 * Setter of the write-only "node-affinity" property: takes a list of host
 * node numbers and resolves them to the host CPUs belonging to these nodes.
 *
 * If the context thread already exists, its affinity is changed right away.
 * Otherwise the resulting bitmap is stashed in the object so it can be
 * applied once the thread has been created. Without CONFIG_NUMA, setting
 * the property always fails.
 */
static void thread_context_set_node_affinity(Object *obj, Visitor *v,
                                             const char *name, void *opaque,
                                             Error **errp)
{
#ifdef CONFIG_NUMA
    const int num_bits = numa_num_possible_cpus();
    ThreadContext *tc = THREAD_CONTEXT(obj);
    uint16List *node_list = NULL;
    unsigned long *cpu_mask = NULL;
    struct bitmask *node_cpus;
    Error *local_err = NULL;
    uint16List *entry;
    int cpu, rc;

    if (tc->init_cpu_bitmap) {
        error_setg(errp, "Mixing CPU and node affinity not supported");
        return;
    }

    visit_type_uint16List(v, name, &node_list, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    if (!node_list) {
        error_setg(errp, "Node list is empty");
        goto cleanup;
    }

    /* Collect the CPUs of all requested nodes in a single bitmap. */
    cpu_mask = bitmap_new(num_bits);
    node_cpus = numa_allocate_cpumask();
    for (entry = node_list; entry; entry = entry->next) {
        numa_bitmask_clearall(node_cpus);
        if (numa_node_to_cpus(entry->value, node_cpus)) {
            /* We ignore any errors, such as impossible nodes. */
            continue;
        }
        for (cpu = 0; cpu < num_bits; cpu++) {
            if (numa_bitmask_isbitset(node_cpus, cpu)) {
                set_bit(cpu, cpu_mask);
            }
        }
    }
    numa_free_cpumask(node_cpus);

    if (bitmap_empty(cpu_mask, num_bits)) {
        error_setg(errp, "The nodes select no CPUs");
        goto cleanup;
    }

    if (tc->thread_id == -1) {
        /* No context thread yet: hand the bitmap over to the object. */
        tc->init_cpu_bitmap = cpu_mask;
        tc->init_cpu_nbits = num_bits;
        cpu_mask = NULL;
    } else {
        /*
         * Note: we won't be adjusting the affinity of any thread that is still
         * around for now, but only the affinity of the context thread.
         */
        rc = qemu_thread_set_affinity(&tc->thread, cpu_mask, num_bits);
        if (rc) {
            error_setg(errp, "Setting CPU affinity failed: %s", strerror(rc));
        }
    }

cleanup:
    g_free(cpu_mask);
    qapi_free_uint16List(node_list);
#else
    error_setg(errp, "NUMA node affinity is not supported by this QEMU");
#endif
}
/*
 * Getter of the read-only "thread-id" property: reports the stored thread ID
 * of the context thread as a 64-bit unsigned integer.
 */
static void thread_context_get_thread_id(Object *obj, Visitor *v,
                                         const char *name, void *opaque,
                                         Error **errp)
{
    uint64_t tid = THREAD_CONTEXT(obj)->thread_id;

    visit_type_uint64(v, name, &tid, errp);
}
/*
 * UserCreatable complete handler: start the context thread, wait until it
 * has published its thread ID, and apply a CPU affinity that was requested
 * via a property before the thread existed.
 */
static void thread_context_instance_complete(UserCreatable *uc, Error **errp)
{
    ThreadContext *tc = THREAD_CONTEXT(uc);
    char *tname = g_strdup_printf("TC %s",
                      object_get_canonical_path_component(OBJECT(uc)));
    int rc;

    qemu_thread_create(&tc->thread, tname, thread_context_run, tc,
                       QEMU_THREAD_JOINABLE);
    g_free(tname);

    /* Wait until initialization of the thread is done. */
    while (tc->thread_id == -1) {
        qemu_sem_wait(&tc->sem);
    }

    if (!tc->init_cpu_bitmap) {
        return;
    }

    /* An affinity was stashed by a property setter: apply and drop it. */
    rc = qemu_thread_set_affinity(&tc->thread, tc->init_cpu_bitmap,
                                  tc->init_cpu_nbits);
    if (rc) {
        error_setg(errp, "Setting CPU affinity failed: %s", strerror(rc));
    }
    g_free(tc->init_cpu_bitmap);
    tc->init_cpu_bitmap = NULL;
}
/*
 * Class initializer: hooks up the UserCreatable complete handler and
 * registers the class properties.
 */
static void thread_context_class_init(ObjectClass *oc, void *data)
{
UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
ucc->complete = thread_context_instance_complete;
/* "thread-id" has a getter only. */
object_class_property_add(oc, "thread-id", "int",
thread_context_get_thread_id, NULL, NULL,
NULL);
/* "cpu-affinity" has both a getter and a setter. */
object_class_property_add(oc, "cpu-affinity", "int",
thread_context_get_cpu_affinity,
thread_context_set_cpu_affinity, NULL, NULL);
/* "node-affinity" has a setter only. */
object_class_property_add(oc, "node-affinity", "int", NULL,
thread_context_set_node_affinity, NULL, NULL);
}
/*
 * Instance initializer: set up the synchronization primitives and mark the
 * context thread as not created yet (thread ID -1).
 */
static void thread_context_instance_init(Object *obj)
{
    ThreadContext *tc = THREAD_CONTEXT(obj);

    qemu_mutex_init(&tc->mutex);
    qemu_sem_init(&tc->sem_thread, 0);
    qemu_sem_init(&tc->sem, 0);

    /* The context thread only gets created on completion. */
    tc->thread_id = -1;
}
/*
 * Instance finalizer: stop and join the context thread if it was started,
 * then release everything the object still owns.
 */
static void thread_context_instance_finalize(Object *obj)
{
    ThreadContext *tc = THREAD_CONTEXT(obj);

    if (tc->thread_id != -1) {
        /* Ask the context thread to exit and wait for it. */
        tc->thread_cmd = TC_CMD_STOP;
        qemu_sem_post(&tc->sem_thread);
        qemu_thread_join(&tc->thread);
    }

    /*
     * An affinity bitmap stashed by a property setter is normally consumed
     * on completion. If the object gets destroyed before completing (e.g.,
     * because setting a later property failed), it is still around and
     * would be leaked.
     */
    g_free(tc->init_cpu_bitmap);
    tc->init_cpu_bitmap = NULL;

    qemu_sem_destroy(&tc->sem);
    qemu_sem_destroy(&tc->sem_thread);
    qemu_mutex_destroy(&tc->mutex);
}
/*
 * QOM type description of "thread-context": derives from TYPE_OBJECT and
 * implements the TYPE_USER_CREATABLE interface.
 */
static const TypeInfo thread_context_info = {
.name = TYPE_THREAD_CONTEXT,
.parent = TYPE_OBJECT,
.class_init = thread_context_class_init,
.instance_size = sizeof(ThreadContext),
.instance_init = thread_context_instance_init,
.instance_finalize = thread_context_instance_finalize,
.interfaces = (InterfaceInfo[]) {
{ TYPE_USER_CREATABLE },
{ }
}
};
/* Register the "thread-context" type; hooked up via type_init() below. */
static void thread_context_register_types(void)
{
type_register_static(&thread_context_info);
}
type_init(thread_context_register_types)
/*
 * Create a new thread from within the context thread of @tc.
 *
 * The request is handed to the context thread, which calls
 * qemu_thread_create() with @thread, @name, @start_routine, @arg and @mode.
 * This function only returns after the context thread has marked the
 * request as done. Requests are serialized via tc->mutex.
 */
void thread_context_create_thread(ThreadContext *tc, QemuThread *thread,
const char *name,
void *(*start_routine)(void *), void *arg,
int mode)
{
/* On-stack request; we don't return before the command was processed. */
ThreadContextCmdNew data = {
.thread = thread,
.name = name,
.start_routine = start_routine,
.arg = arg,
.mode = mode,
};
qemu_mutex_lock(&tc->mutex);
/* Publish the command, then wake up the context thread. */
tc->thread_cmd = TC_CMD_NEW;
tc->thread_cmd_data = &data;
qemu_sem_post(&tc->sem_thread);
/* The context thread resets the command before posting tc->sem. */
while (tc->thread_cmd != TC_CMD_NONE) {
qemu_sem_wait(&tc->sem);
}
qemu_mutex_unlock(&tc->mutex);
}