From f67fde6ef4c993b6e716ac008475f935c883849c Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:46:14 +0800 Subject: [PATCH 01/59] Revert "Upload README.md" This reverts commit 718a3aa431361f62dee56b3cb12827f353b01c7b. --- README.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 4a7cd21638cd..000000000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -已停更,随缘更新 \ No newline at end of file From 70d980f59882330d75e6ac36173aa30552b10423 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:55:08 +0800 Subject: [PATCH 02/59] =?UTF-8?q?Revert=20"arm64/configs:=20=E5=90=AF?= =?UTF-8?q?=E7=94=A8REKERNEL"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 4c08d91ac1a15808e5e731e8b76e791c422b39e7. --- arch/arm64/configs/vendor/xiaomi/mi845_defconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig index 3768440716ca..0669b3be30e3 100644 --- a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig +++ b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig @@ -634,5 +634,3 @@ CONFIG_SND_SOC_WCD_MBHC_ADC=y CONFIG_SND_SOC_WCD_SPI=y CONFIG_SOUNDWIRE=y CONFIG_WCD_SPI_AC=y -CONFIG_REKERNEL=y -CONFIG_REKERNEL_NETWORK=y From d9d653e9cb563dbebff30d3d5c65ba560dc6d85b Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:55:26 +0800 Subject: [PATCH 03/59] Revert "kernel,KernelSU: Add manual hook support" This reverts commit fc05904517810962d3390a13185f9a4a1a300a1b. 
--- drivers/input/input.c | 8 -------- fs/exec.c | 14 -------------- fs/open.c | 7 ------- fs/read_write.c | 9 --------- fs/stat.c | 6 ------ kernel/reboot.c | 7 ------- security/selinux/hooks.c | 9 --------- 7 files changed, 60 deletions(-) diff --git a/drivers/input/input.c b/drivers/input/input.c index 6cbdf2737004..378717d1b3b4 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -377,19 +377,11 @@ static int input_get_disposition(struct input_dev *dev, return disposition; } -#ifdef CONFIG_KSU_MANUAL_HOOK -extern bool ksu_input_hook __read_mostly; -extern int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code, int *value); -#endif static void input_handle_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { int disposition = input_get_disposition(dev, type, code, &value); -#ifdef CONFIG_KSU_MANUAL_HOOK - if (unlikely(ksu_input_hook)) - ksu_handle_input_handle_event(&type, &code, &value); -#endif if (disposition != INPUT_IGNORE_EVENT && type != EV_SYN) add_input_randomness(type, code, value); diff --git a/fs/exec.c b/fs/exec.c index c2530fed584d..5addf3b00561 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1676,13 +1676,6 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. 
*/ -#ifdef CONFIG_KSU_MANUAL_HOOK -extern bool ksu_execveat_hook __read_mostly; -extern int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv, - void *envp, int *flags); -extern int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr, - void *argv, void *envp, int *flags); -#endif static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, @@ -1694,13 +1687,6 @@ static int do_execveat_common(int fd, struct filename *filename, struct files_struct *displaced; int retval; -#ifdef CONFIG_KSU_MANUAL_HOOK - if (unlikely(ksu_execveat_hook)) - ksu_handle_execveat(&fd, &filename, &argv, &envp, &flags); - else - ksu_handle_execveat_sucompat(&fd, &filename, &argv, &envp, &flags); -#endif - if (IS_ERR(filename)) return PTR_ERR(filename); diff --git a/fs/open.c b/fs/open.c index 66fadbdfd17a..f2b82c462fbb 100644 --- a/fs/open.c +++ b/fs/open.c @@ -360,10 +360,6 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. */ -#ifdef CONFIG_KSU_MANUAL_HOOK -extern int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, - int *flags); -#endif SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { const struct cred *old_cred; @@ -373,9 +369,6 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) struct vfsmount *mnt; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; -#ifdef CONFIG_KSU_MANUAL_HOOK - ksu_handle_faccessat(&dfd, &filename, &mode, NULL); -#endif if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ return -EINVAL; diff --git a/fs/read_write.c b/fs/read_write.c index 4f892b7649d5..901231269242 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -456,19 +456,10 @@ ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, } EXPORT_SYMBOL(__vfs_read); -#ifdef CONFIG_KSU_MANUAL_HOOK -extern bool ksu_vfs_read_hook __read_mostly; -extern int ksu_handle_vfs_read(struct file **file_ptr, char __user **buf_ptr, - size_t *count_ptr, loff_t **pos); -#endif ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; -#ifdef CONFIG_KSU_MANUAL_HOOK - if (unlikely(ksu_vfs_read_hook)) - ksu_handle_vfs_read(&file, &buf, &count, &pos); -#endif if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) diff --git a/fs/stat.c b/fs/stat.c index 0d099fff8b82..068fdbcc9e26 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -87,9 +87,6 @@ int vfs_fstat(unsigned int fd, struct kstat *stat) } EXPORT_SYMBOL(vfs_fstat); -#ifdef CONFIG_KSU_MANUAL_HOOK -extern int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags); -#endif int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flag) { @@ -97,9 +94,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; unsigned int lookup_flags = 0; -#ifdef CONFIG_KSU_MANUAL_HOOK - ksu_handle_stat(&dfd, &filename, &flag); -#endif if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH)) != 0) goto out; diff --git a/kernel/reboot.c b/kernel/reboot.c index 48445cb61e8f..2946ed1d99d4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -277,9 +277,6 @@ static DEFINE_MUTEX(reboot_mutex); * * reboot doesn't sync: do that yourself before calling this. 
*/ -#ifdef CONFIG_KSU_MANUAL_HOOK -extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg); -#endif SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { @@ -287,10 +284,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, char buffer[256]; int ret = 0; -#ifdef CONFIG_KSU_MANUAL_HOOK - ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); -#endif - /* We only trust the superuser with rebooting the system. */ if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) return -EPERM; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 7e1caf9ee106..4abba0e1674d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2314,10 +2314,6 @@ static u32 ptrace_parent_sid(struct task_struct *task) return sid; } -#ifdef CONFIG_KSU_MANUAL_HOOK -extern bool is_ksu_transition(const struct task_security_struct *old_tsec, - const struct task_security_struct *new_tsec); -#endif static int check_nnp_nosuid(const struct linux_binprm *bprm, const struct task_security_struct *old_tsec, const struct task_security_struct *new_tsec) @@ -2332,11 +2328,6 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, if (new_tsec->sid == old_tsec->sid) return 0; /* No change in credentials */ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (is_ksu_transition(old_tsec, new_tsec)) - return 0; -#endif - /* * The only transitions we permit under NNP or nosuid * are transitions to bounded SIDs, i.e. SIDs that are From 8b106f1dacd61c74a95bdb687877dd23856e30a9 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:55:51 +0800 Subject: [PATCH 04/59] =?UTF-8?q?Revert=20"kernel:=20=E5=BC=95=E5=85=A5Ker?= =?UTF-8?q?nelSU32430"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 8c981a6e763acaeaccafc274cc2a467f10b63bac. 
--- drivers/Kconfig | 2 - drivers/Makefile | 1 - drivers/kernelsu/Kbuild | 26 - drivers/kernelsu/Kconfig | 48 - drivers/kernelsu/LICENSE | 339 -------- drivers/kernelsu/allowlist.c | 576 ------------ drivers/kernelsu/allowlist.h | 53 -- drivers/kernelsu/apk_sign.c | 367 -------- drivers/kernelsu/apk_sign.h | 10 - drivers/kernelsu/app_profile.c | 206 ----- drivers/kernelsu/app_profile.h | 68 -- drivers/kernelsu/arch.h | 71 -- drivers/kernelsu/feature.c | 176 ---- drivers/kernelsu/feature.h | 35 - drivers/kernelsu/file_wrapper.c | 690 --------------- drivers/kernelsu/file_wrapper.h | 10 - drivers/kernelsu/kernel_compat.c | 199 ----- drivers/kernelsu/kernel_compat.h | 62 -- drivers/kernelsu/kernel_umount.c | 190 ---- drivers/kernelsu/kernel_umount.h | 25 - drivers/kernelsu/klog.h | 11 - drivers/kernelsu/kp_hook.c | 167 ---- drivers/kernelsu/kp_hook.h | 25 - drivers/kernelsu/kp_util.c | 120 --- drivers/kernelsu/kp_util.h | 24 - drivers/kernelsu/ksu.h | 30 - drivers/kernelsu/ksud.c | 644 -------------- drivers/kernelsu/ksud.h | 39 - drivers/kernelsu/ksuinit.c | 140 --- drivers/kernelsu/lsm_hook.c | 117 --- drivers/kernelsu/manager.h | 38 - drivers/kernelsu/manager_sign.h | 16 - drivers/kernelsu/pkg_observer.c | 126 --- drivers/kernelsu/selinux/rules.c | 495 ----------- drivers/kernelsu/selinux/selinux.c | 204 ----- drivers/kernelsu/selinux/selinux.h | 43 - drivers/kernelsu/selinux/selinux_defs.h | 93 -- drivers/kernelsu/selinux/sepolicy.c | 1062 ----------------------- drivers/kernelsu/selinux/sepolicy.h | 46 - drivers/kernelsu/setuid_hook.c | 112 --- drivers/kernelsu/setuid_hook.h | 12 - drivers/kernelsu/shim.c | 36 - drivers/kernelsu/su_mount_ns.c | 270 ------ drivers/kernelsu/su_mount_ns.h | 10 - drivers/kernelsu/sucompat.c | 217 ----- drivers/kernelsu/sucompat.h | 17 - drivers/kernelsu/supercalls.c | 847 ------------------ drivers/kernelsu/supercalls.h | 152 ---- drivers/kernelsu/syscall_handler.c | 374 -------- drivers/kernelsu/syscall_handler.h | 40 - 
drivers/kernelsu/throne_tracker.c | 389 --------- drivers/kernelsu/throne_tracker.h | 10 - 52 files changed, 9080 deletions(-) delete mode 100644 drivers/kernelsu/Kbuild delete mode 100644 drivers/kernelsu/Kconfig delete mode 100644 drivers/kernelsu/LICENSE delete mode 100644 drivers/kernelsu/allowlist.c delete mode 100644 drivers/kernelsu/allowlist.h delete mode 100644 drivers/kernelsu/apk_sign.c delete mode 100644 drivers/kernelsu/apk_sign.h delete mode 100644 drivers/kernelsu/app_profile.c delete mode 100644 drivers/kernelsu/app_profile.h delete mode 100644 drivers/kernelsu/arch.h delete mode 100644 drivers/kernelsu/feature.c delete mode 100644 drivers/kernelsu/feature.h delete mode 100644 drivers/kernelsu/file_wrapper.c delete mode 100644 drivers/kernelsu/file_wrapper.h delete mode 100644 drivers/kernelsu/kernel_compat.c delete mode 100644 drivers/kernelsu/kernel_compat.h delete mode 100644 drivers/kernelsu/kernel_umount.c delete mode 100644 drivers/kernelsu/kernel_umount.h delete mode 100644 drivers/kernelsu/klog.h delete mode 100644 drivers/kernelsu/kp_hook.c delete mode 100644 drivers/kernelsu/kp_hook.h delete mode 100644 drivers/kernelsu/kp_util.c delete mode 100644 drivers/kernelsu/kp_util.h delete mode 100644 drivers/kernelsu/ksu.h delete mode 100644 drivers/kernelsu/ksud.c delete mode 100644 drivers/kernelsu/ksud.h delete mode 100644 drivers/kernelsu/ksuinit.c delete mode 100644 drivers/kernelsu/lsm_hook.c delete mode 100644 drivers/kernelsu/manager.h delete mode 100644 drivers/kernelsu/manager_sign.h delete mode 100644 drivers/kernelsu/pkg_observer.c delete mode 100644 drivers/kernelsu/selinux/rules.c delete mode 100644 drivers/kernelsu/selinux/selinux.c delete mode 100644 drivers/kernelsu/selinux/selinux.h delete mode 100644 drivers/kernelsu/selinux/selinux_defs.h delete mode 100644 drivers/kernelsu/selinux/sepolicy.c delete mode 100644 drivers/kernelsu/selinux/sepolicy.h delete mode 100644 drivers/kernelsu/setuid_hook.c delete mode 100644 
drivers/kernelsu/setuid_hook.h delete mode 100644 drivers/kernelsu/shim.c delete mode 100644 drivers/kernelsu/su_mount_ns.c delete mode 100644 drivers/kernelsu/su_mount_ns.h delete mode 100644 drivers/kernelsu/sucompat.c delete mode 100644 drivers/kernelsu/sucompat.h delete mode 100644 drivers/kernelsu/supercalls.c delete mode 100644 drivers/kernelsu/supercalls.h delete mode 100644 drivers/kernelsu/syscall_handler.c delete mode 100644 drivers/kernelsu/syscall_handler.h delete mode 100644 drivers/kernelsu/throne_tracker.c delete mode 100644 drivers/kernelsu/throne_tracker.h diff --git a/drivers/Kconfig b/drivers/Kconfig index c89e0f383be6..38fc2a3f5c4d 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -218,6 +218,4 @@ source "drivers/halls/Kconfig" source "drivers/rekernel/Kconfig" -source "drivers/kernelsu/Kconfig" - endmenu diff --git a/drivers/Makefile b/drivers/Makefile index f691364e80c8..4e0bcc899926 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -184,4 +184,3 @@ obj-$(CONFIG_TEE) += tee/ obj-$(CONFIG_BCM_GPS_SPI_DRIVER) += gps/ obj-$(CONFIG_HALLS) += halls/ obj-$(CONFIG_REKERNEL) += rekernel/ -obj-$(CONFIG_KSU) += kernelsu/ diff --git a/drivers/kernelsu/Kbuild b/drivers/kernelsu/Kbuild deleted file mode 100644 index 800da52d0892..000000000000 --- a/drivers/kernelsu/Kbuild +++ /dev/null @@ -1,26 +0,0 @@ -obj-y += ksuinit.o -obj-y += allowlist.o -obj-y += app_profile.o -obj-y += apk_sign.o -obj-y += sucompat.o -obj-y += throne_tracker.o -obj-y += setuid_hook.o -obj-y += kernel_compat.o -obj-y += kernel_umount.o -obj-y += supercalls.o -obj-y += feature.o -obj-y += ksud.o -obj-y += file_wrapper.o -obj-y += su_mount_ns.o -obj-y += shim.o -obj-y += selinux/selinux.o -obj-y += selinux/sepolicy.o -obj-y += selinux/rules.o - -ccflags-y += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include -ccflags-y += -I$(objtree)/security/selinux -include $(srctree)/include/uapi/asm-generic/errno.h - -ccflags-y += -Wno-strict-prototypes 
-Wno-int-conversion -Wno-gcc-compat -ccflags-y += -Wno-declaration-after-statement -Wno-unused-function -Wno-missing-prototypes - -# Keep a new line here !! Because someone may append config diff --git a/drivers/kernelsu/Kconfig b/drivers/kernelsu/Kconfig deleted file mode 100644 index 8464a6c4ca4b..000000000000 --- a/drivers/kernelsu/Kconfig +++ /dev/null @@ -1,48 +0,0 @@ -menu "KernelSU" - -config KSU - tristate "KernelSU function support" - default y - help - Enable kernel-level root privileges on Android System. - To compile as a module, choose M here: the - module will be called kernelsu. - -config KSU_DEBUG - bool "KernelSU debug mode" - depends on KSU - default n - help - Enable KernelSU debug mode. - -config KSU_ALLOWLIST_WORKAROUND - bool "KernelSU allowlist workaround" - depends on KSU - default n - help - Enable workaround for broken allowlist save - -choice - prompt "KernelSU hooks" - default KSU_MANUAL_HOOK if !KPROBES - default KSU_SYSCALL_HOOK if KPROBES && KRETPROBES && HAVE_SYSCALL_TRACEPOINTS - help - KernelSU core hooks. - -config KSU_MANUAL_HOOK - bool "KernelSU manual hook mode." - depends on KSU && KSU != m - help - Enable manual hook support. - -config KSU_SYSCALL_HOOK - bool "KernelSU syscall hook mode." - depends on KSU - depends on KPROBES && KRETPROBES && HAVE_SYSCALL_TRACEPOINTS - help - Enable KPROBES, KRETPROBES and TRACEPOINT hook for KernelSU core. - This should not be used on kernel below 5.10. - -endchoice - -endmenu diff --git a/drivers/kernelsu/LICENSE b/drivers/kernelsu/LICENSE deleted file mode 100644 index d159169d1050..000000000000 --- a/drivers/kernelsu/LICENSE +++ /dev/null @@ -1,339 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. 
- - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Lesser General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. 
If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. 
You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. 
If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. 
(This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. 
Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. 
- -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. 
If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. 
- - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. 
Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. diff --git a/drivers/kernelsu/allowlist.c b/drivers/kernelsu/allowlist.c deleted file mode 100644 index 9152b7174b6c..000000000000 --- a/drivers/kernelsu/allowlist.c +++ /dev/null @@ -1,576 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -#include -#else -#include -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) -#include -#endif - -#include "klog.h" // IWYU pragma: keep -#include "ksud.h" -#include "selinux/selinux.h" -#include "allowlist.h" -#include "manager.h" -#include "kernel_compat.h" -#include "su_mount_ns.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "syscall_handler.h" -#endif - -#define FILE_MAGIC 0x7f4b5355 // ' KSU', u32 -#define FILE_FORMAT_VERSION 3 // u32 - -#define KSU_APP_PROFILE_PRESERVE_UID 9999 // NOBODY_UID -#define KSU_DEFAULT_SELINUX_DOMAIN "u:r:" KERNEL_SU_DOMAIN ":s0" - -static DEFINE_MUTEX(allowlist_mutex); - -// default profiles, these may be used frequently, so we cache it -static struct root_profile 
default_root_profile; -static struct non_root_profile default_non_root_profile; - -static int allow_list_arr[PAGE_SIZE / sizeof(int)] __read_mostly - __aligned(PAGE_SIZE); -static int allow_list_pointer __read_mostly = 0; - -static void remove_uid_from_arr(uid_t uid) -{ - int *temp_arr; - int i, j; - - if (allow_list_pointer == 0) - return; - - temp_arr = kzalloc(sizeof(allow_list_arr), GFP_KERNEL); - if (temp_arr == NULL) { - pr_err("%s: unable to allocate memory\n", __func__); - return; - } - - for (i = j = 0; i < allow_list_pointer; i++) { - if (allow_list_arr[i] == uid) - continue; - temp_arr[j++] = allow_list_arr[i]; - } - - allow_list_pointer = j; - - for (; j < ARRAY_SIZE(allow_list_arr); j++) - temp_arr[j] = -1; - - memcpy(&allow_list_arr, temp_arr, PAGE_SIZE); - kfree(temp_arr); -} - -static void init_default_profiles(void) -{ - kernel_cap_t full_cap = CAP_FULL_SET; - - default_root_profile.uid = 0; - default_root_profile.gid = 0; - default_root_profile.groups_count = 1; - default_root_profile.groups[0] = 0; - memcpy(&default_root_profile.capabilities.effective, &full_cap, - sizeof(default_root_profile.capabilities.effective)); - default_root_profile.namespaces = KSU_NS_INHERITED; - strcpy(default_root_profile.selinux_domain, KSU_DEFAULT_SELINUX_DOMAIN); - - // This means that we will umount modules by default! 
- default_non_root_profile.umount_modules = true; -} - -struct perm_data { - struct list_head list; - struct app_profile profile; -}; - -static struct list_head allow_list; - -static uint8_t allow_list_bitmap[PAGE_SIZE] __read_mostly __aligned(PAGE_SIZE); -#define BITMAP_UID_MAX ((sizeof(allow_list_bitmap) * BITS_PER_BYTE) - 1) - -#define KERNEL_SU_ALLOWLIST "/data/adb/ksu/.allowlist" - -void persistent_allow_list(void); - -void ksu_show_allow_list(void) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - pr_info("ksu_show_allow_list\n"); - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - pr_info("uid :%d, allow: %d\n", p->profile.current_uid, - p->profile.allow_su); - } -} - -#ifdef CONFIG_KSU_DEBUG -static void ksu_grant_root_to_shell(void) -{ - struct app_profile profile = { - .version = KSU_APP_PROFILE_VER, - .allow_su = true, - .current_uid = 2000, - }; - strcpy(profile.key, "com.android.shell"); - strcpy(profile.rp_config.profile.selinux_domain, - KSU_DEFAULT_SELINUX_DOMAIN); - ksu_set_app_profile(&profile, false); -} -#endif - -bool ksu_get_app_profile(struct app_profile *profile) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - bool found = false; - - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - bool uid_match = profile->current_uid == p->profile.current_uid; - if (uid_match) { - // found it, override it with ours - memcpy(profile, &p->profile, sizeof(*profile)); - found = true; - goto exit; - } - } - -exit: - return found; -} - -static inline bool forbid_system_uid(uid_t uid) -{ -#define SHELL_UID 2000 -#define SYSTEM_UID 1000 - return uid < SHELL_UID && uid != SYSTEM_UID; -} - -static bool profile_valid(struct app_profile *profile) -{ - if (!profile) { - return false; - } - - if (profile->version < KSU_APP_PROFILE_VER) { - pr_info("Unsupported profile version: %d\n", profile->version); - return false; - } - - if (profile->allow_su) { - if 
(profile->rp_config.profile.groups_count > KSU_MAX_GROUPS) { - return false; - } - - if (strlen(profile->rp_config.profile.selinux_domain) == 0) { - return false; - } - } - - return true; -} - -bool ksu_set_app_profile(struct app_profile *profile, bool persist) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - bool result = false; - - if (!profile_valid(profile)) { - pr_err("Failed to set app profile: invalid profile!\n"); - return false; - } - - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - // both uid and package must match, otherwise it will break multiple package with different user id - if (profile->current_uid == p->profile.current_uid && - !strcmp(profile->key, p->profile.key)) { - // found it, just override it all! - memcpy(&p->profile, profile, sizeof(*profile)); - result = true; - goto out; - } - } - - // not found, alloc a new node! - p = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL); - if (!p) { - pr_err("ksu_set_app_profile alloc failed\n"); - return false; - } - - memcpy(&p->profile, profile, sizeof(*profile)); - if (profile->allow_su) { - pr_info("set root profile, key: %s, uid: %d, gid: %d, context: %s\n", - profile->key, profile->current_uid, - profile->rp_config.profile.gid, - profile->rp_config.profile.selinux_domain); - } else { - pr_info("set app profile, key: %s, uid: %d, umount modules: %d\n", - profile->key, profile->current_uid, - profile->nrp_config.profile.umount_modules); - } - list_add_tail(&p->list, &allow_list); - -out: - if (profile->current_uid <= BITMAP_UID_MAX) { - if (profile->allow_su) - allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] |= - 1 << (profile->current_uid % BITS_PER_BYTE); - else - allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] &= - ~(1 << (profile->current_uid % BITS_PER_BYTE)); - } else { - if (profile->allow_su) { - /* - * 1024 apps with uid higher than BITMAP_UID_MAX - * registered to request superuser? 
- */ - if (allow_list_pointer >= ARRAY_SIZE(allow_list_arr)) { - pr_err("too many apps registered\n"); - WARN_ON(1); - return false; - } - allow_list_arr[allow_list_pointer++] = - profile->current_uid; - } else { - remove_uid_from_arr(profile->current_uid); - } - } - result = true; - - // check if the default profiles is changed, cache it to a single struct to accelerate access. - if (unlikely(!strcmp(profile->key, "$"))) { - // set default non root profile - memcpy(&default_non_root_profile, &profile->nrp_config.profile, - sizeof(default_non_root_profile)); - } - - if (unlikely(!strcmp(profile->key, "#"))) { - // set default root profile - memcpy(&default_root_profile, &profile->rp_config.profile, - sizeof(default_root_profile)); - } - - if (persist) { - persistent_allow_list(); -#ifdef CONFIG_KSU_SYSCALL_HOOK - // FIXME: use a new flag - ksu_mark_running_process(); -#endif - } - - return result; -} - -bool __ksu_is_allow_uid(uid_t uid) -{ - int i; - - if (forbid_system_uid(uid)) { - // do not bother going through the list if it's system - return false; - } - - if (likely(ksu_is_manager_appid_valid()) && - unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { - // manager is always allowed! - return true; - } - - if (likely(uid <= BITMAP_UID_MAX)) { - return !!(allow_list_bitmap[uid / BITS_PER_BYTE] & - (1 << (uid % BITS_PER_BYTE))); - } else { - for (i = 0; i < allow_list_pointer; i++) { - if (allow_list_arr[i] == uid) - return true; - } - } - - return false; -} - -bool __ksu_is_allow_uid_for_current(uid_t uid) -{ - if (unlikely(uid == 0)) { - // already root, but only allow our domain. - return is_ksu_domain(); - } - return __ksu_is_allow_uid(uid); -} - -bool ksu_uid_should_umount(uid_t uid) -{ - struct app_profile profile = { .current_uid = uid }; - - if (likely(ksu_is_manager_appid_valid()) && - unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { - // we should not umount on manager! 
- return false; - } - - bool found = ksu_get_app_profile(&profile); - if (!found) { - // no app profile found, it must be non root app - return default_non_root_profile.umount_modules; - } - if (profile.allow_su) { - // if found and it is granted to su, we shouldn't umount for it - return false; - } else { - // found an app profile - if (profile.nrp_config.use_default) { - return default_non_root_profile.umount_modules; - } else { - return profile.nrp_config.profile.umount_modules; - } - } -} - -struct root_profile *ksu_get_root_profile(uid_t uid) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - if (uid == p->profile.current_uid && p->profile.allow_su) { - if (!p->profile.rp_config.use_default) { - return &p->profile.rp_config.profile; - } - } - } - - // use default profile - return &default_root_profile; -} - -bool ksu_get_allow_list(int *array, int *length, bool allow) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - int i = 0; - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - // pr_info("get_allow_list uid: %d allow: %d\n", p->uid, p->allow); - if (p->profile.allow_su == allow) { - array[i++] = p->profile.current_uid; - } - } - *length = i; - - return true; -} - -static void do_persistent_allow_list(struct callback_head *_cb) -{ - u32 magic = FILE_MAGIC; - u32 version = FILE_FORMAT_VERSION; - struct perm_data *p = NULL; - struct list_head *pos = NULL; - loff_t off = 0; - - mutex_lock(&allowlist_mutex); - struct file *fp = ksu_filp_open_compat( - KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644); - if (IS_ERR(fp)) { - pr_err("save_allow_list create file failed: %ld\n", - PTR_ERR(fp)); - goto unlock; - } - - // store magic and version - if (ksu_kernel_write_compat(fp, &magic, sizeof(magic), &off) != - sizeof(magic)) { - pr_err("save_allow_list write magic failed.\n"); - goto close_file; - } - - if 
(ksu_kernel_write_compat(fp, &version, sizeof(version), &off) != - sizeof(version)) { - pr_err("save_allow_list write version failed.\n"); - goto close_file; - } - - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - pr_info("save allow list, name: %s uid :%d, allow: %d\n", - p->profile.key, p->profile.current_uid, - p->profile.allow_su); - - ksu_kernel_write_compat(fp, &p->profile, sizeof(p->profile), - &off); - } - -close_file: - filp_close(fp, 0); -unlock: - mutex_unlock(&allowlist_mutex); - kfree(_cb); -} - -void persistent_allow_list(void) -{ - struct task_struct *tsk; - - tsk = get_pid_task(find_vpid(1), PIDTYPE_PID); - if (!tsk) { - pr_err("save_allow_list find init task err\n"); - return; - } - - struct callback_head *cb = - kzalloc(sizeof(struct callback_head), GFP_KERNEL); - if (!cb) { - pr_err("save_allow_list alloc cb err\b"); - goto put_task; - } - cb->func = do_persistent_allow_list; - if (task_work_add(tsk, cb, TWA_RESUME)) { - kfree(cb); - pr_warn("save_allow_list add task_work failed\n"); - } - -put_task: - put_task_struct(tsk); -} - -void ksu_load_allow_list(void) -{ - loff_t off = 0; - ssize_t ret = 0; - struct file *fp = NULL; - u32 magic; - u32 version; - -#ifdef CONFIG_KSU_DEBUG - // always allow adb shell by default - ksu_grant_root_to_shell(); -#endif - - // load allowlist now! 
- fp = ksu_filp_open_compat(KERNEL_SU_ALLOWLIST, O_RDONLY, 0); - if (IS_ERR(fp)) { - pr_err("load_allow_list open file failed: %ld\n", PTR_ERR(fp)); - return; - } - - // verify magic - if (ksu_kernel_read_compat(fp, &magic, sizeof(magic), &off) != - sizeof(magic) || - magic != FILE_MAGIC) { - pr_err("allowlist file invalid: %d!\n", magic); - goto exit; - } - - if (ksu_kernel_read_compat(fp, &version, sizeof(version), &off) != - sizeof(version)) { - pr_err("allowlist read version: %d failed\n", version); - goto exit; - } - - pr_info("allowlist version: %d\n", version); - - while (true) { - struct app_profile profile; - - ret = ksu_kernel_read_compat(fp, &profile, sizeof(profile), - &off); - - if (ret <= 0) { - pr_info("load_allow_list read err: %zd\n", ret); - break; - } - - pr_info("load_allow_uid, name: %s, uid: %d, allow: %d\n", - profile.key, profile.current_uid, profile.allow_su); - ksu_set_app_profile(&profile, false); - } - -exit: - ksu_show_allow_list(); - filp_close(fp, 0); -} - -void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *), - void *data) -{ - struct perm_data *np, *n = NULL; - - if (!ksu_boot_completed) { - pr_info("boot not completed, skip prune\n"); - return; - } - - bool modified = false; - // TODO: use RCU! - mutex_lock(&allowlist_mutex); - list_for_each_entry_safe (np, n, &allow_list, list) { - uid_t uid = np->profile.current_uid; - char *package = np->profile.key; - // we use this uid for special cases, don't prune it! 
- bool is_preserved_uid = uid == KSU_APP_PROFILE_PRESERVE_UID; - if (!is_preserved_uid && !is_uid_valid(uid, package, data)) { - modified = true; - pr_info("prune uid: %d, package: %s\n", uid, package); - list_del(&np->list); - if (likely(uid <= BITMAP_UID_MAX)) { - allow_list_bitmap[uid / BITS_PER_BYTE] &= - ~(1 << (uid % BITS_PER_BYTE)); - } - remove_uid_from_arr(uid); - smp_mb(); - kfree(np); - } - } - mutex_unlock(&allowlist_mutex); - - if (modified) { - persistent_allow_list(); - } -} - -void ksu_allowlist_init(void) -{ - int i; - - BUILD_BUG_ON(sizeof(allow_list_bitmap) != PAGE_SIZE); - BUILD_BUG_ON(sizeof(allow_list_arr) != PAGE_SIZE); - - for (i = 0; i < ARRAY_SIZE(allow_list_arr); i++) - allow_list_arr[i] = -1; - - INIT_LIST_HEAD(&allow_list); - - init_default_profiles(); -} - -void ksu_allowlist_exit(void) -{ - struct perm_data *np, *n = NULL; - - // free allowlist - mutex_lock(&allowlist_mutex); - list_for_each_entry_safe (np, n, &allow_list, list) { - list_del(&np->list); - kfree(np); - } - mutex_unlock(&allowlist_mutex); -} diff --git a/drivers/kernelsu/allowlist.h b/drivers/kernelsu/allowlist.h deleted file mode 100644 index d52795afe866..000000000000 --- a/drivers/kernelsu/allowlist.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef __KSU_H_ALLOWLIST -#define __KSU_H_ALLOWLIST - -#include -#include -#include "app_profile.h" - -#define PER_USER_RANGE 100000 -#define FIRST_APPLICATION_UID 10000 -#define LAST_APPLICATION_UID 19999 -#define FIRST_ISOLATED_UID 99000 -#define LAST_ISOLATED_UID 99999 - -void ksu_allowlist_init(void); - -void ksu_allowlist_exit(void); - -void ksu_load_allow_list(void); - -void ksu_show_allow_list(void); - -// Check if the uid is in allow list -bool __ksu_is_allow_uid(uid_t uid); -#define ksu_is_allow_uid(uid) unlikely(__ksu_is_allow_uid(uid)) - -// Check if the uid is in allow list, or current is ksu domain root -bool __ksu_is_allow_uid_for_current(uid_t uid); -#define ksu_is_allow_uid_for_current(uid) \ - 
unlikely(__ksu_is_allow_uid_for_current(uid)) - -bool ksu_get_allow_list(int *array, int *length, bool allow); - -void ksu_prune_allowlist(bool (*is_uid_exist)(uid_t, char *, void *), - void *data); - -bool ksu_get_app_profile(struct app_profile *); -bool ksu_set_app_profile(struct app_profile *, bool persist); - -bool ksu_uid_should_umount(uid_t uid); -struct root_profile *ksu_get_root_profile(uid_t uid); - -static inline bool is_appuid(uid_t uid) -{ - uid_t appid = uid % PER_USER_RANGE; - return appid >= FIRST_APPLICATION_UID && appid <= LAST_APPLICATION_UID; -} - -static inline bool is_isolated_process(uid_t uid) -{ - uid_t appid = uid % PER_USER_RANGE; - return appid >= FIRST_ISOLATED_UID && appid <= LAST_ISOLATED_UID; -} -#endif diff --git a/drivers/kernelsu/apk_sign.c b/drivers/kernelsu/apk_sign.c deleted file mode 100644 index 4c6c63d0d886..000000000000 --- a/drivers/kernelsu/apk_sign.c +++ /dev/null @@ -1,367 +0,0 @@ -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_KSU_DEBUG -#include -#endif -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) -#include -#else -#include -#endif - -#include "apk_sign.h" -#include "app_profile.h" -#include "klog.h" // IWYU pragma: keep -#include "kernel_compat.h" -#include "manager_sign.h" - -struct sdesc { - struct shash_desc shash; - char ctx[]; -}; - -static apk_sign_key_t apk_sign_keys[] = { - { EXPECTED_SIZE_RSUNTK, EXPECTED_HASH_RSUNTK }, // RKSU -}; - -static struct sdesc *init_sdesc(struct crypto_shash *alg) -{ - struct sdesc *sdesc; - int size; - - size = sizeof(struct shash_desc) + crypto_shash_descsize(alg); - sdesc = kzalloc(size, GFP_KERNEL); - if (!sdesc) - return ERR_PTR(-ENOMEM); - sdesc->shash.tfm = alg; - return sdesc; -} - -static int calc_hash(struct crypto_shash *alg, const unsigned char *data, - unsigned int datalen, unsigned char *digest) -{ - struct sdesc *sdesc; - int ret; - - sdesc = init_sdesc(alg); - if (IS_ERR(sdesc)) { - pr_info("can't alloc sdesc\n"); - return 
PTR_ERR(sdesc); - } - - ret = crypto_shash_digest(&sdesc->shash, data, datalen, digest); - kfree(sdesc); - return ret; -} - -static int ksu_sha256(const unsigned char *data, unsigned int datalen, - unsigned char *digest) -{ - struct crypto_shash *alg; - char *hash_alg_name = "sha256"; - int ret; - - alg = crypto_alloc_shash(hash_alg_name, 0, 0); - if (IS_ERR(alg)) { - pr_info("can't alloc alg %s\n", hash_alg_name); - return PTR_ERR(alg); - } - ret = calc_hash(alg, data, datalen, digest); - crypto_free_shash(alg); - return ret; -} - -static bool check_block(struct file *fp, u32 *size4, loff_t *pos, u32 *offset) -{ - int i; - apk_sign_key_t sign_key; - - ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer-sequence length - ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer length - ksu_kernel_read_compat(fp, size4, 0x4, pos); // signed data length - - *offset += 0x4 * 3; - - ksu_kernel_read_compat(fp, size4, 0x4, pos); // digests-sequence length - - *pos += *size4; - *offset += 0x4 + *size4; - - ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificates length - ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificate length - *offset += 0x4 * 2; - - for (i = 0; i < ARRAY_SIZE(apk_sign_keys); i++) { - sign_key = apk_sign_keys[i]; - - if (*size4 != sign_key.size) - continue; - *offset += *size4; - -#define CERT_MAX_LENGTH 1024 - char cert[CERT_MAX_LENGTH]; - if (*size4 > CERT_MAX_LENGTH) { - pr_info("cert length overlimit\n"); - return false; - } - ksu_kernel_read_compat(fp, cert, *size4, pos); - unsigned char digest[SHA256_DIGEST_SIZE]; - if (ksu_sha256(cert, *size4, digest) < 0) { - pr_info("sha256 error\n"); - return false; - } - - char hash_str[SHA256_DIGEST_SIZE * 2 + 1]; - hash_str[SHA256_DIGEST_SIZE * 2] = '\0'; - - bin2hex(hash_str, digest, SHA256_DIGEST_SIZE); - pr_info("sha256: %s, expected: %s\n", hash_str, - sign_key.sha256); - if (strcmp(sign_key.sha256, hash_str) == 0) { - return true; - } - } - return false; -} - -struct zip_entry_header { - 
uint32_t signature; - uint16_t version; - uint16_t flags; - uint16_t compression; - uint16_t mod_time; - uint16_t mod_date; - uint32_t crc32; - uint32_t compressed_size; - uint32_t uncompressed_size; - uint16_t file_name_length; - uint16_t extra_field_length; -} __attribute__((packed)); - -// This is a necessary but not sufficient condition, but it is enough for us -static bool has_v1_signature_file(struct file *fp) -{ - struct zip_entry_header header; - const char MANIFEST[] = "META-INF/MANIFEST.MF"; - - loff_t pos = 0; - - while (ksu_kernel_read_compat(fp, &header, - sizeof(struct zip_entry_header), &pos) == - sizeof(struct zip_entry_header)) { - if (header.signature != 0x04034b50) { - // ZIP magic: 'PK' - return false; - } - // Read the entry file name - if (header.file_name_length == sizeof(MANIFEST) - 1) { - char fileName[sizeof(MANIFEST)]; - ksu_kernel_read_compat(fp, fileName, - header.file_name_length, &pos); - fileName[header.file_name_length] = '\0'; - - // Check if the entry matches META-INF/MANIFEST.MF - if (strncmp(MANIFEST, fileName, sizeof(MANIFEST) - 1) == - 0) { - return true; - } - } else { - // Skip the entry file name - pos += header.file_name_length; - } - - // Skip to the next entry - pos += header.extra_field_length + header.compressed_size; - } - - return false; -} - -static __always_inline bool check_v2_signature(char *path) -{ - unsigned char buffer[0x11] = { 0 }; - u32 size4; - u64 size8, size_of_block; - - loff_t pos; - - bool v2_signing_valid = false; - int v2_signing_blocks = 0; - bool v3_signing_exist = false; - bool v3_1_signing_exist = false; - - int i; - struct file *fp = ksu_filp_open_compat(path, O_RDONLY, 0); - if (IS_ERR(fp)) { - pr_err("open %s error.\n", path); - return false; - } - - // disable inotify for this file - fp->f_mode |= FMODE_NONOTIFY; - - // https://en.wikipedia.org/wiki/Zip_(file_format)#End_of_central_directory_record_(EOCD) - for (i = 0;; ++i) { - unsigned short n; - pos = generic_file_llseek(fp, -i - 2, 
SEEK_END); - ksu_kernel_read_compat(fp, &n, 2, &pos); - if (n == i) { - pos -= 22; - ksu_kernel_read_compat(fp, &size4, 4, &pos); - if ((size4 ^ 0xcafebabeu) == 0xccfbf1eeu) { - break; - } - } - if (i == 0xffff) { - pr_info("error: cannot find eocd\n"); - goto clean; - } - } - - pos += 12; - // offset - ksu_kernel_read_compat(fp, &size4, 0x4, &pos); - pos = size4 - 0x18; - - ksu_kernel_read_compat(fp, &size8, 0x8, &pos); - ksu_kernel_read_compat(fp, buffer, 0x10, &pos); - if (strcmp((char *)buffer, "APK Sig Block 42")) { - goto clean; - } - - pos = size4 - (size8 + 0x8); - ksu_kernel_read_compat(fp, &size_of_block, 0x8, &pos); - if (size_of_block != size8) { - goto clean; - } - - int loop_count = 0; - while (loop_count++ < 10) { - uint32_t id; - uint32_t offset; - ksu_kernel_read_compat(fp, &size8, 0x8, - &pos); // sequence length - if (size8 == size_of_block) { - break; - } - ksu_kernel_read_compat(fp, &id, 0x4, &pos); // id - offset = 4; - if (id == 0x7109871au) { - v2_signing_blocks++; - v2_signing_valid = - check_block(fp, &size4, &pos, &offset); - } else if (id == 0xf05368c0u) { - // http://aospxref.com/android-14.0.0_r2/xref/frameworks/base/core/java/android/util/apk/ApkSignatureSchemeV3Verifier.java#73 - v3_signing_exist = true; - } else if (id == 0x1b93ad61u) { - // http://aospxref.com/android-14.0.0_r2/xref/frameworks/base/core/java/android/util/apk/ApkSignatureSchemeV3Verifier.java#74 - v3_1_signing_exist = true; - } else { -#ifdef CONFIG_KSU_DEBUG - pr_info("Unknown id: 0x%08x\n", id); -#endif - } - pos += (size8 - offset); - } - - if (v2_signing_blocks != 1) { -#ifdef CONFIG_KSU_DEBUG - pr_err("Unexpected v2 signature count: %d\n", - v2_signing_blocks); -#endif - v2_signing_valid = false; - } - - if (v2_signing_valid) { - int has_v1_signing = has_v1_signature_file(fp); - if (has_v1_signing) { - pr_err("Unexpected v1 signature scheme found!\n"); - filp_close(fp, 0); - return false; - } - } -clean: - filp_close(fp, 0); - - if (v3_signing_exist || 
v3_1_signing_exist) { -#ifdef CONFIG_KSU_DEBUG - pr_err("Unexpected v3 signature scheme found!\n"); -#endif - return false; - } - - return v2_signing_valid; -} - -#ifdef CONFIG_KSU_DEBUG - -int ksu_debug_manager_appid = -1; - -#include "manager.h" - -static int set_expected_size(const char *val, const struct kernel_param *kp) -{ - int rv = param_set_uint(val, kp); - ksu_set_manager_appid(ksu_debug_manager_appid); - pr_info("ksu_manager_appid set to %d\n", ksu_debug_manager_appid); - return rv; -} - -static struct kernel_param_ops expected_size_ops = { - .set = set_expected_size, - .get = param_get_uint, -}; - -module_param_cb(ksu_debug_manager_appid, &expected_size_ops, - &ksu_debug_manager_appid, S_IRUSR | S_IWUSR); - -#endif - -int get_pkg_from_apk_path(char *pkg, const char *path) -{ - int len = strlen(path); - if (len >= KSU_MAX_PACKAGE_NAME || len < 1) - return -1; - - const char *last_slash = NULL; - const char *second_last_slash = NULL; - - int i; - for (i = len - 1; i >= 0; i--) { - if (path[i] == '/') { - if (!last_slash) { - last_slash = &path[i]; - } else { - second_last_slash = &path[i]; - break; - } - } - } - - if (!last_slash || !second_last_slash) - return -1; - - const char *last_hyphen = strchr(second_last_slash, '-'); - if (!last_hyphen || last_hyphen > last_slash) - return -1; - - int pkg_len = last_hyphen - second_last_slash - 1; - if (pkg_len >= KSU_MAX_PACKAGE_NAME || pkg_len <= 0) - return -1; - - // Copying the package name - strncpy(pkg, second_last_slash + 1, pkg_len); - pkg[pkg_len] = '\0'; - - return 0; -} - -bool is_manager_apk(char *path) -{ - return check_v2_signature(path); -} diff --git a/drivers/kernelsu/apk_sign.h b/drivers/kernelsu/apk_sign.h deleted file mode 100644 index b4d4ce3756c4..000000000000 --- a/drivers/kernelsu/apk_sign.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef __KSU_H_APK_V2_SIGN -#define __KSU_H_APK_V2_SIGN - -#include - -bool is_manager_apk(char *path); - -int get_pkg_from_apk_path(char *pkg, const char *path); - 
-#endif diff --git a/drivers/kernelsu/app_profile.c b/drivers/kernelsu/app_profile.c deleted file mode 100644 index 4d2f333ebffd..000000000000 --- a/drivers/kernelsu/app_profile.c +++ /dev/null @@ -1,206 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include // signal_struct -#include -#endif -#include -#include -#include -#include -#include - -#include "allowlist.h" -#include "app_profile.h" -#include "arch.h" -#include "kernel_compat.h" -#include "klog.h" // IWYU pragma: keep -#include "selinux/selinux.h" -#include "su_mount_ns.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "syscall_handler.h" -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 7, 0) -static struct group_info root_groups = { - .usage = REFCOUNT_INIT(2), -}; -#else -static struct group_info root_groups = { .usage = ATOMIC_INIT(2) }; -#endif - -void setup_groups(struct root_profile *profile, struct cred *cred) -{ - if (profile->groups_count > KSU_MAX_GROUPS) { - pr_warn("Failed to setgroups, too large group: %d!\n", - profile->uid); - return; - } - - if (profile->groups_count == 1 && profile->groups[0] == 0) { - // setgroup to root and return early. 
- if (cred->group_info) - put_group_info(cred->group_info); - cred->group_info = get_group_info(&root_groups); - return; - } - - u32 ngroups = profile->groups_count; - struct group_info *group_info = groups_alloc(ngroups); - if (!group_info) { - pr_warn("Failed to setgroups, ENOMEM for: %d\n", profile->uid); - return; - } - - int i; - for (i = 0; i < ngroups; i++) { - gid_t gid = profile->groups[i]; - kgid_t kgid = make_kgid(current_user_ns(), gid); - if (!gid_valid(kgid)) { - pr_warn("Failed to setgroups, invalid gid: %d\n", gid); - put_group_info(group_info); - return; - } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) - group_info->gid[i] = kgid; -#else - GROUP_AT(group_info, i) = kgid; -#endif - } - - groups_sort(group_info); - set_groups(cred, group_info); - put_group_info(group_info); -} - -static void do_disable_seccomp(void) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) - struct task_struct *fake; - fake = kmalloc(sizeof(*fake), GFP_ATOMIC); - if (!fake) { - pr_err("%s: cannot allocate fake struct!\n", __func__); - return; - } -#endif - - // Refer to kernel/seccomp.c: seccomp_set_mode_strict - // When disabling Seccomp, ensure that current->sighand->siglock is held during the operation. 
- spin_lock_irq(¤t->sighand->siglock); - // disable seccomp -#if defined(CONFIG_GENERIC_ENTRY) && \ - LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - clear_syscall_work(SECCOMP); -#else - clear_thread_flag(TIF_SECCOMP); -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) - memcpy(fake, current, sizeof(*fake)); -#endif - current->seccomp.mode = 0; -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) - // put_seccomp_filter is allowed while we holding sighand - put_seccomp_filter(current); -#endif - current->seccomp.filter = NULL; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 7, 0) - atomic_set(¤t->seccomp.filter_count, 0); -#endif - spin_unlock_irq(¤t->sighand->siglock); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) - // https://github.com/torvalds/linux/commit/bfafe5efa9754ebc991750da0bcca2a6694f3ed3#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R576-R577 - fake->flags |= PF_EXITING; -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - // https://github.com/torvalds/linux/commit/0d8315dddd2899f519fe1ca3d4d5cdaf44ea421e#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R556-R558 - fake->sighand = NULL; -#endif - seccomp_filter_release(fake); - kfree(fake); -#endif -} - -void disable_seccomp(void) -{ - // https://github.com/backslashxx/KernelSU/tree/e28930645e764b9f0e5d0d1b0d5e236464939075/kernel/app_profile.c - if (!!!current->seccomp.mode) { - return; - } - - do_disable_seccomp(); -} - -void escape_with_root_profile(void) -{ - struct cred *cred; -#ifdef CONFIG_KSU_SYSCALL_HOOK - struct task_struct *t; -#endif - - if (current_euid().val == 0) { - pr_warn("Already root, don't escape!\n"); - return; - } - - cred = prepare_creds(); - if (!cred) { - pr_warn("prepare_creds failed!\n"); - return; - } - - struct root_profile *profile = ksu_get_root_profile(cred->uid.val); - - cred->uid.val = profile->uid; - cred->suid.val = profile->uid; - cred->euid.val = 
profile->uid; - cred->fsuid.val = profile->uid; - - cred->gid.val = profile->gid; - cred->fsgid.val = profile->gid; - cred->sgid.val = profile->gid; - cred->egid.val = profile->gid; - cred->securebits = 0; - - BUILD_BUG_ON(sizeof(profile->capabilities.effective) != - sizeof(kernel_cap_t)); - - // setup capabilities - // we need CAP_DAC_READ_SEARCH becuase `/data/adb/ksud` is not accessible for non root process - // we add it here but don't add it to cap_inhertiable, it would be dropped automaticly after exec! - u64 cap_for_ksud = - profile->capabilities.effective | CAP_DAC_READ_SEARCH; - memcpy(&cred->cap_effective, &cap_for_ksud, - sizeof(cred->cap_effective)); - memcpy(&cred->cap_permitted, &profile->capabilities.effective, - sizeof(cred->cap_permitted)); - memcpy(&cred->cap_bset, &profile->capabilities.effective, - sizeof(cred->cap_bset)); - - setup_groups(profile, cred); - - commit_creds(cred); - - disable_seccomp(); - - setup_selinux(profile->selinux_domain); - -#ifdef CONFIG_KSU_SYSCALL_HOOK - for_each_thread (current, t) { - ksu_set_task_tracepoint_flag(t); - } -#endif - - setup_mount_ns(profile->namespaces); -} - -void escape_to_root_for_init(void) -{ - setup_selinux(KERNEL_SU_CONTEXT); -} diff --git a/drivers/kernelsu/app_profile.h b/drivers/kernelsu/app_profile.h deleted file mode 100644 index 1263509c2f5e..000000000000 --- a/drivers/kernelsu/app_profile.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef __KSU_H_APP_PROFILE -#define __KSU_H_APP_PROFILE - -#include - -// Forward declarations -struct cred; - -#define KSU_APP_PROFILE_VER 2 -#define KSU_MAX_PACKAGE_NAME 256 -// NGROUPS_MAX for Linux is 65535 generally, but we only supports 32 groups. 
-#define KSU_MAX_GROUPS 32 -#define KSU_SELINUX_DOMAIN 64 - -struct root_profile { - int32_t uid; - int32_t gid; - - int32_t groups_count; - int32_t groups[KSU_MAX_GROUPS]; - - // kernel_cap_t is u32[2] for capabilities v3 - struct { - u64 effective; - u64 permitted; - u64 inheritable; - } capabilities; - - char selinux_domain[KSU_SELINUX_DOMAIN]; - - int32_t namespaces; -}; - -struct non_root_profile { - bool umount_modules; -}; - -struct app_profile { - // It may be utilized for backward compatibility, although we have never explicitly made any promises regarding this. - u32 version; - - // this is usually the package of the app, but can be other value for special apps - char key[KSU_MAX_PACKAGE_NAME]; - int32_t current_uid; - bool allow_su; - - union { - struct { - bool use_default; - char template_name[KSU_MAX_PACKAGE_NAME]; - - struct root_profile profile; - } rp_config; - - struct { - bool use_default; - - struct non_root_profile profile; - } nrp_config; - }; -}; - -// Escalate current process to root with the appropriate profile -void escape_with_root_profile(void); - -void escape_to_root_for_init(void); - -#endif diff --git a/drivers/kernelsu/arch.h b/drivers/kernelsu/arch.h deleted file mode 100644 index b1c79a8c9985..000000000000 --- a/drivers/kernelsu/arch.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef __KSU_H_ARCH -#define __KSU_H_ARCH - -#include - -#if defined(__aarch64__) - -#define __PT_PARM1_REG regs[0] -#define __PT_PARM2_REG regs[1] -#define __PT_PARM3_REG regs[2] -#define __PT_SYSCALL_PARM4_REG regs[3] -#define __PT_CCALL_PARM4_REG regs[3] -#define __PT_PARM5_REG regs[4] -#define __PT_PARM6_REG regs[5] -#define __PT_RET_REG regs[30] -#define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */ -#define __PT_RC_REG regs[0] -#define __PT_SP_REG sp -#define __PT_IP_REG pc - -#define REBOOT_SYMBOL "__arm64_sys_reboot" -#define SYS_READ_SYMBOL "__arm64_sys_read" -#define SYS_EXECVE_SYMBOL "__arm64_sys_execve" - -#elif defined(__x86_64__) - 
-#define __PT_PARM1_REG di -#define __PT_PARM2_REG si -#define __PT_PARM3_REG dx -/* syscall uses r10 for PARM4 */ -#define __PT_SYSCALL_PARM4_REG r10 -#define __PT_CCALL_PARM4_REG cx -#define __PT_PARM5_REG r8 -#define __PT_PARM6_REG r9 -#define __PT_RET_REG sp -#define __PT_FP_REG bp -#define __PT_RC_REG ax -#define __PT_SP_REG sp -#define __PT_IP_REG ip - -#define REBOOT_SYMBOL "__x64_sys_reboot" -#define SYS_READ_SYMBOL "__x64_sys_read" -#define SYS_EXECVE_SYMBOL "__x64_sys_execve" - -#else -#ifdef CONFIG_KSU_SYSCALL_HOOK -#error "Unsupported arch" -#endif -#endif - -/* allow some architecutres to override `struct pt_regs` */ -#ifndef __PT_REGS_CAST -#define __PT_REGS_CAST(x) (x) -#endif - -#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG) -#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG) -#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG) -#define PT_REGS_SYSCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_SYSCALL_PARM4_REG) -#define PT_REGS_CCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_CCALL_PARM4_REG) -#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG) -#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG) -#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG) -#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG) -#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG) -#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG) -#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG) - -#define PT_REAL_REGS(regs) ((struct pt_regs *)PT_REGS_PARM1(regs)) - -#endif diff --git a/drivers/kernelsu/feature.c b/drivers/kernelsu/feature.c deleted file mode 100644 index a1017aafbb8e..000000000000 --- a/drivers/kernelsu/feature.c +++ /dev/null @@ -1,176 +0,0 @@ -#include "feature.h" -#include "klog.h" // IWYU pragma: keep - -#include - -static const struct ksu_feature_handler *feature_handlers[KSU_FEATURE_MAX]; - -static DEFINE_MUTEX(feature_mutex); - -int ksu_register_feature_handler(const struct ksu_feature_handler *handler) -{ - if 
(!handler) { - pr_err("feature: register handler is NULL\n"); - return -EINVAL; - } - - if (handler->feature_id >= KSU_FEATURE_MAX) { - pr_err("feature: invalid feature_id %u\n", handler->feature_id); - return -EINVAL; - } - - if (!handler->get_handler && !handler->set_handler) { - pr_err("feature: no handler provided for feature %u\n", - handler->feature_id); - return -EINVAL; - } - - mutex_lock(&feature_mutex); - - if (feature_handlers[handler->feature_id]) { - pr_warn("feature: handler for %u already registered, overwriting\n", - handler->feature_id); - } - - feature_handlers[handler->feature_id] = handler; - - pr_info("feature: registered handler for %s (id=%u)\n", - handler->name ? handler->name : "unknown", handler->feature_id); - - mutex_unlock(&feature_mutex); - return 0; -} - -int ksu_unregister_feature_handler(u32 feature_id) -{ - int ret = 0; - - if (feature_id >= KSU_FEATURE_MAX) { - pr_err("feature: invalid feature_id %u\n", feature_id); - return -EINVAL; - } - - mutex_lock(&feature_mutex); - - if (!feature_handlers[feature_id]) { - pr_warn("feature: no handler registered for %u\n", feature_id); - ret = -ENOENT; - goto out; - } - - feature_handlers[feature_id] = NULL; - - pr_info("feature: unregistered handler for id=%u\n", feature_id); - -out: - mutex_unlock(&feature_mutex); - return ret; -} - -int ksu_get_feature(u32 feature_id, u64 *value, bool *supported) -{ - int ret = 0; - const struct ksu_feature_handler *handler; - - if (feature_id >= KSU_FEATURE_MAX) { - pr_err("feature: invalid feature_id %u\n", feature_id); - return -EINVAL; - } - - if (!value || !supported) { - pr_err("feature: invalid parameters\n"); - return -EINVAL; - } - - mutex_lock(&feature_mutex); - - handler = feature_handlers[feature_id]; - - if (!handler) { - *supported = false; - *value = 0; - pr_debug("feature: feature %u not supported\n", feature_id); - goto out; - } - - *supported = true; - - if (!handler->get_handler) { - pr_warn("feature: no get_handler for feature %u\n", 
feature_id); - ret = -EOPNOTSUPP; - goto out; - } - - ret = handler->get_handler(value); - if (ret) { - pr_err("feature: get_handler for %u failed: %d\n", feature_id, - ret); - } - -out: - mutex_unlock(&feature_mutex); - return ret; -} - -int ksu_set_feature(u32 feature_id, u64 value) -{ - int ret = 0; - const struct ksu_feature_handler *handler; - - if (feature_id >= KSU_FEATURE_MAX) { - pr_err("feature: invalid feature_id %u\n", feature_id); - return -EINVAL; - } - - mutex_lock(&feature_mutex); - - handler = feature_handlers[feature_id]; - - if (!handler) { - pr_err("feature: feature %u not registered\n", feature_id); - ret = -EOPNOTSUPP; - goto out; - } - - if (!handler->set_handler) { - pr_warn("feature: no set_handler for feature %u\n", feature_id); - ret = -EOPNOTSUPP; - goto out; - } - - ret = handler->set_handler(value); - if (ret) { - pr_err("feature: set_handler for %u failed: %d\n", feature_id, - ret); - } - -out: - mutex_unlock(&feature_mutex); - return ret; -} - -void ksu_feature_init(void) -{ - int i; - - for (i = 0; i < KSU_FEATURE_MAX; i++) { - feature_handlers[i] = NULL; - } - - pr_info("feature: feature management initialized\n"); -} - -void ksu_feature_exit(void) -{ - int i; - - mutex_lock(&feature_mutex); - - for (i = 0; i < KSU_FEATURE_MAX; i++) { - feature_handlers[i] = NULL; - } - - mutex_unlock(&feature_mutex); - - pr_info("feature: feature management cleaned up\n"); -} diff --git a/drivers/kernelsu/feature.h b/drivers/kernelsu/feature.h deleted file mode 100644 index a5de137a5cfb..000000000000 --- a/drivers/kernelsu/feature.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __KSU_H_FEATURE -#define __KSU_H_FEATURE - -#include - -enum ksu_feature_id { - KSU_FEATURE_SU_COMPAT = 0, - KSU_FEATURE_KERNEL_UMOUNT = 1, - - KSU_FEATURE_MAX -}; - -typedef int (*ksu_feature_get_t)(u64 *value); -typedef int (*ksu_feature_set_t)(u64 value); - -struct ksu_feature_handler { - u32 feature_id; - const char *name; - ksu_feature_get_t get_handler; - ksu_feature_set_t 
set_handler; -}; - -int ksu_register_feature_handler(const struct ksu_feature_handler *handler); - -int ksu_unregister_feature_handler(u32 feature_id); - -int ksu_get_feature(u32 feature_id, u64 *value, bool *supported); - -int ksu_set_feature(u32 feature_id, u64 value); - -void ksu_feature_init(void); - -void ksu_feature_exit(void); - -#endif // __KSU_H_FEATURE diff --git a/drivers/kernelsu/file_wrapper.c b/drivers/kernelsu/file_wrapper.c deleted file mode 100644 index f2b252334645..000000000000 --- a/drivers/kernelsu/file_wrapper.c +++ /dev/null @@ -1,690 +0,0 @@ -#include -#include -#include -#include -#include // kernel 3.18 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "objsec.h" -#include "ksud.h" - -struct ksu_file_wrapper { - struct file *orig; - struct file_operations ops; -}; - -static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp); - -static int ksu_wrapper_open(struct inode *ino, struct file *fp) -{ - struct path *orig_path = fp->f_path.dentry->d_fsdata; - struct file *orig_file = - dentry_open(orig_path, fp->f_flags, current_cred()); - if (IS_ERR(orig_file)) { - return PTR_ERR(orig_file); - } - struct ksu_file_wrapper *wrapper = ksu_create_file_wrapper(orig_file); - if (IS_ERR(wrapper)) { - filp_close(orig_file, current->files); - return PTR_ERR(wrapper); - } - fp->private_data = wrapper; - const struct file_operations *new_fops = fops_get(&wrapper->ops); - replace_fops(fp, new_fops); - return 0; -} - -static const struct file_operations ksu_file_wrapper_inode_fops = { - .owner = THIS_MODULE, - .open = ksu_wrapper_open -}; - -static loff_t ksu_wrapper_llseek(struct file *fp, loff_t off, int flags) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->llseek(data->orig, off, flags); -} - -static ssize_t ksu_wrapper_read(struct file *fp, char __user *ptr, size_t sz, - loff_t *off) -{ - struct 
ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->read(orig, ptr, sz, off); -} - -static ssize_t ksu_wrapper_write(struct file *fp, const char __user *ptr, - size_t sz, loff_t *off) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->write(orig, ptr, sz, off); -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) -static ssize_t ksu_wrapper_read_iter(struct kiocb *iocb, struct iov_iter *iovi) -{ - struct ksu_file_wrapper *data = iocb->ki_filp->private_data; - struct file *orig = data->orig; - iocb->ki_filp = orig; - return orig->f_op->read_iter(iocb, iovi); -} - -static ssize_t ksu_wrapper_write_iter(struct kiocb *iocb, struct iov_iter *iovi) -{ - struct ksu_file_wrapper *data = iocb->ki_filp->private_data; - struct file *orig = data->orig; - iocb->ki_filp = orig; - return orig->f_op->write_iter(iocb, iovi); -} -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) -static int ksu_wrapper_iopoll(struct kiocb *kiocb, struct io_comp_batch *icb, - unsigned int v) -{ - struct ksu_file_wrapper *data = kiocb->ki_filp->private_data; - struct file *orig = data->orig; - kiocb->ki_filp = orig; - return orig->f_op->iopoll(kiocb, icb, v); -} -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) -static int ksu_wrapper_iopoll(struct kiocb *kiocb, bool spin) -{ - struct ksu_file_wrapper *data = kiocb->ki_filp->private_data; - struct file *orig = data->orig; - kiocb->ki_filp = orig; - return orig->f_op->iopoll(kiocb, spin); -} -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) -static int ksu_wrapper_iterate(struct file *fp, struct dir_context *dc) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->iterate(orig, dc); -} -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) -static int ksu_wrapper_iterate_shared(struct file *fp, struct dir_context *dc) -{ - struct ksu_file_wrapper *data = 
fp->private_data; - struct file *orig = data->orig; - return orig->f_op->iterate_shared(orig, dc); -} -#endif - -// typedef unsigned __bitwise __poll_t; -static unsigned __bitwise ksu_wrapper_poll(struct file *fp, - struct poll_table_struct *pts) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->poll(orig, pts); -} - -static long ksu_wrapper_unlocked_ioctl(struct file *fp, unsigned int cmd, - unsigned long arg) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->unlocked_ioctl(orig, cmd, arg); -} - -static long ksu_wrapper_compat_ioctl(struct file *fp, unsigned int cmd, - unsigned long arg) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->compat_ioctl(orig, cmd, arg); -} - -static int ksu_wrapper_mmap(struct file *fp, struct vm_area_struct *vma) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->mmap(orig, vma); -} - -static int ksu_wrapper_flush(struct file *fp, fl_owner_t id) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->flush(orig, id); -} - -static int ksu_wrapper_fsync(struct file *fp, loff_t off1, loff_t off2, - int datasync) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->fsync(orig, off1, off2, datasync); -} - -static int ksu_wrapper_fasync(int arg, struct file *fp, int arg2) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->fasync(arg, orig, arg2); -} - -static int ksu_wrapper_lock(struct file *fp, int arg1, struct file_lock *fl) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - return orig->f_op->lock(orig, arg1, fl); -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) -static ssize_t 
ksu_wrapper_sendpage(struct file *fp, struct page *pg, int arg1, - size_t sz, loff_t *off, int arg2) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->sendpage) { - return orig->f_op->sendpage(orig, pg, arg1, sz, off, arg2); - } - return -EINVAL; -} -#endif - -static unsigned long ksu_wrapper_get_unmapped_area(struct file *fp, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->get_unmapped_area) { - return orig->f_op->get_unmapped_area(orig, arg1, arg2, arg3, - arg4); - } - return -EINVAL; -} - -// static int ksu_wrapper_check_flags(int arg) {} - -static int ksu_wrapper_flock(struct file *fp, int arg1, struct file_lock *fl) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->flock) { - return orig->f_op->flock(orig, arg1, fl); - } - return -EINVAL; -} - -static ssize_t ksu_wrapper_splice_write(struct pipe_inode_info *pii, - struct file *fp, loff_t *off, size_t sz, - unsigned int arg1) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->splice_write) { - return orig->f_op->splice_write(pii, orig, off, sz, arg1); - } - return -EINVAL; -} - -static ssize_t ksu_wrapper_splice_read(struct file *fp, loff_t *off, - struct pipe_inode_info *pii, size_t sz, - unsigned int arg1) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->splice_read) { - return orig->f_op->splice_read(orig, off, pii, sz, arg1); - } - return -EINVAL; -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) -void ksu_wrapper_splice_eof(struct file *fp) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->splice_eof) { - return orig->f_op->splice_eof(orig); - } -} -#endif - -#if 
LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) -static int ksu_wrapper_setlease(struct file *fp, int arg1, - struct file_lease **fl, void **p) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->setlease) { - return orig->f_op->setlease(orig, arg1, fl, p); - } - return -EINVAL; -} -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) -static int ksu_wrapper_setlease(struct file *fp, int arg1, - struct file_lock **fl, void **p) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->setlease) { - return orig->f_op->setlease(orig, arg1, fl, p); - } - return -EINVAL; -} -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) -// int (*setlease)(struct file *, long, struct file_lock **, void **); -static int ksu_wrapper_setlease(struct file *fp, long arg1, - struct file_lock **fl, void **p) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->setlease) { - return orig->f_op->setlease(orig, arg1, fl, p); - } - return -EINVAL; -} -#else -// int (*setlease)(struct file *, long, struct file_lock **); -static int ksu_wrapper_setlease(struct file *fp, long arg1, - struct file_lock **fl) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->setlease) { - return orig->f_op->setlease(orig, arg1, fl); - } - return -EINVAL; -} -#endif - -static long ksu_wrapper_fallocate(struct file *fp, int mode, loff_t offset, - loff_t len) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->fallocate) { - return orig->f_op->fallocate(orig, mode, offset, len); - } - return -EINVAL; -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) -static void ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct ksu_file_wrapper *data = f->private_data; - struct file *orig = data->orig; - if (orig->f_op->show_fdinfo) { - 
orig->f_op->show_fdinfo(m, orig); - } -} -#else -static int ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct ksu_file_wrapper *data = f->private_data; - struct file *orig = data->orig; - if (orig->f_op->show_fdinfo) { - orig->f_op->show_fdinfo(m, orig); - } - return -EINVAL; -} -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) -// https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/read_write.c;l=1593-1606;drc=398da7defe218d3e51b0f3bdff75147e28125b60 -static ssize_t ksu_wrapper_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, - loff_t pos_out, size_t len, - unsigned int flags) -{ - struct ksu_file_wrapper *data = file_out->private_data; - struct file *orig = data->orig; - return orig->f_op->copy_file_range(file_in, pos_in, orig, pos_out, len, - flags); -} -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) -// no REMAP_FILE_DEDUP: use file_in -// https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/read_write.c;l=1598-1599;drc=398da7defe218d3e51b0f3bdff75147e28125b60 -// https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/remap_range.c;l=403-404;drc=398da7defe218d3e51b0f3bdff75147e28125b60 -// REMAP_FILE_DEDUP: use file_out -// https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/remap_range.c;l=483-484;drc=398da7defe218d3e51b0f3bdff75147e28125b60 -static loff_t ksu_wrapper_remap_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, - loff_t pos_out, loff_t len, - unsigned int remap_flags) -{ - if (remap_flags & REMAP_FILE_DEDUP) { - struct ksu_file_wrapper *data = file_out->private_data; - struct file *orig = data->orig; - return orig->f_op->remap_file_range(file_in, pos_in, orig, - pos_out, len, remap_flags); - } else { - struct ksu_file_wrapper *data = file_in->private_data; - struct file *orig = data->orig; - return 
orig->f_op->remap_file_range(orig, pos_in, file_out, - pos_out, len, remap_flags); - } -} -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) -static int ksu_wrapper_fadvise(struct file *fp, loff_t off1, loff_t off2, - int flags) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; - if (orig->f_op->fadvise) { - return orig->f_op->fadvise(orig, off1, off2, flags); - } - return -EINVAL; -} -#endif - -static void ksu_release_file_wrapper(struct ksu_file_wrapper *data); - -static int ksu_wrapper_release(struct inode *inode, struct file *filp) -{ - // https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/file_table.c;l=467-473;drc=3be0b283b562eabbc2b1f3bb534dc8903079bbaa - // f_op->release is called before fops_put(f_op), so we put it manually. - fops_put(filp->f_op); - // prevent it from being put again - filp->f_op = NULL; - ksu_release_file_wrapper(filp->private_data); - return 0; -} - -static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp) -{ - struct ksu_file_wrapper *p = - kcalloc(1, sizeof(struct ksu_file_wrapper), GFP_KERNEL); - if (!p) { - return ERR_PTR(-ENOMEM); - } - - get_file(fp); - - p->orig = fp; - p->ops.owner = THIS_MODULE; - p->ops.llseek = fp->f_op->llseek ? ksu_wrapper_llseek : NULL; - p->ops.read = fp->f_op->read ? ksu_wrapper_read : NULL; - p->ops.write = fp->f_op->write ? ksu_wrapper_write : NULL; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) - p->ops.read_iter = fp->f_op->read_iter ? ksu_wrapper_read_iter : NULL; - p->ops.write_iter = - fp->f_op->write_iter ? ksu_wrapper_write_iter : NULL; -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) - p->ops.iopoll = fp->f_op->iopoll ? ksu_wrapper_iopoll : NULL; -#endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) - p->ops.iterate = fp->f_op->iterate ? 
ksu_wrapper_iterate : NULL; -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) - p->ops.iterate_shared = - fp->f_op->iterate_shared ? ksu_wrapper_iterate_shared : NULL; -#endif - p->ops.poll = fp->f_op->poll ? ksu_wrapper_poll : NULL; - p->ops.unlocked_ioctl = - fp->f_op->unlocked_ioctl ? ksu_wrapper_unlocked_ioctl : NULL; - p->ops.compat_ioctl = - fp->f_op->compat_ioctl ? ksu_wrapper_compat_ioctl : NULL; - p->ops.mmap = fp->f_op->mmap ? ksu_wrapper_mmap : NULL; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) - p->ops.fop_flags = fp->f_op->fop_flags; -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) - p->ops.mmap_supported_flags = fp->f_op->mmap_supported_flags; -#endif - p->ops.flush = fp->f_op->flush ? ksu_wrapper_flush : NULL; - p->ops.release = ksu_wrapper_release; - p->ops.fsync = fp->f_op->fsync ? ksu_wrapper_fsync : NULL; - p->ops.fasync = fp->f_op->fasync ? ksu_wrapper_fasync : NULL; - p->ops.lock = fp->f_op->lock ? ksu_wrapper_lock : NULL; -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) - p->ops.sendpage = fp->f_op->sendpage ? ksu_wrapper_sendpage : NULL; -#endif - p->ops.get_unmapped_area = fp->f_op->get_unmapped_area ? - ksu_wrapper_get_unmapped_area : - NULL; - p->ops.check_flags = fp->f_op->check_flags; - p->ops.flock = fp->f_op->flock ? ksu_wrapper_flock : NULL; - p->ops.splice_write = - fp->f_op->splice_write ? ksu_wrapper_splice_write : NULL; - p->ops.splice_read = - fp->f_op->splice_read ? ksu_wrapper_splice_read : NULL; - p->ops.setlease = fp->f_op->setlease ? ksu_wrapper_setlease : NULL; - p->ops.fallocate = fp->f_op->fallocate ? ksu_wrapper_fallocate : NULL; - p->ops.show_fdinfo = - fp->f_op->show_fdinfo ? ksu_wrapper_show_fdinfo : NULL; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) - p->ops.copy_file_range = - fp->f_op->copy_file_range ? ksu_wrapper_copy_file_range : NULL; -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) - p->ops.remap_file_range = fp->f_op->remap_file_range ? 
- ksu_wrapper_remap_file_range : - NULL; -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) - p->ops.fadvise = fp->f_op->fadvise ? ksu_wrapper_fadvise : NULL; -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) - p->ops.splice_eof = - fp->f_op->splice_eof ? ksu_wrapper_splice_eof : NULL; -#endif - - return p; -} - -static void ksu_release_file_wrapper(struct ksu_file_wrapper *data) -{ - fput((struct file *)data->orig); - kfree(data); -} - -static char *ksu_wrapper_d_dname(struct dentry *dentry, char *buffer, - int buflen) -{ - struct path *orig_path = dentry->d_fsdata; - return d_path(orig_path, buffer, buflen); -} - -static void ksu_wrapper_d_release(struct dentry *dentry) -{ - struct path *orig_path = dentry->d_fsdata; - path_put(orig_path); - kfree(orig_path); -} - -static const struct dentry_operations ksu_file_wrapper_d_ops = { - .d_dname = ksu_wrapper_d_dname, - .d_release = ksu_wrapper_d_release -}; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) -#define ksu_anon_inode_create_getfile_compat anon_inode_create_getfile -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) -#define ksu_anon_inode_create_getfile_compat anon_inode_getfile_secure -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) -// There is no anon_inode_create_getfile in 4.19, but it's not difficult to implement it. -// https://cs.android.com/android/kernel/superproject/+/common-android12-5.10:common/fs/anon_inodes.c;l=58-125;drc=0d34ce8aa78e38affbb501690bcabec4df88620e - -// Borrow kernel's anon_inode_mnt, so that we don't need to mount one by ourselves. 
-static struct vfsmount *anon_inode_mnt __read_mostly; - -static struct inode * -ksu_anon_inode_make_secure_inode(const char *name, - const struct inode *context_inode) -{ - struct inode *inode; - - if (unlikely(!anon_inode_mnt)) { - return ERR_PTR(-ENODEV); - } - - inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); - if (IS_ERR(inode)) - return inode; - inode->i_flags &= ~S_PRIVATE; - - return inode; -} - -static struct file *ksu_anon_inode_create_getfile_compat( - const char *name, const struct file_operations *fops, void *priv, - int flags, const struct inode *context_inode) -{ - struct inode *inode; - struct file *file; - - if (fops->owner && !try_module_get(fops->owner)) - return ERR_PTR(-ENOENT); - - inode = ksu_anon_inode_make_secure_inode(name, context_inode); - if (IS_ERR(inode)) { - file = ERR_CAST(inode); - goto err; - } - - file = alloc_file_pseudo(inode, anon_inode_mnt, name, - flags & (O_ACCMODE | O_NONBLOCK), fops); - if (IS_ERR(file)) - goto err_iput; - - file->f_mapping = inode->i_mapping; - - file->private_data = priv; - - return file; - -err_iput: - iput(inode); -err: - module_put(fops->owner); - return file; -} -#else // KERNEL_VERSION < 4.19 -struct file *ksu_anon_inode_create_getfile_compat( - const char *name, const struct file_operations *fops, void *priv, - int flags, const struct inode *context_inode) -{ - return anon_inode_getfile(name, fops, priv, flags); -} -#endif - -int ksu_install_file_wrapper(int fd) -{ - int out_fd, ret; - struct file *orig_file = fget(fd); - if (!orig_file) { - return -EBADF; - } - - out_fd = get_unused_fd_flags(O_CLOEXEC); - if (out_fd < 0) { - ret = out_fd; - goto done; - } - - struct ksu_file_wrapper *file_wrapper_data = - ksu_create_file_wrapper(orig_file); - if (IS_ERR(file_wrapper_data)) { - ret = PTR_ERR(file_wrapper_data); - goto out_put_fd; - } - - struct file *wrapper_file = ksu_anon_inode_create_getfile_compat( - "[ksu_fdwrapper]", &file_wrapper_data->ops, file_wrapper_data, - orig_file->f_flags, NULL); 
- if (IS_ERR(wrapper_file)) { - pr_err("ksu_fdwrapper: getfile failed: %ld\n", - PTR_ERR(wrapper_file)); - ret = PTR_ERR(wrapper_file); - goto out_release_wrapper; - } - - // Now do magic on inode and dentry. - // It should be safe to modify them since the file hasn't been published. - - struct inode *wrapper_inode = file_inode(wrapper_file); - // libc's stdio relies on the fstat() result of the fd to determine its buffer type. - wrapper_inode->i_mode = file_inode(orig_file)->i_mode; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) - struct inode_security_struct *wrapper_sec = - selinux_inode(wrapper_inode); -#else - struct inode_security_struct *wrapper_sec = - (struct inode_security_struct *)wrapper_inode->i_security; -#endif - - // Use ksu_file_sid to bypass SELinux check. - // When we call `su` from terminal app, this is useful. - if (wrapper_sec) { - wrapper_sec->sid = ksu_file_sid; - } - // Install open file operation for inode. - wrapper_inode->i_fop = &ksu_file_wrapper_inode_fops; - - struct path *orig_path = kmalloc(sizeof(struct path), GFP_KERNEL); - if (!orig_path) { - ret = -ENOMEM; - goto out_put_wrapper_file; - } - *orig_path = orig_file->f_path; - path_get(orig_path); - // Some applications (such as screen) won't work if the tty's path is weird, - // Therefore, we use d_dname to spoof it to return the path to the original file. 
- wrapper_file->f_path.dentry->d_fsdata = orig_path; - wrapper_file->f_path.dentry->d_op = &ksu_file_wrapper_d_ops; - - fd_install(out_fd, wrapper_file); - ret = out_fd; - goto done; - -out_put_wrapper_file: - fput(wrapper_file); - // file_wrapper will be released by fput - goto out_put_fd; -out_release_wrapper: - ksu_release_file_wrapper(file_wrapper_data); -out_put_fd: - put_unused_fd(out_fd); -done: - fput(orig_file); - - return ret; -} - -void ksu_file_wrapper_init(void) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) && \ - LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) - static const struct file_operations tmp = { .owner = THIS_MODULE }; - struct file *dummy = anon_inode_getfile("dummy", &tmp, NULL, 0); - if (IS_ERR(dummy)) { - pr_err("file_wrapper: initialize anon_inode_mnt failed, can't get file: %ld\n", - PTR_ERR(dummy)); - return; - } - anon_inode_mnt = dummy->f_path.mnt; - if (unlikely(!anon_inode_mnt)) { - pr_err("file_wrapper: initialize anon_inode_mnt failed, got NULL\n"); - } - fput(dummy); -#endif -} diff --git a/drivers/kernelsu/file_wrapper.h b/drivers/kernelsu/file_wrapper.h deleted file mode 100644 index faae4dded301..000000000000 --- a/drivers/kernelsu/file_wrapper.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef KSU_FILE_WRAPPER_H -#define KSU_FILE_WRAPPER_H - -#include -#include - -int ksu_install_file_wrapper(int fd); -void ksu_file_wrapper_init(void); - -#endif // KSU_FILE_WRAPPER_H diff --git a/drivers/kernelsu/kernel_compat.c b/drivers/kernelsu/kernel_compat.c deleted file mode 100644 index 38f0251f08a4..000000000000 --- a/drivers/kernelsu/kernel_compat.c +++ /dev/null @@ -1,199 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include -#else -#include -#endif -#include -#include -#include - -#include "klog.h" // IWYU pragma: keep -#include "kernel_compat.h" - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || 
defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) -#include -#include -#include - -extern int install_session_keyring_to_cred(struct cred *, struct key *); -struct key *init_session_keyring = NULL; - -static int install_session_keyring(struct key *keyring) -{ - struct cred *new; - int ret; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - ret = install_session_keyring_to_cred(new, keyring); - if (ret < 0) { - abort_creds(new); - return ret; - } - - return commit_creds(new); -} -#endif - -struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode) -{ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) - if (init_session_keyring != NULL && !current_cred()->session_keyring && - (current->flags & PF_WQ_WORKER)) { - pr_info("installing init session keyring for older kernel\n"); - install_session_keyring(init_session_keyring); - } -#endif - return filp_open(filename, flags, mode); -} - -ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, - loff_t *pos) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - return kernel_read(p, buf, count, pos); -#else - loff_t offset = pos ? *pos : 0; - ssize_t result = kernel_read(p, offset, (char *)buf, count); - if (pos && result > 0) { - *pos = offset + result; - } - return result; -#endif -} - -ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count, - loff_t *pos) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - return kernel_write(p, buf, count, pos); -#else - loff_t offset = pos ? 
*pos : 0; - ssize_t result = kernel_write(p, buf, count, offset); - if (pos && result > 0) { - *pos = offset + result; - } - return result; -#endif -} - -static inline long -do_strncpy_user_nofault(char *dst, const void __user *unsafe_addr, long count) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) - return strncpy_from_user_nofault(dst, unsafe_addr, count); -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0) - return strncpy_from_unsafe_user(dst, unsafe_addr, count); -#else - mm_segment_t old_fs = get_fs(); - long ret; - - if (unlikely(count <= 0)) - return 0; - - set_fs(USER_DS); - pagefault_disable(); - ret = strncpy_from_user(dst, unsafe_addr, count); - pagefault_enable(); - set_fs(old_fs); - - if (ret >= count) { - ret = count; - dst[ret - 1] = '\0'; - } else if (ret > 0) { - ret++; - } - - return ret; -#endif -} - -long ksu_strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, - long count) -{ -#ifdef CONFIG_KSU_MANUAL_HOOK - long ret; - - ret = do_strncpy_user_nofault(dst, unsafe_addr, count); - if (likely(ret >= 0)) - return ret; - - // we faulted! 
fallback to slow path - if (unlikely(!ksu_access_ok(unsafe_addr, count))) - return -EFAULT; - - ret = strncpy_from_user(dst, unsafe_addr, count); - if (ret >= count) { - ret = count; - dst[ret - 1] = '\0'; - } else if (ret >= 0) { - ret++; - } - - return ret; -#else - return do_strncpy_user_nofault(dst, unsafe_addr, count); -#endif -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) -int path_mount(const char *dev_name, struct path *path, const char *type_page, - unsigned long flags, void *data_page) -{ - // 384 is enough - char buf[384] = { 0 }; - mm_segment_t old_fs; - long ret; - - // -1 on the size as implicit null termination - // as we zero init the thing - char *realpath = d_path(path, buf, sizeof(buf) - 1); - if (!(realpath && realpath != buf)) - return -ENOENT; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = do_mount(dev_name, (const char __user *)realpath, type_page, - flags, data_page); - set_fs(old_fs); - return ret; -} -#endif - -int do_close_fd(unsigned int fd) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - return close_fd(fd); -#else - return __close_fd(current->files, fd); -#endif -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -// https://elixir.bootlin.com/linux/v5.10.247/source/mm/util.c#L664 -void *ksu_compat_kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags) -{ - void *newp; - - if (oldsize >= newsize) - return (void *)p; - newp = kvmalloc(newsize, flags); - if (!newp) - return NULL; - memcpy(newp, p, oldsize); - kvfree(p); - return newp; -} -#endif diff --git a/drivers/kernelsu/kernel_compat.h b/drivers/kernelsu/kernel_compat.h deleted file mode 100644 index b8fe8874d17d..000000000000 --- a/drivers/kernelsu/kernel_compat.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef __KSU_H_KERNEL_COMPAT -#define __KSU_H_KERNEL_COMPAT - -#include -#include -#include -#include - -/* - * Adapt to Huawei HISI kernel without affecting other kernels , - * Huawei Hisi Kernel EBITMAP Enable or Disable Flag , - * From 
ss/ebitmap.h - */ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)) && \ - (LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)) && \ - (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) -#ifdef HISI_SELINUX_EBITMAP_RO -#define CONFIG_IS_HW_HISI -#endif -#endif - -extern long ksu_strncpy_from_user_nofault(char *dst, - const void __user *unsafe_addr, - long count); - -extern struct file *ksu_filp_open_compat(const char *filename, int flags, - umode_t mode); -extern ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, - loff_t *pos); -extern ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, - size_t count, loff_t *pos); - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) -extern struct key *init_session_keyring; -#endif - -extern int do_close_fd(unsigned int fd); - -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -extern void *ksu_compat_kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags); -#endif - -#ifndef VERIFY_READ -#define ksu_access_ok(addr, size) access_ok(addr, size) -#else -#define ksu_access_ok(addr, size) access_ok(VERIFY_READ, addr, size) -#endif - -// Linux >= 5.7 -// task_work_add (struct, struct, enum) -// Linux pre-5.7 -// task_work_add (struct, struct, bool) -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) -#ifndef TWA_RESUME -#define TWA_RESUME true -#endif -#endif - -#endif diff --git a/drivers/kernelsu/kernel_umount.c b/drivers/kernelsu/kernel_umount.c deleted file mode 100644 index cd9889ea7f72..000000000000 --- a/drivers/kernelsu/kernel_umount.c +++ /dev/null @@ -1,190 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "kernel_umount.h" -#include "klog.h" // IWYU pragma: keep -#include "allowlist.h" -#include "kernel_compat.h" -#include "selinux/selinux.h" -#include 
"feature.h" -#include "ksud.h" -#include "ksu.h" - -bool __read_mostly ksu_kernel_umount_enabled = true; - -static int kernel_umount_feature_get(u64 *value) -{ - *value = ksu_kernel_umount_enabled ? 1 : 0; - return 0; -} - -static int kernel_umount_feature_set(u64 value) -{ - bool enable = value != 0; - ksu_kernel_umount_enabled = enable; - pr_info("kernel_umount: set to %d\n", enable); - return 0; -} - -static const struct ksu_feature_handler kernel_umount_handler = { - .feature_id = KSU_FEATURE_KERNEL_UMOUNT, - .name = "kernel_umount", - .get_handler = kernel_umount_feature_get, - .set_handler = kernel_umount_feature_set, -}; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) -extern int path_umount(struct path *path, int flags); -static int ksu_umount_mnt(const char *__never_use_mnt, struct path *path, - int flags) -{ - return path_umount(path, flags); -} -#else -static int ksu_sys_umount(const char *mnt, int flags) -{ - char __user *usermnt = (char __user *)mnt; - mm_segment_t old_fs; - int ret = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) - ret = ksys_umount(usermnt, flags); -#else - // Perhaps its not necessary to cast it - ret = (int)sys_umount(usermnt, flags); // cuz asmlinkage long sys##name -#endif - set_fs(old_fs); - return ret; -} -#define ksu_umount_mnt(mnt, __unused, flags) \ - ({ \ - path_put(__unused); \ - ksu_sys_umount(mnt, flags); \ - }) - -#endif - -static void try_umount(const char *mnt, int flags) -{ - struct path path; - int ret = 0; - if (kern_path(mnt, 0, &path)) { - return; - } - - if (path.dentry != path.mnt->mnt_root) { - // it is not root mountpoint, maybe umounted by others already. 
- path_put(&path); - return; - } - - ret = ksu_umount_mnt(mnt, &path, flags); - if (ret) { - pr_info("%s: umounting %s (flags=0x%x) failed, err: %d\n", - __func__, mnt, flags, ret); - } -} - -struct umount_tw { - struct callback_head cb; -}; - -static void umount_tw_func(struct callback_head *cb) -{ - struct umount_tw *tw = container_of(cb, struct umount_tw, cb); - const struct cred *saved = override_creds(ksu_cred); - - down_read(&mount_list_lock); - struct mount_entry *entry; - list_for_each_entry (entry, &mount_list, list) { - pr_info("%s: unmounting: %s flags 0x%x\n", __func__, - entry->umountable, entry->flags); - try_umount(entry->umountable, entry->flags); - } - up_read(&mount_list_lock); - - revert_creds(saved); - kfree(tw); -} - -int ksu_handle_umount(uid_t old_uid, uid_t new_uid) -{ - // if there isn't any module mounted, just ignore it! - if (!ksu_module_mounted) { - return 0; - } - - if (!ksu_kernel_umount_enabled) { - return 0; - } - - if (!ksu_cred) { - return 0; - } - - // There are 5 scenarios: - // 1. Normal app: zygote -> appuid - // 2. Isolated process forked from zygote: zygote -> isolated_process - // 3. App zygote forked from zygote: zygote -> appuid - // 4. Isolated process froked from app zygote: appuid -> isolated_process (already handled by 3) - // 5. Isolated process froked from webview zygote (no need to handle, app cannot run custom code) - if (!is_appuid(new_uid) && !is_isolated_process(new_uid)) { - return 0; - } - - if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid)) { - return 0; - } - - // check old process's selinux context, if it is not zygote, ignore it! - // because some su apps may setuid to untrusted_app but they are in global mount namespace - // when we umount for such process, that is a disaster! 
- // also handle case 4 and 5 - bool is_zygote_child = is_zygote(get_current_cred()); - if (!is_zygote_child) { - pr_info("handle umount ignore non zygote child: %d\n", - current->pid); - return 0; - } - // umount the target mnt - pr_info("handle umount for uid: %d, pid: %d\n", new_uid, current->pid); - - struct umount_tw *tw; - tw = kzalloc(sizeof(*tw), GFP_ATOMIC); - if (!tw) - return 0; - - tw->cb.func = umount_tw_func; - - int err = task_work_add(current, &tw->cb, TWA_RESUME); - if (err) { - kfree(tw); - pr_warn("unmount add task_work failed\n"); - } - - return 0; -} - -void ksu_kernel_umount_init(void) -{ - if (ksu_register_feature_handler(&kernel_umount_handler)) { - pr_err("Failed to register kernel_umount feature handler\n"); - } -} - -void ksu_kernel_umount_exit(void) -{ - ksu_unregister_feature_handler(KSU_FEATURE_KERNEL_UMOUNT); -} diff --git a/drivers/kernelsu/kernel_umount.h b/drivers/kernelsu/kernel_umount.h deleted file mode 100644 index 96a23fba5bcd..000000000000 --- a/drivers/kernelsu/kernel_umount.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __KSU_H_KERNEL_UMOUNT -#define __KSU_H_KERNEL_UMOUNT - -#include -#include -#include - -void ksu_kernel_umount_init(void); -void ksu_kernel_umount_exit(void); - -// Handler function to be called from setresuid hook -int ksu_handle_umount(uid_t old_uid, uid_t new_uid); - -// for the umount list -struct mount_entry { - char *umountable; - unsigned int flags; - struct list_head list; -}; -extern struct list_head mount_list; -extern struct rw_semaphore mount_list_lock; - -extern bool __read_mostly ksu_kernel_umount_enabled; - -#endif diff --git a/drivers/kernelsu/klog.h b/drivers/kernelsu/klog.h deleted file mode 100644 index a934027fbeeb..000000000000 --- a/drivers/kernelsu/klog.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef __KSU_H_KLOG -#define __KSU_H_KLOG - -#include - -#ifdef pr_fmt -#undef pr_fmt -#define pr_fmt(fmt) "KernelSU: " fmt -#endif - -#endif diff --git a/drivers/kernelsu/kp_hook.c 
b/drivers/kernelsu/kp_hook.c deleted file mode 100644 index 23ef72fb14ba..000000000000 --- a/drivers/kernelsu/kp_hook.c +++ /dev/null @@ -1,167 +0,0 @@ -#include -#include -#include - -#define DECL_KP(name, sym, pre) \ - struct kprobe name = { \ - .symbol_name = sym, \ - .pre_handler = pre, \ - } - -// ksud.c - -static struct work_struct stop_vfs_read_work, stop_execve_hook_work, - stop_input_hook_work; - -static int sys_execve_handler_pre(struct kprobe *p, struct pt_regs *regs) -{ - struct pt_regs *real_regs = PT_REAL_REGS(regs); - const char __user **filename_user = - (const char **)&PT_REGS_PARM1(real_regs); - const char __user *const __user *__argv = - (const char __user *const __user *)PT_REGS_PARM2(real_regs); - struct user_arg_ptr argv = { .ptr.native = __argv }; - struct filename filename_in, *filename_p; - char path[32]; - - if (!filename_user) - return 0; - if (!ksu_retry_filename_access(filename_user, path, 32, false)) - return 0; - - filename_in.name = path; - filename_p = &filename_in; - return ksu_handle_execveat_ksud((int *)AT_FDCWD, &filename_p, &argv, - NULL, NULL); -} - -static int sys_read_handler_pre(struct kprobe *p, struct pt_regs *regs) -{ - struct pt_regs *real_regs = PT_REAL_REGS(regs); - unsigned int fd = PT_REGS_PARM1(real_regs); - char __user **buf_ptr = (char __user **)&PT_REGS_PARM2(real_regs); - size_t *count_ptr = (size_t *)&PT_REGS_PARM3(real_regs); - - return ksu_handle_sys_read(fd, buf_ptr, count_ptr); -} - -static int input_handle_event_handler_pre(struct kprobe *p, - struct pt_regs *regs) -{ - unsigned int *type = (unsigned int *)&PT_REGS_PARM2(regs); - unsigned int *code = (unsigned int *)&PT_REGS_PARM3(regs); - int *value = (int *)&PT_REGS_CCALL_PARM4(regs); - return ksu_handle_input_handle_event(type, code, value); -} - -static DECL_KP(execve_kp, SYS_EXECVE_SYMBOL, sys_execve_handler_pre); -static DECL_KP(vfs_read_kp, SYS_READ_SYMBOL, sys_read_handler_pre); -static DECL_KP(input_event_kp, "input_event", 
input_handle_event_handler_pre); - -static void do_stop_vfs_read_hook(struct work_struct *work) -{ - unregister_kprobe(&vfs_read_kp); -} - -static void do_stop_execve_hook(struct work_struct *work) -{ - unregister_kprobe(&execve_kp); -} - -static void do_stop_input_hook(struct work_struct *work) -{ - unregister_kprobe(&input_event_kp); -} - -void kp_handle_ksud_stop(enum ksud_stop_code stop_code) -{ - bool ret; - switch (stop_code) { - case VFS_READ_HOOK_KP: { - ret = schedule_work(&stop_vfs_read_work); - pr_info("unregister vfs_read kprobe: %d!\n", ret); - break; - } - case EXECVE_HOOK_KP: { - ret = schedule_work(&stop_execve_hook_work); - pr_info("unregister execve kprobe: %d!\n", ret); - break; - } - case INPUT_EVENT_HOOK_KP: { - static bool input_hook_stopped = false; - if (input_hook_stopped) { - return; - } - input_hook_stopped = true; - ret = schedule_work(&stop_input_hook_work); - pr_info("unregister input kprobe: %d!\n", ret); - break; - } - default: - return; - } - return; -} - -void kp_handle_ksud_init(void) -{ - int ret; - - ret = register_kprobe(&execve_kp); - pr_info("ksud: execve_kp: %d\n", ret); - - ret = register_kprobe(&vfs_read_kp); - pr_info("ksud: vfs_read_kp: %d\n", ret); - - ret = register_kprobe(&input_event_kp); - pr_info("ksud: input_event_kp: %d\n", ret); - - INIT_WORK(&stop_vfs_read_work, do_stop_vfs_read_hook); - INIT_WORK(&stop_execve_hook_work, do_stop_execve_hook); - INIT_WORK(&stop_input_hook_work, do_stop_input_hook); -} - -void kp_handle_ksud_exit(void) -{ - unregister_kprobe(&execve_kp); - // this should be done before unregister vfs_read_kp - // unregister_kprobe(&vfs_read_kp); - unregister_kprobe(&input_event_kp); -} - -// supercalls.c - -extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, - void __user **arg); - -static int reboot_handler_pre(struct kprobe *p, struct pt_regs *regs) -{ - struct pt_regs *real_regs = PT_REAL_REGS(regs); - int magic1 = (int)PT_REGS_PARM1(real_regs); - int magic2 = 
(int)PT_REGS_PARM2(real_regs); - void __user **arg = (void __user **)&PT_REGS_SYSCALL_PARM4(real_regs); - - // cmd is not really used here, so we NULL! - if (ksu_handle_sys_reboot(magic1, magic2, NULL, arg)) { - pr_err("kp_hook: sys_reboot failure\n"); - } - - return 0; -} - -static DECL_KP(reboot_kp, REBOOT_SYMBOL, reboot_handler_pre); - -void kp_handle_supercalls_init(void) -{ - int rc = register_kprobe(&reboot_kp); - if (rc) { - pr_err("reboot kprobe failed: %d\n", rc); - return; - } - pr_info("reboot kprobe registered successfully\n"); -} - -void kp_handle_supercalls_exit(void) -{ - unregister_kprobe(&reboot_kp); -} diff --git a/drivers/kernelsu/kp_hook.h b/drivers/kernelsu/kp_hook.h deleted file mode 100644 index 708e78665ba8..000000000000 --- a/drivers/kernelsu/kp_hook.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __KSU_H_KP_HOOK -#define __KSU_H_KP_HOOK - -// ksud.c -enum ksud_stop_code { - VFS_READ_HOOK_KP = 0, - EXECVE_HOOK_KP, - INPUT_EVENT_HOOK_KP, -}; - -int ksu_handle_sys_read(unsigned int fd, char __user **buf_ptr, - size_t *count_ptr); - -int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code, - int *value); - -void kp_handle_ksud_stop(enum ksud_stop_code); -void kp_handle_ksud_init(void); -void kp_handle_ksud_exit(void); - -// supercalls.c -void kp_handle_supercalls_init(void); -void kp_handle_supercalls_exit(void); - -#endif diff --git a/drivers/kernelsu/kp_util.c b/drivers/kernelsu/kp_util.c deleted file mode 100644 index 05e6715672c8..000000000000 --- a/drivers/kernelsu/kp_util.c +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include -#include -#include - -static bool try_set_access_flag(unsigned long addr) -{ -#ifdef CONFIG_ARM64 - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - bool ret = false; - - if (!mm) - return false; - - if (!mmap_read_trylock(mm)) - return false; - - vma = find_vma(mm, addr); - if (!vma 
|| addr < vma->vm_start) - goto out_unlock; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out_unlock; - - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) - goto out_unlock; - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) - goto out_unlock; - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out_unlock; - - if (pmd_trans_huge(*pmd)) - goto out_unlock; - - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!ptep) - goto out_unlock; - - pte = *ptep; - - if (!pte_present(pte)) - goto out_pte_unlock; - - if (pte_young(pte)) { - ret = true; - goto out_pte_unlock; - } - - ptep_set_access_flags(vma, addr, ptep, pte_mkyoung(pte), 0); - pr_info("set AF for addr %lx\n", addr); - ret = true; - -out_pte_unlock: - pte_unmap_unlock(ptep, ptl); -out_unlock: - mmap_read_unlock(mm); - return ret; -#else - return false; -#endif -} - -bool ksu_retry_filename_access(const char __user **char_usr_ptr, char *dest, - size_t dest_len, bool exit_atomic_ctx) -{ - unsigned long addr; - const char __user *fn; - long ret; - - if (!char_usr_ptr) - return false; - - addr = untagged_addr((unsigned long)*char_usr_ptr); -#ifdef CONFIG_KSU_DEBUG - pr_info("got addr: %lu\n", addr); -#endif - fn = (const char __user *)addr; - memset(dest, 0, dest_len); - ret = ksu_strncpy_from_user_nofault(dest, fn, dest_len); - - if (ret < 0 && try_set_access_flag(addr)) { - ret = ksu_strncpy_from_user_nofault(dest, fn, dest_len); - } - - /* - * This is crazy, but we know what we are doing: - * Temporarily exit atomic context to handle page faults, then restore it. - */ - if (exit_atomic_ctx) { - if (ret < 0 && preempt_count()) { -#ifdef CONFIG_KSU_DEBUG - pr_info("access to pointer failed, attempting to rescue..\n"); -#endif - preempt_enable_no_resched_notrace(); - ret = strncpy_from_user(dest, fn, dest_len); - preempt_disable_notrace(); - } - } - - if (ret < 0) { - pr_err("all fallback were tried. 
err: %lu\n", ret); - return false; - } - - return true; -} diff --git a/drivers/kernelsu/kp_util.h b/drivers/kernelsu/kp_util.h deleted file mode 100644 index b9128964d6a8..000000000000 --- a/drivers/kernelsu/kp_util.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __KSU_H_KP_UTIL -#define __KSU_H_KP_UTIL -#include - -#ifndef preempt_enable_no_resched_notrace -#define preempt_enable_no_resched_notrace() \ - do { \ - barrier(); \ - __preempt_count_dec(); \ - } while (0) -#endif - -#ifndef preempt_disable_notrace -#define preempt_disable_notrace() \ - do { \ - __preempt_count_inc(); \ - barrier(); \ - } while (0) -#endif - -bool ksu_retry_filename_access(const char __user **char_usr_ptr, char *dest, - size_t dest_len, bool exit_atomic_ctx); - -#endif diff --git a/drivers/kernelsu/ksu.h b/drivers/kernelsu/ksu.h deleted file mode 100644 index 32e81d967fff..000000000000 --- a/drivers/kernelsu/ksu.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef __KSU_H_KSU -#define __KSU_H_KSU - -#include -#include -#include - -#define KERNEL_SU_VERSION 32430 - -#define EVENT_POST_FS_DATA 1 -#define EVENT_BOOT_COMPLETED 2 -#define EVENT_MODULE_MOUNTED 3 - -static inline int startswith(char *s, char *prefix) -{ - return strncmp(s, prefix, strlen(prefix)); -} - -static inline int endswith(const char *s, const char *t) -{ - size_t slen = strlen(s); - size_t tlen = strlen(t); - if (tlen > slen) - return 1; - return strcmp(s + slen - tlen, t); -} - -extern struct cred *ksu_cred; - -#endif diff --git a/drivers/kernelsu/ksud.c b/drivers/kernelsu/ksud.c deleted file mode 100644 index c880d2270c3a..000000000000 --- a/drivers/kernelsu/ksud.c +++ /dev/null @@ -1,644 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0) -#include -#else -#include -#endif -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -#include -#else -#include -#endif - 
-#include "manager.h" -#include "allowlist.h" -#include "arch.h" -#include "kernel_compat.h" -#include "klog.h" // IWYU pragma: keep -#include "ksud.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "kp_hook.h" -#endif -#include "selinux/selinux.h" -#include "throne_tracker.h" - -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) -extern int ksu_observer_init(void); -#endif - -bool ksu_module_mounted __read_mostly = false; -bool ksu_boot_completed __read_mostly = false; - -static const char KERNEL_SU_RC[] = - "\n" - - "on post-fs-data\n" - " start logd\n" - // We should wait for the post-fs-data finish - " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH - " post-fs-data\n" - "\n" - - "on nonencrypted\n" - " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n" - "\n" - - "on property:vold.decrypt=trigger_restart_framework\n" - " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n" - "\n" - - "on property:sys.boot_completed=1\n" - " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH - " boot-completed\n" - "\n" - - "\n"; - -static void stop_vfs_read_hook(void); -static void stop_execve_hook(void); -static void stop_input_hook(void); - -#ifdef CONFIG_KSU_MANUAL_HOOK -bool ksu_vfs_read_hook __read_mostly = true; -bool ksu_execveat_hook __read_mostly = true; -bool ksu_input_hook __read_mostly = true; -#endif - -void on_post_fs_data(void) -{ - static bool already_post_fs_data = false; - if (already_post_fs_data) { - pr_info("on_post_fs_data already done\n"); - return; - } - already_post_fs_data = true; - pr_info("on_post_fs_data!\n"); - ksu_load_allow_list(); -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) - ksu_observer_init(); -#endif - stop_input_hook(); -} - -extern void ext4_unregister_sysfs(struct super_block *sb); -int nuke_ext4_sysfs(const char *mnt) -{ - struct path path; 
- int err = kern_path(mnt, 0, &path); - if (err) { - pr_err("nuke path err: %d\n", err); - return err; - } - - struct super_block *sb = path.dentry->d_inode->i_sb; - const char *name = sb->s_type->name; - if (strcmp(name, "ext4") != 0) { - pr_info("nuke but module aren't mounted\n"); - path_put(&path); - return -EINVAL; - } - - ext4_unregister_sysfs(sb); - path_put(&path); - return 0; -} - -void on_module_mounted(void) -{ - pr_info("on_module_mounted!\n"); - ksu_module_mounted = true; -} - -void on_boot_completed(void) -{ - ksu_boot_completed = true; - pr_info("on_boot_completed!\n"); -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) - track_throne(true); -#endif -} - -#define MAX_ARG_STRINGS 0x7FFFFFFF - -static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) -{ - const char __user *native; - -#ifdef CONFIG_COMPAT - if (unlikely(argv.is_compat)) { - compat_uptr_t compat; - - if (get_user(compat, argv.ptr.compat + nr)) - return ERR_PTR(-EFAULT); - - return compat_ptr(compat); - } -#endif - - if (get_user(native, argv.ptr.native + nr)) - return ERR_PTR(-EFAULT); - - return native; -} - -/* - * count() counts the number of strings in array ARGV. - */ - -/* - * Make sure old GCC compiler can use __maybe_unused, - * Test passed in 4.4.x ~ 4.9.x when use GCC. 
- */ - -static int __maybe_unused count(struct user_arg_ptr argv, int max) -{ - int i = 0; - - if (argv.ptr.native != NULL) { - for (;;) { - const char __user *p = get_user_arg_ptr(argv, i); - - if (!p) - break; - - if (IS_ERR(p)) - return -EFAULT; - - if (i >= max) - return -E2BIG; - ++i; - - if (fatal_signal_pending(current)) - return -ERESTARTNOHAND; -#ifdef CONFIG_KSU_MANUAL_HOOK - cond_resched(); -#endif - } - } - return i; -} - -static void on_post_fs_data_cbfun(struct callback_head *cb) -{ - on_post_fs_data(); -} - -static struct callback_head on_post_fs_data_cb = { - .func = on_post_fs_data_cbfun -}; - -static inline void handle_second_stage(void) -{ - apply_kernelsu_rules(); - cache_sid(); - setup_ksu_cred(); -} - -static bool check_argv(struct user_arg_ptr argv, int index, - const char *expected, char *buf, size_t buf_len) -{ - const char __user *p; - int argc; - long ret; - - argc = count(argv, MAX_ARG_STRINGS); - if (argc <= index) { - return false; - } - - p = get_user_arg_ptr(argv, index); - if (IS_ERR_OR_NULL(p)) { - if (PTR_ERR(p)) { - pr_err("check_argv: invalid user pointer, err: %ld\n", - PTR_ERR(p)); - } - return false; - } - - ret = ksu_strncpy_from_user_nofault(buf, p, buf_len); - if (ret <= 0) { - pr_err("check_argv: failed to copy pointer, err: %ld\n", ret); - return false; - } - - buf[buf_len - 1] = '\0'; - - return !strcmp(buf, expected); -} - -// IMPORTANT NOTE: the call from execve_handler_pre WON'T provided correct value for envp and flags in GKI version -int ksu_handle_execveat_ksud(int *fd, struct filename **filename_ptr, - struct user_arg_ptr *argv, - struct user_arg_ptr *envp, int *flags) -{ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (!ksu_execveat_hook) { - return 0; - } -#endif - struct filename *filename; - - static const char app_process[] = "/system/bin/app_process"; - static bool first_zygote = true; - - /* This applies to versions Android 10+ */ - static const char system_bin_init[] = "/system/bin/init"; - /* This applies to 
versions between Android 6 ~ 9 */ - static const char old_system_init[] = "/init"; - static bool init_second_stage_executed = false; - - if (!filename_ptr) - return 0; - - filename = *filename_ptr; - if (IS_ERR(filename)) { - return 0; - } - -#ifdef CONFIG_KSU_MANUAL_HOOK - if (current->pid != 1 && is_init(get_current_cred())) { - if (unlikely(strcmp(filename->name, KSUD_PATH) == 0)) { - pr_info("escape to root for init executing ksud: %d\n", - current->pid); - escape_to_root_for_init(); - } - } -#endif - - if (unlikely(!memcmp(filename->name, system_bin_init, - sizeof(system_bin_init) - 1) && - argv)) { - char buf[16]; - if (!init_second_stage_executed && - check_argv(*argv, 1, "second_stage", buf, sizeof(buf))) { - pr_info("/system/bin/init second_stage executed\n"); - handle_second_stage(); - init_second_stage_executed = true; - } - } else if (unlikely(!memcmp(filename->name, old_system_init, - sizeof(old_system_init) - 1) && - argv)) { - char buf[16]; - if (!init_second_stage_executed && - check_argv(*argv, 1, "--second-stage", buf, sizeof(buf))) { - /* This applies to versions between Android 6 ~ 7 */ - pr_info("/init second_stage executed\n"); - handle_second_stage(); - init_second_stage_executed = true; - } else if (count(*argv, MAX_ARG_STRINGS) == 1 && - !init_second_stage_executed && envp) { - /* This applies to versions between Android 8 ~ 9 */ - int envc = count(*envp, MAX_ARG_STRINGS); - if (envc > 0) { - int n; - for (n = 1; n <= envc; n++) { - const char __user *p = - get_user_arg_ptr(*envp, n); - if (!p || IS_ERR(p)) { - continue; - } - char env[256]; - // Reading environment variable strings from user space - if (ksu_strncpy_from_user_nofault( - env, p, sizeof(env)) < 0) - continue; - // Parsing environment variable names and values - char *env_name = env; - char *env_value = strchr(env, '='); - if (env_value == NULL) - continue; - // Replace equal sign with string terminator - *env_value = '\0'; - env_value++; - // Check if the environment variable 
name and value are matching - if (!strcmp(env_name, - "INIT_SECOND_STAGE") && - (!strcmp(env_value, "1") || - !strcmp(env_value, "true"))) { - pr_info("/init second_stage executed\n"); - handle_second_stage(); - init_second_stage_executed = - true; - } - } - } - } - } - - if (unlikely(first_zygote && - !memcmp(filename->name, app_process, - sizeof(app_process) - 1) && - argv)) { - char buf[16]; - if (check_argv(*argv, 1, "-Xzygote", buf, sizeof(buf))) { - pr_info("exec zygote, /data prepared, second_stage: %d\n", - init_second_stage_executed); - rcu_read_lock(); - struct task_struct *init_task = - rcu_dereference(current->real_parent); - if (init_task) - task_work_add(init_task, &on_post_fs_data_cb, - TWA_RESUME); - rcu_read_unlock(); - first_zygote = false; - stop_execve_hook(); - } - } - - return 0; -} - -static ssize_t (*orig_read)(struct file *, char __user *, size_t, loff_t *); -static ssize_t (*orig_read_iter)(struct kiocb *, struct iov_iter *); -static struct file_operations fops_proxy; -static ssize_t ksu_rc_pos = 0; -const size_t ksu_rc_len = sizeof(KERNEL_SU_RC) - 1; - -// https://cs.android.com/android/platform/superproject/main/+/main:system/core/init/parser.cpp;l=144;drc=61197364367c9e404c7da6900658f1b16c42d0da -// https://cs.android.com/android/platform/superproject/main/+/main:system/libbase/file.cpp;l=241-243;drc=61197364367c9e404c7da6900658f1b16c42d0da -// The system will read init.rc file until EOF, whenever read() returns 0, -// so we begin append ksu rc when we meet EOF. 
- -static ssize_t read_proxy(struct file *file, char __user *buf, size_t count, - loff_t *pos) -{ - ssize_t ret = 0; - size_t append_count; - if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len) - goto append_ksu_rc; - - ret = orig_read(file, buf, count, pos); - if (ret != 0 || ksu_rc_pos >= ksu_rc_len) { - return ret; - } else { - pr_info("read_proxy: orig read finished, start append rc\n"); - } -append_ksu_rc: - append_count = ksu_rc_len - ksu_rc_pos; - if (append_count > count - ret) - append_count = count - ret; - // copy_to_user returns the number of not copied - if (copy_to_user(buf + ret, KERNEL_SU_RC + ksu_rc_pos, append_count)) { - pr_info("read_proxy: append error, totally appended %zd\n", - ksu_rc_pos); - } else { - pr_info("read_proxy: append %zu\n", append_count); - - ksu_rc_pos += append_count; - if (ksu_rc_pos == ksu_rc_len) { - pr_info("read_proxy: append done\n"); - } - ret += append_count; - } - - return ret; -} - -static ssize_t read_iter_proxy(struct kiocb *iocb, struct iov_iter *to) -{ - ssize_t ret = 0; - size_t append_count; - if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len) - goto append_ksu_rc; - - ret = orig_read_iter(iocb, to); - if (ret != 0 || ksu_rc_pos >= ksu_rc_len) { - return ret; - } else { - pr_info("read_iter_proxy: orig read finished, start append rc\n"); - } -append_ksu_rc: - // copy_to_iter returns the number of copied bytes - append_count = copy_to_iter(KERNEL_SU_RC + ksu_rc_pos, - ksu_rc_len - ksu_rc_pos, to); - if (!append_count) { - pr_info("read_iter_proxy: append error, totally appended %zd\n", - ksu_rc_pos); - } else { - pr_info("read_iter_proxy: append %zu\n", append_count); - - ksu_rc_pos += append_count; - if (ksu_rc_pos == ksu_rc_len) { - pr_info("read_iter_proxy: append done\n"); - } - ret += append_count; - } - return ret; -} - -static bool check_init_path(char *dpath) -{ - const char *valid_paths[] = { "/system/etc/init/hw/init.rc", - "/init.rc" }; - bool path_match = false; - int i; - - for (i = 0; i < ARRAY_SIZE(valid_paths); 
i++) { - if (strcmp(dpath, valid_paths[i]) == 0) { - path_match = true; - break; - } - } - - if (!path_match) { - pr_err("vfs_read: couldn't determine init.rc path for %s\n", - dpath); - return false; - } - - pr_info("vfs_read: got init.rc path: %s\n", dpath); - return true; -} - -int ksu_handle_vfs_read(struct file **file_ptr, char __user **buf_ptr, - size_t *count_ptr, loff_t **pos) -{ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (!ksu_vfs_read_hook) { - return 0; - } -#endif - - struct file *file; - size_t count; - - if (strcmp(current->comm, "init")) { - // we are only interest in `init` process - return 0; - } - - file = *file_ptr; - if (IS_ERR(file)) { - return 0; - } - - if (!d_is_reg(file->f_path.dentry)) { - return 0; - } - - const char *short_name = file->f_path.dentry->d_name.name; - if (strcmp(short_name, "init.rc")) { - // we are only interest `init.rc` file name file - return 0; - } - char path[256]; - char *dpath = d_path(&file->f_path, path, sizeof(path)); - - if (IS_ERR(dpath)) { - return 0; - } - - if (!check_init_path(dpath)) { - return 0; - } - - // we only process the first read - static bool rc_hooked = false; - if (rc_hooked) { - // we don't need this kprobe, unregister it! - stop_vfs_read_hook(); - return 0; - } - rc_hooked = true; - - // now we can sure that the init process is reading - // `/system/etc/init/hw/init.rc` or `/init.rc` - count = *count_ptr; - - pr_info("vfs_read: %s, comm: %s, count: %zu, rc_count: %zu\n", dpath, - current->comm, count, ksu_rc_len); - - // Now we need to proxy the read and modify the result! - // But, we can not modify the file_operations directly, because it's in read-only memory. - // We just replace the whole file_operations with a proxy one. 
- memcpy(&fops_proxy, file->f_op, sizeof(struct file_operations)); - orig_read = file->f_op->read; - if (orig_read) { - fops_proxy.read = read_proxy; - } - orig_read_iter = file->f_op->read_iter; - if (orig_read_iter) { - fops_proxy.read_iter = read_iter_proxy; - } - // replace the file_operations - file->f_op = &fops_proxy; - - return 0; -} - -int ksu_handle_sys_read(unsigned int fd, char __user **buf_ptr, - size_t *count_ptr) -{ - struct file *file = fget(fd); - if (!file) { - return 0; - } - int result = ksu_handle_vfs_read(&file, buf_ptr, count_ptr, NULL); - fput(file); - return result; -} - -static unsigned int volumedown_pressed_count = 0; - -static bool is_volumedown_enough(unsigned int count) -{ - return count >= 3; -} - -int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code, - int *value) -{ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (!ksu_input_hook) { - return 0; - } -#endif - - if (*type == EV_KEY && *code == KEY_VOLUMEDOWN && *value) { - // key pressed, count it - volumedown_pressed_count++; - pr_info("input_handle_event: vol_down pressed count: %u\n", - volumedown_pressed_count); - if (is_volumedown_enough(volumedown_pressed_count)) { - pr_info("input_handle_event: vol_down pressed MAX! safe mode is active!\n"); - stop_input_hook(); - } - } - - return 0; -} - -bool ksu_is_safe_mode(void) -{ - return is_volumedown_enough(volumedown_pressed_count); -} - -static void stop_vfs_read_hook(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_stop(VFS_READ_HOOK_KP); -#else - ksu_vfs_read_hook = false; - pr_info("stop vfs_read_hook\n"); -#endif -} - -static void stop_execve_hook(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_stop(EXECVE_HOOK_KP); -#else - ksu_execveat_hook = false; - pr_info("stop execve_hook\n"); -#endif -} - -static void stop_input_hook(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_stop(INPUT_EVENT_HOOK_KP); -#else - // No need to stop when its already stopped. 
- if (!ksu_input_hook) { - return; - } - ksu_input_hook = false; - pr_info("stop input_hook\n"); -#endif -} - -// ksud: module support -void ksu_ksud_init(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_init(); -#endif -} - -void ksu_ksud_exit(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_exit(); -#endif -} diff --git a/drivers/kernelsu/ksud.h b/drivers/kernelsu/ksud.h deleted file mode 100644 index 68c545714c24..000000000000 --- a/drivers/kernelsu/ksud.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __KSU_H_KSUD -#define __KSU_H_KSUD - -#include - -#define KSUD_PATH "/data/adb/ksud" - -void ksu_ksud_init(void); -void ksu_ksud_exit(void); - -void on_post_fs_data(void); -void on_module_mounted(void); -void on_boot_completed(void); - -bool ksu_is_safe_mode(void); - -int nuke_ext4_sysfs(const char *mnt); - -extern u32 ksu_file_sid; -extern bool ksu_module_mounted; -extern bool ksu_boot_completed; - -struct user_arg_ptr { -#ifdef CONFIG_COMPAT - bool is_compat; -#endif - union { - const char __user *const __user *native; -#ifdef CONFIG_COMPAT - const compat_uptr_t __user *compat; -#endif - } ptr; -}; - -int ksu_handle_execveat_ksud(int *fd, struct filename **filename_ptr, - struct user_arg_ptr *argv, - struct user_arg_ptr *envp, int *flags); - -#endif diff --git a/drivers/kernelsu/ksuinit.c b/drivers/kernelsu/ksuinit.c deleted file mode 100644 index 75cfced0268d..000000000000 --- a/drivers/kernelsu/ksuinit.c +++ /dev/null @@ -1,140 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include /* LINUX_VERSION_CODE, KERNEL_VERSION macros */ - -#include "allowlist.h" -#include "arch.h" -#include "feature.h" -#include "klog.h" // IWYU pragma: keep -#include "ksu.h" -#include "throne_tracker.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "syscall_handler.h" -#endif -#ifdef CONFIG_KSU_MANUAL_HOOK -#include "setuid_hook.h" -#include "sucompat.h" -#endif -#include "ksud.h" -#include "supercalls.h" -#include "ksu.h" -#include 
"file_wrapper.h" - -struct cred *ksu_cred; - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) -extern void __init ksu_lsm_hook_init(void); -#endif - -int __init kernelsu_init(void) -{ -#ifdef CONFIG_KSU_DEBUG - pr_alert( - "*************************************************************"); - pr_alert( - "** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **"); - pr_alert( - "** **"); - pr_alert( - "** You are running KernelSU in DEBUG mode **"); - pr_alert( - "** **"); - pr_alert( - "** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **"); - pr_alert( - "*************************************************************"); -#endif - - ksu_cred = prepare_creds(); - if (!ksu_cred) { - pr_err("prepare cred failed!\n"); - } - - ksu_feature_init(); - - ksu_supercalls_init(); - -#ifdef CONFIG_KSU_SYSCALL_HOOK - ksu_syscall_hook_manager_init(); -#endif -#ifdef CONFIG_KSU_MANUAL_HOOK -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 8, 0) - ksu_lsm_hook_init(); -#endif - ksu_setuid_hook_init(); - ksu_sucompat_init(); -#endif - - ksu_allowlist_init(); - - ksu_throne_tracker_init(); - - ksu_ksud_init(); - - ksu_file_wrapper_init(); - -#ifdef MODULE -#ifndef CONFIG_KSU_DEBUG - kobject_del(&THIS_MODULE->mkobj.kobj); -#endif -#endif - return 0; -} - -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) -extern void ksu_observer_exit(void); -#endif - -void kernelsu_exit(void) -{ - ksu_allowlist_exit(); - - ksu_throne_tracker_exit(); - -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) - ksu_observer_exit(); -#endif - - ksu_ksud_exit(); - -#ifdef CONFIG_KSU_SYSCALL_HOOK - ksu_syscall_hook_manager_exit(); -#endif -#ifdef CONFIG_KSU_MANUAL_HOOK - ksu_sucompat_exit(); - ksu_setuid_hook_exit(); -#endif - - ksu_supercalls_exit(); - - ksu_feature_exit(); - - if (ksu_cred) { - put_cred(ksu_cred); - } 
-} - -module_init(kernelsu_init); -module_exit(kernelsu_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("weishu"); -MODULE_DESCRIPTION("Android KernelSU"); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 13, 0) -MODULE_IMPORT_NS("VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver"); -#else -MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); -#endif -#endif diff --git a/drivers/kernelsu/lsm_hook.c b/drivers/kernelsu/lsm_hook.c deleted file mode 100644 index e1c0a76ec5ba..000000000000 --- a/drivers/kernelsu/lsm_hook.c +++ /dev/null @@ -1,117 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) -static int ksu_key_permission(key_ref_t key_ref, const struct cred *cred, - unsigned perm) -{ - if (init_session_keyring != NULL) { - return 0; - } - if (strcmp(current->comm, "init")) { - // we are only interested in `init` process - return 0; - } - init_session_keyring = cred->session_keyring; - pr_info("kernel_compat: got init_session_keyring\n"); - return 0; -} -#endif - -static int ksu_inode_rename(struct inode *old_inode, struct dentry *old_dentry, - struct inode *new_inode, struct dentry *new_dentry) -{ - // skip kernel threads - if (!current->mm) { - return 0; - } - - // skip non system uid - if (current_uid().val != 1000) { - return 0; - } - - if (!old_dentry || !new_dentry) { - return 0; - } - - // /data/system/packages.list.tmp -> /data/system/packages.list - if (strcmp(new_dentry->d_iname, "packages.list")) { - return 0; - } - - char path[128]; - char *buf = dentry_path_raw(new_dentry, path, sizeof(path)); - if (IS_ERR(buf)) { - pr_err("dentry_path_raw failed.\n"); - return 0; - } - - if (!strstr(buf, "/system/packages.list")) { - return 0; - } - - pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname, - 
new_dentry->d_iname, buf); - - /* - * RKSU note: - * track_throne(true) only occurs on on_boot_completed event. - * When using this LSM, we must handle it here, else it returns - * ENOENT (-2). - */ - static bool did = false; - if (ksu_boot_completed && !did) { - did = true; - track_throne(true); - return 0; - } - - track_throne(false); - - return 0; -} - -static int ksu_task_fix_setuid(struct cred *new, const struct cred *old, - int flags) -{ - if (!new || !old) - return 0; - - return ksu_handle_setuid_common(new->uid.val, old->uid.val, new->euid.val); -} - -static struct security_hook_list ksu_hooks[] = { -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) - LSM_HOOK_INIT(key_permission, ksu_key_permission), -#endif - LSM_HOOK_INIT(inode_rename, ksu_inode_rename), - LSM_HOOK_INIT(task_fix_setuid, ksu_task_fix_setuid) -}; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) -static const struct lsm_id ksu_lsmid = { - .name = "ksu", - .id = 912, -}; -#endif - -void __init ksu_lsm_hook_init(void) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) - security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), &ksu_lsmid); -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), "ksu"); -#else - // https://elixir.bootlin.com/linux/v4.10.17/source/include/linux/lsm_hooks.h#L1892 - security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks)); -#endif - pr_info("LSM hooks initialized.\n"); -} diff --git a/drivers/kernelsu/manager.h b/drivers/kernelsu/manager.h deleted file mode 100644 index a22ac52ec1f2..000000000000 --- a/drivers/kernelsu/manager.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef __KSU_H_KSU_MANAGER -#define __KSU_H_KSU_MANAGER - -#include -#include -#include "allowlist.h" - -#define KSU_INVALID_APPID -1 - -extern uid_t ksu_manager_appid; // DO NOT DIRECT USE - -static inline bool ksu_is_manager_appid_valid(void) -{ - return ksu_manager_appid != 
KSU_INVALID_APPID; -} - -static inline bool is_manager(void) -{ - return unlikely(ksu_manager_appid == - current_uid().val % PER_USER_RANGE); -} - -static inline uid_t ksu_get_manager_appid(void) -{ - return ksu_manager_appid; -} - -static inline void ksu_set_manager_appid(uid_t appid) -{ - ksu_manager_appid = appid; -} - -static inline void ksu_invalidate_manager_uid(void) -{ - ksu_manager_appid = KSU_INVALID_APPID; -} - -#endif diff --git a/drivers/kernelsu/manager_sign.h b/drivers/kernelsu/manager_sign.h deleted file mode 100644 index 2766b261e311..000000000000 --- a/drivers/kernelsu/manager_sign.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __KSU_H_MANAGER_SIGN -#define __KSU_H_MANAGER_SIGN - -#include - -// rsuntk/KernelSU -#define EXPECTED_SIZE_RSUNTK 0x396 -#define EXPECTED_HASH_RSUNTK \ - "f415f4ed9435427e1fdf7f1fccd4dbc07b3d6b8751e4dbcec6f19671f427870b" - -typedef struct { - u32 size; - const char *sha256; -} apk_sign_key_t; - -#endif /* MANAGER_SIGN_H */ diff --git a/drivers/kernelsu/pkg_observer.c b/drivers/kernelsu/pkg_observer.c deleted file mode 100644 index 049c58e38caf..000000000000 --- a/drivers/kernelsu/pkg_observer.c +++ /dev/null @@ -1,126 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include - -#define MASK_SYSTEM (FS_CREATE | FS_MOVE | FS_EVENT_ON_CHILD) - -struct watch_dir { - const char *path; - u32 mask; - struct path kpath; - struct inode *inode; - struct fsnotify_mark *mark; -}; - -static struct fsnotify_group *g; - -static int ksu_handle_inode_event(struct fsnotify_mark *mark, u32 mask, - struct inode *inode, struct inode *dir, - const struct qstr *file_name, u32 cookie) -{ - if (!file_name) - return 0; - if (mask & FS_ISDIR) - return 0; - if (file_name->len == 13 && - !memcmp(file_name->name, "packages.list", 13)) { - pr_info("packages.list detected: %d\n", mask); - track_throne(false); - } - return 0; -} - -static const struct fsnotify_ops ksu_ops = { - .handle_inode_event = 
ksu_handle_inode_event, -}; - -static int add_mark_on_inode(struct inode *inode, u32 mask, - struct fsnotify_mark **out) -{ - struct fsnotify_mark *m; - - m = kzalloc(sizeof(*m), GFP_KERNEL); - if (!m) - return -ENOMEM; - - fsnotify_init_mark(m, g); - m->mask = mask; - - if (fsnotify_add_inode_mark(m, inode, 0)) { - fsnotify_put_mark(m); - return -EINVAL; - } - *out = m; - return 0; -} - -static int watch_one_dir(struct watch_dir *wd) -{ - int ret = kern_path(wd->path, LOOKUP_FOLLOW, &wd->kpath); - if (ret) { - pr_info("path not ready: %s (%d)\n", wd->path, ret); - return ret; - } - wd->inode = d_inode(wd->kpath.dentry); - ihold(wd->inode); - - ret = add_mark_on_inode(wd->inode, wd->mask, &wd->mark); - if (ret) { - pr_err("Add mark failed for %s (%d)\n", wd->path, ret); - path_put(&wd->kpath); - iput(wd->inode); - wd->inode = NULL; - return ret; - } - pr_info("watching %s\n", wd->path); - return 0; -} - -static void unwatch_one_dir(struct watch_dir *wd) -{ - if (wd->mark) { - fsnotify_destroy_mark(wd->mark, g); - fsnotify_put_mark(wd->mark); - wd->mark = NULL; - } - if (wd->inode) { - iput(wd->inode); - wd->inode = NULL; - } - if (wd->kpath.dentry) { - path_put(&wd->kpath); - memset(&wd->kpath, 0, sizeof(wd->kpath)); - } -} - -static struct watch_dir g_watch = { .path = "/data/system", - .mask = MASK_SYSTEM }; - -int ksu_observer_init(void) -{ - int ret = 0; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 0, 0) - g = fsnotify_alloc_group(&ksu_ops, 0); -#else - g = fsnotify_alloc_group(&ksu_ops); -#endif - if (IS_ERR(g)) - return PTR_ERR(g); - - ret = watch_one_dir(&g_watch); - pr_info("observer init done\n"); - return 0; -} - -void ksu_observer_exit(void) -{ - unwatch_one_dir(&g_watch); - fsnotify_put_group(g); - pr_info("observer exit done\n"); -} diff --git a/drivers/kernelsu/selinux/rules.c b/drivers/kernelsu/selinux/rules.c deleted file mode 100644 index a2b9a7dde728..000000000000 --- a/drivers/kernelsu/selinux/rules.c +++ /dev/null @@ -1,495 +0,0 @@ -#include 
-#include -#include - -#include "../klog.h" // IWYU pragma: keep -#include "selinux.h" -#include "sepolicy.h" -#include "ss/services.h" -#include "linux/lsm_audit.h" // IWYU pragma: keep -#include "xfrm.h" - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) -#define SELINUX_POLICY_INSTEAD_SELINUX_SS -#endif - -#define ALL NULL - -static struct policydb *get_policydb(void) -{ - struct policydb *db; -// selinux_state does not exists before 4.19 -#ifdef KSU_COMPAT_USE_SELINUX_STATE -#ifdef SELINUX_POLICY_INSTEAD_SELINUX_SS - struct selinux_policy *policy = selinux_state.policy; - db = &policy->policydb; -#else - struct selinux_ss *ss = selinux_state.ss; - db = &ss->policydb; -#endif -#else - db = &policydb; -#endif - return db; -} - -static DEFINE_MUTEX(ksu_rules); -void apply_kernelsu_rules(void) -{ - struct policydb *db; - - if (!getenforce()) { - pr_info("SELinux permissive or disabled, apply rules!\n"); - } - - mutex_lock(&ksu_rules); - - db = get_policydb(); - - ksu_permissive(db, KERNEL_SU_DOMAIN); - ksu_typeattribute(db, KERNEL_SU_DOMAIN, "mlstrustedsubject"); - ksu_typeattribute(db, KERNEL_SU_DOMAIN, "netdomain"); - ksu_typeattribute(db, KERNEL_SU_DOMAIN, "bluetoothdomain"); - - // Create unconstrained file type - ksu_type(db, KERNEL_SU_FILE, "file_type"); - ksu_typeattribute(db, KERNEL_SU_FILE, "mlstrustedobject"); - ksu_allow(db, ALL, KERNEL_SU_FILE, ALL, ALL); - - // allow all! 
- ksu_allow(db, KERNEL_SU_DOMAIN, ALL, ALL, ALL); - - // allow us do any ioctl - if (db->policyvers >= POLICYDB_VERSION_XPERMS_IOCTL) { - ksu_allowxperm(db, KERNEL_SU_DOMAIN, ALL, "blk_file", ALL); - ksu_allowxperm(db, KERNEL_SU_DOMAIN, ALL, "fifo_file", ALL); - ksu_allowxperm(db, KERNEL_SU_DOMAIN, ALL, "chr_file", ALL); - ksu_allowxperm(db, KERNEL_SU_DOMAIN, ALL, "file", ALL); - } - - // our ksud triggered by init - ksu_allow(db, "init", KERNEL_SU_DOMAIN, ALL, ALL); -#ifdef CONFIG_KSU_MANUAL_HOOK - ksu_allow(db, "init", "adb_data_file", "file", ALL); - ksu_allow(db, "init", "adb_data_file", "dir", ALL); // #1289 -#endif - - // copied from Magisk rules - // suRights - ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "dir", "search"); - ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "dir", "read"); - ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "open"); - ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "read"); - ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "process", "getattr"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "process", "sigchld"); - - // allowLog - ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "dir", "search"); - ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "read"); - ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "open"); - ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "getattr"); - - // dumpsys - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fd", "use"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "write"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "read"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "open"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "getattr"); - - // bootctl - ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "dir", "search"); - ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "file", "read"); - ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "file", "open"); - ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "process", - "getattr"); - - // Allow all binder 
transactions - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "binder", ALL); - - // Allow system server kill su process - ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "getpgid"); - ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "sigkill"); - - mutex_unlock(&ksu_rules); -} - -#define MAX_SEPOL_LEN 128 - -#define CMD_NORMAL_PERM 1 -#define CMD_XPERM 2 -#define CMD_TYPE_STATE 3 -#define CMD_TYPE 4 -#define CMD_TYPE_ATTR 5 -#define CMD_ATTR 6 -#define CMD_TYPE_TRANSITION 7 -#define CMD_TYPE_CHANGE 8 -#define CMD_GENFSCON 9 - -struct sepol_data { - u32 cmd; - u32 subcmd; - u64 sepol1; - u64 sepol2; - u64 sepol3; - u64 sepol4; - u64 sepol5; - u64 sepol6; - u64 sepol7; -}; - -static int get_object(char *buf, char __user *user_object, size_t buf_sz, - char **object) -{ - if (!user_object) { - *object = ALL; - return 0; - } - - if (strncpy_from_user(buf, user_object, buf_sz) < 0) { - return -EINVAL; - } - - *object = buf; - - return 0; -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) || \ - !defined(KSU_COMPAT_USE_SELINUX_STATE) -extern int avc_ss_reset(u32 seqno); -#else -extern int avc_ss_reset(struct selinux_avc *avc, u32 seqno); -#endif -// reset avc cache table, otherwise the new rules will not take effect if already denied -static void reset_avc_cache(void) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) || \ - !defined(KSU_COMPAT_USE_SELINUX_STATE) - avc_ss_reset(0); - selnl_notify_policyload(0); - selinux_status_update_policyload(0); -#else - struct selinux_avc *avc = selinux_state.avc; - avc_ss_reset(avc, 0); - selnl_notify_policyload(0); - selinux_status_update_policyload(&selinux_state, 0); -#endif - selinux_xfrm_notify_policyload(); -} - -int handle_sepolicy(unsigned long arg3, void __user *arg4) -{ - struct policydb *db; - - if (!arg4) { - return -EINVAL; - } - - if (!getenforce()) { - pr_info("SELinux permissive or disabled when handle policy!\n"); - } - - struct sepol_data data; - if (copy_from_user(&data, arg4, sizeof(struct 
sepol_data))) { - pr_err("sepol: copy sepol_data failed.\n"); - return -EINVAL; - } - - u32 cmd = data.cmd; - u32 subcmd = data.subcmd; - - mutex_lock(&ksu_rules); - - db = get_policydb(); - - int ret = -EINVAL; - switch (cmd) { - case CMD_NORMAL_PERM: { - char src_buf[MAX_SEPOL_LEN]; - char tgt_buf[MAX_SEPOL_LEN]; - char cls_buf[MAX_SEPOL_LEN]; - char perm_buf[MAX_SEPOL_LEN]; - - char *s, *t, *c, *p; - if (get_object(src_buf, (void __user *)data.sepol1, - sizeof(src_buf), &s) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - - if (get_object(tgt_buf, (void __user *)data.sepol2, - sizeof(tgt_buf), &t) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; - } - - if (get_object(cls_buf, (void __user *)data.sepol3, - sizeof(cls_buf), &c) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; - } - - if (get_object(perm_buf, (void __user *)data.sepol4, - sizeof(perm_buf), &p) < 0) { - pr_err("sepol: copy perm failed.\n"); - goto exit; - } - - bool success = false; - - if (subcmd == 1) { - success = ksu_allow(db, s, t, c, p); - } else if (subcmd == 2) { - success = ksu_deny(db, s, t, c, p); - } else if (subcmd == 3) { - success = ksu_auditallow(db, s, t, c, p); - } else if (subcmd == 4) { - success = ksu_dontaudit(db, s, t, c, p); - } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); - } - ret = success ? 0 : -EINVAL; - break; - } - case CMD_XPERM: { - char src_buf[MAX_SEPOL_LEN]; - char tgt_buf[MAX_SEPOL_LEN]; - char cls_buf[MAX_SEPOL_LEN]; - - char __maybe_unused - operation[MAX_SEPOL_LEN]; // it is always ioctl now! 
- char perm_set[MAX_SEPOL_LEN]; - - char *s, *t, *c; - if (get_object(src_buf, (void __user *)data.sepol1, - sizeof(src_buf), &s) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - if (get_object(tgt_buf, (void __user *)data.sepol2, - sizeof(tgt_buf), &t) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; - } - if (get_object(cls_buf, (void __user *)data.sepol3, - sizeof(cls_buf), &c) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; - } - if (strncpy_from_user(operation, (void __user *)data.sepol4, - sizeof(operation)) < 0) { - pr_err("sepol: copy operation failed.\n"); - goto exit; - } - if (strncpy_from_user(perm_set, (void __user *)data.sepol5, - sizeof(perm_set)) < 0) { - pr_err("sepol: copy perm_set failed.\n"); - goto exit; - } - - bool success = false; - if (subcmd == 1) { - success = ksu_allowxperm(db, s, t, c, perm_set); - } else if (subcmd == 2) { - success = ksu_auditallowxperm(db, s, t, c, perm_set); - } else if (subcmd == 3) { - success = ksu_dontauditxperm(db, s, t, c, perm_set); - } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); - } - ret = success ? 
0 : -EINVAL; - break; - } - case CMD_TYPE_STATE: { - char src[MAX_SEPOL_LEN]; - - if (strncpy_from_user(src, (void __user *)data.sepol1, - sizeof(src)) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - - bool success = false; - if (subcmd == 1) { - success = ksu_permissive(db, src); - } else if (subcmd == 2) { - success = ksu_enforce(db, src); - } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); - } - if (success) - ret = 0; - break; - } - case CMD_TYPE: - case CMD_TYPE_ATTR: { - char type[MAX_SEPOL_LEN]; - char attr[MAX_SEPOL_LEN]; - - if (strncpy_from_user(type, (void __user *)data.sepol1, - sizeof(type)) < 0) { - pr_err("sepol: copy type failed.\n"); - goto exit; - } - if (strncpy_from_user(attr, (void __user *)data.sepol2, - sizeof(attr)) < 0) { - pr_err("sepol: copy attr failed.\n"); - goto exit; - } - - bool success = false; - if (cmd == CMD_TYPE) { - success = ksu_type(db, type, attr); - } else { - success = ksu_typeattribute(db, type, attr); - } - if (!success) { - pr_err("sepol: %d failed.\n", cmd); - goto exit; - } - ret = 0; - break; - } - case CMD_ATTR: { - char attr[MAX_SEPOL_LEN]; - - if (strncpy_from_user(attr, (void __user *)data.sepol1, - sizeof(attr)) < 0) { - pr_err("sepol: copy attr failed.\n"); - goto exit; - } - if (!ksu_attribute(db, attr)) { - pr_err("sepol: %d failed.\n", cmd); - goto exit; - } - ret = 0; - break; - } - case CMD_TYPE_TRANSITION: { - char src[MAX_SEPOL_LEN]; - char tgt[MAX_SEPOL_LEN]; - char cls[MAX_SEPOL_LEN]; - char default_type[MAX_SEPOL_LEN]; - char object[MAX_SEPOL_LEN]; - - if (strncpy_from_user(src, (void __user *)data.sepol1, - sizeof(src)) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - if (strncpy_from_user(tgt, (void __user *)data.sepol2, - sizeof(tgt)) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; - } - if (strncpy_from_user(cls, (void __user *)data.sepol3, - sizeof(cls)) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; - } - if 
(strncpy_from_user(default_type, (void __user *)data.sepol4, - sizeof(default_type)) < 0) { - pr_err("sepol: copy default_type failed.\n"); - goto exit; - } - char *real_object; - if ((void __user *)data.sepol5 == NULL) { - real_object = NULL; - } else { - if (strncpy_from_user(object, - (void __user *)data.sepol5, - sizeof(object)) < 0) { - pr_err("sepol: copy object failed.\n"); - goto exit; - } - real_object = object; - } - - bool success = ksu_type_transition(db, src, tgt, cls, - default_type, real_object); - if (success) - ret = 0; - break; - } - case CMD_TYPE_CHANGE: { - char src[MAX_SEPOL_LEN]; - char tgt[MAX_SEPOL_LEN]; - char cls[MAX_SEPOL_LEN]; - char default_type[MAX_SEPOL_LEN]; - - if (strncpy_from_user(src, (void __user *)data.sepol1, - sizeof(src)) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - if (strncpy_from_user(tgt, (void __user *)data.sepol2, - sizeof(tgt)) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; - } - if (strncpy_from_user(cls, (void __user *)data.sepol3, - sizeof(cls)) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; - } - if (strncpy_from_user(default_type, (void __user *)data.sepol4, - sizeof(default_type)) < 0) { - pr_err("sepol: copy default_type failed.\n"); - goto exit; - } - bool success = false; - if (subcmd == 1) { - success = ksu_type_change(db, src, tgt, cls, - default_type); - } else if (subcmd == 2) { - success = ksu_type_member(db, src, tgt, cls, - default_type); - } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); - } - if (success) - ret = 0; - break; - } - case CMD_GENFSCON: { - char name[MAX_SEPOL_LEN]; - char path[MAX_SEPOL_LEN]; - char context[MAX_SEPOL_LEN]; - if (strncpy_from_user(name, (void __user *)data.sepol1, - sizeof(name)) < 0) { - pr_err("sepol: copy name failed.\n"); - goto exit; - } - if (strncpy_from_user(path, (void __user *)data.sepol2, - sizeof(path)) < 0) { - pr_err("sepol: copy path failed.\n"); - goto exit; - } - if (strncpy_from_user(context, (void 
__user *)data.sepol3, - sizeof(context)) < 0) { - pr_err("sepol: copy context failed.\n"); - goto exit; - } - - if (!ksu_genfscon(db, name, path, context)) { - pr_err("sepol: %d failed.\n", cmd); - goto exit; - } - ret = 0; - break; - } - default: { - pr_err("sepol: unknown cmd: %d\n", cmd); - break; - } - } - -exit: - mutex_unlock(&ksu_rules); - - // only allow and xallow needs to reset avc cache, but we cannot do that because - // we are in atomic context. so we just reset it every time. - reset_avc_cache(); - - return ret; -} diff --git a/drivers/kernelsu/selinux/selinux.c b/drivers/kernelsu/selinux/selinux.c deleted file mode 100644 index 010732dffd9b..000000000000 --- a/drivers/kernelsu/selinux/selinux.c +++ /dev/null @@ -1,204 +0,0 @@ -#include "linux/cred.h" -#include "linux/sched.h" -#include "linux/security.h" -#include "linux/version.h" -#include "selinux_defs.h" -#include "../klog.h" // IWYU pragma: keep -#include "../ksu.h" - -/* - * Cached SID values for frequently checked contexts. - * These are resolved once at init and used for fast u32 comparison - * instead of expensive string operations on every check. - * - * A value of 0 means "no cached SID is available" for that context. - * This covers both the initial "not yet cached" state and any case - * where resolving the SID (e.g. via security_secctx_to_secid) failed. - * In all such cases we intentionally fall back to the slower - * string-based comparison path; this degrades performance only and - * does not cause a functional failure. 
- */ -static u32 cached_su_sid __read_mostly = 0; -static u32 cached_zygote_sid __read_mostly = 0; -static u32 cached_init_sid __read_mostly = 0; -u32 ksu_file_sid __read_mostly = 0; - -static int transive_to_domain(const char *domain, struct cred *cred) -{ - taskcred_sec_t *tsec; - u32 sid; - int error; - - tsec = (taskcred_sec_t *)selinux_cred(cred); - if (!tsec) { - pr_err("tsec == NULL!\n"); - return -1; - } - error = security_secctx_to_secid(domain, strlen(domain), &sid); - if (error) { - pr_info("security_secctx_to_secid %s -> sid: %d, error: %d\n", - domain, sid, error); - } - if (!error) { - tsec->sid = sid; - tsec->create_sid = 0; - tsec->keycreate_sid = 0; - tsec->sockcreate_sid = 0; - } - return error; -} - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(4, 19, 0) -bool __maybe_unused -is_ksu_transition(const struct task_security_struct *old_tsec, - const struct task_security_struct *new_tsec) -{ - static u32 ksu_sid; - char *secdata; - int err; - u32 seclen; - bool allowed = false; - - if (!ksu_sid) { - err = security_secctx_to_secid( - KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &ksu_sid); - pr_err("failed to get ksu_sid: %d\n", err); - } - - if (security_secid_to_secctx(old_tsec->sid, &secdata, &seclen)) - return false; - - allowed = (!strcmp("u:r:init:s0", secdata) && new_tsec->sid == ksu_sid); - security_release_secctx(secdata, seclen); - return allowed; -} -#endif - -void setup_selinux(const char *domain) -{ - if (transive_to_domain(domain, (struct cred *)__task_cred(current))) { - pr_err("transive domain failed.\n"); - return; - } -} - -void setup_ksu_cred(void) -{ - if (ksu_cred && transive_to_domain(KERNEL_SU_CONTEXT, ksu_cred)) { - pr_err("setup ksu cred failed.\n"); - } -} - -void setenforce(bool enforce) -{ - do_setenforce(enforce); -} - -bool getenforce(void) -{ - if (is_selinux_disabled()) { - return false; - } - - return is_selinux_enforcing(); -} - -/* - * Initialize cached SID values for frequently checked SELinux contexts. 
- * Called once after SELinux policy is loaded (post-fs-data). - * This eliminates expensive string comparisons in hot paths. - */ -void cache_sid(void) -{ - int err; - - err = security_secctx_to_secid( - KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &cached_su_sid); - if (err) { - pr_warn("Failed to cache kernel su domain SID: %d\n", err); - cached_su_sid = 0; - } else { - pr_info("Cached su SID: %u\n", cached_su_sid); - } - - err = security_secctx_to_secid(ZYGOTE_CONTEXT, strlen(ZYGOTE_CONTEXT), - &cached_zygote_sid); - if (err) { - pr_warn("Failed to cache zygote SID: %d\n", err); - cached_zygote_sid = 0; - } else { - pr_info("Cached zygote SID: %u\n", cached_zygote_sid); - } - - err = security_secctx_to_secid(INIT_CONTEXT, strlen(INIT_CONTEXT), - &cached_init_sid); - if (err) { - pr_warn("Failed to cache init SID: %d\n", err); - cached_init_sid = 0; - } else { - pr_info("Cached init SID: %u\n", cached_init_sid); - } - - err = security_secctx_to_secid(KSU_FILE_CONTEXT, - strlen(KSU_FILE_CONTEXT), &ksu_file_sid); - if (err) { - pr_warn("Failed to cache ksu_file SID: %d\n", err); - ksu_file_sid = 0; - } else { - pr_info("Cached ksu_file SID: %u\n", ksu_file_sid); - } -} - -/* - * Fast path: compare task's SID directly against cached value. - * Falls back to string comparison if cache is not initialized. 
- */ -static bool is_sid_match(const struct cred *cred, u32 cached_sid, - const char *fallback_context) -{ - const taskcred_sec_t *tsec; - if (!cred) { - return false; - } - - tsec = (const taskcred_sec_t *)selinux_cred(cred); - if (!tsec) { - return false; - } - - // Fast path: use cached SID if available - if (likely(cached_sid != 0)) { - return tsec->sid == cached_sid; - } - - // Slow path fallback: string comparison (only before cache is initialized) - struct lsm_context ctx = { 0 }; - bool result; - int err = __security_secid_to_secctx(tsec->sid, &ctx); - if (err) { - return false; - } - result = strncmp(fallback_context, ctx.context, ctx.len) == 0; - __security_release_secctx(&ctx); - return result; -} - -bool is_task_ksu_domain(const struct cred *cred) -{ - return is_sid_match(cred, cached_su_sid, KERNEL_SU_CONTEXT); -} - -bool is_ksu_domain(void) -{ - return is_task_ksu_domain(current_cred()); -} - -bool is_zygote(const struct cred *cred) -{ - return is_sid_match(cred, cached_zygote_sid, ZYGOTE_CONTEXT); -} - -bool is_init(const struct cred *cred) -{ - return is_sid_match(cred, cached_init_sid, INIT_CONTEXT); -} diff --git a/drivers/kernelsu/selinux/selinux.h b/drivers/kernelsu/selinux/selinux.h deleted file mode 100644 index cf8c414ee0ea..000000000000 --- a/drivers/kernelsu/selinux/selinux.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef __KSU_H_SELINUX -#define __KSU_H_SELINUX - -#include "linux/types.h" -#include "linux/version.h" -#include "linux/cred.h" - -// TODO: rename to "ksu" -#define KERNEL_SU_DOMAIN "su" -#define KERNEL_SU_FILE "ksu_file" - -#define KERNEL_SU_CONTEXT "u:r:" KERNEL_SU_DOMAIN ":s0" -#define KSU_FILE_CONTEXT "u:object_r:" KERNEL_SU_FILE ":s0" -#define ZYGOTE_CONTEXT "u:r:zygote:s0" -#define INIT_CONTEXT "u:r:init:s0" - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) -#define KSU_COMPAT_USE_SELINUX_STATE -#endif - -void setup_selinux(const char *); - -void setenforce(bool); - -bool getenforce(void); - -void cache_sid(void); - -bool 
is_task_ksu_domain(const struct cred *cred); - -bool is_ksu_domain(void); - -bool is_zygote(const struct cred *cred); - -bool is_init(const struct cred *cred); - -void apply_kernelsu_rules(void); - -int handle_sepolicy(unsigned long arg3, void __user *arg4); - -void setup_ksu_cred(void); - -#endif diff --git a/drivers/kernelsu/selinux/selinux_defs.h b/drivers/kernelsu/selinux/selinux_defs.h deleted file mode 100644 index b8e47e7d77f1..000000000000 --- a/drivers/kernelsu/selinux/selinux_defs.h +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef __KSU_H_SELINUX_DEFS -#define __KSU_H_SELINUX_DEFS - -#include "selinux.h" -#include "objsec.h" -#ifndef KSU_COMPAT_USE_SELINUX_STATE -#include "avc.h" -#endif - -static inline bool is_selinux_disabled(void) -{ -#ifdef CONFIG_SECURITY_SELINUX_DISABLE -#ifdef KSU_COMPAT_USE_SELINUX_STATE - return selinux_state.disabled; -#else - return selinux_disabled; -#endif -#else - return false; -#endif -} - -static inline bool is_selinux_enforcing(void) -{ -#ifdef CONFIG_SECURITY_SELINUX_DEVELOP -#ifdef KSU_COMPAT_USE_SELINUX_STATE - return selinux_state.enforcing; -#else - return selinux_enforcing; -#endif -#else - return true; -#endif -} - -static inline void do_setenforce(bool val) -{ -#ifdef CONFIG_SECURITY_SELINUX_DEVELOP -#ifdef KSU_COMPAT_USE_SELINUX_STATE - selinux_state.enforcing = val; -#else - selinux_enforcing = val; -#endif -#else - /* do nothing */ -#endif -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 18, 0) -typedef struct task_security_struct taskcred_sec_t; -#else -typedef struct cred_security_struct taskcred_sec_t; -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) -static inline taskcred_sec_t *selinux_cred(const struct cred *cred) -{ - return (taskcred_sec_t *)cred->security; -} -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 14, 0) -struct lsm_context { - char *context; - u32 len; -}; - -static inline int __security_secid_to_secctx(u32 secid, struct lsm_context *cp) -{ - return security_secid_to_secctx(secid, 
&cp->context, &cp->len); -} -static inline void __security_release_secctx(struct lsm_context *cp) -{ - security_release_secctx(cp->context, cp->len); -} -#else -#define __security_secid_to_secctx security_secid_to_secctx -#define __security_release_secctx security_release_secctx -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0) -/* - * get the subjective security ID of the current task - */ -static inline u32 current_sid(void) -{ - const taskcred_sec_t *sec = current_security(); - - return sec->sid; -} -#endif - -#endif diff --git a/drivers/kernelsu/selinux/sepolicy.c b/drivers/kernelsu/selinux/sepolicy.c deleted file mode 100644 index 1d3ec397030f..000000000000 --- a/drivers/kernelsu/selinux/sepolicy.c +++ /dev/null @@ -1,1062 +0,0 @@ -#include -#include -#include -#include - -#include "sepolicy.h" -#include "../klog.h" // IWYU pragma: keep -#include "ss/symtab.h" -#include "../kernel_compat.h" // Add check Huawei Device - -#define KSU_SUPPORT_ADD_TYPE - -////////////////////////////////////////////////////// -// Declaration -////////////////////////////////////////////////////// - -static struct avtab_node *get_avtab_node(struct policydb *db, - struct avtab_key *key, - struct avtab_extended_perms *xperms); - -static bool add_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *p, int effect, bool invert); - -static void add_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - struct perm_datum *perm, int effect, bool invert); - -static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - uint16_t low, uint16_t high, int effect, - bool invert); -static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *range, int effect, - bool invert); - -static bool add_type_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *d, int 
effect); - -static bool add_filename_trans(struct policydb *db, const char *s, - const char *t, const char *c, const char *d, - const char *o); - -static bool add_genfscon(struct policydb *db, const char *fs_name, - const char *path, const char *context); - -static bool add_type(struct policydb *db, const char *type_name, bool attr); - -static bool set_type_state(struct policydb *db, const char *type_name, - bool permissive); - -static void add_typeattribute_raw(struct policydb *db, struct type_datum *type, - struct type_datum *attr); - -static bool add_typeattribute(struct policydb *db, const char *type, - const char *attr); - -////////////////////////////////////////////////////// -// Implementation -////////////////////////////////////////////////////// - -// Invert is adding rules for auditdeny; in other cases, invert is removing -// rules -#define strip_av(effect, invert) ((effect == AVTAB_AUDITDENY) == !invert) - -#define ksu_hash_for_each(node_ptr, n_slot, cur) \ - int i; \ - for (i = 0; i < n_slot; ++i) \ - for (cur = node_ptr[i]; cur; cur = cur->next) - -// htable is a struct instead of pointer above 5.8.0: -// https://elixir.bootlin.com/linux/v5.8-rc1/source/security/selinux/ss/symtab.h -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) -#define ksu_hashtab_for_each(htab, cur) \ - ksu_hash_for_each(htab.htable, htab.size, cur) -#else -#define ksu_hashtab_for_each(htab, cur) \ - ksu_hash_for_each(htab->htable, htab->size, cur) -#endif - -// symtab_search is introduced on 5.9.0: -// https://elixir.bootlin.com/linux/v5.9-rc1/source/security/selinux/ss/symtab.h -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) -#define symtab_search(s, name) hashtab_search((s)->table, name) -#define symtab_insert(s, name, datum) hashtab_insert((s)->table, name, datum) -#endif - -#define avtab_for_each(avtab, cur) \ - ksu_hash_for_each(avtab.htable, avtab.nslot, cur); - -static struct avtab_node *get_avtab_node(struct policydb *db, - struct avtab_key *key, - struct 
avtab_extended_perms *xperms) -{ - struct avtab_node *node; - - /* AVTAB_XPERMS entries are not necessarily unique */ - if (key->specified & AVTAB_XPERMS) { - bool match = false; - node = avtab_search_node(&db->te_avtab, key); - while (node) { - if ((node->datum.u.xperms->specified == - xperms->specified) && - (node->datum.u.xperms->driver == xperms->driver)) { - match = true; - break; - } - node = avtab_search_node_next(node, key->specified); - } - if (!match) - node = NULL; - } else { - node = avtab_search_node(&db->te_avtab, key); - } - - if (!node) { - struct avtab_datum avdatum = {}; - /* - * AUDITDENY, aka DONTAUDIT, are &= assigned, versus |= for - * others. Initialize the data accordingly. - */ - if (key->specified & AVTAB_XPERMS) { - avdatum.u.xperms = xperms; - } else { - avdatum.u.data = - key->specified == AVTAB_AUDITDENY ? ~0U : 0U; - } - /* this is used to get the node - insertion is actually unique */ - node = avtab_insert_nonunique(&db->te_avtab, key, &avdatum); - - int grow_size = sizeof(struct avtab_key); - grow_size += sizeof(struct avtab_datum); - if (key->specified & AVTAB_XPERMS) { - grow_size += sizeof(u8); - grow_size += sizeof(u8); - grow_size += sizeof(u32) * - ARRAY_SIZE(avdatum.u.xperms->perms.p); - } - db->len += grow_size; - } - - return node; -} - -static bool add_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *p, int effect, bool invert) -{ - struct type_datum *src = NULL, *tgt = NULL; - struct class_datum *cls = NULL; - struct perm_datum *perm = NULL; - - if (s) { - src = symtab_search(&db->p_types, s); - if (src == NULL) { - pr_info("source type %s does not exist\n", s); - return false; - } - } - - if (t) { - tgt = symtab_search(&db->p_types, t); - if (tgt == NULL) { - pr_info("target type %s does not exist\n", t); - return false; - } - } - - if (c) { - cls = symtab_search(&db->p_classes, c); - if (cls == NULL) { - pr_info("class %s does not exist\n", c); - return false; - } - } - - if (p) { - 
if (c == NULL) { - pr_info("No class is specified, cannot add perm [%s] \n", - p); - return false; - } - - perm = symtab_search(&cls->permissions, p); - if (perm == NULL && cls->comdatum != NULL) { - perm = symtab_search(&cls->comdatum->permissions, p); - } - if (perm == NULL) { - pr_info("perm %s does not exist in class %s\n", p, c); - return false; - } - } - add_rule_raw(db, src, tgt, cls, perm, effect, invert); - return true; -} - -static void add_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - struct perm_datum *perm, int effect, bool invert) -{ - if (src == NULL) { - struct hashtab_node *node; - if (strip_av(effect, invert)) { - ksu_hashtab_for_each(db->p_types.table, node) - { - add_rule_raw(db, - (struct type_datum *)node->datum, - tgt, cls, perm, effect, invert); - }; - } else { - ksu_hashtab_for_each(db->p_types.table, node) - { - struct type_datum *type = - (struct type_datum *)(node->datum); - if (type->attribute) { - add_rule_raw(db, type, tgt, cls, perm, - effect, invert); - } - }; - } - } else if (tgt == NULL) { - struct hashtab_node *node; - if (strip_av(effect, invert)) { - ksu_hashtab_for_each(db->p_types.table, node) - { - add_rule_raw(db, src, - (struct type_datum *)node->datum, - cls, perm, effect, invert); - }; - } else { - ksu_hashtab_for_each(db->p_types.table, node) - { - struct type_datum *type = - (struct type_datum *)(node->datum); - if (type->attribute) { - add_rule_raw(db, src, type, cls, perm, - effect, invert); - } - }; - } - } else if (cls == NULL) { - struct hashtab_node *node; - ksu_hashtab_for_each(db->p_classes.table, node) - { - add_rule_raw(db, src, tgt, - (struct class_datum *)node->datum, perm, - effect, invert); - } - } else { - struct avtab_key key; - key.source_type = src->value; - key.target_type = tgt->value; - key.target_class = cls->value; - key.specified = effect; - - struct avtab_node *node = get_avtab_node(db, &key, NULL); - if (invert) { - if (perm) - 
node->datum.u.data &= - ~(1U << (perm->value - 1)); - else - node->datum.u.data = 0U; - } else { - if (perm) - node->datum.u.data |= 1U << (perm->value - 1); - else - node->datum.u.data = ~0U; - } - } -} - -#define ioctl_driver(x) (x >> 8 & 0xFF) -#define ioctl_func(x) (x & 0xFF) - -#define xperm_test(x, p) (1 & (p[x >> 5] >> (x & 0x1f))) -#define xperm_set(x, p) (p[x >> 5] |= (1 << (x & 0x1f))) -#define xperm_clear(x, p) (p[x >> 5] &= ~(1 << (x & 0x1f))) - -static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - uint16_t low, uint16_t high, int effect, - bool invert) -{ - if (src == NULL) { - struct hashtab_node *node; - ksu_hashtab_for_each(db->p_types.table, node) - { - struct type_datum *type = - (struct type_datum *)(node->datum); - if (type->attribute) { - add_xperm_rule_raw(db, type, tgt, cls, low, - high, effect, invert); - } - }; - } else if (tgt == NULL) { - struct hashtab_node *node; - ksu_hashtab_for_each(db->p_types.table, node) - { - struct type_datum *type = - (struct type_datum *)(node->datum); - if (type->attribute) { - add_xperm_rule_raw(db, src, type, cls, low, - high, effect, invert); - } - }; - } else if (cls == NULL) { - struct hashtab_node *node; - ksu_hashtab_for_each(db->p_classes.table, node) - { - add_xperm_rule_raw(db, src, tgt, - (struct class_datum *)(node->datum), - low, high, effect, invert); - }; - } else { - struct avtab_key key; - key.source_type = src->value; - key.target_type = tgt->value; - key.target_class = cls->value; - key.specified = effect; - - struct avtab_datum *datum; - struct avtab_node *node; - struct avtab_extended_perms xperms; - - memset(&xperms, 0, sizeof(xperms)); - if (ioctl_driver(low) != ioctl_driver(high)) { - xperms.specified = AVTAB_XPERMS_IOCTLDRIVER; - xperms.driver = 0; - } else { - xperms.specified = AVTAB_XPERMS_IOCTLFUNCTION; - xperms.driver = ioctl_driver(low); - } - int i; - if (xperms.specified == AVTAB_XPERMS_IOCTLDRIVER) { 
- for (i = ioctl_driver(low); i <= ioctl_driver(high); - ++i) { - if (invert) - xperm_clear(i, xperms.perms.p); - else - xperm_set(i, xperms.perms.p); - } - } else { - for (i = ioctl_func(low); i <= ioctl_func(high); ++i) { - if (invert) - xperm_clear(i, xperms.perms.p); - else - xperm_set(i, xperms.perms.p); - } - } - - node = get_avtab_node(db, &key, &xperms); - if (!node) { - pr_warn("add_xperm_rule_raw cannot found node!\n"); - return; - } - datum = &node->datum; - - if (datum->u.xperms == NULL) { - datum->u.xperms = - (struct avtab_extended_perms *)(kzalloc( - sizeof(xperms), GFP_ATOMIC)); - if (!datum->u.xperms) { - pr_err("alloc xperms failed\n"); - return; - } - memcpy(datum->u.xperms, &xperms, sizeof(xperms)); - } - } -} - -static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *range, int effect, - bool invert) -{ - struct type_datum *src = NULL, *tgt = NULL; - struct class_datum *cls = NULL; - - if (s) { - src = symtab_search(&db->p_types, s); - if (src == NULL) { - pr_info("source type %s does not exist\n", s); - return false; - } - } - - if (t) { - tgt = symtab_search(&db->p_types, t); - if (tgt == NULL) { - pr_info("target type %s does not exist\n", t); - return false; - } - } - - if (c) { - cls = symtab_search(&db->p_classes, c); - if (cls == NULL) { - pr_info("class %s does not exist\n", c); - return false; - } - } - - u16 low, high; - - if (range) { - if (strchr(range, '-')) { - sscanf(range, "%hx-%hx", &low, &high); - } else { - sscanf(range, "%hx", &low); - high = low; - } - } else { - low = 0; - high = 0xFFFF; - } - - add_xperm_rule_raw(db, src, tgt, cls, low, high, effect, invert); - return true; -} - -static bool add_type_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *d, int effect) -{ - struct type_datum *src, *tgt, *def; - struct class_datum *cls; - - src = symtab_search(&db->p_types, s); - if (src == NULL) { - pr_info("source type %s does not exist\n", 
s); - return false; - } - tgt = symtab_search(&db->p_types, t); - if (tgt == NULL) { - pr_info("target type %s does not exist\n", t); - return false; - } - cls = symtab_search(&db->p_classes, c); - if (cls == NULL) { - pr_info("class %s does not exist\n", c); - return false; - } - def = symtab_search(&db->p_types, d); - if (def == NULL) { - pr_info("default type %s does not exist\n", d); - return false; - } - - struct avtab_key key; - key.source_type = src->value; - key.target_type = tgt->value; - key.target_class = cls->value; - key.specified = effect; - - struct avtab_node *node = get_avtab_node(db, &key, NULL); - node->datum.u.data = def->value; - - return true; -} - -// 5.9.0 : static inline int hashtab_insert(struct hashtab *h, void *key, void -// *datum, struct hashtab_key_params key_params) 5.8.0: int -// hashtab_insert(struct hashtab *h, void *k, void *d); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) -static u32 filenametr_hash(const void *k) -{ - const struct filename_trans_key *ft = k; - unsigned long hash; - unsigned int byte_num; - unsigned char focus; - - hash = ft->ttype ^ ft->tclass; - - byte_num = 0; - while ((focus = ft->name[byte_num++])) - hash = partial_name_hash(focus, hash); - return hash; -} - -static int filenametr_cmp(const void *k1, const void *k2) -{ - const struct filename_trans_key *ft1 = k1; - const struct filename_trans_key *ft2 = k2; - int v; - - v = ft1->ttype - ft2->ttype; - if (v) - return v; - - v = ft1->tclass - ft2->tclass; - if (v) - return v; - - return strcmp(ft1->name, ft2->name); -} - -static const struct hashtab_key_params filenametr_key_params = { - .hash = filenametr_hash, - .cmp = filenametr_cmp, -}; -#endif - -static bool add_filename_trans(struct policydb *db, const char *s, - const char *t, const char *c, const char *d, - const char *o) -{ - struct type_datum *src, *tgt, *def; - struct class_datum *cls; - - src = symtab_search(&db->p_types, s); - if (src == NULL) { - pr_warn("source type %s does not exist\n", 
s); - return false; - } - tgt = symtab_search(&db->p_types, t); - if (tgt == NULL) { - pr_warn("target type %s does not exist\n", t); - return false; - } - cls = symtab_search(&db->p_classes, c); - if (cls == NULL) { - pr_warn("class %s does not exist\n", c); - return false; - } - def = symtab_search(&db->p_types, d); - if (def == NULL) { - pr_warn("default type %s does not exist\n", d); - return false; - } - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) - struct filename_trans_key key; - key.ttype = tgt->value; - key.tclass = cls->value; - key.name = (char *)o; - - struct filename_trans_datum *last = NULL; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) - struct filename_trans_datum *trans = - policydb_filenametr_search(db, &key); -#else - struct filename_trans_datum *trans = - hashtab_search(&db->filename_trans, &key); -#endif - while (trans) { - if (ebitmap_get_bit(&trans->stypes, src->value - 1)) { - // Duplicate, overwrite existing data and return - trans->otype = def->value; - return true; - } - if (trans->otype == def->value) - break; - last = trans; - trans = trans->next; - } - - if (trans == NULL) { - trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans), - GFP_ATOMIC); - struct filename_trans_key *new_key = - (struct filename_trans_key *)kzalloc(sizeof(*new_key), - GFP_ATOMIC); - *new_key = key; - new_key->name = kstrdup(key.name, GFP_ATOMIC); - trans->next = last; - trans->otype = def->value; - hashtab_insert(&db->filename_trans, new_key, trans, - filenametr_key_params); - } - - db->compat_filename_trans_count++; - return ebitmap_set_bit(&trans->stypes, src->value - 1, 1) == 0; -#else // < 5.7.0, has no filename_trans_key, but struct filename_trans - - struct filename_trans key; - key.ttype = tgt->value; - key.tclass = cls->value; - key.name = (char *)o; - - struct filename_trans_datum *trans = - hashtab_search(db->filename_trans, &key); - - if (trans == NULL) { - trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans), - 
GFP_ATOMIC); - if (!trans) { - pr_err("add_filename_trans: Failed to alloc datum\n"); - return false; - } - struct filename_trans *new_key = - (struct filename_trans *)kzalloc(sizeof(*new_key), - GFP_ATOMIC); - if (!new_key) { - pr_err("add_filename_trans: Failed to alloc new_key\n"); - return false; - } - *new_key = key; - new_key->name = kstrdup(key.name, GFP_ATOMIC); - trans->otype = def->value; - hashtab_insert(db->filename_trans, new_key, trans); - } - - return ebitmap_set_bit(&db->filename_trans_ttypes, src->value - 1, 1) == - 0; -#endif -} - -static bool add_genfscon(struct policydb *db, const char *fs_name, - const char *path, const char *context) -{ - return false; -} - -// https://github.com/torvalds/linux/commit/590b9d576caec6b4c46bba49ed36223a399c3fc5#diff-cc9aa90e094e6e0f47bd7300db4f33cf4366b98b55d8753744f31eb69c691016R844-R845 -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) -#define ksu_kvrealloc(p, new_size, _old_size) kvrealloc(p, new_size, GFP_ATOMIC) -#else -#define ksu_kvrealloc(p, new_size, old_size) \ - ksu_compat_kvrealloc(p, old_size, new_size, GFP_ATOMIC) -#endif - -static bool add_type(struct policydb *db, const char *type_name, bool attr) -{ -#ifdef KSU_SUPPORT_ADD_TYPE - struct type_datum *type = symtab_search(&db->p_types, type_name); - if (type) { - pr_warn("Type %s already exists\n", type_name); - return true; - } - - u32 value = ++db->p_types.nprim; - type = (struct type_datum *)kzalloc(sizeof(struct type_datum), - GFP_ATOMIC); - if (!type) { - pr_err("add_type: alloc type_datum failed.\n"); - return false; - } - - type->primary = 1; - type->value = value; - type->attribute = attr; - - char *key = kstrdup(type_name, GFP_ATOMIC); - if (!key) { - pr_err("add_type: alloc key failed.\n"); - return false; - } - - if (symtab_insert(&db->p_types, key, type)) { - pr_err("add_type: insert symtab failed.\n"); - return false; - } - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) - struct ebitmap *new_type_attr_map_array = - 
ksu_kvrealloc(db->type_attr_map_array, - value * sizeof(struct ebitmap), - (value - 1) * sizeof(struct ebitmap)); - - if (!new_type_attr_map_array) { - pr_err("add_type: alloc type_attr_map_array failed\n"); - return false; - } - - struct type_datum **new_type_val_to_struct = - ksu_kvrealloc(db->type_val_to_struct, - sizeof(*db->type_val_to_struct) * value, - sizeof(*db->type_val_to_struct) * (value - 1)); - - if (!new_type_val_to_struct) { - pr_err("add_type: alloc type_val_to_struct failed\n"); - return false; - } - - char **new_val_to_name_types = - ksu_kvrealloc(db->sym_val_to_name[SYM_TYPES], - sizeof(char *) * value, - sizeof(char *) * (value - 1)); - if (!new_val_to_name_types) { - pr_err("add_type: alloc val_to_name failed\n"); - return false; - } - - db->type_attr_map_array = new_type_attr_map_array; - ebitmap_init(&db->type_attr_map_array[value - 1]); - ebitmap_set_bit(&db->type_attr_map_array[value - 1], value - 1, 1); - - db->type_val_to_struct = new_type_val_to_struct; - db->type_val_to_struct[value - 1] = type; - - db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types; - db->sym_val_to_name[SYM_TYPES][value - 1] = key; - - int i; - for (i = 0; i < db->p_roles.nprim; ++i) { - ebitmap_set_bit(&db->role_val_to_struct[i]->types, value - 1, - 1); - } - - return true; -#elif defined(CONFIG_IS_HW_HISI) - /* - * Huawei use type_attr_map and type_val_to_struct. - * And use ebitmap not flex_array. 
- */ - size_t new_size = sizeof(struct ebitmap) * db->p_types.nprim; - struct ebitmap *new_type_attr_map = - (krealloc(db->type_attr_map, new_size, GFP_ATOMIC)); - - struct type_datum **new_type_val_to_struct = - krealloc(db->type_val_to_struct, - sizeof(*db->type_val_to_struct) * db->p_types.nprim, - GFP_ATOMIC); - - if (!new_type_attr_map) { - pr_err("add_type: alloc type_attr_map failed\n"); - return false; - } - - if (!new_type_val_to_struct) { - pr_err("add_type: alloc type_val_to_struct failed\n"); - return false; - } - - char **new_val_to_name_types = - krealloc(db->sym_val_to_name[SYM_TYPES], - sizeof(char *) * db->symtab[SYM_TYPES].nprim, - GFP_KERNEL); - if (!new_val_to_name_types) { - pr_err("add_type: alloc val_to_name failed\n"); - return false; - } - - db->type_attr_map = new_type_attr_map; - ebitmap_init(&db->type_attr_map[value - 1], HISI_SELINUX_EBITMAP_RO); - ebitmap_set_bit(&db->type_attr_map[value - 1], value - 1, 1); - - db->type_val_to_struct = new_type_val_to_struct; - db->type_val_to_struct[value - 1] = type; - - db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types; - db->sym_val_to_name[SYM_TYPES][value - 1] = key; - - int i; - for (i = 0; i < db->p_roles.nprim; ++i) { - ebitmap_set_bit(&db->role_val_to_struct[i]->types, value - 1, - 1); - } - - return true; -#else - // flex_array is not extensible, we need to create a new bigger one instead - struct flex_array *new_type_attr_map_array = - flex_array_alloc(sizeof(struct ebitmap), db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO); - - struct flex_array *new_type_val_to_struct = - flex_array_alloc(sizeof(struct type_datum *), db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO); - - struct flex_array *new_val_to_name_types = - flex_array_alloc(sizeof(char *), db->symtab[SYM_TYPES].nprim, - GFP_ATOMIC | __GFP_ZERO); - - if (!new_type_attr_map_array) { - pr_err("add_type: alloc type_attr_map_array failed\n"); - return false; - } - - if (!new_type_val_to_struct) { - pr_err("add_type: alloc 
type_val_to_struct failed\n"); - return false; - } - - if (!new_val_to_name_types) { - pr_err("add_type: alloc val_to_name failed\n"); - return false; - } - - // preallocate so we don't have to worry about the put ever failing - if (flex_array_prealloc(new_type_attr_map_array, 0, db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO)) { - pr_err("add_type: prealloc type_attr_map_array failed\n"); - return false; - } - - if (flex_array_prealloc(new_type_val_to_struct, 0, db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO)) { - pr_err("add_type: prealloc type_val_to_struct_array failed\n"); - return false; - } - - if (flex_array_prealloc(new_val_to_name_types, 0, - db->symtab[SYM_TYPES].nprim, - GFP_ATOMIC | __GFP_ZERO)) { - pr_err("add_type: prealloc val_to_name_types failed\n"); - return false; - } - - int j; - void *old_elem; - // copy the old data or pointers to new flex arrays - for (j = 0; j < db->type_attr_map_array->total_nr_elements; j++) { - old_elem = flex_array_get(db->type_attr_map_array, j); - if (old_elem) - flex_array_put(new_type_attr_map_array, j, old_elem, - GFP_ATOMIC | __GFP_ZERO); - } - - for (j = 0; j < db->type_val_to_struct_array->total_nr_elements; j++) { - old_elem = flex_array_get_ptr(db->type_val_to_struct_array, j); - if (old_elem) - flex_array_put_ptr(new_type_val_to_struct, j, old_elem, - GFP_ATOMIC | __GFP_ZERO); - } - - for (j = 0; j < db->symtab[SYM_TYPES].nprim; j++) { - old_elem = - flex_array_get_ptr(db->sym_val_to_name[SYM_TYPES], j); - if (old_elem) - flex_array_put_ptr(new_val_to_name_types, j, old_elem, - GFP_ATOMIC | __GFP_ZERO); - } - - // store the pointer of old flex arrays first, when assigning new ones we - // should free it - struct flex_array *old_fa; - - old_fa = db->type_attr_map_array; - db->type_attr_map_array = new_type_attr_map_array; - if (old_fa) { - flex_array_free(old_fa); - } - - ebitmap_init(flex_array_get(db->type_attr_map_array, value - 1)); - ebitmap_set_bit(flex_array_get(db->type_attr_map_array, value - 1), - value 
- 1, 1); - - old_fa = db->type_val_to_struct_array; - db->type_val_to_struct_array = new_type_val_to_struct; - if (old_fa) { - flex_array_free(old_fa); - } - flex_array_put_ptr(db->type_val_to_struct_array, value - 1, type, - GFP_ATOMIC | __GFP_ZERO); - - old_fa = db->sym_val_to_name[SYM_TYPES]; - db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types; - if (old_fa) { - flex_array_free(old_fa); - } - flex_array_put_ptr(db->sym_val_to_name[SYM_TYPES], value - 1, key, - GFP_ATOMIC | __GFP_ZERO); - - int i; - for (i = 0; i < db->p_roles.nprim; ++i) { - ebitmap_set_bit(&db->role_val_to_struct[i]->types, value - 1, - 1); - } - return true; -#endif - -#else - return false; -#endif -} - -static bool set_type_state(struct policydb *db, const char *type_name, - bool permissive) -{ - struct type_datum *type; - if (type_name == NULL) { - struct hashtab_node *node; - ksu_hashtab_for_each(db->p_types.table, node) - { - type = (struct type_datum *)(node->datum); - if (ebitmap_set_bit(&db->permissive_map, type->value, - permissive)) - pr_info("Could not set bit in permissive map\n"); - }; - } else { - type = (struct type_datum *)symtab_search(&db->p_types, - type_name); - if (type == NULL) { - pr_info("type %s does not exist\n", type_name); - return false; - } - if (ebitmap_set_bit(&db->permissive_map, type->value, - permissive)) { - pr_info("Could not set bit in permissive map\n"); - return false; - } - } - return true; -} - -static void add_typeattribute_raw(struct policydb *db, struct type_datum *type, - struct type_datum *attr) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) - struct ebitmap *sattr = &db->type_attr_map_array[type->value - 1]; -#elif defined(CONFIG_IS_HW_HISI) - /* - * HISI_SELINUX_EBITMAP_RO is Huawei's unique features. 
- */ - struct ebitmap *sattr = &db->type_attr_map[type->value - 1], - HISI_SELINUX_EBITMAP_RO; -#else - struct ebitmap *sattr = - flex_array_get(db->type_attr_map_array, type->value - 1); -#endif - ebitmap_set_bit(sattr, attr->value - 1, 1); - - struct hashtab_node *node; - struct constraint_node *n; - struct constraint_expr *e; - ksu_hashtab_for_each(db->p_classes.table, node) - { - struct class_datum *cls = (struct class_datum *)(node->datum); - for (n = cls->constraints; n; n = n->next) { - for (e = n->expr; e; e = e->next) { - if (e->expr_type == CEXPR_NAMES && - ebitmap_get_bit(&e->type_names->types, - attr->value - 1)) { - ebitmap_set_bit(&e->names, - type->value - 1, 1); - } - } - } - }; -} - -static bool add_typeattribute(struct policydb *db, const char *type, - const char *attr) -{ - struct type_datum *type_d = symtab_search(&db->p_types, type); - if (type_d == NULL) { - pr_info("type %s does not exist\n", type); - return false; - } else if (type_d->attribute) { - pr_info("type %s is an attribute\n", attr); - return false; - } - - struct type_datum *attr_d = symtab_search(&db->p_types, attr); - if (attr_d == NULL) { - pr_info("attribute %s does not exist\n", type); - return false; - } else if (!attr_d->attribute) { - pr_info("type %s is not an attribute \n", attr); - return false; - } - - add_typeattribute_raw(db, type_d, attr_d); - return true; -} - -////////////////////////////////////////////////////////////////////////// - -// Operation on types -bool ksu_type(struct policydb *db, const char *name, const char *attr) -{ - return add_type(db, name, false) && add_typeattribute(db, name, attr); -} - -bool ksu_attribute(struct policydb *db, const char *name) -{ - return add_type(db, name, true); -} - -bool ksu_permissive(struct policydb *db, const char *type) -{ - return set_type_state(db, type, true); -} - -bool ksu_enforce(struct policydb *db, const char *type) -{ - return set_type_state(db, type, false); -} - -bool ksu_typeattribute(struct policydb *db, 
const char *type, const char *attr) -{ - return add_typeattribute(db, type, attr); -} - -bool ksu_exists(struct policydb *db, const char *type) -{ - return symtab_search(&db->p_types, type) != NULL; -} - -// Access vector rules -bool ksu_allow(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *perm) -{ - return add_rule(db, src, tgt, cls, perm, AVTAB_ALLOWED, false); -} - -bool ksu_deny(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *perm) -{ - return add_rule(db, src, tgt, cls, perm, AVTAB_ALLOWED, true); -} - -bool ksu_auditallow(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *perm) -{ - return add_rule(db, src, tgt, cls, perm, AVTAB_AUDITALLOW, false); -} -bool ksu_dontaudit(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *perm) -{ - return add_rule(db, src, tgt, cls, perm, AVTAB_AUDITDENY, true); -} - -// Extended permissions access vector rules -bool ksu_allowxperm(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *range) -{ - return add_xperm_rule(db, src, tgt, cls, range, AVTAB_XPERMS_ALLOWED, - false); -} - -bool ksu_auditallowxperm(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *range) -{ - return add_xperm_rule(db, src, tgt, cls, range, AVTAB_XPERMS_AUDITALLOW, - false); -} - -bool ksu_dontauditxperm(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *range) -{ - return add_xperm_rule(db, src, tgt, cls, range, AVTAB_XPERMS_DONTAUDIT, - false); -} - -// Type rules -bool ksu_type_transition(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *def, const char *obj) -{ - if (obj) { - return add_filename_trans(db, src, tgt, cls, def, obj); - } else { - return add_type_rule(db, src, tgt, cls, def, AVTAB_TRANSITION); - } -} - -bool ksu_type_change(struct policydb *db, const char 
*src, const char *tgt, - const char *cls, const char *def) -{ - return add_type_rule(db, src, tgt, cls, def, AVTAB_CHANGE); -} - -bool ksu_type_member(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *def) -{ - return add_type_rule(db, src, tgt, cls, def, AVTAB_MEMBER); -} - -// File system labeling -bool ksu_genfscon(struct policydb *db, const char *fs_name, const char *path, - const char *ctx) -{ - return add_genfscon(db, fs_name, path, ctx); -} diff --git a/drivers/kernelsu/selinux/sepolicy.h b/drivers/kernelsu/selinux/sepolicy.h deleted file mode 100644 index 675d1499e46d..000000000000 --- a/drivers/kernelsu/selinux/sepolicy.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef __KSU_H_SEPOLICY -#define __KSU_H_SEPOLICY - -#include - -#include "ss/policydb.h" - -// Operation on types -bool ksu_type(struct policydb *db, const char *name, const char *attr); -bool ksu_attribute(struct policydb *db, const char *name); -bool ksu_permissive(struct policydb *db, const char *type); -bool ksu_enforce(struct policydb *db, const char *type); -bool ksu_typeattribute(struct policydb *db, const char *type, const char *attr); -bool ksu_exists(struct policydb *db, const char *type); - -// Access vector rules -bool ksu_allow(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *perm); -bool ksu_deny(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *perm); -bool ksu_auditallow(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *perm); -bool ksu_dontaudit(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *perm); - -// Extended permissions access vector rules -bool ksu_allowxperm(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *range); -bool ksu_auditallowxperm(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *range); -bool ksu_dontauditxperm(struct 
policydb *db, const char *src, const char *tgt, - const char *cls, const char *range); - -// Type rules -bool ksu_type_transition(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *def, const char *obj); -bool ksu_type_change(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *def); -bool ksu_type_member(struct policydb *db, const char *src, const char *tgt, - const char *cls, const char *def); - -// File system labeling -bool ksu_genfscon(struct policydb *db, const char *fs_name, const char *path, - const char *ctx); - -#endif diff --git a/drivers/kernelsu/setuid_hook.c b/drivers/kernelsu/setuid_hook.c deleted file mode 100644 index c15123101c45..000000000000 --- a/drivers/kernelsu/setuid_hook.c +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include -#endif -#include -#include -#include -#include - -#include "allowlist.h" -#include "setuid_hook.h" -#include "klog.h" // IWYU pragma: keep -#include "manager.h" -#include "selinux/selinux.h" -#include "supercalls.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "syscall_handler.h" -#endif -#include "kernel_umount.h" -#include "kernel_compat.h" - -static void ksu_install_manager_fd_tw_func(struct callback_head *cb) -{ - ksu_install_fd(); - kfree(cb); -} - -static void do_install_manager_fd(void) -{ - struct callback_head *cb = kzalloc(sizeof(*cb), GFP_ATOMIC); - if (!cb) - return; - - cb->func = ksu_install_manager_fd_tw_func; - if (task_work_add(current, cb, TWA_RESUME)) { - kfree(cb); - pr_warn("install manager fd add task_work failed\n"); - } -} - -// force_sig kcompat, TODO: move it out of core_hook.c -// https://elixir.bootlin.com/linux/v5.3-rc1/source/kernel/signal.c#L1613 -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0) -#define send_sig(sig) force_sig(sig) -#else -#define send_sig(sig) force_sig(sig, current) -#endif - -extern 
void disable_seccomp(void); -int ksu_handle_setuid_common(uid_t new_uid, uid_t old_uid, uid_t new_euid) -{ -#ifdef CONFIG_KSU_DEBUG - pr_info("handle_setuid from %d to %d\n", old_uid, new_uid); -#endif - - if (likely(ksu_is_manager_appid_valid()) && - unlikely(ksu_get_manager_appid() == new_uid % PER_USER_RANGE)) { - disable_seccomp(); -#ifdef CONFIG_KSU_SYSCALL_HOOK - ksu_set_task_tracepoint_flag(current); -#endif - pr_info("install fd for manager (uid=%d)\n", new_uid); - do_install_manager_fd(); - return 0; - } - - if (ksu_is_allow_uid_for_current(new_uid)) { - disable_seccomp(); -#ifdef CONFIG_KSU_SYSCALL_HOOK - ksu_set_task_tracepoint_flag(current); - } else { - ksu_clear_task_tracepoint_flag_if_needed(current); -#endif - } - - // Handle kernel umount - ksu_handle_umount(old_uid, new_uid); - - return 0; -} - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) -int ksu_handle_setresuid(uid_t ruid, uid_t euid, uid_t suid) -{ - if (!is_zygote(current_cred())) { -#ifdef CONFIG_KSU_DEBUG - pr_info("setresuid: disallow non zygote sid!\n"); -#endif - return 0; - } - return ksu_handle_setuid_common(ruid, current_uid().val, euid); -} -#endif - -void ksu_setuid_hook_init(void) -{ - ksu_kernel_umount_init(); -} - -void ksu_setuid_hook_exit(void) -{ - pr_info("ksu setuid exit\n"); - ksu_kernel_umount_exit(); -} diff --git a/drivers/kernelsu/setuid_hook.h b/drivers/kernelsu/setuid_hook.h deleted file mode 100644 index 7c4eda71c1c0..000000000000 --- a/drivers/kernelsu/setuid_hook.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __KSU_H_KSU_SETUID_HOOK -#define __KSU_H_KSU_SETUID_HOOK - -#include -#include - -void ksu_setuid_hook_init(void); -void ksu_setuid_hook_exit(void); - -int ksu_handle_setuid_common(uid_t new_uid, uid_t old_uid, uid_t new_euid); - -#endif diff --git a/drivers/kernelsu/shim.c b/drivers/kernelsu/shim.c deleted file mode 100644 index 75d5542a87aa..000000000000 --- a/drivers/kernelsu/shim.c +++ /dev/null @@ -1,36 +0,0 @@ 
-#include -#include -#include - -// unity build idea from backslashxx, not full, we only use it for shim ksu hooks - -#include "allowlist.h" -#include "arch.h" -#include "kp_hook.h" -#include "ksu.h" -#include "klog.h" // IWYU pragma: keep -#include "ksud.h" -#include "kernel_compat.h" -#include "kp_util.h" -#include "supercalls.h" -#include "sucompat.h" -#include "setuid_hook.h" -#include "syscall_handler.h" -#include "selinux/selinux.h" -#include "throne_tracker.h" - -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "pkg_observer.c" -#include "kp_hook.c" -#include "kp_util.c" -#include "syscall_handler.c" -#endif - -#if (defined(CONFIG_KSU_MANUAL_HOOK) && \ - LINUX_VERSION_CODE < KERNEL_VERSION(6, 8, 0)) -#include "lsm_hook.c" -#elif (defined(CONFIG_KSU_MANUAL_HOOK) && \ - LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0)) -// + ksu_handle_setresuid hook for 6.8+ -#include "pkg_observer.c" -#endif diff --git a/drivers/kernelsu/su_mount_ns.c b/drivers/kernelsu/su_mount_ns.c deleted file mode 100644 index 4a0e4a29b103..000000000000 --- a/drivers/kernelsu/su_mount_ns.c +++ /dev/null @@ -1,270 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include -#else -#include -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) -#include -#else -#include -#endif - -#include "klog.h" // IWYU pragma: keep -#include "ksu.h" -#include "kernel_compat.h" -#include "su_mount_ns.h" - -extern int path_mount(const char *dev_name, struct path *path, - const char *type_page, unsigned long flags, - void *data_page); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) - -// RKSU: tiny arch.h, avoid depending on real arch.h -#ifndef __PT_REGS_CAST -#define __PT_REGS_CAST(x) (x) -#endif - -#if defined(__aarch64__) -#define PT_PARM1(x) (__PT_REGS_CAST(x)->regs[0]) -#define PT_PARM2(x) (__PT_REGS_CAST(x)->regs[1]) -extern long 
__arm64_sys_setns(const struct pt_regs *regs); -#define do_sys_setns(regs) (__arm64_sys_setns(regs)) -#elif defined(__x86_64__) -#define PT_PARM1(x) (__PT_REGS_CAST(x)->di) -#define PT_PARM2(x) (__PT_REGS_CAST(x)->si) -extern long __x64_sys_setns(const struct pt_regs *regs); -#define do_sys_setns(regs) (__x64_sys_setns(regs)) -#elif defined(__arm__) // https://syscalls.mebeim.net/?table=arm/32/eabi/latest -// taken from: -// https://github.com/backslashxx/KernelSU/blob/8b71e8bce199e8ac44538648e298092a9b3ef42b/kernel/arch.h#L29 -#define PT_PARM1(x) (__PT_REGS_CAST(x)->uregs[0]) -#define PT_PARM2(x) (__PT_REGS_CAST(x)->uregs[1]) -extern long sys_setns(const struct pt_regs *regs); -#define do_sys_setns(regs) (sys_setns(regs)) -#endif - -static long ksu_sys_setns(int fd, int flags) -{ -#ifdef PT_PARM1 - struct pt_regs regs; - memset(®s, 0, sizeof(regs)); - - PT_PARM1(®s) = fd; - PT_PARM2(®s) = flags; - - return do_sys_setns(®s); -#else - return -ENOSYS; -#endif -} -#else -static long ksu_sys_setns(int fd, int flags) -{ - return sys_setns(fd, flags); -} - -int ksys_unshare(unsigned long unshare_flags) -{ - return sys_unshare(unshare_flags); -} -#endif - -// global mode , need CAP_SYS_ADMIN and CAP_SYS_CHROOT to perform setns -static void ksu_mnt_ns_global(void) -{ - // save current working directory as absolute path before setns - char *pwd_path = NULL; - char *pwd_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!pwd_buf) { - pr_warn("no mem for pwd buffer, skip restore pwd!!\n"); - goto try_setns; - } - - struct path saved_pwd; - get_fs_pwd(current->fs, &saved_pwd); - pwd_path = d_path(&saved_pwd, pwd_buf, PATH_MAX); - path_put(&saved_pwd); - - if (IS_ERR(pwd_path)) { - if (PTR_ERR(pwd_path) == -ENAMETOOLONG) { - pr_warn("absolute pwd longer than: %d, skip restore pwd!!\n", - PATH_MAX); - } else { - pr_warn("get absolute pwd failed: %ld\n", - PTR_ERR(pwd_path)); - } - pwd_path = NULL; - } - -try_setns: - - rcu_read_lock(); - // &init_task is not init, but swapper/idle, 
which forks the init process - // so we need find init process - struct pid *pid_struct = find_pid_ns(1, &init_pid_ns); - if (unlikely(!pid_struct)) { - rcu_read_unlock(); - pr_warn("failed to find pid_struct for PID 1\n"); - goto out; - } - - struct task_struct *pid1_task = get_pid_task(pid_struct, PIDTYPE_PID); - rcu_read_unlock(); - if (unlikely(!pid1_task)) { - pr_warn("failed to get task_struct for PID 1\n"); - goto out; - } - struct path ns_path; - long ret = ns_get_path(&ns_path, pid1_task, &mntns_operations); - put_task_struct(pid1_task); - if (ret) { - pr_warn("failed get path for init mount namespace: %ld\n", ret); - goto out; - } - struct file *ns_file = dentry_open(&ns_path, O_RDONLY, ksu_cred); - - path_put(&ns_path); - if (IS_ERR(ns_file)) { - pr_warn("failed open file for init mount namespace: %ld\n", - PTR_ERR(ns_file)); - goto out; - } - - int fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - pr_warn("failed to get an unused fd: %d\n", fd); - fput(ns_file); - goto out; - } - - fd_install(fd, ns_file); - ret = ksu_sys_setns(fd, CLONE_NEWNS); - - do_close_fd(fd); - - if (ret) { - pr_warn("call setns failed: %ld\n", ret); - goto out; - } - // try to restore working directory using absolute path after setns - if (pwd_path) { - struct path new_pwd; - int err = kern_path(pwd_path, 0, &new_pwd); - if (!err) { - set_fs_pwd(current->fs, &new_pwd); - path_put(&new_pwd); - } else { - pr_warn("restore pwd failed: %d, path: %s\n", err, - pwd_path); - } - } -out: - kfree(pwd_buf); -} - -// individual mode , need CAP_SYS_ADMIN to perform unshare and remount -static void ksu_mnt_ns_individual(void) -{ - long ret = ksys_unshare(CLONE_NEWNS); - if (ret) { - pr_warn("call ksys_unshare failed: %ld\n", ret); - return; - } - - // make root mount private - struct path root_path; - get_fs_root(current->fs, &root_path); - int pm_ret = - path_mount(NULL, &root_path, NULL, MS_PRIVATE | MS_REC, NULL); - path_put(&root_path); - - if (pm_ret < 0) { - pr_err("failed to make 
root private, err: %d\n", pm_ret); - } -} - -#ifdef CONFIG_KSU_SYSCALL_HOOK -struct ksu_mns_tw { - struct callback_head cb; - int32_t ns_mode; -}; - -static void ksu_setup_mount_ns_tw_func(struct callback_head *cb) -{ - struct ksu_mns_tw *tw = container_of(cb, struct ksu_mns_tw, cb); - const struct cred *old_cred = override_creds(ksu_cred); - if (tw->ns_mode == KSU_NS_GLOBAL) { - ksu_mnt_ns_global(); - } else { - ksu_mnt_ns_individual(); - } - revert_creds(old_cred); - kfree(tw); -} - -static void ksu_handle_setup_mount_ns(int32_t ns_mode) -{ - struct ksu_mns_tw *tw = kzalloc(sizeof(*tw), GFP_ATOMIC); - if (!tw) { - pr_err("no mem for tw! skip mnt_ns magic for pid: %d.\n", - current->pid); - return; - } - tw->cb.func = ksu_setup_mount_ns_tw_func; - tw->ns_mode = ns_mode; - if (task_work_add(current, &tw->cb, TWA_RESUME)) { - kfree(tw); - pr_err("add task work failed! skip mnt_ns magic for pid: %d.\n", - current->pid); - } -} -#else -static void ksu_handle_setup_mount_ns(int32_t ns_mode) -{ - const struct cred *old_cred = override_creds(ksu_cred); - if (ns_mode == KSU_NS_GLOBAL) { - ksu_mnt_ns_global(); - } else { - ksu_mnt_ns_individual(); - } - revert_creds(old_cred); -} -#endif - -void setup_mount_ns(int32_t ns_mode) -{ - // inherit mode - if (ns_mode == KSU_NS_INHERITED) { - // do nothing - return; - } - - if (ns_mode != KSU_NS_GLOBAL && ns_mode != KSU_NS_INDIVIDUAL) { - pr_warn("pid: %d ,unknown mount namespace mode: %d\n", - current->pid, ns_mode); - return; - } - - if (!ksu_cred) { - pr_err("no ksu cred! 
skip mnt_ns magic for pid: %d.\n", - current->pid); - return; - } - - ksu_handle_setup_mount_ns(ns_mode); -} diff --git a/drivers/kernelsu/su_mount_ns.h b/drivers/kernelsu/su_mount_ns.h deleted file mode 100644 index f118d8135c12..000000000000 --- a/drivers/kernelsu/su_mount_ns.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef __KSU_SU_MOUNT_NS_H -#define __KSU_SU_MOUNT_NS_H - -#define KSU_NS_INHERITED 0 -#define KSU_NS_GLOBAL 1 -#define KSU_NS_INDIVIDUAL 2 - -void setup_mount_ns(int32_t ns_mode); - -#endif diff --git a/drivers/kernelsu/sucompat.c b/drivers/kernelsu/sucompat.c deleted file mode 100644 index 2bb1a9fba702..000000000000 --- a/drivers/kernelsu/sucompat.c +++ /dev/null @@ -1,217 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) -#include -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -#include -#else -#include -#endif -#include - -#include "allowlist.h" -#include "feature.h" -#include "klog.h" // IWYU pragma: keep -#include "ksud.h" -#include "kernel_compat.h" -#include "sucompat.h" -#include "app_profile.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "kp_util.h" -#endif - -#define SU_PATH "/system/bin/su" -#define SH_PATH "/system/bin/sh" - -bool ksu_su_compat_enabled __read_mostly = true; - -static const char su_path[] = SU_PATH; -static const char ksud_path[] = KSUD_PATH; -static const char sh_path[] = SH_PATH; - -static int su_compat_feature_get(u64 *value) -{ - *value = ksu_su_compat_enabled ? 
1 : 0; - return 0; -} - -static int su_compat_feature_set(u64 value) -{ - bool enable = value != 0; - ksu_su_compat_enabled = enable; - pr_info("su_compat: set to %d\n", enable); - return 0; -} - -static const struct ksu_feature_handler su_compat_handler = { - .feature_id = KSU_FEATURE_SU_COMPAT, - .name = "su_compat", - .get_handler = su_compat_feature_get, - .set_handler = su_compat_feature_set, -}; - -static void __user *userspace_stack_buffer(const void *d, size_t len) -{ - // To avoid having to mmap a page in userspace, just write below the stack - // pointer. - char __user *p = (void __user *)current_user_stack_pointer() - len; - - return copy_to_user(p, d, len) ? NULL : p; -} - -static char __user *sh_user_path(void) -{ - return userspace_stack_buffer(sh_path, sizeof(sh_path)); -} - -static char __user *ksud_user_path(void) -{ - return userspace_stack_buffer(ksud_path, sizeof(ksud_path)); -} - -static inline bool is_su_allowed(void) -{ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (!ksu_su_compat_enabled) - return false; -#endif -#ifdef CONFIG_SECCOMP - if (likely(!!current->seccomp.mode)) - return false; -#endif - if (!ksu_is_allow_uid_for_current(current_uid().val)) - return false; - - return true; -} - -static int ksu_sucompat_user_common(const char __user **filename_user, - const char *syscall_name, - const bool escalate) -{ - char path[sizeof(su_path) + 1]; - - if (unlikely(!filename_user)) - return 0; - if (!is_su_allowed()) - return 0; - - memset(path, 0, sizeof(path)); - ksu_strncpy_from_user_nofault(path, *filename_user, sizeof(path)); - - if (memcmp(path, su_path, sizeof(su_path))) - return 0; - - if (escalate) { - pr_info("%s su found\n", syscall_name); - *filename_user = ksud_user_path(); - escape_with_root_profile(); // escalate !! 
- } else { - pr_info("%s su->sh!\n", syscall_name); - *filename_user = sh_user_path(); - } - - return 0; -} - -#ifdef CONFIG_KSU_SYSCALL_HOOK -static int do_execve_sucompat_for_kp(const char __user **filename_user) -{ - char path[sizeof(su_path) + 1]; - - if (unlikely(!filename_user)) - return 0; - if (!is_su_allowed()) - return 0; - if (!ksu_retry_filename_access(filename_user, path, sizeof(path), true)) - return 0; - if (likely(memcmp(path, su_path, sizeof(su_path)))) - return 0; - - pr_info("sys_execve su found\n"); - *filename_user = ksud_user_path(); - - escape_with_root_profile(); - - return 0; -} -#define handle_execve_sucompat(filename_ptr) \ - (do_execve_sucompat_for_kp(filename_ptr)) -#else -#define handle_execve_sucompat(filename_ptr) \ - (ksu_sucompat_user_common(filename_ptr, "sys_execve", true)) -#endif - -int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, - int *__unused_flags) -{ - return ksu_sucompat_user_common(filename_user, "faccessat", false); -} - -int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags) -{ - return ksu_sucompat_user_common(filename_user, "newfstatat", false); -} - -int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, - void *__never_use_argv, void *__never_use_envp, - int *__never_use_flags) -{ - return handle_execve_sucompat(filename_user); -} - -int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr, - void *__never_use_argv, void *__never_use_envp, - int *__never_use_flags) -{ - struct filename *filename; - - if (unlikely(!filename_ptr)) - return 0; - if (!is_su_allowed()) - return 0; - - filename = *filename_ptr; - if (IS_ERR(filename)) - return 0; - if (likely(memcmp(filename->name, su_path, sizeof(su_path)))) - return 0; - - pr_info("do_execveat_common su found\n"); - memcpy((void *)filename->name, ksud_path, sizeof(ksud_path)); - - escape_with_root_profile(); - - return 0; -} - -int ksu_handle_execveat(int *fd, struct filename 
**filename_ptr, void *argv, - void *envp, int *flags) -{ - return ksu_handle_execveat_ksud(fd, filename_ptr, argv, envp, flags); -} - -// dead code: devpts handling -int __maybe_unused ksu_handle_devpts(struct inode *inode) -{ - return 0; -} - -// sucompat: permitted process can execute 'su' to gain root access. -void ksu_sucompat_init(void) -{ - if (ksu_register_feature_handler(&su_compat_handler)) { - pr_err("Failed to register su_compat feature handler\n"); - } -} - -void ksu_sucompat_exit(void) -{ - ksu_unregister_feature_handler(KSU_FEATURE_SU_COMPAT); -} diff --git a/drivers/kernelsu/sucompat.h b/drivers/kernelsu/sucompat.h deleted file mode 100644 index de4bcfe037fa..000000000000 --- a/drivers/kernelsu/sucompat.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __KSU_H_SUCOMPAT -#define __KSU_H_SUCOMPAT -#include - -extern bool ksu_su_compat_enabled; - -void ksu_sucompat_init(void); -void ksu_sucompat_exit(void); - -// Handler functions exported for hook_manager -int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, - int *__unused_flags); -int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags); -int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, - void *__never_use_argv, void *__never_use_envp, - int *__never_use_flags); -#endif diff --git a/drivers/kernelsu/supercalls.c b/drivers/kernelsu/supercalls.c deleted file mode 100644 index 12c7e284cfd1..000000000000 --- a/drivers/kernelsu/supercalls.c +++ /dev/null @@ -1,847 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include -#else -#include -#endif - -#include "supercalls.h" -#include "arch.h" -#include "allowlist.h" -#include "feature.h" -#include "klog.h" // IWYU pragma: keep -#include "ksu.h" -#include "ksud.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "kp_hook.h" -#include "syscall_handler.h" -#endif -#include 
"kernel_compat.h" -#include "kernel_umount.h" -#include "manager.h" -#include "selinux/selinux.h" -#include "file_wrapper.h" - -// Permission check functions -bool only_manager(void) -{ - return is_manager(); -} - -bool only_root(void) -{ - return current_uid().val == 0; -} - -bool manager_or_root(void) -{ - return current_uid().val == 0 || is_manager(); -} - -bool always_allow(void) -{ - return true; // No permission check -} - -bool allowed_for_su(void) -{ - return is_manager() || ksu_is_allow_uid_for_current(current_uid().val); -} - -static int do_grant_root(void __user *arg) -{ - // we already check uid above on allowed_for_su() - - pr_info("allow root for: %d\n", current_uid().val); - escape_with_root_profile(); - - return 0; -} - -static int do_get_info(void __user *arg) -{ - struct ksu_get_info_cmd cmd = { .version = KERNEL_SU_VERSION, - .flags = 0 }; - -#ifdef MODULE - cmd.flags |= 0x1; -#endif - - if (is_manager()) { - cmd.flags |= 0x2; - } - cmd.features = KSU_FEATURE_MAX; - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_version: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_report_event(void __user *arg) -{ - struct ksu_report_event_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - switch (cmd.event) { - case EVENT_POST_FS_DATA: { - static bool post_fs_data_lock = false; - if (!post_fs_data_lock) { - post_fs_data_lock = true; - pr_info("post-fs-data triggered\n"); - on_post_fs_data(); - } - break; - } - case EVENT_BOOT_COMPLETED: { - static bool boot_complete_lock = false; - if (!boot_complete_lock) { - boot_complete_lock = true; - pr_info("boot_complete triggered\n"); - on_boot_completed(); - } - break; - } - case EVENT_MODULE_MOUNTED: { - pr_info("module mounted!\n"); - on_module_mounted(); - break; - } - default: - break; - } - - return 0; -} - -static int do_set_sepolicy(void __user *arg) -{ - struct ksu_set_sepolicy_cmd cmd; - - if (copy_from_user(&cmd, arg, 
sizeof(cmd))) { - return -EFAULT; - } - - return handle_sepolicy(cmd.cmd, (void __user *)cmd.arg); -} - -static int do_check_safemode(void __user *arg) -{ - struct ksu_check_safemode_cmd cmd; - - cmd.in_safe_mode = ksu_is_safe_mode(); - - if (cmd.in_safe_mode) { - pr_warn("safemode enabled!\n"); - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("check_safemode: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_get_allow_list(void __user *arg) -{ - struct ksu_get_allow_list_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - bool success = - ksu_get_allow_list((int *)cmd.uids, (int *)&cmd.count, true); - - if (!success) { - return -EFAULT; - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_allow_list: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_get_deny_list(void __user *arg) -{ - struct ksu_get_allow_list_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - bool success = - ksu_get_allow_list((int *)cmd.uids, (int *)&cmd.count, false); - - if (!success) { - return -EFAULT; - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_deny_list: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_uid_granted_root(void __user *arg) -{ - struct ksu_uid_granted_root_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - cmd.granted = ksu_is_allow_uid_for_current(cmd.uid); - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("uid_granted_root: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_uid_should_umount(void __user *arg) -{ - struct ksu_uid_should_umount_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - cmd.should_umount = ksu_uid_should_umount(cmd.uid); - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("uid_should_umount: copy_to_user failed\n"); - return -EFAULT; - } 
- - return 0; -} - -static int do_get_manager_appid(void __user *arg) -{ - struct ksu_get_manager_appid_cmd cmd; - - cmd.appid = ksu_get_manager_appid(); - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_manager_appid: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_get_app_profile(void __user *arg) -{ - struct ksu_get_app_profile_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("get_app_profile: copy_from_user failed\n"); - return -EFAULT; - } - - if (!ksu_get_app_profile(&cmd.profile)) { - return -ENOENT; - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_app_profile: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_set_app_profile(void __user *arg) -{ - struct ksu_set_app_profile_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("set_app_profile: copy_from_user failed\n"); - return -EFAULT; - } - - if (!ksu_set_app_profile(&cmd.profile, true)) { - return -EFAULT; - } - - return 0; -} - -static int do_get_feature(void __user *arg) -{ - struct ksu_get_feature_cmd cmd; - bool supported; - int ret; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("get_feature: copy_from_user failed\n"); - return -EFAULT; - } - - ret = ksu_get_feature(cmd.feature_id, &cmd.value, &supported); - cmd.supported = supported ? 
1 : 0; - - if (ret && supported) { - pr_err("get_feature: failed for feature %u: %d\n", - cmd.feature_id, ret); - return ret; - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_feature: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_set_feature(void __user *arg) -{ - struct ksu_set_feature_cmd cmd; - int ret; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("set_feature: copy_from_user failed\n"); - return -EFAULT; - } - - ret = ksu_set_feature(cmd.feature_id, cmd.value); - if (ret) { - pr_err("set_feature: failed for feature %u: %d\n", - cmd.feature_id, ret); - return ret; - } - - return 0; -} - -static int do_get_wrapper_fd(void __user *arg) -{ - if (!ksu_file_sid) { - return -EINVAL; - } - - struct ksu_get_wrapper_fd_cmd cmd; - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("get_wrapper_fd: copy_from_user failed\n"); - return -EFAULT; - } - - return ksu_install_file_wrapper(cmd.fd); -} - -static int do_manage_mark(void __user *arg) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - struct ksu_manage_mark_cmd cmd; - int ret = 0; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("manage_mark: copy_from_user failed\n"); - return -EFAULT; - } - - switch (cmd.operation) { - case KSU_MARK_GET: { - // Get task mark status - ret = ksu_get_task_mark(cmd.pid); - if (ret < 0) { - pr_err("manage_mark: get failed for pid %d: %d\n", - cmd.pid, ret); - return ret; - } - cmd.result = (u32)ret; - break; - } - case KSU_MARK_MARK: { - if (cmd.pid == 0) { - ksu_mark_all_process(); - } else { - ret = ksu_set_task_mark(cmd.pid, true); - if (ret < 0) { - pr_err("manage_mark: set_mark failed for pid %d: %d\n", - cmd.pid, ret); - return ret; - } - } - break; - } - case KSU_MARK_UNMARK: { - if (cmd.pid == 0) { - ksu_unmark_all_process(); - } else { - ret = ksu_set_task_mark(cmd.pid, false); - if (ret < 0) { - pr_err("manage_mark: set_unmark failed for pid %d: %d\n", - cmd.pid, ret); - return ret; - } - } - break; - } - case 
KSU_MARK_REFRESH: { - ksu_mark_running_process(); - pr_info("manage_mark: refreshed running processes\n"); - break; - } - default: { - pr_err("manage_mark: invalid operation %u\n", cmd.operation); - return -EINVAL; - } - } - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("manage_mark: copy_to_user failed\n"); - return -EFAULT; - } - return 0; -#else - // We don't care, just return -ENOTSUPP - pr_warn("manage_mark: this supercalls is not implemented for manual hook.\n"); - return -ENOTSUPP; -#endif -} - -struct list_head mount_list = LIST_HEAD_INIT(mount_list); -DECLARE_RWSEM(mount_list_lock); - -static int add_try_umount(void __user *arg) -{ - struct mount_entry *new_entry, *entry, *tmp; - struct ksu_add_try_umount_cmd cmd; - char buf[256] = { 0 }; - - // When userspace disable kernel_umount, don't do anything. - if (!ksu_kernel_umount_enabled) { - pr_warn("add_try_umount supercall is not available when kernel_umount is disabled!\n"); - return -ENOTSUPP; - } - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - switch (cmd.mode) { - case KSU_UMOUNT_WIPE: { - struct mount_entry *entry, *tmp; - down_write(&mount_list_lock); - list_for_each_entry_safe (entry, tmp, &mount_list, list) { - pr_info("wipe_umount_list: removing entry: %s\n", - entry->umountable); - list_del(&entry->list); - kfree(entry->umountable); - kfree(entry); - } - up_write(&mount_list_lock); - - return 0; - } - - case KSU_UMOUNT_ADD: { - long len = strncpy_from_user(buf, (const char __user *)cmd.arg, - 256); - if (len <= 0) - return -EFAULT; - - buf[sizeof(buf) - 1] = '\0'; - - new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); - if (!new_entry) - return -ENOMEM; - - new_entry->umountable = kstrdup(buf, GFP_KERNEL); - if (!new_entry->umountable) { - kfree(new_entry); - return -1; - } - - down_write(&mount_list_lock); - - // disallow dupes - // if this gets too many, we can consider moving this whole task to a kthread - list_for_each_entry (entry, &mount_list, list) { - 
if (!strcmp(entry->umountable, buf)) { - pr_info("cmd_add_try_umount: %s is already here!\n", - buf); - up_write(&mount_list_lock); - kfree(new_entry->umountable); - kfree(new_entry); - return -1; - } - } - - // now check flags and add - // this also serves as a null check - if (cmd.flags) - new_entry->flags = cmd.flags; - else - new_entry->flags = 0; - - // debug - list_add(&new_entry->list, &mount_list); - up_write(&mount_list_lock); - pr_info("cmd_add_try_umount: %s added!\n", buf); - - return 0; - } - - // this is just strcmp'd wipe anyway - case KSU_UMOUNT_DEL: { - long len = strncpy_from_user(buf, (const char __user *)cmd.arg, - sizeof(buf) - 1); - if (len <= 0) - return -EFAULT; - - buf[sizeof(buf) - 1] = '\0'; - - down_write(&mount_list_lock); - list_for_each_entry_safe (entry, tmp, &mount_list, list) { - if (!strcmp(entry->umountable, buf)) { - pr_info("cmd_add_try_umount: entry removed: %s\n", - entry->umountable); - list_del(&entry->list); - kfree(entry->umountable); - kfree(entry); - } - } - up_write(&mount_list_lock); - - return 0; - } - - // this way userspace can deduce the memory it has to prepare. - case KSU_UMOUNT_GETSIZE: { - // check for pointer first - if (!cmd.arg) - return -EFAULT; - - size_t total_size = 0; // size of list in bytes - - down_read(&mount_list_lock); - list_for_each_entry (entry, &mount_list, list) { - // + 1 for \0 - total_size = total_size + strlen(entry->umountable) + 1; - } - up_read(&mount_list_lock); - - pr_info("cmd_add_try_umount: total_size: %zu\n", total_size); - - if (copy_to_user((size_t __user *)cmd.arg, &total_size, - sizeof(total_size))) - return -EFAULT; - - return 0; - } - - // WARNING! this is straight up pointerwalking. - // this way we dont need to redefine the ioctl defs. - // this also avoids us needing to kmalloc - // userspace have to send pointer to memory (malloc/alloca) or pointer to a VLA. 
- case KSU_UMOUNT_GETLIST: { - if (!cmd.arg) - return -EFAULT; - - char *user_buf = (char *)cmd.arg; - - down_read(&mount_list_lock); - list_for_each_entry (entry, &mount_list, list) { - pr_info("cmd_add_try_umount: entry: %s\n", - entry->umountable); - - if (copy_to_user((char __user *)user_buf, - entry->umountable, - strlen(entry->umountable) + 1)) { - up_read(&mount_list_lock); - return -EFAULT; - } - - // walk it! +1 for null terminator - user_buf = user_buf + strlen(entry->umountable) + 1; - } - up_read(&mount_list_lock); - - return 0; - } - - default: { - pr_err("cmd_add_try_umount: invalid operation %u\n", cmd.mode); - return -EINVAL; - } - - } // switch(cmd.mode) - - return 0; -} - -static int do_nuke_ext4_sysfs(void __user *arg) -{ - struct ksu_nuke_ext4_sysfs_cmd cmd; - char mnt[256]; - long ret; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) - return -EFAULT; - - if (!cmd.arg) - return -EINVAL; - - memset(mnt, 0, sizeof(mnt)); - - ret = strncpy_from_user(mnt, cmd.arg, sizeof(mnt)); - if (ret < 0) { - pr_err("nuke ext4 copy mnt failed: %ld\n", ret); - return -EFAULT; // 或者 return ret; - } - - if (ret == sizeof(mnt)) { - pr_err("nuke ext4 mnt path too long\n"); - return -ENAMETOOLONG; - } - - pr_info("do_nuke_ext4_sysfs: %s\n", mnt); - - return nuke_ext4_sysfs(mnt); -} - -// IOCTL handlers mapping table -static const struct ksu_ioctl_cmd_map ksu_ioctl_handlers[] = { - KSU_IOCTL(GRANT_ROOT, "GRANT_ROOT", do_grant_root, allowed_for_su), - KSU_IOCTL(GET_INFO, "GET_INFO", do_get_info, always_allow), - KSU_IOCTL(REPORT_EVENT, "REPORT_EVENT", do_report_event, only_root), - KSU_IOCTL(SET_SEPOLICY, "SET_SEPOLICY", do_set_sepolicy, only_root), - KSU_IOCTL(CHECK_SAFEMODE, "CHECK_SAFEMODE", do_check_safemode, - always_allow), - KSU_IOCTL(GET_ALLOW_LIST, "GET_ALLOW_LIST", do_get_allow_list, - manager_or_root), - KSU_IOCTL(GET_DENY_LIST, "GET_DENY_LIST", do_get_deny_list, - manager_or_root), - KSU_IOCTL(UID_GRANTED_ROOT, "UID_GRANTED_ROOT", do_uid_granted_root, - 
manager_or_root), - KSU_IOCTL(UID_SHOULD_UMOUNT, "UID_SHOULD_UMOUNT", do_uid_should_umount, - manager_or_root), - KSU_IOCTL(GET_MANAGER_APPID, "GET_MANAGER_APPID", do_get_manager_appid, - manager_or_root), - KSU_IOCTL(GET_APP_PROFILE, "GET_APP_PROFILE", do_get_app_profile, - only_manager), - KSU_IOCTL(SET_APP_PROFILE, "SET_APP_PROFILE", do_set_app_profile, - only_manager), - KSU_IOCTL(GET_FEATURE, "GET_FEATURE", do_get_feature, manager_or_root), - KSU_IOCTL(SET_FEATURE, "SET_FEATURE", do_set_feature, manager_or_root), - KSU_IOCTL(GET_WRAPPER_FD, "GET_WRAPPER_FD", do_get_wrapper_fd, - manager_or_root), - KSU_IOCTL(MANAGE_MARK, "MANAGE_MARK", do_manage_mark, manager_or_root), - KSU_IOCTL(NUKE_EXT4_SYSFS, "NUKE_EXT4_SYSFS", do_nuke_ext4_sysfs, - manager_or_root), - KSU_IOCTL(ADD_TRY_UMOUNT, "ADD_TRY_UMOUNT", add_try_umount, - manager_or_root), - - // Sentinel - { .cmd = 0, .name = NULL, .handler = NULL, .perm_check = NULL } -}; - -#ifdef CONFIG_KSU_SYSCALL_HOOK -struct ksu_install_fd_tw { - struct callback_head cb; - int __user *outp; -}; - -static void ksu_install_fd_tw_func(struct callback_head *cb) -{ - struct ksu_install_fd_tw *tw = - container_of(cb, struct ksu_install_fd_tw, cb); - int fd = ksu_install_fd(); - - if (copy_to_user(tw->outp, &fd, sizeof(fd))) { - pr_err("install ksu fd reply err\n"); - do_close_fd(fd); - } - - kfree(tw); -} - -static int ksu_handle_fd_request(void __user *arg) -{ - struct ksu_install_fd_tw *tw; - - tw = kzalloc(sizeof(*tw), GFP_ATOMIC); - if (!tw) - return -ENOMEM; - - tw->outp = (int __user *)arg; - tw->cb.func = ksu_install_fd_tw_func; - - if (task_work_add(current, &tw->cb, TWA_RESUME)) { - kfree(tw); - pr_warn("install fd add task_work failed\n"); - return -EINVAL; - } - - return 0; -} -#else -static int ksu_handle_fd_request(void __user *arg) -{ - int fd = ksu_install_fd(); - - if (copy_to_user(arg, &fd, sizeof(fd))) { - pr_err("install ksu fd reply err\n"); - do_close_fd(fd); - return -EFAULT; - } - - return 0; -} -#endif - 
-int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, - void __user **arg) -{ - if (magic1 != KSU_INSTALL_MAGIC1) - return -EINVAL; - - // Rare case that unlikely to happen - if (unlikely(!arg)) - return -EINVAL; - -#ifdef CONFIG_KSU_DEBUG - pr_info("sys_reboot: magic: 0x%x (id: %d)\n", magic1, magic2); -#endif - - // Dereference **arg.. with IS_ERR check. - void __user *argp = (void __user *)*arg; - if (IS_ERR(argp)) { - pr_err("Failed to deref user arg, err: %lu\n", PTR_ERR(argp)); - return -EINVAL; - } - - // Check if this is a request to install KSU fd - if (magic2 == KSU_INSTALL_MAGIC2) { - return ksu_handle_fd_request(argp); - } - - return 0; -} - -void ksu_supercalls_init(void) -{ - int i; - - pr_info("KernelSU IOCTL Commands:\n"); - for (i = 0; ksu_ioctl_handlers[i].handler; i++) { - pr_info(" %-18s = 0x%08x\n", ksu_ioctl_handlers[i].name, - ksu_ioctl_handlers[i].cmd); - } -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_supercalls_init(); -#endif -} - -void ksu_supercalls_exit(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_supercalls_exit(); -#endif -} - -// IOCTL dispatcher -static long anon_ksu_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int i; - -#ifdef CONFIG_KSU_DEBUG - pr_info("ksu ioctl: cmd=0x%x from uid=%d\n", cmd, current_uid().val); -#endif - - for (i = 0; ksu_ioctl_handlers[i].handler; i++) { - if (cmd == ksu_ioctl_handlers[i].cmd) { - // Check permission first - if (ksu_ioctl_handlers[i].perm_check && - !ksu_ioctl_handlers[i].perm_check()) { - pr_warn("ksu ioctl: permission denied for cmd=0x%x uid=%d\n", - cmd, current_uid().val); - return -EPERM; - } - // Execute handler - return ksu_ioctl_handlers[i].handler(argp); - } - } - - pr_warn("ksu ioctl: unsupported command 0x%x\n", cmd); - return -ENOTTY; -} - -// File release handler -static int anon_ksu_release(struct inode *inode, struct file *filp) -{ -#ifdef CONFIG_KSU_DEBUG - pr_info("ksu fd released\n"); 
-#endif - return 0; -} - -// File operations structure -static const struct file_operations anon_ksu_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = anon_ksu_ioctl, - .compat_ioctl = anon_ksu_ioctl, - .release = anon_ksu_release, -}; - -// Install KSU fd to current process -int ksu_install_fd(void) -{ - struct file *filp; - int fd; - - // Get unused fd - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - pr_err("ksu_install_fd: failed to get unused fd\n"); - return fd; - } - - // Create anonymous inode file - filp = anon_inode_getfile("[ksu_driver]", &anon_ksu_fops, NULL, - O_RDWR | O_CLOEXEC); - if (IS_ERR(filp)) { - pr_err("ksu_install_fd: failed to create anon inode file\n"); - put_unused_fd(fd); - return PTR_ERR(filp); - } - - // Install fd - fd_install(fd, filp); - -#ifdef CONFIG_KSU_DEBUG - pr_info("ksu fd[%d] installed for %s/%d\n", fd, current->comm, - current->pid); -#endif - - return fd; -} diff --git a/drivers/kernelsu/supercalls.h b/drivers/kernelsu/supercalls.h deleted file mode 100644 index f6ba38c498d3..000000000000 --- a/drivers/kernelsu/supercalls.h +++ /dev/null @@ -1,152 +0,0 @@ -#ifndef __KSU_H_SUPERCALLS -#define __KSU_H_SUPERCALLS - -#include -#include -#include "app_profile.h" - -// Magic numbers for reboot hook to install fd -#define KSU_INSTALL_MAGIC1 0xDEADBEEF -#define KSU_INSTALL_MAGIC2 0xCAFEBABE - -// Command structures for ioctl - -struct ksu_become_daemon_cmd { - __u8 token[65]; // Input: daemon token (null-terminated) -}; - -struct ksu_get_info_cmd { - __u32 version; // Output: KERNEL_SU_VERSION - __u32 flags; // Output: flags (bit 0: MODULE mode) - __u32 features; // Output: max feature ID supported -}; - -struct ksu_report_event_cmd { - __u32 event; // Input: EVENT_POST_FS_DATA, EVENT_BOOT_COMPLETED, etc. 
-}; - -struct ksu_set_sepolicy_cmd { - __u64 cmd; // Input: sepolicy command - __aligned_u64 arg; // Input: sepolicy argument pointer -}; - -struct ksu_check_safemode_cmd { - __u8 in_safe_mode; // Output: true if in safe mode, false otherwise -}; - -struct ksu_get_allow_list_cmd { - __u32 uids[128]; // Output: array of allowed/denied UIDs - __u32 count; // Output: number of UIDs in array - __u8 allow; // Input: true for allow list, false for deny list -}; - -struct ksu_uid_granted_root_cmd { - __u32 uid; // Input: target UID to check - __u8 granted; // Output: true if granted, false otherwise -}; - -struct ksu_uid_should_umount_cmd { - __u32 uid; // Input: target UID to check - __u8 should_umount; // Output: true if should umount, false otherwise -}; - -struct ksu_get_manager_appid_cmd { - __u32 appid; // Output: manager app id -}; - -struct ksu_get_app_profile_cmd { - struct app_profile profile; // Input/Output: app profile structure -}; - -struct ksu_set_app_profile_cmd { - struct app_profile profile; // Input: app profile structure -}; - -struct ksu_get_feature_cmd { - __u32 feature_id; // Input: feature ID (enum ksu_feature_id) - __u64 value; // Output: feature value/state - __u8 supported; // Output: true if feature is supported, false otherwise -}; - -struct ksu_set_feature_cmd { - __u32 feature_id; // Input: feature ID (enum ksu_feature_id) - __u64 value; // Input: feature value/state to set -}; - -struct ksu_get_wrapper_fd_cmd { - __u32 fd; // Input: userspace fd - __u32 flags; // Input: flags of userspace fd -}; - -struct ksu_manage_mark_cmd { - __u32 operation; // Input: KSU_MARK_* - __s32 pid; // Input: target pid (0 for all processes) - __u32 result; // Output: for get operation - mark status or reg_count -}; - -struct ksu_nuke_ext4_sysfs_cmd { - __aligned_u64 arg; // Input: mnt pointer -}; - -#define KSU_MARK_GET 1 -#define KSU_MARK_MARK 2 -#define KSU_MARK_UNMARK 3 -#define KSU_MARK_REFRESH 4 - -struct ksu_add_try_umount_cmd { - __aligned_u64 arg; // 
char ptr, this is the mountpoint - __u32 flags; // this is the flag we use for it - __u8 mode; // denotes what to do with it 0:wipe_list 1:add_to_list 2:delete_entry -}; - -#define KSU_UMOUNT_WIPE 0 // ignore everything and wipe list -#define KSU_UMOUNT_ADD 1 // add entry (path + flags) -#define KSU_UMOUNT_DEL 2 // delete entry, strcmp -#define KSU_UMOUNT_GETSIZE 3 // get list size -#define KSU_UMOUNT_GETLIST 4 // get list - -// IOCTL command definitions -#define KSU_IOCTL_GRANT_ROOT _IOC(_IOC_NONE, 'K', 1, 0) -#define KSU_IOCTL_GET_INFO _IOC(_IOC_READ, 'K', 2, 0) -#define KSU_IOCTL_REPORT_EVENT _IOC(_IOC_WRITE, 'K', 3, 0) -#define KSU_IOCTL_SET_SEPOLICY _IOC(_IOC_READ | _IOC_WRITE, 'K', 4, 0) -#define KSU_IOCTL_CHECK_SAFEMODE _IOC(_IOC_READ, 'K', 5, 0) -#define KSU_IOCTL_GET_ALLOW_LIST _IOC(_IOC_READ | _IOC_WRITE, 'K', 6, 0) -#define KSU_IOCTL_GET_DENY_LIST _IOC(_IOC_READ | _IOC_WRITE, 'K', 7, 0) -#define KSU_IOCTL_UID_GRANTED_ROOT _IOC(_IOC_READ | _IOC_WRITE, 'K', 8, 0) -#define KSU_IOCTL_UID_SHOULD_UMOUNT _IOC(_IOC_READ | _IOC_WRITE, 'K', 9, 0) -#define KSU_IOCTL_GET_MANAGER_APPID _IOC(_IOC_READ, 'K', 10, 0) -#define KSU_IOCTL_GET_APP_PROFILE _IOC(_IOC_READ | _IOC_WRITE, 'K', 11, 0) -#define KSU_IOCTL_SET_APP_PROFILE _IOC(_IOC_WRITE, 'K', 12, 0) -#define KSU_IOCTL_GET_FEATURE _IOC(_IOC_READ | _IOC_WRITE, 'K', 13, 0) -#define KSU_IOCTL_SET_FEATURE _IOC(_IOC_WRITE, 'K', 14, 0) -#define KSU_IOCTL_GET_WRAPPER_FD _IOC(_IOC_WRITE, 'K', 15, 0) -#define KSU_IOCTL_MANAGE_MARK _IOC(_IOC_READ | _IOC_WRITE, 'K', 16, 0) -#define KSU_IOCTL_NUKE_EXT4_SYSFS _IOC(_IOC_WRITE, 'K', 17, 0) -#define KSU_IOCTL_ADD_TRY_UMOUNT _IOC(_IOC_WRITE, 'K', 18, 0) - -// IOCTL handler types -typedef int (*ksu_ioctl_handler_t)(void __user *arg); -typedef bool (*ksu_perm_check_t)(void); - -// IOCTL command mapping -struct ksu_ioctl_cmd_map { - unsigned int cmd; - const char *name; - ksu_ioctl_handler_t handler; - ksu_perm_check_t perm_check; // Permission check function -}; - -#define 
KSU_IOCTL(CMD, NAME, HANDLER, PERM) \ - { \ - .cmd = KSU_IOCTL_##CMD, .name = NAME, .handler = HANDLER, \ - .perm_check = PERM \ - } - -// Install KSU fd to current process -int ksu_install_fd(void); - -void ksu_supercalls_init(void); -void ksu_supercalls_exit(void); - -#endif // __KSU_H_SUPERCALLS diff --git a/drivers/kernelsu/syscall_handler.c b/drivers/kernelsu/syscall_handler.c deleted file mode 100644 index 499967165bce..000000000000 --- a/drivers/kernelsu/syscall_handler.c +++ /dev/null @@ -1,374 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -// Tracepoint registration count management -// == 1: just us -// > 1: someone else is also using syscall tracepoint e.g. ftrace -static int tracepoint_reg_count = 0; -static DEFINE_SPINLOCK(tracepoint_reg_lock); - -void ksu_clear_task_tracepoint_flag_if_needed(struct task_struct *t) -{ - unsigned long flags; - spin_lock_irqsave(&tracepoint_reg_lock, flags); - if (tracepoint_reg_count <= 1) { - ksu_clear_task_tracepoint_flag(t); - } - spin_unlock_irqrestore(&tracepoint_reg_lock, flags); -} - -// Process marking management -static void handle_process_mark(bool mark) -{ - struct task_struct *p, *t; - read_lock(&tasklist_lock); - for_each_process_thread (p, t) { - if (mark) - ksu_set_task_tracepoint_flag(t); - else - ksu_clear_task_tracepoint_flag(t); - } - read_unlock(&tasklist_lock); -} - -void ksu_mark_all_process(void) -{ - handle_process_mark(true); - pr_info("hook_manager: mark all user process done!\n"); -} - -void ksu_unmark_all_process(void) -{ - handle_process_mark(false); - pr_info("hook_manager: unmark all user process done!\n"); -} - -static void ksu_mark_running_process_locked(void) -{ - struct task_struct *p, *t; - read_lock(&tasklist_lock); - for_each_process_thread (p, t) { - if (!t->mm) { // only user processes - continue; - } - int uid = task_uid(t).val; - const struct cred *cred = get_task_cred(t); - bool ksu_root_process = uid == 0 && 
is_task_ksu_domain(cred); - bool is_zygote_process = is_zygote(cred); - bool is_shell = uid == 2000; - // before boot completed, we shall mark init for marking zygote - bool is_init = t->pid == 1; - if (ksu_root_process || is_zygote_process || is_shell || - is_init || ksu_is_allow_uid(uid)) { - ksu_set_task_tracepoint_flag(t); - pr_info("hook_manager: mark process: pid:%d, uid: %d, comm:%s\n", - t->pid, uid, t->comm); - } else { - ksu_clear_task_tracepoint_flag(t); - pr_info("hook_manager: unmark process: pid:%d, uid: %d, comm:%s\n", - t->pid, uid, t->comm); - } - put_cred(cred); - } - read_unlock(&tasklist_lock); -} - -void ksu_mark_running_process(void) -{ - unsigned long flags; - spin_lock_irqsave(&tracepoint_reg_lock, flags); - if (tracepoint_reg_count <= 1) { - ksu_mark_running_process_locked(); - } else { - pr_info("hook_manager: not mark running process since syscall tracepoint is in use\n"); - } - spin_unlock_irqrestore(&tracepoint_reg_lock, flags); -} - -// Get task mark status -// Returns: 1 if marked, 0 if not marked, -ESRCH if task not found -int ksu_get_task_mark(pid_t pid) -{ - struct task_struct *task; - int marked = -ESRCH; - - rcu_read_lock(); - task = find_task_by_vpid(pid); - if (task) { - get_task_struct(task); - rcu_read_unlock(); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - marked = test_task_syscall_work(task, SYSCALL_TRACEPOINT) ? 1 : 0; -#else - marked = test_tsk_thread_flag(task, TIF_SYSCALL_TRACEPOINT) ? 
1 : 0; -#endif - put_task_struct(task); - } else { - rcu_read_unlock(); - } - - return marked; -} - -// Set task mark status -// Returns: 0 on success, -ESRCH if task not found -int ksu_set_task_mark(pid_t pid, bool mark) -{ - struct task_struct *task; - int ret = -ESRCH; - - rcu_read_lock(); - task = find_task_by_vpid(pid); - if (task) { - get_task_struct(task); - rcu_read_unlock(); - if (mark) { - ksu_set_task_tracepoint_flag(task); - pr_info("hook_manager: marked task pid=%d comm=%s\n", - pid, task->comm); - } else { - ksu_clear_task_tracepoint_flag(task); - pr_info("hook_manager: unmarked task pid=%d comm=%s\n", - pid, task->comm); - } - put_task_struct(task); - ret = 0; - } else { - rcu_read_unlock(); - } - - return ret; -} - -#ifdef CONFIG_KRETPROBES - -static struct kretprobe *init_kretprobe(const char *name, - kretprobe_handler_t handler) -{ - struct kretprobe *rp = kzalloc(sizeof(struct kretprobe), GFP_KERNEL); - if (!rp) - return NULL; - rp->kp.symbol_name = name; - rp->handler = handler; - rp->data_size = 0; - rp->maxactive = 0; - - int ret = register_kretprobe(rp); - pr_info("hook_manager: register_%s kretprobe: %d\n", name, ret); - if (ret) { - kfree(rp); - return NULL; - } - - return rp; -} - -static void destroy_kretprobe(struct kretprobe **rp_ptr) -{ - struct kretprobe *rp = *rp_ptr; - if (!rp) - return; - unregister_kretprobe(rp); - synchronize_rcu(); - kfree(rp); - *rp_ptr = NULL; -} - -static int syscall_regfunc_handler(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long flags; - spin_lock_irqsave(&tracepoint_reg_lock, flags); - if (tracepoint_reg_count < 1) { - // while install our tracepoint, mark our processes - ksu_mark_running_process_locked(); - } else if (tracepoint_reg_count == 1) { - // while other tracepoint first added, mark all processes - ksu_mark_all_process(); - } - tracepoint_reg_count++; - spin_unlock_irqrestore(&tracepoint_reg_lock, flags); - return 0; -} - -static int syscall_unregfunc_handler(struct 
kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long flags; - spin_lock_irqsave(&tracepoint_reg_lock, flags); - tracepoint_reg_count--; - if (tracepoint_reg_count <= 0) { - // while no tracepoint left, unmark all processes - ksu_unmark_all_process(); - } else if (tracepoint_reg_count == 1) { - // while just our tracepoint left, unmark disallowed processes - ksu_mark_running_process_locked(); - } - spin_unlock_irqrestore(&tracepoint_reg_lock, flags); - return 0; -} - -static struct kretprobe *syscall_regfunc_rp = NULL; -static struct kretprobe *syscall_unregfunc_rp = NULL; -#endif - -static inline bool check_syscall_fastpath(int nr) -{ - switch (nr) { - case __NR_newfstatat: - case __NR_faccessat: - case __NR_execve: - case __NR_setresuid: - return true; - default: - return false; - } -} - -// Unmark init's child that are not zygote, adbd or ksud -int ksu_handle_init_mark_tracker(const char __user **filename_user) -{ - char path[64]; - - if (unlikely(!filename_user)) - return 0; - if (!ksu_retry_filename_access(filename_user, path, sizeof(path), - false)) - return 0; - - if (unlikely(strcmp(path, KSUD_PATH) == 0)) { - pr_info("hook_manager: escape to root for init executing ksud: %d\n", - current->pid); - escape_to_root_for_init(); - } else if (likely(strstr(path, "/app_process") == NULL && - strstr(path, "/adbd") == NULL)) { - pr_info("hook_manager: unmark %d exec %s\n", current->pid, - path); - ksu_clear_task_tracepoint_flag_if_needed(current); - } - - return 0; -} - -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -static int ksu_handle_setresuid(uid_t ruid, uid_t euid, uid_t suid) -{ - return ksu_handle_setuid_common(ruid, current_uid().val, euid); -} - -// Generic sys_enter handler that dispatches to specific handlers -static void ksu_sys_enter_handler(void *data, struct pt_regs *regs, long id) -{ - if (unlikely(check_syscall_fastpath(id))) { - if (ksu_su_compat_enabled) { - // Handle newfstatat - if (id == __NR_newfstatat) { - int *dfd = (int 
*)&PT_REGS_PARM1(regs); - const char __user **filename_user = - (const char __user **)&PT_REGS_PARM2( - regs); - int *flags = - (int *)&PT_REGS_SYSCALL_PARM4(regs); - ksu_handle_stat(dfd, filename_user, flags); - return; - } - - // Handle faccessat - if (id == __NR_faccessat) { - int *dfd = (int *)&PT_REGS_PARM1(regs); - const char __user **filename_user = - (const char __user **)&PT_REGS_PARM2( - regs); - int *mode = (int *)&PT_REGS_PARM3(regs); - ksu_handle_faccessat(dfd, filename_user, mode, - NULL); - return; - } - - // Handle execve - if (id == __NR_execve) { - const char __user **filename_user = - (const char __user **)&PT_REGS_PARM1( - regs); - if (current->pid != 1 && - is_init(get_current_cred())) { - ksu_handle_init_mark_tracker( - filename_user); - } else { - ksu_handle_execve_sucompat( - NULL, filename_user, NULL, NULL, - NULL); - } - return; - } - } - - // Handle setresuid - if (id == __NR_setresuid) { - uid_t ruid = (uid_t)PT_REGS_PARM1(regs); - uid_t euid = (uid_t)PT_REGS_PARM2(regs); - uid_t suid = (uid_t)PT_REGS_PARM3(regs); - ksu_handle_setresuid(ruid, euid, suid); - return; - } - } -} -#endif - -void ksu_syscall_hook_manager_init(void) -{ - int ret; - pr_info("hook_manager: ksu_hook_manager_init called\n"); - -#ifdef CONFIG_KRETPROBES - // Register kretprobe for syscall_regfunc - syscall_regfunc_rp = - init_kretprobe("syscall_regfunc", syscall_regfunc_handler); - // Register kretprobe for syscall_unregfunc - syscall_unregfunc_rp = - init_kretprobe("syscall_unregfunc", syscall_unregfunc_handler); -#endif - -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - ret = register_trace_sys_enter(ksu_sys_enter_handler, NULL); -#ifndef CONFIG_KRETPROBES - ksu_mark_running_process_locked(); -#endif - if (ret) { - pr_err("hook_manager: failed to register sys_enter tracepoint: %d\n", - ret); - } else { - pr_info("hook_manager: sys_enter tracepoint registered\n"); - } -#endif - - ksu_setuid_hook_init(); - ksu_sucompat_init(); -} - -void 
ksu_syscall_hook_manager_exit(void) -{ - pr_info("hook_manager: ksu_hook_manager_exit called\n"); -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - unregister_trace_sys_enter(ksu_sys_enter_handler, NULL); - tracepoint_synchronize_unregister(); - pr_info("hook_manager: sys_enter tracepoint unregistered\n"); -#endif - -#ifdef CONFIG_KRETPROBES - destroy_kretprobe(&syscall_regfunc_rp); - destroy_kretprobe(&syscall_unregfunc_rp); -#endif - - ksu_sucompat_exit(); - ksu_setuid_hook_exit(); -} diff --git a/drivers/kernelsu/syscall_handler.h b/drivers/kernelsu/syscall_handler.h deleted file mode 100644 index 463617fd97d9..000000000000 --- a/drivers/kernelsu/syscall_handler.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef __KSU_H_HOOK_MANAGER -#define __KSU_H_HOOK_MANAGER - -#include -#include -#include - -// Hook manager initialization and cleanup -void ksu_syscall_hook_manager_init(void); -void ksu_syscall_hook_manager_exit(void); - -// Process marking for tracepoint -void ksu_mark_all_process(void); -void ksu_unmark_all_process(void); -void ksu_mark_running_process(void); - -// Per-task mark operations -int ksu_get_task_mark(pid_t pid); -int ksu_set_task_mark(pid_t pid, bool mark); - -static inline void ksu_set_task_tracepoint_flag(struct task_struct *t) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - set_task_syscall_work(t, SYSCALL_TRACEPOINT); -#else - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); -#endif -} - -static inline void ksu_clear_task_tracepoint_flag(struct task_struct *t) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - clear_task_syscall_work(t, SYSCALL_TRACEPOINT); -#else - clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); -#endif -} - -void ksu_clear_task_tracepoint_flag_if_needed(struct task_struct *t); -#endif diff --git a/drivers/kernelsu/throne_tracker.c b/drivers/kernelsu/throne_tracker.c deleted file mode 100644 index a129fa9f4935..000000000000 --- a/drivers/kernelsu/throne_tracker.c +++ /dev/null @@ -1,389 +0,0 @@ -#include -#include -#include 
-#include -#include -#include -#include - -#include "allowlist.h" -#include "apk_sign.h" -#include "klog.h" // IWYU pragma: keep -#include "manager.h" -#include "kernel_compat.h" -#include "throne_tracker.h" - -uid_t ksu_manager_appid = KSU_INVALID_APPID; - -#if defined(CONFIG_KSU_MANUAL_HOOK) -#define SYSTEM_PACKAGES_LIST_PATH "/data/system/packages.list.tmp" -#elif defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) -#define SYSTEM_PACKAGES_LIST_PATH "/data/system/packages.list" -#endif - -struct uid_data { - struct list_head list; - u32 uid; - char package[KSU_MAX_PACKAGE_NAME]; -}; - -static void crown_manager(const char *apk, struct list_head *uid_data) -{ - char pkg[KSU_MAX_PACKAGE_NAME]; - if (get_pkg_from_apk_path(pkg, apk) < 0) { - pr_err("Failed to get package name from apk path: %s\n", apk); - return; - } - - pr_info("manager pkg: %s\n", pkg); - - struct list_head *list = (struct list_head *)uid_data; - struct uid_data *np; - - list_for_each_entry (np, list, list) { - if (strncmp(np->package, pkg, KSU_MAX_PACKAGE_NAME) == 0) { - pr_info("Crowning manager: %s(uid=%d)\n", pkg, np->uid); - ksu_set_manager_appid(np->uid); - break; - } - } -} - -#define DATA_PATH_LEN 384 // 384 is enough for /data/app//base.apk - -struct data_path { - char dirpath[DATA_PATH_LEN]; - int depth; - struct list_head list; -}; - -struct apk_path_hash { - unsigned int hash; - bool exists; - struct list_head list; -}; - -static struct list_head apk_path_hash_list; - -struct my_dir_context { - struct dir_context ctx; - struct list_head *data_path_list; - char *parent_dir; - void *private_data; - int depth; - int *stop; -}; -// https://docs.kernel.org/filesystems/porting.html -// filldir_t (readdir callbacks) calling conventions have changed. -// Instead of returning 0 or -E... it returns bool now. false means "no more" (as -E... used to) and true - "keep going" (as 0 in old calling conventions). 
-// Rationale: callers never looked at specific -E... values anyway. -> iterate_shared() instances require no changes at all, all filldir_t ones in the tree converted. -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) -#define FILLDIR_RETURN_TYPE bool -#define FILLDIR_ACTOR_CONTINUE true -#define FILLDIR_ACTOR_STOP false -#else -#define FILLDIR_RETURN_TYPE int -#define FILLDIR_ACTOR_CONTINUE 0 -#define FILLDIR_ACTOR_STOP -EINVAL -#endif -extern bool is_manager_apk(char *path); - -static inline void print_iter(bool is_manager, char *path) -{ -#ifdef CONFIG_KSU_DEBUG - pr_info("Found new base.apk at path: %s, is_manager: %d\n", path, - is_manager); -#else - if (is_manager) - pr_info("Found KernelSU base.apk at %s\n", path); -#endif -} - -FILLDIR_RETURN_TYPE my_actor(struct dir_context *ctx, const char *name, - int namelen, loff_t off, u64 ino, - unsigned int d_type) -{ - struct my_dir_context *my_ctx = - container_of(ctx, struct my_dir_context, ctx); - char dirpath[DATA_PATH_LEN]; - - if (!my_ctx) { - pr_err("Invalid context\n"); - return FILLDIR_ACTOR_STOP; - } - if (my_ctx->stop && *my_ctx->stop) { - pr_info("Stop searching\n"); - return FILLDIR_ACTOR_STOP; - } - - if (!strncmp(name, "..", namelen) || !strncmp(name, ".", namelen)) - return FILLDIR_ACTOR_CONTINUE; // Skip "." and ".." 
- - if (d_type == DT_DIR && namelen >= 8 && !strncmp(name, "vmdl", 4) && - !strncmp(name + namelen - 4, ".tmp", 4)) { - pr_info("Skipping directory: %.*s\n", namelen, name); - return FILLDIR_ACTOR_CONTINUE; // Skip staging package - } - - if (snprintf(dirpath, DATA_PATH_LEN, "%s/%.*s", my_ctx->parent_dir, - namelen, name) >= DATA_PATH_LEN) { - pr_err("Path too long: %s/%.*s\n", my_ctx->parent_dir, namelen, - name); - return FILLDIR_ACTOR_CONTINUE; - } - - if (d_type == DT_DIR && my_ctx->depth > 0 && - (my_ctx->stop && !*my_ctx->stop)) { - struct data_path *data = - kzalloc(sizeof(struct data_path), GFP_ATOMIC); - - if (!data) { - pr_err("Failed to allocate memory for %s\n", dirpath); - return FILLDIR_ACTOR_CONTINUE; - } - - strscpy(data->dirpath, dirpath, DATA_PATH_LEN); - data->depth = my_ctx->depth - 1; - list_add_tail(&data->list, my_ctx->data_path_list); - } else { - if ((namelen == 8) && - (strncmp(name, "base.apk", namelen) == 0)) { - struct apk_path_hash *pos; -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) - unsigned int hash = - full_name_hash(dirpath, strlen(dirpath)); -#else - unsigned int hash = - full_name_hash(NULL, dirpath, strlen(dirpath)); -#endif - list_for_each_entry (pos, &apk_path_hash_list, list) { - if (hash == pos->hash) { - pos->exists = true; - return FILLDIR_ACTOR_CONTINUE; - } - } - - bool is_manager = is_manager_apk(dirpath); - print_iter(is_manager, dirpath); - if (is_manager) { - crown_manager(dirpath, my_ctx->private_data); - *my_ctx->stop = 1; - } - } - } - - return FILLDIR_ACTOR_CONTINUE; -} - -static void search_manager(const char *path, int depth, - struct list_head *uid_data) -{ - int i, stop = 0; - struct list_head data_path_list; - INIT_LIST_HEAD(&data_path_list); - INIT_LIST_HEAD(&apk_path_hash_list); - unsigned long data_app_magic = 0; - - // Initialize APK cache list - struct apk_path_hash *pos, *n; - list_for_each_entry (pos, &apk_path_hash_list, list) { - pos->exists = false; - } - - // First depth - struct data_path 
data; - strscpy(data.dirpath, path, DATA_PATH_LEN); - data.depth = depth; - list_add_tail(&data.list, &data_path_list); - - for (i = depth; i >= 0; i--) { - struct data_path *pos, *n; - - list_for_each_entry_safe (pos, n, &data_path_list, list) { - struct my_dir_context ctx = { .ctx.actor = my_actor, - .data_path_list = - &data_path_list, - .parent_dir = - pos->dirpath, - .private_data = uid_data, - .depth = pos->depth, - .stop = &stop }; - struct file *file; - - if (!stop) { - file = ksu_filp_open_compat( - pos->dirpath, O_RDONLY | O_NOFOLLOW, 0); - if (IS_ERR(file)) { - pr_err("Failed to open directory: %s, err: %ld\n", - pos->dirpath, PTR_ERR(file)); - goto skip_iterate; - } - - // grab magic on first folder, which is /data/app - if (!data_app_magic) { - if (file->f_inode->i_sb->s_magic) { - data_app_magic = - file->f_inode->i_sb - ->s_magic; - pr_info("%s: dir: %s got magic! 0x%lx\n", - __func__, pos->dirpath, - data_app_magic); - } else { - filp_close(file, NULL); - goto skip_iterate; - } - } - - if (file->f_inode->i_sb->s_magic != - data_app_magic) { - pr_info("%s: skip: %s magic: 0x%lx expected: 0x%lx\n", - __func__, pos->dirpath, - file->f_inode->i_sb->s_magic, - data_app_magic); - filp_close(file, NULL); - goto skip_iterate; - } - - iterate_dir(file, &ctx.ctx); - filp_close(file, NULL); - } - skip_iterate: - list_del(&pos->list); - if (pos != &data) - kfree(pos); - } - } - - // clear apk_path_hash_list unconditionally - pr_info("Search manager: cleanup!\n"); - list_for_each_entry_safe (pos, n, &apk_path_hash_list, list) { - list_del(&pos->list); - kfree(pos); - } -} - -static bool is_uid_exist(uid_t uid, char *package, void *data) -{ - struct list_head *list = (struct list_head *)data; - struct uid_data *np; - - bool exist = false; - list_for_each_entry (np, list, list) { - if (np->uid == uid % PER_USER_RANGE && - strncmp(np->package, package, KSU_MAX_PACKAGE_NAME) == 0) { - exist = true; - break; - } - } - return exist; -} - -void track_throne(bool 
prune_only) -{ - struct file *fp = - ksu_filp_open_compat(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); - if (IS_ERR(fp)) { - pr_err("%s: open " SYSTEM_PACKAGES_LIST_PATH " failed: %ld\n", - __func__, PTR_ERR(fp)); - return; - } - - struct list_head uid_list; - INIT_LIST_HEAD(&uid_list); - - char chr = 0; - loff_t pos = 0; - loff_t line_start = 0; - char buf[KSU_MAX_PACKAGE_NAME]; - for (;;) { - ssize_t count = - ksu_kernel_read_compat(fp, &chr, sizeof(chr), &pos); - if (count != sizeof(chr)) - break; - if (chr != '\n') - continue; - - count = ksu_kernel_read_compat(fp, buf, sizeof(buf), - &line_start); - - struct uid_data *data = - kzalloc(sizeof(struct uid_data), GFP_ATOMIC); - if (!data) { - filp_close(fp, 0); - goto out; - } - - char *tmp = buf; - const char *delim = " "; - char *package = strsep(&tmp, delim); - char *uid = strsep(&tmp, delim); - if (!uid || !package) { - kfree(data); - pr_err("update_uid: package or uid is NULL!\n"); - break; - } - - u32 res; - if (kstrtou32(uid, 10, &res)) { - kfree(data); - pr_err("update_uid: uid parse err\n"); - break; - } - data->uid = res; - strncpy(data->package, package, KSU_MAX_PACKAGE_NAME); - list_add_tail(&data->list, &uid_list); - // reset line start - line_start = pos; - } - filp_close(fp, 0); - - if (prune_only) { - pr_info("throne_tracker: prune allowlist only!\n"); - goto prune; - } - - // now update uid list - struct uid_data *np, *n; - - // first, check if manager_uid exist! 
- bool manager_exist = false; - list_for_each_entry (np, &uid_list, list) { - if (np->uid == ksu_get_manager_appid()) { - manager_exist = true; - break; - } - } - - if (!manager_exist) { - if (ksu_is_manager_appid_valid()) { - pr_info("manager is uninstalled, invalidate it!\n"); - ksu_invalidate_manager_uid(); - goto prune; - } - pr_info("Searching manager...\n"); - search_manager("/data/app", 2, &uid_list); - pr_info("Search manager finished.\n"); - } - -prune: - // then prune the allowlist - ksu_prune_allowlist(is_uid_exist, &uid_list); -out: - // free uid_list - list_for_each_entry_safe (np, n, &uid_list, list) { - list_del(&np->list); - kfree(np); - } -} - -void ksu_throne_tracker_init(void) -{ - // nothing to do -} - -void ksu_throne_tracker_exit(void) -{ - // nothing to do -} diff --git a/drivers/kernelsu/throne_tracker.h b/drivers/kernelsu/throne_tracker.h deleted file mode 100644 index 8bb3b9a29b51..000000000000 --- a/drivers/kernelsu/throne_tracker.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef __KSU_H_THRONE_TRACKER -#define __KSU_H_THRONE_TRACKER - -void ksu_throne_tracker_init(void); - -void ksu_throne_tracker_exit(void); - -void track_throne(bool prune_only); - -#endif From 5c4b7ffd7c859b2347acc825b5d24ad3b328dc03 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:56:23 +0800 Subject: [PATCH 05/59] =?UTF-8?q?Revert=20"ANDROID:=20=E5=BC=95=E5=85=A5Re?= =?UTF-8?q?-Kernel8.6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit e9253546641bd8ffdb1ad77d5fc1135fd4ae0cc0. 
--- drivers/Kconfig | 2 - drivers/Makefile | 1 - drivers/android/binder.c | 40 ----- drivers/rekernel/Kconfig | 15 -- drivers/rekernel/Makefile | 1 - drivers/rekernel/rekernel.c | 333 ------------------------------------ drivers/rekernel/rekernel.h | 34 ---- kernel/signal.c | 8 - 8 files changed, 434 deletions(-) delete mode 100644 drivers/rekernel/Kconfig delete mode 100644 drivers/rekernel/Makefile delete mode 100644 drivers/rekernel/rekernel.c delete mode 100644 drivers/rekernel/rekernel.h diff --git a/drivers/Kconfig b/drivers/Kconfig index 38fc2a3f5c4d..4ee7416ed53d 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -216,6 +216,4 @@ source "drivers/gps/Kconfig" source "drivers/halls/Kconfig" -source "drivers/rekernel/Kconfig" - endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 4e0bcc899926..8daadc6db681 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -183,4 +183,3 @@ obj-$(CONFIG_SENSORS_SSC) += sensors/ obj-$(CONFIG_TEE) += tee/ obj-$(CONFIG_BCM_GPS_SPI_DRIVER) += gps/ obj-$(CONFIG_HALLS) += halls/ -obj-$(CONFIG_REKERNEL) += rekernel/ diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 977cb783ea0b..d7546f8dc482 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -68,9 +68,6 @@ #include #include -#ifdef CONFIG_REKERNEL -#include <../rekernel/rekernel.h> -#endif /* CONFIG_REKERNEL */ #include #include @@ -2838,12 +2835,8 @@ static int binder_fixup_parent(struct list_head *pf_head, static bool binder_can_update_transaction(struct binder_transaction *t1, struct binder_transaction *t2) { -#ifdef CONFIG_REKERNEL - if ((t1->flags & t2->flags & TF_ONE_WAY) != TF_ONE_WAY || !t1->to_proc || !t2->to_proc) -#else if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) != (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc) -#endif /* CONFIG_REKERNEL */ return false; if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code && t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid && 
@@ -2881,32 +2874,6 @@ binder_find_outdated_transaction_ilocked(struct binder_transaction *t, return NULL; } -#ifdef CONFIG_REKERNEL -void rekernel_binder_transaction(bool reply, struct binder_transaction *t, - struct binder_node *target_node, struct binder_transaction_data *tr) { - struct binder_proc *to_proc; - struct binder_alloc *target_alloc; - if (!t->to_proc) - return; - to_proc = t->to_proc; - - if (reply) { - binder_reply_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, false, tr); - } else if (t->from) { - if (t->from->proc) { - binder_trans_handler(t->from->proc->pid, t->from->proc->tsk, to_proc->pid, to_proc->tsk, false, tr); - } - } else { // oneway=1 - binder_trans_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, true, tr); - - target_alloc = &to_proc->alloc; - if (target_alloc->free_async_space < (target_alloc->buffer_size / 10 + 0x300)) { - binder_overflow_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, true, tr); - } - } -} -#endif /* CONFIG_REKERNEL */ - /** * binder_proc_transaction() - sends a transaction to a process and wakes it up * @t: transaction to send @@ -2968,11 +2935,7 @@ static int binder_proc_transaction(struct binder_transaction *t, } else if (!pending_async) { binder_enqueue_work_ilocked(&t->work, &proc->todo); } else { -#ifdef CONFIG_REKERNEL - if (frozen_task_group(proc->tsk)) { -#else if ((t->flags & TF_UPDATE_TXN) && proc->is_frozen) { -#endif /* CONFIG_REKERNEL */ t_outdated = binder_find_outdated_transaction_ilocked(t, &node->async_todo); if (t_outdated) { @@ -3369,9 +3332,6 @@ static void binder_transaction(struct binder_proc *proc, } } -#ifdef CONFIG_REKERNEL - rekernel_binder_transaction(reply, t, target_node, tr); -#endif /* CONFIG_REKERNEL */ trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, diff --git a/drivers/rekernel/Kconfig b/drivers/rekernel/Kconfig deleted file mode 100644 index 
dadf14779fde..000000000000 --- a/drivers/rekernel/Kconfig +++ /dev/null @@ -1,15 +0,0 @@ -menu "Re:Kernel" - -config REKERNEL - bool "Re:Kernel support" - default n - help - Make tombstone users get a better experience. - -config REKERNEL_NETWORK - bool "Re:Kernel NetReceive unfreeze support" - depends on REKERNEL - default n - help - Make tombstone users get a better experience. -endmenu diff --git a/drivers/rekernel/Makefile b/drivers/rekernel/Makefile deleted file mode 100644 index bb613644a5f4..000000000000 --- a/drivers/rekernel/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_REKERNEL) += rekernel.o diff --git a/drivers/rekernel/rekernel.c b/drivers/rekernel/rekernel.c deleted file mode 100644 index d3783225c33c..000000000000 --- a/drivers/rekernel/rekernel.c +++ /dev/null @@ -1,333 +0,0 @@ -#include -#include - -#include -#include -#if IS_ENABLED(CONFIG_IPV6) -#include -#endif /* IS_ENABLED(CONFIG_IPV6) */ -#include -#include -#include -#include -#if IS_ENABLED(CONFIG_IPV6) -#include -#endif /* IS_ENABLED(CONFIG_IPV6) */ - -#include -#include -#include "rekernel.h" - -#define MIN_USERAPP_UID 10000 -#define MAX_SYSTEM_UID 2000 -#define SYSTEM_APP_UID 1000 -#define INTERFACETOKEN_BUFF_SIZE 140 -#define PARCEL_OFFSET 16 -#define LINE_ERROR 1 -#define LINE_SUCCESS 0 - -#define NETLINK_REKERNEL_MAX 26 -#define NETLINK_REKERNEL_MIN 22 -#define USER_PORT 100 -#define PACKET_SIZE 256 - -static const char* binder_type[] = { - "reply", - "transaction", - "free_buffer_full", -}; -static const char* rpc_type[] = { - "SYNC_BINDER_REPLY", - "SYNC_BINDER", - "FREE_BUFFER_FULL", -}; -static struct sock* netlink_socket; -extern struct net init_net; -static unsigned long netlink_unit = 0; -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry* rekernel_dir, * rekernel_unit_entry; -#endif /* CONFIG_PROC_FS */ - -static int sendMessage(char* packet_buffer, uint16_t len) { - struct sk_buff* socket_buffer; - struct nlmsghdr* netlink_hdr; - - socket_buffer = nlmsg_new(len, 
GFP_ATOMIC); - if (!socket_buffer) { - pr_err("netlink alloc failure.\n"); - return -LINE_ERROR; - } - - netlink_hdr = nlmsg_put(socket_buffer, 0, 0, netlink_unit, len, 0); - if (!netlink_hdr) { - pr_err("nlmsg_put failaure.\n"); - nlmsg_free(socket_buffer); - return -LINE_ERROR; - } - - memcpy(nlmsg_data(netlink_hdr), packet_buffer, len); - return netlink_unicast(netlink_socket, socket_buffer, USER_PORT, MSG_DONTWAIT); -} -static void netlink_rcv_msg(struct sk_buff* socket_buffer) { - struct nlmsghdr* nlhdr = NULL; - char* umsg = NULL; - - if (socket_buffer->len >= nlmsg_total_size(0)) { - nlhdr = nlmsg_hdr(socket_buffer); - umsg = nlmsg_data(nlhdr); - if (umsg) { -#ifdef CONFIG_PROC_FS - if (!memcmp(umsg, "#proc_remove", nlmsg_len(nlhdr))) { - if (rekernel_dir) { - proc_remove(rekernel_dir); - } - } -#endif /* CONFIG_PROC_FS */ - } - } -} -#ifdef CONFIG_REKERNEL_NETWORK -static unsigned int rekernel_pkg_ipv4_ipv6_in(void* priv, struct sk_buff* socket_buffer, - const struct nf_hook_state* state) { - struct sock* sk; - unsigned int thoff = 0; - unsigned short frag_off = 0; - uid_t uid; - uint hook; - struct net_device* dev = NULL; - struct tcphdr *th; - int data_len = 0; - - if (!socket_buffer || !socket_buffer->len || !state) - return NF_ACCEPT; - - hook = state->hook; - if (NF_INET_LOCAL_IN == hook) - dev = state->in; - - if (NULL == dev) - return NF_ACCEPT; - - if (ip_hdr(socket_buffer)->version == 4) { - struct iphdr *iph4 = ip_hdr(socket_buffer); - if (iph4->protocol != IPPROTO_TCP) - return NF_ACCEPT; - if (!pskb_may_pull(socket_buffer, (iph4->ihl << 2) + sizeof(struct tcphdr))) - return NF_ACCEPT; - th = (struct tcphdr *)((unsigned char *)iph4 + (iph4->ihl << 2)); - data_len = ntohs(iph4->tot_len) - (iph4->ihl << 2) - (th->doff << 2); -#if IS_ENABLED(CONFIG_IPV6) - } else if (ip_hdr(socket_buffer)->version == 6) { - struct ipv6hdr *iph6 = ipv6_hdr(socket_buffer); - if (ipv6_find_hdr(socket_buffer, &thoff, -1, &frag_off, NULL) != IPPROTO_TCP) - return 
NF_ACCEPT; - if (!pskb_may_pull(socket_buffer, thoff + sizeof(struct tcphdr))) - return NF_ACCEPT; - th = (struct tcphdr *)(skb_network_header(socket_buffer) + thoff); - data_len = ntohs(iph6->payload_len) - (thoff - sizeof(struct ipv6hdr)) - (th->doff << 2); -#endif - } else { - return NF_ACCEPT; - } - - sk = skb_to_full_sk(socket_buffer); - if (sk == NULL || !sk_fullsock(sk)) - return NF_ACCEPT; - - uid = sock_i_uid(sk).val; - if (uid < MIN_USERAPP_UID) - return NF_ACCEPT; - - if (data_len <= 0 && !th->syn && !th->fin && !th->rst) - return NF_ACCEPT; - - rekernel_report(NETWORK, ip_hdr(socket_buffer)->version, data_len, NULL, uid, NULL, true, NULL); - return NF_ACCEPT; -} -/* Only monitor input network packages */ -static struct nf_hook_ops rekernel_nf_ops[] = { - { - .hook = rekernel_pkg_ipv4_ipv6_in, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_SELINUX_LAST + 1, - }, -#if IS_ENABLED(CONFIG_IPV6) - { - .hook = rekernel_pkg_ipv4_ipv6_in, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_SELINUX_LAST + 1, - } -#endif -}; - -int register_netfilter(void) { - int rc; - struct net* net = NULL; - for_each_net(net) { - rc = nf_register_net_hooks(net, rekernel_nf_ops, ARRAY_SIZE(rekernel_nf_ops)); - if (rc) { - pr_err("register netfilter hooks failed, rc=%d\n", rc); - break; - } - } - if (rc) { - for_each_net(net) { - nf_unregister_net_hooks(net, rekernel_nf_ops, ARRAY_SIZE(rekernel_nf_ops)); - } - return -1; - } - - return LINE_SUCCESS; -} -#endif /* CONFIG_REKERNEL_NETWORK */ -struct netlink_kernel_cfg cfg = { - .input = netlink_rcv_msg, // set recv callback -}; -#ifdef CONFIG_PROC_FS -static int rekernel_unit_show(struct seq_file* m, void* v) { - seq_printf(m, "%d\n", netlink_unit); - return LINE_SUCCESS; -} -static int rekernel_unit_open(struct inode* inode, struct file* file) { - return single_open(file, rekernel_unit_show, NULL); -} -static const struct file_operations rekernel_unit_fops = { - .open = 
rekernel_unit_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release -}; -#endif /* CONFIG_PROC_FS */ -// init -static int start_rekernel(void) { - if (netlink_unit) - return 0; - - pr_info("Thank you for choosing Re:Kernel!\n"); -#ifdef CONFIG_REKERNEL_NETWORK - pr_info("NetFilter is enabled!\n"); -#endif - pr_info("Re:Kernel v8.6 | DEVELOPER: Sakion Team | Timeline | USER PORT: %d\n", USER_PORT); - pr_info("Trying to create Re:Kernel Server......\n"); - - for (netlink_unit = NETLINK_REKERNEL_MIN; netlink_unit < NETLINK_REKERNEL_MAX; netlink_unit++) { - netlink_socket = netlink_kernel_create(&init_net, netlink_unit, &cfg); - if (netlink_socket != NULL) - break; - } - if (netlink_socket == NULL) { - netlink_unit = 0; - pr_err("Failed to create Re:Kernel server!\n"); - return -LINE_ERROR; - } - pr_info("Created Re:Kernel server! NETLINK UNIT: %d\n", netlink_unit); - -#ifdef CONFIG_PROC_FS - rekernel_dir = proc_mkdir("rekernel", NULL); - if (!rekernel_dir) { - pr_err("create /proc/rekernel failed!\n"); - } else { - char buff[32]; - sprintf(buff, "%d", netlink_unit); - rekernel_unit_entry = proc_create(buff, 0644, rekernel_dir, &rekernel_unit_fops); - if (!rekernel_unit_entry) { - pr_err("create rekernel unit failed!\n"); - } - } -#endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_REKERNEL_NETWORK - if (register_netfilter()) { - pr_err("%s: Failed to hook netfilter!\n", __func__); - return -LINE_ERROR; - } -#endif /* CONFIG_REKERNEL_NETWORK */ - return LINE_SUCCESS; -} - -void rekernel_report(int reporttype, int type, pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) { - char binder_kmsg[PACKET_SIZE]; - char buf_data[INTERFACETOKEN_BUFF_SIZE]; - size_t buf_data_size; - char buf[INTERFACETOKEN_BUFF_SIZE] = { 0 }; - char* p; - int i = 0; - int j = 0; - - if (start_rekernel()) - return; - -#ifdef CONFIG_REKERNEL_NETWORK - if (reporttype == NETWORK) { - char 
binder_kmsg[PACKET_SIZE]; - snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Network,target=%d,proto=ipv%d,data_len=%d;", dst_pid, type, src_pid); - sendMessage(binder_kmsg, strlen(binder_kmsg)); - return; - } -#endif /* CONFIG_REKERNEL_NETWORK */ - - if (!frozen_task_group(dst)) - return; - - if (task_uid(src).val == task_uid(dst).val) - return; - - switch (reporttype) { - case BINDER: - if (oneway && type == TRANSACTION) { - if (tr->code < 29 || tr->code > 32) - return; - buf_data_size = tr->data_size > INTERFACETOKEN_BUFF_SIZE ? INTERFACETOKEN_BUFF_SIZE : tr->data_size; - if (copy_from_user(buf_data, (char*)tr->data.ptr.buffer, buf_data_size)) - return; - j = PARCEL_OFFSET + 1; - p = (char*)(buf_data)+PARCEL_OFFSET; - while (i < INTERFACETOKEN_BUFF_SIZE && j < buf_data_size && *p != '\0') { - buf[i++] = *p; - j += 2; - p += 2; - } - if (i == INTERFACETOKEN_BUFF_SIZE) { - buf[i - 1] = '\0'; - } - snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Binder,bindertype=%s,oneway=%d,from_pid=%d,from=%d,target_pid=%d,target=%d,rpc_name=%s,code=%d;", binder_type[type], oneway, src_pid, task_uid(src).val, dst_pid, task_uid(dst).val, buf, tr->code); - } else { - snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Binder,bindertype=%s,oneway=%d,from_pid=%d,from=%d,target_pid=%d,target=%d;", binder_type[type], oneway, src_pid, task_uid(src).val, dst_pid, task_uid(dst).val, rpc_type[type], -1); - } - break; - case SIGNAL: - snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Signal,signal=%d,killer_pid=%d,killer=%d,dst_pid=%d,dst=%d;", type, src_pid, task_uid(src).val, dst_pid, task_uid(dst).val); - break; - default: - return; - } - sendMessage(binder_kmsg, strlen(binder_kmsg)); -} - -void binder_reply_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) { - if (unlikely(!dst)) - return; - if (task_uid(dst).val > MAX_SYSTEM_UID || src_pid == dst_pid) - return; - - // oneway=0 - rekernel_report(BINDER, 
REPLY, src_pid, src, dst_pid, dst, oneway, tr); -} - -void binder_trans_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) { - if (unlikely(!dst)) - return; - if ((task_uid(dst).val <= MIN_USERAPP_UID) || src_pid == dst_pid) - return; - - rekernel_report(BINDER, TRANSACTION, src_pid, src, dst_pid, dst, oneway, tr); -} - -void binder_overflow_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) { - if (unlikely(!dst)) - return; - - // oneway=1 - rekernel_report(BINDER, OVERFLOW, src_pid, src, dst_pid, dst, oneway, tr); -} diff --git a/drivers/rekernel/rekernel.h b/drivers/rekernel/rekernel.h deleted file mode 100644 index af7022a8535c..000000000000 --- a/drivers/rekernel/rekernel.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __REKERNEL_H -#define __REKERNEL_H - -#include -#include -#include -#include - -enum report_type { - BINDER, - SIGNAL, -#ifdef CONFIG_REKERNEL_NETWORK - NETWORK, -#endif /* CONFIG_REKERNEL_NETWORK */ -}; -enum binder_type { - REPLY, - TRANSACTION, - OVERFLOW, -}; - -static inline bool jobctl_frozen(struct task_struct* task) { - return ((task->jobctl & JOBCTL_TRAP_FREEZE) != 0); -} -static inline bool frozen_task_group(struct task_struct* task) { - return (jobctl_frozen(task) || cgroup_freezing(task)); -} - -extern void rekernel_report(int reporttype, int type, pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr); -extern void binder_reply_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr); -extern void binder_trans_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr); -extern void binder_overflow_handler(pid_t src_pid, struct task_struct* 
src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr); - -#endif /* __REKERNEL_H */ diff --git a/kernel/signal.c b/kernel/signal.c index 5b2edc6341f8..ff8ba82c9e03 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -48,10 +48,6 @@ #include #include #include -#ifdef CONFIG_REKERNEL -#include -#include <../drivers/rekernel/rekernel.h> -#endif /* CONFIG_REKERNEL */ #include "audit.h" /* audit_signal_info() */ /* @@ -1212,10 +1208,6 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, { unsigned long flags; int ret = -ESRCH; -#ifdef CONFIG_REKERNEL - if (sig == SIGKILL || sig == SIGTERM || sig == SIGABRT || sig == SIGQUIT) - rekernel_report(SIGNAL, sig, task_tgid_nr(current), current, task_tgid_nr(p), p, false, NULL); -#endif /* CONFIG_REKERNEL */ if (lock_task_sighand(p, &flags)) { ret = send_signal(sig, info, p, group); From c0e49431fb7dd3f39583014f557778aec4bf4289 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:56:39 +0800 Subject: [PATCH 06/59] Revert "ANDROID: binder: Backported from 5.15" This reverts commit 359574ca4506b99379dbfb5dd1013ea211ca5765. 
--- drivers/android/Kconfig | 89 +- drivers/android/Makefile | 4 - drivers/android/android_debug_symbols.c | 149 -- drivers/android/binder.c | 2509 +++++++++-------------- drivers/android/binder_alloc.c | 257 +-- drivers/android/binder_alloc.h | 47 +- drivers/android/binder_alloc_selftest.c | 11 +- drivers/android/binder_internal.h | 603 ------ drivers/android/binder_trace.h | 87 +- drivers/android/binderfs.c | 819 -------- drivers/android/vendor_hooks.c | 433 ---- fs/file.c | 31 - include/linux/fdtable.h | 1 - include/linux/mm.h | 22 - include/linux/seq_file.h | 14 - include/uapi/linux/android/binder.h | 47 +- include/uapi/linux/android/binderfs.h | 35 - 17 files changed, 1129 insertions(+), 4029 deletions(-) delete mode 100644 drivers/android/android_debug_symbols.c delete mode 100644 drivers/android/binder_internal.h delete mode 100644 drivers/android/binderfs.c delete mode 100644 drivers/android/vendor_hooks.c delete mode 100644 include/uapi/linux/android/binderfs.h diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 491751ab0dbf..bb2a5b581622 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -1,9 +1,8 @@ -# SPDX-License-Identifier: GPL-2.0 menu "Android" config ANDROID bool "Android Drivers" - help + ---help--- Enable support for various drivers needed on the Android platform if ANDROID @@ -12,7 +11,7 @@ config ANDROID_BINDER_IPC bool "Android Binder IPC Driver" depends on MMU default n - help + ---help--- Binder is used in Android for both communication between processes, and remote method invocation. @@ -20,23 +19,11 @@ config ANDROID_BINDER_IPC Android process, using Binder to identify, invoke and pass arguments between said processes. -config ANDROID_BINDERFS - bool "Android Binderfs filesystem" - depends on ANDROID_BINDER_IPC - default n - help - Binderfs is a pseudo-filesystem for the Android Binder IPC driver - which can be mounted per-ipc namespace allowing to run multiple - instances of Android. 
- Each binderfs mount initially only contains a binder-control device. - It can be used to dynamically allocate new binder IPC devices via - ioctls. - config ANDROID_BINDER_DEVICES string "Android Binder devices" depends on ANDROID_BINDER_IPC default "binder,hwbinder,vndbinder" - help + ---help--- Default value for the binder.devices parameter. The binder.devices parameter is a comma-separated list of strings @@ -44,71 +31,29 @@ config ANDROID_BINDER_DEVICES created. Each binder device has its own context manager, and is therefore logically separated from the other devices. +config ANDROID_BINDER_IPC_32BIT + bool "Android Binder IPC 32BIT Driver" + depends on !64BIT && ANDROID_BINDER_IPC + default n + ---help--- + The Binder API has been changed to support both 32 and 64bit + applications in a mixed environment. + + Enable this to support an old 32-bit Android user-space (v4.4 and + earlier). + + Note that enabling this will break newer Android user-space. + config ANDROID_BINDER_IPC_SELFTEST bool "Android Binder IPC Driver Selftest" depends on ANDROID_BINDER_IPC - help + ---help--- This feature allows binder selftest to run. Binder selftest checks the allocation and free of binder buffers exhaustively with combinations of various buffer sizes and alignments. -config ANDROID_DEBUG_SYMBOLS - bool "Android Debug Symbols" - help - Enables export of debug symbols that are useful for offline debugging - of a kernel. These symbols would be used in vendor modules to find - addresses of the core kernel symbols for vendor extensions. - - This driver is statically compiled into kernel and maintains all the - required symbol addresses for vendor modules and provides necessary - interface vendor modules. - -config ANDROID_VENDOR_HOOKS - bool "Android Vendor Hooks" - depends on TRACEPOINTS - help - Enable vendor hooks implemented as tracepoints - - Allow vendor modules to attach to tracepoint "hooks" defined via - DECLARE_HOOK or DECLARE_RESTRICTED_HOOK. 
- -config ANDROID_KABI_RESERVE - bool "Android KABI reserve padding" - default y - help - This option enables the padding that the Android GKI kernel adds - to many different kernel structures to support an in-kernel stable ABI - over the lifespan of support for the kernel. - - Only disable this option if you have a system that needs the Android - kernel drivers, but is NOT an Android GKI kernel image. If disabled - it has the possibility to make the kernel static and runtime image - slightly smaller but will NOT be supported by the Google Android - kernel team. - - If even slightly unsure, say Y. - -config ANDROID_VENDOR_OEM_DATA - bool "Android vendor and OEM data padding" - default y - help - This option enables the padding that the Android GKI kernel adds - to many different kernel structures to support an in-kernel stable ABI - over the lifespan of support for the kernel as well as OEM additional - fields that are needed by some of the Android kernel tracepoints. The - macros enabled by this option are used to enable padding in vendor modules - used for the above specified purposes. - - Only disable this option if you have a system that needs the Android - kernel drivers, but is NOT an Android GKI kernel image and you do NOT - use the Android kernel tracepoints. If disabled it has the possibility - to make the kernel static and runtime image slightly smaller but will - NOT be supported by the Google Android kernel team. - - If even slightly unsure, say Y. 
- endif # if ANDROID endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile index f1ac44102987..a01254c43ee3 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -1,8 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0-only ccflags-y += -I$(src) # needed for trace events -obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o -obj-$(CONFIG_ANDROID_DEBUG_SYMBOLS) += android_debug_symbols.o -obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o diff --git a/drivers/android/android_debug_symbols.c b/drivers/android/android_debug_symbols.c deleted file mode 100644 index dd75ddac2085..000000000000 --- a/drivers/android/android_debug_symbols.c +++ /dev/null @@ -1,149 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -/* - * Copyright (c) 2021, The Linux Foundation. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include "../../mm/slab.h" -#include -#include -#include -#include -#include - -struct ads_entry { - char *name; - void *addr; -}; - -bool ads_page_owner; -bool ads_slub_debug; -unsigned long ads_vmalloc_nr_pages; -unsigned long ads_pcpu_nr_pages; - -#define _ADS_ENTRY(index, symbol) \ - [index] = { .name = #symbol, .addr = (void *)symbol } -#define ADS_ENTRY(index, symbol) _ADS_ENTRY(index, symbol) - -#define _ADS_PER_CPU_ENTRY(index, symbol) \ - [index] = { .name = #symbol, .addr = (void *)&symbol } -#define ADS_PER_CPU_ENTRY(index, symbol) _ADS_PER_CPU_ENTRY(index, symbol) - -/* - * This module maintains static array of symbol and address information. - * Add all required core kernel symbols and their addresses into ads_entries[] array, - * so that vendor modules can query and to find address of non-exported symbol. 
- */ -static const struct ads_entry ads_entries[ADS_END] = { - ADS_ENTRY(ADS_SDATA, _sdata), - ADS_ENTRY(ADS_BSS_END, __bss_stop), - ADS_ENTRY(ADS_PER_CPU_START, __per_cpu_start), - ADS_ENTRY(ADS_PER_CPU_END, __per_cpu_end), - ADS_ENTRY(ADS_START_RO_AFTER_INIT, __start_ro_after_init), - ADS_ENTRY(ADS_END_RO_AFTER_INIT, __end_ro_after_init), - ADS_ENTRY(ADS_LINUX_BANNER, linux_banner), -#ifdef CONFIG_CMA - ADS_ENTRY(ADS_TOTAL_CMA, &totalcma_pages), -#endif - ADS_ENTRY(ADS_SLAB_CACHES, &slab_caches), - ADS_ENTRY(ADS_SLAB_MUTEX, &slab_mutex), - ADS_ENTRY(ADS_MIN_LOW_PFN, &min_low_pfn), - ADS_ENTRY(ADS_MAX_PFN, &max_pfn), - ADS_ENTRY(ADS_VMALLOC_NR_PAGES, &ads_vmalloc_nr_pages), - ADS_ENTRY(ADS_PCPU_NR_PAGES, &ads_pcpu_nr_pages), -#ifdef CONFIG_PAGE_OWNER - ADS_ENTRY(ADS_PAGE_OWNER_ENABLED, &ads_page_owner), -#endif -#ifdef CONFIG_SLUB_DEBUG - ADS_ENTRY(ADS_SLUB_DEBUG, &ads_slub_debug), -#endif -#ifdef CONFIG_SWAP - ADS_ENTRY(ADS_NR_SWAP_PAGES, &nr_swap_pages), -#endif -#ifdef CONFIG_MMU - ADS_ENTRY(ADS_MMAP_MIN_ADDR, &mmap_min_addr), -#endif - ADS_ENTRY(ADS_STACK_GUARD_GAP, &stack_guard_gap), -#ifdef CONFIG_SYSCTL - ADS_ENTRY(ADS_SYSCTL_LEGACY_VA_LAYOUT, &sysctl_legacy_va_layout), -#endif - ADS_ENTRY(ADS_SHOW_MEM, show_mem), -#ifdef CONFIG_ARM64 - ADS_ENTRY(ADS_PUT_TASK_STACK, put_task_stack), -#endif -}; - -/* - * ads_per_cpu_entries array contains all the per_cpu variable address information. - */ -static const struct ads_entry ads_per_cpu_entries[ADS_DEBUG_PER_CPU_END] = { -#ifdef CONFIG_ARM64 - ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, irq_stack_ptr), -#endif -#ifdef CONFIG_X86 - ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, hardirq_stack_ptr), -#endif -}; - -/* - * android_debug_symbol - Provide address inforamtion of debug symbol. - * @symbol: Index of debug symbol array. - * - * Return address of core kernel symbol on success and a negative errno will be - * returned in error cases. 
- * - */ -void *android_debug_symbol(enum android_debug_symbol symbol) -{ - if (symbol >= ADS_END) - return ERR_PTR(-EINVAL); - - return ads_entries[symbol].addr; -} -EXPORT_SYMBOL_NS_GPL(android_debug_symbol, MINIDUMP); - -/* - * android_debug_per_cpu_symbol - Provide address inforamtion of per cpu debug symbol. - * @symbol: Index of per cpu debug symbol array. - * - * Return address of core kernel symbol on success and a negative errno will be - * returned in error cases. - * - */ -void *android_debug_per_cpu_symbol(enum android_debug_per_cpu_symbol symbol) -{ - if (symbol >= ADS_DEBUG_PER_CPU_END) - return ERR_PTR(-EINVAL); - - return ads_per_cpu_entries[symbol].addr; -} -EXPORT_SYMBOL_NS_GPL(android_debug_per_cpu_symbol, MINIDUMP); - -static int __init debug_symbol_init(void) -{ -#ifdef CONFIG_PAGE_OWNER - ads_page_owner = page_owner_ops.need(); -#endif -#ifdef CONFIG_SLUB_DEBUG - ads_slub_debug = __slub_debug_enabled(); -#endif - ads_vmalloc_nr_pages = vmalloc_nr_pages(); - ads_pcpu_nr_pages = pcpu_nr_pages(); - return 0; -} -module_init(debug_symbol_init); - -static void __exit debug_symbol_exit(void) -{ } -module_exit(debug_symbol_exit); - -MODULE_DESCRIPTION("Debug Symbol Driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d7546f8dc482..20356105e4ba 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1,9 +1,18 @@ -// SPDX-License-Identifier: GPL-2.0-only /* binder.c * * Android IPC Subsystem * * Copyright (C) 2007-2008 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * */ /* @@ -42,6 +51,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -54,25 +64,13 @@ #include #include #include -#include -#include +#include #include -#include #include #include #include #include -#include -#include -#include -#include - -#include -#include - -#include - -#include "binder_internal.h" +#include "binder_alloc.h" #include "binder_trace.h" static HLIST_HEAD(binder_deferred_list); @@ -89,11 +87,36 @@ static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; static atomic_t binder_last_id; -static int proc_show(struct seq_file *m, void *unused); -DEFINE_SHOW_ATTRIBUTE(proc); +#define BINDER_DEBUG_ENTRY(name) \ +static int binder_##name##_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, binder_##name##_show, inode->i_private); \ +} \ +\ +static const struct file_operations binder_##name##_fops = { \ + .owner = THIS_MODULE, \ + .open = binder_##name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +static int binder_proc_show(struct seq_file *m, void *unused); +BINDER_DEBUG_ENTRY(proc); + +/* This is only defined in include/asm-arm/sizes.h */ +#ifndef SZ_1K +#define SZ_1K 0x400 +#endif + +#ifndef SZ_4M +#define SZ_4M 0x400000 +#endif #define FORBIDDEN_MMAP_FLAGS (VM_WRITE) +#define BINDER_SMALL_BUF_SIZE (PAGE_SIZE * 64) + enum { BINDER_DEBUG_USER_ERROR = 1U << 0, BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1, @@ -115,8 +138,8 @@ static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; module_param_named(debug_mask, binder_debug_mask, uint, 0644); -char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; -module_param_named(devices, binder_devices_param, charp, 0444); +static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; +module_param_named(devices, 
binder_devices_param, charp, S_IRUGO); static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; @@ -137,13 +160,13 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define binder_debug(mask, x...) \ do { \ if (binder_debug_mask & mask) \ - pr_info_ratelimited(x); \ + pr_info(x); \ } while (0) #define binder_user_error(x...) \ do { \ if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) \ - pr_info_ratelimited(x); \ + pr_info(x); \ if (binder_stop_on_user_error) \ binder_stop_on_user_error = 2; \ } while (0) @@ -159,6 +182,24 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) +enum binder_stat_types { + BINDER_STAT_PROC, + BINDER_STAT_THREAD, + BINDER_STAT_NODE, + BINDER_STAT_REF, + BINDER_STAT_DEATH, + BINDER_STAT_TRANSACTION, + BINDER_STAT_TRANSACTION_COMPLETE, + BINDER_STAT_COUNT +}; + +struct binder_stats { + atomic_t br[_IOC_NR(BR_FAILED_REPLY) + 1]; + atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1]; + atomic_t obj_created[BINDER_STAT_COUNT]; + atomic_t obj_deleted[BINDER_STAT_COUNT]; +}; + static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) @@ -186,26 +227,16 @@ struct binder_transaction_log_entry { int return_error_line; uint32_t return_error; uint32_t return_error_param; - char context_name[BINDERFS_MAX_NAME + 1]; + const char *context_name; }; - struct binder_transaction_log { atomic_t cur; bool full; struct binder_transaction_log_entry entry[32]; }; - static struct binder_transaction_log binder_transaction_log; static struct binder_transaction_log binder_transaction_log_failed; -static struct kmem_cache *binder_node_pool; -static struct kmem_cache *binder_proc_pool; -static struct kmem_cache *binder_ref_death_pool; -static struct kmem_cache *binder_ref_pool; -static struct kmem_cache *binder_thread_pool; -static struct kmem_cache 
*binder_transaction_pool; -static struct kmem_cache *binder_work_pool; - static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) { @@ -226,9 +257,320 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( return e; } +struct binder_context { + struct binder_node *binder_context_mgr_node; + struct mutex context_mgr_node_lock; + + kuid_t binder_context_mgr_uid; + const char *name; +}; + +struct binder_device { + struct hlist_node hlist; + struct miscdevice miscdev; + struct binder_context context; +}; + +/** + * struct binder_work - work enqueued on a worklist + * @entry: node enqueued on list + * @type: type of work to be performed + * + * There are separate work lists for proc, thread, and node (async). + */ +struct binder_work { + struct list_head entry; + + enum binder_work_type { + BINDER_WORK_TRANSACTION = 1, + BINDER_WORK_TRANSACTION_COMPLETE, + BINDER_WORK_RETURN_ERROR, + BINDER_WORK_NODE, + BINDER_WORK_DEAD_BINDER, + BINDER_WORK_DEAD_BINDER_AND_CLEAR, + BINDER_WORK_CLEAR_DEATH_NOTIFICATION, + } type; +}; + +struct binder_error { + struct binder_work work; + uint32_t cmd; +}; + +/** + * struct binder_node - binder node bookkeeping + * @debug_id: unique ID for debugging + * (invariant after initialized) + * @lock: lock for node fields + * @work: worklist element for node work + * (protected by @proc->inner_lock) + * @rb_node: element for proc->nodes tree + * (protected by @proc->inner_lock) + * @dead_node: element for binder_dead_nodes list + * (protected by binder_dead_nodes_lock) + * @proc: binder_proc that owns this node + * (invariant after initialized) + * @refs: list of references on this node + * (protected by @lock) + * @internal_strong_refs: used to take strong references when + * initiating a transaction + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @local_weak_refs: weak user refs from local process + * (protected by @proc->inner_lock if @proc + * and by 
@lock) + * @local_strong_refs: strong user refs from local process + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @tmp_refs: temporary kernel refs + * (protected by @proc->inner_lock while @proc + * is valid, and by binder_dead_nodes_lock + * if @proc is NULL. During inc/dec and node release + * it is also protected by @lock to provide safety + * as the node dies and @proc becomes NULL) + * @ptr: userspace pointer for node + * (invariant, no lock needed) + * @cookie: userspace cookie for node + * (invariant, no lock needed) + * @has_strong_ref: userspace notified of strong ref + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @pending_strong_ref: userspace has acked notification of strong ref + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @has_weak_ref: userspace notified of weak ref + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @pending_weak_ref: userspace has acked notification of weak ref + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @has_async_transaction: async transaction to node in progress + * (protected by @lock) + * @sched_policy: minimum scheduling policy for node + * (invariant after initialized) + * @accept_fds: file descriptor operations supported for node + * (invariant after initialized) + * @min_priority: minimum scheduling priority + * (invariant after initialized) + * @inherit_rt: inherit RT scheduling policy from caller + * @txn_security_ctx: require sender's security context + * (invariant after initialized) + * @async_todo: list of async work items + * (protected by @proc->inner_lock) + * + * Bookkeeping structure for binder nodes. 
+ */ +struct binder_node { + int debug_id; + spinlock_t lock; + struct binder_work work; + union { + struct rb_node rb_node; + struct hlist_node dead_node; + }; + struct binder_proc *proc; + struct hlist_head refs; + int internal_strong_refs; + int local_weak_refs; + int local_strong_refs; + int tmp_refs; + binder_uintptr_t ptr; + binder_uintptr_t cookie; + struct { + /* + * bitfield elements protected by + * proc inner_lock + */ + u8 has_strong_ref:1; + u8 pending_strong_ref:1; + u8 has_weak_ref:1; + u8 pending_weak_ref:1; + }; + struct { + /* + * invariant after initialization + */ + u8 sched_policy:2; + u8 inherit_rt:1; + u8 accept_fds:1; + u8 txn_security_ctx:1; + u8 min_priority; + }; + bool has_async_transaction; + struct list_head async_todo; +}; + +struct binder_ref_death { + /** + * @work: worklist element for death notifications + * (protected by inner_lock of the proc that + * this ref belongs to) + */ + struct binder_work work; + binder_uintptr_t cookie; +}; + +/** + * struct binder_ref_data - binder_ref counts and id + * @debug_id: unique ID for the ref + * @desc: unique userspace handle for ref + * @strong: strong ref count (debugging only if not locked) + * @weak: weak ref count (debugging only if not locked) + * + * Structure to hold ref count and ref id information. Since + * the actual ref can only be accessed with a lock, this structure + * is used to return information about the ref to callers of + * ref inc/dec functions. + */ +struct binder_ref_data { + int debug_id; + uint32_t desc; + int strong; + int weak; +}; + +/** + * struct binder_ref - struct to track references on nodes + * @data: binder_ref_data containing id, handle, and current refcounts + * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree + * @rb_node_node: node for lookup by @node in proc's rb_tree + * @node_entry: list entry for node->refs list in target node + * (protected by @node->lock) + * @proc: binder_proc containing ref + * @node: binder_node of target node. 
When cleaning up a + * ref for deletion in binder_cleanup_ref, a non-NULL + * @node indicates the node must be freed + * @death: pointer to death notification (ref_death) if requested + * (protected by @node->lock) + * + * Structure to track references from procA to target node (on procB). This + * structure is unsafe to access without holding @proc->outer_lock. + */ +struct binder_ref { + /* Lookups needed: */ + /* node + proc => ref (transaction) */ + /* desc + proc => ref (transaction, inc/dec ref) */ + /* node => refs + procs (proc exit) */ + struct binder_ref_data data; + struct rb_node rb_node_desc; + struct rb_node rb_node_node; + struct hlist_node node_entry; + struct binder_proc *proc; + struct binder_node *node; + struct binder_ref_death *death; +}; + enum binder_deferred_state { - BINDER_DEFERRED_FLUSH = 0x01, - BINDER_DEFERRED_RELEASE = 0x02, + BINDER_DEFERRED_PUT_FILES = 0x01, + BINDER_DEFERRED_FLUSH = 0x02, + BINDER_DEFERRED_RELEASE = 0x04, +}; + +/** + * struct binder_priority - scheduler policy and priority + * @sched_policy scheduler policy + * @prio [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT + * + * The binder driver supports inheriting the following scheduler policies: + * SCHED_NORMAL + * SCHED_BATCH + * SCHED_FIFO + * SCHED_RR + */ +struct binder_priority { + unsigned int sched_policy; + int prio; +}; + +/** + * struct binder_proc - binder process bookkeeping + * @proc_node: element for binder_procs list + * @threads: rbtree of binder_threads in this proc + * (protected by @inner_lock) + * @nodes: rbtree of binder nodes associated with + * this proc ordered by node->ptr + * (protected by @inner_lock) + * @refs_by_desc: rbtree of refs ordered by ref->desc + * (protected by @outer_lock) + * @refs_by_node: rbtree of refs ordered by ref->node + * (protected by @outer_lock) + * @waiting_threads: threads currently waiting for proc work + * (protected by @inner_lock) + * @pid PID of group_leader of process + * (invariant after initialized) + * 
@tsk task_struct for group_leader of process + * (invariant after initialized) + * @files files_struct for process + * (protected by @files_lock) + * @files_lock mutex to protect @files + * @cred struct cred associated with the `struct file` + * in binder_open() + * (invariant after initialized) + * @deferred_work_node: element for binder_deferred_list + * (protected by binder_deferred_lock) + * @deferred_work: bitmap of deferred work to perform + * (protected by binder_deferred_lock) + * @is_dead: process is dead and awaiting free + * when outstanding transactions are cleaned up + * (protected by @inner_lock) + * @todo: list of work for this process + * (protected by @inner_lock) + * @stats: per-process binder statistics + * (atomics, no lock needed) + * @delivered_death: list of delivered death notification + * (protected by @inner_lock) + * @max_threads: cap on number of binder threads + * (protected by @inner_lock) + * @requested_threads: number of binder threads requested but not + * yet started. In current implementation, can + * only be 0 or 1. 
+ * (protected by @inner_lock) + * @requested_threads_started: number binder threads started + * (protected by @inner_lock) + * @tmp_ref: temporary reference to indicate proc is in use + * (atomic since @proc->inner_lock cannot + * always be acquired) + * @default_priority: default scheduler priority + * (invariant after initialized) + * @debugfs_entry: debugfs node + * @alloc: binder allocator bookkeeping + * @context: binder_context for this proc + * (invariant after initialized) + * @inner_lock: can nest under outer_lock and/or node lock + * @outer_lock: no nesting under innor or node lock + * Lock order: 1) outer, 2) node, 3) inner + * + * Bookkeeping structure for binder processes + */ +struct binder_proc { + struct hlist_node proc_node; + struct rb_root threads; + struct rb_root nodes; + struct rb_root refs_by_desc; + struct rb_root refs_by_node; + struct list_head waiting_threads; + int pid; + struct task_struct *tsk; + struct files_struct *files; + struct mutex files_lock; + const struct cred *cred; + struct hlist_node deferred_work_node; + int deferred_work; + bool is_dead; + + struct list_head todo; + struct binder_stats stats; + struct list_head delivered_death; + int max_threads; + int requested_threads; + int requested_threads_started; + atomic_t tmp_ref; + struct binder_priority default_priority; + struct dentry *debugfs_entry; + struct binder_alloc alloc; + struct binder_context *context; + spinlock_t inner_lock; + spinlock_t outer_lock; }; enum { @@ -240,6 +582,110 @@ enum { BINDER_LOOPER_STATE_POLL = 0x20, }; +/** + * struct binder_thread - binder thread bookkeeping + * @proc: binder process for this thread + * (invariant after initialization) + * @rb_node: element for proc->threads rbtree + * (protected by @proc->inner_lock) + * @waiting_thread_node: element for @proc->waiting_threads list + * (protected by @proc->inner_lock) + * @pid: PID for this thread + * (invariant after initialization) + * @looper: bitmap of looping state + * (only accessed 
by this thread) + * @looper_needs_return: looping thread needs to exit driver + * (no lock needed) + * @transaction_stack: stack of in-progress transactions for this thread + * (protected by @proc->inner_lock) + * @todo: list of work to do for this thread + * (protected by @proc->inner_lock) + * @process_todo: whether work in @todo should be processed + * (protected by @proc->inner_lock) + * @return_error: transaction errors reported by this thread + * (only accessed by this thread) + * @reply_error: transaction errors reported by target thread + * (protected by @proc->inner_lock) + * @wait: wait queue for thread work + * @stats: per-thread statistics + * (atomics, no lock needed) + * @tmp_ref: temporary reference to indicate thread is in use + * (atomic since @proc->inner_lock cannot + * always be acquired) + * @is_dead: thread is dead and awaiting free + * when outstanding transactions are cleaned up + * (protected by @proc->inner_lock) + * @task: struct task_struct for this thread + * + * Bookkeeping structure for binder threads. 
+ */ +struct binder_thread { + struct binder_proc *proc; + struct rb_node rb_node; + struct list_head waiting_thread_node; + int pid; + int looper; /* only modified by this thread */ + bool looper_need_return; /* can be written by other thread */ + struct binder_transaction *transaction_stack; + struct list_head todo; + bool process_todo; + struct binder_error return_error; + struct binder_error reply_error; + wait_queue_head_t wait; + struct binder_stats stats; + atomic_t tmp_ref; + bool is_dead; + struct task_struct *task; +}; + +struct binder_transaction { + int debug_id; + struct binder_work work; + struct binder_thread *from; + struct binder_transaction *from_parent; + struct binder_proc *to_proc; + struct binder_thread *to_thread; + struct binder_transaction *to_parent; + unsigned need_reply:1; + /* unsigned is_dead:1; */ /* not used at the moment */ + + struct binder_buffer *buffer; + unsigned int code; + unsigned int flags; + struct binder_priority priority; + struct binder_priority saved_priority; + bool set_priority_called; + kuid_t sender_euid; + binder_uintptr_t security_ctx; + /** + * @lock: protects @from, @to_proc, and @to_thread + * + * @from, @to_proc, and @to_thread can be set to NULL + * during thread teardown + */ + spinlock_t lock; +}; + +/** + * struct binder_object - union of flat binder object types + * @hdr: generic object header + * @fbo: binder object (nodes and refs) + * @fdo: file descriptor object + * @bbo: binder buffer pointer + * @fdao: file descriptor array + * + * Used for type-independent object copies + */ +struct binder_object { + union { + struct binder_object_header hdr; + struct flat_binder_object fbo; + struct binder_fd_object fdo; + struct binder_buffer_object bbo; + struct binder_fd_array_object fdao; + }; +}; + /** * binder_proc_lock() - Acquire outer lock for given binder_proc * @proc: struct binder_proc to acquire @@ -250,7 +696,6 @@ enum { #define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__) static void 
_binder_proc_lock(struct binder_proc *proc, int line) - __acquires(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -266,7 +711,6 @@ _binder_proc_lock(struct binder_proc *proc, int line) #define binder_proc_unlock(_proc) _binder_proc_unlock(_proc, __LINE__) static void _binder_proc_unlock(struct binder_proc *proc, int line) - __releases(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -282,7 +726,6 @@ _binder_proc_unlock(struct binder_proc *proc, int line) #define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__) static void _binder_inner_proc_lock(struct binder_proc *proc, int line) - __acquires(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -298,7 +741,6 @@ _binder_inner_proc_lock(struct binder_proc *proc, int line) #define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__) static void _binder_inner_proc_unlock(struct binder_proc *proc, int line) - __releases(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -314,7 +756,6 @@ _binder_inner_proc_unlock(struct binder_proc *proc, int line) #define binder_node_lock(node) _binder_node_lock(node, __LINE__) static void _binder_node_lock(struct binder_node *node, int line) - __acquires(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -330,7 +771,6 @@ _binder_node_lock(struct binder_node *node, int line) #define binder_node_unlock(node) _binder_node_unlock(node, __LINE__) static void _binder_node_unlock(struct binder_node *node, int line) - __releases(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -347,16 +787,12 @@ _binder_node_unlock(struct binder_node *node, int line) #define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__) static void _binder_node_inner_lock(struct binder_node *node, int line) - 
__acquires(&node->lock) __acquires(&node->proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&node->lock); if (node->proc) binder_inner_proc_lock(node->proc); - else - /* annotation for sparse */ - __acquire(&node->proc->inner_lock); } /** @@ -368,7 +804,6 @@ _binder_node_inner_lock(struct binder_node *node, int line) #define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__) static void _binder_node_inner_unlock(struct binder_node *node, int line) - __releases(&node->lock) __releases(&node->proc->inner_lock) { struct binder_proc *proc = node->proc; @@ -376,9 +811,6 @@ _binder_node_inner_unlock(struct binder_node *node, int line) "%s: line=%d\n", __func__, line); if (proc) binder_inner_proc_unlock(proc); - else - /* annotation for sparse */ - __release(&node->proc->inner_lock); spin_unlock(&node->lock); } @@ -439,7 +871,6 @@ static void binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { - WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); } @@ -457,7 +888,6 @@ static void binder_enqueue_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { - WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); thread->process_todo = true; } @@ -518,13 +948,69 @@ static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); static void binder_inc_node_tmpref_ilocked(struct binder_node *node); +static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) +{ + unsigned long rlim_cur; + unsigned long irqs; + int ret; + + mutex_lock(&proc->files_lock); + if (proc->files == NULL) { + ret = -ESRCH; + goto err; + } + if (!lock_task_sighand(proc->tsk, &irqs)) { + ret = -EMFILE; + goto err; + } + rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); + unlock_task_sighand(proc->tsk, 
&irqs); + + ret = __alloc_fd(proc->files, 0, rlim_cur, flags); +err: + mutex_unlock(&proc->files_lock); + return ret; +} + +/* + * copied from fd_install + */ +static void task_fd_install( + struct binder_proc *proc, unsigned int fd, struct file *file) +{ + mutex_lock(&proc->files_lock); + if (proc->files) + __fd_install(proc->files, fd, file); + mutex_unlock(&proc->files_lock); +} + +/* + * copied from sys_close + */ +static long task_close_fd(struct binder_proc *proc, unsigned int fd) +{ + int retval; + + mutex_lock(&proc->files_lock); + if (proc->files == NULL) { + retval = -ESRCH; + goto err; + } + retval = __close_fd(proc->files, fd); + /* can't restart close syscall because file table entry was cleared */ + if (unlikely(retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK)) + retval = -EINTR; +err: + mutex_unlock(&proc->files_lock); + return retval; +} + static bool binder_has_work_ilocked(struct binder_thread *thread, bool do_proc_work) { - int ret = 0; - - if (ret) - return true; return thread->process_todo || thread->looper_need_return || (do_proc_work && @@ -669,7 +1155,7 @@ static int to_userspace_prio(int policy, int kernel_priority) if (is_fair_policy(policy)) return PRIO_TO_NICE(kernel_priority); else - return MAX_RT_PRIO - 1 - kernel_priority; + return MAX_USER_RT_PRIO - 1 - kernel_priority; } static int to_kernel_prio(int policy, int user_priority) @@ -677,29 +1163,23 @@ static int to_kernel_prio(int policy, int user_priority) if (is_fair_policy(policy)) return NICE_TO_PRIO(user_priority); else - return MAX_RT_PRIO - 1 - user_priority; + return MAX_USER_RT_PRIO - 1 - user_priority; } -static void binder_do_set_priority(struct binder_thread *thread, - const struct binder_priority *desired, +static void binder_do_set_priority(struct task_struct *task, + struct binder_priority desired, bool verify) { - struct task_struct *task = thread->task; int priority; /* user-space prio value */ 
bool has_cap_nice; - unsigned int policy = desired->sched_policy; + unsigned int policy = desired.sched_policy; - if (task->policy == policy && task->normal_prio == desired->prio) { - spin_lock(&thread->prio_lock); - if (thread->prio_state == BINDER_PRIO_PENDING) - thread->prio_state = BINDER_PRIO_SET; - spin_unlock(&thread->prio_lock); + if (task->policy == policy && task->normal_prio == desired.prio) return; - } has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE); - priority = to_userspace_prio(policy, desired->prio); + priority = to_userspace_prio(policy, desired.prio); if (verify && is_rt_policy(policy) && !has_cap_nice) { long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO); @@ -724,30 +1204,16 @@ static void binder_do_set_priority(struct binder_thread *thread, } } - if (policy != desired->sched_policy || - to_kernel_prio(policy, priority) != desired->prio) + if (policy != desired.sched_policy || + to_kernel_prio(policy, priority) != desired.prio) binder_debug(BINDER_DEBUG_PRIORITY_CAP, "%d: priority %d not allowed, using %d instead\n", - task->pid, desired->prio, + task->pid, desired.prio, to_kernel_prio(policy, priority)); trace_binder_set_priority(task->tgid, task->pid, task->normal_prio, to_kernel_prio(policy, priority), - desired->prio); - - spin_lock(&thread->prio_lock); - if (!verify && thread->prio_state == BINDER_PRIO_ABORT) { - /* - * A new priority has been set by an incoming nested - * transaction. Abort this priority restore and allow - * the transaction to run at the new desired priority. 
- */ - spin_unlock(&thread->prio_lock); - binder_debug(BINDER_DEBUG_PRIORITY_CAP, - "%d: %s: aborting priority restore\n", - thread->pid, __func__); - return; - } + desired.prio); /* Set the actual priority */ if (task->policy != policy || is_rt_policy(policy)) { @@ -761,46 +1227,37 @@ static void binder_do_set_priority(struct binder_thread *thread, } if (is_fair_policy(policy)) set_user_nice(task, priority); - - thread->prio_state = BINDER_PRIO_SET; - spin_unlock(&thread->prio_lock); } -static void binder_set_priority(struct binder_thread *thread, - const struct binder_priority *desired) +static void binder_set_priority(struct task_struct *task, + struct binder_priority desired) { - binder_do_set_priority(thread, desired, /* verify = */ true); + binder_do_set_priority(task, desired, /* verify = */ true); } -static void binder_restore_priority(struct binder_thread *thread, - const struct binder_priority *desired) +static void binder_restore_priority(struct task_struct *task, + struct binder_priority desired) { - binder_do_set_priority(thread, desired, /* verify = */ false); + binder_do_set_priority(task, desired, /* verify = */ false); } -static void binder_transaction_priority(struct binder_thread *thread, +static void binder_transaction_priority(struct task_struct *task, struct binder_transaction *t, - struct binder_node *node) + struct binder_priority node_prio, + bool inherit_rt) { - struct task_struct *task = thread->task; - struct binder_priority desired = t->priority; - const struct binder_priority node_prio = { - .sched_policy = node->sched_policy, - .prio = node->min_priority, - }; - bool skip = false; + struct binder_priority desired_prio = t->priority; if (t->set_priority_called) return; t->set_priority_called = true; + t->saved_priority.sched_policy = task->policy; + t->saved_priority.prio = task->normal_prio; - if (skip) - return; - - if (!node->inherit_rt && is_rt_policy(desired.sched_policy)) { - desired.prio = NICE_TO_PRIO(0); - desired.sched_policy 
= SCHED_NORMAL; + if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) { + desired_prio.prio = NICE_TO_PRIO(0); + desired_prio.sched_policy = SCHED_NORMAL; } if (node_prio.prio < t->priority.prio || @@ -813,29 +1270,10 @@ static void binder_transaction_priority(struct binder_thread *thread, * SCHED_FIFO, prefer SCHED_FIFO, since it can * run unbounded, unlike SCHED_RR. */ - desired = node_prio; + desired_prio = node_prio; } - spin_lock(&thread->prio_lock); - if (thread->prio_state == BINDER_PRIO_PENDING) { - /* - * Task is in the process of changing priorities - * saving its current values would be incorrect. - * Instead, save the pending priority and signal - * the task to abort the priority restore. - */ - t->saved_priority = thread->prio_next; - thread->prio_state = BINDER_PRIO_ABORT; - binder_debug(BINDER_DEBUG_PRIORITY_CAP, - "%d: saved pending priority %d\n", - current->pid, thread->prio_next.prio); - } else { - t->saved_priority.sched_policy = task->policy; - t->saved_priority.prio = task->normal_prio; - } - spin_unlock(&thread->prio_lock); - - binder_set_priority(thread, &desired); + binder_set_priority(task, desired_prio); } static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, @@ -942,9 +1380,9 @@ static struct binder_node *binder_init_node_ilocked( static struct binder_node *binder_new_node(struct binder_proc *proc, struct flat_binder_object *fp) { - struct binder_node *node, *new_node; + struct binder_node *node; + struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL); - new_node = kmem_cache_zalloc(binder_node_pool, GFP_KERNEL); if (!new_node) return NULL; binder_inner_proc_lock(proc); @@ -954,14 +1392,14 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, /* * The node was already added by another thread */ - kmem_cache_free(binder_node_pool, new_node); + kfree(new_node); return node; } static void binder_free_node(struct binder_node *node) { - kmem_cache_free(binder_node_pool, node); + 
kfree(node); binder_stats_deleted(BINDER_STAT_NODE); } @@ -979,7 +1417,8 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong, if (target_list == NULL && node->internal_strong_refs == 0 && !(node->proc && - node == node->proc->context->binder_context_mgr_node && + node == node->proc->context-> + binder_context_mgr_node && node->has_strong_ref)) { pr_err("invalid inc strong node for %d\n", node->debug_id); @@ -989,12 +1428,19 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong, } else node->local_strong_refs++; if (!node->has_strong_ref && target_list) { - struct binder_thread *thread = container_of(target_list, - struct binder_thread, todo); binder_dequeue_work_ilocked(&node->work); - BUG_ON(&thread->todo != target_list); - binder_enqueue_deferred_thread_work_ilocked(thread, - &node->work); + /* + * Note: this function is the only place where we queue + * directly to a thread->todo without using the + * corresponding binder_enqueue_thread_work() helper + * functions; in this case it's ok to not set the + * process_todo flag, since we know this node work will + * always be followed by other work that starts queue + * processing: in case of synchronous transactions, a + * BR_REPLY or BR_ERROR; in case of oneway + * transactions, a BR_TRANSACTION_COMPLETE. + */ + binder_enqueue_work_ilocked(&node->work, target_list); } } else { if (!internal) @@ -1148,14 +1594,10 @@ static void binder_dec_node_tmpref(struct binder_node *node) binder_node_inner_lock(node); if (!node->proc) spin_lock(&binder_dead_nodes_lock); - else - __acquire(&binder_dead_nodes_lock); node->tmp_refs--; BUG_ON(node->tmp_refs < 0); if (!node->proc) spin_unlock(&binder_dead_nodes_lock); - else - __release(&binder_dead_nodes_lock); /* * Call binder_dec_node() to check if all refcounts are 0 * and cleanup is needed. 
Calling with strong=0 and internal=1 @@ -1445,9 +1887,8 @@ static void binder_free_ref(struct binder_ref *ref) { if (ref->node) binder_free_node(ref->node); - if (ref->death) - kmem_cache_free(binder_ref_death_pool, ref->death); - kmem_cache_free(binder_ref_pool, ref); + kfree(ref->death); + kfree(ref); } /** @@ -1540,7 +1981,7 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, ref = binder_get_ref_for_node_olocked(proc, node, NULL); if (!ref) { binder_proc_unlock(proc); - new_ref = kmem_cache_zalloc(binder_ref_pool, GFP_KERNEL); + new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!new_ref) return -ENOMEM; binder_proc_lock(proc); @@ -1566,7 +2007,7 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, * Another thread created the ref first so * free the one we allocated */ - kmem_cache_free(binder_ref_pool, new_ref); + kfree(new_ref); return ret; } @@ -1625,9 +2066,9 @@ static void binder_thread_dec_tmpref(struct binder_thread *thread) static void binder_proc_dec_tmpref(struct binder_proc *proc) { binder_inner_proc_lock(proc); - proc->tmp_ref--; + atomic_dec(&proc->tmp_ref); if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && - !proc->tmp_ref) { + !atomic_read(&proc->tmp_ref)) { binder_inner_proc_unlock(proc); binder_free_proc(proc); return; @@ -1671,89 +2112,45 @@ static struct binder_thread *binder_get_txn_from( */ static struct binder_thread *binder_get_txn_from_and_acq_inner( struct binder_transaction *t) - __acquires(&t->from->proc->inner_lock) { struct binder_thread *from; from = binder_get_txn_from(t); - if (!from) { - __acquire(&from->proc->inner_lock); + if (!from) return NULL; - } binder_inner_proc_lock(from->proc); if (t->from) { BUG_ON(from != t->from); return from; } binder_inner_proc_unlock(from->proc); - __acquire(&from->proc->inner_lock); binder_thread_dec_tmpref(from); return NULL; } -/** - * binder_free_txn_fixups() - free unprocessed fd fixups - * @t: binder transaction for t->from - * - * If the transaction is being torn 
down prior to being - * processed by the target process, free all of the - * fd fixups and fput the file structs. It is safe to - * call this function after the fixups have been - * processed -- in that case, the list will be empty. - */ -static void binder_free_txn_fixups(struct binder_transaction *t) -{ - struct binder_txn_fd_fixup *fixup, *tmp; - - list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { - fput(fixup->file); - list_del(&fixup->fixup_entry); - kfree(fixup); - } -} - -static void binder_txn_latency_free(struct binder_transaction *t) -{ - int from_proc, from_thread, to_proc, to_thread; - - spin_lock(&t->lock); - from_proc = t->from ? t->from->proc->pid : 0; - from_thread = t->from ? t->from->pid : 0; - to_proc = t->to_proc ? t->to_proc->pid : 0; - to_thread = t->to_thread ? t->to_thread->pid : 0; - spin_unlock(&t->lock); - - trace_binder_txn_latency_free(t, from_proc, from_thread, to_proc, to_thread); -} - static void binder_free_transaction(struct binder_transaction *t) { - struct binder_proc *target_proc = t->to_proc; + struct binder_proc *target_proc; + spin_lock(&t->lock); + target_proc = t->to_proc; if (target_proc) { + atomic_inc(&target_proc->tmp_ref); + spin_unlock(&t->lock); + binder_inner_proc_lock(target_proc); - target_proc->outstanding_txns--; - if (target_proc->outstanding_txns < 0) - pr_warn("%s: Unexpected outstanding_txns %d\n", - __func__, target_proc->outstanding_txns); - if (!target_proc->outstanding_txns && target_proc->is_frozen) - wake_up_interruptible_all(&target_proc->freeze_wait); if (t->buffer) t->buffer->transaction = NULL; binder_inner_proc_unlock(target_proc); + binder_proc_dec_tmpref(target_proc); + } else { + /* + * If the transaction has no target_proc, then + * t->buffer->transaction * has already been cleared. 
+ */ + spin_unlock(&t->lock); } - if (trace_binder_txn_latency_free_enabled()) - binder_txn_latency_free(t); - /* - * If the transaction has no target_proc, then - * t->buffer->transaction has already been cleared. - */ - binder_free_txn_fixups(t); - /* - * If the transaction has no target_proc, then - * t->buffer->transaction has already been cleared. - */ - kmem_cache_free(binder_transaction_pool, t); + kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); } @@ -1795,7 +2192,6 @@ static void binder_send_failed_reply(struct binder_transaction *t, binder_free_transaction(t); return; } - __release(&target_thread->proc->inner_lock); next = t->from_parent; binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, @@ -1838,21 +2234,15 @@ static void binder_cleanup_transaction(struct binder_transaction *t, /** * binder_get_object() - gets object and checks for valid metadata * @proc: binder_proc owning the buffer - * @u: sender's user pointer to base of buffer * @buffer: binder_buffer that we're parsing. * @offset: offset in the @buffer at which to validate an object. * @object: struct binder_object to read into * - * Copy the binder object at the given offset into @object. If @u is - * provided then the copy is from the sender's buffer. If not, then - * it is copied from the target's @buffer. - * - * Return: If there's a valid metadata object at @offset, the + * Return: If there's a valid metadata object at @offset in @buffer, the * size of that object. Otherwise, it returns zero. The object * is read into the struct binder_object pointed to by @object. 
*/ static size_t binder_get_object(struct binder_proc *proc, - const void __user *u, struct binder_buffer *buffer, unsigned long offset, struct binder_object *object) @@ -1862,16 +2252,11 @@ static size_t binder_get_object(struct binder_proc *proc, size_t object_size = 0; read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); - if (offset > buffer->data_size || read_size < sizeof(*hdr)) + if (offset > buffer->data_size || read_size < sizeof(*hdr) || + !IS_ALIGNED(offset, sizeof(u32))) return 0; - if (u) { - if (copy_from_user(object, u + offset, read_size)) - return 0; - } else { - if (binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, - offset, read_size)) - return 0; - } + binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, + offset, read_size); /* Ok, now see if we read a complete object. */ hdr = &object->hdr; @@ -1940,11 +2325,9 @@ static struct binder_buffer_object *binder_validate_ptr( return NULL; buffer_offset = start_offset + sizeof(binder_size_t) * index; - if (binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, - b, buffer_offset, - sizeof(object_offset))) - return NULL; - object_size = binder_get_object(proc, NULL, b, object_offset, object); + binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, + b, buffer_offset, sizeof(object_offset)); + object_size = binder_get_object(proc, b, object_offset, object); if (!object_size || object->hdr.type != BINDER_TYPE_PTR) return NULL; if (object_offsetp) @@ -2009,8 +2392,7 @@ static bool binder_validate_fixup(struct binder_proc *proc, unsigned long buffer_offset; struct binder_object last_object; struct binder_buffer_object *last_bbo; - size_t object_size = binder_get_object(proc, NULL, b, - last_obj_offset, + size_t object_size = binder_get_object(proc, b, last_obj_offset, &last_object); if (object_size != sizeof(*last_bbo)) return false; @@ -2024,78 +2406,15 @@ static bool binder_validate_fixup(struct binder_proc *proc, return false; last_min_offset = 
last_bbo->parent_offset + sizeof(uintptr_t); buffer_offset = objects_start_offset + - sizeof(binder_size_t) * last_bbo->parent; - if (binder_alloc_copy_from_buffer(&proc->alloc, - &last_obj_offset, - b, buffer_offset, - sizeof(last_obj_offset))) - return false; + sizeof(binder_size_t) * last_bbo->parent, + binder_alloc_copy_from_buffer(&proc->alloc, &last_obj_offset, + b, buffer_offset, + sizeof(last_obj_offset)); } return (fixup_offset >= last_min_offset); } -/** - * struct binder_task_work_cb - for deferred close - * - * @twork: callback_head for task work - * @fd: fd to close - * - * Structure to pass task work to be handled after - * returning from binder_ioctl() via task_work_add(). - */ -struct binder_task_work_cb { - struct callback_head twork; - struct file *file; -}; - -/** - * binder_do_fd_close() - close list of file descriptors - * @twork: callback head for task work - * - * It is not safe to call ksys_close() during the binder_ioctl() - * function if there is a chance that binder's own file descriptor - * might be closed. This is to meet the requirements for using - * fdget() (see comments for __fget_light()). Therefore use - * task_work_add() to schedule the close operation once we have - * returned from binder_ioctl(). This function is a callback - * for that mechanism and does the actual ksys_close() on the - * given file descriptor. - */ -static void binder_do_fd_close(struct callback_head *twork) -{ - struct binder_task_work_cb *twcb = container_of(twork, - struct binder_task_work_cb, twork); - - fput(twcb->file); - kfree(twcb); -} - -/** - * binder_deferred_fd_close() - schedule a close for the given file-descriptor - * @fd: file-descriptor to close - * - * See comments in binder_do_fd_close(). This function is used to schedule - * a file-descriptor to be closed after returning from binder_ioctl(). 
- */ -static void binder_deferred_fd_close(int fd) -{ - struct binder_task_work_cb *twcb; - - twcb = kzalloc(sizeof(*twcb), GFP_KERNEL); - if (!twcb) - return; - init_task_work(&twcb->twork, binder_do_fd_close); - close_fd_get_file(fd, &twcb->file); - if (twcb->file) { - filp_close(twcb->file, current->files); - task_work_add(current, &twcb->twork, true); - } else { - kfree(twcb); - } -} - static void binder_transaction_buffer_release(struct binder_proc *proc, - struct binder_thread *thread, struct binder_buffer *buffer, binder_size_t failed_at, bool is_failure) @@ -2113,20 +2432,20 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); - off_end_offset = is_failure && failed_at ? failed_at : + off_end_offset = is_failure ? failed_at : off_start_offset + buffer->offsets_size; for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; - size_t object_size = 0; + size_t object_size; struct binder_object object; binder_size_t object_offset; - if (!binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, - buffer, buffer_offset, - sizeof(object_offset))) - object_size = binder_get_object(proc, NULL, buffer, - object_offset, &object); + binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, + buffer, buffer_offset, + sizeof(object_offset)); + object_size = binder_get_object(proc, buffer, + object_offset, &object); if (object_size == 0) { pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)object_offset, buffer->data_size); @@ -2174,15 +2493,12 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } break; case BINDER_TYPE_FD: { - /* - * No need to close the file here since user-space - * closes it for for successfully delivered - * transactions. 
For transactions that weren't - * delivered, the new fd was never allocated so - * there is no need to close and the fput on the - * file is done when the transaction is torn - * down. - */ + struct binder_fd_object *fp = to_binder_fd_object(hdr); + + binder_debug(BINDER_DEBUG_TRANSACTION, + " fd %d\n", fp->fd); + if (failed_at) + task_close_fd(proc, fp->fd); } break; case BINDER_TYPE_PTR: /* @@ -2199,14 +2515,6 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_size_t fd_buf_size; binder_size_t num_valid; - if (is_failure) { - /* - * The fd fixups have not been applied so no - * fds need to be closed. - */ - continue; - } - num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); fda = to_binder_fd_array_object(hdr); @@ -2216,7 +2524,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, NULL, num_valid); if (!parent) { - pr_err("transaction release %d bad parent offset\n", + pr_err("transaction release %d bad parent offset", debug_id); continue; } @@ -2246,24 +2554,15 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, for (fd_index = 0; fd_index < fda->num_fds; fd_index++) { u32 fd; - int err; binder_size_t offset = fda_offset + fd_index * sizeof(fd); - err = binder_alloc_copy_from_buffer( - &proc->alloc, &fd, buffer, - offset, sizeof(fd)); - WARN_ON(err); - if (!err) { - binder_deferred_fd_close(fd); - /* - * Need to make sure the thread goes - * back to userspace to complete the - * deferred close - */ - if (thread) - thread->looper_need_return = true; - } + binder_alloc_copy_from_buffer(&proc->alloc, + &fd, + buffer, + offset, + sizeof(fd)); + task_close_fd(proc, fd); } } break; default: @@ -2359,15 +2658,11 @@ static int binder_translate_handle(struct flat_binder_object *fp, fp->cookie = node->cookie; if (node->proc) binder_inner_proc_lock(node->proc); - else - __acquire(&node->proc->inner_lock); binder_inc_node_nilocked(node, fp->hdr.type == BINDER_TYPE_BINDER, 0, 
NULL); if (node->proc) binder_inner_proc_unlock(node->proc); - else - __release(&node->proc->inner_lock); trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", @@ -2400,16 +2695,16 @@ static int binder_translate_handle(struct flat_binder_object *fp, return ret; } -static int binder_translate_fd(u32 fd, binder_size_t fd_offset, +static int binder_translate_fd(int fd, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; - struct binder_txn_fd_fixup *fixup; + int target_fd; struct file *file; - int ret = 0; + int ret; bool target_allows_fd; if (in_reply_to) @@ -2438,24 +2733,19 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset, goto err_security; } - /* - * Add fixup record for this transaction. The allocation - * of the fd in the target needs to be done from a - * target thread. 
- */ - fixup = kzalloc(sizeof(*fixup), GFP_KERNEL); - if (!fixup) { + target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); + if (target_fd < 0) { ret = -ENOMEM; - goto err_alloc; + goto err_get_unused_fd; } - fixup->file = file; - fixup->offset = fd_offset; - trace_binder_transaction_fd_send(t, fd, fixup->offset); - list_add_tail(&fixup->fixup_entry, &t->fd_fixups); + task_fd_install(target_proc, target_fd, file); + trace_binder_transaction_fd(t, fd, target_fd); + binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n", + fd, target_fd); - return ret; + return target_fd; -err_alloc: +err_get_unused_fd: err_security: fput(file); err_fget: @@ -2463,266 +2753,17 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset, return ret; } -/** - * struct binder_ptr_fixup - data to be fixed-up in target buffer - * @offset offset in target buffer to fixup - * @skip_size bytes to skip in copy (fixup will be written later) - * @fixup_data data to write at fixup offset - * @node list node - * - * This is used for the pointer fixup list (pf) which is created and consumed - * during binder_transaction() and is only accessed locally. No - * locking is necessary. - * - * The list is ordered by @offset. - */ -struct binder_ptr_fixup { - binder_size_t offset; - size_t skip_size; - binder_uintptr_t fixup_data; - struct list_head node; -}; - -/** - * struct binder_sg_copy - scatter-gather data to be copied - * @offset offset in target buffer - * @sender_uaddr user address in source buffer - * @length bytes to copy - * @node list node - * - * This is used for the sg copy list (sgc) which is created and consumed - * during binder_transaction() and is only accessed locally. No - * locking is necessary. - * - * The list is ordered by @offset. 
- */ -struct binder_sg_copy { - binder_size_t offset; - const void __user *sender_uaddr; - size_t length; - struct list_head node; -}; - -/** - * binder_do_deferred_txn_copies() - copy and fixup scatter-gather data - * @alloc: binder_alloc associated with @buffer - * @buffer: binder buffer in target process - * @sgc_head: list_head of scatter-gather copy list - * @pf_head: list_head of pointer fixup list - * - * Processes all elements of @sgc_head, applying fixups from @pf_head - * and copying the scatter-gather data from the source process' user - * buffer to the target's buffer. It is expected that the list creation - * and processing all occurs during binder_transaction() so these lists - * are only accessed in local context. - * - * Return: 0=success, else -errno - */ -static int binder_do_deferred_txn_copies(struct binder_alloc *alloc, - struct binder_buffer *buffer, - struct list_head *sgc_head, - struct list_head *pf_head) -{ - int ret = 0; - struct binder_sg_copy *sgc, *tmpsgc; - struct binder_ptr_fixup *tmppf; - struct binder_ptr_fixup *pf = - list_first_entry_or_null(pf_head, struct binder_ptr_fixup, - node); - - list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { - size_t bytes_copied = 0; - - while (bytes_copied < sgc->length) { - size_t copy_size; - size_t bytes_left = sgc->length - bytes_copied; - size_t offset = sgc->offset + bytes_copied; - - /* - * We copy up to the fixup (pointed to by pf) - */ - copy_size = pf ? min(bytes_left, (size_t)pf->offset - offset) - : bytes_left; - if (!ret && copy_size) - ret = binder_alloc_copy_user_to_buffer( - alloc, buffer, - offset, - sgc->sender_uaddr + bytes_copied, - copy_size); - bytes_copied += copy_size; - if (copy_size != bytes_left) { - BUG_ON(!pf); - /* we stopped at a fixup offset */ - if (pf->skip_size) { - /* - * we are just skipping. This is for - * BINDER_TYPE_FDA where the translated - * fds will be fixed up when we get - * to target context. 
- */ - bytes_copied += pf->skip_size; - } else { - /* apply the fixup indicated by pf */ - if (!ret) - ret = binder_alloc_copy_to_buffer( - alloc, buffer, - pf->offset, - &pf->fixup_data, - sizeof(pf->fixup_data)); - bytes_copied += sizeof(pf->fixup_data); - } - list_del(&pf->node); - kfree(pf); - pf = list_first_entry_or_null(pf_head, - struct binder_ptr_fixup, node); - } - } - list_del(&sgc->node); - kfree(sgc); - } - list_for_each_entry_safe(pf, tmppf, pf_head, node) { - BUG_ON(pf->skip_size == 0); - list_del(&pf->node); - kfree(pf); - } - BUG_ON(!list_empty(sgc_head)); - - return ret > 0 ? -EINVAL : ret; -} - -/** - * binder_cleanup_deferred_txn_lists() - free specified lists - * @sgc_head: list_head of scatter-gather copy list - * @pf_head: list_head of pointer fixup list - * - * Called to clean up @sgc_head and @pf_head if there is an - * error. - */ -static void binder_cleanup_deferred_txn_lists(struct list_head *sgc_head, - struct list_head *pf_head) -{ - struct binder_sg_copy *sgc, *tmpsgc; - struct binder_ptr_fixup *pf, *tmppf; - - list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { - list_del(&sgc->node); - kfree(sgc); - } - list_for_each_entry_safe(pf, tmppf, pf_head, node) { - list_del(&pf->node); - kfree(pf); - } -} - -/** - * binder_defer_copy() - queue a scatter-gather buffer for copy - * @sgc_head: list_head of scatter-gather copy list - * @offset: binder buffer offset in target process - * @sender_uaddr: user address in source process - * @length: bytes to copy - * - * Specify a scatter-gather block to be copied. The actual copy must - * be deferred until all the needed fixups are identified and queued. - * Then the copy and fixups are done together so un-translated values - * from the source are never visible in the target buffer. - * - * We are guaranteed that repeated calls to this function will have - * monotonically increasing @offset values so the list will naturally - * be ordered. 
- * - * Return: 0=success, else -errno - */ -static int binder_defer_copy(struct list_head *sgc_head, binder_size_t offset, - const void __user *sender_uaddr, size_t length) -{ - struct binder_sg_copy *bc = kzalloc(sizeof(*bc), GFP_KERNEL); - - if (!bc) - return -ENOMEM; - - bc->offset = offset; - bc->sender_uaddr = sender_uaddr; - bc->length = length; - INIT_LIST_HEAD(&bc->node); - - /* - * We are guaranteed that the deferred copies are in-order - * so just add to the tail. - */ - list_add_tail(&bc->node, sgc_head); - - return 0; -} - -/** - * binder_add_fixup() - queue a fixup to be applied to sg copy - * @pf_head: list_head of binder ptr fixup list - * @offset: binder buffer offset in target process - * @fixup: bytes to be copied for fixup - * @skip_size: bytes to skip when copying (fixup will be applied later) - * - * Add the specified fixup to a list ordered by @offset. When copying - * the scatter-gather buffers, the fixup will be copied instead of - * data from the source buffer. For BINDER_TYPE_FDA fixups, the fixup - * will be applied later (in target process context), so we just skip - * the bytes specified by @skip_size. If @skip_size is 0, we copy the - * value in @fixup. - * - * This function is called *mostly* in @offset order, but there are - * exceptions. Since out-of-order inserts are relatively uncommon, - * we insert the new element by searching backward from the tail of - * the list. - * - * Return: 0=success, else -errno - */ -static int binder_add_fixup(struct list_head *pf_head, binder_size_t offset, - binder_uintptr_t fixup, size_t skip_size) -{ - struct binder_ptr_fixup *pf = kzalloc(sizeof(*pf), GFP_KERNEL); - struct binder_ptr_fixup *tmppf; - - if (!pf) - return -ENOMEM; - - pf->offset = offset; - pf->fixup_data = fixup; - pf->skip_size = skip_size; - INIT_LIST_HEAD(&pf->node); - - /* Fixups are *mostly* added in-order, but there are some - * exceptions. Look backwards through list for insertion point. 
- */ - list_for_each_entry_reverse(tmppf, pf_head, node) { - if (tmppf->offset < pf->offset) { - list_add(&pf->node, &tmppf->node); - return 0; - } - } - /* - * if we get here, then the new offset is the lowest so - * insert at the head - */ - list_add(&pf->node, pf_head); - return 0; -} - -static int binder_translate_fd_array(struct list_head *pf_head, - struct binder_fd_array_object *fda, - const void __user *sender_ubuffer, +static int binder_translate_fd_array(struct binder_fd_array_object *fda, struct binder_buffer_object *parent, - struct binder_buffer_object *sender_uparent, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { - binder_size_t fdi, fd_buf_size; + binder_size_t fdi, fd_buf_size, num_installed_fds; binder_size_t fda_offset; - const void __user *sender_ufda_base; + int target_fd; struct binder_proc *proc = thread->proc; - int ret; - - if (fda->num_fds == 0) - return 0; + struct binder_proc *target_proc = t->to_proc; fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { @@ -2746,36 +2787,46 @@ static int binder_translate_fd_array(struct list_head *pf_head, */ fda_offset = (parent->buffer - (uintptr_t)t->buffer->user_data) + fda->parent_offset; - sender_ufda_base = (void __user *)(uintptr_t)sender_uparent->buffer + - fda->parent_offset; - - if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32)) || - !IS_ALIGNED((unsigned long)sender_ufda_base, sizeof(u32))) { + if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); return -EINVAL; } - ret = binder_add_fixup(pf_head, fda_offset, 0, fda->num_fds * sizeof(u32)); - if (ret) - return ret; - for (fdi = 0; fdi < fda->num_fds; fdi++) { u32 fd; + binder_size_t offset = fda_offset + fdi * sizeof(fd); - binder_size_t sender_uoffset = fdi * sizeof(fd); - ret = copy_from_user(&fd, sender_ufda_base + sender_uoffset, sizeof(fd)); - if 
(!ret) - ret = binder_translate_fd(fd, offset, t, thread, - in_reply_to); - if (ret) - return ret > 0 ? -EINVAL : ret; + binder_alloc_copy_from_buffer(&target_proc->alloc, + &fd, t->buffer, + offset, sizeof(fd)); + target_fd = binder_translate_fd(fd, t, thread, in_reply_to); + if (target_fd < 0) + goto err_translate_fd_failed; + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, offset, + &target_fd, sizeof(fd)); } return 0; + +err_translate_fd_failed: + /* + * Failed to allocate fd or security error, free fds + * installed so far. + */ + num_installed_fds = fdi; + for (fdi = 0; fdi < num_installed_fds; fdi++) { + u32 fd; + binder_size_t offset = fda_offset + fdi * sizeof(fd); + binder_alloc_copy_from_buffer(&target_proc->alloc, + &fd, t->buffer, + offset, sizeof(fd)); + task_close_fd(target_proc, fd); + } + return target_fd; } -static int binder_fixup_parent(struct list_head *pf_head, - struct binder_transaction *t, +static int binder_fixup_parent(struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, binder_size_t off_start_offset, @@ -2821,57 +2872,10 @@ static int binder_fixup_parent(struct list_head *pf_head, } buffer_offset = bp->parent_offset + (uintptr_t)parent->buffer - (uintptr_t)b->user_data; - return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0); -} - -/** - * binder_can_update_transaction() - Can a txn be superseded by an updated one? 
- * @t1: the pending async txn in the frozen process - * @t2: the new async txn to supersede the outdated pending one - * - * Return: true if t2 can supersede t1 - * false if t2 can not supersede t1 - */ -static bool binder_can_update_transaction(struct binder_transaction *t1, - struct binder_transaction *t2) -{ - if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) != - (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc) - return false; - if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code && - t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid && - t1->buffer->target_node->ptr == t2->buffer->target_node->ptr && - t1->buffer->target_node->cookie == t2->buffer->target_node->cookie) - return true; - return false; -} - -/** - * binder_find_outdated_transaction_ilocked() - Find the outdated transaction - * @t: new async transaction - * @target_list: list to find outdated transaction - * - * Return: the outdated transaction if found - * NULL if no outdated transacton can be found - * - * Requires the proc->inner_lock to be held. - */ -static struct binder_transaction * -binder_find_outdated_transaction_ilocked(struct binder_transaction *t, - struct list_head *target_list) -{ - struct binder_work *w; + binder_alloc_copy_to_buffer(&target_proc->alloc, b, buffer_offset, + &bp->buffer, sizeof(bp->buffer)); - list_for_each_entry(w, target_list, entry) { - struct binder_transaction *t_queued; - - if (w->type != BINDER_WORK_TRANSACTION) - continue; - t_queued = container_of(w, struct binder_transaction, work); - if (binder_can_update_transaction(t_queued, t)) - return t_queued; - } - return NULL; + return 0; } /** @@ -2888,91 +2892,60 @@ binder_find_outdated_transaction_ilocked(struct binder_transaction *t, * If the @thread parameter is not NULL, the transaction is always queued * to the waitlist of that specific thread. 
* - * Return: 0 if the transaction was successfully queued - * BR_DEAD_REPLY if the target process or thread is dead - * BR_FROZEN_REPLY if the target process or thread is frozen + * Return: true if the transactions was successfully queued + * false if the target process or thread is dead */ -static int binder_proc_transaction(struct binder_transaction *t, +static bool binder_proc_transaction(struct binder_transaction *t, struct binder_proc *proc, struct binder_thread *thread) { struct binder_node *node = t->buffer->target_node; + struct binder_priority node_prio; bool oneway = !!(t->flags & TF_ONE_WAY); bool pending_async = false; - bool skip = false; - struct binder_transaction *t_outdated = NULL; BUG_ON(!node); binder_node_lock(node); + node_prio.prio = node->min_priority; + node_prio.sched_policy = node->sched_policy; if (oneway) { BUG_ON(thread); - if (node->has_async_transaction) + if (node->has_async_transaction) { pending_async = true; - else + } else { node->has_async_transaction = true; + } } binder_inner_proc_lock(proc); - if (proc->is_frozen) { - proc->sync_recv |= !oneway; - proc->async_recv |= oneway; - } - if ((proc->is_frozen && !oneway) || proc->is_dead || - (thread && thread->is_dead)) { + if (proc->is_dead || (thread && thread->is_dead)) { binder_inner_proc_unlock(proc); binder_node_unlock(node); - return proc->is_frozen ? 
BR_FROZEN_REPLY : BR_DEAD_REPLY; + return false; } - if (!thread && !pending_async && !skip) + if (!thread && !pending_async) thread = binder_select_thread_ilocked(proc); if (thread) { - binder_transaction_priority(thread, t, node); + binder_transaction_priority(thread->task, t, node_prio, + node->inherit_rt); binder_enqueue_thread_work_ilocked(thread, &t->work); } else if (!pending_async) { binder_enqueue_work_ilocked(&t->work, &proc->todo); } else { - if ((t->flags & TF_UPDATE_TXN) && proc->is_frozen) { - t_outdated = binder_find_outdated_transaction_ilocked(t, - &node->async_todo); - if (t_outdated) { - binder_debug(BINDER_DEBUG_TRANSACTION, - "txn %d supersedes %d\n", - t->debug_id, t_outdated->debug_id); - list_del_init(&t_outdated->work.entry); - proc->outstanding_txns--; - } - } binder_enqueue_work_ilocked(&t->work, &node->async_todo); } if (!pending_async) binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); - proc->outstanding_txns++; binder_inner_proc_unlock(proc); binder_node_unlock(node); - /* - * To reduce potential contention, free the outdated transaction and - * buffer after releasing the locks. 
- */ - if (t_outdated) { - struct binder_buffer *buffer = t_outdated->buffer; - - t_outdated->buffer = NULL; - buffer->transaction = NULL; - trace_binder_transaction_update_buffer_release(buffer); - binder_transaction_buffer_release(proc, NULL, buffer, 0, 0); - binder_alloc_free_buf(&proc->alloc, buffer); - kfree(t_outdated); - binder_stats_deleted(BINDER_STAT_TRANSACTION); - } - - return 0; + return true; } /** @@ -3008,7 +2981,7 @@ static struct binder_node *binder_get_node_refs_for_txn( target_node = node; binder_inc_node_nilocked(node, 1, 0, NULL); binder_inc_node_tmpref_ilocked(node); - node->proc->tmp_ref++; + atomic_inc(&node->proc->tmp_ref); *procp = node->proc; } else *error = BR_DEAD_REPLY; @@ -3024,13 +2997,11 @@ static void binder_transaction(struct binder_proc *proc, { int ret; struct binder_transaction *t; - struct binder_work *w; struct binder_work *tcomplete; binder_size_t buffer_offset = 0; binder_size_t off_start_offset, off_end_offset; binder_size_t off_min; binder_size_t sg_buf_offset, sg_buf_end_offset; - binder_size_t user_offset = 0; struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; @@ -3045,13 +3016,6 @@ static void binder_transaction(struct binder_proc *proc, int t_debug_id = atomic_inc_return(&binder_last_id); char *secctx = NULL; u32 secctx_sz = 0; - bool is_nested = false; - struct list_head sgc_head; - struct list_head pf_head; - const void __user *user_buffer = (const void __user *) - (uintptr_t)tr->data.ptr.buffer; - INIT_LIST_HEAD(&sgc_head); - INIT_LIST_HEAD(&pf_head); e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; @@ -3061,7 +3025,7 @@ static void binder_transaction(struct binder_proc *proc, e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; - strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME); + e->context_name = proc->context->name; if (reply) { 
binder_inner_proc_lock(proc); @@ -3095,8 +3059,6 @@ static void binder_transaction(struct binder_proc *proc, binder_inner_proc_unlock(proc); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { - /* annotation for sparse */ - __release(&target_thread->proc->inner_lock); return_error = BR_DEAD_REPLY; return_error_line = __LINE__; goto err_dead_binder; @@ -3116,7 +3078,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_dead_binder; } target_proc = target_thread->proc; - target_proc->tmp_ref++; + atomic_inc(&target_proc->tmp_ref); binder_inner_proc_unlock(target_thread->proc); } else { if (tr->target.handle) { @@ -3137,8 +3099,8 @@ static void binder_transaction(struct binder_proc *proc, ref->node, &target_proc, &return_error); } else { - binder_user_error("%d:%d got transaction to invalid handle, %u\n", - proc->pid, thread->pid, tr->target.handle); + binder_user_error("%d:%d got transaction to invalid handle\n", + proc->pid, thread->pid); return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); @@ -3152,7 +3114,7 @@ static void binder_transaction(struct binder_proc *proc, else return_error = BR_DEAD_REPLY; mutex_unlock(&context->context_mgr_node_lock); - if (target_node && target_proc->pid == proc->pid) { + if (target_node && target_proc == proc) { binder_user_error("%d:%d got transaction to context manager from process owning it\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; @@ -3184,29 +3146,6 @@ static void binder_transaction(struct binder_proc *proc, goto err_invalid_target_handle; } binder_inner_proc_lock(proc); - - w = list_first_entry_or_null(&thread->todo, - struct binder_work, entry); - if (!(tr->flags & TF_ONE_WAY) && w && - w->type == BINDER_WORK_TRANSACTION) { - /* - * Do not allow new outgoing transaction from a - * thread that has a transaction at the head of - * its todo list. 
Only need to check the head - * because binder_select_thread_ilocked picks a - * thread from proc->waiting_threads to enqueue - * the transaction, and nothing is queued to the - * todo list while the thread is on waiting_threads. - */ - binder_user_error("%d:%d new transaction not allowed when there is a transaction on thread todo\n", - proc->pid, thread->pid); - binder_inner_proc_unlock(proc); - return_error = BR_FAILED_REPLY; - return_error_param = -EPROTO; - return_error_line = __LINE__; - goto err_bad_todo_list; - } - if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { struct binder_transaction *tmp; @@ -3234,7 +3173,6 @@ static void binder_transaction(struct binder_proc *proc, atomic_inc(&from->tmp_ref); target_thread = from; spin_unlock(&tmp->lock); - is_nested = true; break; } spin_unlock(&tmp->lock); @@ -3248,18 +3186,17 @@ static void binder_transaction(struct binder_proc *proc, e->to_proc = target_proc->pid; /* TODO: reuse incoming transaction for reply */ - t = kmem_cache_zalloc(binder_transaction_pool, GFP_KERNEL); + t = kzalloc(sizeof(*t), GFP_KERNEL); if (t == NULL) { return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_t_failed; } - INIT_LIST_HEAD(&t->fd_fixups); binder_stats_created(BINDER_STAT_TRANSACTION); spin_lock_init(&t->lock); - tcomplete = kmem_cache_zalloc(binder_work_pool, GFP_KERNEL); + tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; @@ -3298,7 +3235,6 @@ static void binder_transaction(struct binder_proc *proc, t->to_thread = target_thread; t->code = tr->code; t->flags = tr->flags; - t->is_nested = is_nested; if (!(t->flags & TF_ONE_WAY) && binder_supported_policy(current->policy)) { /* Inherit supported policies for synchronous transactions */ @@ -3326,7 +3262,7 @@ static void binder_transaction(struct binder_proc *proc, if (extra_buffers_size < added_size) { /* integer overflow of 
extra_buffers_size */ return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; + return_error_param = EINVAL; return_error_line = __LINE__; goto err_bad_extra_size; } @@ -3349,20 +3285,15 @@ static void binder_transaction(struct binder_proc *proc, goto err_binder_alloc_buf_failed; } if (secctx) { - int err; size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) + ALIGN(tr->offsets_size, sizeof(void *)) + ALIGN(extra_buffers_size, sizeof(void *)) - ALIGN(secctx_sz, sizeof(u64)); t->security_ctx = (uintptr_t)t->buffer->user_data + buf_offset; - err = binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, buf_offset, - secctx, secctx_sz); - if (err) { - t->security_ctx = 0; - WARN_ON(1); - } + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, buf_offset, + secctx, secctx_sz); security_release_secctx(secctx, secctx_sz); secctx = NULL; } @@ -3372,6 +3303,19 @@ static void binder_transaction(struct binder_proc *proc, t->buffer->clear_on_free = !!(t->flags & TF_CLEAR_BUF); trace_binder_transaction_alloc_buf(t->buffer); + if (binder_alloc_copy_user_to_buffer( + &target_proc->alloc, + t->buffer, 0, + (const void __user *) + (uintptr_t)tr->data.ptr.buffer, + tr->data_size)) { + binder_user_error("%d:%d got transaction with invalid data ptr\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + return_error_param = -EFAULT; + return_error_line = __LINE__; + goto err_copy_data_failed; + } if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, @@ -3416,39 +3360,14 @@ static void binder_transaction(struct binder_proc *proc, size_t object_size; struct binder_object object; binder_size_t object_offset; - binder_size_t copy_size; - - if (binder_alloc_copy_from_buffer(&target_proc->alloc, - &object_offset, - t->buffer, - buffer_offset, - sizeof(object_offset))) { - return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; - return_error_line = __LINE__; - goto err_bad_offset; - } - /* - * Copy the source user buffer up to 
the next object - * that will be processed. - */ - copy_size = object_offset - user_offset; - if (copy_size && (user_offset > object_offset || - binder_alloc_copy_user_to_buffer( - &target_proc->alloc, - t->buffer, user_offset, - user_buffer + user_offset, - copy_size))) { - binder_user_error("%d:%d got transaction with invalid data ptr\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = -EFAULT; - return_error_line = __LINE__; - goto err_copy_data_failed; - } - object_size = binder_get_object(target_proc, user_buffer, - t->buffer, object_offset, &object); + binder_alloc_copy_from_buffer(&target_proc->alloc, + &object_offset, + t->buffer, + buffer_offset, + sizeof(object_offset)); + object_size = binder_get_object(target_proc, t->buffer, + object_offset, &object); if (object_size == 0 || object_offset < off_min) { binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, @@ -3460,11 +3379,6 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_bad_offset; } - /* - * Set offset to the next buffer fragment to be - * copied - */ - user_offset = object_offset + object_size; hdr = &object.hdr; off_min = object_offset + object_size; @@ -3475,17 +3389,15 @@ static void binder_transaction(struct binder_proc *proc, fp = to_flat_binder_object(hdr); ret = binder_translate_binder(fp, t, thread); - - if (ret < 0 || - binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - fp, sizeof(*fp))) { + if (ret < 0) { return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + fp, sizeof(*fp)); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { @@ -3493,42 +3405,37 @@ static void binder_transaction(struct binder_proc *proc, fp = to_flat_binder_object(hdr); ret = 
binder_translate_handle(fp, t, thread); - if (ret < 0 || - binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - fp, sizeof(*fp))) { + if (ret < 0) { return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + fp, sizeof(*fp)); } break; case BINDER_TYPE_FD: { struct binder_fd_object *fp = to_binder_fd_object(hdr); - binder_size_t fd_offset = object_offset + - (uintptr_t)&fp->fd - (uintptr_t)fp; - int ret = binder_translate_fd(fp->fd, fd_offset, t, - thread, in_reply_to); + int target_fd = binder_translate_fd(fp->fd, t, thread, + in_reply_to); - fp->pad_binder = 0; - if (ret < 0 || - binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - fp, sizeof(*fp))) { + if (target_fd < 0) { return_error = BR_FAILED_REPLY; - return_error_param = ret; + return_error_param = target_fd; return_error_line = __LINE__; goto err_translate_failed; } + fp->pad_binder = 0; + fp->fd = target_fd; + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + fp, sizeof(*fp)); } break; case BINDER_TYPE_FDA: { struct binder_object ptr_object; binder_size_t parent_offset; - struct binder_object user_object; - size_t user_parent_size; struct binder_fd_array_object *fda = to_binder_fd_array_object(hdr); size_t num_valid = (buffer_offset - off_start_offset) / @@ -3560,35 +3467,11 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_bad_parent; } - /* - * We need to read the user version of the parent - * object to get the original user offset - */ - user_parent_size = - binder_get_object(proc, user_buffer, t->buffer, - parent_offset, &user_object); - if (user_parent_size != sizeof(user_object.bbo)) { - binder_user_error("%d:%d invalid ptr object size: %zd vs %zd\n", - proc->pid, thread->pid, - user_parent_size, - sizeof(user_object.bbo)); + ret 
= binder_translate_fd_array(fda, parent, t, thread, + in_reply_to); + if (ret < 0) { return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; - return_error_line = __LINE__; - goto err_bad_parent; - } - ret = binder_translate_fd_array(&pf_head, fda, - user_buffer, parent, - &user_object.bbo, t, - thread, in_reply_to); - if (!ret) - ret = binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - fda, sizeof(*fda)); - if (ret) { - return_error = BR_FAILED_REPLY; - return_error_param = ret > 0 ? -EINVAL : ret; + return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } @@ -3610,14 +3493,19 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_bad_offset; } - ret = binder_defer_copy(&sgc_head, sg_buf_offset, - (const void __user *)(uintptr_t)bp->buffer, - bp->length); - if (ret) { + if (binder_alloc_copy_user_to_buffer( + &target_proc->alloc, + t->buffer, + sg_buf_offset, + (const void __user *) + (uintptr_t)bp->buffer, + bp->length)) { + binder_user_error("%d:%d got transaction with invalid offsets ptr\n", + proc->pid, thread->pid); + return_error_param = -EFAULT; return_error = BR_FAILED_REPLY; - return_error_param = ret; return_error_line = __LINE__; - goto err_translate_failed; + goto err_copy_data_failed; } /* Fixup buffer pointer to target proc address space */ bp->buffer = (uintptr_t) @@ -3626,22 +3514,20 @@ static void binder_transaction(struct binder_proc *proc, num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); - ret = binder_fixup_parent(&pf_head, t, - thread, bp, + ret = binder_fixup_parent(t, thread, bp, off_start_offset, num_valid, last_fixup_obj_off, last_fixup_min_off); - if (ret < 0 || - binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - bp, sizeof(*bp))) { + if (ret < 0) { return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } + 
binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + bp, sizeof(*bp)); last_fixup_obj_off = object_offset; last_fixup_min_off = 0; } break; @@ -3654,57 +3540,22 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_object_type; } } - /* Done processing objects, copy the rest of the buffer */ - if (binder_alloc_copy_user_to_buffer( - &target_proc->alloc, - t->buffer, user_offset, - user_buffer + user_offset, - tr->data_size - user_offset)) { - binder_user_error("%d:%d got transaction with invalid data ptr\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = -EFAULT; - return_error_line = __LINE__; - goto err_copy_data_failed; - } - - ret = binder_do_deferred_txn_copies(&target_proc->alloc, t->buffer, - &sgc_head, &pf_head); - if (ret) { - binder_user_error("%d:%d got transaction with invalid offsets ptr\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = ret; - return_error_line = __LINE__; - goto err_copy_data_failed; - } - if (t->buffer->oneway_spam_suspect) - tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT; - else - tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; + tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; t->work.type = BINDER_WORK_TRANSACTION; if (reply) { binder_enqueue_thread_work(thread, tcomplete); binder_inner_proc_lock(target_proc); if (target_thread->is_dead) { - return_error = BR_DEAD_REPLY; binder_inner_proc_unlock(target_proc); goto err_dead_proc_or_thread; } BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction_ilocked(target_thread, in_reply_to); binder_enqueue_thread_work_ilocked(target_thread, &t->work); - target_proc->outstanding_txns++; binder_inner_proc_unlock(target_proc); - if (in_reply_to->is_nested) { - spin_lock(&thread->prio_lock); - thread->prio_state = BINDER_PRIO_PENDING; - thread->prio_next = in_reply_to->saved_priority; - spin_unlock(&thread->prio_lock); - } 
wake_up_interruptible_sync(&target_thread->wait); - binder_restore_priority(thread, &in_reply_to->saved_priority); + binder_restore_priority(current, in_reply_to->saved_priority); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); @@ -3721,9 +3572,7 @@ static void binder_transaction(struct binder_proc *proc, t->from_parent = thread->transaction_stack; thread->transaction_stack = t; binder_inner_proc_unlock(proc); - return_error = binder_proc_transaction(t, - target_proc, target_thread); - if (return_error) { + if (!binder_proc_transaction(t, target_proc, target_thread)) { binder_inner_proc_lock(proc); binder_pop_transaction_ilocked(thread, t); binder_inner_proc_unlock(proc); @@ -3733,8 +3582,7 @@ static void binder_transaction(struct binder_proc *proc, BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); binder_enqueue_thread_work(thread, tcomplete); - return_error = binder_proc_transaction(t, target_proc, NULL); - if (return_error) + if (!binder_proc_transaction(t, target_proc, NULL)) goto err_dead_proc_or_thread; } if (target_thread) @@ -3751,6 +3599,7 @@ static void binder_transaction(struct binder_proc *proc, return; err_dead_proc_or_thread: + return_error = BR_DEAD_REPLY; return_error_line = __LINE__; binder_dequeue_work(proc, tcomplete); err_translate_failed: @@ -3758,10 +3607,8 @@ static void binder_transaction(struct binder_proc *proc, err_bad_offset: err_bad_parent: err_copy_data_failed: - binder_cleanup_deferred_txn_lists(&sgc_head, &pf_head); - binder_free_txn_fixups(t); trace_binder_transaction_failed_buffer_release(t->buffer); - binder_transaction_buffer_release(target_proc, NULL, t->buffer, + binder_transaction_buffer_release(target_proc, t->buffer, buffer_offset, true); if (target_node) binder_dec_node_tmpref(target_node); @@ -3773,15 +3620,12 @@ static void binder_transaction(struct binder_proc *proc, if (secctx) security_release_secctx(secctx, secctx_sz); 
err_get_secctx_failed: - kmem_cache_free(binder_work_pool, tcomplete); + kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); err_alloc_tcomplete_failed: - if (trace_binder_txn_latency_free_enabled()) - binder_txn_latency_free(t); - kmem_cache_free(binder_transaction_pool, t); + kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); err_alloc_t_failed: -err_bad_todo_list: err_bad_call_stack: err_empty_call_stack: err_dead_binder: @@ -3815,65 +3659,19 @@ static void binder_transaction(struct binder_proc *proc, */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); - WRITE_ONCE(fe->debug_id_done, t_debug_id); - } - - BUG_ON(thread->return_error.cmd != BR_OK); - if (in_reply_to) { - binder_restore_priority(thread, &in_reply_to->saved_priority); - thread->return_error.cmd = BR_TRANSACTION_COMPLETE; - binder_enqueue_thread_work(thread, &thread->return_error.work); - binder_send_failed_reply(in_reply_to, return_error); - } else { - thread->return_error.cmd = return_error; - binder_enqueue_thread_work(thread, &thread->return_error.work); - } -} - -/** - * binder_free_buf() - free the specified buffer - * @proc: binder proc that owns buffer - * @buffer: buffer to be freed - * @is_failure: failed to send transaction - * - * If buffer for an async transaction, enqueue the next async - * transaction from the node. - * - * Cleanup buffer and free it. 
- */ -static void -binder_free_buf(struct binder_proc *proc, - struct binder_thread *thread, - struct binder_buffer *buffer, bool is_failure) -{ - binder_inner_proc_lock(proc); - if (buffer->transaction) { - buffer->transaction->buffer = NULL; - buffer->transaction = NULL; - } - binder_inner_proc_unlock(proc); - if (buffer->async_transaction && buffer->target_node) { - struct binder_node *buf_node; - struct binder_work *w; - - buf_node = buffer->target_node; - binder_node_inner_lock(buf_node); - BUG_ON(!buf_node->has_async_transaction); - BUG_ON(buf_node->proc != proc); - w = binder_dequeue_work_head_ilocked( - &buf_node->async_todo); - if (!w) { - buf_node->has_async_transaction = false; - } else { - binder_enqueue_work_ilocked( - w, &proc->todo); - binder_wakeup_proc_ilocked(proc); - } - binder_node_inner_unlock(buf_node); + WRITE_ONCE(fe->debug_id_done, t_debug_id); + } + + BUG_ON(thread->return_error.cmd != BR_OK); + if (in_reply_to) { + binder_restore_priority(current, in_reply_to->saved_priority); + thread->return_error.cmd = BR_TRANSACTION_COMPLETE; + binder_enqueue_thread_work(thread, &thread->return_error.work); + binder_send_failed_reply(in_reply_to, return_error); + } else { + thread->return_error.cmd = return_error; + binder_enqueue_thread_work(thread, &thread->return_error.work); } - trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, thread, buffer, 0, is_failure); - binder_alloc_free_buf(&proc->alloc, buffer); } static int binder_thread_write(struct binder_proc *proc, @@ -3917,7 +3715,6 @@ static int binder_thread_write(struct binder_proc *proc, ret = -1; if (increment && !target) { struct binder_node *ctx_mgr_node; - mutex_lock(&context->context_mgr_node_lock); ctx_mgr_node = context->binder_context_mgr_node; if (ctx_mgr_node) { @@ -4074,7 +3871,35 @@ static int binder_thread_write(struct binder_proc *proc, proc->pid, thread->pid, (u64)data_ptr, buffer->debug_id, buffer->transaction ? 
"active" : "finished"); - binder_free_buf(proc, thread, buffer, false); + + binder_inner_proc_lock(proc); + if (buffer->transaction) { + buffer->transaction->buffer = NULL; + buffer->transaction = NULL; + } + binder_inner_proc_unlock(proc); + if (buffer->async_transaction && buffer->target_node) { + struct binder_node *buf_node; + struct binder_work *w; + + buf_node = buffer->target_node; + binder_node_inner_lock(buf_node); + BUG_ON(!buf_node->has_async_transaction); + BUG_ON(buf_node->proc != proc); + w = binder_dequeue_work_head_ilocked( + &buf_node->async_todo); + if (!w) { + buf_node->has_async_transaction = false; + } else { + binder_enqueue_work_ilocked( + w, &proc->todo); + binder_wakeup_proc_ilocked(proc); + } + binder_node_inner_unlock(buf_node); + } + trace_binder_transaction_buffer_release(buffer); + binder_transaction_buffer_release(proc, buffer, 0, false); + binder_alloc_free_buf(&proc->alloc, buffer); break; } @@ -4157,7 +3982,7 @@ static int binder_thread_write(struct binder_proc *proc, * Allocate memory for death notification * before taking lock */ - death = kmem_cache_zalloc(binder_ref_death_pool, GFP_KERNEL); + death = kzalloc(sizeof(*death), GFP_KERNEL); if (death == NULL) { WARN_ON(thread->return_error.cmd != BR_OK); @@ -4182,8 +4007,7 @@ static int binder_thread_write(struct binder_proc *proc, "BC_CLEAR_DEATH_NOTIFICATION", target); binder_proc_unlock(proc); - if (death) - kmem_cache_free(binder_ref_death_pool, death); + kfree(death); break; } @@ -4204,7 +4028,7 @@ static int binder_thread_write(struct binder_proc *proc, proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); - kmem_cache_free(binder_ref_death_pool, death); + kfree(death); break; } binder_stats_created(BINDER_STAT_DEATH); @@ -4387,7 +4211,7 @@ static int binder_wait_for_work(struct binder_thread *thread, binder_inner_proc_lock(proc); list_del_init(&thread->waiting_thread_node); if (signal_pending(current)) { - ret = -EINTR; + ret = -ERESTARTSYS; 
break; } } @@ -4398,71 +4222,6 @@ static int binder_wait_for_work(struct binder_thread *thread, return ret; } -/** - * binder_apply_fd_fixups() - finish fd translation - * @proc: binder_proc associated @t->buffer - * @t: binder transaction with list of fd fixups - * - * Now that we are in the context of the transaction target - * process, we can allocate and install fds. Process the - * list of fds to translate and fixup the buffer with the - * new fds. - * - * If we fail to allocate an fd, then free the resources by - * fput'ing files that have not been processed and ksys_close'ing - * any fds that have already been allocated. - */ -static int binder_apply_fd_fixups(struct binder_proc *proc, - struct binder_transaction *t) -{ - struct binder_txn_fd_fixup *fixup, *tmp; - int ret = 0; - - list_for_each_entry(fixup, &t->fd_fixups, fixup_entry) { - int fd = get_unused_fd_flags(O_CLOEXEC); - - if (fd < 0) { - binder_debug(BINDER_DEBUG_TRANSACTION, - "failed fd fixup txn %d fd %d\n", - t->debug_id, fd); - ret = -ENOMEM; - break; - } - binder_debug(BINDER_DEBUG_TRANSACTION, - "fd fixup txn %d fd %d\n", - t->debug_id, fd); - trace_binder_transaction_fd_recv(t, fd, fixup->offset); - fd_install(fd, fixup->file); - fixup->file = NULL; - if (binder_alloc_copy_to_buffer(&proc->alloc, t->buffer, - fixup->offset, &fd, - sizeof(u32))) { - ret = -EINVAL; - break; - } - } - list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { - if (fixup->file) { - fput(fixup->file); - } else if (ret) { - u32 fd; - int err; - - err = binder_alloc_copy_from_buffer(&proc->alloc, &fd, - t->buffer, - fixup->offset, - sizeof(fd)); - WARN_ON(err); - if (!err) - binder_deferred_fd_close(fd); - } - list_del(&fixup->fixup_entry); - kfree(fixup); - } - - return ret; -} - static int binder_thread_read(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, @@ -4499,7 +4258,7 @@ static int binder_thread_read(struct binder_proc *proc, 
wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } - binder_restore_priority(thread, &proc->default_priority); + binder_restore_priority(current, proc->default_priority); } if (non_block) { @@ -4525,8 +4284,6 @@ static int binder_thread_read(struct binder_proc *proc, size_t trsize = sizeof(*trd); binder_inner_proc_lock(proc); - if (list) - goto skip; if (!binder_worklist_empty_ilocked(&thread->todo)) list = &thread->todo; else if (!binder_worklist_empty_ilocked(&proc->todo) && @@ -4540,7 +4297,7 @@ static int binder_thread_read(struct binder_proc *proc, goto retry; break; } -skip: + if (end - ptr < sizeof(tr) + 4) { binder_inner_proc_unlock(proc); break; @@ -4566,18 +4323,11 @@ static int binder_thread_read(struct binder_proc *proc, e->cmd = BR_OK; ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, cmd); + binder_stat_br(proc, thread, e->cmd); } break; - case BINDER_WORK_TRANSACTION_COMPLETE: - case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: { - if (proc->oneway_spam_detection_enabled && - w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT) - cmd = BR_ONEWAY_SPAM_SUSPECT; - else - cmd = BR_TRANSACTION_COMPLETE; + case BINDER_WORK_TRANSACTION_COMPLETE: { binder_inner_proc_unlock(proc); - kmem_cache_free(binder_work_pool, w); - binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); + cmd = BR_TRANSACTION_COMPLETE; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); @@ -4586,6 +4336,8 @@ static int binder_thread_read(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE, "%d:%d BR_TRANSACTION_COMPLETE\n", proc->pid, thread->pid); + kfree(w); + binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); } break; case BINDER_WORK_NODE: { struct binder_node *node = container_of(w, struct binder_node, work); @@ -4697,7 +4449,7 @@ static int binder_thread_read(struct binder_proc *proc, (u64)cookie); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { 
binder_inner_proc_unlock(proc); - kmem_cache_free(binder_ref_death_pool, death); + kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } else { binder_enqueue_work_ilocked( @@ -4715,11 +4467,6 @@ static int binder_thread_read(struct binder_proc *proc, if (cmd == BR_DEAD_BINDER) goto done; /* DEAD_BINDER notifications can cause transactions */ } break; - default: - binder_inner_proc_unlock(proc); - pr_err("%d:%d: bad work type %d\n", - proc->pid, thread->pid, w->type); - break; } if (!t) @@ -4728,10 +4475,14 @@ static int binder_thread_read(struct binder_proc *proc, BUG_ON(t->buffer == NULL); if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; + struct binder_priority node_prio; trd->target.ptr = target_node->ptr; trd->cookie = target_node->cookie; - binder_transaction_priority(thread, t, target_node); + node_prio.sched_policy = target_node->sched_policy; + node_prio.prio = target_node->min_priority; + binder_transaction_priority(current, t, node_prio, + target_node->inherit_rt); cmd = BR_TRANSACTION; } else { trd->target.ptr = 0; @@ -4753,34 +4504,6 @@ static int binder_thread_read(struct binder_proc *proc, trd->sender_pid = 0; } - ret = binder_apply_fd_fixups(proc, t); - if (ret) { - struct binder_buffer *buffer = t->buffer; - bool oneway = !!(t->flags & TF_ONE_WAY); - int tid = t->debug_id; - - if (t_from) - binder_thread_dec_tmpref(t_from); - buffer->transaction = NULL; - binder_cleanup_transaction(t, "fd fixups failed", - BR_FAILED_REPLY); - binder_free_buf(proc, thread, buffer, true); - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n", - proc->pid, thread->pid, - oneway ? "async " : - (cmd == BR_REPLY ? 
"reply " : ""), - tid, BR_FAILED_REPLY, ret, __LINE__); - if (cmd == BR_REPLY) { - cmd = BR_FAILED_REPLY; - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, cmd); - break; - } - continue; - } trd->data_size = t->buffer->data_size; trd->offsets_size = t->buffer->offsets_size; trd->data.ptr.buffer = (uintptr_t)t->buffer->user_data; @@ -4900,7 +4623,7 @@ static void binder_release_work(struct binder_proc *proc, case BINDER_WORK_TRANSACTION_COMPLETE: { binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered TRANSACTION_COMPLETE\n"); - kmem_cache_free(binder_work_pool, w); + kfree(w); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); } break; case BINDER_WORK_DEAD_BINDER_AND_CLEAR: @@ -4911,7 +4634,7 @@ static void binder_release_work(struct binder_proc *proc, binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered death notification, %016llx\n", (u64)death->cookie); - kmem_cache_free(binder_ref_death_pool, death); + kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } break; case BINDER_WORK_NODE: @@ -4961,8 +4684,6 @@ static struct binder_thread *binder_get_thread_ilocked( thread->return_error.cmd = BR_OK; thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; thread->reply_error.cmd = BR_OK; - spin_lock_init(&thread->prio_lock); - thread->prio_state = BINDER_PRIO_SET; INIT_LIST_HEAD(&new_thread->waiting_thread_node); return thread; } @@ -4976,37 +4697,27 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) thread = binder_get_thread_ilocked(proc, NULL); binder_inner_proc_unlock(proc); if (!thread) { - new_thread = kmem_cache_zalloc(binder_thread_pool, GFP_KERNEL); + new_thread = kzalloc(sizeof(*thread), GFP_KERNEL); if (new_thread == NULL) return NULL; binder_inner_proc_lock(proc); thread = binder_get_thread_ilocked(proc, new_thread); binder_inner_proc_unlock(proc); if (thread != new_thread) - kmem_cache_free(binder_thread_pool, new_thread); + 
kfree(new_thread); } return thread; } static void binder_free_proc(struct binder_proc *proc) { - struct binder_device *device; - BUG_ON(!list_empty(&proc->todo)); BUG_ON(!list_empty(&proc->delivered_death)); - if (proc->outstanding_txns) - pr_warn("%s: Unexpected outstanding_txns %d\n", - __func__, proc->outstanding_txns); - device = container_of(proc->context, struct binder_device, context); - if (refcount_dec_and_test(&device->ref)) { - kfree(proc->context->name); - kfree(device); - } binder_alloc_deferred_release(&proc->alloc); put_task_struct(proc->tsk); put_cred(proc->cred); binder_stats_deleted(BINDER_STAT_PROC); - kmem_cache_free(binder_proc_pool, proc); + kfree(proc); } static void binder_free_thread(struct binder_thread *thread) @@ -5015,7 +4726,7 @@ static void binder_free_thread(struct binder_thread *thread) binder_stats_deleted(BINDER_STAT_THREAD); binder_proc_dec_tmpref(thread->proc); put_task_struct(thread->task); - kmem_cache_free(binder_thread_pool, thread); + kfree(thread); } static int binder_thread_release(struct binder_proc *proc, @@ -5033,7 +4744,7 @@ static int binder_thread_release(struct binder_proc *proc, * The corresponding dec is when we actually * free the thread in binder_free_thread() */ - proc->tmp_ref++; + atomic_inc(&proc->tmp_ref); /* * take a ref on this thread to ensure it * survives while we are releasing it @@ -5045,8 +4756,6 @@ static int binder_thread_release(struct binder_proc *proc, spin_lock(&t->lock); if (t->to_thread == thread) send_reply = t; - } else { - __acquire(&t->lock); } thread->is_dead = true; @@ -5060,7 +4769,6 @@ static int binder_thread_release(struct binder_proc *proc, (t->to_thread == thread) ? 
"in" : "out"); if (t->to_thread == thread) { - thread->proc->outstanding_txns--; t->to_proc = NULL; t->to_thread = NULL; if (t->buffer) { @@ -5076,11 +4784,7 @@ static int binder_thread_release(struct binder_proc *proc, spin_unlock(&last_t->lock); if (t) spin_lock(&t->lock); - else - __acquire(&t->lock); } - /* annotation for sparse, lock not acquired in last iteration above */ - __release(&t->lock); /* * If this thread used poll, make sure we remove the waitqueue from any @@ -5108,7 +4812,7 @@ static int binder_thread_release(struct binder_proc *proc, return active_transactions; } -static __poll_t binder_poll(struct file *filp, +static unsigned int binder_poll(struct file *filp, struct poll_table_struct *wait) { struct binder_proc *proc = filp->private_data; @@ -5128,7 +4832,7 @@ static __poll_t binder_poll(struct file *filp, poll_wait(filp, &thread->wait, wait); if (binder_has_work(thread, wait_for_proc_work)) - return EPOLLIN; + return POLLIN; return 0; } @@ -5284,8 +4988,7 @@ static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc, } static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, - struct binder_node_debug_info *info) -{ + struct binder_node_debug_info *info) { struct rb_node *n; binder_uintptr_t ptr = info->ptr; @@ -5308,100 +5011,6 @@ static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, return 0; } -static bool binder_txns_pending_ilocked(struct binder_proc *proc) -{ - struct rb_node *n; - struct binder_thread *thread; - - if (proc->outstanding_txns > 0) - return true; - - for (n = rb_first(&proc->threads); n; n = rb_next(n)) { - thread = rb_entry(n, struct binder_thread, rb_node); - if (thread->transaction_stack) - return true; - } - return false; -} - -static int binder_ioctl_freeze(struct binder_freeze_info *info, - struct binder_proc *target_proc) -{ - int ret = 0; - - if (!info->enable) { - binder_inner_proc_lock(target_proc); - target_proc->sync_recv = false; - target_proc->async_recv = false; - 
target_proc->is_frozen = false; - binder_inner_proc_unlock(target_proc); - return 0; - } - - /* - * Freezing the target. Prevent new transactions by - * setting frozen state. If timeout specified, wait - * for transactions to drain. - */ - binder_inner_proc_lock(target_proc); - target_proc->sync_recv = false; - target_proc->async_recv = false; - target_proc->is_frozen = true; - binder_inner_proc_unlock(target_proc); - - if (info->timeout_ms > 0) - ret = wait_event_interruptible_timeout( - target_proc->freeze_wait, - (!target_proc->outstanding_txns), - msecs_to_jiffies(info->timeout_ms)); - - /* Check pending transactions that wait for reply */ - if (ret >= 0) { - binder_inner_proc_lock(target_proc); - if (binder_txns_pending_ilocked(target_proc)) - ret = -EAGAIN; - binder_inner_proc_unlock(target_proc); - } - - if (ret < 0) { - binder_inner_proc_lock(target_proc); - target_proc->is_frozen = false; - binder_inner_proc_unlock(target_proc); - } - - return ret; -} - -static int binder_ioctl_get_freezer_info( - struct binder_frozen_status_info *info) -{ - struct binder_proc *target_proc; - bool found = false; - __u32 txns_pending; - - info->sync_recv = 0; - info->async_recv = 0; - - mutex_lock(&binder_procs_lock); - hlist_for_each_entry(target_proc, &binder_procs, proc_node) { - if (target_proc->pid == info->pid) { - found = true; - binder_inner_proc_lock(target_proc); - txns_pending = binder_txns_pending_ilocked(target_proc); - info->sync_recv |= target_proc->sync_recv | - (txns_pending << 1); - info->async_recv |= target_proc->async_recv; - binder_inner_proc_unlock(target_proc); - } - } - mutex_unlock(&binder_procs_lock); - - if (!found) - return -EINVAL; - - return 0; -} - static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@ -5520,96 +5129,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } - case BINDER_FREEZE: { - struct binder_freeze_info info; - struct binder_proc 
**target_procs = NULL, *target_proc; - int target_procs_count = 0, i = 0; - - ret = 0; - - if (copy_from_user(&info, ubuf, sizeof(info))) { - ret = -EFAULT; - goto err; - } - - mutex_lock(&binder_procs_lock); - hlist_for_each_entry(target_proc, &binder_procs, proc_node) { - if (target_proc->pid == info.pid) - target_procs_count++; - } - - if (target_procs_count == 0) { - mutex_unlock(&binder_procs_lock); - ret = -EINVAL; - goto err; - } - - target_procs = kcalloc(target_procs_count, - sizeof(struct binder_proc *), - GFP_KERNEL); - - if (!target_procs) { - mutex_unlock(&binder_procs_lock); - ret = -ENOMEM; - goto err; - } - - hlist_for_each_entry(target_proc, &binder_procs, proc_node) { - if (target_proc->pid != info.pid) - continue; - - binder_inner_proc_lock(target_proc); - target_proc->tmp_ref++; - binder_inner_proc_unlock(target_proc); - - target_procs[i++] = target_proc; - } - mutex_unlock(&binder_procs_lock); - - for (i = 0; i < target_procs_count; i++) { - if (ret >= 0) - ret = binder_ioctl_freeze(&info, - target_procs[i]); - - binder_proc_dec_tmpref(target_procs[i]); - } - - kfree(target_procs); - - if (ret < 0) - goto err; - break; - } - case BINDER_GET_FROZEN_INFO: { - struct binder_frozen_status_info info; - - if (copy_from_user(&info, ubuf, sizeof(info))) { - ret = -EFAULT; - goto err; - } - - ret = binder_ioctl_get_freezer_info(&info); - if (ret < 0) - goto err; - - if (copy_to_user(ubuf, &info, sizeof(info))) { - ret = -EFAULT; - goto err; - } - break; - } - case BINDER_ENABLE_ONEWAY_SPAM_DETECTION: { - uint32_t enable; - - if (copy_from_user(&enable, ubuf, sizeof(enable))) { - ret = -EFAULT; - goto err; - } - binder_inner_proc_lock(proc); - proc->oneway_spam_detection_enabled = (bool)enable; - binder_inner_proc_unlock(proc); - break; - } default: ret = -EINVAL; goto err; @@ -5619,7 +5138,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (thread) thread->looper_need_return = false; 
wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); - if (ret && ret != -EINTR) + if (ret && ret != -ERESTARTSYS) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); err_unlocked: trace_binder_ioctl_done(ret); @@ -5647,6 +5166,7 @@ static void binder_vma_close(struct vm_area_struct *vma) (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); binder_alloc_vma_close(&proc->alloc); + binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -5662,11 +5182,16 @@ static const struct vm_operations_struct binder_vm_ops = { static int binder_mmap(struct file *filp, struct vm_area_struct *vma) { + int ret; struct binder_proc *proc = filp->private_data; + const char *failure_string; if (proc->tsk != current->group_leader) return -EINVAL; + if ((vma->vm_end - vma->vm_start) > SZ_4M) + vma->vm_end = vma->vm_start + SZ_4M; + binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", __func__, proc->pid, vma->vm_start, vma->vm_end, @@ -5674,9 +5199,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) (unsigned long)pgprot_val(vma->vm_page_prot)); if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { - pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, - proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM); - return -EPERM; + ret = -EPERM; + failure_string = "bad vm_flags"; + goto err_bad_arg; } vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP; vma->vm_flags &= ~VM_MAYWRITE; @@ -5684,30 +5209,39 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; - return binder_alloc_mmap_handler(&proc->alloc, vma); + ret = binder_alloc_mmap_handler(&proc->alloc, vma); + if (ret) + return ret; + mutex_lock(&proc->files_lock); + proc->files = get_files_struct(current); + 
mutex_unlock(&proc->files_lock); + return 0; + +err_bad_arg: + pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, + proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); + return ret; } static int binder_open(struct inode *nodp, struct file *filp) { - struct binder_proc *proc, *itr; + struct binder_proc *proc; struct binder_device *binder_dev; - struct binderfs_info *info; - struct dentry *binder_binderfs_dir_entry_proc = NULL; - bool existing_pid = false; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__, current->group_leader->pid, current->pid); - proc = kmem_cache_zalloc(binder_proc_pool, GFP_KERNEL); + proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) return -ENOMEM; spin_lock_init(&proc->inner_lock); spin_lock_init(&proc->outer_lock); + atomic_set(&proc->tmp_ref, 0); get_task_struct(current->group_leader); proc->tsk = current->group_leader; + mutex_init(&proc->files_lock); proc->cred = get_cred(filp->f_cred); INIT_LIST_HEAD(&proc->todo); - init_waitqueue_head(&proc->freeze_wait); if (binder_supported_policy(current->policy)) { proc->default_priority.sched_policy = current->policy; proc->default_priority.prio = current->normal_prio; @@ -5716,16 +5250,8 @@ static int binder_open(struct inode *nodp, struct file *filp) proc->default_priority.prio = NICE_TO_PRIO(0); } - /* binderfs stashes devices in i_private */ - if (is_binderfs_device(nodp)) { - binder_dev = nodp->i_private; - info = nodp->i_sb->s_fs_info; - binder_binderfs_dir_entry_proc = info->proc_log_dir; - } else { - binder_dev = container_of(filp->private_data, - struct binder_device, miscdev); - } - refcount_inc(&binder_dev->ref); + binder_dev = container_of(filp->private_data, struct binder_device, + miscdev); proc->context = &binder_dev->context; binder_alloc_init(&proc->alloc); @@ -5736,52 +5262,24 @@ static int binder_open(struct inode *nodp, struct file *filp) filp->private_data = proc; mutex_lock(&binder_procs_lock); - hlist_for_each_entry(itr, &binder_procs, 
proc_node) { - if (itr->pid == proc->pid) { - existing_pid = true; - break; - } - } hlist_add_head(&proc->proc_node, &binder_procs); mutex_unlock(&binder_procs_lock); - if (binder_debugfs_dir_entry_proc && !existing_pid) { + + if (binder_debugfs_dir_entry_proc) { char strbuf[11]; snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); /* - * proc debug entries are shared between contexts. - * Only create for the first PID to avoid debugfs log spamming - * The printing code will anyway print all contexts for a given - * PID so this is not a problem. + * proc debug entries are shared between contexts, so + * this will fail if the process tries to open the driver + * again with a different context. The priting code will + * anyway print all contexts that a given PID has, so this + * is not a problem. */ proc->debugfs_entry = debugfs_create_file(strbuf, 0444, binder_debugfs_dir_entry_proc, (void *)(unsigned long)proc->pid, - &proc_fops); - } - - if (binder_binderfs_dir_entry_proc && !existing_pid) { - char strbuf[11]; - struct dentry *binderfs_entry; - - snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); - /* - * Similar to debugfs, the process specific log file is shared - * between contexts. Only create for the first PID. - * This is ok since same as debugfs, the log file will contain - * information on all contexts of a given PID. 
- */ - binderfs_entry = binderfs_create_file(binder_binderfs_dir_entry_proc, - strbuf, &proc_fops, (void *)(unsigned long)proc->pid); - if (!IS_ERR(binderfs_entry)) { - proc->binderfs_entry = binderfs_entry; - } else { - int error; - - error = PTR_ERR(binderfs_entry); - pr_warn("Unable to create file %s in binderfs (error %d)\n", - strbuf, error); - } + &binder_proc_fops); } return 0; @@ -5823,12 +5321,6 @@ static int binder_release(struct inode *nodp, struct file *filp) struct binder_proc *proc = filp->private_data; debugfs_remove(proc->debugfs_entry); - - if (proc->binderfs_entry) { - binderfs_remove_file(proc->binderfs_entry); - proc->binderfs_entry = NULL; - } - binder_defer_work(proc, BINDER_DEFERRED_RELEASE); return 0; @@ -5905,6 +5397,8 @@ static void binder_deferred_release(struct binder_proc *proc) struct rb_node *n; int threads, nodes, incoming_refs, outgoing_refs, active_transactions; + BUG_ON(proc->files); + mutex_lock(&binder_procs_lock); hlist_del(&proc->proc_node); mutex_unlock(&binder_procs_lock); @@ -5923,12 +5417,9 @@ static void binder_deferred_release(struct binder_proc *proc) * Make sure proc stays alive after we * remove all the threads */ - proc->tmp_ref++; + atomic_inc(&proc->tmp_ref); proc->is_dead = true; - proc->is_frozen = false; - proc->sync_recv = false; - proc->async_recv = false; threads = 0; active_transactions = 0; while ((n = rb_first(&proc->threads))) { @@ -5989,6 +5480,7 @@ static void binder_deferred_release(struct binder_proc *proc) static void binder_deferred_func(struct work_struct *work) { struct binder_proc *proc; + struct files_struct *files; int defer; @@ -6006,11 +5498,23 @@ static void binder_deferred_func(struct work_struct *work) } mutex_unlock(&binder_deferred_lock); + files = NULL; + if (defer & BINDER_DEFERRED_PUT_FILES) { + mutex_lock(&proc->files_lock); + files = proc->files; + if (files) + proc->files = NULL; + mutex_unlock(&proc->files_lock); + } + if (defer & BINDER_DEFERRED_FLUSH) 
binder_deferred_flush(proc); if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ + + if (files) + put_files_struct(files); } while (proc); } static DECLARE_WORK(binder_deferred_work, binder_deferred_func); @@ -6281,9 +5785,7 @@ static const char * const binder_return_strings[] = { "BR_FINISHED", "BR_DEAD_BINDER", "BR_CLEAR_DEATH_NOTIFICATION_DONE", - "BR_FAILED_REPLY", - "BR_FROZEN_REPLY", - "BR_ONEWAY_SPAM_SUSPECT", + "BR_FAILED_REPLY" }; static const char * const binder_command_strings[] = { @@ -6424,7 +5926,8 @@ static void print_binder_proc_stats(struct seq_file *m, print_binder_stats(m, " ", &proc->stats); } -static int state_show(struct seq_file *m, void *unused) + +static int binder_state_show(struct seq_file *m, void *unused) { struct binder_proc *proc; struct binder_node *node; @@ -6463,7 +5966,7 @@ static int state_show(struct seq_file *m, void *unused) return 0; } -static int stats_show(struct seq_file *m, void *unused) +static int binder_stats_show(struct seq_file *m, void *unused) { struct binder_proc *proc; @@ -6479,7 +5982,7 @@ static int stats_show(struct seq_file *m, void *unused) return 0; } -static int transactions_show(struct seq_file *m, void *unused) +static int binder_transactions_show(struct seq_file *m, void *unused) { struct binder_proc *proc; @@ -6492,7 +5995,7 @@ static int transactions_show(struct seq_file *m, void *unused) return 0; } -static int proc_show(struct seq_file *m, void *unused) +static int binder_proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; int pid = (unsigned long)m->private; @@ -6535,7 +6038,7 @@ static void print_binder_transaction_log_entry(struct seq_file *m, "\n" : " (incomplete)\n"); } -static int transaction_log_show(struct seq_file *m, void *unused) +static int binder_transaction_log_show(struct seq_file *m, void *unused) { struct binder_transaction_log *log = m->private; unsigned int log_cur = atomic_read(&log->cur); @@ -6556,7 +6059,7 @@ static int 
transaction_log_show(struct seq_file *m, void *unused) return 0; } -const struct file_operations binder_fops = { +static const struct file_operations binder_fops = { .owner = THIS_MODULE, .poll = binder_poll, .unlocked_ioctl = binder_ioctl, @@ -6567,44 +6070,10 @@ const struct file_operations binder_fops = { .release = binder_release, }; -DEFINE_SHOW_ATTRIBUTE(state); -DEFINE_SHOW_ATTRIBUTE(stats); -DEFINE_SHOW_ATTRIBUTE(transactions); -DEFINE_SHOW_ATTRIBUTE(transaction_log); - -const struct binder_debugfs_entry binder_debugfs_entries[] = { - { - .name = "state", - .mode = 0444, - .fops = &state_fops, - .data = NULL, - }, - { - .name = "stats", - .mode = 0444, - .fops = &stats_fops, - .data = NULL, - }, - { - .name = "transactions", - .mode = 0444, - .fops = &transactions_fops, - .data = NULL, - }, - { - .name = "transaction_log", - .mode = 0444, - .fops = &transaction_log_fops, - .data = &binder_transaction_log, - }, - { - .name = "failed_transaction_log", - .mode = 0444, - .fops = &transaction_log_fops, - .data = &binder_transaction_log_failed, - }, - {} /* terminator */ -}; +BINDER_DEBUG_ENTRY(state); +BINDER_DEBUG_ENTRY(stats); +BINDER_DEBUG_ENTRY(transactions); +BINDER_DEBUG_ENTRY(transaction_log); static int __init init_binder_device(const char *name) { @@ -6619,7 +6088,6 @@ static int __init init_binder_device(const char *name) binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; binder_device->miscdev.name = name; - refcount_set(&binder_device->ref, 1); binder_device->context.binder_context_mgr_uid = INVALID_UID; binder_device->context.name = name; mutex_init(&binder_device->context.context_mgr_node_lock); @@ -6635,130 +6103,70 @@ static int __init init_binder_device(const char *name) return ret; } -static int __init binder_create_pools(void) -{ - int ret; - - ret = binder_buffer_pool_create(); - if (ret) - return ret; - - binder_node_pool = KMEM_CACHE(binder_node, SLAB_HWCACHE_ALIGN); - if (!binder_node_pool) - goto err_node_pool; - - binder_proc_pool = 
KMEM_CACHE(binder_proc, SLAB_HWCACHE_ALIGN); - if (!binder_proc_pool) - goto err_proc_pool; - - binder_ref_death_pool = KMEM_CACHE(binder_ref_death, SLAB_HWCACHE_ALIGN); - if (!binder_ref_death_pool) - goto err_ref_death_pool; - - binder_ref_pool = KMEM_CACHE(binder_ref, SLAB_HWCACHE_ALIGN); - if (!binder_ref_pool) - goto err_ref_pool; - - binder_thread_pool = KMEM_CACHE(binder_thread, SLAB_HWCACHE_ALIGN); - if (!binder_thread_pool) - goto err_thread_pool; - - binder_transaction_pool = KMEM_CACHE(binder_transaction, SLAB_HWCACHE_ALIGN); - if (!binder_transaction_pool) - goto err_transaction_pool; - - binder_work_pool = KMEM_CACHE(binder_work, SLAB_HWCACHE_ALIGN); - if (!binder_work_pool) - goto err_work_pool; - - return 0; - -err_work_pool: - kmem_cache_destroy(binder_transaction_pool); -err_transaction_pool: - kmem_cache_destroy(binder_thread_pool); -err_thread_pool: - kmem_cache_destroy(binder_ref_pool); -err_ref_pool: - kmem_cache_destroy(binder_ref_death_pool); -err_ref_death_pool: - kmem_cache_destroy(binder_proc_pool); -err_proc_pool: - kmem_cache_destroy(binder_node_pool); -err_node_pool: - binder_buffer_pool_destroy(); - return -ENOMEM; -} - -static void __init binder_destroy_pools(void) -{ - binder_buffer_pool_destroy(); - kmem_cache_destroy(binder_node_pool); - kmem_cache_destroy(binder_proc_pool); - kmem_cache_destroy(binder_ref_death_pool); - kmem_cache_destroy(binder_ref_pool); - kmem_cache_destroy(binder_thread_pool); - kmem_cache_destroy(binder_transaction_pool); - kmem_cache_destroy(binder_work_pool); -} - static int __init binder_init(void) { int ret; - char *device_name, *device_tmp; + char *device_name, *device_names, *device_tmp; struct binder_device *device; struct hlist_node *tmp; - char *device_names = NULL; - - ret = binder_create_pools(); - if (ret) - return ret; ret = binder_alloc_shrinker_init(); if (ret) - goto err_alloc_shrinker_failed; + return ret; atomic_set(&binder_transaction_log.cur, ~0U); 
atomic_set(&binder_transaction_log_failed.cur, ~0U); binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); - if (binder_debugfs_dir_entry_root) { - const struct binder_debugfs_entry *db_entry; - - binder_for_each_debugfs_entry(db_entry) - debugfs_create_file(db_entry->name, - db_entry->mode, - binder_debugfs_dir_entry_root, - db_entry->data, - db_entry->fops); - + if (binder_debugfs_dir_entry_root) binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", binder_debugfs_dir_entry_root); - } - if (!IS_ENABLED(CONFIG_ANDROID_BINDERFS) && - strcmp(binder_devices_param, "") != 0) { - /* - * Copy the module_parameter string, because we don't want to - * tokenize it in-place. - */ - device_names = kstrdup(binder_devices_param, GFP_KERNEL); - if (!device_names) { - ret = -ENOMEM; - goto err_alloc_device_names_failed; - } + if (binder_debugfs_dir_entry_root) { + debugfs_create_file("state", + 0444, + binder_debugfs_dir_entry_root, + NULL, + &binder_state_fops); + debugfs_create_file("stats", + 0444, + binder_debugfs_dir_entry_root, + NULL, + &binder_stats_fops); + debugfs_create_file("transactions", + 0444, + binder_debugfs_dir_entry_root, + NULL, + &binder_transactions_fops); + debugfs_create_file("transaction_log", + 0444, + binder_debugfs_dir_entry_root, + &binder_transaction_log, + &binder_transaction_log_fops); + debugfs_create_file("failed_transaction_log", + 0444, + binder_debugfs_dir_entry_root, + &binder_transaction_log_failed, + &binder_transaction_log_fops); + } - device_tmp = device_names; - while ((device_name = strsep(&device_tmp, ","))) { - ret = init_binder_device(device_name); - if (ret) - goto err_init_binder_device_failed; - } + /* + * Copy the module_parameter string, because we don't want to + * tokenize it in-place. 
+ */ + device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); + if (!device_names) { + ret = -ENOMEM; + goto err_alloc_device_names_failed; } + strcpy(device_names, binder_devices_param); - ret = init_binderfs(); - if (ret) - goto err_init_binder_device_failed; + device_tmp = device_names; + while ((device_name = strsep(&device_tmp, ","))) { + ret = init_binder_device(device_name); + if (ret) + goto err_init_binder_device_failed; + } return ret; @@ -6774,9 +6182,6 @@ static int __init binder_init(void) err_alloc_device_names_failed: debugfs_remove_recursive(binder_debugfs_dir_entry_root); -err_alloc_shrinker_failed: - binder_destroy_pools(); - return ret; } @@ -6784,7 +6189,5 @@ device_initcall(binder_init); #define CREATE_TRACE_POINTS #include "binder_trace.h" -EXPORT_TRACEPOINT_SYMBOL_GPL(binder_transaction_received); -EXPORT_TRACEPOINT_SYMBOL_GPL(binder_txn_latency_free); MODULE_LICENSE("GPL v2"); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 9eb15d712567..5addcd56afb4 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1,13 +1,23 @@ -// SPDX-License-Identifier: GPL-2.0-only /* binder_alloc.c * * Android IPC Subsystem * * Copyright (C) 2007-2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -18,11 +28,8 @@ #include #include #include -#include -#include #include #include -#include #include "binder_alloc.h" #include "binder_trace.h" @@ -36,7 +43,7 @@ enum { BINDER_DEBUG_BUFFER_ALLOC = 1U << 2, BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3, }; -static uint32_t binder_alloc_debug_mask = BINDER_DEBUG_USER_ERROR; +static uint32_t binder_alloc_debug_mask; module_param_named(debug_mask, binder_alloc_debug_mask, uint, 0644); @@ -44,25 +51,9 @@ module_param_named(debug_mask, binder_alloc_debug_mask, #define binder_alloc_debug(mask, x...) \ do { \ if (binder_alloc_debug_mask & mask) \ - pr_info_ratelimited(x); \ + pr_info(x); \ } while (0) -static struct kmem_cache *binder_buffer_pool; - -int binder_buffer_pool_create(void) -{ - binder_buffer_pool = KMEM_CACHE(binder_buffer, SLAB_HWCACHE_ALIGN); - if (!binder_buffer_pool) - return -ENOMEM; - - return 0; -} - -void binder_buffer_pool_destroy(void) -{ - kmem_cache_destroy(binder_buffer_pool); -} - static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer) { return list_entry(buffer->entry.next, struct binder_buffer, entry); @@ -173,7 +164,7 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked( } /** - * binder_alloc_prepare_to_free() - get buffer given user ptr + * binder_alloc_buffer_lookup() - get buffer given user ptr * @alloc: binder_alloc for this proc * @user_ptr: User pointer to buffer data * @@ -228,14 +219,18 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, mm = alloc->vma_vm_mm; if (mm) { - down_read; + down_read(&mm->mmap_sem); + if (!mmget_still_valid(mm)) { + if (allocate == 0) + goto free_range; + goto err_no_vma; + } vma = alloc->vma; } if (!vma && need_mm) { - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "%d: binder_alloc_buf failed to map pages in userspace, no vma\n", - alloc->pid); + pr_err("%d: binder_alloc_buf failed to map pages in 
userspace, no vma\n", + alloc->pid); goto err_no_vma; } @@ -284,15 +279,17 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, alloc->pages_high = index + 1; trace_binder_alloc_page_end(alloc, index); + /* vm_insert_page does not seem to increment the refcount */ } if (mm) { - up_read; + up_read(&mm->mmap_sem); mmput(mm); } return 0; free_range: - for (page_addr = end - PAGE_SIZE; 1; page_addr -= PAGE_SIZE) { + for (page_addr = end - PAGE_SIZE; page_addr >= start; + page_addr -= PAGE_SIZE) { bool ret; size_t index; @@ -305,8 +302,6 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, WARN_ON(!ret); trace_binder_free_lru_end(alloc, index); - if (page_addr == start) - break; continue; err_vm_insert_page_failed: @@ -314,47 +309,17 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, page->page_ptr = NULL; err_alloc_page_failed: err_page_ptr_cleared: - if (page_addr == start) - break; + ; } err_no_vma: if (mm) { - up_read; + up_read(&mm->mmap_sem); mmput(mm); } return vma ? -ENOMEM : -ESRCH; } - -static inline void binder_alloc_set_vma(struct binder_alloc *alloc, - struct vm_area_struct *vma) -{ - if (vma) - alloc->vma_vm_mm = vma->vm_mm; - /* - * If we see alloc->vma is not NULL, buffer data structures set up - * completely. Look at smp_rmb side binder_alloc_get_vma. - * We also want to guarantee new alloc->vma_vm_mm is always visible - * if alloc->vma is set. 
- */ - smp_wmb(); - alloc->vma = vma; -} - -static inline struct vm_area_struct *binder_alloc_get_vma( - struct binder_alloc *alloc) -{ - struct vm_area_struct *vma = NULL; - - if (alloc->vma) { - /* Look at description in binder_alloc_set_vma */ - smp_rmb(); - vma = alloc->vma; - } - return vma; -} - -static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid) +static void debug_low_async_space_locked(struct binder_alloc *alloc, int pid) { /* * Find the amount and size of buffers allocated by the current caller; @@ -363,7 +328,7 @@ static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid) * and at some point we'll catch them in the act. This is more efficient * than keeping a map per pid. */ - struct rb_node *n; + struct rb_node *n = alloc->free_buffers.rb_node; struct binder_buffer *buffer; size_t total_alloc_size = 0; size_t num_buffers = 0; @@ -382,19 +347,13 @@ static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid) /* * Warn if this pid has more than 50 transactions, or more than 50% of - * async space (which is 25% of total buffer size). Oneway spam is only - * detected when the threshold is exceeded. + * async space (which is 25% of total buffer size). */ if (num_buffers > 50 || total_alloc_size > alloc->buffer_size / 4) { binder_alloc_debug(BINDER_DEBUG_USER_ERROR, "%d: pid %d spamming oneway? 
%zd buffers allocated for a total size of %zd\n", alloc->pid, pid, num_buffers, total_alloc_size); - if (!alloc->oneway_spam_detected) { - alloc->oneway_spam_detected = true; - return true; - } } - return false; } static struct binder_buffer *binder_alloc_new_buf_locked( @@ -414,15 +373,11 @@ static struct binder_buffer *binder_alloc_new_buf_locked( size_t size, data_offsets_size; int ret; - down_read; - if (!binder_alloc_get_vma(alloc)) { - up_read; - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "%d: binder_alloc_buf, no vma\n", - alloc->pid); + if (alloc->vma == NULL) { + pr_err("%d: binder_alloc_buf, no vma\n", + alloc->pid); return ERR_PTR(-ESRCH); } - up_read; data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); @@ -492,14 +447,11 @@ static struct binder_buffer *binder_alloc_new_buf_locked( if (buffer_size > largest_free_size) largest_free_size = buffer_size; } - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "%d: binder_alloc_buf size %zd failed, no address space\n", - alloc->pid, size); - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n", - total_alloc_size, allocated_buffers, - largest_alloc_size, total_free_size, - free_buffers, largest_free_size); + pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", + alloc->pid, size); + pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n", + total_alloc_size, allocated_buffers, largest_alloc_size, + total_free_size, free_buffers, largest_free_size); return ERR_PTR(-ENOSPC); } if (n == NULL) { @@ -526,7 +478,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked( if (buffer_size != size) { struct binder_buffer *new_buffer; - new_buffer = kmem_cache_zalloc(binder_buffer_pool, GFP_KERNEL); + new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!new_buffer) { pr_err("%s: %d failed to alloc new buffer struct\n", __func__, alloc->pid); @@ -550,7 +502,6 @@ 
static struct binder_buffer *binder_alloc_new_buf_locked( buffer->async_transaction = is_async; buffer->extra_buffers_size = extra_buffers_size; buffer->pid = pid; - buffer->oneway_spam_suspect = false; if (is_async) { alloc->free_async_space -= size + sizeof(struct binder_buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, @@ -562,9 +513,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked( * of async space left (which is less than 10% of total * buffer size). */ - buffer->oneway_spam_suspect = debug_low_async_space_locked(alloc, pid); - } else { - alloc->oneway_spam_detected = false; + debug_low_async_space_locked(alloc, pid); } } return buffer; @@ -624,7 +573,6 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, { struct binder_buffer *prev, *next = NULL; bool to_free = true; - BUG_ON(alloc->buffers.next == &buffer->entry); prev = binder_buffer_prev(buffer); BUG_ON(!prev->free); @@ -665,7 +613,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, buffer_start_page(buffer) + PAGE_SIZE); } list_del(&buffer->entry); - kmem_cache_free(binder_buffer_pool, buffer); + kfree(buffer); } static void binder_free_buf_locked(struct binder_alloc *alloc, @@ -690,7 +638,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, BUG_ON(buffer->user_data > alloc->buffer + alloc->buffer_size); if (buffer->async_transaction) { - alloc->free_async_space += buffer_size + sizeof(struct binder_buffer); + alloc->free_async_space += size + sizeof(struct binder_buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_free_buf size %zd async free %zd\n", @@ -731,7 +679,7 @@ static void binder_alloc_clear_buf(struct binder_alloc *alloc, * @alloc: binder_alloc for this proc * @buffer: kernel pointer to buffer * - * Free the buffer allocated via binder_alloc_new_buf() + * Free the buffer allocated via binder_alloc_new_buffer() */ void binder_alloc_free_buf(struct binder_alloc *alloc, struct binder_buffer *buffer) @@ 
-773,34 +721,27 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, const char *failure_string; struct binder_buffer *buffer; - if (unlikely(vma->vm_mm != alloc->vma_vm_mm)) { - ret = -EINVAL; - failure_string = "invalid vma->vm_mm"; - goto err_invalid_mm; - } - mutex_lock(&binder_alloc_mmap_lock); - if (alloc->buffer_size) { + if (alloc->buffer) { ret = -EBUSY; failure_string = "already mapped"; goto err_already_mapped; } - alloc->buffer_size = min_t(unsigned long, vma->vm_end - vma->vm_start, - SZ_4M); - mutex_unlock(&binder_alloc_mmap_lock); alloc->buffer = (void __user *)vma->vm_start; + mutex_unlock(&binder_alloc_mmap_lock); - alloc->pages = kcalloc(alloc->buffer_size / PAGE_SIZE, - sizeof(alloc->pages[0]), + alloc->pages = kzalloc(sizeof(alloc->pages[0]) * + ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL); if (alloc->pages == NULL) { ret = -ENOMEM; failure_string = "alloc page array"; goto err_alloc_pages_failed; } + alloc->buffer_size = vma->vm_end - vma->vm_start; - buffer = kmem_cache_zalloc(binder_buffer_pool, GFP_KERNEL); + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!buffer) { ret = -ENOMEM; failure_string = "alloc buffer struct"; @@ -812,7 +753,11 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, buffer->free = 1; binder_insert_free_buffer(alloc, buffer); alloc->free_async_space = alloc->buffer_size / 2; - binder_alloc_set_vma(alloc, vma); + barrier(); + alloc->vma = vma; + alloc->vma_vm_mm = vma->vm_mm; + /* Same as mmgrab() in later kernel versions */ + atomic_inc(&alloc->vma_vm_mm->mm_count); return 0; @@ -820,16 +765,12 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, kfree(alloc->pages); alloc->pages = NULL; err_alloc_pages_failed: - alloc->buffer = NULL; mutex_lock(&binder_alloc_mmap_lock); - alloc->buffer_size = 0; + alloc->buffer = NULL; err_already_mapped: mutex_unlock(&binder_alloc_mmap_lock); -err_invalid_mm: - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "%s: %d %lx-%lx %s failed %d\n", 
__func__, - alloc->pid, vma->vm_start, vma->vm_end, - failure_string, ret); + pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, + alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret); return ret; } @@ -840,10 +781,10 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) int buffers, page_count; struct binder_buffer *buffer; - buffers = 0; - mutex_lock(&alloc->mutex); BUG_ON(alloc->vma); + buffers = 0; + mutex_lock(&alloc->mutex); while ((n = rb_first(&alloc->allocated_buffers))) { buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -865,7 +806,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) list_del(&buffer->entry); WARN_ON_ONCE(!list_empty(&alloc->buffers)); - kmem_cache_free(binder_buffer_pool, buffer); + kfree(buffer); } page_count = 0; @@ -945,18 +886,6 @@ void binder_alloc_print_pages(struct seq_file *m, int free = 0; mutex_lock(&alloc->mutex); - /* - * Make sure the binder_alloc is fully initialized, otherwise we might - * read inconsistent state. 
- */ - - down_read; - if (binder_alloc_get_vma(alloc) == NULL) { - up_read; - goto uninitialized; - } - - up_read; for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { page = &alloc->pages[i]; if (!page->page_ptr) @@ -966,8 +895,6 @@ void binder_alloc_print_pages(struct seq_file *m, else lru++; } - -uninitialized: mutex_unlock(&alloc->mutex); seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high); @@ -1002,7 +929,7 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) */ void binder_alloc_vma_close(struct binder_alloc *alloc) { - binder_alloc_set_vma(alloc, NULL); + WRITE_ONCE(alloc->vma, NULL); } /** @@ -1018,7 +945,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item, struct list_lru_one *lru, spinlock_t *lock, void *cb_arg) - __must_hold(lock) { struct mm_struct *mm = NULL; struct binder_lru_page *page = container_of(item, @@ -1042,9 +968,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item, mm = alloc->vma_vm_mm; if (!mmget_not_zero(mm)) goto err_mmget; - if (!*down_read_trylock) - goto err_down_read_mmap_sem_failed; - vma = binder_alloc_get_vma(alloc); + if (!down_write_trylock(&mm->mmap_sem)) + goto err_down_write_mmap_sem_failed; + vma = alloc->vma; list_lru_isolate(lru, item); spin_unlock(lock); @@ -1056,8 +982,8 @@ enum lru_status binder_alloc_free_page(struct list_head *item, trace_binder_unmap_user_end(alloc, index); } - up_read; - mmput_async(mm); + up_write(&mm->mmap_sem); + mmput(mm); trace_binder_unmap_kernel_start(alloc, index); @@ -1070,7 +996,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, mutex_unlock(&alloc->mutex); return LRU_REMOVED_RETRY; -err_down_read_mmap_sem_failed: +err_down_write_mmap_sem_failed: mmput_async(mm); err_mmget: err_page_already_freed: @@ -1112,8 +1038,6 @@ static struct shrinker binder_shrinker = { void binder_alloc_init(struct binder_alloc *alloc) { alloc->pid = current->group_leader->pid; - 
alloc->vma_vm_mm = current->mm; - mmgrab(alloc->vma_vm_mm); mutex_init(&alloc->mutex); INIT_LIST_HEAD(&alloc->buffers); } @@ -1271,16 +1195,15 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc, return 0; } -static int binder_alloc_do_buffer_copy(struct binder_alloc *alloc, - bool to_buffer, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - void *ptr, - size_t bytes) +static void binder_alloc_do_buffer_copy(struct binder_alloc *alloc, + bool to_buffer, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + void *ptr, + size_t bytes) { /* All copies must be 32-bit aligned and 32-bit size */ - if (!check_buffer(alloc, buffer, buffer_offset, bytes)) - return -EINVAL; + BUG_ON(!check_buffer(alloc, buffer, buffer_offset, bytes)); while (bytes) { unsigned long size; @@ -1308,25 +1231,25 @@ static int binder_alloc_do_buffer_copy(struct binder_alloc *alloc, ptr = ptr + size; buffer_offset += size; } - return 0; } -int binder_alloc_copy_to_buffer(struct binder_alloc *alloc, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - void *src, - size_t bytes) +void binder_alloc_copy_to_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + void *src, + size_t bytes) { - return binder_alloc_do_buffer_copy(alloc, true, buffer, buffer_offset, - src, bytes); + binder_alloc_do_buffer_copy(alloc, true, buffer, buffer_offset, + src, bytes); } -int binder_alloc_copy_from_buffer(struct binder_alloc *alloc, - void *dest, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - size_t bytes) +void binder_alloc_copy_from_buffer(struct binder_alloc *alloc, + void *dest, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + size_t bytes) { - return binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset, - dest, bytes); + binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset, + dest, bytes); } + diff --git a/drivers/android/binder_alloc.h 
b/drivers/android/binder_alloc.h index a30eb98d99f4..da025cc94cd9 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -1,6 +1,15 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * */ #ifndef _LINUX_BINDER_ALLOC_H @@ -13,6 +22,11 @@ #include #include #include + +#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT +#define BINDER_IPC_32BIT 1 +#endif + #include extern struct list_lru binder_alloc_lru; @@ -26,8 +40,6 @@ struct binder_transaction; * @clear_on_free: %true if buffer must be zeroed after use * @allow_user_free: %true if user is allowed to free buffer * @async_transaction: %true if buffer is in use for an async txn - * @oneway_spam_suspect: %true if total async allocate size just exceed - * spamming detect threshold * @debug_id: unique ID for debugging * @transaction: pointer to associated struct binder_transaction * @target_node: struct binder_node associated with this buffer @@ -47,8 +59,7 @@ struct binder_buffer { unsigned clear_on_free:1; unsigned allow_user_free:1; unsigned async_transaction:1; - unsigned oneway_spam_suspect:1; - unsigned debug_id:27; + unsigned debug_id:28; struct binder_transaction *transaction; @@ -90,8 +101,6 @@ struct binder_lru_page { * @buffer_size: size of address space specified via mmap * @pid: pid for associated binder_proc (invariant after init) * @pages_high: high watermark of offset in @pages - * @oneway_spam_detected: %true if oneway spam detection fired, clear that - * flag once the async buffer has 
returned to a healthy state * * Bookkeeping structure for per-proc address space management for binder * buffers. It is normally initialized during binder_init() and binder_mmap() @@ -112,7 +121,6 @@ struct binder_alloc { uint32_t buffer_free; int pid; size_t pages_high; - bool oneway_spam_detected; }; #ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST @@ -145,8 +153,6 @@ extern void binder_alloc_print_allocated(struct seq_file *m, struct binder_alloc *alloc); void binder_alloc_print_pages(struct seq_file *m, struct binder_alloc *alloc); -extern int binder_buffer_pool_create(void); -extern void binder_buffer_pool_destroy(void); /** * binder_alloc_get_free_async_space() - get free space available for async @@ -172,16 +178,17 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc, const void __user *from, size_t bytes); -int binder_alloc_copy_to_buffer(struct binder_alloc *alloc, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - void *src, - size_t bytes); +void binder_alloc_copy_to_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + void *src, + size_t bytes); -int binder_alloc_copy_from_buffer(struct binder_alloc *alloc, - void *dest, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - size_t bytes); +void binder_alloc_copy_from_buffer(struct binder_alloc *alloc, + void *dest, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + size_t bytes); #endif /* _LINUX_BINDER_ALLOC_H */ + diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index c2b323bc3b3a..c839c490fde3 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -1,9 +1,18 @@ -// SPDX-License-Identifier: GPL-2.0-only /* binder_alloc_selftest.c * * Android IPC Subsystem * * Copyright (C) 2017 Google, Inc. 
+ * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h deleted file mode 100644 index 3b6918d8a977..000000000000 --- a/drivers/android/binder_internal.h +++ /dev/null @@ -1,603 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _LINUX_BINDER_INTERNAL_H -#define _LINUX_BINDER_INTERNAL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "binder_alloc.h" - -#define ida_alloc_max(a, b, c) ida_simple_get(a, 0, b + 1, c) -#define ida_free ida_remove - -typedef unsigned int __poll_t; -typedef __bitwise int vm_fault_t; - -struct binder_context { - struct binder_node *binder_context_mgr_node; - struct mutex context_mgr_node_lock; - kuid_t binder_context_mgr_uid; - const char *name; -}; - -/** - * struct binder_device - information about a binder device node - * @hlist: list of binder devices (only used for devices requested via - * CONFIG_ANDROID_BINDER_DEVICES) - * @miscdev: information about a binder character device node - * @context: binder context information - * @binderfs_inode: This is the inode of the root dentry of the super block - * belonging to a binderfs mount. 
- */ -struct binder_device { - struct hlist_node hlist; - struct miscdevice miscdev; - struct binder_context context; - struct inode *binderfs_inode; - refcount_t ref; -}; - -/** - * binderfs_mount_opts - mount options for binderfs - * @max: maximum number of allocatable binderfs binder devices - * @stats_mode: enable binder stats in binderfs. - */ -struct binderfs_mount_opts { - int max; - int stats_mode; -}; - -/** - * binderfs_info - information about a binderfs mount - * @ipc_ns: The ipc namespace the binderfs mount belongs to. - * @control_dentry: This records the dentry of this binderfs mount - * binder-control device. - * @root_uid: uid that needs to be used when a new binder device is - * created. - * @root_gid: gid that needs to be used when a new binder device is - * created. - * @mount_opts: The mount options in use. - * @device_count: The current number of allocated binder devices. - * @proc_log_dir: Pointer to the directory dentry containing process-specific - * logs. - */ -struct binderfs_info { - struct ipc_namespace *ipc_ns; - struct dentry *control_dentry; - kuid_t root_uid; - kgid_t root_gid; - struct binderfs_mount_opts mount_opts; - int device_count; - struct dentry *proc_log_dir; -}; - -extern const struct file_operations binder_fops; - -extern char *binder_devices_param; - -#ifdef CONFIG_ANDROID_BINDERFS -extern bool is_binderfs_device(const struct inode *inode); -extern struct dentry *binderfs_create_file(struct dentry *dir, const char *name, - const struct file_operations *fops, - void *data); -extern void binderfs_remove_file(struct dentry *dentry); -#else -static inline bool is_binderfs_device(const struct inode *inode) -{ - return false; -} -static inline struct dentry *binderfs_create_file(struct dentry *dir, - const char *name, - const struct file_operations *fops, - void *data) -{ - return NULL; -} -static inline void binderfs_remove_file(struct dentry *dentry) {} -#endif - -#ifdef CONFIG_ANDROID_BINDERFS -extern int __init 
init_binderfs(void); -#else -static inline int __init init_binderfs(void) -{ - return 0; -} -#endif - -struct binder_debugfs_entry { - const char *name; - umode_t mode; - const struct file_operations *fops; - void *data; -}; - -extern const struct binder_debugfs_entry binder_debugfs_entries[]; - -#define binder_for_each_debugfs_entry(entry) \ - for ((entry) = binder_debugfs_entries; \ - (entry)->name; \ - (entry)++) - -enum binder_stat_types { - BINDER_STAT_PROC, - BINDER_STAT_THREAD, - BINDER_STAT_NODE, - BINDER_STAT_REF, - BINDER_STAT_DEATH, - BINDER_STAT_TRANSACTION, - BINDER_STAT_TRANSACTION_COMPLETE, - BINDER_STAT_COUNT -}; - -struct binder_stats { - atomic_t br[_IOC_NR(BR_ONEWAY_SPAM_SUSPECT) + 1]; - atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1]; - atomic_t obj_created[BINDER_STAT_COUNT]; - atomic_t obj_deleted[BINDER_STAT_COUNT]; -}; - -/** - * struct binder_work - work enqueued on a worklist - * @entry: node enqueued on list - * @type: type of work to be performed - * - * There are separate work lists for proc, thread, and node (async). 
- */ -struct binder_work { - struct list_head entry; - - enum binder_work_type { - BINDER_WORK_TRANSACTION = 1, - BINDER_WORK_TRANSACTION_COMPLETE, - BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT, - BINDER_WORK_RETURN_ERROR, - BINDER_WORK_NODE, - BINDER_WORK_DEAD_BINDER, - BINDER_WORK_DEAD_BINDER_AND_CLEAR, - BINDER_WORK_CLEAR_DEATH_NOTIFICATION, - } type; -}; - -struct binder_error { - struct binder_work work; - uint32_t cmd; -}; - -/** - * struct binder_node - binder node bookkeeping - * @debug_id: unique ID for debugging - * (invariant after initialized) - * @lock: lock for node fields - * @work: worklist element for node work - * (protected by @proc->inner_lock) - * @rb_node: element for proc->nodes tree - * (protected by @proc->inner_lock) - * @dead_node: element for binder_dead_nodes list - * (protected by binder_dead_nodes_lock) - * @proc: binder_proc that owns this node - * (invariant after initialized) - * @refs: list of references on this node - * (protected by @lock) - * @internal_strong_refs: used to take strong references when - * initiating a transaction - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @local_weak_refs: weak user refs from local process - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @local_strong_refs: strong user refs from local process - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @tmp_refs: temporary kernel refs - * (protected by @proc->inner_lock while @proc - * is valid, and by binder_dead_nodes_lock - * if @proc is NULL. 
During inc/dec and node release - * it is also protected by @lock to provide safety - * as the node dies and @proc becomes NULL) - * @ptr: userspace pointer for node - * (invariant, no lock needed) - * @cookie: userspace cookie for node - * (invariant, no lock needed) - * @has_strong_ref: userspace notified of strong ref - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @pending_strong_ref: userspace has acked notification of strong ref - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @has_weak_ref: userspace notified of weak ref - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @pending_weak_ref: userspace has acked notification of weak ref - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @has_async_transaction: async transaction to node in progress - * (protected by @lock) - * @sched_policy: minimum scheduling policy for node - * (invariant after initialized) - * @accept_fds: file descriptor operations supported for node - * (invariant after initialized) - * @min_priority: minimum scheduling priority - * (invariant after initialized) - * @inherit_rt: inherit RT scheduling policy from caller - * @txn_security_ctx: require sender's security context - * (invariant after initialized) - * @async_todo: list of async work items - * (protected by @proc->inner_lock) - * - * Bookkeeping structure for binder nodes. 
- */ -struct binder_node { - int debug_id; - spinlock_t lock; - struct binder_work work; - union { - struct rb_node rb_node; - struct hlist_node dead_node; - }; - struct binder_proc *proc; - struct hlist_head refs; - int internal_strong_refs; - int local_weak_refs; - int local_strong_refs; - int tmp_refs; - binder_uintptr_t ptr; - binder_uintptr_t cookie; - struct { - /* - * bitfield elements protected by - * proc inner_lock - */ - u8 has_strong_ref:1; - u8 pending_strong_ref:1; - u8 has_weak_ref:1; - u8 pending_weak_ref:1; - }; - struct { - /* - * invariant after initialization - */ - u8 sched_policy:2; - u8 inherit_rt:1; - u8 accept_fds:1; - u8 txn_security_ctx:1; - u8 min_priority; - }; - bool has_async_transaction; - struct list_head async_todo; -}; - -struct binder_ref_death { - /** - * @work: worklist element for death notifications - * (protected by inner_lock of the proc that - * this ref belongs to) - */ - struct binder_work work; - binder_uintptr_t cookie; -}; - -/** - * struct binder_ref_data - binder_ref counts and id - * @debug_id: unique ID for the ref - * @desc: unique userspace handle for ref - * @strong: strong ref count (debugging only if not locked) - * @weak: weak ref count (debugging only if not locked) - * - * Structure to hold ref count and ref id information. Since - * the actual ref can only be accessed with a lock, this structure - * is used to return information about the ref to callers of - * ref inc/dec functions. - */ -struct binder_ref_data { - int debug_id; - uint32_t desc; - int strong; - int weak; -}; - -/** - * struct binder_ref - struct to track references on nodes - * @data: binder_ref_data containing id, handle, and current refcounts - * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree - * @rb_node_node: node for lookup by @node in proc's rb_tree - * @node_entry: list entry for node->refs list in target node - * (protected by @node->lock) - * @proc: binder_proc containing ref - * @node: binder_node of target node. 
When cleaning up a - * ref for deletion in binder_cleanup_ref, a non-NULL - * @node indicates the node must be freed - * @death: pointer to death notification (ref_death) if requested - * (protected by @node->lock) - * - * Structure to track references from procA to target node (on procB). This - * structure is unsafe to access without holding @proc->outer_lock. - */ -struct binder_ref { - /* Lookups needed: */ - /* node + proc => ref (transaction) */ - /* desc + proc => ref (transaction, inc/dec ref) */ - /* node => refs + procs (proc exit) */ - struct binder_ref_data data; - struct rb_node rb_node_desc; - struct rb_node rb_node_node; - struct hlist_node node_entry; - struct binder_proc *proc; - struct binder_node *node; - struct binder_ref_death *death; -}; - -/** - * struct binder_priority - scheduler policy and priority - * @sched_policy scheduler policy - * @prio [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT - * - * The binder driver supports inheriting the following scheduler policies: - * SCHED_NORMAL - * SCHED_BATCH - * SCHED_FIFO - * SCHED_RR - */ -struct binder_priority { - unsigned int sched_policy; - int prio; -}; - -enum binder_prio_state { - BINDER_PRIO_SET, /* desired priority set */ - BINDER_PRIO_PENDING, /* initiated a saved priority restore */ - BINDER_PRIO_ABORT, /* abort the pending priority restore */ -}; - -/** - * struct binder_proc - binder process bookkeeping - * @proc_node: element for binder_procs list - * @threads: rbtree of binder_threads in this proc - * (protected by @inner_lock) - * @nodes: rbtree of binder nodes associated with - * this proc ordered by node->ptr - * (protected by @inner_lock) - * @refs_by_desc: rbtree of refs ordered by ref->desc - * (protected by @outer_lock) - * @refs_by_node: rbtree of refs ordered by ref->node - * (protected by @outer_lock) - * @waiting_threads: threads currently waiting for proc work - * (protected by @inner_lock) - * @pid PID of group_leader of process - * (invariant after initialized) - * 
@tsk task_struct for group_leader of process - * (invariant after initialized) - * @cred struct cred associated with the `struct file` - * in binder_open() - * (invariant after initialized) - * @deferred_work_node: element for binder_deferred_list - * (protected by binder_deferred_lock) - * @deferred_work: bitmap of deferred work to perform - * (protected by binder_deferred_lock) - * @outstanding_txns: number of transactions to be transmitted before - * processes in freeze_wait are woken up - * (protected by @inner_lock) - * @is_dead: process is dead and awaiting free - * when outstanding transactions are cleaned up - * (protected by @inner_lock) - * @is_frozen: process is frozen and unable to service - * binder transactions - * (protected by @inner_lock) - * @sync_recv: process received sync transactions since last frozen - * bit 0: received sync transaction after being frozen - * bit 1: new pending sync transaction during freezing - * (protected by @inner_lock) - * @async_recv: process received async transactions since last frozen - * (protected by @inner_lock) - * @freeze_wait: waitqueue of processes waiting for all outstanding - * transactions to be processed - * (protected by @inner_lock) - * @todo: list of work for this process - * (protected by @inner_lock) - * @stats: per-process binder statistics - * (atomics, no lock needed) - * @delivered_death: list of delivered death notification - * (protected by @inner_lock) - * @max_threads: cap on number of binder threads - * (protected by @inner_lock) - * @requested_threads: number of binder threads requested but not - * yet started. In current implementation, can - * only be 0 or 1. 
- * (protected by @inner_lock) - * @requested_threads_started: number binder threads started - * (protected by @inner_lock) - * @tmp_ref: temporary reference to indicate proc is in use - * (protected by @inner_lock) - * @default_priority: default scheduler priority - * (invariant after initialized) - * @debugfs_entry: debugfs node - * @alloc: binder allocator bookkeeping - * @context: binder_context for this proc - * (invariant after initialized) - * @inner_lock: can nest under outer_lock and/or node lock - * @outer_lock: no nesting under innor or node lock - * Lock order: 1) outer, 2) node, 3) inner - * @binderfs_entry: process-specific binderfs log file - * @oneway_spam_detection_enabled: process enabled oneway spam detection - * or not - * - * Bookkeeping structure for binder processes - */ -struct binder_proc { - struct hlist_node proc_node; - struct rb_root threads; - struct rb_root nodes; - struct rb_root refs_by_desc; - struct rb_root refs_by_node; - struct list_head waiting_threads; - int pid; - struct task_struct *tsk; - const struct cred *cred; - struct hlist_node deferred_work_node; - int deferred_work; - int outstanding_txns; - bool is_dead; - bool is_frozen; - bool sync_recv; - bool async_recv; - wait_queue_head_t freeze_wait; - - struct list_head todo; - struct binder_stats stats; - struct list_head delivered_death; - int max_threads; - int requested_threads; - int requested_threads_started; - int tmp_ref; - struct binder_priority default_priority; - struct dentry *debugfs_entry; - struct binder_alloc alloc; - struct binder_context *context; - spinlock_t inner_lock; - spinlock_t outer_lock; - struct dentry *binderfs_entry; - bool oneway_spam_detection_enabled; -}; - -/** - * struct binder_thread - binder thread bookkeeping - * @proc: binder process for this thread - * (invariant after initialization) - * @rb_node: element for proc->threads rbtree - * (protected by @proc->inner_lock) - * @waiting_thread_node: element for @proc->waiting_threads list - * 
(protected by @proc->inner_lock) - * @pid: PID for this thread - * (invariant after initialization) - * @looper: bitmap of looping state - * (only accessed by this thread) - * @looper_needs_return: looping thread needs to exit driver - * (no lock needed) - * @transaction_stack: stack of in-progress transactions for this thread - * (protected by @proc->inner_lock) - * @todo: list of work to do for this thread - * (protected by @proc->inner_lock) - * @process_todo: whether work in @todo should be processed - * (protected by @proc->inner_lock) - * @return_error: transaction errors reported by this thread - * (only accessed by this thread) - * @reply_error: transaction errors reported by target thread - * (protected by @proc->inner_lock) - * @wait: wait queue for thread work - * @stats: per-thread statistics - * (atomics, no lock needed) - * @tmp_ref: temporary reference to indicate thread is in use - * (atomic since @proc->inner_lock cannot - * always be acquired) - * @is_dead: thread is dead and awaiting free - * when outstanding transactions are cleaned up - * (protected by @proc->inner_lock) - * @task: struct task_struct for this thread - * @prio_lock: protects thread priority fields - * @prio_next: saved priority to be restored next - * (protected by @prio_lock) - * @prio_state: state of the priority restore process as - * defined by enum binder_prio_state - * (protected by @prio_lock) - * - * Bookkeeping structure for binder threads. 
- */ -struct binder_thread { - struct binder_proc *proc; - struct rb_node rb_node; - struct list_head waiting_thread_node; - int pid; - int looper; /* only modified by this thread */ - bool looper_need_return; /* can be written by other thread */ - struct binder_transaction *transaction_stack; - struct list_head todo; - bool process_todo; - struct binder_error return_error; - struct binder_error reply_error; - wait_queue_head_t wait; - struct binder_stats stats; - atomic_t tmp_ref; - bool is_dead; - struct task_struct *task; - spinlock_t prio_lock; - struct binder_priority prio_next; - enum binder_prio_state prio_state; -}; - -/** - * struct binder_txn_fd_fixup - transaction fd fixup list element - * @fixup_entry: list entry - * @file: struct file to be associated with new fd - * @offset: offset in buffer data to this fixup - * - * List element for fd fixups in a transaction. Since file - * descriptors need to be allocated in the context of the - * target process, we pass each fd to be processed in this - * struct. 
- */ -struct binder_txn_fd_fixup { - struct list_head fixup_entry; - struct file *file; - size_t offset; -}; - -struct binder_transaction { - int debug_id; - struct binder_work work; - struct binder_thread *from; - struct binder_transaction *from_parent; - struct binder_proc *to_proc; - struct binder_thread *to_thread; - struct binder_transaction *to_parent; - unsigned need_reply:1; - /* unsigned is_dead:1; */ /* not used at the moment */ - - struct binder_buffer *buffer; - unsigned int code; - unsigned int flags; - struct binder_priority priority; - struct binder_priority saved_priority; - bool set_priority_called; - bool is_nested; - kuid_t sender_euid; - struct list_head fd_fixups; - binder_uintptr_t security_ctx; - /** - * @lock: protects @from, @to_proc, and @to_thread - * - * @from, @to_proc, and @to_thread can be set to NULL - * during thread teardown - */ - spinlock_t lock; -}; - -/** - * struct binder_object - union of flat binder object types - * @hdr: generic object header - * @fbo: binder object (nodes and refs) - * @fdo: file descriptor object - * @bbo: binder buffer pointer - * @fdao: file descriptor array - * - * Used for type-independent object copies - */ -struct binder_object { - union { - struct binder_object_header hdr; - struct flat_binder_object fbo; - struct binder_fd_object fdo; - struct binder_buffer_object bbo; - struct binder_fd_array_object fdao; - }; -}; - -#endif /* _LINUX_BINDER_INTERNAL_H */ diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 5d82cf8af88b..7674231af8cb 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -1,6 +1,15 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * */ #undef TRACE_SYSTEM @@ -119,35 +128,6 @@ TRACE_EVENT(binder_wait_for_work, __entry->thread_todo) ); -TRACE_EVENT(binder_txn_latency_free, - TP_PROTO(struct binder_transaction *t, - int from_proc, int from_thread, - int to_proc, int to_thread), - TP_ARGS(t, from_proc, from_thread, to_proc, to_thread), - TP_STRUCT__entry( - __field(int, debug_id) - __field(int, from_proc) - __field(int, from_thread) - __field(int, to_proc) - __field(int, to_thread) - __field(unsigned int, code) - __field(unsigned int, flags) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - __entry->from_proc = from_proc; - __entry->from_thread = from_thread; - __entry->to_proc = to_proc; - __entry->to_thread = to_thread; - __entry->code = t->code; - __entry->flags = t->flags; - ), - TP_printk("transaction=%d from %d:%d to %d:%d flags=0x%x code=0x%x", - __entry->debug_id, __entry->from_proc, __entry->from_thread, - __entry->to_proc, __entry->to_thread, __entry->code, - __entry->flags) -); - TRACE_EVENT(binder_transaction, TP_PROTO(bool reply, struct binder_transaction *t, struct binder_node *target_node), @@ -267,40 +247,22 @@ TRACE_EVENT(binder_transaction_ref_to_ref, __entry->dest_ref_debug_id, __entry->dest_ref_desc) ); -TRACE_EVENT(binder_transaction_fd_send, - TP_PROTO(struct binder_transaction *t, int fd, size_t offset), - TP_ARGS(t, fd, offset), +TRACE_EVENT(binder_transaction_fd, + TP_PROTO(struct binder_transaction *t, int src_fd, int dest_fd), + TP_ARGS(t, src_fd, dest_fd), TP_STRUCT__entry( __field(int, debug_id) - __field(int, fd) - __field(size_t, offset) + __field(int, src_fd) + __field(int, dest_fd) ), TP_fast_assign( __entry->debug_id = t->debug_id; - __entry->fd = fd; - __entry->offset = offset; + __entry->src_fd = 
src_fd; + __entry->dest_fd = dest_fd; ), - TP_printk("transaction=%d src_fd=%d offset=%zu", - __entry->debug_id, __entry->fd, __entry->offset) -); - -TRACE_EVENT(binder_transaction_fd_recv, - TP_PROTO(struct binder_transaction *t, int fd, size_t offset), - TP_ARGS(t, fd, offset), - - TP_STRUCT__entry( - __field(int, debug_id) - __field(int, fd) - __field(size_t, offset) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - __entry->fd = fd; - __entry->offset = offset; - ), - TP_printk("transaction=%d dest_fd=%d offset=%zu", - __entry->debug_id, __entry->fd, __entry->offset) + TP_printk("transaction=%d src_fd=%d ==> dest_fd=%d", + __entry->debug_id, __entry->src_fd, __entry->dest_fd) ); DECLARE_EVENT_CLASS(binder_buffer_class, @@ -310,17 +272,14 @@ DECLARE_EVENT_CLASS(binder_buffer_class, __field(int, debug_id) __field(size_t, data_size) __field(size_t, offsets_size) - __field(size_t, extra_buffers_size) ), TP_fast_assign( __entry->debug_id = buf->debug_id; __entry->data_size = buf->data_size; __entry->offsets_size = buf->offsets_size; - __entry->extra_buffers_size = buf->extra_buffers_size; ), - TP_printk("transaction=%d data_size=%zd offsets_size=%zd extra_buffers_size=%zd", - __entry->debug_id, __entry->data_size, __entry->offsets_size, - __entry->extra_buffers_size) + TP_printk("transaction=%d data_size=%zd offsets_size=%zd", + __entry->debug_id, __entry->data_size, __entry->offsets_size) ); DEFINE_EVENT(binder_buffer_class, binder_transaction_alloc_buf, @@ -335,10 +294,6 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release, TP_PROTO(struct binder_buffer *buffer), TP_ARGS(buffer)); -DEFINE_EVENT(binder_buffer_class, binder_transaction_update_buffer_release, - TP_PROTO(struct binder_buffer *buffer), - TP_ARGS(buffer)); - TRACE_EVENT(binder_update_page_range, TP_PROTO(struct binder_alloc *alloc, bool allocate, void __user *start, void __user *end), diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c deleted file mode 
100644 index f80d1fb9d9b2..000000000000 --- a/drivers/android/binderfs.c +++ /dev/null @@ -1,819 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "binder_internal.h" - -#define ida_alloc_max(a, b, c) ida_simple_get(a, 0, b + 1, c) -#define ida_free ida_remove - -#define FIRST_INODE 1 -#define SECOND_INODE 2 -#define INODE_OFFSET 3 -#define INTSTRLEN 21 -#define BINDERFS_MAX_MINOR (1U << MINORBITS) -/* Ensure that the initial ipc namespace always has devices available. */ -#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4) - -static dev_t binderfs_dev; -static DEFINE_MUTEX(binderfs_minors_mutex); -static DEFINE_IDA(binderfs_minors); - -enum binderfs_param { - Opt_max, - Opt_stats_mode, -}; - -enum binderfs_stats_mode { - binderfs_stats_mode_unset, - binderfs_stats_mode_global, -}; - -struct binder_features { - bool oneway_spam_detection; -}; - -static const struct constant_table binderfs_param_stats[] = { - { "global", binderfs_stats_mode_global }, - {} -}; - -static const struct fs_parameter_spec binderfs_fs_parameters[] = { - fsparam_u32("max", Opt_max), - fsparam_enum("stats", Opt_stats_mode, binderfs_param_stats), - {} -}; - -static struct binder_features binder_features = { - .oneway_spam_detection = true, -}; - -static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb) -{ - return sb->s_fs_info; -} - -bool is_binderfs_device(const struct inode *inode) -{ - if (inode->i_sb->s_magic == BINDERFS_SUPER_MAGIC) - return true; - - return false; -} - -/** - * binderfs_binder_device_create - allocate inode from super block of a - * binderfs mount - * @ref_inode: inode from wich the super block will be taken - * 
@userp: buffer to copy information about new device for userspace to - * @req: struct binderfs_device as copied from userspace - * - * This function allocates a new binder_device and reserves a new minor - * number for it. - * Minor numbers are limited and tracked globally in binderfs_minors. The - * function will stash a struct binder_device for the specific binder - * device in i_private of the inode. - * It will go on to allocate a new inode from the super block of the - * filesystem mount, stash a struct binder_device in its i_private field - * and attach a dentry to that inode. - * - * Return: 0 on success, negative errno on failure - */ -static int binderfs_binder_device_create(struct inode *ref_inode, - struct binderfs_device __user *userp, - struct binderfs_device *req) -{ - int minor, ret; - struct dentry *dentry, *root; - struct binder_device *device; - char *name = NULL; - size_t name_len; - struct inode *inode = NULL; - struct super_block *sb = ref_inode->i_sb; - struct binderfs_info *info = sb->s_fs_info; -#if defined(CONFIG_IPC_NS) - bool use_reserve = (info->ipc_ns == &init_ipc_ns); -#else - bool use_reserve = true; -#endif - - /* Reserve new minor number for the new device. */ - mutex_lock(&binderfs_minors_mutex); - if (++info->device_count <= info->mount_opts.max) - minor = ida_alloc_max(&binderfs_minors, - use_reserve ? 
BINDERFS_MAX_MINOR : - BINDERFS_MAX_MINOR_CAPPED, - GFP_KERNEL); - else - minor = -ENOSPC; - if (minor < 0) { - --info->device_count; - mutex_unlock(&binderfs_minors_mutex); - return minor; - } - mutex_unlock(&binderfs_minors_mutex); - - ret = -ENOMEM; - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - goto err; - - inode = new_inode(sb); - if (!inode) - goto err; - - inode->i_ino = minor + INODE_OFFSET; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - init_special_inode(inode, S_IFCHR | 0600, - MKDEV(MAJOR(binderfs_dev), minor)); - inode->i_fop = &binder_fops; - inode->i_uid = info->root_uid; - inode->i_gid = info->root_gid; - - req->name[BINDERFS_MAX_NAME] = '\0'; /* NUL-terminate */ - name_len = strlen(req->name); - /* Make sure to include terminating NUL byte */ - name = kmemdup(req->name, name_len + 1, GFP_KERNEL); - if (!name) - goto err; - - refcount_set(&device->ref, 1); - device->binderfs_inode = inode; - device->context.binder_context_mgr_uid = INVALID_UID; - device->context.name = name; - device->miscdev.name = name; - device->miscdev.minor = minor; - mutex_init(&device->context.context_mgr_node_lock); - - req->major = MAJOR(binderfs_dev); - req->minor = minor; - - if (userp && copy_to_user(userp, req, sizeof(*req))) { - ret = -EFAULT; - goto err; - } - - root = sb->s_root; - inode_lock(d_inode(root)); - - /* look it up */ - dentry = lookup_one_len(name, root, name_len); - if (IS_ERR(dentry)) { - inode_unlock(d_inode(root)); - ret = PTR_ERR(dentry); - goto err; - } - - if (d_really_is_positive(dentry)) { - /* already exists */ - dput(dentry); - inode_unlock(d_inode(root)); - ret = -EEXIST; - goto err; - } - - inode->i_private = device; - d_instantiate(dentry, inode); - fsnotify_create(root->d_inode, dentry); - inode_unlock(d_inode(root)); - - return 0; - -err: - kfree(name); - kfree(device); - mutex_lock(&binderfs_minors_mutex); - --info->device_count; - ida_free(&binderfs_minors, minor); - 
mutex_unlock(&binderfs_minors_mutex); - iput(inode); - - return ret; -} - -/** - * binderfs_ctl_ioctl - handle binder device node allocation requests - * - * The request handler for the binder-control device. All requests operate on - * the binderfs mount the binder-control device resides in: - * - BINDER_CTL_ADD - * Allocate a new binder device. - * - * Return: 0 on success, negative errno on failure - */ -static long binder_ctl_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret = -EINVAL; - struct inode *inode = file_inode(file); - struct binderfs_device __user *device = (struct binderfs_device __user *)arg; - struct binderfs_device device_req; - - switch (cmd) { - case BINDER_CTL_ADD: - ret = copy_from_user(&device_req, device, sizeof(device_req)); - if (ret) { - ret = -EFAULT; - break; - } - - ret = binderfs_binder_device_create(inode, device, &device_req); - break; - default: - break; - } - - return ret; -} - -static void binderfs_evict_inode(struct inode *inode) -{ - struct binder_device *device = inode->i_private; - struct binderfs_info *info = BINDERFS_SB(inode->i_sb); - - clear_inode(inode); - - if (!S_ISCHR(inode->i_mode) || !device) - return; - - mutex_lock(&binderfs_minors_mutex); - --info->device_count; - ida_free(&binderfs_minors, device->miscdev.minor); - mutex_unlock(&binderfs_minors_mutex); - - if (refcount_dec_and_test(&device->ref)) { - kfree(device->context.name); - kfree(device); - } -} - -static int binderfs_fs_context_parse_param(struct fs_context *fc, - struct fs_parameter *param) -{ - int opt; - struct binderfs_mount_opts *ctx = fc->fs_private; - struct fs_parse_result result; - - opt = fs_parse(fc, binderfs_fs_parameters, param, &result); - if (opt < 0) - return opt; - - switch (opt) { - case Opt_max: - if (result.uint_32 > BINDERFS_MAX_MINOR) - return invalfc(fc, "Bad value for '%s'", param->key); - - ctx->max = result.uint_32; - break; - case Opt_stats_mode: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - 
ctx->stats_mode = result.uint_32; - break; - default: - return invalfc(fc, "Unsupported parameter '%s'", param->key); - } - - return 0; -} - -static int binderfs_fs_context_reconfigure(struct fs_context *fc) -{ - struct binderfs_mount_opts *ctx = fc->fs_private; - struct binderfs_info *info = BINDERFS_SB(fc->root->d_sb); - - if (info->mount_opts.stats_mode != ctx->stats_mode) - return invalfc(fc, "Binderfs stats mode cannot be changed during a remount"); - - info->mount_opts.stats_mode = ctx->stats_mode; - info->mount_opts.max = ctx->max; - return 0; -} - -static int binderfs_show_options(struct seq_file *seq, struct dentry *root) -{ - struct binderfs_info *info = BINDERFS_SB(root->d_sb); - - if (info->mount_opts.max <= BINDERFS_MAX_MINOR) - seq_printf(seq, ",max=%d", info->mount_opts.max); - - switch (info->mount_opts.stats_mode) { - case binderfs_stats_mode_unset: - break; - case binderfs_stats_mode_global: - seq_printf(seq, ",stats=global"); - break; - } - - return 0; -} - -static void binderfs_put_super(struct super_block *sb) -{ - struct binderfs_info *info = sb->s_fs_info; - - if (info && info->ipc_ns) - put_ipc_ns(info->ipc_ns); - - kfree(info); - sb->s_fs_info = NULL; -} - -static const struct super_operations binderfs_super_ops = { - .evict_inode = binderfs_evict_inode, - .show_options = binderfs_show_options, - .statfs = simple_statfs, - .put_super = binderfs_put_super, -}; - -static inline bool is_binderfs_control_device(const struct dentry *dentry) -{ - struct binderfs_info *info = dentry->d_sb->s_fs_info; - - return info->control_dentry == dentry; -} - -static int binderfs_rename(struct user_namespace *mnt_userns, - struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) -{ - if (is_binderfs_control_device(old_dentry) || - is_binderfs_control_device(new_dentry)) - return -EPERM; - - return simple_rename(&init_user_ns, old_dir, old_dentry, new_dir, - new_dentry, flags); -} - -static int 
binderfs_unlink(struct inode *dir, struct dentry *dentry) -{ - if (is_binderfs_control_device(dentry)) - return -EPERM; - - return simple_unlink(dir, dentry); -} - -static const struct file_operations binder_ctl_fops = { - .owner = THIS_MODULE, - .open = nonseekable_open, - .unlocked_ioctl = binder_ctl_ioctl, - .compat_ioctl = binder_ctl_ioctl, - .llseek = noop_llseek, -}; - -/** - * binderfs_binder_ctl_create - create a new binder-control device - * @sb: super block of the binderfs mount - * - * This function creates a new binder-control device node in the binderfs mount - * referred to by @sb. - * - * Return: 0 on success, negative errno on failure - */ -static int binderfs_binder_ctl_create(struct super_block *sb) -{ - int minor, ret; - struct dentry *dentry; - struct binder_device *device; - struct inode *inode = NULL; - struct dentry *root = sb->s_root; - struct binderfs_info *info = sb->s_fs_info; -#if defined(CONFIG_IPC_NS) - bool use_reserve = (info->ipc_ns == &init_ipc_ns); -#else - bool use_reserve = true; -#endif - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - return -ENOMEM; - - /* If we have already created a binder-control node, return. */ - if (info->control_dentry) { - ret = 0; - goto out; - } - - ret = -ENOMEM; - inode = new_inode(sb); - if (!inode) - goto out; - - /* Reserve a new minor number for the new device. */ - mutex_lock(&binderfs_minors_mutex); - minor = ida_alloc_max(&binderfs_minors, - use_reserve ? 
BINDERFS_MAX_MINOR : - BINDERFS_MAX_MINOR_CAPPED, - GFP_KERNEL); - mutex_unlock(&binderfs_minors_mutex); - if (minor < 0) { - ret = minor; - goto out; - } - - inode->i_ino = SECOND_INODE; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - init_special_inode(inode, S_IFCHR | 0600, - MKDEV(MAJOR(binderfs_dev), minor)); - inode->i_fop = &binder_ctl_fops; - inode->i_uid = info->root_uid; - inode->i_gid = info->root_gid; - - refcount_set(&device->ref, 1); - device->binderfs_inode = inode; - device->miscdev.minor = minor; - - dentry = d_alloc_name(root, "binder-control"); - if (!dentry) - goto out; - - inode->i_private = device; - info->control_dentry = dentry; - d_add(dentry, inode); - - return 0; - -out: - kfree(device); - iput(inode); - - return ret; -} - -static const struct inode_operations binderfs_dir_inode_operations = { - .lookup = simple_lookup, - .rename = binderfs_rename, - .unlink = binderfs_unlink, -}; - -static struct inode *binderfs_make_inode(struct super_block *sb, int mode) -{ - struct inode *ret; - - ret = new_inode(sb); - if (ret) { - ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET); - ret->i_mode = mode; - ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret); - } - return ret; -} - -static struct dentry *binderfs_create_dentry(struct dentry *parent, - const char *name) -{ - struct dentry *dentry; - - dentry = lookup_one_len(name, parent, strlen(name)); - if (IS_ERR(dentry)) - return dentry; - - /* Return error if the file/dir already exists. 
*/ - if (d_really_is_positive(dentry)) { - dput(dentry); - return ERR_PTR(-EEXIST); - } - - return dentry; -} - -void binderfs_remove_file(struct dentry *dentry) -{ - struct inode *parent_inode; - - parent_inode = d_inode(dentry->d_parent); - inode_lock(parent_inode); - if (simple_positive(dentry)) { - dget(dentry); - simple_unlink(parent_inode, dentry); - d_delete(dentry); - dput(dentry); - } - inode_unlock(parent_inode); -} - -struct dentry *binderfs_create_file(struct dentry *parent, const char *name, - const struct file_operations *fops, - void *data) -{ - struct dentry *dentry; - struct inode *new_inode, *parent_inode; - struct super_block *sb; - - parent_inode = d_inode(parent); - inode_lock(parent_inode); - - dentry = binderfs_create_dentry(parent, name); - if (IS_ERR(dentry)) - goto out; - - sb = parent_inode->i_sb; - new_inode = binderfs_make_inode(sb, S_IFREG | 0444); - if (!new_inode) { - dput(dentry); - dentry = ERR_PTR(-ENOMEM); - goto out; - } - - new_inode->i_fop = fops; - new_inode->i_private = data; - d_instantiate(dentry, new_inode); - fsnotify_create(parent_inode, dentry); - -out: - inode_unlock(parent_inode); - return dentry; -} - -static struct dentry *binderfs_create_dir(struct dentry *parent, - const char *name) -{ - struct dentry *dentry; - struct inode *new_inode, *parent_inode; - struct super_block *sb; - - parent_inode = d_inode(parent); - inode_lock(parent_inode); - - dentry = binderfs_create_dentry(parent, name); - if (IS_ERR(dentry)) - goto out; - - sb = parent_inode->i_sb; - new_inode = binderfs_make_inode(sb, S_IFDIR | 0755); - if (!new_inode) { - dput(dentry); - dentry = ERR_PTR(-ENOMEM); - goto out; - } - - new_inode->i_fop = &simple_dir_operations; - new_inode->i_op = &simple_dir_inode_operations; - - set_nlink(new_inode, 2); - d_instantiate(dentry, new_inode); - inc_nlink(parent_inode); - fsnotify_mkdir(parent_inode, dentry); - -out: - inode_unlock(parent_inode); - return dentry; -} - -static int binder_features_show(struct 
seq_file *m, void *unused) -{ - bool *feature = m->private; - - seq_printf(m, "%d\n", *feature); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(binder_features); - -static int init_binder_features(struct super_block *sb) -{ - struct dentry *dentry, *dir; - - dir = binderfs_create_dir(sb->s_root, "features"); - if (IS_ERR(dir)) - return PTR_ERR(dir); - - dentry = binderfs_create_file(dir, "oneway_spam_detection", - &binder_features_fops, - &binder_features.oneway_spam_detection); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - return 0; -} - -static int init_binder_logs(struct super_block *sb) -{ - struct dentry *binder_logs_root_dir, *dentry, *proc_log_dir; - const struct binder_debugfs_entry *db_entry; - struct binderfs_info *info; - int ret = 0; - - binder_logs_root_dir = binderfs_create_dir(sb->s_root, - "binder_logs"); - if (IS_ERR(binder_logs_root_dir)) { - ret = PTR_ERR(binder_logs_root_dir); - goto out; - } - - binder_for_each_debugfs_entry(db_entry) { - dentry = binderfs_create_file(binder_logs_root_dir, - db_entry->name, - db_entry->fops, - db_entry->data); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - goto out; - } - } - - proc_log_dir = binderfs_create_dir(binder_logs_root_dir, "proc"); - if (IS_ERR(proc_log_dir)) { - ret = PTR_ERR(proc_log_dir); - goto out; - } - info = sb->s_fs_info; - info->proc_log_dir = proc_log_dir; - -out: - return ret; -} - -static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc) -{ - int ret; - struct binderfs_info *info; - struct binderfs_mount_opts *ctx = fc->fs_private; - struct inode *inode = NULL; - struct binderfs_device device_info = {}; - const char *name; - size_t len; - - sb->s_blocksize = PAGE_SIZE; - sb->s_blocksize_bits = PAGE_SHIFT; - - /* - * The binderfs filesystem can be mounted by userns root in a - * non-initial userns. 
By default such mounts have the MS_NODEV flag - * set in s_iflags to prevent security issues where userns root can - * just create random device nodes via mknod() since it owns the - * filesystem mount. But binderfs does not allow to create any files - * including devices nodes. The only way to create binder devices nodes - * is through the binder-control device which userns root is explicitly - * allowed to do. So removing the MS_NODEV flag from s_iflags is both - * necessary and safe. - */ - sb->s_iflags &= ~MS_NODEV; - sb->s_iflags |= SB_I_NOEXEC; - sb->s_magic = BINDERFS_SUPER_MAGIC; - sb->s_op = &binderfs_super_ops; - sb->s_time_gran = 1; - - sb->s_fs_info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL); - if (!sb->s_fs_info) - return -ENOMEM; - info = sb->s_fs_info; - - info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); - - info->root_gid = make_kgid(sb->s_user_ns, 0); - if (!gid_valid(info->root_gid)) - info->root_gid = GLOBAL_ROOT_GID; - info->root_uid = make_kuid(sb->s_user_ns, 0); - if (!uid_valid(info->root_uid)) - info->root_uid = GLOBAL_ROOT_UID; - info->mount_opts.max = ctx->max; - info->mount_opts.stats_mode = ctx->stats_mode; - - inode = new_inode(sb); - if (!inode) - return -ENOMEM; - - inode->i_ino = FIRST_INODE; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | 0755; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - inode->i_op = &binderfs_dir_inode_operations; - set_nlink(inode, 2); - - sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; - - ret = binderfs_binder_ctl_create(sb); - if (ret) - return ret; - - name = binder_devices_param; - for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { - strscpy(device_info.name, name, len + 1); - ret = binderfs_binder_device_create(inode, NULL, &device_info); - if (ret) - return ret; - name += len; - if (*name == ',') - name++; - } - - ret = init_binder_features(sb); - if (ret) - return ret; - - if 
(info->mount_opts.stats_mode == binderfs_stats_mode_global) - return init_binder_logs(sb); - - return 0; -} - -static int binderfs_fs_context_get_tree(struct fs_context *fc) -{ - return get_tree_nodev(fc, binderfs_fill_super); -} - -static void binderfs_fs_context_free(struct fs_context *fc) -{ - struct binderfs_mount_opts *ctx = fc->fs_private; - - kfree(ctx); -} - -static const struct fs_context_operations binderfs_fs_context_ops = { - .free = binderfs_fs_context_free, - .get_tree = binderfs_fs_context_get_tree, - .parse_param = binderfs_fs_context_parse_param, - .reconfigure = binderfs_fs_context_reconfigure, -}; - -static int binderfs_init_fs_context(struct fs_context *fc) -{ - struct binderfs_mount_opts *ctx; - - ctx = kzalloc(sizeof(struct binderfs_mount_opts), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->max = BINDERFS_MAX_MINOR; - ctx->stats_mode = binderfs_stats_mode_unset; - - fc->fs_private = ctx; - fc->ops = &binderfs_fs_context_ops; - - return 0; -} - -static struct file_system_type binder_fs_type = { - .name = "binder", - .init_fs_context = binderfs_init_fs_context, - .parameters = binderfs_fs_parameters, - .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, -}; - -int __init init_binderfs(void) -{ - int ret; - const char *name; - size_t len; - - /* Verify that the default binderfs device names are valid. */ - name = binder_devices_param; - for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { - if (len > BINDERFS_MAX_NAME) - return -E2BIG; - name += len; - if (*name == ',') - name++; - } - - /* Allocate new major number for binderfs. 
*/ - ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR, - "binder"); - if (ret) - return ret; - - ret = register_filesystem(&binder_fs_type); - if (ret) { - unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR); - return ret; - } - - return ret; -} diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c deleted file mode 100644 index fd718ab02392..000000000000 --- a/drivers/android/vendor_hooks.c +++ /dev/null @@ -1,433 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* vendor_hook.c - * - * Android Vendor Hook Support - * - * Copyright 2020 Google LLC - */ - -#ifndef __GENKSYMS__ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include <../fs/mount.h> -#include <../kernel/audit.h> -#include <../kernel/locking/mutex.h> -#include <../net/can/af_can.h> -#include <../net/tipc/bearer.h> -#include <../kernel/printk/printk_ringbuffer.h> -#endif - -#define CREATE_TRACE_POINTS -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef __GENKSYMS__ -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef __GENKSYMS__ -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -/* - * Export tracepoints that act as a bare tracehook (ie: have no trace event - * associated with them) to allow external modules to probe them. 
- */ -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_refrigerator); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_alloc); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_free); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_alloc); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_free); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_arch_set_freq_scale); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_is_fpsimd_save); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_transaction_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_priority_skip); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_set_priority); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_restore_priority); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wakeup_ilocked); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_send_sig_info); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_futex_plist_add); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_sleep_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_futex); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wait_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wait_end); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_traverse_plist); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_this); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_up_q_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_process_killed); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_killed_process); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_can_spin_on_owner); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_start); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_opt_spin_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_opt_spin_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_can_spin_on_owner); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sched_show_task); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_enter); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_exit); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mpam_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_resume); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_suspend); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_wq_lockup_pool); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ipi_stop); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sysrq_crash); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_hotplug); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller_id); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_ext_header); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_gic_v3_set_affinity); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_set_affinity); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_v3_affinity_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_suspend_epoch_val); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_resume_epoch_val); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_table_limits); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_resolve_freq); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_fast_switch); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_target); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_offline); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_skip_swapcache_flags); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_gfp_zone_flags); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_readahead_gfp_mask); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_readahead_gfp_mask); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_rmqueue_bulk); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_disable); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_enable); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_disable); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_enable); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_attach); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_can_attach); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_online); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_enter); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_exit); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_size_check); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_format_check); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_dump_buffer); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_fill_prdt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_complete_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_reprogram_all_keys); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_prepare_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sysfs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_compl_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_set_task); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_syscall_prctl_finished); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_uic_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_tm_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_check_int_errors); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sdev); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_clock_scaling); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_use_mcq_hooks); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_max_tag); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_map_tag); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_set_sqid); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_handler); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_make_hba_operational); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_hba_capabilities); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_print_trs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_send_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_config); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_has_oustanding_reqs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_get_outstanding_reqs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_abort); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_clear_cmd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_clear_pending); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_retry_complete); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_attach); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_setup_dma_ops); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_alloc_iova); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_free_iova); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ptype_head); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kfree_skb); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_timer_calc_index); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_allow_domain_state); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_enter); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_exit); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cgroup_force_kthread_migration); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wait_for_work); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction_entry); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_select_worklist_ilocked); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sync_txn_recvd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_topology_flags_workfn); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpufreq_transition); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_add_request); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_update_request); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_remove_request); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_balance_anon_file_reclaim); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_show_max_freq); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_handle_failed_page_trylock); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_clear); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_get_result); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_page_trylock); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_referenced_check_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drain_all_pages_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_drain_all_pages_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pcplist_add_cma_pages_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_slab_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_insert); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_delete); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_replace); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_lookup); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_commit_creds); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_exit_creds); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_override_creds); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_revert_creds); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_mutex_lock_starttime); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_rtmutex_lock_starttime); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_rwsem_lock_starttime); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_pcpu_rwsem_starttime); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_nx); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_rw); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_before_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_after_init); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_oom_check_panic); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_mmap_file); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_file_open); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_bpf_syscall); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf_pr_cont); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks_dn); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_meminfo_proc_show); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_exit_mm); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_slowpath); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_mem); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_print_slabinfo_header); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_shrink_slab); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cache_show); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_report_bug); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_watchdog_timer_softlockup); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo_logging); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo_unfrozen); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_die_kernel_fault); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sea); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_mem_abort); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sp_pc_abort); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_undefinstr); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_ptrauth_fault); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_panic_unhandled); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_arm64_serror_panic); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_serror); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_vmpressure); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sha256); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_expandkey); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_encrypt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_decrypt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_request_freq); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_target_freq); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_register); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_unregister); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_disable_thermal_cooling_stats); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_enable_thermal_power_throttle); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_throttle_update); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_finished); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_rwsem_list_add); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_power_cap); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_tk_based_time_sync); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kswapd_per_node); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_vendor_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_ep_action); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_synctype); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_suspend); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_connect); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_audio_usb_offload_disconnect); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_atomic_remove_fb); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drm_atomic_check_modeset); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_get_thermal_zone_device); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_psci_tos_resident_on); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_psci_cpu_suspend); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_usb_new_device_added); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_regmap_update); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_mutex_list_add); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_unlock_slowpath); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dma_buf_release); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dmabuf_heap_flags_validation); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pass_input_event); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_check_status); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmap_region); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_unmap_one); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_node_memcgs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sdio_pm_flag_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_scan_type); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_swappiness); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_partial_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_cache_card_properties); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_print_transaction_info); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_handle_tlb_conf); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_calc_decayed_watermark); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_watermark); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_reset); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_mq_rw_recovery); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sd_update_bus_speed_mode); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_attach_sd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sdhci_get_cd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_gpio_cd_irqt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_update_partition_status); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sd_update_cmdline_timing); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sd_update_dataline_timing); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_partition_status); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_sd_cmdline_timing); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_sd_dataline_timing); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cfg80211_set_context); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cfg80211_get_context); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_track_hash); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_id_remove); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_offline); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_online); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_free); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_alloc); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kmalloc_slab); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpuset_fork); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_looper_state_registered); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_thread_read); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_free_proc); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_thread_release); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_has_work_ilocked); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_read_done); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_v2_resume); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_exit_signal); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_alloc_new_buf_locked); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_reply); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_trans); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_preset); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_memcg_scan_type); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_thermal_stats); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_new_ref); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_del_ref); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_mapcount_pages); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_traversal_lruvec); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_page_mapcount); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_page_to_lrulist); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_del_page_from_lrulist); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_should_be_protected); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mark_page_accessed); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_ffu_update_cid); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_uid); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_free_user); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_cpu_get_power); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_cache_forced_ra); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reclaim_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_failure_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_acct_update_power); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rmqueue); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_inactive_ratio); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_hibernation_swap); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_cpu_resume); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_hib_resume_bdev); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_dma_buf_stats_teardown); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_madvise_cold_or_pageout); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_retry); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_encrypt_page); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_init_aes_encrypt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_skip_swap_map_write); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_post_image_save); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dm_update_clone_bio); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ctl_dirty_rate); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_direct_io_update_bio); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_loop_prepare_cmd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psi_event); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psi_group); -/* - * For type visibility - */ -const struct readahead_control *GKI_struct_readahead_control; -EXPORT_SYMBOL_GPL(GKI_struct_readahead_control); diff --git a/fs/file.c b/fs/file.c index 
73b85f676357..be0792c0a231 100644 --- a/fs/file.c +++ b/fs/file.c @@ -656,37 +656,6 @@ int __close_fd(struct files_struct *files, unsigned fd) return -EBADF; } -/* - * variant of close_fd that gets a ref on the file for later fput. - * The caller must ensure that filp_close() called on the file, and then - * an fput(). - */ -int close_fd_get_file(unsigned int fd, struct file **res) -{ - struct files_struct *files = current->files; - struct file *file; - struct fdtable *fdt; - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (fd >= fdt->max_fds) - goto out_unlock; - file = fdt->fd[fd]; - if (!file) - goto out_unlock; - rcu_assign_pointer(fdt->fd[fd], NULL); - __put_unused_fd(files, fd); - spin_unlock(&files->file_lock); - get_file(file); - *res = file; - return 0; - -out_unlock: - spin_unlock(&files->file_lock); - *res = NULL; - return -ENOENT; -} - void do_close_on_exec(struct files_struct *files) { unsigned i; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index b01be50dbb24..442b54a14cbc 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -120,7 +120,6 @@ extern void __fd_install(struct files_struct *files, unsigned int fd, struct file *file); extern int __close_fd(struct files_struct *files, unsigned int fd); -extern int close_fd_get_file(unsigned int fd, struct file **res); extern struct kmem_cache *files_cachep; diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a548961be39..d2d7208b2274 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,28 +32,6 @@ struct user_struct; struct writeback_control; struct bdi_writeback; -/** - * mmgrab() - Pin a &struct mm_struct. - * @mm: The &struct mm_struct to pin. - * - * Make sure that @mm will not get freed even after the owning task - * exits. This doesn't guarantee that the associated address space - * will still exist later on and mmget_not_zero() has to be used before - * accessing it. 
- * - * This is a preferred way to pin @mm for a longer/unbounded amount - * of time. - * - * Use mmdrop() to release the reference acquired by mmgrab(). - * - * See also for an in-depth explanation - * of &mm_struct.mm_count vs &mm_struct.mm_users. - */ -static inline void mmgrab(struct mm_struct *mm) -{ - atomic_inc(&mm->mm_count); -} - #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index d75248d81499..e305b66a9fb9 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -139,20 +139,6 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); -#define DEFINE_SHOW_ATTRIBUTE(__name) \ -static int __name ## _open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, __name ## _show, inode->i_private); \ -} \ - \ -static const struct file_operations __name ## _fops = { \ - .owner = THIS_MODULE, \ - .open = __name ## _open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -} - static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index dab24c19c82a..3558b58da3e4 100755 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -1,4 +1,3 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * Copyright (C) 2008 Google, Inc. * @@ -67,7 +66,6 @@ enum flat_binder_object_flags { * @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds. 
*/ FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, - /** * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy * @@ -89,6 +87,7 @@ enum flat_binder_object_flags { * scheduling policy from the caller (for synchronous transactions). */ FLAT_BINDER_FLAG_INHERIT_RT = 0x800, +#ifdef __KERNEL__ /** * @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts @@ -97,6 +96,7 @@ enum flat_binder_object_flags { * context */ FLAT_BINDER_FLAG_TXN_SECURITY_CTX = 0x1000, +#endif /* __KERNEL__ */ }; #ifdef BINDER_IPC_32BIT @@ -265,25 +265,6 @@ struct binder_node_info_for_ref { __u32 reserved3; }; -struct binder_freeze_info { - __u32 pid; - __u32 enable; - __u32 timeout_ms; -}; - -struct binder_frozen_status_info { - __u32 pid; - - /* process received sync transactions since last frozen - * bit 0: received sync transaction after being frozen - * bit 1: new pending sync transaction during freezing - */ - __u32 sync_recv; - - /* process received async transactions since last frozen */ - __u32 async_recv; -}; - #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -294,9 +275,6 @@ struct binder_frozen_status_info { #define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) #define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref) #define BINDER_SET_CONTEXT_MGR_EXT _IOW('b', 13, struct flat_binder_object) -#define BINDER_FREEZE _IOW('b', 14, struct binder_freeze_info) -#define BINDER_GET_FROZEN_INFO _IOWR('b', 15, struct binder_frozen_status_info) -#define BINDER_ENABLE_ONEWAY_SPAM_DETECTION _IOW('b', 16, __u32) /* * NOTE: Two special error codes you should check for when calling @@ -319,7 +297,6 @@ enum transaction_flags { TF_STATUS_CODE = 0x08, /* contents are a 32-bit status code */ TF_ACCEPT_FDS = 0x10, /* allow replies with file descriptors */ TF_CLEAR_BUF = 0x20, /* clear buffer on txn complete */ - 
TF_UPDATE_TXN = 0x40, /* update the outdated pending async txn */ }; struct binder_transaction_data { @@ -357,11 +334,13 @@ struct binder_transaction_data { } data; }; +#ifdef __KERNEL__ struct binder_transaction_data_secctx { struct binder_transaction_data transaction_data; binder_uintptr_t secctx; }; +#endif /* __KERNEL__ */ struct binder_transaction_data_sg { struct binder_transaction_data transaction_data; binder_size_t buffers_size; @@ -397,11 +376,13 @@ enum binder_driver_return_protocol { BR_OK = _IO('r', 1), /* No parameters! */ +#ifdef __KERNEL__ BR_TRANSACTION_SEC_CTX = _IOR('r', 2, struct binder_transaction_data_secctx), /* * binder_transaction_data_secctx: the received command. */ +#endif /* __KERNEL__ */ BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data), BR_REPLY = _IOR('r', 3, struct binder_transaction_data), /* @@ -476,22 +457,9 @@ enum binder_driver_return_protocol { BR_FAILED_REPLY = _IO('r', 17), /* - * The last transaction (either a bcTRANSACTION or + * The the last transaction (either a bcTRANSACTION or * a bcATTEMPT_ACQUIRE) failed (e.g. out of memory). No parameters. */ - - BR_FROZEN_REPLY = _IO('r', 18), - /* - * The target of the last transaction (either a bcTRANSACTION or - * a bcATTEMPT_ACQUIRE) is frozen. No parameters. - */ - - BR_ONEWAY_SPAM_SUSPECT = _IO('r', 19), - /* - * Current process sent too many oneway calls to target, and the last - * asynchronous transaction makes the allocated async buffer size exceed - * detection threshold. No parameters. - */ }; enum binder_driver_command_protocol { @@ -578,3 +546,4 @@ enum binder_driver_command_protocol { }; #endif /* _UAPI_LINUX_BINDER_H */ + diff --git a/include/uapi/linux/android/binderfs.h b/include/uapi/linux/android/binderfs.h deleted file mode 100644 index 87410477aea9..000000000000 --- a/include/uapi/linux/android/binderfs.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2018 Canonical Ltd. 
- * - */ - -#ifndef _UAPI_LINUX_BINDERFS_H -#define _UAPI_LINUX_BINDERFS_H - -#include -#include -#include - -#define BINDERFS_MAX_NAME 255 - -/** - * struct binderfs_device - retrieve information about a new binder device - * @name: the name to use for the new binderfs binder device - * @major: major number allocated for binderfs binder devices - * @minor: minor number allocated for the new binderfs binder device - * - */ -struct binderfs_device { - char name[BINDERFS_MAX_NAME + 1]; - __u32 major; - __u32 minor; -}; - -/** - * Allocate a new binder device. - */ -#define BINDER_CTL_ADD _IOWR('b', 1, struct binderfs_device) - -#endif /* _UAPI_LINUX_BINDERFS_H */ - From 5dc53b260a1c9bcab6143d9484708cb947ff44b2 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:56:58 +0800 Subject: [PATCH 07/59] Revert "kernel: Backported cgroup freezer v2" This reverts commit 4e6ba9b5ff2cf3b049d780c4541d316ff811b09a. --- Documentation/cgroup-v1/rdma.txt | 109 - Documentation/cgroup-v2.txt | 371 +- Documentation/kernel-parameters.txt | 8 +- fs/internal.h | 1 - fs/kernfs/file.c | 2 +- fs/kernfs/mount.c | 11 +- include/linux/cgroup-defs.h | 184 +- include/linux/cgroup.h | 96 +- include/linux/cgroup_rdma.h | 53 - include/linux/cgroup_subsys.h | 4 - include/linux/cpuset.h | 4 +- include/linux/cred.h | 12 - include/linux/kernfs.h | 2 +- include/linux/sched.h | 4 - include/linux/sched/deadline.h | 8 +- include/linux/sched/prio.h | 6 +- include/linux/sched/rt.h | 10 +- include/linux/sched/sysctl.h | 10 +- init/Kconfig | 17 +- kernel/Makefile | 5 +- kernel/{cgroup => }/cgroup.c | 3503 ++++++++++------- kernel/cgroup/Makefile | 7 - kernel/cgroup/cgroup-internal.h | 242 -- kernel/cgroup/cgroup-v1.c | 1314 ------- kernel/cgroup/debug.c | 382 -- kernel/cgroup/freezer.c | 315 -- kernel/cgroup/namespace.c | 155 - kernel/cgroup/rdma.c | 619 --- .../legacy_freezer.c => cgroup_freezer.c} | 6 +- kernel/{cgroup/pids.c => cgroup_pids.c} | 5 +- 
kernel/{cgroup => }/cpuset.c | 77 +- kernel/cred.c | 2 +- kernel/events/core.c | 7 - kernel/exit.c | 1 - kernel/fork.c | 2 - kernel/sched/core.c | 4 +- kernel/signal.c | 79 +- mm/memcontrol.c | 2 +- net/core/netclassid_cgroup.c | 2 +- tools/perf/util/cgroup.c | 26 +- 40 files changed, 2221 insertions(+), 5446 deletions(-) delete mode 100644 Documentation/cgroup-v1/rdma.txt delete mode 100644 include/linux/cgroup_rdma.h rename kernel/{cgroup => }/cgroup.c (70%) delete mode 100644 kernel/cgroup/Makefile delete mode 100644 kernel/cgroup/cgroup-internal.h delete mode 100644 kernel/cgroup/cgroup-v1.c delete mode 100644 kernel/cgroup/debug.c delete mode 100644 kernel/cgroup/freezer.c delete mode 100644 kernel/cgroup/namespace.c delete mode 100644 kernel/cgroup/rdma.c rename kernel/{cgroup/legacy_freezer.c => cgroup_freezer.c} (99%) rename kernel/{cgroup/pids.c => cgroup_pids.c} (98%) rename kernel/{cgroup => }/cpuset.c (97%) diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt deleted file mode 100644 index af618171e0eb..000000000000 --- a/Documentation/cgroup-v1/rdma.txt +++ /dev/null @@ -1,109 +0,0 @@ - RDMA Controller - ---------------- - -Contents --------- - -1. Overview - 1-1. What is RDMA controller? - 1-2. Why RDMA controller needed? - 1-3. How is RDMA controller implemented? -2. Usage Examples - -1. Overview - -1-1. What is RDMA controller? ------------------------------ - -RDMA controller allows user to limit RDMA/IB specific resources that a given -set of processes can use. These processes are grouped using RDMA controller. - -RDMA controller defines two resources which can be limited for processes of a -cgroup. - -1-2. Why RDMA controller needed? --------------------------------- - -Currently user space applications can easily take away all the rdma verb -specific resources such as AH, CQ, QP, MR etc. Due to which other applications -in other cgroup or kernel space ULPs may not even get chance to allocate any -rdma resources. 
This can leads to service unavailability. - -Therefore RDMA controller is needed through which resource consumption -of processes can be limited. Through this controller different rdma -resources can be accounted. - -1-3. How is RDMA controller implemented? ----------------------------------------- - -RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains -resource accounting per cgroup, per device using resource pool structure. -Each such resource pool is limited up to 64 resources in given resource pool -by rdma cgroup, which can be extended later if required. - -This resource pool object is linked to the cgroup css. Typically there -are 0 to 4 resource pool instances per cgroup, per device in most use cases. -But nothing limits to have it more. At present hundreds of RDMA devices per -single cgroup may not be handled optimally, however there is no -known use case or requirement for such configuration either. - -Since RDMA resources can be allocated from any process and can be freed by any -of the child processes which shares the address space, rdma resources are -always owned by the creator cgroup css. This allows process migration from one -to other cgroup without major complexity of transferring resource ownership; -because such ownership is not really present due to shared nature of -rdma resources. Linking resources around css also ensures that cgroups can be -deleted after processes migrated. This allow progress migration as well with -active resources, even though that is not a primary use case. - -Whenever RDMA resource charging occurs, owner rdma cgroup is returned to -the caller. Same rdma cgroup should be passed while uncharging the resource. -This also allows process migrated with active RDMA resource to charge -to new owner cgroup for new resource. It also allows to uncharge resource of -a process from previously charged cgroup which is migrated to new cgroup, -even though that is not a primary use case. 
- -Resource pool object is created in following situations. -(a) User sets the limit and no previous resource pool exist for the device -of interest for the cgroup. -(b) No resource limits were configured, but IB/RDMA stack tries to -charge the resource. So that it correctly uncharge them when applications are -running without limits and later on when limits are enforced during uncharging, -otherwise usage count will drop to negative. - -Resource pool is destroyed if all the resource limits are set to max and -it is the last resource getting deallocated. - -User should set all the limit to max value if it intents to remove/unconfigure -the resource pool for a particular device. - -IB stack honors limits enforced by the rdma controller. When application -query about maximum resource limits of IB device, it returns minimum of -what is configured by user for a given cgroup and what is supported by -IB device. - -Following resources can be accounted by rdma controller. - hca_handle Maximum number of HCA Handles - hca_object Maximum number of HCA Objects - -2. Usage Examples ------------------ - -(a) Configure resource limit: -echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max -echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max - -(b) Query resource limit: -cat /sys/fs/cgroup/rdma/2/rdma.max -#Output: -mlx4_0 hca_handle=2 hca_object=2000 -ocrdma1 hca_handle=3 hca_object=max - -(c) Query current usage: -cat /sys/fs/cgroup/rdma/2/rdma.current -#Output: -mlx4_0 hca_handle=1 hca_object=20 -ocrdma1 hca_handle=1 hca_object=23 - -(d) Delete resource limit: -echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e4b6bf4de837..73950fdea31a 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -16,9 +16,7 @@ CONTENTS 1-2. What is cgroup? 2. Basic Operations 2-1. Mounting - 2-2. Organizing Processes and Threads - 2-2-1. 
Processes - 2-2-2. Threads + 2-2. Organizing Processes 2-3. [Un]populated Notification 2-4. Controlling Controllers 2-4-1. Enabling and Disabling @@ -49,12 +47,6 @@ CONTENTS 5-3. IO 5-3-1. IO Interface Files 5-3-2. Writeback - 5-4. PID - 5-4-1. PID Interface Files - 5-5. Misc - 5-5-1. perf_event - 5-6. RDMA - 5-6-1. RDMA Interface Files 6. Namespace 6-1. Basics 6-2. The Root and Views @@ -151,20 +143,8 @@ during boot, before manual intervention is possible. To make testing and experimenting easier, the kernel parameter cgroup_no_v1= allows disabling controllers in v1 and make them always available in v2. -cgroup v2 currently supports the following mount options. - nsdelegate - - Consider cgroup namespaces as delegation boundaries. This - option is system wide and can only be set on mount or modified - through remount from the init namespace. The mount option is - ignored on non-init namespace mounts. Please refer to the - Delegation section for details. - - -2-2. Organizing Processes and Threads - -2-2-1. Processes +2-2. Organizing Processes Initially, only the root cgroup exists to which all processes belong. A child cgroup can be created by creating a sub-directory. @@ -215,104 +195,6 @@ is removed subsequently, " (deleted)" is appended to the path. 0::/test-cgroup/test-cgroup-nested (deleted) -2-2-2. Threads - -cgroup v2 supports thread granularity for a subset of controllers to -support use cases requiring hierarchical resource distribution across -the threads of a group of processes. By default, all threads of a -process belong to the same cgroup, which also serves as the resource -domain to host resource consumptions which are not specific to a -process or thread. The thread mode allows threads to be spread across -a subtree while still maintaining the common resource domain for them. - -Controllers which support thread mode are called threaded controllers. -The ones which don't are called domain controllers. 
- -Marking a cgroup threaded makes it join the resource domain of its -parent as a threaded cgroup. The parent may be another threaded -cgroup whose resource domain is further up in the hierarchy. The root -of a threaded subtree, that is, the nearest ancestor which is not -threaded, is called threaded domain or thread root interchangeably and -serves as the resource domain for the entire subtree. - -Inside a threaded subtree, threads of a process can be put in -different cgroups and are not subject to the no internal process -constraint - threaded controllers can be enabled on non-leaf cgroups -whether they have threads in them or not. - -As the threaded domain cgroup hosts all the domain resource -consumptions of the subtree, it is considered to have internal -resource consumptions whether there are processes in it or not and -can't have populated child cgroups which aren't threaded. Because the -root cgroup is not subject to no internal process constraint, it can -serve both as a threaded domain and a parent to domain cgroups. - -The current operation mode or type of the cgroup is shown in the -"cgroup.type" file which indicates whether the cgroup is a normal -domain, a domain which is serving as the domain of a threaded subtree, -or a threaded cgroup. - -On creation, a cgroup is always a domain cgroup and can be made -threaded by writing "threaded" to the "cgroup.type" file. The -operation is single direction:: - - # echo threaded > cgroup.type - -Once threaded, the cgroup can't be made a domain again. To enable the -thread mode, the following conditions must be met. - -- As the cgroup will join the parent's resource domain. The parent - must either be a valid (threaded) domain or a threaded cgroup. - -- When the parent is an unthreaded domain, it must not have any domain - controllers enabled or populated domain children. The root is - exempt from this requirement. - -Topology-wise, a cgroup can be in an invalid state. 
Please consider -the following toplogy:: - - A (threaded domain) - B (threaded) - C (domain, just created) - -C is created as a domain but isn't connected to a parent which can -host child domains. C can't be used until it is turned into a -threaded cgroup. "cgroup.type" file will report "domain (invalid)" in -these cases. Operations which fail due to invalid topology use -EOPNOTSUPP as the errno. - -A domain cgroup is turned into a threaded domain when one of its child -cgroup becomes threaded or threaded controllers are enabled in the -"cgroup.subtree_control" file while there are processes in the cgroup. -A threaded domain reverts to a normal domain when the conditions -clear. - -When read, "cgroup.threads" contains the list of the thread IDs of all -threads in the cgroup. Except that the operations are per-thread -instead of per-process, "cgroup.threads" has the same format and -behaves the same way as "cgroup.procs". While "cgroup.threads" can be -written to in any cgroup, as it can only move threads inside the same -threaded domain, its operations are confined inside each threaded -subtree. - -The threaded domain cgroup serves as the resource domain for the whole -subtree, and, while the threads can be scattered across the subtree, -all the processes are considered to be in the threaded domain cgroup. -"cgroup.procs" in a threaded domain cgroup contains the PIDs of all -processes in the subtree and is not readable in the subtree proper. -However, "cgroup.procs" can be written to from anywhere in the subtree -to migrate all threads of the matching process to the cgroup. - -Only threaded controllers can be enabled in a threaded subtree. When -a threaded controller is enabled inside a threaded subtree, it only -accounts for and controls resource consumptions associated with the -threads in the cgroup and its descendants. All consumptions which -aren't tied to a specific thread belong to the threaded domain cgroup. 
- -Because a threaded subtree is exempt from no internal process -constraint, a threaded controller must be able to handle competition -between threads in a non-leaf cgroup and its child cgroups. Each -threaded controller defines how such competitions are handled. - - 2-3. [Un]populated Notification Each non-root cgroup has a "cgroup.events" file which contains @@ -391,15 +273,15 @@ disabled if one or more children have it enabled. 2-4-3. No Internal Process Constraint -Non-root cgroups can distribute domain resources to their children -only when they don't have any processes of their own. In other words, -only domain cgroups which don't contain any processes can have domain -controllers enabled in their "cgroup.subtree_control" files. +Non-root cgroups can only distribute resources to their children when +they don't have any processes of their own. In other words, only +cgroups which don't contain any processes can have controllers enabled +in their "cgroup.subtree_control" files. -This guarantees that, when a domain controller is looking at the part -of the hierarchy which has it enabled, processes are always only on -the leaves. This rules out situations where child cgroups compete -against internal processes of the parent. +This guarantees that, when a controller is looking at the part of the +hierarchy which has it enabled, processes are always only on the +leaves. This rules out situations where child cgroups compete against +internal processes of the parent. The root cgroup is exempt from this restriction. Root contains processes and anonymous resource consumption which can't be associated @@ -420,27 +302,18 @@ file. 2-5-1. Model of Delegation -A cgroup can be delegated in two ways. First, to a less privileged -user by granting write access of the directory and its "cgroup.procs", -"cgroup.threads" and "cgroup.subtree_control" files to the user. -Second, if the "nsdelegate" mount option is set, automatically to a -cgroup namespace on namespace creation. 
- -Because the resource control interface files in a given directory -control the distribution of the parent's resources, the delegatee -shouldn't be allowed to write to them. For the first method, this is -achieved by not granting access to these files. For the second, the -kernel rejects writes to all files other than "cgroup.procs" and -"cgroup.subtree_control" on a namespace root from inside the -namespace. - -The end results are equivalent for both delegation types. Once -delegated, the user can build sub-hierarchy under the directory, -organize processes inside it as it sees fit and further distribute the -resources it received from the parent. The limits and other settings -of all resource controllers are hierarchical and regardless of what -happens in the delegated sub-hierarchy, nothing can escape the -resource restrictions imposed by the parent. +A cgroup can be delegated to a less privileged user by granting write +access of the directory and its "cgroup.procs" file to the user. Note +that resource control interface files in a given directory control the +distribution of the parent's resources and thus must not be delegated +along with the directory. + +Once delegated, the user can build sub-hierarchy under the directory, +organize processes as it sees fit and further distribute the resources +it received from the parent. The limits and other settings of all +resource controllers are hierarchical and regardless of what happens +in the delegated sub-hierarchy, nothing can escape the resource +restrictions imposed by the parent. Currently, cgroup doesn't impose any restrictions on the number of cgroups in or nesting depth of a delegated sub-hierarchy; however, @@ -450,19 +323,19 @@ this may be limited explicitly in the future. 2-5-2. Delegation Containment A delegated sub-hierarchy is contained in the sense that processes -can't be moved into or out of the sub-hierarchy by the delegatee. +can't be moved into or out of the sub-hierarchy by the delegatee. 
For +a process with a non-root euid to migrate a target process into a +cgroup by writing its PID to the "cgroup.procs" file, the following +conditions must be met. -For delegations to a less privileged user, this is achieved by -requiring the following conditions for a process with a non-root euid -to migrate a target process into a cgroup by writing its PID to the -"cgroup.procs" file. +- The writer's euid must match either uid or suid of the target process. - The writer must have write access to the "cgroup.procs" file. - The writer must have write access to the "cgroup.procs" file of the common ancestor of the source and destination cgroups. -The above two constraints ensure that while a delegatee may migrate +The above three constraints ensure that while a delegatee may migrate processes around freely in the delegated sub-hierarchy it can't pull in from or push out to outside the sub-hierarchy. @@ -477,15 +350,10 @@ all processes under C0 and C1 belong to U0. Let's also say U0 wants to write the PID of a process which is currently in C10 into "C00/cgroup.procs". U0 has write access to the -file; however, the common ancestor of the source cgroup C10 and the -destination cgroup C00 is above the points of delegation and U0 would -not have write access to its "cgroup.procs" files and thus the write -will be denied with -EACCES. - -For delegations to namespaces, containment is achieved by requiring -that both the source and destination cgroups are reachable from the -namespace of the process which is attempting the migration. If either -is not reachable, the migration is rejected with -ENOENT. +file and uid match on the process; however, the common ancestor of the +source cgroup C10 and the destination cgroup C00 is above the points +of delegation and U0 would not have write access to its "cgroup.procs" +files and thus the write will be denied with -EACCES. 2-6. Guidelines @@ -718,29 +586,6 @@ may be specified in any order and not all pairs have to be specified. 
All cgroup core files are prefixed with "cgroup." - cgroup.type - - A read-write single value file which exists on non-root - cgroups. - - When read, it indicates the current type of the cgroup, which - can be one of the following values. - - - "domain" : A normal valid domain cgroup. - - - "domain threaded" : A threaded domain cgroup which is - serving as the root of a threaded subtree. - - - "domain invalid" : A cgroup which is in an invalid state. - It can't be populated or have controllers enabled. It may - be allowed to become a threaded cgroup. - - - "threaded" : A threaded cgroup which is a member of a - threaded subtree. - - A cgroup can be turned into a threaded cgroup by writing - "threaded" to this file. - cgroup.procs A read-write new-line separated values file which exists on @@ -756,36 +601,10 @@ All cgroup core files are prefixed with "cgroup." the PID to the cgroup. The writer should match all of the following conditions. - - It must have write access to the "cgroup.procs" file. + - Its euid is either root or must match either uid or suid of + the target process. - - It must have write access to the "cgroup.procs" file of the - common ancestor of the source and destination cgroups. - - When delegating a sub-hierarchy, write access to this file - should be granted along with the containing directory. - - In a threaded cgroup, reading this file fails with EOPNOTSUPP - as all the processes belong to the thread root. Writing is - supported and moves every thread of the process to the cgroup. - - cgroup.threads - A read-write new-line separated values file which exists on - all cgroups. - - When read, it lists the TIDs of all threads which belong to - the cgroup one-per-line. The TIDs are not ordered and the - same TID may show up more than once if the thread got moved to - another cgroup and then back or the TID got recycled while - reading. - - A TID can be written to migrate the thread associated with the - TID to the cgroup. 
The writer should match all of the - following conditions. - - - It must have write access to the "cgroup.threads" file. - - - The cgroup that the thread is currently in must be in the - same resource domain as the destination cgroup. + - It must have write access to the "cgroup.procs" file. - It must have write access to the "cgroup.procs" file of the common ancestor of the source and destination cgroups. @@ -829,38 +648,6 @@ All cgroup core files are prefixed with "cgroup." 1 if the cgroup or its descendants contains any live processes; otherwise, 0. - cgroup.max.descendants - A read-write single value files. The default is "max". - - Maximum allowed number of descent cgroups. - If the actual number of descendants is equal or larger, - an attempt to create a new cgroup in the hierarchy will fail. - - cgroup.max.depth - A read-write single value files. The default is "max". - - Maximum allowed descent depth below the current cgroup. - If the actual descent depth is equal or larger, - an attempt to create a new child cgroup will fail. - - cgroup.stat - A read-only flat-keyed file with the following entries: - - nr_descendants - Total number of visible descendant cgroups. - - nr_dying_descendants - Total number of dying descendant cgroups. A cgroup becomes - dying after being deleted by a user. The cgroup will remain - in dying state for some time undefined time (which can depend - on system load) before being completely destroyed. - - A process can't enter a dying cgroup under any circumstances, - a dying cgroup can't revive. - - A dying cgroup can consume system resources not exceeding - limits, which were active at the moment of cgroup deletion. - 5. Controllers @@ -1350,92 +1137,6 @@ writeback as follows. vm.dirty[_background]_ratio. -5-4. PID - -The process number controller is used to allow a cgroup to stop any -new tasks from being fork()'d or clone()'d after a specified limit is -reached. 
- -The number of tasks in a cgroup can be exhausted in ways which other -controllers cannot prevent, thus warranting its own controller. For -example, a fork bomb is likely to exhaust the number of tasks before -hitting memory restrictions. - -Note that PIDs used in this controller refer to TIDs, process IDs as -used by the kernel. - - -5-4-1. PID Interface Files - - pids.max - - A read-write single value file which exists on non-root - cgroups. The default is "max". - - Hard limit of number of processes. - - pids.current - - A read-only single value file which exists on all cgroups. - - The number of processes currently in the cgroup and its - descendants. - -Organisational operations are not blocked by cgroup policies, so it is -possible to have pids.current > pids.max. This can be done by either -setting the limit to be smaller than pids.current, or attaching enough -processes to the cgroup such that pids.current is larger than -pids.max. However, it is not possible to violate a cgroup PID policy -through fork() or clone(). These will return -EAGAIN if the creation -of a new process would cause a cgroup policy to be violated. - - -5-5. Misc - -5-5-1. perf_event - -perf_event controller, if not mounted on a legacy hierarchy, is -automatically enabled on the v2 hierarchy so that perf events can -always be filtered by cgroup v2 path. The controller can still be -moved to a legacy hierarchy after v2 hierarchy is populated. - - -5-6. RDMA - -The "rdma" controller regulates the distribution and accounting of -of RDMA resources. - -5-6-1. RDMA Interface Files - - rdma.max - A readwrite nested-keyed file that exists for all the cgroups - except root that describes current configured resource limit - for a RDMA/IB device. - - Lines are keyed by device name and are not ordered. - Each line contains space separated resource name and its configured - limit that can be distributed. - - The following nested keys are defined. 
- - hca_handle Maximum number of HCA Handles - hca_object Maximum number of HCA Objects - - An example for mlx4 and ocrdma device follows. - - mlx4_0 hca_handle=2 hca_object=2000 - ocrdma1 hca_handle=3 hca_object=max - - rdma.current - A read-only file that describes current resource usage. - It exists for all the cgroup except root. - - An example for mlx4 and ocrdma device follows. - - mlx4_0 hca_handle=1 hca_object=20 - ocrdma1 hca_handle=1 hca_object=23 - - 6. Namespace 6-1. Basics @@ -1623,7 +1324,7 @@ D. Deprecated v1 Core Features - Multiple hierarchies including named ones are not supported. -- All v1 mount options are not supported. +- All mount options and remounting are not supported. - The "tasks" file is removed and "cgroup.procs" is not sorted. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9afba613a5c3..a66de7db0118 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -692,14 +692,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Specifying "pressure" disables per-cgroup pressure stall information accounting feature - cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1 - Format: { { controller | "all" | "named" } - [,{ controller | "all" | "named" }...] } + cgroup_no_v1= [KNL] Disable one, multiple, all cgroup controllers in v1 + Format: { controller[,controller...] | "all" } Like cgroup_disable, but only applies to cgroup v1; the blacklisted controllers remain available in cgroup2. - "all" blacklists all controllers and "named" disables - named mounts. Specifying both "all" and "named" disables - all v1 hierarchies. cgroup.memory= [KNL] Pass options to the cgroup memory controller. 
Format: diff --git a/fs/internal.h b/fs/internal.h index 380bae4c5ff7..3e58863de514 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -68,7 +68,6 @@ extern int finish_automount(struct vfsmount *, struct path *); extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -int path_umount(struct path *path, int flags); extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index b70288a713b3..27358c854203 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -516,7 +516,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) goto out_put; rc = 0; - of->mmapped = true; + of->mmapped = 1; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 1c2ea6ca0381..d5b149a45be1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -123,10 +123,8 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, return dentry; knparent = find_next_ancestor(kn, NULL); - if (WARN_ON(!knparent)) { - dput(dentry); + if (WARN_ON(!knparent)) return ERR_PTR(-EINVAL); - } do { struct dentry *dtmp; @@ -135,11 +133,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, if (kn == knparent) return dentry; kntmp = find_next_ancestor(kn, knparent); - if (WARN_ON(!kntmp)) { - dput(dentry); + if (WARN_ON(!kntmp)) return ERR_PTR(-EINVAL); - } - dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name)); + dtmp = lookup_one_len_unlocked(kntmp->name, dentry, + strlen(kntmp->name)); dput(dentry); if (IS_ERR(dtmp)) return dtmp; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ab429b48f8bd..35a28e4fb2dd 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -63,38 +62,18 @@ enum { * specified at mount time and thus is implemented here. 
*/ CGRP_CPUSET_CLONE_CHILDREN, - - /* Control group has to be frozen. */ - CGRP_FREEZE, - - /* Cgroup is frozen. */ - CGRP_FROZEN, }; /* cgroup_root->flags */ enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ - - /* - * Consider namespaces as delegation boundaries. If this flag is - * set, controller specific interface files in a namespace root - * aren't writeable from inside the namespace. - */ - CGRP_ROOT_NS_DELEGATE = (1 << 3), - - /* - * Enable cpuset controller in v1 cgroup to use v2 behavior. - */ - CGRP_ROOT_CPUSET_V2_MODE = (1 << 4), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ - CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ - CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_PRESSURE = (1 << 6), /* only if pressure feature is enabled */ @@ -131,6 +110,9 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; + /* PI: the parent css */ + struct cgroup_subsys_state *parent; + /* siblings list anchored at the parent's ->children */ struct list_head sibling; struct list_head children; @@ -160,12 +142,6 @@ struct cgroup_subsys_state { /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; - - /* - * PI: the parent css. Placed here for cache proximity to following - * fields of the containing structure. - */ - struct cgroup_subsys_state *parent; }; /* @@ -176,29 +152,14 @@ struct cgroup_subsys_state { * set for a task. */ struct css_set { - /* - * Set of subsystem states, one for each subsystem. 
This array is - * immutable after creation apart from the init_css_set during - * subsystem registration (at boot time). - */ - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; - - /* reference count */ - refcount_t refcount; + /* Reference count */ + atomic_t refcount; /* - * For a domain cgroup, the following points to self. If threaded, - * to the matching cset of the nearest domain ancestor. The - * dom_cset provides access to the domain cgroup and its csses to - * which domain level resource consumptions should be charged. + * List running through all cgroup groups in the same hash + * slot. Protected by css_set_lock */ - struct css_set *dom_cset; - - /* the default cgroup associated with this css_set */ - struct cgroup *dfl_cgrp; - - /* internal task count, protected by css_set_lock */ - int nr_tasks; + struct hlist_node hlist; /* * Lists running through all tasks using this cgroup group. @@ -209,42 +170,28 @@ struct css_set { */ struct list_head tasks; struct list_head mg_tasks; - struct list_head dying_tasks; - - /* all css_task_iters currently walking this cset */ - struct list_head task_iters; /* - * On the default hierarhcy, ->subsys[ssid] may point to a css - * attached to an ancestor instead of the cgroup this css_set is - * associated with. The following node is anchored at - * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to - * iterate through all css's attached to a given cgroup. + * List of cgrp_cset_links pointing at cgroups referenced from this + * css_set. Protected by css_set_lock. */ - struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; - - /* all threaded csets whose ->dom_cset points to this cset */ - struct list_head threaded_csets; - struct list_head threaded_csets_node; + struct list_head cgrp_links; - /* - * List running through all cgroup groups in the same hash - * slot. 
Protected by css_set_lock - */ - struct hlist_node hlist; + /* the default cgroup associated with this css_set */ + struct cgroup *dfl_cgrp; /* - * List of cgrp_cset_links pointing at cgroups referenced from this - * css_set. Protected by css_set_lock. + * Set of subsystem states, one for each subsystem. This array is + * immutable after creation apart from the init_css_set during + * subsystem registration (at boot time). */ - struct list_head cgrp_links; + struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; /* * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. */ - struct list_head mg_src_preload_node; - struct list_head mg_dst_preload_node; + struct list_head mg_preload_node; struct list_head mg_node; /* @@ -258,6 +205,18 @@ struct css_set { struct cgroup *mg_dst_cgrp; struct css_set *mg_dst_cset; + /* + * On the default hierarhcy, ->subsys[ssid] may point to a css + * attached to an ancestor instead of the cgroup this css_set is + * associated with. The following node is anchored at + * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to + * iterate through all css's attached to a given cgroup. + */ + struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + + /* all css_task_iters currently walking this cset */ + struct list_head task_iters; + /* dead and being drained, ignore for migration */ bool dead; @@ -265,25 +224,6 @@ struct css_set { struct rcu_head rcu_head; }; -struct cgroup_freezer_state { - /* Should the cgroup and its descendants be frozen. */ - bool freeze; - - /* Should the cgroup actually be frozen? */ - int e_freeze; - - /* Fields below are protected by css_set_lock */ - - /* Number of frozen descendant cgroups */ - int nr_frozen_descendants; - - /* - * Number of tasks, which are counted as frozen: - * frozen, SIGSTOPped, and PTRACEd. 
- */ - int nr_frozen_tasks; -}; - struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; @@ -308,40 +248,13 @@ struct cgroup { */ int level; - /* Maximum allowed descent tree depth */ - int max_depth; - - /* - * Keep track of total numbers of visible and dying descent cgroups. - * Dying cgroups are cgroups which were deleted by a user, - * but are still existing because someone else is holding a reference. - * max_descendants is a maximum allowed number of descent cgroups. - * - * nr_descendants and nr_dying_descendants are protected - * by cgroup_mutex and css_set_lock. It's fine to read them holding - * any of cgroup_mutex and css_set_lock; for writing both locks - * should be held. - */ - int nr_descendants; - int nr_dying_descendants; - int max_descendants; - /* * Each non-empty css_set associated with this cgroup contributes - * one to nr_populated_csets. The counter is zero iff this cgroup - * doesn't have any tasks. - * - * All children which have non-zero nr_populated_csets and/or - * nr_populated_children of their own contribute one to either - * nr_populated_domain_children or nr_populated_threaded_children - * depending on their type. Each counter is zero iff all cgroups - * of the type in the subtree proper don't have any tasks. + * one to populated_cnt. All children with non-zero popuplated_cnt + * of their own contribute one. The count is zero iff there's no + * task in this cgroup or its subtree. */ - int nr_populated_csets; - int nr_populated_domain_children; - int nr_populated_threaded_children; - - int nr_threaded_children; /* # of live threaded child cgroups */ + int populated_cnt; struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ @@ -379,16 +292,6 @@ struct cgroup { */ struct list_head e_csets[CGROUP_SUBSYS_COUNT]; - /* - * If !threaded, self. If threaded, it points to the nearest - * domain ancestor. 
Inside a threaded subtree, cgroups are exempt - * from process granularity and no-internal-task constraint. - * Domain level resource consumptions which aren't tied to a - * specific task are charged to the dom_cgrp. - */ - struct cgroup *dom_cgrp; - struct cgroup *old_dom_cgrp; /* used while enabling threaded */ - /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. @@ -408,9 +311,6 @@ struct cgroup { /* used to store eBPF programs */ struct cgroup_bpf bpf; - /* Used to store internal freezer state */ - struct cgroup_freezer_state freezer; - /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; @@ -548,7 +448,7 @@ struct cftype { /* * Control Group subsystem type. - * See Documentation/cgroup-v1/cgroups.txt for details + * See Documentation/cgroups/cgroups.txt for details */ struct cgroup_subsys { struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); @@ -567,7 +467,7 @@ struct cgroup_subsys { void (*cancel_fork)(struct task_struct *task); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); - void (*release)(struct task_struct *task); + void (*free)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); bool early_init:1; @@ -585,18 +485,6 @@ struct cgroup_subsys { */ bool implicit_on_dfl:1; - /* - * If %true, the controller, supports threaded mode on the default - * hierarchy. In a threaded subtree, both process granularity and - * no-internal-process constraint are ignored and a threaded - * controllers should be able to handle that. - * - * Note that as an implicit controller is automatically enabled on - * all cgroups on the default hierarchy, it should also be - * threaded. implicit && !threaded is not supported. 
- */ - bool threaded:1; - /* * If %false, this subsystem is properly hierarchical - * configuration, resource accounting and restriction on a parent diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 55a878aebe21..4e93ff0e45ba 100755 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -17,11 +17,11 @@ #include #include #include +#include #include #include #include #include -#include #include @@ -36,33 +36,18 @@ #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 -/* walk only threadgroup leaders */ -#define CSS_TASK_ITER_PROCS (1U << 0) -/* walk all threaded css_sets in the domain */ -#define CSS_TASK_ITER_THREADED (1U << 1) - -/* internal flags */ -#define CSS_TASK_ITER_SKIPPED (1U << 16) - /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; - unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; - struct list_head *tcset_pos; - struct list_head *tcset_head; - struct list_head *task_pos; struct list_head *tasks_head; struct list_head *mg_tasks_head; - struct list_head *dying_tasks_head; - struct list_head *cur_tasks_head; struct css_set *cur_cset; - struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; @@ -122,7 +107,6 @@ extern int cgroup_can_fork(struct task_struct *p); extern void cgroup_cancel_fork(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); void cgroup_exit(struct task_struct *p); -void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); @@ -145,7 +129,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); -void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, +void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it); 
struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); @@ -282,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it); * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css - * @tset: taskset to iterate + * @tset: takset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. @@ -563,27 +547,6 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, return cgrp->ancestor_ids[ancestor->level] == ancestor->id; } -/** - * cgroup_ancestor - find ancestor of cgroup - * @cgrp: cgroup to find ancestor of - * @ancestor_level: level of ancestor to find starting from root - * - * Find ancestor of cgroup at specified level starting from root if it exists - * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at - * @ancestor_level. - * - * This function is safe to call as long as @cgrp is accessible. 
- */ -static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, - int ancestor_level) -{ - if (cgrp->level < ancestor_level) - return NULL; - while (cgrp && cgrp->level > ancestor_level) - cgrp = cgroup_parent(cgrp); - return cgrp; -} - /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested @@ -604,8 +567,7 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + - cgrp->nr_populated_threaded_children; + return cgrp->populated_cnt; } /* returns ino associated with a cgroup */ @@ -709,7 +671,6 @@ static inline int cgroup_can_fork(struct task_struct *p) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p) {} static inline void cgroup_post_fork(struct task_struct *p) {} static inline void cgroup_exit(struct task_struct *p) {} -static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } @@ -788,7 +749,7 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { - refcount_t count; + atomic_t count; struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; @@ -823,56 +784,13 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) - refcount_inc(&ns->count); + atomic_inc(&ns->count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->count)) + if (ns && atomic_dec_and_test(&ns->count)) free_cgroup_ns(ns); } -#ifdef CONFIG_CGROUPS - -void cgroup_enter_frozen(void); -void cgroup_leave_frozen(bool always_leave); -void 
cgroup_update_frozen(struct cgroup *cgrp); -void cgroup_freeze(struct cgroup *cgrp, bool freeze); -void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, - struct cgroup *dst); -void cgroup_freezer_frozen_exit(struct task_struct *task); -static inline bool cgroup_task_freeze(struct task_struct *task) -{ - bool ret; - - if (task->flags & PF_KTHREAD) - return false; - - rcu_read_lock(); - ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags); - rcu_read_unlock(); - - return ret; -} - -static inline bool cgroup_task_frozen(struct task_struct *task) -{ - return task->frozen; -} - -#else /* !CONFIG_CGROUPS */ - -static inline void cgroup_enter_frozen(void) { } -static inline void cgroup_leave_frozen(bool always_leave) { } -static inline bool cgroup_task_freeze(struct task_struct *task) -{ - return false; -} -static inline bool cgroup_task_frozen(struct task_struct *task) -{ - return false; -} - -#endif /* !CONFIG_CGROUPS */ - #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h deleted file mode 100644 index e94290b29e99..000000000000 --- a/include/linux/cgroup_rdma.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (C) 2016 Parav Pandit - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. - */ - -#ifndef _CGROUP_RDMA_H -#define _CGROUP_RDMA_H - -#include - -enum rdmacg_resource_type { - RDMACG_RESOURCE_HCA_HANDLE, - RDMACG_RESOURCE_HCA_OBJECT, - RDMACG_RESOURCE_MAX, -}; - -#ifdef CONFIG_CGROUP_RDMA - -struct rdma_cgroup { - struct cgroup_subsys_state css; - - /* - * head to keep track of all resource pools - * that belongs to this cgroup. 
- */ - struct list_head rpools; -}; - -struct rdmacg_device { - struct list_head dev_node; - struct list_head rpools; - char *name; -}; - -/* - * APIs for RDMA/IB stack to publish when a device wants to - * participate in resource accounting - */ -int rdmacg_register_device(struct rdmacg_device *device); -void rdmacg_unregister_device(struct rdmacg_device *device); - -/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */ -int rdmacg_try_charge(struct rdma_cgroup **rdmacg, - struct rdmacg_device *device, - enum rdmacg_resource_type index); -void rdmacg_uncharge(struct rdma_cgroup *cg, - struct rdmacg_device *device, - enum rdmacg_resource_type index); -#endif /* CONFIG_CGROUP_RDMA */ -#endif /* _CGROUP_RDMA_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ff4cad3a2275..7f4a2a5a2a77 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -60,10 +60,6 @@ SUBSYS(hugetlb) SUBSYS(pids) #endif -#if IS_ENABLED(CONFIG_CGROUP_RDMA) -SUBSYS(rdma) -#endif - /* * The following subsystems are not supported on the default hierarchy. 
*/ diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 3cfe2d27811b..d807fa9b2051 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -56,7 +56,7 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); -extern void cpuset_update_active_cpus(void); +extern void cpuset_update_active_cpus(bool cpu_online); extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); @@ -172,7 +172,7 @@ static inline void cpuset_init_smp(void) {} static inline void cpuset_force_rebuild(void) { } -static inline void cpuset_update_active_cpus(void) +static inline void cpuset_update_active_cpus(bool cpu_online) { partition_sched_domains(1, NULL, NULL); } diff --git a/include/linux/cred.h b/include/linux/cred.h index 796dc4380de0..09debf2e047f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -252,18 +252,6 @@ static inline const struct cred *get_cred(const struct cred *cred) return get_new_cred(nonconst_cred); } -static inline const struct cred *get_cred_rcu(const struct cred *cred) -{ - struct cred *nonconst_cred = (struct cred *) cred; - if (!cred) - return NULL; - if (!atomic_inc_not_zero(&nonconst_cred->usage)) - return NULL; - validate_creds(cred); - nonconst_cred->non_rcu = 0; - return cred; -} - /** * put_cred - Release a reference to a set of credentials * @cred: The credentials to release diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 4df9b50cb1c3..44e529353b6b 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -188,7 +188,7 @@ struct kernfs_open_file { char *prealloc_buf; size_t atomic_write_len; - bool mmapped:1; + bool mmapped; bool released:1; const struct vm_operations_struct *vm_ops; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index d8106413464f..32111634c69b 100755 
--- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1829,8 +1829,6 @@ struct task_struct { #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; - /* task is frozen/stopped (used by the cgroup freezer) */ - unsigned frozen:1; #endif unsigned long atomic_flags; /* Flags needing atomic access. */ @@ -2650,7 +2648,6 @@ TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ -#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -2659,7 +2656,6 @@ TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) #define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) -#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 975be862e083..9089a2ae913d 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -1,7 +1,5 @@ -#ifndef _LINUX_SCHED_DEADLINE_H -#define _LINUX_SCHED_DEADLINE_H - -#include +#ifndef _SCHED_DEADLINE_H +#define _SCHED_DEADLINE_H /* * SCHED_DEADLINE tasks has negative priorities, reflecting @@ -28,4 +26,4 @@ static inline bool dl_time_before(u64 a, u64 b) return (s64)(a - b) < 0; } -#endif /* _LINUX_SCHED_DEADLINE_H */ +#endif /* _SCHED_DEADLINE_H */ diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 2cc450f6ec54..d9cf5a5762d9 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -1,5 
+1,5 @@ -#ifndef _LINUX_SCHED_PRIO_H -#define _LINUX_SCHED_PRIO_H +#ifndef _SCHED_PRIO_H +#define _SCHED_PRIO_H #define MAX_NICE 19 #define MIN_NICE -20 @@ -57,4 +57,4 @@ static inline long rlimit_to_nice(long prio) return (MAX_NICE - prio + 1); } -#endif /* _LINUX_SCHED_PRIO_H */ +#endif /* _SCHED_PRIO_H */ diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 3bd668414f61..a30b172df6e1 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -1,9 +1,7 @@ -#ifndef _LINUX_SCHED_RT_H -#define _LINUX_SCHED_RT_H +#ifndef _SCHED_RT_H +#define _SCHED_RT_H -#include - -struct task_struct; +#include static inline int rt_prio(int prio) { @@ -59,4 +57,4 @@ extern void normalize_rt_tasks(void); */ #define RR_TIMESLICE (100 * HZ / 1000) -#endif /* _LINUX_SCHED_RT_H */ +#endif /* _SCHED_RT_H */ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 07207044b5f4..9ff03d20b986 100755 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -1,9 +1,5 @@ -#ifndef _LINUX_SCHED_SYSCTL_H -#define _LINUX_SCHED_SYSCTL_H - -#include - -struct ctl_table; +#ifndef _SCHED_SYSCTL_H +#define _SCHED_SYSCTL_H #ifdef CONFIG_DETECT_HUNG_TASK extern int sysctl_hung_task_check_count; @@ -152,4 +148,4 @@ extern int sched_little_cluster_coloc_fmin_khz_handler(struct ctl_table *table, extern char sched_lib_name[LIB_PATH_LENGTH]; extern unsigned int sched_lib_mask_force; extern bool is_sched_lib_based_app(pid_t pid); -#endif /* _LINUX_SCHED_SYSCTL_H */ +#endif /* _SCHED_SYSCTL_H */ diff --git a/init/Kconfig b/init/Kconfig index bd93c3f5015a..25fb46dd2b56 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -858,16 +858,6 @@ config CGROUP_PIDS since the PIDs limit only affects a process's ability to fork, not to attach to a cgroup. -config CGROUP_RDMA - bool "RDMA controller" - help - Provides enforcement of RDMA resources defined by IB stack. 
- It is fairly easy for consumers to exhaust RDMA resources, which - can result into resource unavailability to other consumers. - RDMA controller is designed to stop this from happening. - Attaching processes with active RDMA resources to the cgroup - hierarchy is allowed even if can cross the hierarchy's limit. - config CGROUP_FREEZER bool "Freezer controller" help @@ -946,14 +936,11 @@ config CGROUP_BPF inet sockets. config CGROUP_DEBUG - bool "Debug controller" + bool "Example controller" default n - depends on DEBUG_KERNEL help This option enables a simple controller that exports - debugging information about the cgroups framework. This - controller is for control cgroup debugging only. Its - interfaces are not stable. + debugging information about the cgroups framework. Say N. diff --git a/kernel/Makefile b/kernel/Makefile index 9fec7d39f4b0..f3a91fa080bf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -65,7 +65,10 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o -obj-$(CONFIG_CGROUPS) += cgroup/ +obj-$(CONFIG_CGROUPS) += cgroup.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o +obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o +obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup.c similarity index 70% rename from kernel/cgroup/cgroup.c rename to kernel/cgroup.c index 37cdbeb85a92..5c6deb033c96 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup.c @@ -28,13 +28,15 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include "cgroup-internal.h" - +#include #include +#include #include #include #include +#include #include +#include #include #include #include @@ -45,11 +47,16 @@ #include #include #include +#include +#include +#include +#include #include +#include #include +#include /* TODO: replace with 
more sophisticated array */ #include #include -#include #include #include #include @@ -61,6 +68,14 @@ #define CREATE_TRACE_POINTS #include +/* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) @@ -74,12 +89,14 @@ * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ +#ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); - -#ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); +#else +static DEFINE_MUTEX(cgroup_mutex); +static DEFINE_SPINLOCK(css_set_lock); #endif /* @@ -94,6 +111,12 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); + struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ @@ -109,9 +132,15 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem; */ static struct workqueue_struct *cgroup_destroy_wq; +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. 
+ */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, -struct cgroup_subsys *cgroup_subsys[] = { +static struct cgroup_subsys *cgroup_subsys[] = { #include }; #undef SUBSYS @@ -158,17 +187,18 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); */ static bool cgrp_dfl_visible; +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; + /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static u16 cgrp_dfl_implicit_ss_mask; - -/* some controllers can be threaded on the default hierarchy */ -static u16 cgrp_dfl_threaded_ss_mask; +static unsigned long cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ -LIST_HEAD(cgroup_roots); + +static LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ @@ -184,25 +214,29 @@ static DEFINE_IDR(cgroup_hierarchy_idr); static u64 css_serial_nr_next = 1; /* - * These bitmasks identify subsystems with specific features to avoid - * having to do iterative checks repeatedly. + * These bitmask flags indicate whether tasks in the fork and exit paths have + * fork/exit handlers to call. This avoids us having to do extra work in the + * fork/exit path to check which subsystems have fork/exit callbacks. 
*/ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; -static u16 have_release_callback __read_mostly; -static u16 have_canfork_callback __read_mostly; +static u16 have_free_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = REFCOUNT_INIT(2), + .count = { .counter = 2, }, .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, .root_cset = &init_css_set, }; +/* Ditto for the can_fork callback. */ +static u16 have_canfork_callback __read_mostly; + static struct file_system_type cgroup2_fs_type; -static struct cftype cgroup_base_files[]; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[]; /* cgroup optional features */ enum cgroup_opt_features { @@ -220,10 +254,11 @@ static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { static u16 cgroup_feature_disable_mask __read_mostly; +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_skip(struct css_task_iter *it, - struct task_struct *task); +static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -241,7 +276,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. 
*/ -bool cgroup_ssid_enabled(int ssid) +static bool cgroup_ssid_enabled(int ssid) { if (CGROUP_SUBSYS_COUNT == 0) return false; @@ -249,6 +284,11 @@ bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -302,7 +342,7 @@ bool cgroup_ssid_enabled(int ssid) * * - debug: disallowed on the default hierarchy. */ -bool cgroup_on_dfl(const struct cgroup *cgrp) +static bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } @@ -338,103 +378,14 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static bool cgroup_has_tasks(struct cgroup *cgrp) -{ - return cgrp->nr_populated_csets; -} - -bool cgroup_is_threaded(struct cgroup *cgrp) -{ - return cgrp->dom_cgrp != cgrp; -} - -/* can @cgrp host both domain and threaded children? */ -static bool cgroup_is_mixable(struct cgroup *cgrp) -{ - /* - * Root isn't under domain level resource control exempting it from - * the no-internal-process constraint, so it can serve as a thread - * root and a parent of resource domains at the same time. - */ - return !cgroup_parent(cgrp); -} - -/* can @cgrp become a thread root? should always be true for a thread root */ -static bool cgroup_can_be_thread_root(struct cgroup *cgrp) -{ - /* mixables don't care */ - if (cgroup_is_mixable(cgrp)) - return true; - - /* domain roots can't be nested under threaded */ - if (cgroup_is_threaded(cgrp)) - return false; - - /* can only have either domain or threaded children */ - if (cgrp->nr_populated_domain_children) - return false; - - /* and no domain controllers can be enabled */ - if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) - return false; - - return true; -} - -/* is @cgrp root of a threaded subtree? 
*/ -bool cgroup_is_thread_root(struct cgroup *cgrp) -{ - /* thread root should be a domain */ - if (cgroup_is_threaded(cgrp)) - return false; - - /* a domain w/ threaded children is a thread root */ - if (cgrp->nr_threaded_children) - return true; - - /* - * A domain which has tasks and explicit threaded controllers - * enabled is a thread root. - */ - if (cgroup_has_tasks(cgrp) && - (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) - return true; - - return false; -} - -/* a domain which isn't connected to the root w/o brekage can't be used */ -static bool cgroup_is_valid_domain(struct cgroup *cgrp) -{ - /* the cgroup itself can be a thread root */ - if (cgroup_is_threaded(cgrp)) - return false; - - /* but the ancestors can't be unless mixable */ - while ((cgrp = cgroup_parent(cgrp))) { - if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) - return false; - if (cgroup_is_threaded(cgrp)) - return false; - } - - return true; -} - /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; - if (parent) { - u16 ss_mask = parent->subtree_control; - - /* threaded cgroups can only have threaded controllers */ - if (cgroup_is_threaded(cgrp)) - ss_mask &= cgrp_dfl_threaded_ss_mask; - return ss_mask; - } + if (parent) + return parent->subtree_control; if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | @@ -447,14 +398,8 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - if (parent) { - u16 ss_mask = parent->subtree_ss_mask; - - /* threaded cgroups can only have threaded controllers */ - if (cgroup_is_threaded(cgrp)) - ss_mask &= cgrp_dfl_threaded_ss_mask; - return ss_mask; - } + if (parent) + return parent->subtree_ss_mask; return cgrp->root->subsys_mask; } @@ -544,37 +489,10 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, return css; } -/** - * 
__cgroup_task_count - count the number of tasks in a cgroup. The caller - * is responsible for taking the css_set_lock. - * @cgrp: the cgroup in question - */ -int __cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - lockdep_assert_held(&css_set_lock); - - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += link->cset->nr_tasks; - - return count; -} - -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) +/* convenient tests for these bits */ +static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - int count; - - spin_lock_irq(&css_set_lock); - count = __cgroup_task_count(cgrp); - spin_unlock_irq(&css_set_lock); - - return count; + return !(cgrp->self.flags & CSS_ONLINE); } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) @@ -597,6 +515,11 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); +static int notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor @@ -626,6 +549,15 @@ EXPORT_SYMBOL_GPL(of_css); ; \ else +/** + * for_each_subsys - iterate all enabled cgroup subsystems + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + */ +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) + /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor @@ -650,6 +582,10 @@ EXPORT_SYMBOL_GPL(of_css); } \ } while (false) +/* iterate across the hierarchies */ +#define for_each_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) + /* iterate over child cgrps, lock should be held throughout iteration */ #define 
cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ @@ -676,6 +612,29 @@ EXPORT_SYMBOL_GPL(of_css); ; \ else +static void cgroup_release_agent(struct work_struct *work); +static void check_for_release(struct cgroup *cgrp); + +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; +}; + /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state @@ -684,42 +643,20 @@ EXPORT_SYMBOL_GPL(of_css); * haven't been created. 
*/ struct css_set init_css_set = { - .refcount = REFCOUNT_INIT(1), - .dom_cset = &init_css_set, + .refcount = ATOMIC_INIT(1), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), - .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), - .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), - .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), - .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), - - /* - * The following field is re-initialized when this cset gets linked - * in cgroup_init(). However, let's initialize the field - * statically too so that the default cgroup can be accessed safely - * early during boot. - */ - .dfl_cgrp = &cgrp_dfl_root.cgrp, + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), }; static int css_set_count = 1; /* 1 for init_css_set */ -static bool css_set_threaded(struct css_set *cset) -{ - return cset->dom_cset != cset; -} - /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set - * - * css_set_populated() should be the same as !!cset->nr_tasks at steady - * state. However, css_set_populated() can be called while a task is being - * added to or removed from the linked list before the nr_tasks is - * properly updated. Hence, we can't just look at ->nr_tasks here. 
*/ static bool css_set_populated(struct css_set *cset) { @@ -729,48 +666,39 @@ static bool css_set_populated(struct css_set *cset) } /** - * cgroup_update_populated - update the populated count of a cgroup + * cgroup_update_populated - updated populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first - * task or losing the last. Update @cgrp->nr_populated_* accordingly. The - * count is propagated towards root so that a given cgroup's - * nr_populated_children is zero iff none of its descendants contain any - * tasks. - * - * @cgrp's interface file "cgroup.populated" is zero if both - * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and - * 1 otherwise. When the sum changes from or to zero, userland is notified - * that the content of the interface file has changed. This can be used to - * detect when @cgrp and its descendants become populated or empty. + * task or losing the last. Update @cgrp->populated_cnt accordingly. The + * count is propagated towards root so that a given cgroup's populated_cnt + * is zero iff the cgroup and all its descendants don't contain any tasks. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { - struct cgroup *child = NULL; - int adj = populated ? 
1 : -1; - lockdep_assert_held(&css_set_lock); do { - bool was_populated = cgroup_is_populated(cgrp); + bool trigger; - if (!child) { - cgrp->nr_populated_csets += adj; - } else { - if (cgroup_is_threaded(child)) - cgrp->nr_populated_threaded_children += adj; - else - cgrp->nr_populated_domain_children += adj; - } + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; - if (was_populated == cgroup_is_populated(cgrp)) + if (!trigger) break; - cgroup1_check_for_release(cgrp); + check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); - child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -781,7 +709,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the - * populated counters of all associated cgroups accordingly. + * ->populated_cnt of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { @@ -793,21 +721,6 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } -/* - * @task is leaving, advance task iterators which are pointing to it so - * that they can resume at the next position. Advancing an iterator might - * remove it from the list, use safe walk. See css_task_iter_skip() for - * details. - */ -static void css_set_skip_task_iters(struct css_set *cset, - struct task_struct *task) -{ - struct css_task_iter *it, *pos; - - list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) - css_task_iter_skip(it, task); -} - /** * css_set_move_task - move a task from one css_set to another * @task: task being moved @@ -819,7 +732,7 @@ static void css_set_skip_task_iters(struct css_set *cset, * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. 
* - * This function automatically handles populated counter updates and + * This function automatically handles populated_cnt updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ @@ -833,9 +746,22 @@ static void css_set_move_task(struct task_struct *task, css_set_update_populated(to_cset, true); if (from_cset) { + struct css_task_iter *it, *pos; + WARN_ON_ONCE(list_empty(&task->cg_list)); - css_set_skip_task_iters(from_cset, task); + /* + * @task is leaving, advance task iterators which are + * pointing to it so that they can resume at the next + * position. Advancing an iterator might remove it from + * the list, use safe walk. See css_task_iter_advance*() + * for details. + */ + list_for_each_entry_safe(it, pos, &from_cset->task_iters, + iters_node) + if (it->task_pos == &task->cg_list) + css_task_iter_advance(it); + list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -879,7 +805,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -void put_css_set_locked(struct css_set *cset) +static void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; @@ -887,11 +813,9 @@ void put_css_set_locked(struct css_set *cset) lockdep_assert_held(&css_set_lock); - if (!refcount_dec_and_test(&cset->refcount)) + if (!atomic_dec_and_test(&cset->refcount)) return; - WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); - /* This css_set is dead. 
unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); @@ -908,14 +832,34 @@ void put_css_set_locked(struct css_set *cset) kfree(link); } - if (css_set_threaded(cset)) { - list_del(&cset->threaded_csets_node); - put_css_set_locked(cset->dom_cset); - } - kfree_rcu(cset, rcu_head); } +static void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + atomic_inc(&cset->refcount); +} + /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -931,7 +875,6 @@ static bool compare_css_sets(struct css_set *cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { - struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* @@ -942,16 +885,6 @@ static bool compare_css_sets(struct css_set *cset, if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; - - /* @cset's domain should match the default cgroup's */ - if (cgroup_on_dfl(new_cgrp)) - new_dfl_cgrp = new_cgrp; - else - new_dfl_cgrp = old_cset->dfl_cgrp; - - if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) - return false; - /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. 
As different cgroups may @@ -1158,18 +1091,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, return NULL; } - refcount_set(&cset->refcount, 1); - cset->dom_cset = cset; + atomic_set(&cset->refcount, 1); + INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); - INIT_LIST_HEAD(&cset->dying_tasks); + INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_node); INIT_LIST_HEAD(&cset->task_iters); - INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); - INIT_LIST_HEAD(&cset->cgrp_links); - INIT_LIST_HEAD(&cset->mg_src_preload_node); - INIT_LIST_HEAD(&cset->mg_dst_preload_node); - INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ @@ -1203,32 +1132,10 @@ static struct css_set *find_css_set(struct css_set *old_cset, spin_unlock_irq(&css_set_lock); - /* - * If @cset should be threaded, look up the matching dom_cset and - * link them up. We first fully initialize @cset then look for the - * dom_cset. It's simpler this way and safe as @cset is guaranteed - * to stay empty until we return. 
- */ - if (cgroup_is_threaded(cset->dfl_cgrp)) { - struct css_set *dcset; - - dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); - if (!dcset) { - put_css_set(cset); - return NULL; - } - - spin_lock_irq(&css_set_lock); - cset->dom_cset = dcset; - list_add_tail(&cset->threaded_csets_node, - &dcset->threaded_csets); - spin_unlock_irq(&css_set_lock); - } - return cset; } -struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -1256,7 +1163,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } -void cgroup_free_root(struct cgroup_root *root) +static void cgroup_free_root(struct cgroup_root *root) { if (root) { idr_destroy(&root->cgroup_idr); @@ -1352,8 +1259,6 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, if (cset == &init_css_set) { res = &root->cgrp; - } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; @@ -1375,8 +1280,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex and css_set_lock held. */ -struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroup_root *root) +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the @@ -1413,6 +1318,7 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; +static const struct file_operations proc_cgroupstats_operations; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) @@ -1425,7 +1331,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? 
ss->name : ss->legacy_name, cft->name); else - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1506,7 +1412,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ -void cgroup_kn_unlock(struct kernfs_node *kn) +static void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -1538,7 +1444,8 @@ void cgroup_kn_unlock(struct kernfs_node *kn) * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, + bool drain_offline) { struct cgroup *cgrp; @@ -1601,17 +1508,8 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - if (!css->ss) { - if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; - else - cfts = cgroup1_base_files; - + list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); - } else { - list_for_each_entry(cfts, &css->ss->cfts, node) - cgroup_addrm_files(css, cgrp, cfts, false); - } } /** @@ -1631,20 +1529,18 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; + cfts = cgroup_dfl_base_files; else - cfts = cgroup1_base_files; + cfts = cgroup_legacy_base_files; - ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - if (ret < 0) - return ret; - } else { - list_for_each_entry(cfts, &css->ss->cfts, node) { - ret = cgroup_addrm_files(css, cgrp, cfts, true); - if (ret < 0) { - failed_cfts = cfts; - goto err; - } + return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + } + + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if 
(ret < 0) { + failed_cfts = cfts; + goto err; } } @@ -1660,7 +1556,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) return ret; } -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; @@ -1753,8 +1649,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } -int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root) +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; @@ -1780,56 +1676,245 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) { - char *token; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_subsys *ss; + int ssid; + + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); + if (root->flags & CGRP_ROOT_NOPREFIX) + seq_puts(seq, ",noprefix"); + if (root->flags & CGRP_ROOT_XATTR) + seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); + if (strlen(root->release_agent_path)) + seq_show_option(seq, "release_agent", + root->release_agent_path); + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) + seq_puts(seq, ",clone_children"); + if (strlen(root->name)) + seq_show_option(seq, "name", root->name); + return 0; +} - *root_flags = 0; +struct cgroup_sb_opts { + u16 subsys_mask; + unsigned int flags; + char *release_agent; + bool cpuset_clone_children; + char *name; + /* User explicitly requested empty subsystem */ + bool none; +}; - if (!data || *data == '\0') - return 
0; +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) +{ + char *token, *o = data; + bool all_ss = false, one_ss = false; + u16 mask = U16_MAX; + struct cgroup_subsys *ss; + int nr_opts = 0; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif - while ((token = strsep(&data, ",")) != NULL) { - if (!strcmp(token, "nsdelegate")) { - *root_flags |= CGRP_ROOT_NS_DELEGATE; + memset(opts, 0, sizeof(*opts)); + + while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + + if (!*token) + return -EINVAL; + if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { + opts->flags |= CGRP_ROOT_NOPREFIX; + continue; + } + if (!strcmp(token, "clone_children")) { + opts->cpuset_clone_children = true; + continue; + } + if (!strcmp(token, "xattr")) { + opts->flags |= CGRP_ROOT_XATTR; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; continue; } + if (!strncmp(token, "name=", 5)) { + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN - 1, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; - pr_err("cgroup2: unknown option \"%s\"\n", token); - return -EINVAL; - } + continue; + } - return 0; -} + 
for_each_subsys(ss, i) { + if (strcmp(token, ss->legacy_name)) + continue; + if (!cgroup_ssid_enabled(i)) + continue; + if (cgroup_ssid_no_v1(i)) + continue; -static void apply_cgroup_root_flags(unsigned int root_flags) -{ - if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { - if (root_flags & CGRP_ROOT_NS_DELEGATE) - cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; - else - cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + opts->subsys_mask |= (1 << i); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; } -} -static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) -{ - if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) - seq_puts(seq, ",nsdelegate"); + /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* + * Option noprefix was introduced just for backward compatibility + * with the old cpuset, so we allow noprefix only if mounting just + * the cpuset subsystem. 
+ */ + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) + return -EINVAL; + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_mask && opts->none) + return -EINVAL; + return 0; } static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { - unsigned int root_flags; - int ret; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) - return ret; + int ret = 0; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_sb_opts opts; + u16 added_mask, removed_mask; - apply_cgroup_root_flags(root_flags); - return 0; + if (root == &cgrp_dfl_root) { + pr_err("remount is not allowed\n"); + return -EINVAL; + } + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + + /* See cgroup_mount release_agent handling */ + if (opts.release_agent && + ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { + ret = -EINVAL; + goto out_unlock; + } + + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + + /* Don't allow flags or name to change at remount */ + if ((opts.flags ^ root->flags) || + (opts.name && strcmp(opts.name, root->name))) { + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", + opts.flags, opts.name ?: "", root->flags, root->name); + ret = -EINVAL; + goto out_unlock; + } + + /* remounting is not allowed for populated hierarchies */ + if (!list_empty(&root->cgrp.self.children)) { + ret = -EBUSY; + goto out_unlock; + } + + ret = rebind_subsystems(root, added_mask); + if (ret) + goto out_unlock; + + WARN_ON(rebind_subsystems(&cgrp_dfl_root, 
removed_mask)); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); + strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } + + trace_cgroup_remount(root); + + out_unlock: + kfree(opts.release_agent); + kfree(opts.name); + mutex_unlock(&cgroup_mutex); + return ret; } /* @@ -1882,7 +1967,6 @@ static void cgroup_enable_task_cg_lists(void) css_set_update_populated(cset, true); list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); - cset->nr_tasks++; } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); @@ -1903,18 +1987,16 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; - cgrp->dom_cgrp = cgrp; - cgrp->max_descendants = INT_MAX; - cgrp->max_depth = INT_MAX; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); + INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); } -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) +static void init_cgroup_root(struct cgroup_root *root, + struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->cgrp; @@ -1926,18 +2008,17 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); + strcpy(root->release_agent_path, opts->release_agent); if (opts->name) - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); + strcpy(root->name, opts->name); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; - struct kernfs_syscall_ops *kf_sops; 
struct css_set *cset; int i, ret; @@ -1949,8 +2030,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root_cgrp->id = ret; root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, - 0, GFP_KERNEL); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, + GFP_KERNEL); if (ret) goto out; @@ -1969,10 +2050,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto cancel_ref; - kf_sops = root == &cgrp_dfl_root ? - &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; - - root->kf_root = kernfs_create_root(kf_sops, + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED, root_cgrp); if (IS_ERR(root->kf_root)) { @@ -2033,52 +2111,20 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) return ret; } -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns) -{ - struct dentry *dentry; - bool new_sb = false; - - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); - - /* - * In non-init cgroup namespace, instead of root cgroup's dentry, - * we return the dentry corresponding to the cgroupns->root_cgrp. 
- */ - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { - struct dentry *nsdentry; - struct super_block *sb = dentry->d_sb; - struct cgroup *cgrp; - - mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); - - cgrp = cset_cgroup_from_root(ns->root_cset, root); - - spin_unlock_bh(&css_set_lock); - mutex_unlock(&cgroup_mutex); - - nsdentry = kernfs_node_dentry(cgrp->kn, sb); - dput(dentry); - if (IS_ERR(nsdentry)) - deactivate_locked_super(sb); - dentry = nsdentry; - } - - if (!new_sb) - cgroup_put(&root->cgrp); - - return dentry; -} - static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; + struct super_block *pinned_sb = NULL; struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_subsys *ss; + struct cgroup_root *root; + struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + int i; + bool new_sb; get_cgroup_ns(ns); @@ -2095,25 +2141,190 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - if (fs_type == &cgroup2_fs_type) { - unsigned int root_flags; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) { + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); put_cgroup_ns(ns); - return ERR_PTR(ret); + return ERR_PTR(-EINVAL); } - cgrp_dfl_visible = true; - cgroup_get(&cgrp_dfl_root.cgrp); + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, - CGROUP2_SUPER_MAGIC, ns); - if (!IS_ERR(dentry)) - apply_cgroup_root_flags(root_flags); - } else { - dentry = cgroup1_mount(&cgroup_fs_type, flags, data, - CGROUP_SUPER_MAGIC, ns); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* + * 
Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + + for_each_root(root) { + bool name_match = false; + + if (root == &cgrp_dfl_root) + continue; + + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } + + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. + */ + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } + + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); + + /* + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. 
+ */ + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + + ret = 0; + goto out_unlock; + } + + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { + ret = -EPERM; + goto out_unlock; + } + + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if (opts.release_agent && + ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { + ret = -EINVAL; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; + } + + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); + +out_unlock: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(opts.release_agent); + kfree(opts.name); + + if (ret) { + put_cgroup_ns(ns); + return ERR_PTR(ret); + } +out_mount: + dentry = kernfs_mount(fs_type, flags, root->kf_root, + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); + + /* + * In non-init cgroup namespace, instead of root cgroup's + * dentry, we return the dentry corresponding to the + * cgroupns->root_cgrp. 
+ */ + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; + } + + if (IS_ERR(dentry) || !new_sb) + cgroup_put(&root->cgrp); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) { + WARN_ON(new_sb); + deactivate_super(pinned_sb); } put_cgroup_ns(ns); @@ -2126,20 +2337,22 @@ static void cgroup_kill_sb(struct super_block *sb) struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* - * If @root doesn't have any children, start killing it. + * If @root doesn't have any mounts or children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. * * And don't kill the default root. 
*/ - if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && - !percpu_ref_is_dying(&root->cgrp.self.refcnt)) + if (!list_empty(&root->cgrp.self.children) || + root == &cgrp_dfl_root) + cgroup_put(&root->cgrp); + else percpu_ref_kill(&root->cgrp.self.refcnt); - cgroup_put(&root->cgrp); + kernfs_kill_sb(sb); } -struct file_system_type cgroup_fs_type = { +static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, @@ -2153,8 +2366,8 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); @@ -2217,18 +2430,49 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); +/* used to track tasks and other necessary states during migration */ +struct cgroup_taskset { + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* the subsys currently being processed */ + int ssid; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. 
+ */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; +}; + +#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + /** - * cgroup_migrate_add_task - add a migration target task to a migration context + * cgroup_taskset_add - try to add a migration target task to a taskset * @task: target task - * @mgctx: target migration context + * @tset: target taskset * - * Add @task, which is a migration target, to @mgctx->tset. This function - * becomes noop if @task doesn't need to be migrated. @task's css_set - * should have been added as a migration source and @task->cg_list will be - * moved from the css_set's tasks list to mg_tasks one. + * Add @task, which is a migration target, to @tset. This function becomes + * noop if @task doesn't need to be migrated. @task's css_set should have + * been added as a migration source and @task->cg_list will be moved from + * the css_set's tasks list to mg_tasks one. 
*/ -static void cgroup_migrate_add_task(struct task_struct *task, - struct cgroup_mgctx *mgctx) +static void cgroup_taskset_add(struct task_struct *task, + struct cgroup_taskset *tset) { struct css_set *cset; @@ -2246,15 +2490,12 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (!cset->mg_src_cgrp) return; - mgctx->tset.nr_tasks++; - list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, - &mgctx->tset.src_csets); + list_add_tail(&cset->mg_node, &tset->src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) - list_add_tail(&cset->mg_dst_cset->mg_node, - &mgctx->tset.dst_csets); + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset->dst_csets); } /** @@ -2321,34 +2562,37 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset - * @mgctx: migration context + * @tset: taget taskset + * @root: cgroup root the migration is taking place on * - * Migrate tasks in @mgctx as setup by migration preparation functions. + * Migrate tasks in @tset as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and - * guarantees that either all or none of the tasks in @mgctx are migrated. - * @mgctx is consumed regardless of success. + * guarantees that either all or none of the tasks in @tset are migrated. + * @tset is consumed regardless of success. 
*/ -static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) +static int cgroup_taskset_migrate(struct cgroup_taskset *tset, + struct cgroup_root *root) { - struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; + /* methods shouldn't be called if no task is actually migrating */ + if (list_empty(&tset->src_csets)) + return 0; + /* check that we can legitimately attach to the cgroup */ - if (tset->nr_tasks) { - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->can_attach) { - tset->ssid = ssid; - ret = ss->can_attach(tset); - if (ret) { - failed_ssid = ssid; - goto out_cancel_attach; - } + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); + if (ret) { + failed_ssid = ssid; + goto out_cancel_attach; } - } while_each_subsys_mask(); - } + } + } while_each_subsys_mask(); /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2362,17 +2606,8 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); - to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); - from_cset->nr_tasks--; - /* - * If the source or destination cgroup is frozen, - * the task might require to change its state. 
- */ - cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, - to_cset->dfl_cgrp); put_css_set_locked(from_cset); - } } spin_unlock_irq(&css_set_lock); @@ -2384,29 +2619,25 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) */ tset->csets = &tset->dst_csets; - if (tset->nr_tasks) { - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->attach) { - tset->ssid = ssid; - ss->attach(tset); - } - } while_each_subsys_mask(); - } + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); + } + } while_each_subsys_mask(); ret = 0; goto out_release_tset; out_cancel_attach: - if (tset->nr_tasks) { - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ssid == failed_ssid) - break; - if (ss->cancel_attach) { - tset->ssid = ssid; - ss->cancel_attach(tset); - } - } while_each_subsys_mask(); - } + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ssid == failed_ssid) + break; + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); + } + } while_each_subsys_mask(); out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); @@ -2415,87 +2646,44 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); - - /* - * Re-initialize the cgroup_taskset structure in case it is reused - * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() - * iteration. 
- */ - tset->nr_tasks = 0; - tset->csets = &tset->src_csets; return ret; } /** - * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * - * On the default hierarchy, except for the mixable, (possible) thread root - * and threaded cgroups, subtree_control must be zero for migration - * destination cgroups with tasks so that child cgroups don't compete - * against tasks. + * On the default hierarchy, except for the root, subtree_control must be + * zero for migration destination cgroups with tasks so that child cgroups + * don't compete against tasks. */ -int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) { - /* v1 doesn't have any restriction */ - if (!cgroup_on_dfl(dst_cgrp)) - return 0; - - /* verify @dst_cgrp can host resources */ - if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) - return -EOPNOTSUPP; - - /* mixables don't care */ - if (cgroup_is_mixable(dst_cgrp)) - return 0; - - /* - * If @dst_cgrp is already or can become a thread root or is - * threaded, it doesn't matter. - */ - if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) - return 0; - - /* apply no-internal-process constraint */ - if (dst_cgrp->subtree_control) - return -EBUSY; - - return 0; + return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || + !dst_cgrp->subtree_control; } /** * cgroup_migrate_finish - cleanup after attach - * @mgctx: migration context + * @preloaded_csets: list of preloaded css_sets * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. 
*/ -void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) +static void cgroup_migrate_finish(struct list_head *preloaded_csets) { struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - - list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_src_preload_node) { - cset->mg_src_cgrp = NULL; - cset->mg_dst_cgrp = NULL; - cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_src_preload_node); - put_css_set_locked(cset); - } - - list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, - mg_dst_preload_node) { + list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_dst_preload_node); + list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } - spin_unlock_irq(&css_set_lock); } @@ -2503,10 +2691,10 @@ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup - * @mgctx: migration context + * @preloaded_csets: list of preloaded css_sets * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin - * @src_cset and add it to @mgctx->src_csets, which should later be cleaned + * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem @@ -2515,9 +2703,9 @@ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) * into play and the preloaded css_sets are guaranteed to cover all * migrations. 
*/ -void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct cgroup_mgctx *mgctx) +static void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) { struct cgroup *src_cgrp; @@ -2534,7 +2722,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - if (!list_empty(&src_cset->mg_src_preload_node)) + if (!list_empty(&src_cset->mg_preload_node)) return; WARN_ON(src_cset->mg_src_cgrp); @@ -2545,39 +2733,37 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); + list_add(&src_cset->mg_preload_node, preloaded_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @mgctx: migration context + * @preloaded_csets: list of preloaded source css_sets * * Tasks are about to be moved and all the source css_sets have been - * preloaded to @mgctx->preloaded_src_csets. This function looks up and - * pins all destination css_sets, links each to its source, and append them - * to @mgctx->preloaded_dst_csets. + * preloaded to @preloaded_csets. This function looks up and pins all + * destination css_sets, links each to its source, and append them to + * @preloaded_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on - * @mgctx. + * @preloaded_csets. 
*/ -int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { + LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_src_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - struct cgroup_subsys *ss; - int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) - return -ENOMEM; + goto err; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); @@ -2589,7 +2775,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; - list_del_init(&src_cset->mg_src_preload_node); + list_del_init(&src_cset->mg_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; @@ -2597,25 +2783,24 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) src_cset->mg_dst_cset = dst_cset; - if (list_empty(&dst_cset->mg_dst_preload_node)) - list_add_tail(&dst_cset->mg_dst_preload_node, - &mgctx->preloaded_dst_csets); + if (list_empty(&dst_cset->mg_preload_node)) + list_add(&dst_cset->mg_preload_node, &csets); else put_css_set(dst_cset); - - for_each_subsys(ss, ssid) - if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) - mgctx->ss_mask |= 1 << ssid; } + list_splice_tail(&csets, preloaded_csets); return 0; +err: + cgroup_migrate_finish(&csets); + return -ENOMEM; } /** * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @mgctx: migration context + * @root: cgroup root migration is taking place on * * Migrate a process or task denoted by @leader. 
If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also @@ -2629,9 +2814,10 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ -int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx) +static int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_root *root) { + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; /* @@ -2643,14 +2829,14 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_lock(); task = leader; do { - cgroup_migrate_add_task(task, mgctx); + cgroup_taskset_add(task, &tset); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_migrate_execute(mgctx); + return cgroup_taskset_migrate(&tset, root); } /** @@ -2661,23 +2847,23 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. 
*/ -int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, - bool threadgroup) +static int cgroup_attach_task(struct cgroup *dst_cgrp, + struct task_struct *leader, bool threadgroup) { - DEFINE_CGROUP_MGCTX(mgctx); + LIST_HEAD(preloaded_csets); struct task_struct *task; int ret; - ret = cgroup_migrate_vet_dst(dst_cgrp); - if (ret) - return ret; + if (!cgroup_may_migrate_to(dst_cgrp)) + return -EBUSY; /* look up all src csets */ spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { - cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, + &preloaded_csets); if (!threadgroup) break; } while_each_thread(leader, task); @@ -2685,11 +2871,11 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(&mgctx); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (!ret) - ret = cgroup_migrate(leader, threadgroup, &mgctx); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); - cgroup_migrate_finish(&mgctx); + cgroup_migrate_finish(&preloaded_csets); if (!ret) trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); @@ -2697,65 +2883,222 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) - __acquires(&cgroup_threadgroup_rwsem) +int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) { - struct task_struct *tsk; - pid_t pid; + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct cgroup_subsys_state *css; - if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return ERR_PTR(-EINVAL); + if (capable(CAP_SYS_NICE)) + return 0; - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_taskset_for_each(task, css, tset) { + tcred = __task_cred(task); - rcu_read_lock(); - if (pid) { - tsk = 
find_task_by_vpid(pid); - if (!tsk) { - tsk = ERR_PTR(-ESRCH); - goto out_unlock_threadgroup; - } - } else { - tsk = current; + if (current != task && !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + return -EACCES; } - if (threadgroup) - tsk = tsk->group_leader; + return 0; +} - /* +static int cgroup_procs_write_permission(struct task_struct *task, + struct cgroup *dst_cgrp, + struct kernfs_open_file *of) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = get_task_cred(task); + int ret = 0; + + /* + * even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_NICE)) + ret = -EACCES; + + if (!ret && cgroup_on_dfl(dst_cgrp)) { + struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup *cgrp; + struct inode *inode; + + spin_lock_irq(&css_set_lock); + cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + while (!cgroup_is_descendant(dst_cgrp, cgrp)) + cgrp = cgroup_parent(cgrp); + + ret = -ENOMEM; + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); + if (inode) { + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + } + } + + put_cred(tcred); + return ret; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will lock + * cgroup_mutex and threadgroup. 
+ */ +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) +{ + struct task_struct *tsk; + struct cgroup_subsys *ss; + struct cgroup *cgrp; + pid_t pid; + int ssid, ret; + + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + percpu_down_write(&cgroup_threadgroup_rwsem); + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + ret = -ESRCH; + goto out_unlock_rcu; + } + } else { + tsk = current; + } + + if (threadgroup) + tsk = tsk->group_leader; + + /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { - tsk = ERR_PTR(-EINVAL); - goto out_unlock_threadgroup; + ret = -EINVAL; + goto out_unlock_rcu; } get_task_struct(tsk); - goto out_unlock_rcu; + rcu_read_unlock(); + + ret = cgroup_procs_write_permission(tsk, cgrp, of); + if (!ret) + ret = cgroup_attach_task(cgrp, tsk, threadgroup); + + put_task_struct(tsk); + goto out_unlock_threadgroup; -out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); out_unlock_rcu: rcu_read_unlock(); - return tsk; +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; } -void cgroup_procs_write_finish(struct task_struct *task) - __releases(&cgroup_threadgroup_rwsem) +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { - struct cgroup_subsys *ss; 
- int ssid; + struct cgroup_root *root; + int retval = 0; - /* release reference from cgroup_procs_write_start() */ - put_task_struct(task); + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + for_each_root(root) { + struct cgroup *from_cgrp; + if (root == &cgrp_dfl_root) + continue; + + spin_lock_irq(&css_set_lock); + from_cgrp = task_cgroup_from_root(from, root); + spin_unlock_irq(&css_set_lock); + + retval = cgroup_attach_task(from_cgrp, tsk, false); + if (retval) + break; + } percpu_up_write(&cgroup_threadgroup_rwsem); - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, false); +} + +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, true); +} + +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. 
+ */ + if ((of->file->f_cred->user_ns != &init_user_ns) || + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + spin_lock(&release_agent_path_lock); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); + spin_unlock(&release_agent_path_lock); + cgroup_kn_unlock(of->kn); + return nbytes; +} + +static int cgroup_release_agent_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + spin_lock(&release_agent_path_lock); + seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); + seq_putc(seq, '\n'); + return 0; +} + +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, "0\n"); + return 0; } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -2803,7 +3146,8 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { - DEFINE_CGROUP_MGCTX(mgctx); + LIST_HEAD(preloaded_csets); + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; @@ -2819,29 +3163,33 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, dsct, &mgctx); + cgroup_migrate_add_src(link->cset, dsct, + &preloaded_csets); } spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(&mgctx); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, - mg_src_preload_node) { + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { struct task_struct *task, *ntask; + /* src_csets precede dst_csets, break on the first 
dst_cset */ + if (!src_cset->mg_src_cgrp) + break; + /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) - cgroup_migrate_add_task(task, &mgctx); + cgroup_taskset_add(task, &tset); } spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_execute(&mgctx); + ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: - cgroup_migrate_finish(&mgctx); + cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } @@ -2854,7 +3202,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ -void cgroup_lock_and_drain_offline(struct cgroup *cgrp) +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; @@ -2888,12 +3236,11 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) } /** - * cgroup_save_control - save control masks and dom_cgrp of a subtree + * cgroup_save_control - save control masks of a subtree * @cgrp: root of the target subtree * - * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the - * respective old_ prefixed fields for @cgrp's subtree including @cgrp - * itself. + * Save ->subtree_control and ->subtree_ss_mask to the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. 
*/ static void cgroup_save_control(struct cgroup *cgrp) { @@ -2903,7 +3250,6 @@ static void cgroup_save_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; - dsct->old_dom_cgrp = dsct->dom_cgrp; } } @@ -2929,12 +3275,11 @@ static void cgroup_propagate_control(struct cgroup *cgrp) } /** - * cgroup_restore_control - restore control masks and dom_cgrp of a subtree + * cgroup_restore_control - restore control masks of a subtree * @cgrp: root of the target subtree * - * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the - * respective old_ prefixed fields for @cgrp's subtree including @cgrp - * itself. + * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { @@ -2944,7 +3289,6 @@ static void cgroup_restore_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; - dsct->dom_cgrp = dsct->old_dom_cgrp; } } @@ -2984,6 +3328,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; @@ -2993,8 +3339,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) return PTR_ERR(css); } - WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); - if (css_visible(css)) { ret = css_populate_dir(css); if (ret) @@ -3030,11 +3374,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + if (!css) continue; - 
WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); - if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); @@ -3103,46 +3447,6 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } -static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) -{ - u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; - - /* if nothing is getting enabled, nothing to worry about */ - if (!enable) - return 0; - - /* can @cgrp host any resources? */ - if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) - return -EOPNOTSUPP; - - /* mixables don't care */ - if (cgroup_is_mixable(cgrp)) - return 0; - - if (domain_enable) { - /* can't enable domain controllers inside a thread subtree */ - if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) - return -EOPNOTSUPP; - } else { - /* - * Threaded controllers can handle internal competitions - * and are always allowed inside a (prospective) thread - * subtree. - */ - if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) - return 0; - } - - /* - * Controllers can't be enabled for a cgroup with tasks to avoid - * child cgroups competing against tasks. - */ - if (cgroup_has_tasks(cgrp)) - return -EBUSY; - - return 0; -} - /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -3218,9 +3522,33 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - ret = cgroup_vet_subtree_control_enable(cgrp, enable); - if (ret) - goto out_unlock; + /* + * Except for the root, subtree_control must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgroup_parent(cgrp)) { + struct cgrp_cset_link *link; + + /* + * Because namespaces pin csets too, @cgrp->cset_links + * might not be empty even when @cgrp is empty. Walk and + * verify each cset. 
+ */ + spin_lock_irq(&css_set_lock); + + ret = 0; + list_for_each_entry(link, &cgrp->cset_links, cset_link) { + if (css_set_populated(link->cset)) { + ret = -EBUSY; + break; + } + } + + spin_unlock_irq(&css_set_lock); + + if (ret) + goto out_unlock; + } /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); @@ -3239,193 +3567,10 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return ret ?: nbytes; } -/** - * cgroup_enable_threaded - make @cgrp threaded - * @cgrp: the target cgroup - * - * Called when "threaded" is written to the cgroup.type interface file and - * tries to make @cgrp threaded and join the parent's resource domain. - * This function is never called on the root cgroup as cgroup.type doesn't - * exist on it. - */ -static int cgroup_enable_threaded(struct cgroup *cgrp) -{ - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup *dom_cgrp = parent->dom_cgrp; - struct cgroup *dsct; - struct cgroup_subsys_state *d_css; - int ret; - - lockdep_assert_held(&cgroup_mutex); - - /* noop if already threaded */ - if (cgroup_is_threaded(cgrp)) - return 0; - - /* - * If @cgroup is populated or has domain controllers enabled, it - * can't be switched. While the below cgroup_can_be_thread_root() - * test can catch the same conditions, that's only when @parent is - * not mixable, so let's check it explicitly. - */ - if (cgroup_is_populated(cgrp) || - cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) - return -EOPNOTSUPP; - - /* we're joining the parent's domain, ensure its validity */ - if (!cgroup_is_valid_domain(dom_cgrp) || - !cgroup_can_be_thread_root(dom_cgrp)) - return -EOPNOTSUPP; - - /* - * The following shouldn't cause actual migrations and should - * always succeed. 
- */ - cgroup_save_control(cgrp); - - cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) - if (dsct == cgrp || cgroup_is_threaded(dsct)) - dsct->dom_cgrp = dom_cgrp; - - ret = cgroup_apply_control(cgrp); - if (!ret) - parent->nr_threaded_children++; - - cgroup_finalize_control(cgrp, ret); - return ret; -} - -static int cgroup_type_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - if (cgroup_is_threaded(cgrp)) - seq_puts(seq, "threaded\n"); - else if (!cgroup_is_valid_domain(cgrp)) - seq_puts(seq, "domain invalid\n"); - else if (cgroup_is_thread_root(cgrp)) - seq_puts(seq, "domain threaded\n"); - else - seq_puts(seq, "domain\n"); - - return 0; -} - -static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - int ret; - - /* only switching to threaded mode is supported */ - if (strcmp(strstrip(buf), "threaded")) - return -EINVAL; - - /* drain dying csses before we re-apply (threaded) subtree control */ - cgrp = cgroup_kn_lock_live(of->kn, true); - if (!cgrp) - return -ENOENT; - - /* threaded can only be enabled */ - ret = cgroup_enable_threaded(cgrp); - - cgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -static int cgroup_max_descendants_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - int descendants = READ_ONCE(cgrp->max_descendants); - - if (descendants == INT_MAX) - seq_puts(seq, "max\n"); - else - seq_printf(seq, "%d\n", descendants); - - return 0; -} - -static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - int descendants; - ssize_t ret; - - buf = strstrip(buf); - if (!strcmp(buf, "max")) { - descendants = INT_MAX; - } else { - ret = kstrtoint(buf, 0, &descendants); - if (ret) - return ret; - } - - if (descendants < 0) - return -ERANGE; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENOENT; - - 
cgrp->max_descendants = descendants; - - cgroup_kn_unlock(of->kn); - - return nbytes; -} - -static int cgroup_max_depth_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - int depth = READ_ONCE(cgrp->max_depth); - - if (depth == INT_MAX) - seq_puts(seq, "max\n"); - else - seq_printf(seq, "%d\n", depth); - - return 0; -} - -static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - ssize_t ret; - int depth; - - buf = strstrip(buf); - if (!strcmp(buf, "max")) { - depth = INT_MAX; - } else { - ret = kstrtoint(buf, 0, &depth); - if (ret) - return ret; - } - - if (depth < 0) - return -ERANGE; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENOENT; - - cgrp->max_depth = depth; - - cgroup_kn_unlock(of->kn); - - return nbytes; -} - static int cgroup_events_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; - - seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); - seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); - + seq_printf(seq, "populated %d\n", + cgroup_is_populated(seq_css(seq)->cgroup)); return 0; } @@ -3519,108 +3664,31 @@ bool cgroup_psi_enabled(void) #endif /* CONFIG_PSI */ -static int cgroup_stat_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgroup = seq_css(seq)->cgroup; - - seq_printf(seq, "nr_descendants %d\n", - cgroup->nr_descendants); - seq_printf(seq, "nr_dying_descendants %d\n", - cgroup->nr_dying_descendants); - - return 0; -} - -static int cgroup_freeze_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - seq_printf(seq, "%d\n", cgrp->freezer.freeze); - - return 0; -} - -static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - ssize_t ret; - int freeze; - - ret = kstrtoint(strstrip(buf), 0, &freeze); - if (ret) - return ret; - - if (freeze < 
0 || freeze > 1) - return -ERANGE; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENOENT; - - cgroup_freeze(cgrp, freeze); - - cgroup_kn_unlock(of->kn); - - return nbytes; -} - static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; - struct cgroup_file_ctx *ctx; - int ret; - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->ns = current->nsproxy->cgroup_ns; - get_cgroup_ns(ctx->ns); - of->priv = ctx; - if (!cft->open) - return 0; - - ret = cft->open(of); - if (ret) { - put_cgroup_ns(ctx->ns); - kfree(ctx); - } - return ret; + if (cft->open) + return cft->open(of); + return 0; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; - struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); - put_cgroup_ns(ctx->ns); - kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; struct cgroup_subsys_state *css; int ret; - /* - * If namespaces are delegation boundaries, disallow writes to - * files in an non-init namespace root from inside the namespace - * except for the files explicitly marked delegatable - - * cgroup.procs and cgroup.subtree_control. - */ - if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && - !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) - return -EPERM; - if (cft->write) return cft->write(of, buf, nbytes, off); @@ -3715,6 +3783,52 @@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; +/* + * cgroup_rename - Only allow simple rename of directories in place. 
+ */ +static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) +{ + struct cgroup *cgrp = kn->priv; + int ret; + + /* do not accept '\n' to prevent making /proc//cgroup unparsable */ + if (strchr(new_name_str, '\n')) + return -EINVAL; + + if (kernfs_type(kn) != KERNFS_DIR) + return -ENOTDIR; + if (kn->parent != new_parent) + return -EIO; + + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow on the default hierarchy. + */ + if (cgroup_on_dfl(cgrp)) + return -EPERM; + + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_mutex. + */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; +} + /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -3814,6 +3928,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { + LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; @@ -4014,6 +4129,26 @@ void cgroup_file_notify(struct cgroup_file *cfile) spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. 
+ */ +static int cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); + spin_unlock_irq(&css_set_lock); + return count; +} + /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) @@ -4241,58 +4376,6 @@ bool css_has_online_children(struct cgroup_subsys_state *css) return ret; } -static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) -{ - struct list_head *l; - struct cgrp_cset_link *link; - struct css_set *cset; - - lockdep_assert_held(&css_set_lock); - - /* find the next threaded cset */ - if (it->tcset_pos) { - l = it->tcset_pos->next; - - if (l != it->tcset_head) { - it->tcset_pos = l; - return container_of(l, struct css_set, - threaded_csets_node); - } - - it->tcset_pos = NULL; - } - - /* find the next cset */ - l = it->cset_pos; - l = l->next; - if (l == it->cset_head) { - it->cset_pos = NULL; - return NULL; - } - - if (it->ss) { - cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); - } else { - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } - - it->cset_pos = l; - - /* initialize threaded css_set walking */ - if (it->flags & CSS_TASK_ITER_THREADED) { - if (it->cur_dcset) - put_css_set_locked(it->cur_dcset); - it->cur_dcset = cset; - get_css_set(cset); - - it->tcset_head = &cset->threaded_csets; - it->tcset_pos = &cset->threaded_csets; - } - - return cset; -} - /** * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance @@ -4301,33 +4384,39 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { + struct list_head *l = it->cset_pos; + struct cgrp_cset_link *link; struct css_set *cset; 
lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set */ do { - cset = css_task_iter_next_css_set(it); - if (!cset) { + l = l->next; + if (l == it->cset_head) { + it->cset_pos = NULL; it->task_pos = NULL; return; } - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - if (!list_empty(&cset->tasks)) { + if (it->ss) { + cset = container_of(l, struct css_set, + e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } + } while (!css_set_populated(cset)); + + it->cset_pos = l; + + if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; - it->cur_tasks_head = &cset->tasks; - } else if (!list_empty(&cset->mg_tasks)) { + else it->task_pos = cset->mg_tasks.next; - it->cur_tasks_head = &cset->mg_tasks; - } else { - it->task_pos = cset->dying_tasks.next; - it->cur_tasks_head = &cset->dying_tasks; - } it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; - it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus @@ -4353,74 +4442,32 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) list_add(&it->iters_node, &cset->task_iters); } -static void css_task_iter_skip(struct css_task_iter *it, - struct task_struct *task) -{ - lockdep_assert_held(&css_set_lock); - - if (it->task_pos == &task->cg_list) { - it->task_pos = it->task_pos->next; - it->flags |= CSS_TASK_ITER_SKIPPED; - } -} - static void css_task_iter_advance(struct css_task_iter *it) { - struct task_struct *task; + struct list_head *l = it->task_pos; lockdep_assert_held(&css_set_lock); -repeat: - if (it->task_pos) { - /* - * Advance iterator to find next entry. cset->tasks is - * consumed first and then ->mg_tasks. After ->mg_tasks, - * we move onto the next cset. 
- */ - if (it->flags & CSS_TASK_ITER_SKIPPED) - it->flags &= ~CSS_TASK_ITER_SKIPPED; - else - it->task_pos = it->task_pos->next; + WARN_ON_ONCE(!l); - if (it->task_pos == it->tasks_head) { - it->task_pos = it->mg_tasks_head->next; - it->cur_tasks_head = it->mg_tasks_head; - } - if (it->task_pos == it->mg_tasks_head) { - it->task_pos = it->dying_tasks_head->next; - it->cur_tasks_head = it->dying_tasks_head; - } - if (it->task_pos == it->dying_tasks_head) - css_task_iter_advance_css_set(it); - } else { - /* called from start, proceed to the first cset */ - css_task_iter_advance_css_set(it); - } - - if (!it->task_pos) - return; - - task = list_entry(it->task_pos, struct task_struct, cg_list); + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ + l = l->next; - if (it->flags & CSS_TASK_ITER_PROCS) { - /* if PROCS, skip over tasks which aren't group leaders */ - if (!thread_group_leader(task)) - goto repeat; + if (l == it->tasks_head) + l = it->mg_tasks_head->next; - /* and dying leaders w/o live member threads */ - if (it->cur_tasks_head == it->dying_tasks_head && - !atomic_read(&task->signal->live)) - goto repeat; - } else { - /* skip all dying ones */ - if (it->cur_tasks_head == it->dying_tasks_head) - goto repeat; - } + if (l == it->mg_tasks_head) + css_task_iter_advance_css_set(it); + else + it->task_pos = l; } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of - * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call @@ -4428,7 +4475,7 @@ static void css_task_iter_advance(struct css_task_iter *it) * returns NULL. On completion of iteration, css_task_iter_end() must be * called. 
*/ -void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, +void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) { /* no one should try to iterate before mounting cgroups */ @@ -4439,7 +4486,6 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, spin_lock_irq(&css_set_lock); it->ss = css->ss; - it->flags = flags; if (it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; @@ -4448,7 +4494,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, it->cset_head = it->cset_pos; - css_task_iter_advance(it); + css_task_iter_advance_css_set(it); spin_unlock_irq(&css_set_lock); } @@ -4470,10 +4516,6 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) spin_lock_irq(&css_set_lock); - /* @it may be half-advanced by skips, finish advancing */ - if (it->flags & CSS_TASK_ITER_SKIPPED) - css_task_iter_advance(it); - if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); @@ -4501,276 +4543,576 @@ void css_task_iter_end(struct css_task_iter *it) spin_unlock_irq(&css_set_lock); } - if (it->cur_dcset) - put_css_set(it->cur_dcset); - if (it->cur_task) put_task_struct(it->cur_task); } -static void cgroup_procs_release(struct kernfs_open_file *of) +/** + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + * + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. 
+ */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + LIST_HEAD(preloaded_csets); + struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; + int ret; + + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + + mutex_lock(&cgroup_mutex); + + percpu_down_write(&cgroup_threadgroup_rwsem); + + /* all tasks in @from are being moved, all csets are source */ + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_migrate_prepare_dst(&preloaded_csets); + if (ret) + goto out_err; + + /* + * Migrate tasks one-by-one until @from is empty. This fails iff + * ->can_attach() fails. + */ + do { + css_task_iter_start(&from->self, &it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + return ret; +} + +/* + * Stuff for reading the 'tasks'/'procs' files. + * + * Reading this file can return large amounts of data if a cgroup has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + */ + +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). 
We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* for delayed destruction */ + struct delayed_work destroy_dwork; +}; + +/* + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) { - struct cgroup_file_ctx *ctx = of->priv; + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} - if (ctx->procs.started) - css_task_iter_end(&ctx->procs.iter); +static void pidlist_free(void *p) +{ + kvfree(p); } -static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) +/* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. 
+ */ +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) { - struct kernfs_open_file *of = s->private; - struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l, *tmp_l; - if (pos) - (*pos)++; + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); - return css_task_iter_next(&ctx->procs.iter); + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); } -static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, - unsigned int iter_flags) +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) { - struct kernfs_open_file *of = s->private; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct cgroup_file_ctx *ctx = of->priv; - struct css_task_iter *it = &ctx->procs.iter; + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); /* - * When a seq_file is seeked, it's always traversed sequentially - * from position 0, so we can simply keep iterating on !0 *pos. + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. 
*/ - if (!ctx->procs.started) { - if (WARN_ON_ONCE((*pos))) - return ERR_PTR(-EINVAL); - css_task_iter_start(&cgrp->self, iter_flags, it); - ctx->procs.started = true; - } else if (!(*pos)) { - css_task_iter_end(it); - css_task_iter_start(&cgrp->self, iter_flags, it); - } else - return it->cur_task; + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } - return cgroup_procs_next(s, NULL, NULL); + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); } -static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * Returns the number of unique elements. + */ +static int pidlist_uniq(pid_t *list, int length) { - struct cgroup *cgrp = seq_css(s)->cgroup; + int src, dest = 1; /* - * All processes of a threaded subtree belong to the domain cgroup - * of the subtree. Only threads can be distributed across the - * subtree. Reject reads on cgroup.procs in the subtree proper. - * They're always empty anyway. + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either */ - if (cgroup_is_threaded(cgrp)) - return ERR_PTR(-EOPNOTSUPP); + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + return dest; +} + +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. 
As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + * + * All this extra complexity was caused by the original implementation + * committing to an entirely unnecessary property. In the long term, we + * want to do away with it. Explicitly scramble sort order if on the + * default hierarchy so that no such expectation exists in the new + * interface. + * + * Scrambling is done by swapping every two consecutive bits, which is + * non-identity one-to-one mapping which disturbs sort order sufficiently. + */ +static pid_t pid_fry(pid_t pid) +{ + unsigned a = pid & 0x55555555; + unsigned b = pid & 0xAAAAAAAA; - return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | - CSS_TASK_ITER_THREADED); + return (a << 1) | (b >> 1); } -static int cgroup_procs_show(struct seq_file *s, void *v) +static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) { - seq_printf(s, "%d\n", task_pid_vnr(v)); - return 0; + if (cgroup_on_dfl(cgrp)) + return pid_fry(pid); + else + return pid; } -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) +static int cmppid(const void *a, const void *b) { - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; + return *(pid_t *)a - *(pid_t *)b; +} - if (capable(CAP_SYS_NICE)) - return 0; +static int fried_cmppid(const void *a, const void *b) +{ + return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); +} - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct 
pid_namespace *ns = task_active_pid_ns(current); - if (current != task && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - return -EACCES; - } + lockdep_assert_held(&cgrp->pidlist_mutex); - return 0; + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; } -static int cgroup_procs_write_permission(struct cgroup *src_cgrp, - struct cgroup *dst_cgrp, - struct super_block *sb, - struct cgroup_namespace *ns) +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. + */ +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) { - struct cgroup *com_cgrp = src_cgrp; - struct inode *inode; - int ret; + struct cgroup_pidlist *l; - lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgrp->pidlist_mutex); - /* find the common ancestor */ - while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) - com_cgrp = cgroup_parent(com_cgrp); + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; - /* %current should be authorized to migrate to the common ancestor */ - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); - if (!inode) - return -ENOMEM; + /* entry not found; create a new one */ + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) + return l; - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - if (ret) - return ret; + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); + l->key.type = type; + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + return l; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, 
enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ + struct css_task_iter it; + struct task_struct *tsk; + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); /* - * If namespaces are delegation boundaries, %current must be able - * to see both source and destination cgroups from its namespace. + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. */ - if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && - (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || - !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) - return -ENOENT; + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + if (unlikely(n == length)) + break; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; + } + css_task_iter_end(&it); + length = n; + /* now sort & (if procs) strip out duplicates */ + if (cgroup_on_dfl(cgrp)) + sort(array, length, sizeof(pid_t), fried_cmppid, NULL); + else + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(array, length); + l = cgroup_pidlist_find_create(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + + /* store array, freeing old if necessary */ + pidlist_free(l->list); + l->list = array; + l->length = length; + *lp = l; return 0; } -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +/** + * 
cgroupstats_build - build and fill cgroupstats + * @stats: cgroupstats to fill information into + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. + */ +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *src_cgrp, *dst_cgrp; - struct task_struct *task; - const struct cred *saved_cred; - ssize_t ret; - - dst_cgrp = cgroup_kn_lock_live(of->kn, false); - if (!dst_cgrp) - return -ENODEV; + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup *cgrp; + struct css_task_iter it; + struct task_struct *tsk; - task = cgroup_procs_write_start(buf, true); - ret = PTR_ERR_OR_ZERO(task); - if (ret) - goto out_unlock; + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); + mutex_lock(&cgroup_mutex); /* - * Process and thread migrations follow same delegation rule. Check - * permissions using the credentials from file open to protect against - * inherited fd attacks. + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_online_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. 
*/ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - ctx->ns); - revert_creds(saved_cred); - if (ret) - goto out_finish; - - ret = cgroup_attach_task(dst_cgrp, task, true); + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); -out_finish: - cgroup_procs_write_finish(task); -out_unlock: - cgroup_kn_unlock(of->kn); + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + switch (tsk->state) { + case TASK_RUNNING: + stats->nr_running++; + break; + case TASK_INTERRUPTIBLE: + stats->nr_sleeping++; + break; + case TASK_UNINTERRUPTIBLE: + stats->nr_uninterruptible++; + break; + case TASK_STOPPED: + stats->nr_stopped++; + break; + default: + if (delayacct_is_task_waiting_on_io(tsk)) + stats->nr_io_wait++; + break; + } + } + css_task_iter_end(&it); - return ret ?: nbytes; + mutex_unlock(&cgroup_mutex); + return 0; } -static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) -{ - return __cgroup_procs_start(s, pos, 0); -} -static ssize_t cgroup_threads_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +/* + * seq_file methods for the tasks/procs files. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->l->list array. + */ + +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *src_cgrp, *dst_cgrp; - struct task_struct *task; - const struct cred *saved_cred; - ssize_t ret; + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). 
Use a binary-search to find the + * next pid to display, if any + */ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; + int index = 0, pid = *pos; + int *iter, ret; - buf = strstrip(buf); + mutex_lock(&cgrp->pidlist_mutex); - dst_cgrp = cgroup_kn_lock_live(of->kn, false); - if (!dst_cgrp) - return -ENODEV; + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); - task = cgroup_procs_write_start(buf, false); - ret = PTR_ERR_OR_ZERO(task); - if (ret) - goto out_unlock; + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); + if (pid) { + int end = l->length; + + while (index < end) { + int mid = (index + end) / 2; + if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { + index = mid; + break; + } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= l->length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = l->list + index; + *pos = cgroup_pid_fry(cgrp, *iter); + return iter; +} + +static void cgroup_pidlist_stop(struct seq_file *s, void *v) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + 
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); +} +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + pid_t *p = v; + pid_t *end = l->list + l->length; /* - * Process and thread migrations follow same delegation rule. Check - * permissions using the credentials from file open to protect against - * inherited fd attacks. + * Advance to the next pid in the array. If this goes off the + * end, we're done */ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - ctx->ns); - revert_creds(saved_cred); - if (ret) - goto out_finish; + p++; + if (p >= end) { + return NULL; + } else { + *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); + return p; + } +} - /* and must be contained in the same domain */ - ret = -EOPNOTSUPP; - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) - goto out_finish; +static int cgroup_pidlist_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", *(int *)v); - ret = cgroup_attach_task(dst_cgrp, task, false); + return 0; +} -out_finish: - cgroup_procs_write_finish(task); -out_unlock: - cgroup_kn_unlock(of->kn); +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return notify_on_release(css->cgroup); +} - return ret ?: nbytes; +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + return 0; +} + +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); +} + +static int cgroup_clone_children_write(struct 
cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + else + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + return 0; } /* cgroup core interface files for the default hierarchy */ -static struct cftype cgroup_base_files[] = { - { - .name = "cgroup.type", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cgroup_type_show, - .write = cgroup_type_write, - }, +static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", - .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), - .release = cgroup_procs_release, - .seq_start = cgroup_procs_start, - .seq_next = cgroup_procs_next, - .seq_show = cgroup_procs_show, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, }, - { - .name = "cgroup.threads", - .flags = CFTYPE_NS_DELEGATABLE, - .release = cgroup_procs_release, - .seq_start = cgroup_threads_start, - .seq_next = cgroup_procs_next, - .seq_show = cgroup_procs_show, - .write = cgroup_threads_write, - }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", - .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, @@ -4806,25 +5148,50 @@ static struct cftype cgroup_base_files[] = { .release = cgroup_pressure_release, }, #endif /* CONFIG_PSI */ + { } /* terminate */ +}; + +/* cgroup core interface files for the legacy hierarchies */ +static struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + }, { - .name = "cgroup.max.descendants", - .seq_show = 
cgroup_max_descendants_show, - .write = cgroup_max_descendants_write, + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, }, { - .name = "cgroup.max.depth", - .seq_show = cgroup_max_depth_show, - .write = cgroup_max_depth_write, + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, }, { - .name = "cgroup.stat", - .seq_show = cgroup_stat_show, + .name = "tasks", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, + .write = cgroup_tasks_write, }, { - .name = "cgroup.freeze", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cgroup_freeze_show, - .write = cgroup_freeze_write, + .name = "notify_on_release", + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_release_agent_show, + .write = cgroup_release_agent_write, + .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; @@ -4874,7 +5241,7 @@ static void css_free_work_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup1_pidlist_destroy_all(cgrp); + cgroup_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { @@ -4927,17 +5294,9 @@ static void css_release_work_fn(struct work_struct *work) if (ss->css_released) ss->css_released(css); } else { - struct cgroup *tcgrp; - /* cgroup release path */ trace_cgroup_release(cgrp); - spin_lock_irq(&css_set_lock); - for (tcgrp = cgroup_parent(cgrp); tcgrp; - tcgrp = cgroup_parent(tcgrp)) - tcgrp->nr_dying_descendants--; - spin_unlock_irq(&css_set_lock); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5024,6 +5383,9 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & 
CSS_ONLINE)) return; + if (ss->css_reset) + ss->css_reset(css); + if (ss->css_offline) ss->css_offline(css); @@ -5136,40 +5498,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_idr_free; - /* - * New cgroup inherits effective freeze counter, and - * if the parent has to be frozen, the child has too. - */ - cgrp->freezer.e_freeze = parent->freezer.e_freeze; - if (cgrp->freezer.e_freeze) { - /* - * Set the CGRP_FREEZE flag, so when a process will be - * attached to the child cgroup, it will become frozen. - * At this point the new cgroup is unpopulated, so we can - * consider it frozen immediately. - */ - set_bit(CGRP_FREEZE, &cgrp->flags); - set_bit(CGRP_FROZEN, &cgrp->flags); - } - - spin_lock_irq(&css_set_lock); - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - if (tcgrp != cgrp) { - tcgrp->nr_descendants++; - - /* - * If the new cgroup is frozen, all ancestor cgroups - * get a new frozen descendant, but their state can't - * change because of this. 
- */ - if (cgrp->freezer.e_freeze) - tcgrp->freezer.nr_frozen_descendants++; - } - } - spin_unlock_irq(&css_set_lock); - if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5215,30 +5546,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return ERR_PTR(ret); } -static bool cgroup_check_hierarchy_limits(struct cgroup *parent) -{ - struct cgroup *cgroup; - int ret = false; - int level = 1; - - lockdep_assert_held(&cgroup_mutex); - - for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { - if (cgroup->nr_descendants >= cgroup->max_descendants) - goto fail; - - if (level > cgroup->max_depth) - goto fail; - - level++; - } - - ret = true; -fail: - return ret; -} - -int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { struct cgroup *parent, *cgrp; struct kernfs_node *kn; @@ -5252,11 +5561,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) if (!parent) return -ENODEV; - if (!cgroup_check_hierarchy_limits(parent)) { - ret = -EAGAIN; - goto out_unlock; - } - cgrp = cgroup_create(parent); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); @@ -5408,7 +5712,6 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; @@ -5447,27 +5750,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_css(css, ssid, cgrp) kill_css(css); - /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ - css_clear_dir(&cgrp->self); + /* + * Remove @cgrp directory along with the base files. @cgrp has an + * extra ref on its kn. 
+ */ kernfs_remove(cgrp->kn); - if (parent && cgroup_is_threaded(cgrp)) - parent->nr_threaded_children--; - - spin_lock_irq(&css_set_lock); - for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { - tcgrp->nr_descendants--; - tcgrp->nr_dying_descendants++; - /* - * If the dying cgroup is frozen, decrease frozen descendants - * counters of ancestor cgroups. - */ - if (test_bit(CGRP_FROZEN, &cgrp->flags)) - tcgrp->freezer.nr_frozen_descendants--; - } - spin_unlock_irq(&css_set_lock); - - cgroup1_check_for_release(parent); + check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -5475,7 +5764,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; -int cgroup_rmdir(struct kernfs_node *kn) +static int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; @@ -5494,10 +5783,11 @@ int cgroup_rmdir(struct kernfs_node *kn) } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { - .show_options = cgroup_show_options, .remount_fs = cgroup_remount, + .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, + .rename = cgroup_rename, .show_path = cgroup_show_path, }; @@ -5541,7 +5831,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; - have_release_callback |= (bool)ss->release << ss->id; + have_free_callback |= (bool)ss->free << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5603,8 +5893,8 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); /* 
* The latency of the synchronize_sched() is too high for cgroups, @@ -5650,23 +5940,17 @@ int __init cgroup_init(void) if (!cgroup_ssid_enabled(ssid)) continue; - if (cgroup1_ssid_disabled(ssid)) + if (cgroup_ssid_no_v1(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", ss->name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; - /* implicit controllers must be threaded too */ - WARN_ON(ss->implicit_on_dfl && !ss->threaded); - if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; - if (ss->threaded) - cgrp_dfl_threaded_ss_mask |= 1 << ss->id; - if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { @@ -5707,6 +5991,15 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); + + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; } core_initcall(cgroup_wq_init); @@ -5789,6 +6082,42 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, return retval; } +/* Display information about each subsystem and each hierarchy */ +static int proc_cgroupstats_show(struct seq_file *m, void *v) +{ + struct cgroup_subsys *ss; + int i; + + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. 
+ */ + mutex_lock(&cgroup_mutex); + + for_each_subsys(ss, i) + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->legacy_name, ss->root->hierarchy_id, + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroupstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cgroupstats_show, NULL); +} + +static const struct file_operations proc_cgroupstats_operations = { + .open = cgroupstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5895,29 +6224,8 @@ void cgroup_post_fork(struct task_struct *child) cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); - cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } - - /* - * If the cgroup has to be frozen, the new task has too. - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get - * the task into the frozen state. - */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); - - /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later - * from do_freezer_trap(). So we avoid cgroup's - * transient switch from the frozen state and back. 
- */ - } - spin_unlock_irq(&css_set_lock); } @@ -5965,13 +6273,6 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); - cset->nr_tasks--; - - if (unlikely(cgroup_task_frozen(tsk))) - cgroup_freezer_frozen_exit(tsk); - else if (unlikely(cgroup_task_freeze(tsk))) - cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); @@ -5983,27 +6284,87 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_release(struct task_struct *task) +void cgroup_free(struct task_struct *task) { + struct css_set *cset = task_css_set(task); struct cgroup_subsys *ss; int ssid; - do_each_subsys_mask(ss, ssid, have_release_callback) { - ss->release(task); + do_each_subsys_mask(ss, ssid, have_free_callback) { + ss->free(task); } while_each_subsys_mask(); - if (use_task_css_set_links) { - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); - } + put_css_set(cset); } -void cgroup_free(struct task_struct *task) +static void check_for_release(struct cgroup *cgrp) { - struct css_set *cset = task_css_set(task); - put_css_set(cset); + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); +} + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. + */ +static void cgroup_release_agent(struct work_struct *work) +{ + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL; + char *argv[3], *envp[3]; + int ret; + + mutex_lock(&cgroup_mutex); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + spin_lock_irq(&css_set_lock); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_irq(&css_set_lock); + if (ret < 0 || ret >= PATH_MAX) + goto out; + + argv[0] = agentbuf; + argv[1] = pathbuf; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); } static int __init cgroup_disable(char *str) @@ -6039,6 +6400,33 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + 
while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -6068,7 +6456,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ - cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + cgrp = rcu_dereference(kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); @@ -6238,6 +6626,154 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ +/* cgroup namespaces */ + +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) 
+{ + struct cgroup_namespace *new_ns; + struct ucounts *ucounts; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + /* It is not safe to take cgroup_mutex here */ + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + dec_cgroup_namespaces(ucounts); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? 
&ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, + .owner = cgroupns_owner, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags) @@ -6261,69 +6797,148 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, } #endif /* CONFIG_CGROUP_BPF */ -#ifdef CONFIG_SYSFS -static ssize_t show_delegatable_files(struct cftype *files, char *buf, - ssize_t size, const char *prefix) +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) { - struct cftype *cft; - ssize_t ret = 0; + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - for (cft = files; cft && cft->name[0] != '\0'; cft++) { - if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) - continue; + if (!css) + return ERR_PTR(-ENOMEM); - if (prefix) - ret += snprintf(buf + ret, size - ret, "%s.", prefix); + return css; +} - ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} - if (WARN_ON(ret >= size)) - break; - } +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} - return ret; +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; } -static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) +static u64 
current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct cgroup_subsys *ss; - int ssid; - ssize_t ret = 0; + u64 count; - ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, - NULL); + rcu_read_lock(); + count = atomic_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} - for_each_subsys(ss, ssid) - ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, - PAGE_SIZE - ret, - cgroup_subsys_name[ssid]); +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; - return ret; + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; } -static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); -static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); -} -static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; -static struct attribute *cgroup_sysfs_attrs[] = { - &cgroup_delegate_attr.attr, - &cgroup_features_attr.attr, - NULL, -}; + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; -static const struct attribute_group cgroup_sysfs_attr_group = { - .attrs = 
cgroup_sysfs_attrs, - .name = "cgroup", -}; + seq_printf(seq, "css_set %pK\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} -static int __init cgroup_sysfs_init(void) +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); } -subsys_initcall(cgroup_sysfs_init); -#endif /* CONFIG_SYSFS */ + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile deleted file mode 100644 index 0a3e87cc648d..000000000000 --- a/kernel/cgroup/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -obj-y := cgroup.o namespace.o cgroup-v1.o freezer.o - -obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o -obj-$(CONFIG_CGROUP_PIDS) += pids.o -obj-$(CONFIG_CGROUP_RDMA) += rdma.o -obj-$(CONFIG_CPUSETS) += 
cpuset.o -obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h deleted file mode 100644 index 90104f82593d..000000000000 --- a/kernel/cgroup/cgroup-internal.h +++ /dev/null @@ -1,242 +0,0 @@ -#ifndef __CGROUP_INTERNAL_H -#define __CGROUP_INTERNAL_H - -#include -#include -#include -#include -#include - -struct cgroup_pidlist; - -struct cgroup_file_ctx { - struct cgroup_namespace *ns; - - struct { - void *trigger; - } psi; - - struct { - bool started; - struct css_task_iter iter; - } procs; - - struct { - struct cgroup_pidlist *pidlist; - } procs1; -}; - -/* - * A cgroup can be associated with multiple css_sets as different tasks may - * belong to different cgroups on different hierarchies. In the other - * direction, a css_set is naturally associated with multiple cgroups. - * This M:N relationship is represented by the following link structure - * which exists for each association and allows traversing the associations - * from both sides. - */ -struct cgrp_cset_link { - /* the cgroup and css_set this link associates */ - struct cgroup *cgrp; - struct css_set *cset; - - /* list of cgrp_cset_links anchored at cgrp->cset_links */ - struct list_head cset_link; - - /* list of cgrp_cset_links anchored at css_set->cgrp_links */ - struct list_head cgrp_link; -}; - -/* used to track tasks and csets during migration */ -struct cgroup_taskset { - /* the src and dst cset list running through cset->mg_node */ - struct list_head src_csets; - struct list_head dst_csets; - - /* the number of tasks in the set */ - int nr_tasks; - - /* the subsys currently being processed */ - int ssid; - - /* - * Fields for cgroup_taskset_*() iteration. - * - * Before migration is committed, the target migration tasks are on - * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of - * the csets on ->dst_csets. ->csets point to either ->src_csets - * or ->dst_csets depending on whether migration is committed. 
- * - * ->cur_csets and ->cur_task point to the current task position - * during iteration. - */ - struct list_head *csets; - struct css_set *cur_cset; - struct task_struct *cur_task; -}; - -/* migration context also tracks preloading */ -struct cgroup_mgctx { - /* - * Preloaded source and destination csets. Used to guarantee - * atomic success or failure on actual migration. - */ - struct list_head preloaded_src_csets; - struct list_head preloaded_dst_csets; - - /* tasks and csets to migrate */ - struct cgroup_taskset tset; - - /* subsystems affected by migration */ - u16 ss_mask; -}; - -#define CGROUP_TASKSET_INIT(tset) \ -{ \ - .src_csets = LIST_HEAD_INIT(tset.src_csets), \ - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ - .csets = &tset.src_csets, \ -} - -#define CGROUP_MGCTX_INIT(name) \ -{ \ - LIST_HEAD_INIT(name.preloaded_src_csets), \ - LIST_HEAD_INIT(name.preloaded_dst_csets), \ - CGROUP_TASKSET_INIT(name.tset), \ -} - -#define DEFINE_CGROUP_MGCTX(name) \ - struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) - -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - -extern struct mutex cgroup_mutex; -extern spinlock_t css_set_lock; -extern struct cgroup_subsys *cgroup_subsys[]; -extern struct list_head cgroup_roots; -extern struct file_system_type cgroup_fs_type; - -/* iterate across the hierarchies */ -#define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) - -/** - * for_each_subsys - iterate all enabled cgroup subsystems - * @ss: the iteration cursor - * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - */ -#define for_each_subsys(ss, ssid) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) - -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - 
-static inline bool notify_on_release(const struct cgroup *cgrp) -{ - return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -} - -void put_css_set_locked(struct css_set *cset); - -static inline void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (refcount_dec_not_one(&cset->refcount)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - refcount_inc(&cset->refcount); -} - -bool cgroup_ssid_enabled(int ssid); -bool cgroup_on_dfl(const struct cgroup *cgrp); -bool cgroup_is_thread_root(struct cgroup *cgrp); -bool cgroup_is_threaded(struct cgroup *cgrp); - -struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); -struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroup_root *root); -struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); -void cgroup_kn_unlock(struct kernfs_node *kn); -int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns); - -void cgroup_free_root(struct cgroup_root *root); -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns); - -int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); -void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); -void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, - struct cgroup_mgctx *mgctx); -int 
cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); -int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx); - -int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, - bool threadgroup); -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) - __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task) - __releases(&cgroup_threadgroup_rwsem); - -void cgroup_lock_and_drain_offline(struct cgroup *cgrp); - -int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); -int cgroup_rmdir(struct kernfs_node *kn); -int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root); - -int __cgroup_task_count(const struct cgroup *cgrp); -int cgroup_task_count(const struct cgroup *cgrp); - -/* - * namespace.c - */ -extern const struct proc_ns_operations cgroupns_operations; - -/* - * cgroup-v1.c - */ -extern struct cftype cgroup1_base_files[]; -extern const struct file_operations proc_cgroupstats_operations; -extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; - -bool cgroup1_ssid_disabled(int ssid); -void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); -void cgroup1_release_agent(struct work_struct *work); -void cgroup1_check_for_release(struct cgroup *cgrp); -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns); - -#endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c deleted file mode 100644 index fc576131fdf2..000000000000 --- a/kernel/cgroup/cgroup-v1.c +++ /dev/null @@ -1,1314 +0,0 @@ -#include "cgroup-internal.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * pidlists linger the following amount before being destroyed. 
The goal - * is avoiding frequent destruction in the middle of consecutive read calls - * Expiring in the middle is a performance problem not a correctness one. - * 1 sec should be enough. - */ -#define CGROUP_PIDLIST_DESTROY_DELAY HZ - -/* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; - -/* disable named v1 mounts */ -static bool cgroup_no_v1_named; - -/* - * pidlist destructions need to be flushed on cgroup destruction. Use a - * separate workqueue as flush domain. - */ -static struct workqueue_struct *cgroup_pidlist_destroy_wq; - -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ -static DEFINE_SPINLOCK(release_agent_path_lock); - -bool cgroup1_ssid_disabled(int ssid) -{ - return cgroup_no_v1_mask & (1 << ssid); -} - -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroup_root *root; - int retval = 0; - - mutex_lock(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - for_each_root(root) { - struct cgroup *from_cgrp; - - if (root == &cgrp_dfl_root) - continue; - - spin_lock_irq(&css_set_lock); - from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_irq(&css_set_lock); - - retval = cgroup_attach_task(from_cgrp, tsk, false); - if (retval) - break; - } - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -/** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another - * @to: cgroup to which the tasks will be moved - * @from: cgroup in which the tasks currently reside - * - * Locking rules between cgroup_post_fork() and the migration path - * guarantee that, if a task is forking while being migrated, 
the new child - * is guaranteed to be either visible in the source cgroup after the - * parent's migration is complete or put into the target cgroup. No task - * can slip out of migration through forking. - */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) -{ - DEFINE_CGROUP_MGCTX(mgctx); - struct cgrp_cset_link *link; - struct css_task_iter it; - struct task_struct *task; - int ret; - - if (cgroup_on_dfl(to)) - return -EINVAL; - - ret = cgroup_migrate_vet_dst(to); - if (ret) - return ret; - - mutex_lock(&cgroup_mutex); - - percpu_down_write(&cgroup_threadgroup_rwsem); - - /* all tasks in @from are being moved, all csets are source */ - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &from->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, to, &mgctx); - spin_unlock_irq(&css_set_lock); - - ret = cgroup_migrate_prepare_dst(&mgctx); - if (ret) - goto out_err; - - /* - * Migrate tasks one-by-one until @from is empty. This fails iff - * ->can_attach() fails. - */ - do { - css_task_iter_start(&from->self, 0, &it); - - do { - task = css_task_iter_next(&it); - } while (task && (task->flags & PF_EXITING)); - - if (task) - get_task_struct(task); - css_task_iter_end(&it); - - if (task) { - ret = cgroup_migrate(task, false, &mgctx); - if (!ret) - trace_cgroup_transfer_tasks(to, task, false); - put_task_struct(task); - } - } while (task && !ret); -out_err: - cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - return ret; -} - -/* - * Stuff for reading the 'tasks'/'procs' files. - * - * Reading this file can return large amounts of data if a cgroup has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - */ - -/* which pidlist file are we talking about? 
*/ -enum cgroup_filetype { - CGROUP_FILE_PROCS, - CGROUP_FILE_TASKS, -}; - -/* - * A pidlist is a list of pids that virtually represents the contents of one - * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, - * a pair (one each for procs, tasks) for each pid namespace that's relevant - * to the cgroup. - */ -struct cgroup_pidlist { - /* - * used to find which pidlist is wanted. doesn't change as long as - * this particular list stays in the list. - */ - struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; - /* array of xids */ - pid_t *list; - /* how many elements the above list has */ - int length; - /* each of these stored in a list by its cgroup */ - struct list_head links; - /* pointer to the cgroup we belong to, for list removal purposes */ - struct cgroup *owner; - /* for delayed destruction */ - struct delayed_work destroy_dwork; -}; - -/* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. - * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); - else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); -} - -static void pidlist_free(void *p) -{ - kvfree(p); -} - -/* - * Used to destroy all pidlists lingering waiting for destroy timer. None - * should be left afterwards. 
- */ -void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) -{ - struct cgroup_pidlist *l, *tmp_l; - - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); - mutex_unlock(&cgrp->pidlist_mutex); - - flush_workqueue(cgroup_pidlist_destroy_wq); - BUG_ON(!list_empty(&cgrp->pidlists)); -} - -static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, - destroy_dwork); - struct cgroup_pidlist *tofree = NULL; - - mutex_lock(&l->owner->pidlist_mutex); - - /* - * Destroy iff we didn't get queued again. The state won't change - * as destroy_dwork can only be queued while locked. - */ - if (!delayed_work_pending(dwork)) { - list_del(&l->links); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - tofree = l; - } - - mutex_unlock(&l->owner->pidlist_mutex); - kfree(tofree); -} - -/* - * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * Returns the number of unique elements. - */ -static int pidlist_uniq(pid_t *list, int length) -{ - int src, dest = 1; - - /* - * we presume the 0th element is unique, so i starts at 1. trivial - * edge cases first; no work needs to be done for either - */ - if (length == 0 || length == 1) - return length; - /* src and dest walk down the list; dest counts unique elements */ - for (src = 1; src < length; src++) { - /* find next unique element */ - while (list[src] == list[src-1]) { - src++; - if (src == length) - goto after; - } - /* dest always points to where the next unique element goes */ - list[dest] = list[src]; - dest++; - } -after: - return dest; -} - -/* - * The two pid files - task and cgroup.procs - guaranteed that the result - * is sorted, which forced this whole pidlist fiasco. 
As pid order is - * different per namespace, each namespace needs differently sorted list, - * making it impossible to use, for example, single rbtree of member tasks - * sorted by task pointer. As pidlists can be fairly large, allocating one - * per open file is dangerous, so cgroup had to implement shared pool of - * pidlists keyed by cgroup and namespace. - */ -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - - lockdep_assert_held(&cgrp->pidlist_mutex); - - list_for_each_entry(l, &cgrp->pidlists, links) - if (l->key.type == type && l->key.ns == ns) - return l; - return NULL; -} - -/* - * find the appropriate pidlist for our purpose (given procs vs tasks) - * returns with the lock on that pidlist already held, and takes care - * of the use count, or returns NULL with no locks held if we're out of - * memory. 
- */ -static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - l = cgroup_pidlist_find(cgrp, type); - if (l) - return l; - - /* entry not found; create a new one */ - l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) - return l; - - INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); - l->key.type = type; - /* don't need task_nsproxy() if we're looking at ourself */ - l->key.ns = get_pid_ns(task_active_pid_ns(current)); - l->owner = cgrp; - list_add(&l->links, &cgrp->pidlists); - return l; -} - -/* - * Load a cgroup's pidarray with either procs' tgids or tasks' pids - */ -static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, - struct cgroup_pidlist **lp) -{ - pid_t *array; - int length; - int pid, n = 0; /* used for populating the array */ - struct css_task_iter it; - struct task_struct *tsk; - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. 
- */ - length = cgroup_task_count(cgrp); - array = pidlist_allocate(length); - if (!array) - return -ENOMEM; - /* now, populate the array */ - css_task_iter_start(&cgrp->self, 0, &it); - while ((tsk = css_task_iter_next(&it))) { - if (unlikely(n == length)) - break; - /* get tgid or pid for procs or tasks file respectively */ - if (type == CGROUP_FILE_PROCS) - pid = task_tgid_vnr(tsk); - else - pid = task_pid_vnr(tsk); - if (pid > 0) /* make sure to only use valid results */ - array[n++] = pid; - } - css_task_iter_end(&it); - length = n; - /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(array, length); - - l = cgroup_pidlist_find_create(cgrp, type); - if (!l) { - pidlist_free(array); - return -ENOMEM; - } - - /* store array, freeing old if necessary */ - pidlist_free(l->list); - l->list = array; - l->length = length; - *lp = l; - return 0; -} - -/* - * seq_file methods for the tasks/procs files. The seq_file position is the - * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->l->list array. - */ - -static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) -{ - /* - * Initially we receive a position value that corresponds to - * one more than the last pid shown (or 0 on the first call or - * after a seek to the start). Use a binary-search to find the - * next pid to display, if any - */ - struct kernfs_open_file *of = s->private; - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct cgroup_pidlist *l; - enum cgroup_filetype type = seq_cft(s)->private; - int index = 0, pid = *pos; - int *iter, ret; - - mutex_lock(&cgrp->pidlist_mutex); - - /* - * !NULL @ctx->procs1.pidlist indicates that this isn't the first - * start() after open. If the matching pidlist is around, we can use - * that. Look for it. Note that @ctx->procs1.pidlist can't be used - * directly. 
It could already have been destroyed. - */ - if (ctx->procs1.pidlist) - ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); - - /* - * Either this is the first start() after open or the matching - * pidlist has been destroyed inbetween. Create a new one. - */ - if (!ctx->procs1.pidlist) { - ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); - if (ret) - return ERR_PTR(ret); - } - l = ctx->procs1.pidlist; - - if (pid) { - int end = l->length; - - while (index < end) { - int mid = (index + end) / 2; - if (l->list[mid] == pid) { - index = mid; - break; - } else if (l->list[mid] <= pid) - index = mid + 1; - else - end = mid; - } - } - /* If we're off the end of the array, we're done */ - if (index >= l->length) - return NULL; - /* Update the abstract position to be the actual pid that we found */ - iter = l->list + index; - *pos = *iter; - return iter; -} - -static void cgroup_pidlist_stop(struct seq_file *s, void *v) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup_pidlist *l = ctx->procs1.pidlist; - - if (l) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, - CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); -} - -static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup_pidlist *l = ctx->procs1.pidlist; - pid_t *p = v; - pid_t *end = l->list + l->length; - /* - * Advance to the next pid in the array. 
If this goes off the - * end, we're done - */ - p++; - if (p >= end) { - (*pos)++; - return NULL; - } else { - *pos = *p; - return p; - } -} - -static int cgroup_pidlist_show(struct seq_file *s, void *v) -{ - seq_printf(s, "%d\n", *(int *)v); - - return 0; -} - -static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, - bool threadgroup) -{ - struct cgroup *cgrp; - struct task_struct *task; - const struct cred *cred, *tcred; - ssize_t ret; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - - task = cgroup_procs_write_start(buf, threadgroup); - ret = PTR_ERR_OR_ZERO(task); - if (ret) - goto out_unlock; - - /* - * Even if we're attaching all tasks in the thread group, we only need - * to check permissions on one of them. Check permissions using the - * credentials from file open to protect against inherited fd attacks. - */ - cred = of->file->f_cred; - tcred = get_task_cred(task); - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid) && - !ns_capable(tcred->user_ns, CAP_SYS_NICE)) - ret = -EACCES; - put_cred(tcred); - if (ret) - goto out_finish; - - ret = cgroup_attach_task(cgrp, task, threadgroup); - -out_finish: - cgroup_procs_write_finish(task); -out_unlock: - cgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup1_procs_write(of, buf, nbytes, off, true); -} - -static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup1_procs_write(of, buf, nbytes, off, false); -} - -static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - - /* - * Release agent gets called with all capabilities, - * 
require capabilities to set release agent. - */ - if ((of->file->f_cred->user_ns != &init_user_ns) || - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, strstrip(buf), - sizeof(cgrp->root->release_agent_path)); - spin_unlock(&release_agent_path_lock); - cgroup_kn_unlock(of->kn); - return nbytes; -} - -static int cgroup_release_agent_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - spin_lock(&release_agent_path_lock); - seq_puts(seq, cgrp->root->release_agent_path); - spin_unlock(&release_agent_path_lock); - seq_putc(seq, '\n'); - return 0; -} - -static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) -{ - seq_puts(seq, "0\n"); - return 0; -} - -static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return notify_on_release(css->cgroup); -} - -static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - return 0; -} - -static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); -} - -static int cgroup_clone_children_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - return 0; -} - -/* cgroup core interface files for the legacy hierarchies */ -struct cftype cgroup1_base_files[] = { - { - .name = "cgroup.procs", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = 
CGROUP_FILE_PROCS, - .write = cgroup1_procs_write, - }, - { - .name = "cgroup.clone_children", - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { - .name = "tasks", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_TASKS, - .write = cgroup1_tasks_write, - }, - { - .name = "notify_on_release", - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_release_agent_show, - .write = cgroup_release_agent_write, - .max_write_len = PATH_MAX - 1, - }, - { } /* terminate */ -}; - -/* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) -{ - struct cgroup_subsys *ss; - int i; - - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); - /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. 
- */ - mutex_lock(&cgroup_mutex); - - for_each_subsys(ss, i) - seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), - cgroup_ssid_enabled(i)); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/** - * cgroupstats_build - build and fill cgroupstats - * @stats: cgroupstats to fill information into - * @dentry: A dentry entry belonging to the cgroup for which stats have - * been requested. - * - * Build and fill cgroupstats so that taskstats can export it to user - * space. - */ -int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) -{ - struct kernfs_node *kn = kernfs_node_from_dentry(dentry); - struct cgroup *cgrp; - struct css_task_iter it; - struct task_struct *tsk; - - /* it should be kernfs_node belonging to cgroupfs and is a directory */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) - return -EINVAL; - - mutex_lock(&cgroup_mutex); - - /* - * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_online_from_dir(), - * @kn->priv is RCU safe. Let's do the RCU dancing. 
- */ - rcu_read_lock(); - cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { - rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); - return -ENOENT; - } - rcu_read_unlock(); - - css_task_iter_start(&cgrp->self, 0, &it); - while ((tsk = css_task_iter_next(&it))) { - switch (tsk->state) { - case TASK_RUNNING: - stats->nr_running++; - break; - case TASK_INTERRUPTIBLE: - stats->nr_sleeping++; - break; - case TASK_UNINTERRUPTIBLE: - stats->nr_uninterruptible++; - break; - case TASK_STOPPED: - stats->nr_stopped++; - break; - default: - if (delayacct_is_task_waiting_on_io(tsk)) - stats->nr_io_wait++; - break; - } - } - css_task_iter_end(&it); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -void cgroup1_check_for_release(struct cgroup *cgrp) -{ - if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && - !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); -} - -/* - * Notify userspace when a cgroup is released, by running the - * configured release agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. - * - * Most likely, this user command will try to rmdir this cgroup. - * - * This races with the possibility that some other task will be - * attached to this cgroup before it is removed, or that some other - * user task will 'mkdir' a child cgroup of this cgroup. That's ok. - * The presumed 'rmdir' will fail quietly if this cgroup is no longer - * unused, and this cgroup will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which - * means only wait until the task is successfully execve()'d. 
The - * separate release agent task is forked by call_usermodehelper(), - * then control in this thread returns here, without waiting for the - * release agent task. We don't bother to wait because the caller of - * this routine has no use for the exit status of the release agent - * task, so no sense holding our caller up for that. - */ -void cgroup1_release_agent(struct work_struct *work) -{ - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; - - mutex_lock(&cgroup_mutex); - - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf || !strlen(agentbuf)) - goto out; - - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); - if (ret < 0 || ret >= PATH_MAX) - goto out; - - argv[0] = agentbuf; - argv[1] = pathbuf; - argv[2] = NULL; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(agentbuf); - kfree(pathbuf); -} - -/* - * cgroup_rename - Only allow simple rename of directories in place. - */ -static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) -{ - struct cgroup *cgrp = kn->priv; - int ret; - - /* do not accept '\n' to prevent making /proc//cgroup unparsable */ - if (strchr(new_name_str, '\n')) - return -EINVAL; - - if (kernfs_type(kn) != KERNFS_DIR) - return -ENOTDIR; - if (kn->parent != new_parent) - return -EIO; - - /* - * We're gonna grab cgroup_mutex which nests outside kernfs - * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_mutex. 
- */ - kernfs_break_active_protection(new_parent); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_mutex); - - ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) - trace_cgroup_rename(cgrp); - - mutex_unlock(&cgroup_mutex); - - kernfs_unbreak_active_protection(kn); - kernfs_unbreak_active_protection(new_parent); - return ret; -} - -static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) -{ - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_subsys *ss; - int ssid; - - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); - if (root->flags & CGRP_ROOT_NOPREFIX) - seq_puts(seq, ",noprefix"); - if (root->flags & CGRP_ROOT_XATTR) - seq_puts(seq, ",xattr"); - if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) - seq_puts(seq, ",cpuset_v2_mode"); - - spin_lock(&release_agent_path_lock); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); - spin_unlock(&release_agent_path_lock); - - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) - seq_puts(seq, ",clone_children"); - if (strlen(root->name)) - seq_show_option(seq, "name", root->name); - return 0; -} - -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); - - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; - - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= 
CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "cpuset_v2_mode")) { - opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - - /* blocked by boot param? */ - if (cgroup_no_v1_named) - return -ENOENT; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; - - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) - continue; - if (!cgroup_ssid_enabled(i)) - continue; - if (cgroup1_ssid_disabled(i)) - continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by 
name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - - /* - * Option noprefix was introduced just for backward compatibility - * with the old cpuset, so we allow noprefix only if mounting just - * the cpuset subsystem. - */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; - - return 0; -} - -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) -{ - int ret = 0; - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup_sb_opts opts; - u16 added_mask, removed_mask; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); - /* See cgroup1_mount release_agent handling */ - if (opts.release_agent && - ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { - ret = -EINVAL; - goto out_unlock; - } - - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; - - /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); - ret = -EINVAL; - goto out_unlock; - } - - /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.self.children)) { - ret = -EBUSY; - goto out_unlock; - } - - ret = rebind_subsystems(root, added_mask); - if (ret) - goto out_unlock; - 
- WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - - if (opts.release_agent) { - spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); - spin_unlock(&release_agent_path_lock); - } - - trace_cgroup_remount(root); - - out_unlock: - kfree(opts.release_agent); - kfree(opts.name); - mutex_unlock(&cgroup_mutex); - return ret; -} - -struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { - .rename = cgroup1_rename, - .show_options = cgroup1_show_options, - .remount_fs = cgroup1_remount, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .show_path = cgroup_show_path, -}; - -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns) -{ - struct cgroup_sb_opts opts; - struct cgroup_root *root = NULL; - struct cgroup_subsys *ss; - struct dentry *dentry; - int i, ret; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* - * Destruction of cgroup root is asynchronous, so subsystems may - * still be dying after the previous unmount. Let's drain the - * dying subsystems. We just need to ensure that the ones - * unmounted previously finish dying and don't care about new ones - * starting. Testing ref liveliness is good enough. - */ - for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || - ss->root == &cgrp_dfl_root) - continue; - - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - cgroup_put(&ss->root->cgrp); - } - - for_each_root(root) { - bool name_match = false; - - if (root == &cgrp_dfl_root) - continue; - - /* - * If we asked for a name then it must match. Also, if - * name matches but sybsys_mask doesn't, we should fail. - * Remember whether name matched. 
- */ - if (opts.name) { - if (strcmp(opts.name, root->name)) - continue; - name_match = true; - } - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match. - */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { - if (!name_match) - continue; - ret = -EBUSY; - goto out_unlock; - } - - if (root->flags ^ opts.flags) - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - - ret = 0; - goto out_unlock; - } - - /* - * No such thing, create a new one. name= matching without subsys - * specification is allowed for already existing hierarchies but we - * can't create new one without subsys specification. - */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } - - /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } - /* - * Release agent gets called with all capabilities, - * require capabilities to set release agent. 
- */ - if (opts.release_agent && - ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { - ret = -EINVAL; - goto out_unlock; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - - init_cgroup_root(root, &opts); - - ret = cgroup_setup_root(root, opts.subsys_mask); - if (ret) - cgroup_free_root(root); - -out_unlock: - if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); - - if (ret) - return ERR_PTR(ret); - - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); - - if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - struct super_block *sb = dentry->d_sb; - dput(dentry); - deactivate_locked_super(sb); - msleep(10); - dentry = ERR_PTR(restart_syscall()); - } - return dentry; -} - -static int __init cgroup1_wq_init(void) -{ - /* - * Used to destroy pidlists and separate to serve as flush domain. - * Cap @max_active to 1 too. 
- */ - cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); - BUG_ON(!cgroup_pidlist_destroy_wq); - return 0; -} -core_initcall(cgroup1_wq_init); - -static int __init cgroup_no_v1(char *str) -{ - struct cgroup_subsys *ss; - char *token; - int i; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - - if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; - continue; - } - - if (!strcmp(token, "named")) { - cgroup_no_v1_named = true; - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->name) && - strcmp(token, ss->legacy_name)) - continue; - - cgroup_no_v1_mask |= 1 << i; - } - } - return 1; -} -__setup("cgroup_no_v1=", cgroup_no_v1); diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c deleted file mode 100644 index f661b4cc5efd..000000000000 --- a/kernel/cgroup/debug.c +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Debug controller - * - * WARNING: This controller is for cgroup core debugging only. - * Its interfaces are unstable and subject to changes at any time. - */ -#include -#include -#include - -#include "cgroup-internal.h" - -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -/* - * debug_taskcount_read - return the number of tasks in a cgroup. 
- * @cgrp: the cgroup in question - */ -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static int current_css_set_read(struct seq_file *seq, void *v) -{ - struct kernfs_open_file *of = seq->private; - struct css_set *cset; - struct cgroup_subsys *ss; - struct cgroup_subsys_state *css; - int i, refcnt; - - if (!cgroup_kn_lock_live(of->kn, false)) - return -ENODEV; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - refcnt = refcount_read(&cset->refcount); - seq_printf(seq, "css_set %pK %d", cset, refcnt); - if (refcnt > cset->nr_tasks) - seq_printf(seq, " +%d", refcnt - cset->nr_tasks); - seq_puts(seq, "\n"); - - /* - * Print the css'es stored in the current css_set. - */ - for_each_subsys(ss, i) { - css = cset->subsys[ss->id]; - if (!css) - continue; - seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, - (unsigned long)css, css->id); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - cgroup_kn_unlock(of->kn); - return 0; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = refcount_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define 
MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - int dead_cnt = 0, extra_refs = 0, threaded_csets = 0; - - spin_lock_irq(&css_set_lock); - - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - int refcnt = refcount_read(&cset->refcount); - - /* - * Print out the proc_cset and threaded_cset relationship - * and highlight difference between refcount and task_count. - */ - seq_printf(seq, "css_set %pK", cset); - if (rcu_dereference_protected(cset->dom_cset, 1) != cset) { - threaded_csets++; - seq_printf(seq, "=>%pK", cset->dom_cset); - } - if (!list_empty(&cset->threaded_csets)) { - struct css_set *tcset; - int idx = 0; - - list_for_each_entry(tcset, &cset->threaded_csets, - threaded_csets_node) { - seq_puts(seq, idx ? "," : "<="); - seq_printf(seq, "%pK", tcset); - idx++; - } - } else { - seq_printf(seq, " %d", refcnt); - if (refcnt - cset->nr_tasks > 0) { - int extra = refcnt - cset->nr_tasks; - - seq_printf(seq, " +%d", extra); - /* - * Take out the one additional reference in - * init_css_set. - */ - if (cset == &init_css_set) - extra--; - extra_refs += extra; - } - } - seq_puts(seq, "\n"); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ <= MAX_TASKS_SHOWN_PER_CSS) - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ <= MAX_TASKS_SHOWN_PER_CSS) - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } - /* show # of overflowed tasks */ - if (count > MAX_TASKS_SHOWN_PER_CSS) - seq_printf(seq, " ... 
(%d)\n", - count - MAX_TASKS_SHOWN_PER_CSS); - - if (cset->dead) { - seq_puts(seq, " [dead]\n"); - dead_cnt++; - } - - WARN_ON(count != cset->nr_tasks); - } - spin_unlock_irq(&css_set_lock); - - if (!dead_cnt && !extra_refs && !threaded_csets) - return 0; - - seq_puts(seq, "\n"); - if (threaded_csets) - seq_printf(seq, "threaded css_sets = %d\n", threaded_csets); - if (extra_refs) - seq_printf(seq, "extra references = %d\n", extra_refs); - if (dead_cnt) - seq_printf(seq, "dead css_sets = %d\n", dead_cnt); - - return 0; -} - -static int cgroup_subsys_states_read(struct seq_file *seq, void *v) -{ - struct kernfs_open_file *of = seq->private; - struct cgroup *cgrp; - struct cgroup_subsys *ss; - struct cgroup_subsys_state *css; - char pbuf[16]; - int i; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - - for_each_subsys(ss, i) { - css = rcu_dereference_check(cgrp->subsys[ss->id], true); - if (!css) - continue; - - pbuf[0] = '\0'; - - /* Show the parent CSS if applicable*/ - if (css->parent) - snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", - css->parent->id); - seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, - (unsigned long)css, css->id, - atomic_read(&css->online_cnt), pbuf); - } - - cgroup_kn_unlock(of->kn); - return 0; -} - -static void cgroup_masks_read_one(struct seq_file *seq, const char *name, - u16 mask) -{ - struct cgroup_subsys *ss; - int ssid; - bool first = true; - - seq_printf(seq, "%-17s: ", name); - for_each_subsys(ss, ssid) { - if (!(mask & (1 << ssid))) - continue; - if (!first) - seq_puts(seq, ", "); - seq_puts(seq, ss->name); - first = false; - } - seq_putc(seq, '\n'); -} - -static int cgroup_masks_read(struct seq_file *seq, void *v) -{ - struct kernfs_open_file *of = seq->private; - struct cgroup *cgrp; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - - cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); - cgroup_masks_read_one(seq, "subtree_ss_mask", 
cgrp->subtree_ss_mask); - - cgroup_kn_unlock(of->kn); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_legacy_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .seq_show = current_css_set_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "cgroup_subsys_states", - .seq_show = cgroup_subsys_states_read, - }, - - { - .name = "cgroup_masks", - .seq_show = cgroup_masks_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .seq_show = current_css_set_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "csses", - .seq_show = cgroup_subsys_states_read, - }, - - { - .name = "masks", - .seq_show = cgroup_masks_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_legacy_files, -}; - -/* - * On v2, debug is an implicit controller enabled by "cgroup_debug" boot - * 
parameter. - */ -static int __init enable_cgroup_debug(char *str) -{ - debug_cgrp_subsys.dfl_cftypes = debug_files; - debug_cgrp_subsys.implicit_on_dfl = true; - debug_cgrp_subsys.threaded = true; - return 1; -} -__setup("cgroup_debug", enable_cgroup_debug); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c deleted file mode 100644 index 103938f25757..000000000000 --- a/kernel/cgroup/freezer.c +++ /dev/null @@ -1,315 +0,0 @@ -//SPDX-License-Identifier: GPL-2.0 -#include -#include - -#include "cgroup-internal.h" - -/* - * Propagate the cgroup frozen state upwards by the cgroup tree. - */ -static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) -{ - int desc = 1; - - /* - * If the new state is frozen, some freezing ancestor cgroups may change - * their state too, depending on if all their descendants are frozen. - * - * Otherwise, all ancestor cgroups are forced into the non-frozen state. - */ - while ((cgrp = cgroup_parent(cgrp))) { - if (frozen) { - cgrp->freezer.nr_frozen_descendants += desc; - if (!test_bit(CGRP_FROZEN, &cgrp->flags) && - test_bit(CGRP_FREEZE, &cgrp->flags) && - cgrp->freezer.nr_frozen_descendants == - cgrp->nr_descendants) { - set_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - desc++; - } - } else { - cgrp->freezer.nr_frozen_descendants -= desc; - if (test_bit(CGRP_FROZEN, &cgrp->flags)) { - clear_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - desc++; - } - } - } -} - -/* - * Revisit the cgroup frozen state. - * Checks if the cgroup is really frozen and perform all state transitions. - */ -void cgroup_update_frozen(struct cgroup *cgrp) -{ - bool frozen; - - lockdep_assert_held(&css_set_lock); - - /* - * If the cgroup has to be frozen (CGRP_FREEZE bit set), - * and all tasks are frozen and/or stopped, let's consider - * the cgroup frozen. Otherwise it's not frozen. 
- */ - frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && - cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); - - if (frozen) { - /* Already there? */ - if (test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - set_bit(CGRP_FROZEN, &cgrp->flags); - } else { - /* Already there? */ - if (!test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - clear_bit(CGRP_FROZEN, &cgrp->flags); - } - cgroup_file_notify(&cgrp->events_file); - - /* Update the state of ancestor cgroups. */ - cgroup_propagate_frozen(cgrp, frozen); -} - -/* - * Increment cgroup's nr_frozen_tasks. - */ -static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) -{ - cgrp->freezer.nr_frozen_tasks++; -} - -/* - * Decrement cgroup's nr_frozen_tasks. - */ -static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) -{ - cgrp->freezer.nr_frozen_tasks--; - WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); -} - -/* - * Enter frozen/stopped state, if not yet there. Update cgroup's counters, - * and revisit the state of the cgroup, if necessary. - */ -void cgroup_enter_frozen(void) -{ - struct cgroup *cgrp; - - if (current->frozen) - return; - - spin_lock_irq(&css_set_lock); - current->frozen = true; - cgrp = task_dfl_cgroup(current); - cgroup_inc_frozen_cnt(cgrp); - cgroup_update_frozen(cgrp); - spin_unlock_irq(&css_set_lock); -} - -/* - * Conditionally leave frozen/stopped state. Update cgroup's counters, - * and revisit the state of the cgroup, if necessary. - * - * If always_leave is not set, and the cgroup is freezing, - * we're racing with the cgroup freezing. In this case, we don't - * drop the frozen counter to avoid a transient switch to - * the unfrozen state. 
- */ -void cgroup_leave_frozen(bool always_leave) -{ - struct cgroup *cgrp; - - spin_lock_irq(&css_set_lock); - cgrp = task_dfl_cgroup(current); - if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { - cgroup_dec_frozen_cnt(cgrp); - cgroup_update_frozen(cgrp); - WARN_ON_ONCE(!current->frozen); - current->frozen = false; - } - spin_unlock_irq(&css_set_lock); - - if (unlikely(current->frozen)) { - /* - * If the task remained in the frozen state, - * make sure it won't reach userspace without - * entering the signal handling loop. - */ - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } -} - -/* - * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE - * jobctl bit. - */ -static void cgroup_freeze_task(struct task_struct *task, bool freeze) -{ - unsigned long flags; - - /* If the task is about to die, don't bother with freezing it. */ - if (!lock_task_sighand(task, &flags)) - return; - - if (freeze) { - task->jobctl |= JOBCTL_TRAP_FREEZE; - signal_wake_up(task, false); - } else { - task->jobctl &= ~JOBCTL_TRAP_FREEZE; - wake_up_process(task); - } - - unlock_task_sighand(task, &flags); -} - -/* - * Freeze or unfreeze all tasks in the given cgroup. - */ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) -{ - struct css_task_iter it; - struct task_struct *task; - - lockdep_assert_held(&cgroup_mutex); - - spin_lock_irq(&css_set_lock); - if (freeze) - set_bit(CGRP_FREEZE, &cgrp->flags); - else - clear_bit(CGRP_FREEZE, &cgrp->flags); - spin_unlock_irq(&css_set_lock); - - css_task_iter_start(&cgrp->self, 0, &it); - while ((task = css_task_iter_next(&it))) { - /* - * Ignore kernel threads here. Freezing cgroups containing - * kthreads isn't supported. 
- */ - if (task->flags & PF_KTHREAD) - continue; - cgroup_freeze_task(task, freeze); - } - css_task_iter_end(&it); - - /* - * Cgroup state should be revisited here to cover empty leaf cgroups - * and cgroups which descendants are already in the desired state. - */ - spin_lock_irq(&css_set_lock); - if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) - cgroup_update_frozen(cgrp); - spin_unlock_irq(&css_set_lock); -} - -/* - * Adjust the task state (freeze or unfreeze) and revisit the state of - * source and destination cgroups. - */ -void cgroup_freezer_migrate_task(struct task_struct *task, - struct cgroup *src, struct cgroup *dst) -{ - lockdep_assert_held(&css_set_lock); - - /* - * Kernel threads are not supposed to be frozen at all. - */ - if (task->flags & PF_KTHREAD) - return; - - /* - * Adjust counters of freezing and frozen tasks. - * Note, that if the task is frozen, but the destination cgroup is not - * frozen, we bump both counters to keep them balanced. - */ - if (task->frozen) { - cgroup_inc_frozen_cnt(dst); - cgroup_dec_frozen_cnt(src); - } - cgroup_update_frozen(dst); - cgroup_update_frozen(src); - - /* - * Force the task to the desired state. - */ - cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); -} - -void cgroup_freezer_frozen_exit(struct task_struct *task) -{ - struct cgroup *cgrp = task_dfl_cgroup(task); - - lockdep_assert_held(&css_set_lock); - - cgroup_dec_frozen_cnt(cgrp); - cgroup_update_frozen(cgrp); -} - -void cgroup_freeze(struct cgroup *cgrp, bool freeze) -{ - struct cgroup_subsys_state *css; - struct cgroup *dsct; - bool applied = false; - - lockdep_assert_held(&cgroup_mutex); - - /* - * Nothing changed? Just exit. - */ - if (cgrp->freezer.freeze == freeze) - return; - - cgrp->freezer.freeze = freeze; - - /* - * Propagate changes downwards the cgroup tree. 
- */ - css_for_each_descendant_pre(css, &cgrp->self) { - dsct = css->cgroup; - - if (cgroup_is_dead(dsct)) - continue; - - if (freeze) { - dsct->freezer.e_freeze++; - /* - * Already frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 1) - continue; - } else { - dsct->freezer.e_freeze--; - /* - * Still frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 0) - continue; - - WARN_ON_ONCE(dsct->freezer.e_freeze < 0); - } - - /* - * Do change actual state: freeze or unfreeze. - */ - cgroup_do_freeze(dsct, freeze); - applied = true; - } - - /* - * Even if the actual state hasn't changed, let's notify a user. - * The state can be enforced by an ancestor cgroup: the cgroup - * can already be in the desired state or it can be locked in the - * opposite state, so that the transition will never happen. - * In both cases it's better to notify a user, that there is - * nothing to wait for. - */ - if (!applied) - cgroup_file_notify(&cgrp->events_file); -} diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c deleted file mode 100644 index 86e9bbeb57ec..000000000000 --- a/kernel/cgroup/namespace.c +++ /dev/null @@ -1,155 +0,0 @@ -#include "cgroup-internal.h" - -#include -#include -#include -#include - - -/* cgroup namespaces */ - -static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) -{ - return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); -} - -static void dec_cgroup_namespaces(struct ucounts *ucounts) -{ - dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); -} - -static struct cgroup_namespace *alloc_cgroup_ns(void) -{ - struct cgroup_namespace *new_ns; - int ret; - - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); - if (!new_ns) - return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - return ERR_PTR(ret); - } - refcount_set(&new_ns->count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; -} - -void free_cgroup_ns(struct 
cgroup_namespace *ns) -{ - put_css_set(ns->root_cset); - dec_cgroup_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); -} -EXPORT_SYMBOL(free_cgroup_ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - struct cgroup_namespace *new_ns; - struct ucounts *ucounts; - struct css_set *cset; - - BUG_ON(!old_ns); - - if (!(flags & CLONE_NEWCGROUP)) { - get_cgroup_ns(old_ns); - return old_ns; - } - - /* Allow only sysadmin to create cgroup namespace. */ - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - ucounts = inc_cgroup_namespaces(user_ns); - if (!ucounts) - return ERR_PTR(-ENOSPC); - - /* It is not safe to take cgroup_mutex here */ - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - get_css_set(cset); - spin_unlock_irq(&css_set_lock); - - new_ns = alloc_cgroup_ns(); - if (IS_ERR(new_ns)) { - put_css_set(cset); - dec_cgroup_namespaces(ucounts); - return new_ns; - } - - new_ns->user_ns = get_user_ns(user_ns); - new_ns->ucounts = ucounts; - new_ns->root_cset = cset; - - return new_ns; -} - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) -{ - struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || - !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - /* Don't need to do anything if we are attaching to our own cgroupns. 
*/ - if (cgroup_ns == nsproxy->cgroup_ns) - return 0; - - get_cgroup_ns(cgroup_ns); - put_cgroup_ns(nsproxy->cgroup_ns); - nsproxy->cgroup_ns = cgroup_ns; - - return 0; -} - -static struct ns_common *cgroupns_get(struct task_struct *task) -{ - struct cgroup_namespace *ns = NULL; - struct nsproxy *nsproxy; - - task_lock(task); - nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->cgroup_ns; - get_cgroup_ns(ns); - } - task_unlock(task); - - return ns ? &ns->ns : NULL; -} - -static void cgroupns_put(struct ns_common *ns) -{ - put_cgroup_ns(to_cg_ns(ns)); -} - -static struct user_namespace *cgroupns_owner(struct ns_common *ns) -{ - return to_cg_ns(ns)->user_ns; -} - -const struct proc_ns_operations cgroupns_operations = { - .name = "cgroup", - .type = CLONE_NEWCGROUP, - .get = cgroupns_get, - .put = cgroupns_put, - .install = cgroupns_install, - .owner = cgroupns_owner, -}; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c deleted file mode 100644 index defad3c5e7dc..000000000000 --- a/kernel/cgroup/rdma.c +++ /dev/null @@ -1,619 +0,0 @@ -/* - * RDMA resource limiting controller for cgroups. - * - * Used to allow a cgroup hierarchy to stop processes from consuming - * additional RDMA resources after a certain limit is reached. - * - * Copyright (C) 2016 Parav Pandit - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. - */ - -#include -#include -#include -#include -#include -#include - -#define RDMACG_MAX_STR "max" - -/* - * Protects list of resource pools maintained on per cgroup basis - * and rdma device list. 
- */ -static DEFINE_MUTEX(rdmacg_mutex); -static LIST_HEAD(rdmacg_devices); - -enum rdmacg_file_type { - RDMACG_RESOURCE_TYPE_MAX, - RDMACG_RESOURCE_TYPE_STAT, -}; - -/* - * resource table definition as to be seen by the user. - * Need to add entries to it when more resources are - * added/defined at IB verb/core layer. - */ -static char const *rdmacg_resource_names[] = { - [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", - [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", -}; - -/* resource tracker for each resource of rdma cgroup */ -struct rdmacg_resource { - int max; - int usage; -}; - -/* - * resource pool object which represents per cgroup, per device - * resources. There are multiple instances of this object per cgroup, - * therefore it cannot be embedded within rdma_cgroup structure. It - * is maintained as list. - */ -struct rdmacg_resource_pool { - struct rdmacg_device *device; - struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; - - struct list_head cg_node; - struct list_head dev_node; - - /* count active user tasks of this pool */ - u64 usage_sum; - /* total number counts which are set to max */ - int num_max_cnt; -}; - -static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) -{ - return container_of(css, struct rdma_cgroup, css); -} - -static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) -{ - return css_rdmacg(cg->css.parent); -} - -static inline struct rdma_cgroup *get_current_rdmacg(void) -{ - return css_rdmacg(task_get_css(current, rdma_cgrp_id)); -} - -static void set_resource_limit(struct rdmacg_resource_pool *rpool, - int index, int new_max) -{ - if (new_max == S32_MAX) { - if (rpool->resources[index].max != S32_MAX) - rpool->num_max_cnt++; - } else { - if (rpool->resources[index].max == S32_MAX) - rpool->num_max_cnt--; - } - rpool->resources[index].max = new_max; -} - -static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) -{ - int i; - - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) - 
set_resource_limit(rpool, i, S32_MAX); -} - -static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) -{ - lockdep_assert_held(&rdmacg_mutex); - - list_del(&rpool->cg_node); - list_del(&rpool->dev_node); - kfree(rpool); -} - -static struct rdmacg_resource_pool * -find_cg_rpool_locked(struct rdma_cgroup *cg, - struct rdmacg_device *device) - -{ - struct rdmacg_resource_pool *pool; - - lockdep_assert_held(&rdmacg_mutex); - - list_for_each_entry(pool, &cg->rpools, cg_node) - if (pool->device == device) - return pool; - - return NULL; -} - -static struct rdmacg_resource_pool * -get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) -{ - struct rdmacg_resource_pool *rpool; - - rpool = find_cg_rpool_locked(cg, device); - if (rpool) - return rpool; - - rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); - if (!rpool) - return ERR_PTR(-ENOMEM); - - rpool->device = device; - set_all_resource_max_limit(rpool); - - INIT_LIST_HEAD(&rpool->cg_node); - INIT_LIST_HEAD(&rpool->dev_node); - list_add_tail(&rpool->cg_node, &cg->rpools); - list_add_tail(&rpool->dev_node, &device->rpools); - return rpool; -} - -/** - * uncharge_cg_locked - uncharge resource for rdma cgroup - * @cg: pointer to cg to uncharge and all parents in hierarchy - * @device: pointer to rdmacg device - * @index: index of the resource to uncharge in cg (resource pool) - * - * It also frees the resource pool which was created as part of - * charging operation when there are no resources attached to - * resource pool. - */ -static void -uncharge_cg_locked(struct rdma_cgroup *cg, - struct rdmacg_device *device, - enum rdmacg_resource_type index) -{ - struct rdmacg_resource_pool *rpool; - - rpool = find_cg_rpool_locked(cg, device); - - /* - * rpool cannot be null at this stage. Let kernel operate in case - * if there a bug in IB stack or rdma controller, instead of crashing - * the system. 
- */ - if (unlikely(!rpool)) { - pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); - return; - } - - rpool->resources[index].usage--; - - /* - * A negative count (or overflow) is invalid, - * it indicates a bug in the rdma controller. - */ - WARN_ON_ONCE(rpool->resources[index].usage < 0); - rpool->usage_sum--; - if (rpool->usage_sum == 0 && - rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { - /* - * No user of the rpool and all entries are set to max, so - * safe to delete this rpool. - */ - free_cg_rpool_locked(rpool); - } -} - -/** - * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count - * @device: pointer to rdmacg device - * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup - * stop uncharging - * @index: index of the resource to uncharge in cg in given resource pool - */ -static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, - struct rdmacg_device *device, - struct rdma_cgroup *stop_cg, - enum rdmacg_resource_type index) -{ - struct rdma_cgroup *p; - - mutex_lock(&rdmacg_mutex); - - for (p = cg; p != stop_cg; p = parent_rdmacg(p)) - uncharge_cg_locked(p, device, index); - - mutex_unlock(&rdmacg_mutex); - - css_put(&cg->css); -} - -/** - * rdmacg_uncharge - hierarchically uncharge rdma resource count - * @device: pointer to rdmacg device - * @index: index of the resource to uncharge in cgroup in given resource pool - */ -void rdmacg_uncharge(struct rdma_cgroup *cg, - struct rdmacg_device *device, - enum rdmacg_resource_type index) -{ - if (index >= RDMACG_RESOURCE_MAX) - return; - - rdmacg_uncharge_hierarchy(cg, device, NULL, index); -} -EXPORT_SYMBOL(rdmacg_uncharge); - -/** - * rdmacg_try_charge - hierarchically try to charge the rdma resource - * @rdmacg: pointer to rdma cgroup which will own this resource - * @device: pointer to rdmacg device - * @index: index of the resource to charge in cgroup (resource pool) - * - * This function follows charging resource in hierarchical way. 
- * It will fail if the charge would cause the new value to exceed the - * hierarchical limit. - * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. - * Returns pointer to rdmacg for this resource when charging is successful. - * - * Charger needs to account resources on two criteria. - * (a) per cgroup & (b) per device resource usage. - * Per cgroup resource usage ensures that tasks of cgroup doesn't cross - * the configured limits. Per device provides granular configuration - * in multi device usage. It allocates resource pool in the hierarchy - * for each parent it come across for first resource. Later on resource - * pool will be available. Therefore it will be much faster thereon - * to charge/uncharge. - */ -int rdmacg_try_charge(struct rdma_cgroup **rdmacg, - struct rdmacg_device *device, - enum rdmacg_resource_type index) -{ - struct rdma_cgroup *cg, *p; - struct rdmacg_resource_pool *rpool; - s64 new; - int ret = 0; - - if (index >= RDMACG_RESOURCE_MAX) - return -EINVAL; - - /* - * hold on to css, as cgroup can be removed but resource - * accounting happens on css. - */ - cg = get_current_rdmacg(); - - mutex_lock(&rdmacg_mutex); - for (p = cg; p; p = parent_rdmacg(p)) { - rpool = get_cg_rpool_locked(p, device); - if (IS_ERR(rpool)) { - ret = PTR_ERR(rpool); - goto err; - } else { - new = rpool->resources[index].usage + 1; - if (new > rpool->resources[index].max) { - ret = -EAGAIN; - goto err; - } else { - rpool->resources[index].usage = new; - rpool->usage_sum++; - } - } - } - mutex_unlock(&rdmacg_mutex); - - *rdmacg = cg; - return 0; - -err: - mutex_unlock(&rdmacg_mutex); - rdmacg_uncharge_hierarchy(cg, device, p, index); - return ret; -} -EXPORT_SYMBOL(rdmacg_try_charge); - -/** - * rdmacg_register_device - register rdmacg device to rdma controller. - * @device: pointer to rdmacg device whose resources need to be accounted. 
- * - * If IB stack wish a device to participate in rdma cgroup resource - * tracking, it must invoke this API to register with rdma cgroup before - * any user space application can start using the RDMA resources. - * Returns 0 on success or EINVAL when table length given is beyond - * supported size. - */ -int rdmacg_register_device(struct rdmacg_device *device) -{ - INIT_LIST_HEAD(&device->dev_node); - INIT_LIST_HEAD(&device->rpools); - - mutex_lock(&rdmacg_mutex); - list_add_tail(&device->dev_node, &rdmacg_devices); - mutex_unlock(&rdmacg_mutex); - return 0; -} -EXPORT_SYMBOL(rdmacg_register_device); - -/** - * rdmacg_unregister_device - unregister rdmacg device from rdma controller. - * @device: pointer to rdmacg device which was previously registered with rdma - * controller using rdmacg_register_device(). - * - * IB stack must invoke this after all the resources of the IB device - * are destroyed and after ensuring that no more resources will be created - * when this API is invoked. - */ -void rdmacg_unregister_device(struct rdmacg_device *device) -{ - struct rdmacg_resource_pool *rpool, *tmp; - - /* - * Synchronize with any active resource settings, - * usage query happening via configfs. - */ - mutex_lock(&rdmacg_mutex); - list_del_init(&device->dev_node); - - /* - * Now that this device is off the cgroup list, its safe to free - * all the rpool resources. 
- */ - list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) - free_cg_rpool_locked(rpool); - - mutex_unlock(&rdmacg_mutex); -} -EXPORT_SYMBOL(rdmacg_unregister_device); - -static int parse_resource(char *c, int *intval) -{ - substring_t argstr; - const char **table = &rdmacg_resource_names[0]; - char *name, *value = c; - size_t len; - int ret, i = 0; - - name = strsep(&value, "="); - if (!name || !value) - return -EINVAL; - - len = strlen(value); - - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - if (strcmp(table[i], name)) - continue; - - argstr.from = value; - argstr.to = value + len; - - ret = match_int(&argstr, intval); - if (ret >= 0) { - if (*intval < 0) - break; - return i; - } - if (strncmp(value, RDMACG_MAX_STR, len) == 0) { - *intval = S32_MAX; - return i; - } - break; - } - return -EINVAL; -} - -static int rdmacg_parse_limits(char *options, - int *new_limits, unsigned long *enables) -{ - char *c; - int err = -EINVAL; - - /* parse resource options */ - while ((c = strsep(&options, " ")) != NULL) { - int index, intval; - - index = parse_resource(c, &intval); - if (index < 0) - goto err; - - new_limits[index] = intval; - *enables |= BIT(index); - } - return 0; - -err: - return err; -} - -static struct rdmacg_device *rdmacg_get_device_locked(const char *name) -{ - struct rdmacg_device *device; - - lockdep_assert_held(&rdmacg_mutex); - - list_for_each_entry(device, &rdmacg_devices, dev_node) - if (!strcmp(name, device->name)) - return device; - - return NULL; -} - -static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdma_cgroup *cg = css_rdmacg(of_css(of)); - const char *dev_name; - struct rdmacg_resource_pool *rpool; - struct rdmacg_device *device; - char *options = strstrip(buf); - int *new_limits; - unsigned long enables = 0; - int i = 0, ret = 0; - - /* extract the device name first */ - dev_name = strsep(&options, " "); - if (!dev_name) { - ret = -EINVAL; - goto err; - } - - 
new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); - if (!new_limits) { - ret = -ENOMEM; - goto err; - } - - ret = rdmacg_parse_limits(options, new_limits, &enables); - if (ret) - goto parse_err; - - /* acquire lock to synchronize with hot plug devices */ - mutex_lock(&rdmacg_mutex); - - device = rdmacg_get_device_locked(dev_name); - if (!device) { - ret = -ENODEV; - goto dev_err; - } - - rpool = get_cg_rpool_locked(cg, device); - if (IS_ERR(rpool)) { - ret = PTR_ERR(rpool); - goto dev_err; - } - - /* now set the new limits of the rpool */ - for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) - set_resource_limit(rpool, i, new_limits[i]); - - if (rpool->usage_sum == 0 && - rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { - /* - * No user of the rpool and all entries are set to max, so - * safe to delete this rpool. - */ - free_cg_rpool_locked(rpool); - } - -dev_err: - mutex_unlock(&rdmacg_mutex); - -parse_err: - kfree(new_limits); - -err: - return ret ?: nbytes; -} - -static void print_rpool_values(struct seq_file *sf, - struct rdmacg_resource_pool *rpool) -{ - enum rdmacg_file_type sf_type; - int i; - u32 value; - - sf_type = seq_cft(sf)->private; - - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - seq_puts(sf, rdmacg_resource_names[i]); - seq_putc(sf, '='); - if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { - if (rpool) - value = rpool->resources[i].max; - else - value = S32_MAX; - } else { - if (rpool) - value = rpool->resources[i].usage; - else - value = 0; - } - - if (value == S32_MAX) - seq_puts(sf, RDMACG_MAX_STR); - else - seq_printf(sf, "%d", value); - seq_putc(sf, ' '); - } -} - -static int rdmacg_resource_read(struct seq_file *sf, void *v) -{ - struct rdmacg_device *device; - struct rdmacg_resource_pool *rpool; - struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); - - mutex_lock(&rdmacg_mutex); - - list_for_each_entry(device, &rdmacg_devices, dev_node) { - seq_printf(sf, "%s ", device->name); - - rpool = find_cg_rpool_locked(cg, device); - 
print_rpool_values(sf, rpool); - - seq_putc(sf, '\n'); - } - - mutex_unlock(&rdmacg_mutex); - return 0; -} - -static struct cftype rdmacg_files[] = { - { - .name = "max", - .write = rdmacg_resource_set_max, - .seq_show = rdmacg_resource_read, - .private = RDMACG_RESOURCE_TYPE_MAX, - .flags = CFTYPE_NOT_ON_ROOT, - }, - { - .name = "current", - .seq_show = rdmacg_resource_read, - .private = RDMACG_RESOURCE_TYPE_STAT, - .flags = CFTYPE_NOT_ON_ROOT, - }, - { } /* terminate */ -}; - -static struct cgroup_subsys_state * -rdmacg_css_alloc(struct cgroup_subsys_state *parent) -{ - struct rdma_cgroup *cg; - - cg = kzalloc(sizeof(*cg), GFP_KERNEL); - if (!cg) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&cg->rpools); - return &cg->css; -} - -static void rdmacg_css_free(struct cgroup_subsys_state *css) -{ - struct rdma_cgroup *cg = css_rdmacg(css); - - kfree(cg); -} - -/** - * rdmacg_css_offline - cgroup css_offline callback - * @css: css of interest - * - * This function is called when @css is about to go away and responsible - * for shooting down all rdmacg associated with @css. As part of that it - * marks all the resource pool entries to max value, so that when resources are - * uncharged, associated resource pool can be freed as well. 
- */ -static void rdmacg_css_offline(struct cgroup_subsys_state *css) -{ - struct rdma_cgroup *cg = css_rdmacg(css); - struct rdmacg_resource_pool *rpool; - - mutex_lock(&rdmacg_mutex); - - list_for_each_entry(rpool, &cg->rpools, cg_node) - set_all_resource_max_limit(rpool); - - mutex_unlock(&rdmacg_mutex); -} - -struct cgroup_subsys rdma_cgrp_subsys = { - .css_alloc = rdmacg_css_alloc, - .css_free = rdmacg_css_free, - .css_offline = rdmacg_css_offline, - .legacy_cftypes = rdmacg_files, - .dfl_cftypes = rdmacg_files, -}; diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup_freezer.c similarity index 99% rename from kernel/cgroup/legacy_freezer.c rename to kernel/cgroup_freezer.c index 08236798d173..1b72d56edce5 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup_freezer.c @@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) rcu_read_unlock(); /* are all tasks frozen? */ - css_task_iter_start(css, 0, &it); + css_task_iter_start(css, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task)) { @@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, 0, &it); + css_task_iter_start(&freezer->css, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); @@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, 0, &it); + css_task_iter_start(&freezer->css, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); diff --git a/kernel/cgroup/pids.c b/kernel/cgroup_pids.c similarity index 98% rename from kernel/cgroup/pids.c rename to kernel/cgroup_pids.c index 6f064cce257a..b8b898e21c19 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup_pids.c @@ -248,7 +248,7 @@ static void pids_cancel_fork(struct task_struct *task) pids_uncharge(pids, 1); } 
-static void pids_release(struct task_struct *task) +static void pids_free(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -343,8 +343,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .release = pids_release, + .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, - .threaded = true, }; diff --git a/kernel/cgroup/cpuset.c b/kernel/cpuset.c similarity index 97% rename from kernel/cgroup/cpuset.c rename to kernel/cpuset.c index 4890211f5709..df64cb9ba63a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cpuset.c @@ -298,16 +298,6 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); -/* - * Cgroup v2 behavior is used when on default hierarchy or the - * cgroup_v2_mode flag is set. - */ -static inline bool is_in_v2_mode(void) -{ - return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); -} - /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -504,7 +494,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. 
*/ ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_cpuset_subset(trial, par)) goto out; /* @@ -889,7 +880,7 @@ static void update_tasks_cpumask(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, 0, &it); + css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) update_cpus_allowed(cs, task, cs->effective_cpus); css_task_iter_end(&it); @@ -922,7 +913,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (is_in_v2_mode() && cpumask_empty(new_cpus)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -939,7 +931,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!is_in_v2_mode() && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1134,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_task_iter_start(&cs->css, 0, &it); + css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; @@ -1192,7 +1184,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. 
*/ - if (is_in_v2_mode() && nodes_empty(*new_mems)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1209,7 +1202,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!is_in_v2_mode() && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1326,7 +1319,7 @@ static void update_tasks_flags(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, 0, &it); + css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) cpuset_update_task_spread_flag(cs, task); css_task_iter_end(&it); @@ -1500,7 +1493,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!is_in_v2_mode() && + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1557,7 +1550,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - get_online_cpus(); mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1613,7 +1605,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); - put_online_cpus(); } /* The various types of files and directories in a cpuset file system */ @@ -2032,7 +2023,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (is_in_v2_mode()) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2113,7 +2104,7 @@ static void cpuset_bind(struct 
cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (is_in_v2_mode()) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2183,9 +2174,12 @@ int __init cpuset_init(void) { int err = 0; - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)); + if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) + BUG(); + if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) + BUG(); + if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); cpumask_setall(top_cpuset.cpus_requested); @@ -2201,7 +2195,8 @@ int __init cpuset_init(void) if (err < 0) return err; - BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); + if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) + BUG(); return 0; } @@ -2327,7 +2322,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (is_in_v2_mode()) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2365,7 +2360,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = is_in_v2_mode(); + bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); mutex_lock(&cpuset_mutex); @@ -2424,7 +2419,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } } -void cpuset_update_active_cpus(void) +void cpuset_update_active_cpus(bool cpu_online) { /* * We're inside cpu hotplug critical region which usually nests @@ -2469,11 +2464,8 @@ static struct notifier_block 
cpuset_track_online_nodes_nb = { */ void __init cpuset_init_smp(void) { - /* - * cpus_allowd/mems_allowed set to v2 values in the initial - * cpuset_bind() call will be reset to v1 values in another - * cpuset_bind() call when v1 cpuset is mounted. - */ + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); + top_cpuset.mems_allowed = node_states[N_MEMORY]; top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); @@ -2507,23 +2499,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) spin_unlock_irqrestore(&callback_lock, flags); } -/** - * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. - * @tsk: pointer to task_struct with which the scheduler is struggling - * - * Description: In the case that the scheduler cannot find an allowed cpu in - * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy - * mode however, this value is the same as task_cs(tsk)->effective_cpus, - * which will not contain a sane cpumask during cases such as cpu hotplugging. - * This is the absolute last resort for the scheduler and it is only used if - * _every_ other avenue has been traveled. - **/ - void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { rcu_read_lock(); - do_set_cpus_allowed(tsk, is_in_v2_mode() ? 
- task_cs(tsk)->cpus_allowed : cpu_possible_mask); + do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); rcu_read_unlock(); /* diff --git a/kernel/cred.c b/kernel/cred.c index ad24a4cb25c0..d63a2d861ac2 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -196,7 +196,7 @@ const struct cred *get_task_cred(struct task_struct *task) do { cred = __task_cred((task)); BUG_ON(!cred); - } while (!get_cred_rcu(cred)); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); rcu_read_unlock(); return cred; diff --git a/kernel/events/core.c b/kernel/events/core.c index 540256086e91..547184b71dce 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11458,12 +11458,5 @@ struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .attach = perf_cgroup_attach, - /* - * Implicitly enable on dfl hierarchy so that perf events can - * always be filtered by cgroup2 path as long as perf_event - * controller is not mounted on a legacy hierarchy. 
- */ - .implicit_on_dfl = true, - .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/exit.c b/kernel/exit.c index 5e0ca9c806a6..09beccfb0977 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -178,7 +178,6 @@ void release_task(struct task_struct *p) rcu_read_unlock(); proc_flush_task(p); - cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); diff --git a/kernel/fork.c b/kernel/fork.c index cbce7b33193b..00f93deb2829 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1069,9 +1069,7 @@ static int wait_for_vfork_done(struct task_struct *child, int killed; freezer_do_not_count(); - cgroup_enter_frozen(); killed = wait_for_completion_killable(vfork); - cgroup_leave_frozen(false); freezer_count(); if (killed) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 29e696d490f1..f312d7a3b914 100755 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8055,7 +8055,7 @@ static void cpuset_cpu_active(void) */ cpuset_force_rebuild(); } - cpuset_update_active_cpus(); + cpuset_update_active_cpus(true); } static int cpuset_cpu_inactive(unsigned int cpu) @@ -8078,7 +8078,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) if (overflow) return -EBUSY; - cpuset_update_active_cpus(); + cpuset_update_active_cpus(false); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index ff8ba82c9e03..8051e3741aed 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -38,7 +38,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include @@ -147,10 +146,9 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) static int recalc_sigpending_tsk(struct task_struct *t) { - if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || + if ((t->jobctl & JOBCTL_PENDING_MASK) || PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked) || - cgroup_task_frozen(t)) { + PENDING(&t->signal->shared_pending, &t->blocked)) 
{ set_tsk_thread_flag(t, TIF_SIGPENDING); return 1; } @@ -1931,10 +1929,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) */ preempt_disable(); read_unlock(&tasklist_lock); - cgroup_enter_frozen(); preempt_enable_no_resched(); freezable_schedule(); - cgroup_leave_frozen(true); } else { /* * By the time we got the lock, our tracer went away. @@ -2112,7 +2108,6 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ - cgroup_enter_frozen(); freezable_schedule(); return true; } else { @@ -2159,43 +2154,6 @@ static void do_jobctl_trap(void) } } -/** - * do_freezer_trap - handle the freezer jobctl trap - * - * Puts the task into frozen state, if only the task is not about to quit. - * In this case it drops JOBCTL_TRAP_FREEZE. - * - * CONTEXT: - * Must be called with @current->sighand->siglock held, - * which is always released before returning. - */ -static void do_freezer_trap(void) - __releases(¤t->sighand->siglock) -{ - /* - * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, - * let's make another loop to give it a chance to be handled. - * In any case, we'll return back. - */ - if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != - JOBCTL_TRAP_FREEZE) { - spin_unlock_irq(¤t->sighand->siglock); - return; - } - - /* - * Now we're sure that there is no pending fatal signal and no - * pending traps. Clear TIF_SIGPENDING to not get out of schedule() - * immediately (if there is a non-fatal signal pending), and - * put the task into sleep. 
- */ - __set_current_state(TASK_INTERRUPTIBLE); - clear_thread_flag(TIF_SIGPENDING); - spin_unlock_irq(¤t->sighand->siglock); - cgroup_enter_frozen(); - freezable_schedule(); -} - static int ptrace_signal(int signr, siginfo_t *info) { ptrace_signal_deliver(); @@ -2308,10 +2266,6 @@ int get_signal(struct ksignal *ksig) trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, &sighand->action[SIGKILL - 1]); recalc_sigpending(); - current->jobctl &= ~JOBCTL_TRAP_FREEZE; - spin_unlock_irq(&sighand->siglock); - if (unlikely(cgroup_task_frozen(current))) - cgroup_leave_frozen(true); goto fatal; } @@ -2322,24 +2276,9 @@ int get_signal(struct ksignal *ksig) do_signal_stop(0)) goto relock; - if (unlikely(current->jobctl & - (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { - if (current->jobctl & JOBCTL_TRAP_MASK) { - do_jobctl_trap(); - spin_unlock_irq(&sighand->siglock); - } else if (current->jobctl & JOBCTL_TRAP_FREEZE) - do_freezer_trap(); - - goto relock; - } - - /* - * If the task is leaving the frozen state, let's update - * cgroup counters and reset the frozen bit. - */ - if (unlikely(cgroup_task_frozen(current))) { + if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { + do_jobctl_trap(); spin_unlock_irq(&sighand->siglock); - cgroup_leave_frozen(true); goto relock; } @@ -2433,8 +2372,8 @@ int get_signal(struct ksignal *ksig) continue; } - spin_unlock_irq(&sighand->siglock); fatal: + spin_unlock_irq(&sighand->siglock); /* * Anything else is fatal, maybe with a core dump. 
@@ -2469,7 +2408,7 @@ int get_signal(struct ksignal *ksig) } /** - * signal_delivered - + * signal_delivered - * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * @@ -3532,7 +3471,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { - return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); + return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); } #endif @@ -3657,7 +3596,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { sigset_to_compat(&mask, &old_ka.sa.sa_mask); - ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), + ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); @@ -3835,7 +3774,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) return -EFAULT; return sigsuspend(&newset); } - + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a49cdf33a62e..b508b47ae3ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -966,7 +966,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&iter->css, 0, &it); + css_task_iter_start(&iter->css, &it); while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index dad43d1924db..db65b0cdfc4c 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -128,7 +128,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, cs->classid = (u32)value; - 
css_task_iter_start(css, 0, &it); + css_task_iter_start(css, &it); while ((p = css_task_iter_next(&it))) { update_classid_task(p, cs->classid); cond_resched(); diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index 34156826c14f..5bc2b92ace6d 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) { FILE *fp; char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; - char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path; char *token, *saved_ptr = NULL; + int found = 0; fp = fopen("/proc/mounts", "r"); if (!fp) @@ -24,43 +24,31 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) * and inspect every cgroupfs mount point to find one that has * perf_event subsystem */ - path_v1[0] = '\0'; - path_v2[0] = '\0'; - while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %" STR(PATH_MAX)"s %*d %*d\n", mountpoint, type, tokens) == 3) { - if (!path_v1[0] && !strcmp(type, "cgroup")) { + if (!strcmp(type, "cgroup")) { token = strtok_r(tokens, ",", &saved_ptr); while (token != NULL) { if (!strcmp(token, "perf_event")) { - strcpy(path_v1, mountpoint); + found = 1; break; } token = strtok_r(NULL, ",", &saved_ptr); } } - - if (!path_v2[0] && !strcmp(type, "cgroup2")) - strcpy(path_v2, mountpoint); - - if (path_v1[0] && path_v2[0]) + if (found) break; } fclose(fp); - - if (path_v1[0]) - path = path_v1; - else if (path_v2[0]) - path = path_v2; - else + if (!found) return -1; - if (strlen(path) < maxlen) { - strcpy(buf, path); + if (strlen(mountpoint) < maxlen) { + strcpy(buf, mountpoint); return 0; } return -1; From 16db2094d37e8b020169719069905f7b575b113f Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:58:52 +0800 Subject: [PATCH 08/59] =?UTF-8?q?Revert=20"block:=20=E5=90=AF=E7=94=A8BFQ?= =?UTF-8?q?=E5=B9=B6=E8=AE=BE=E4=B8=BA=E9=BB=98=E8=AE=A4IO=E8=B0=83?= =?UTF-8?q?=E5=BA=A6"?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 2a3e4c244de481319fab871599a47079265d7471. --- block/Kconfig.iosched | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 75ee7ba34ebb..8dc4f04711fa 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -41,7 +41,7 @@ config CFQ_GROUP_IOSCHED config IOSCHED_BFQ tristate "BFQ I/O scheduler" - default y + default n ---help--- The BFQ I/O scheduler distributes bandwidth among all processes according to their weights, regardless of the @@ -60,7 +60,7 @@ config BFQ_GROUP_IOSCHED choice prompt "Default I/O scheduler" - default DEFAULT_BFQ + default DEFAULT_CFQ help Select the I/O scheduler which will be used by default for all block devices. From c0d7c1dbc5b7b61c8a61b8c8fb2339a780b90bee Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:31 +0800 Subject: [PATCH 09/59] Revert "bfq: fix warning [-wpointer-bool-conversion]" This reverts commit 510ac91f3820befe7bfbefa1d6a34d5f0203b6ca. --- block/bfq-cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 52484f10bb6f..a66a7232a854 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -740,6 +740,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) * deactivating the group itself. */ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + BUG_ON(!bfqg->sched_data.service_tree); st = bfqg->sched_data.service_tree + i; /* * The idle tree may still contain bfq_queues belonging From 99f3ce30902422954a6ea1ffcb5cc436b7defa37 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:36 +0800 Subject: [PATCH 10/59] Revert "block: BFQ: fix improper use backing_dev_info struct" This reverts commit 5c755eb379f1b6fdf6a28ccd9bcef8ec5290bbfc. 
--- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6e6025dacfc6..323923b2ff71 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4000,7 +4000,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); switch (ioprio_class) { default: - dev_err(bfqq->bfqd->queue->backing_dev_info->dev, + dev_err(bfqq->bfqd->queue->backing_dev_info.dev, "bfq: bad prio class %d\n", ioprio_class); case IOPRIO_CLASS_NONE: /* From bc1199e1a11f01283336a52e3ed3def0d939b210 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:37 +0800 Subject: [PATCH 11/59] Revert "block, bfq: improve and refactor throughput-boosting logic" This reverts commit 4955af72c2e587c76a9d6d634be2afc245355c3b. --- block/bfq-iosched.c | 141 ++++++++++++++++++++------------------------ block/bfq.h | 12 ++-- 2 files changed, 70 insertions(+), 83 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 323923b2ff71..3fdef504ce0b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -693,10 +693,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, unsigned int old_wr_coeff; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); - if (bic->saved_has_short_ttime) - bfq_mark_bfqq_has_short_ttime(bfqq); + if (bic->saved_idle_window) + bfq_mark_bfqq_idle_window(bfqq); else - bfq_clear_bfqq_has_short_ttime(bfqq); + bfq_clear_bfqq_idle_window(bfqq); if (bic->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); @@ -2060,7 +2060,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; - bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); + bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = 
!hlist_unhashed(&bfqq->burst_list_node); @@ -3226,9 +3226,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, } bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", + "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", reason, slow, bfqq->dispatched, - bfq_bfqq_has_short_ttime(bfqq), entity->weight); + bfq_bfqq_idle_window(bfqq), entity->weight); /* * Increase, decrease or leave budget unchanged according to @@ -3310,55 +3310,35 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; - bool rot_without_queueing = - !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, - bfqq_sequential_and_IO_bound, - idling_boosts_thr, idling_boosts_thr_without_issues, + bool idling_boosts_thr, idling_boosts_thr_without_issues, idling_needed_for_service_guarantees, asymmetric_scenario; if (bfqd->strict_guarantees) return true; - /* - * Idling is performed only if slice_idle > 0. In addition, we - * do not idle if - * (a) bfqq is async - * (b) bfqq is in the idle io prio class: in this case we do - * not idle because we want to minimize the bandwidth that - * queues in this class can steal to higher-priority queues - */ - if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || - bfq_class_idle(bfqq)) - return false; - - bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && - bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); /* * The next variable takes into account the cases where idling * boosts the throughput. 
* * The value of the variable is computed considering, first, that * idling is virtually always beneficial for the throughput if: - * (a) the device is not NCQ-capable and rotational, or - * (b) regardless of the presence of NCQ, the device is rotational and - * the request pattern for bfqq is I/O-bound and sequential, or - * (c) regardless of whether it is rotational, the device is - * not NCQ-capable and the request pattern for bfqq is - * I/O-bound and sequential. + * (a) the device is not NCQ-capable, or + * (b) regardless of the presence of NCQ, the device is rotational + * and the request pattern for bfqq is I/O-bound and sequential. * * Secondly, and in contrast to the above item (b), idling an * NCQ-capable flash-based device would not boost the * throughput even with sequential I/O; rather it would lower * the throughput in proportion to how fast the device * is. Accordingly, the next variable is true if any of the - * above conditions (a), (b) or (c) is true, and, in - * particular, happens to be false if bfqd is an NCQ-capable - * flash-based device. + * above conditions (a) and (b) is true, and, in particular, + * happens to be false if bfqd is an NCQ-capable flash-based + * device. */ - idling_boosts_thr = rot_without_queueing || - ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && - bfqq_sequential_and_IO_bound); + idling_boosts_thr = !bfqd->hw_tag || + (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && + bfq_bfqq_idle_window(bfqq)); /* * The value of the next variable, @@ -3529,10 +3509,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); /* - * We have now all the components we need to compute the - * return value of the function, which is true only if idling - * either boosts the throughput (without issues), or is - * necessary to preserve service guarantees. 
+ * We have now all the components we need to compute the return + * value of the function, which is true only if both the following + * conditions hold: + * 1) bfqq is sync, because idling make sense only for sync queues; + * 2) idling either boosts the throughput (without issues), or + * is necessary to preserve service guarantees. */ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", bfq_bfqq_sync(bfqq), idling_boosts_thr); @@ -3544,8 +3526,9 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) bfq_bfqq_IO_bound(bfqq), idling_needed_for_service_guarantees); - return idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees; + return bfq_bfqq_sync(bfqq) && + (idling_boosts_thr_without_issues || + idling_needed_for_service_guarantees); } /* @@ -3561,7 +3544,10 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) */ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { - return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); + struct bfq_data *bfqd = bfqq->bfqd; + + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && + bfq_bfqq_may_idle(bfqq); } /* @@ -4020,6 +4006,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, case IOPRIO_CLASS_IDLE: bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; bfqq->new_ioprio = 7; + bfq_clear_bfqq_idle_window(bfqq); break; } @@ -4083,14 +4070,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_set_next_ioprio_data(bfqq, bic); if (is_sync) { - /* - * No need to mark as has_short_ttime if in - * idle_class, because no device idling is performed - * for queues in idle class - */ if (!bfq_class_idle(bfqq)) - /* tentatively mark as has_short_ttime */ - bfq_mark_bfqq_has_short_ttime(bfqq); + bfq_mark_bfqq_idle_window(bfqq); bfq_mark_bfqq_sync(bfqq); bfq_mark_bfqq_just_created(bfqq); } else @@ -4225,19 +4206,18 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); } 
-static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_io_cq *bic) +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter. + */ +static void bfq_update_idle_window(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) { - bool has_short_ttime = true; + int enable_idle; - /* - * No need to update has_short_ttime if bfqq is async or in - * idle io prio class, or if bfq_slice_idle is zero, because - * no device idling is performed for bfqq in this case. - */ - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || - bfqd->bfq_slice_idle == 0) + /* Don't idle for async or idle io prio class. */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) return; /* Idle window just restored, statistics are meaningless. */ @@ -4245,22 +4225,27 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, bfqd->bfq_wr_min_idle_time)) return; - /* Think time is infinite if no process is linked to - * bfqq. 
Otherwise check average think time to - * decide whether to mark as has_short_ttime - */ - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || - (bfq_sample_valid(bic->ttime.ttime_samples) && - bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; + enable_idle = bfq_bfqq_idle_window(bfqq); - bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", - has_short_ttime); + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && + bfqq->wr_coeff == 1)) + enable_idle = 0; + else if (bfq_sample_valid(bic->ttime.ttime_samples)) { + if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && + bfqq->wr_coeff == 1) + enable_idle = 0; + else + enable_idle = 1; + } + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", + enable_idle); - if (has_short_ttime) - bfq_mark_bfqq_has_short_ttime(bfqq); + if (enable_idle) + bfq_mark_bfqq_idle_window(bfqq); else - bfq_clear_bfqq_has_short_ttime(bfqq); + bfq_clear_bfqq_idle_window(bfqq); } /* @@ -4276,12 +4261,14 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->meta_pending++; bfq_update_io_thinktime(bfqd, bic); - bfq_update_has_short_ttime(bfqd, bfqq, bic); bfq_update_io_seektime(bfqd, bfqq, rq); + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: has_short_ttime=%d (seeky %d)", - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); + "rq_enqueued: idle_window=%d (seeky %d)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); diff --git a/block/bfq.h b/block/bfq.h index e35bf89b09f3..141f8960ad6f 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -351,11 +351,11 @@ struct bfq_io_cq { #endif /* - * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to - * be able to restore it 
in case of split. + * Snapshot of the idle window before merging; taken to + * remember this value while the queue is merged, so as to be + * able to restore it in case of split. */ - bool saved_has_short_ttime; + bool saved_idle_window; /* * Same purpose as the previous two fields for the I/O bound * classification of a queue. @@ -612,7 +612,7 @@ enum bfqq_state_flags { */ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ BFQ_BFQQ_FLAG_IO_bound, /* * bfqq has timed-out at least once @@ -651,7 +651,7 @@ BFQ_BFQQ_FNS(wait_request); BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(must_alloc); BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(has_short_ttime); +BFQ_BFQQ_FNS(idle_window); BFQ_BFQQ_FNS(sync); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); From a2016cedd541e6b37f690aa3cd23bd8c9d138749 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:39 +0800 Subject: [PATCH 12/59] Revert "block, bfq: consider also in_service_entity to state whether an entity is active" This reverts commit ca07b96883f14c17cd9afa12d2fc56e3b3c792f3. --- block/bfq-sched.c | 140 +++++++++++++++++++++------------------------- block/bfq.h | 23 ++------ 2 files changed, 68 insertions(+), 95 deletions(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index be985d9d5f17..fdf1c713d050 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -196,23 +196,21 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) /* * This function tells whether entity stops being a candidate for next - * service, according to the restrictive definition of the field - * next_in_service. In particular, this function is invoked for an - * entity that is about to be set in service. 
+ * service, according to the following logic. * - * If entity is a queue, then the entity is no longer a candidate for - * next service according to the that definition, because entity is - * about to become the in-service queue. This function then returns - * true if entity is a queue. + * This function is invoked for an entity that is about to be set in + * service. If such an entity is a queue, then the entity is no longer + * a candidate for next service (i.e, a candidate entity to serve + * after the in-service entity is expired). The function then returns + * true. * - * In contrast, entity could still be a candidate for next service if - * it is not a queue, and has more than one active child. In fact, - * even if one of its children is about to be set in service, other - * active children may still be the next to serve, for the parent - * entity, even according to the above definition. As a consequence, a - * non-queue entity is not a candidate for next-service only if it has - * only one active child. And only if this condition holds, then this - * function returns true for a non-queue entity. + * In contrast, the entity could stil be a candidate for next service + * if it is not a queue, and has more than one child. In fact, even if + * one of its children is about to be set in service, other children + * may still be the next to serve. As a consequence, a non-queue + * entity is not a candidate for next-service only if it has only one + * child. And only if this condition holds, then the function returns + * true for a non-queue entity. 
*/ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) { @@ -225,18 +223,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); BUG_ON(bfqg->active_entities == 0); - /* - * The field active_entities does not always contain the - * actual number of active children entities: it happens to - * not account for the in-service entity in case the latter is - * removed from its active tree (which may get done after - * invoking the function bfq_no_longer_next_in_service in - * bfq_get_next_queue). Fortunately, here, i.e., while - * bfq_no_longer_next_in_service is not yet completed in - * bfq_get_next_queue, bfq_active_extract has not yet been - * invoked, and thus active_entities still coincides with the - * actual number of active entities. - */ if (bfqg->active_entities == 1) return true; @@ -1103,7 +1089,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, * one of its children receives a new request. * * Basically, this function updates the timestamps of entity and - * inserts entity into its active tree, ater possibly extracting it + * inserts entity into its active tree, ater possible extracting it * from its idle tree. */ static void __bfq_activate_entity(struct bfq_entity *entity, @@ -1227,7 +1213,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) BUG_ON(entity->tree && entity->tree != &st->active); /* * In addition, if the entity had more than one child - * when set in service, then it was not extracted from + * when set in service, then was not extracted from * the active tree. This implies that the position of * the entity in the active tree may need to be * changed now, because we have just updated the start @@ -1235,8 +1221,9 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) * time in a moment (the requeueing is then, more * precisely, a repositioning in this case). 
To * implement this repositioning, we: 1) dequeue the - * entity here, 2) update the finish time and requeue - * the entity according to the new timestamps below. + * entity here, 2) update the finish time and + * requeue the entity according to the new + * timestamps below. */ if (entity->tree) bfq_active_extract(st, entity); @@ -1283,9 +1270,9 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, /** - * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, - * and activate, requeue or reposition all ancestors - * for which such an update becomes necessary. + * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, + * and activate, requeue or reposition all ancestors + * for which such an update becomes necessary. * @entity: the entity to activate. * @non_blocking_wait_rq: true if this entity was waiting for a request * @requeue: true if this is a requeue, which implies that bfqq is @@ -1321,9 +1308,9 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, * @ins_into_idle_tree: if false, the entity will not be put into the * idle tree. * - * Deactivates an entity, independently of its previous state. Must + * Deactivates an entity, independently from its previous state. Must * be invoked only if entity is on a service tree. Extracts the entity - * from that tree, and if necessary and allowed, puts it into the idle + * from that tree, and if necessary and allowed, puts it on the idle * tree. */ static bool __bfq_deactivate_entity(struct bfq_entity *entity, @@ -1372,7 +1359,7 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, /** * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. * @entity: the entity to deactivate. 
- * @ins_into_idle_tree: true if the entity can be put into the idle tree + * @ins_into_idle_tree: true if the entity can be put on the idle tree */ static void bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree, @@ -1419,29 +1406,16 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, */ bfq_update_next_in_service(sd, NULL); - if (sd->next_in_service || sd->in_service_entity) { + if (sd->next_in_service) { /* - * The parent entity is still active, because - * either next_in_service or in_service_entity - * is not NULL. So, no further upwards - * deactivation must be performed. Yet, - * next_in_service has changed. Then the - * schedule does need to be updated upwards. - * - * NOTE If in_service_entity is not NULL, then - * next_in_service may happen to be NULL, - * although the parent entity is evidently - * active. This happens if 1) the entity - * pointed by in_service_entity is the only - * active entity in the parent entity, and 2) - * according to the definition of - * next_in_service, the in_service_entity - * cannot be considered as - * next_in_service. See the comments on the - * definition of next_in_service for details. + * The parent entity is still backlogged, + * because next_in_service is not NULL. So, no + * further upwards deactivation must be + * performed. Yet, next_in_service has + * changed. Then the schedule does need to be + * updated upwards. */ BUG_ON(sd->next_in_service == entity); - BUG_ON(sd->in_service_entity == entity); break; } @@ -1832,33 +1806,45 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) /* * If entity is no longer a candidate for next - * service, then it must be extracted from its active - * tree, so as to make sure that it won't be - * considered when computing next_in_service. See the - * comments on the function - * bfq_no_longer_next_in_service() for details. + * service, then we extract it from its active tree, + * for the following reason. 
To further boost the + * throughput in some special case, BFQ needs to know + * which is the next candidate entity to serve, while + * there is already an entity in service. In this + * respect, to make it easy to compute/update the next + * candidate entity to serve after the current + * candidate has been set in service, there is a case + * where it is necessary to extract the current + * candidate from its service tree. Such a case is + * when the entity just set in service cannot be also + * a candidate for next service. Details about when + * this conditions holds are reported in the comments + * on the function bfq_no_longer_next_in_service() + * invoked below. */ if (bfq_no_longer_next_in_service(entity)) bfq_active_extract(bfq_entity_service_tree(entity), entity); /* - * Even if entity is not to be extracted according to - * the above check, a descendant entity may get - * extracted in one of the next iterations of this - * loop. Such an event could cause a change in - * next_in_service for the level of the descendant - * entity, and thus possibly back to this level. + * For the same reason why we may have just extracted + * entity from its active tree, we may need to update + * next_in_service for the sched_data of entity too, + * regardless of whether entity has been extracted. + * In fact, even if entity has not been extracted, a + * descendant entity may get extracted. Such an event + * would cause a change in next_in_service for the + * level of the descendant entity, and thus possibly + * back to upper levels. * - * However, we cannot perform the resulting needed - * update of next_in_service for this level before the - * end of the whole loop, because, to know which is - * the correct next-to-serve candidate entity for each - * level, we need first to find the leaf entity to set - * in service. 
In fact, only after we know which is - * the next-to-serve leaf entity, we can discover - * whether the parent entity of the leaf entity - * becomes the next-to-serve, and so on. + * We cannot perform the resulting needed update + * before the end of this loop, because, to know which + * is the correct next-to-serve candidate entity for + * each level, we need first to find the leaf entity + * to set in service. In fact, only after we know + * which is the next-to-serve leaf entity, we can + * discover whether the parent entity of the leaf + * entity becomes the next-to-serve, and so on. */ /* Log some information */ diff --git a/block/bfq.h b/block/bfq.h index 141f8960ad6f..64c0b9d5950a 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -70,30 +70,17 @@ struct bfq_service_tree { * * bfq_sched_data is the basic scheduler queue. It supports three * ioprio_classes, and can be used either as a toplevel queue or as an - * intermediate queue in a hierarchical setup. + * intermediate queue on a hierarchical setup. @next_in_service + * points to the active entity of the sched_data service trees that + * will be scheduled next. It is used to reduce the number of steps + * needed for each hierarchical-schedule update. * * The supported ioprio_classes are the same as in CFQ, in descending * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. * Requests from higher priority queues are served before all the * requests from lower priority queues; among requests of the same * queue requests are served according to B-WF2Q+. 
- * - * The schedule is implemented by the service trees, plus the field - * @next_in_service, which points to the entity on the active trees - * that will be served next, if 1) no changes in the schedule occurs - * before the current in-service entity is expired, 2) the in-service - * queue becomes idle when it expires, and 3) if the entity pointed by - * in_service_entity is not a queue, then the in-service child entity - * of the entity pointed by in_service_entity becomes idle on - * expiration. This peculiar definition allows for the following - * optimization, not yet exploited: while a given entity is still in - * service, we already know which is the best candidate for next - * service among the other active entitities in the same parent - * entity. We can then quickly compare the timestamps of the - * in-service entity with those of such best candidate. - * - * All the fields are protected by the queue lock of the containing - * bfqd. + * All the fields are protected by the queue lock of the containing bfqd. */ struct bfq_sched_data { struct bfq_entity *in_service_entity; /* entity in service */ From d7de18a42273461279dfc561fa33bce91224b6dd Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:41 +0800 Subject: [PATCH 13/59] Revert "block, bfq: reset in_service_entity if it becomes idle" This reverts commit 742d69de42e18d6017cd1817aa99e6a1bf698773. 
--- block/bfq-sched.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index fdf1c713d050..b6eb25887262 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -1336,10 +1336,8 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); - if (is_in_service) { + if (is_in_service) bfq_calc_finish(entity, entity->service); - sd->in_service_entity = NULL; - } if (entity->tree == &st->active) bfq_active_extract(st, entity); From c7a2e89bb7b3668f1206433a7bccf73af8fdc79f Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:42 +0800 Subject: [PATCH 14/59] Revert "Add extra checks related to entity scheduling" This reverts commit 53db2270fb5cec47b22c944171e5ff0df29375dd. --- block/bfq-sched.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index b6eb25887262..90d2856358a1 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -812,7 +812,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, } #endif - BUG_ON(entity->tree && update_class_too); BUG_ON(old_st->wsum < entity->weight); old_st->wsum -= entity->weight; @@ -884,10 +883,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, new_st->wsum += entity->weight; - if (new_st != old_st) { - BUG_ON(!update_class_too); + if (new_st != old_st) entity->start = new_st->vtime; - } } return new_st; @@ -996,7 +993,6 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, * tree, then it is safe to invoke next function with the last * parameter set (see the comments on the function). */ - BUG_ON(entity->tree); st = __bfq_entity_update_weight_prio(st, entity, true); bfq_calc_finish(entity, entity->budget); @@ -1117,11 +1113,9 @@ static void __bfq_activate_entity(struct bfq_entity *entity, * check for that. 
*/ bfq_idle_extract(st, entity); - BUG_ON(entity->tree); entity->start = bfq_gt(min_vstart, entity->finish) ? min_vstart : entity->finish; } else { - BUG_ON(entity->tree); /* * The finish time of the entity may be invalid, and * it is in the past for sure, otherwise the queue @@ -1209,7 +1203,6 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) */ bfq_calc_finish(entity, entity->service); entity->start = entity->finish; - BUG_ON(entity->tree && entity->tree == &st->idle); BUG_ON(entity->tree && entity->tree != &st->active); /* * In addition, if the entity had more than one child From 1047fc84ba5e15de2a2c5050af4cccddf10c856c Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:44 +0800 Subject: [PATCH 15/59] Revert "BFQ-v8r12" This reverts commit d6d716d93589d7c443a34de7c554cdcedc0e1ace. --- block/bfq-iosched.c | 2 +- block/bfq.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3fdef504ce0b..51748c487c27 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5311,7 +5311,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[60] = "BFQ I/O-scheduler: v8r12"; + char msg[60] = "BFQ I/O-scheduler: v8r11"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); diff --git a/block/bfq.h b/block/bfq.h index 64c0b9d5950a..1dc2143da1ed 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ v8r12 for 4.9.0: data structures and common functions prototypes. + * BFQ v8r11 for 4.9.0: data structures and common functions prototypes. 
* * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe From 50a7bdddb0b0436623a050bd082fb43af6d3a886 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:46 +0800 Subject: [PATCH 16/59] Revert "block, bfq: don't change ioprio class for a bfq_queue on a service tree" This reverts commit 7227531003cbe193b30255be4e236455cbc111e2. --- block/bfq-iosched.c | 14 ++++---------- block/bfq-sched.c | 38 ++++---------------------------------- 2 files changed, 8 insertions(+), 44 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 51748c487c27..abe6d49e8855 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3701,17 +3701,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) } } } - /* - * To improve latency (for this or other queues), immediately - * update weight both if it must be raised and if it must be - * lowered. Since, entity may be on some active tree here, and - * might have a pending change of its ioprio class, invoke - * next function with the last parameter unset (see the - * comments on the function). - */ + /* Update weight both if it must be raised and if it must be lowered */ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) - __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), - entity, false); + __bfq_entity_update_weight_prio( + bfq_entity_service_tree(entity), + entity); } /* diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 90d2856358a1..d8efd6bad965 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -765,28 +765,9 @@ static void bfq_forget_idle(struct bfq_service_tree *st) bfq_put_idle_entity(st, first_idle); } -/* - * Update weight and priority of entity. If update_class_too is true, - * then update the ioprio_class of entity too. - * - * The reason why the update of ioprio_class is controlled through the - * last parameter is as follows. 
Changing the ioprio class of an - * entity implies changing the destination service trees for that - * entity. If such a change occurred when the entity is already on one - * of the service trees for its previous class, then the state of the - * entity would become more complex: none of the new possible service - * trees for the entity, according to bfq_entity_service_tree(), would - * match any of the possible service trees on which the entity - * is. Complex operations involving these trees, such as entity - * activations and deactivations, should take into account this - * additional complexity. To avoid this issue, this function is - * invoked with update_class_too unset in the points in the code where - * entity may happen to be on some tree. - */ static struct bfq_service_tree * __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - struct bfq_entity *entity, - bool update_class_too) + struct bfq_entity *entity) { struct bfq_service_tree *new_st = old_st; @@ -831,15 +812,9 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, bfq_weight_to_ioprio(entity->orig_weight); } - if (bfqq && update_class_too) + if (bfqq) bfqq->ioprio_class = bfqq->new_ioprio_class; - - /* - * Reset prio_changed only if the ioprio_class change - * is not pending any longer. - */ - if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) - entity->prio_changed = 0; + entity->prio_changed = 0; /* * NOTE: here we may be changing the weight too early, @@ -988,12 +963,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct bfq_sched_data *sd = entity->sched_data; - /* - * When this function is invoked, entity is not in any service - * tree, then it is safe to invoke next function with the last - * parameter set (see the comments on the function). 
- */ - st = __bfq_entity_update_weight_prio(st, entity, true); + st = __bfq_entity_update_weight_prio(st, entity); bfq_calc_finish(entity, entity->budget); /* From 47b0198ec4ebee4686d3c109f405956b12cb0d3b Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:47 +0800 Subject: [PATCH 17/59] Revert "Fix commit "don't dereference bic before null checking it"" This reverts commit 1ed63f7fb7a04a00d7068bbe8d7408ae61e1b6f2. --- block/bfq-iosched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index abe6d49e8855..ab5c196f5bbd 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4600,6 +4600,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, bool bfqq_already_existing = false, split = false; spin_lock_irqsave(q->queue_lock, flags); + bfq_check_ioprio_change(bic, bio); if (!bic) goto queue_fail; From eb0bbb4f661abe8270dbf241e33a5f499b5b1311 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:49 +0800 Subject: [PATCH 18/59] Revert "block, bfq: stress that low_latency must be off to get max throughput" This reverts commit 0e02e43e9d560fe0c90ed489555cc25272653417. --- Documentation/block/bfq-iosched.txt | 17 +---------------- block/bfq-iosched.c | 5 ----- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 0539e87962ed..13b5248eba7e 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -11,13 +11,6 @@ controllers), BFQ's main features are: groups (switching back to time distribution when needed to keep throughput high). -In its default configuration, BFQ privileges latency over -throughput. So, when needed for achieving a lower latency, BFQ builds -schedules that may lead to a lower throughput. 
If your main or only -goal, for a given device, is to achieve the maximum-possible -throughput at all times, then do switch off all low-latency heuristics -for that device, by setting low_latency to 0. Full details in Section 3. - On average CPUs, the current version of BFQ can handle devices performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a reference, 30-50 KIOPS correspond to very high bandwidths with @@ -381,19 +374,11 @@ default, low latency mode is enabled. If enabled, interactive and soft real-time applications are privileged and experience a lower latency, as explained in more detail in the description of how BFQ works. -DISABLE this mode if you need full control on bandwidth +DO NOT enable this mode if you need full control on bandwidth distribution. In fact, if it is enabled, then BFQ automatically increases the bandwidth share of privileged applications, as the main means to guarantee a lower latency to them. -In addition, as already highlighted at the beginning of this document, -DISABLE this mode if your only goal is to achieve a high throughput. -In fact, privileging the I/O of some application over the rest may -entail a lower throughput. To achieve the highest-possible throughput -on a non-rotational device, setting slice_idle to 0 may be needed too -(at the cost of giving up any strong guarantee on fairness and low -latency). - timeout_sync ------------ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ab5c196f5bbd..39bb4f5783f6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -36,11 +36,6 @@ * boost the throughput), and yet guarantee a low latency to * interactive and soft real-time applications. * - * NOTE: if the main or only goal, with a given device, is to achieve - * the maximum-possible throughput at all times, then do switch off - * all low-latency heuristics for that device, by setting low_latency - * to 0. 
- * * BFQ is described in [1], where also a reference to the initial, more * theoretical paper on BFQ can be found. The interested reader can find * in the latter paper full details on the main algorithm, as well as From dee36241f01081e9ce91213f8868d25428d82189 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:51 +0800 Subject: [PATCH 19/59] Revert "block, bfq: update wr_busy_queues if needed on a queue split" This reverts commit 612c7ed1459d62476973d7af8ce70068d3ecfbb1. --- block/bfq-iosched.c | 47 ++++++++------------------------------------- block/bfq-sched.c | 9 ++------- 2 files changed, 10 insertions(+), 46 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 39bb4f5783f6..b86c171bc3dc 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -682,12 +682,8 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) } static void -bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - struct bfq_io_cq *bic, bool bfq_already_existing) +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) { - unsigned int old_wr_coeff; - bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); - if (bic->saved_idle_window) bfq_mark_bfqq_idle_window(bfqq); else @@ -698,9 +694,6 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, else bfq_clear_bfqq_IO_bound(bfqq); - if (unlikely(busy)) - old_wr_coeff = bfqq->wr_coeff; - bfqq->wr_coeff = bic->saved_wr_coeff; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); @@ -709,8 +702,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { + time_is_before_jiffies(bfqq->last_wr_start_finish + + 
bfqq->wr_cur_max_time))) { bfq_log_bfqq(bfqq->bfqd, bfqq, "resume state: switching off wr (%lu + %lu < %lu)", bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, @@ -718,20 +711,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, bfqq->wr_coeff = 1; } - /* make sure weight will be updated, however we got here */ bfqq->entity.prio_changed = 1; - - if (likely(!busy)) - return; - - if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { - bfqd->wr_busy_queues++; - BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); - } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { - bfqd->wr_busy_queues--; - BUG_ON(bfqd->wr_busy_queues < 0); - } } static int bfqq_process_refs(struct bfq_queue *bfqq) @@ -1479,7 +1460,6 @@ static void bfq_add_request(struct request *rq) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); bfqd->wr_busy_queues++; - BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); bfqq->entity.prio_changed = 1; bfq_log_bfqq(bfqd, bfqq, "non-idle wrais starting, " @@ -1722,10 +1702,8 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { BUG_ON(!bfqq); - if (bfq_bfqq_busy(bfqq)) { + if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; - BUG_ON(bfqq->bfqd->wr_busy_queues < 0); - } bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; bfqq->last_wr_start_finish = jiffies; @@ -2104,11 +2082,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - if (bfq_bfqq_busy(new_bfqq)) { + if (bfq_bfqq_busy(new_bfqq)) bfqd->wr_busy_queues++; - BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); - } - new_bfqq->entity.prio_changed = 1; bfq_log_bfqq(bfqd, new_bfqq, "wr start after merge with %d, rais_max_time %u", @@ -2119,11 +2094,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ bfqq->wr_coeff = 1; bfqq->entity.prio_changed = 1; - if (bfq_bfqq_busy(bfqq)) { 
+ if (bfq_bfqq_busy(bfqq)) bfqd->wr_busy_queues--; - BUG_ON(bfqd->wr_busy_queues < 0); - } - } bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", @@ -4592,7 +4564,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; unsigned long flags; - bool bfqq_already_existing = false, split = false; + bool split = false; spin_lock_irqsave(q->queue_lock, flags); bfq_check_ioprio_change(bic, bio); @@ -4652,8 +4624,6 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, split = true; if (!bfqq) goto new_queue; - else - bfqq_already_existing = true; } } @@ -4679,8 +4649,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, * queue, restore the idle window and the possible * weight raising period. */ - bfq_bfqq_resume_state(bfqq, bfqd, bic, - bfqq_already_existing); + bfq_bfqq_resume_state(bfqq, bic); } } diff --git a/block/bfq-sched.c b/block/bfq-sched.c index d8efd6bad965..8311bdbeceea 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -1932,10 +1932,8 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_weights_tree_remove(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (bfqq->wr_coeff > 1) { + if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; - BUG_ON(bfqd->wr_busy_queues < 0); - } bfqg_stats_update_dequeue(bfqq_group(bfqq)); @@ -1964,9 +1962,6 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_weights_tree_add(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (bfqq->wr_coeff > 1) { + if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; - BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); - } - } From 008a63c6616301cb6c7407b407ab7c64029c35e1 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:53 +0800 Subject: [PATCH 20/59] Revert "BFQ-v8r11" This reverts commit 05028dff0e6f5f36c83132dd6e6e295efbea0082. 
--- block/bfq-iosched.c | 2 +- block/bfq.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b86c171bc3dc..bf706ca47656 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5270,7 +5270,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[60] = "BFQ I/O-scheduler: v8r11"; + char msg[60] = "BFQ I/O-scheduler: v8r10"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); diff --git a/block/bfq.h b/block/bfq.h index 1dc2143da1ed..3fdc9f0edb2f 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ v8r11 for 4.9.0: data structures and common functions prototypes. + * BFQ v8r10 for 4.9.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe From 6beac0053855de514feb04dc5ae51454d357f09b Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:55 +0800 Subject: [PATCH 21/59] Revert "block, bfq: don't dereference bic before null checking it" This reverts commit 618883a12da5ceceb4d7c07787363c9c863de462. --- block/bfq-iosched.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index bf706ca47656..b9440d952609 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4572,8 +4572,6 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, if (!bic) goto queue_fail; - bfq_check_ioprio_change(bic, bio); - bfq_bic_update_cgroup(bic, bio); new_queue: From 2924863f334d64664b0d0a860981fb8037dc6d8e Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:57 +0800 Subject: [PATCH 22/59] Revert "block, bfq: use pointer entity->sched_data only if set" This reverts commit 74af90e9f434963c4929f4ec189dd19b183c9c06. 
--- block/bfq-sched.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 8311bdbeceea..70aac56b02ef 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -1280,23 +1280,14 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) { struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st; - bool is_in_service; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + bool is_in_service = entity == sd->in_service_entity; if (!entity->on_st) { /* entity never activated, or already inactive */ - BUG_ON(sd && entity == sd->in_service_entity); + BUG_ON(entity == entity->sched_data->in_service_entity); return false; } - /* - * If we get here, then entity is active, which implies that - * bfq_group_set_parent has already been invoked for the group - * represented by entity. Therefore, the field - * entity->sched_data has been set, and we can safely use it. - */ - st = bfq_entity_service_tree(entity); - is_in_service = entity == sd->in_service_entity; - BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); if (is_in_service) From 7f1d24b50aec353a92622cc1ad6e99bec8235237 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 16:59:59 +0800 Subject: [PATCH 23/59] Revert "BFQ-v8r10" This reverts commit 3f4f96de160c79f33728291f813af02bb6a07409. 
--- block/bfq-iosched.c | 2 +- block/bfq.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b9440d952609..7fd537e4ea5a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5268,7 +5268,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[60] = "BFQ I/O-scheduler: v8r10"; + char msg[60] = "BFQ I/O-scheduler: v8r10-rc1"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); diff --git a/block/bfq.h b/block/bfq.h index 3fdc9f0edb2f..0c73102e37f4 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ v8r10 for 4.9.0: data structures and common functions prototypes. + * BFQ v8r10-rc1 for 4.11.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe From ebde97882018f425b913f34f677cb4011d38a23b Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:02 +0800 Subject: [PATCH 24/59] Revert "BFQ-v8r10-rc1" This reverts commit 2e3711c8cb89837508f29e15f616563027e631c0. --- block/bfq-iosched.c | 2 +- block/bfq.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7fd537e4ea5a..03df472e845c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5268,7 +5268,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[60] = "BFQ I/O-scheduler: v8r10-rc1"; + char msg[60] = "BFQ I/O-scheduler: v8r9"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); diff --git a/block/bfq.h b/block/bfq.h index 0c73102e37f4..4f7d0b887c34 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ v8r10-rc1 for 4.11.0: data structures and common functions prototypes. + * BFQ v8r9 for 4.10.0: data structures and common functions prototypes. 
* * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe From 23ab10a7695c76f1bbc3237bdf4f05c9e9363512 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:04 +0800 Subject: [PATCH 25/59] Revert "BUGFIX: Remove problematic check on max service duration" This reverts commit 25d0a01965694611cabfd3b234c18ae724bdd933. --- block/bfq-iosched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 03df472e845c..a9b6c1718425 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2934,8 +2934,8 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_usecs = ktime_to_us(delta_ktime); - /* don't use too short time intervals */ - if (delta_usecs < 1000) { + /* don't trust short/unrealistic values. */ + if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { if (blk_queue_nonrot(bfqd->queue)) /* * give same worst-case guarantees as idling @@ -2945,7 +2945,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, else /* charge at least one seek */ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; - bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); + bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs); return slow; } From 1840a13c34de7c7d3e48dd6750639f6fdef06ddc Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:07 +0800 Subject: [PATCH 26/59] Revert "BUGFIX: Handle failure of weight-counter allocation" This reverts commit 5241cdd021599c26cab5acaf7fb7b2f6098ff06d. 
--- block/bfq-iosched.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a9b6c1718425..98f1abb8dda5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -483,22 +483,6 @@ static void bfq_weights_tree_add(struct bfq_data *bfqd, entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), GFP_ATOMIC); - - /* - * In the unlucky event of an allocation failure, we just - * exit. This will cause the weight of entity to not be - * considered in bfq_differentiated_weights, which, in its - * turn, causes the scenario to be deemed wrongly symmetric in - * case entity's weight would have been the only weight making - * the scenario asymmetric. On the bright side, no unbalance - * will however occur when entity becomes inactive again (the - * invocation of this function is triggered by an activation - * of entity). In fact, bfq_weights_tree_remove does nothing - * if !entity->weight_counter. - */ - if (unlikely(!entity->weight_counter)) - return; - entity->weight_counter->weight = entity->weight; rb_link_node(&entity->weight_counter->weights_node, parent, new); rb_insert_color(&entity->weight_counter->weights_node, root); From ff3ddc350f1044527bf27aa6357dd25bb6ef853b Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:10 +0800 Subject: [PATCH 27/59] Revert "BFQ-v8r9" This reverts commit e99fbdb7130b68422dfd620c2e355c8ed5f3f85c. 
--- block/bfq-iosched.c | 2 +- block/bfq.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 98f1abb8dda5..9cf87ba31082 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5252,7 +5252,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[60] = "BFQ I/O-scheduler: v8r9"; + char msg[60] = "BFQ I/O-scheduler: v8r8"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); diff --git a/block/bfq.h b/block/bfq.h index 4f7d0b887c34..2a2bc303b110 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ v8r9 for 4.10.0: data structures and common functions prototypes. + * BFQ v8r8 for 4.10.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe From 3e83f1d633024f6ed1c5a3ef578f7bdcb0b619b6 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:12 +0800 Subject: [PATCH 28/59] Revert "BUGFIX: remove use of bfq queues after free" This reverts commit 830d4b4d65ba6fc0c46e31bf6d17b7dacbedcc41. 
--- block/bfq-cgroup.c | 1 + block/bfq-iosched.c | 36 +++++++---------------- block/bfq-sched.c | 71 +++++++++++++++------------------------------ 3 files changed, 35 insertions(+), 73 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index a66a7232a854..de045cff3353 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -771,6 +771,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) __bfq_deactivate_entity(entity, false); bfq_put_async_queues(bfqd, bfqg); + BUG_ON(entity->tree); /* * @blkg is going offline and will be ignored by diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9cf87ba31082..6348d5530f6a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1391,6 +1391,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQ_BFQQ_PREEMPTED); + BUG_ON(in_serv->entity.budget < 0); } } @@ -1559,10 +1560,8 @@ static void bfq_remove_request(struct request *rq) BUG_ON(bfqq->entity.budget < 0); if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { - BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ bfq_del_bfqq_busy(bfqd, bfqq, false); - /* - * bfqq emptied. In normal operation, when + /* bfqq emptied. In normal operation, when * bfqq is empty, bfqq->entity.service and * bfqq->entity.budget must contain, * respectively, the service received and the @@ -1571,8 +1570,7 @@ static void bfq_remove_request(struct request *rq) * this last removal occurred while bfqq is * not in service. To avoid inconsistencies, * reset both bfqq->entity.service and - * bfqq->entity.budget, if bfqq has still a - * process that may issue I/O requests to it. + * bfqq->entity.budget. 
*/ bfqq->entity.budget = bfqq->entity.service = 0; } @@ -2064,8 +2062,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, new_bfqq->wr_coeff = bfqq->wr_coeff; new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; - new_bfqq->wr_start_at_switch_to_srt = - bfqq->wr_start_at_switch_to_srt; + new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; if (bfq_bfqq_busy(new_bfqq)) bfqd->wr_busy_queues++; new_bfqq->entity.prio_changed = 1; @@ -2108,7 +2105,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, */ new_bfqq->bic = NULL; bfqq->bic = NULL; - /* release process reference to bfqq */ bfq_put_queue(bfqq); } @@ -3081,7 +3077,6 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, bool slow; unsigned long delta = 0; struct bfq_entity *entity = &bfqq->entity; - int ref; BUG_ON(bfqq != bfqd->in_service_queue); @@ -3189,15 +3184,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); BUG_ON(bfqq->next_rq == NULL && bfqq->entity.budget < bfqq->entity.service); - ref = bfqq->ref; __bfq_bfqq_expire(bfqd, bfqq); - BUG_ON(ref > 1 && - !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && + BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && !bfq_class_idle(bfqq)); - /* mark bfqq as waiting a request only if a bic still points to it */ - if (ref > 1 && !bfq_bfqq_busy(bfqq) && + if (!bfq_bfqq_busy(bfqq) && reason != BFQ_BFQQ_BUDGET_TIMEOUT && reason != BFQ_BFQQ_BUDGET_EXHAUSTED) bfq_mark_bfqq_non_blocking_wait_rq(bfqq); @@ -3817,8 +3809,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) * Task holds one reference to the queue, dropped when task exits. Each rq * in-flight on this queue also holds a reference, dropped when rq is freed. * - * Queue lock must be held here. Recall not to use bfqq after calling - * this function on it. + * Queue lock must be held here. 
*/ static void bfq_put_queue(struct bfq_queue *bfqq) { @@ -3887,7 +3878,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_cooperator(bfqq); - bfq_put_queue(bfqq); /* release process reference */ + bfq_put_queue(bfqq); } static void bfq_init_icq(struct io_cq *icq) @@ -3986,7 +3977,6 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { - /* release process reference on this queue */ bfq_put_queue(bfqq); bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); bic_set_bfqq(bic, bfqq, false); @@ -4120,7 +4110,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, } out: - bfqq->ref++; /* get a process reference to this queue */ + bfqq->ref++; bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); rcu_read_unlock(); return bfqq; @@ -4294,14 +4284,10 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) bfqq->allocated[rq_data_dir(rq)]--; new_bfqq->ref++; bfq_clear_bfqq_just_created(bfqq); + bfq_put_queue(bfqq); if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); - /* - * rq is about to be enqueued into new_bfqq, - * release rq reference on bfqq - */ - bfq_put_queue(bfqq); rq->elv.priv[1] = new_bfqq; bfqq = new_bfqq; } @@ -4722,7 +4708,7 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) } static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_queue **bfqq_ptr) + struct bfq_queue **bfqq_ptr) { struct bfq_group *root_group = bfqd->root_group; struct bfq_queue *bfqq = *bfqq_ptr; diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 70aac56b02ef..2e9dc59de0ed 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -154,13 +154,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, #define for_each_entity(entity) \ for (; entity ; entity = entity->parent) -/* - * For each iteration, compute parent in advance, so as to be safe if - * entity 
is deallocated during the iteration. Such a deallocation may - * happen as a consequence of a bfq_put_queue that frees the bfq_queue - * containing entity. - */ -#define for_each_entity_safe(entity, parent) \ +#define for_each_entity_safe(entity, parent) \ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) /* @@ -697,31 +691,27 @@ static void bfq_idle_insert(struct bfq_service_tree *st, } /** - * bfq_forget_entity - do not consider entity any longer for scheduling + * bfq_forget_entity - remove an entity from the wfq trees. * @st: the service tree. * @entity: the entity being removed. - * @is_in_service: true if entity is currently the in-service entity. * - * Forget everything about @entity. In addition, if entity represents - * a queue, and the latter is not in service, then release the service - * reference to the queue (the one taken through bfq_get_entity). In - * fact, in this case, there is really no more service reference to - * the queue, as the latter is also outside any service tree. If, - * instead, the queue is in service, then __bfq_bfqd_reset_in_service - * will take care of putting the reference when the queue finally - * stops being served. + * Update the device status and forget everything about @entity, putting + * the device reference to it, if it is a queue. Entities belonging to + * groups are not refcounted. 
*/ static void bfq_forget_entity(struct bfq_service_tree *st, - struct bfq_entity *entity, - bool is_in_service) + struct bfq_entity *entity) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_sched_data *sd; + BUG_ON(!entity->on_st); entity->on_st = false; st->wsum -= entity->weight; - if (bfqq && !is_in_service) { - bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", + if (bfqq) { + sd = entity->sched_data; + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", bfqq, bfqq->ref); bfq_put_queue(bfqq); } @@ -736,8 +726,7 @@ static void bfq_put_idle_entity(struct bfq_service_tree *st, struct bfq_entity *entity) { bfq_idle_extract(st, entity); - bfq_forget_entity(st, entity, - entity == entity->sched_data->in_service_entity); + bfq_forget_entity(st, entity); } /** @@ -1093,12 +1082,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity, */ entity->start = min_vstart; st->wsum += entity->weight; - /* - * entity is about to be inserted into a service tree, - * and then set in service: get a reference to make - * sure entity does not disappear until it is no - * longer in service or scheduled for service. 
- */ bfq_get_entity(entity); BUG_ON(entity->on_st && bfqq); @@ -1281,27 +1264,27 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, { struct bfq_sched_data *sd = entity->sched_data; struct bfq_service_tree *st = bfq_entity_service_tree(entity); - bool is_in_service = entity == sd->in_service_entity; + bool was_in_service = entity == sd->in_service_entity; if (!entity->on_st) { /* entity never activated, or already inactive */ BUG_ON(entity == entity->sched_data->in_service_entity); return false; } - BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); + BUG_ON(was_in_service && entity->tree && entity->tree != &st->active); - if (is_in_service) + if (was_in_service) bfq_calc_finish(entity, entity->service); if (entity->tree == &st->active) bfq_active_extract(st, entity); - else if (!is_in_service && entity->tree == &st->idle) + else if (!was_in_service && entity->tree == &st->idle) bfq_idle_extract(st, entity); else if (entity->tree) BUG(); if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) - bfq_forget_entity(st, entity, is_in_service); + bfq_forget_entity(st, entity); else bfq_idle_insert(st, entity); @@ -1337,8 +1320,8 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { /* - * entity is not in any tree any more, so - * this deactivation is a no-op, and there is + * Entity is not any tree any more, so, this + * deactivation is a no-op, and there is * nothing to change for upper-level entities * (in case of expiration, this can never * happen). 
@@ -1838,16 +1821,14 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) { - struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; - struct bfq_entity *entity = in_serv_entity; + struct bfq_entity *entity = &bfqd->in_service_queue->entity; if (bfqd->in_service_bic) { put_io_context(bfqd->in_service_bic->icq.ioc); bfqd->in_service_bic = NULL; } - bfq_clear_bfqq_wait_request(in_serv_bfqq); + bfq_clear_bfqq_wait_request(bfqd->in_service_queue); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqd->in_service_queue = NULL; @@ -1859,14 +1840,6 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) */ for_each_entity(entity) entity->sched_data->in_service_entity = NULL; - - /* - * in_serv_entity is no longer in service, so, if it is in no - * service tree either, then release the service reference to - * the queue it represents (taken with bfq_get_entity). - */ - if (!in_serv_entity->on_st) - bfq_put_queue(in_serv_bfqq); } static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -1931,6 +1904,8 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, BUG_ON(bfqq->entity.budget < 0); bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); + + BUG_ON(bfqq->entity.budget < 0); } /* From da6b37653b88dfe1e2fac67d98c1db473c0f92a0 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:15 +0800 Subject: [PATCH 29/59] Revert "BFQ-v8r8" This reverts commit 3037a778b1ee1a479a1bb09340c0a668e343ca4a. 
--- block/bfq-iosched.c | 2 +- block/bfq.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6348d5530f6a..5bfeb1638578 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5238,7 +5238,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[60] = "BFQ I/O-scheduler: v8r8"; + char msg[60] = "BFQ I/O-scheduler: v8r8-rc2"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); diff --git a/block/bfq.h b/block/bfq.h index 2a2bc303b110..a08e8a6f0a36 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ v8r8 for 4.10.0: data structures and common functions prototypes. + * BFQ v8r8-rc2 for 4.10.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe From b0525e68fc5eb8bffa4a4943507cebb228e0449c Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:17 +0800 Subject: [PATCH 30/59] Revert "block/bfq-cgroup: fix bfq_bic_update_cgroup() API" This reverts commit 171598f7bd80a8f4b36e5a75adc130daf9ff0353. 
--- block/bfq-cgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index de045cff3353..a5f8dc16900a 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -1157,7 +1157,13 @@ static void bfq_init_entity(struct bfq_entity *entity, entity->sched_data = &bfqg->sched_data; } -static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} +static struct bfq_group * +bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + + return bfqd->root_group; +} static void bfq_end_wr_async(struct bfq_data *bfqd) { From 6f597696ee9df755f1bd4bb305b2ab646df11172 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:20 +0800 Subject: [PATCH 31/59] Revert "BFQ-v8r8-rc2" This reverts commit 23e8e99d65e6d1e8a67d9829306d96c375c594ac. --- block/bfq-iosched.c | 4 ++-- block/bfq.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 5bfeb1638578..27969273bf26 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -9,7 +9,7 @@ * * Copyright (C) 2015 Paolo Valente * - * Copyright (C) 2017 Paolo Valente + * Copyright (C) 2016 Paolo Valente * * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ * file. @@ -5238,7 +5238,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[60] = "BFQ I/O-scheduler: v8r8-rc2"; + char msg[60] = "BFQ I/O-scheduler: v8r8-rc1"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); diff --git a/block/bfq.h b/block/bfq.h index a08e8a6f0a36..7b12f3ce5093 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ v8r8-rc2 for 4.10.0: data structures and common functions prototypes. + * BFQ v8r8-rc1 for 4.10.0: data structures and common functions prototypes. 
* * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe @@ -9,7 +9,7 @@ * * Copyright (C) 2015 Paolo Valente * - * Copyright (C) 2017 Paolo Valente + * Copyright (C) 2016 Paolo Valente */ #ifndef _BFQ_H From 9e734e8287c41866ea17673aebada6bbe554f880 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:22 +0800 Subject: [PATCH 32/59] Revert "Avoid a second dispatch in case of budget exhaustion" This reverts commit ea1b2efe3d18479169afa8290b21a2b2afcd59b3. --- block/bfq-iosched.c | 122 +++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 27969273bf26..7ffc16743a2f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -519,45 +519,13 @@ static void bfq_weights_tree_remove(struct bfq_data *bfqd, entity->weight_counter = NULL; } -/* - * Return expired entry, or NULL to just start from scratch in rbtree. - */ -static struct request *bfq_check_fifo(struct bfq_queue *bfqq, - struct request *last) -{ - struct request *rq; - - if (bfq_bfqq_fifo_expire(bfqq)) - return NULL; - - bfq_mark_bfqq_fifo_expire(bfqq); - - rq = rq_entry_fifo(bfqq->fifo.next); - - if (rq == last || ktime_get_ns() < rq->fifo_time) - return NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - return rq; -} - static struct request *bfq_find_next_rq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *last) { struct rb_node *rbnext = rb_next(&last->rb_node); struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next, *prev = NULL; - - BUG_ON(list_empty(&bfqq->fifo)); - - /* Follow expired path, else get first next available. 
*/ - next = bfq_check_fifo(bfqq, last); - if (next) { - BUG_ON(next == last); - return next; - } + struct request *next = NULL, *prev = NULL; BUG_ON(RB_EMPTY_NODE(&last->rb_node)); @@ -1555,12 +1523,11 @@ static void bfq_remove_request(struct request *rq) elv_rb_del(&bfqq->sort_list, rq); if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - bfqq->next_rq = NULL; - BUG_ON(bfqq->entity.budget < 0); if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { bfq_del_bfqq_busy(bfqd, bfqq, false); + /* bfqq emptied. In normal operation, when * bfqq is empty, bfqq->entity.service and * bfqq->entity.budget must contain, @@ -2649,6 +2616,29 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) elv_dispatch_sort(q, rq); } +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq) +{ + struct request *rq = NULL; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + if (list_empty(&bfqq->fifo)) + return NULL; + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (ktime_get_ns() < rq->fifo_time) + return NULL; + + return rq; +} + static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { BUG_ON(bfqq != bfqd->in_service_queue); @@ -3514,29 +3504,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) !bfq_bfqq_must_idle(bfqq)) goto expire; -check_queue: - /* - * This loop is rarely executed more than once. Even when it - * happens, it is much more convenient to re-execute this loop - * than to return NULL and trigger a new dispatch to get a - * request served. - */ next_rq = bfqq->next_rq; /* * If bfqq has requests queued and it has enough budget left to * serve them, keep the queue, otherwise expire it. 
*/ if (next_rq) { - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - if (bfq_serv_to_charge(next_rq, bfqq) > bfq_bfqq_budget_left(bfqq)) { - /* - * Expire the queue for budget exhaustion, - * which makes sure that the next budget is - * enough to serve the next request, even if - * it comes from the fifo expired path. - */ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; goto expire; } else { @@ -3584,16 +3559,9 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_bfqq_expire(bfqd, bfqq, false, reason); new_queue: bfqq = bfq_set_in_service_queue(bfqd); - if (bfqq) { - bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); - goto check_queue; - } + bfq_log(bfqd, "select_queue: new queue %d returned", + bfqq ? bfqq->pid : 0); keep_queue: - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); - else - bfq_log(bfqd, "select_queue: no queue returned"); - return bfqq; } @@ -3659,17 +3627,45 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, struct bfq_queue *bfqq) { int dispatched = 0; - struct request *rq = bfqq->next_rq; + struct request *rq; unsigned long service_to_charge; BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - BUG_ON(!rq); + + /* Follow expired path, else get first next available. */ + rq = bfq_check_fifo(bfqq); + if (!rq) + rq = bfqq->next_rq; service_to_charge = bfq_serv_to_charge(rq, bfqq); - BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { + /* + * This may happen if the next rq is chosen in fifo order + * instead of sector order. The budget is properly + * dimensioned to be always sufficient to serve the next + * request only if it is chosen in sector order. The reason + * is that it would be quite inefficient and little useful + * to always make sure that the budget is large enough to + * serve even the possible next rq in fifo order. + * In fact, requests are seldom served in fifo order. 
+ * + * Expire the queue for budget exhaustion, and make sure + * that the next act_budget is enough to serve the next + * request, even if it comes from the fifo expired path. + */ + bfqq->next_rq = rq; + /* + * Since this dispatch is failed, make sure that + * a new one will be performed + */ + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + goto expire; + } BUG_ON(bfqq->entity.budget < bfqq->entity.service); - + /* Finally, insert request into driver dispatch list. */ bfq_bfqq_served(bfqq, service_to_charge); BUG_ON(bfqq->entity.budget < bfqq->entity.service); From 068865d24908a0dce177621f8e1bac441a42bf8a Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:24 +0800 Subject: [PATCH 33/59] Revert "BFQ-v8r8-rc1" This reverts commit 2bc2623c70fadd32fd945575717fcf9b7deae67a. --- block/bfq-iosched.c | 2 +- block/bfq.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7ffc16743a2f..ba82d8f30a6a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5234,7 +5234,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[60] = "BFQ I/O-scheduler: v8r8-rc1"; + char msg[60] = "BFQ I/O-scheduler: v8r7"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); diff --git a/block/bfq.h b/block/bfq.h index 7b12f3ce5093..bef8244cc03f 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ v8r8-rc1 for 4.10.0: data structures and common functions prototypes. + * BFQ v8r7 for 4.9.0: data structures and common functions prototypes. 
* * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe From 3cead70391e15b12f91a404826752176ff4ee725 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:26 +0800 Subject: [PATCH 34/59] Revert "Fix check of the percentage of sequential dispatches" This reverts commit a9cddfd9e0dd03230c873d65d79c7c1661a2c08e. --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ba82d8f30a6a..82c07a82cd8f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2400,7 +2400,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) * total, and rate is below the current estimated peak rate * - rate is unreasonably high (> 20M sectors/sec) */ - if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && + if ((bfqd->peak_rate_samples > (3 * bfqd->sequential_samples)>>2 && rate <= bfqd->peak_rate) || rate > 20< Date: Sat, 7 Mar 2026 17:00:28 +0800 Subject: [PATCH 35/59] Revert "Better tune weight-raising for slow flash-based devices" This reverts commit 22edd6284f1b6ae156f10b69d4eb5d0cf2f94e34. --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 82c07a82cd8f..517f513895af 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5260,7 +5260,7 @@ static int __init bfq_init(void) * be run for a long time. 
*/ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ - T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ + T_slow[1] = msecs_to_jiffies(1000); /* actually 1.5 sec */ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ From 1477958f98d0fc0ec8ae9f7c57a11c0c23980a9d Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:30 +0800 Subject: [PATCH 36/59] Revert "BUGFIX: Put async queues on exit also without cgroups" This reverts commit 53ba58bb56b02f513a3f6c4b77375f62461f2b38. --- block/bfq-cgroup.c | 3 --- block/bfq-iosched.c | 11 +++-------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index a5f8dc16900a..bbaecd00449e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -1140,9 +1140,6 @@ static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) {} - static void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) { diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 517f513895af..98a1acd870b5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4093,13 +4093,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, * prune it. */ if (async_bfqq) { - bfqq->ref++; /* - * Extra group reference, w.r.t. sync - * queue. This extra reference is removed - * only if bfqq->bfqg disappears, to - * guarantee that this queue is not freed - * until its group goes away. 
- */ + bfqq->ref++; bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", bfqq, bfqq->ref); *async_bfqq = bfqq; @@ -4703,6 +4697,7 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) cancel_work_sync(&bfqd->unplug_work); } +#ifdef CONFIG_BFQ_GROUP_IOSCHED static void __bfq_put_async_bfqq(struct bfq_data *bfqd, struct bfq_queue **bfqq_ptr) { @@ -4735,6 +4730,7 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); } +#endif static void bfq_exit_queue(struct elevator_queue *e) { @@ -4759,7 +4755,6 @@ static void bfq_exit_queue(struct elevator_queue *e) #ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_deactivate_policy(q, &blkcg_policy_bfq); #else - bfq_put_async_queues(bfqd, bfqd->root_group); kfree(bfqd->root_group); #endif From 3b3d9c4612c4414fb342b24dbda3b3eac38a0bb9 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:32 +0800 Subject: [PATCH 37/59] Revert "Add a ton of forgotten static qualifiers" This reverts commit 0827cbbc131f08d83509354d192c2c8ee221a2ce. --- block/bfq-iosched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 98a1acd870b5..2a2c130df35a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -107,7 +107,7 @@ static const int bfq_async_charge_factor = 10; /* Default timeout values, in jiffies, approximating CFQ defaults. */ static const int bfq_timeout = (HZ / 8); -static struct kmem_cache *bfq_pool; +struct kmem_cache *bfq_pool; /* Below this threshold (in ns), we consider thinktime immediate. */ #define BFQ_MIN_TT (2 * NSEC_PER_MSEC) @@ -1868,7 +1868,7 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, * positives. In case bfqq is weight-raised, such false positives * would evidently degrade latency guarantees for bfqq. 
*/ -static bool wr_from_too_long(struct bfq_queue *bfqq) +bool wr_from_too_long(struct bfq_queue *bfqq) { return bfqq->wr_coeff > 1 && time_is_before_jiffies(bfqq->last_wr_start_finish + @@ -2298,7 +2298,7 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) * function of the estimated peak rate. See comments on * bfq_calc_max_budget(), and on T_slow and T_fast arrays. */ -static void update_thr_responsiveness_params(struct bfq_data *bfqd) +void update_thr_responsiveness_params(struct bfq_data *bfqd) { int dev_type = blk_queue_nonrot(bfqd->queue); @@ -2333,7 +2333,7 @@ static void update_thr_responsiveness_params(struct bfq_data *bfqd) BFQ_RATE_SHIFT); } -static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) +void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { if (rq != NULL) { /* new rq dispatch now, reset accordingly */ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; @@ -2350,7 +2350,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq bfqd->tot_sectors_dispatched); } -static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) +void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) { u32 rate, weight, divisor; @@ -2515,7 +2515,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) * of the observed dispatch rate. The function assumes to be invoked * on every request dispatch. */ -static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) +void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) { u64 now_ns = ktime_get_ns(); From 19f0a47b17690c9f09e0ac94597b29e0ade190da Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:34 +0800 Subject: [PATCH 38/59] Revert "Remove wrong compilation warning" This reverts commit 3a4cb6003869a0380139c7a945e169ab8f132f3e. 
--- block/bfq-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 2e9dc59de0ed..797bce75db01 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -1301,7 +1301,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, bool expiration) { struct bfq_sched_data *sd; - struct bfq_entity *parent = NULL; + struct bfq_entity *parent; for_each_entity_safe(entity, parent) { sd = entity->sched_data; From e83e01a9b69f698c92f37c96419445f095c1d7ac Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:36 +0800 Subject: [PATCH 39/59] Revert "Turn into BFQ-v8r7 for 4.9.0" This reverts commit 659c7d1932982e18d9f0bbf3d1cd816216b8d9e0. --- Documentation/block/00-INDEX | 2 - Documentation/block/bfq-iosched.txt | 530 ----- block/Kconfig.iosched | 18 +- block/bfq-cgroup.c | 501 ++-- block/bfq-iosched.c | 3278 ++++++++++----------------- block/bfq-sched.c | 1288 +++-------- block/bfq.h | 800 +++---- 7 files changed, 2097 insertions(+), 4320 deletions(-) delete mode 100644 Documentation/block/bfq-iosched.txt diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index f8614b3d49f9..a542b9f2a30d 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -1,7 +1,5 @@ 00-INDEX - This file -bfq-iosched.txt - - BFQ IO scheduler and its tunables biodoc.txt - Notes on the Generic Block Layer Rewrite in Linux 2.5 biovecs.txt diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt deleted file mode 100644 index 13b5248eba7e..000000000000 --- a/Documentation/block/bfq-iosched.txt +++ /dev/null @@ -1,530 +0,0 @@ -BFQ (Budget Fair Queueing) -========================== - -BFQ is a proportional-share I/O scheduler, with some extra -low-latency capabilities. 
In addition to cgroups support (blkio or io -controllers), BFQ's main features are: -- BFQ guarantees a high system and application responsiveness, and a - low latency for time-sensitive applications, such as audio or video - players; -- BFQ distributes bandwidth, and not just time, among processes or - groups (switching back to time distribution when needed to keep - throughput high). - -On average CPUs, the current version of BFQ can handle devices -performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a -reference, 30-50 KIOPS correspond to very high bandwidths with -sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and -to 120-200 MB/s with 4KB random I/O. - -The table of contents follow. Impatients can just jump to Section 3. - -CONTENTS - -1. When may BFQ be useful? - 1-1 Personal systems - 1-2 Server systems -2. How does BFQ work? -3. What are BFQ's tunable? -4. BFQ group scheduling - 4-1 Service guarantees provided - 4-2 Interface - -1. When may BFQ be useful? -========================== - -BFQ provides the following benefits on personal and server systems. - -1-1 Personal systems --------------------- - -Low latency for interactive applications - -Regardless of the actual background workload, BFQ guarantees that, for -interactive tasks, the storage device is virtually as responsive as if -it was idle. For example, even if one or more of the following -background workloads are being executed: -- one or more large files are being read, written or copied, -- a tree of source files is being compiled, -- one or more virtual machines are performing I/O, -- a software update is in progress, -- indexing daemons are scanning filesystems and updating their - databases, -starting an application or loading a file from within an application -takes about the same time as if the storage device was idle. 
As a -comparison, with CFQ, NOOP or DEADLINE, and in the same conditions, -applications experience high latencies, or even become unresponsive -until the background workload terminates (also on SSDs). - -Low latency for soft real-time applications - -Also soft real-time applications, such as audio and video -players/streamers, enjoy a low latency and a low drop rate, regardless -of the background I/O workload. As a consequence, these applications -do not suffer from almost any glitch due to the background workload. - -Higher speed for code-development tasks - -If some additional workload happens to be executed in parallel, then -BFQ executes the I/O-related components of typical code-development -tasks (compilation, checkout, merge, ...) much more quickly than CFQ, -NOOP or DEADLINE. - -High throughput - -On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and -up to 150% higher throughput than DEADLINE and NOOP, with all the -sequential workloads considered in our tests. With random workloads, -and with all the workloads on flash-based devices, BFQ achieves, -instead, about the same throughput as the other schedulers. - -Strong fairness, bandwidth and delay guarantees - -BFQ distributes the device throughput, and not just the device time, -among I/O-bound applications in proportion their weights, with any -workload and regardless of the device parameters. From these bandwidth -guarantees, it is possible to compute tight per-I/O-request delay -guarantees by a simple formula. If not configured for strict service -guarantees, BFQ switches to time-based resource sharing (only) for -applications that would otherwise cause a throughput loss. - -1-2 Server systems ------------------- - -Most benefits for server systems follow from the same service -properties as above. In particular, regardless of whether additional, -possibly heavy workloads are being served, BFQ guarantees: - -. audio and video-streaming with zero or very low jitter and drop - rate; - -. 
fast retrieval of WEB pages and embedded objects; - -. real-time recording of data in live-dumping applications (e.g., - packet logging); - -. responsiveness in local and remote access to a server. - - -2. How does BFQ work? -===================== - -BFQ is a proportional-share I/O scheduler, whose general structure, -plus a lot of code, are borrowed from CFQ. - -- Each process doing I/O on a device is associated with a weight and a - (bfq_)queue. - -- BFQ grants exclusive access to the device, for a while, to one queue - (process) at a time, and implements this service model by - associating every queue with a budget, measured in number of - sectors. - - - After a queue is granted access to the device, the budget of the - queue is decremented, on each request dispatch, by the size of the - request. - - - The in-service queue is expired, i.e., its service is suspended, - only if one of the following events occurs: 1) the queue finishes - its budget, 2) the queue empties, 3) a "budget timeout" fires. - - - The budget timeout prevents processes doing random I/O from - holding the device for too long and dramatically reducing - throughput. - - - Actually, as in CFQ, a queue associated with a process issuing - sync requests may not be expired immediately when it empties. In - contrast, BFQ may idle the device for a short time interval, - giving the process the chance to go on being served if it issues - a new request in time. Device idling typically boosts the - throughput on rotational devices, if processes do synchronous - and sequential I/O. In addition, under BFQ, device idling is - also instrumental in guaranteeing the desired throughput - fraction to processes issuing sync requests (see the description - of the slice_idle tunable in this document, or [1, 2], for more - details). 
- - - With respect to idling for service guarantees, if several - processes are competing for the device at the same time, but - all processes (and groups, after the following commit) have - the same weight, then BFQ guarantees the expected throughput - distribution without ever idling the device. Throughput is - thus as high as possible in this common scenario. - - - If low-latency mode is enabled (default configuration), BFQ - executes some special heuristics to detect interactive and soft - real-time applications (e.g., video or audio players/streamers), - and to reduce their latency. The most important action taken to - achieve this goal is to give to the queues associated with these - applications more than their fair share of the device - throughput. For brevity, we call just "weight-raising" the whole - sets of actions taken by BFQ to privilege these queues. In - particular, BFQ provides a milder form of weight-raising for - interactive applications, and a stronger form for soft real-time - applications. - - - BFQ automatically deactivates idling for queues born in a burst of - queue creations. In fact, these queues are usually associated with - the processes of applications and services that benefit mostly - from a high throughput. Examples are systemd during boot, or git - grep. - - - As CFQ, BFQ merges queues performing interleaved I/O, i.e., - performing random I/O that becomes mostly sequential if - merged. Differently from CFQ, BFQ achieves this goal with a more - reactive mechanism, called Early Queue Merge (EQM). EQM is so - responsive in detecting interleaved I/O (cooperating processes), - that it enables BFQ to achieve a high throughput, by queue - merging, even for queues for which CFQ needs a different - mechanism, preemption, to get a high throughput. As such EQM is a - unified mechanism to achieve a high throughput with interleaved - I/O. 
- - - Queues are scheduled according to a variant of WF2Q+, named - B-WF2Q+, and implemented using an augmented rb-tree to preserve an - O(log N) overall complexity. See [2] for more details. B-WF2Q+ is - also ready for hierarchical scheduling. However, for a cleaner - logical breakdown, the code that enables and completes - hierarchical support is provided in the next commit, which focuses - exactly on this feature. - - - B-WF2Q+ guarantees a tight deviation with respect to an ideal, - perfectly fair, and smooth service. In particular, B-WF2Q+ - guarantees that each queue receives a fraction of the device - throughput proportional to its weight, even if the throughput - fluctuates, and regardless of: the device parameters, the current - workload and the budgets assigned to the queue. - - - The last, budget-independence, property (although probably - counterintuitive in the first place) is definitely beneficial, for - the following reasons: - - - First, with any proportional-share scheduler, the maximum - deviation with respect to an ideal service is proportional to - the maximum budget (slice) assigned to queues. As a consequence, - BFQ can keep this deviation tight not only because of the - accurate service of B-WF2Q+, but also because BFQ *does not* - need to assign a larger budget to a queue to let the queue - receive a higher fraction of the device throughput. - - - Second, BFQ is free to choose, for every process (queue), the - budget that best fits the needs of the process, or best - leverages the I/O pattern of the process. In particular, BFQ - updates queue budgets with a simple feedback-loop algorithm that - allows a high throughput to be achieved, while still providing - tight latency guarantees to time-sensitive applications. 
When - the in-service queue expires, this algorithm computes the next - budget of the queue so as to: - - - Let large budgets be eventually assigned to the queues - associated with I/O-bound applications performing sequential - I/O: in fact, the longer these applications are served once - got access to the device, the higher the throughput is. - - - Let small budgets be eventually assigned to the queues - associated with time-sensitive applications (which typically - perform sporadic and short I/O), because, the smaller the - budget assigned to a queue waiting for service is, the sooner - B-WF2Q+ will serve that queue (Subsec 3.3 in [2]). - -- If several processes are competing for the device at the same time, - but all processes and groups have the same weight, then BFQ - guarantees the expected throughput distribution without ever idling - the device. It uses preemption instead. Throughput is then much - higher in this common scenario. - -- ioprio classes are served in strict priority order, i.e., - lower-priority queues are not served as long as there are - higher-priority queues. Among queues in the same class, the - bandwidth is distributed in proportion to the weight of each - queue. A very thin extra bandwidth is however guaranteed to - the Idle class, to prevent it from starving. - - -3. What are BFQ's tunable? -========================== - -The tunables back_seek-max, back_seek_penalty, fifo_expire_async and -fifo_expire_sync below are the same as in CFQ. Their description is -just copied from that for CFQ. Some considerations in the description -of slice_idle are copied from CFQ too. - -per-process ioprio and weight ------------------------------ - -Unless the cgroups interface is used (see "4. BFQ group scheduling"), -weights can be assigned to processes only indirectly, through I/O -priorities, and according to the relation: -weight = (IOPRIO_BE_NR - ioprio) * 10. 
- -Beware that, if low-latency is set, then BFQ automatically raises the -weight of the queues associated with interactive and soft real-time -applications. Unset this tunable if you need/want to control weights. - -slice_idle ----------- - -This parameter specifies how long BFQ should idle for next I/O -request, when certain sync BFQ queues become empty. By default -slice_idle is a non-zero value. Idling has a double purpose: boosting -throughput and making sure that the desired throughput distribution is -respected (see the description of how BFQ works, and, if needed, the -papers referred there). - -As for throughput, idling can be very helpful on highly seeky media -like single spindle SATA/SAS disks where we can cut down on overall -number of seeks and see improved throughput. - -Setting slice_idle to 0 will remove all the idling on queues and one -should see an overall improved throughput on faster storage devices -like multiple SATA/SAS disks in hardware RAID configuration. - -So depending on storage and workload, it might be useful to set -slice_idle=0. In general for SATA/SAS disks and software RAID of -SATA/SAS disks keeping slice_idle enabled should be useful. For any -configurations where there are multiple spindles behind single LUN -(Host based hardware RAID controller or for storage arrays), setting -slice_idle=0 might end up in better throughput and acceptable -latencies. - -Idling is however necessary to have service guarantees enforced in -case of differentiated weights or differentiated I/O-request lengths. -To see why, suppose that a given BFQ queue A must get several I/O -requests served for each request served for another queue B. Idling -ensures that, if A makes a new I/O request slightly after becoming -empty, then no request of B is dispatched in the middle, and thus A -does not lose the possibility to get more than one request dispatched -before the next request of B is dispatched. 
Note that idling -guarantees the desired differentiated treatment of queues only in -terms of I/O-request dispatches. To guarantee that the actual service -order then corresponds to the dispatch order, the strict_guarantees -tunable must be set too. - -There is an important flipside for idling: apart from the above cases -where it is beneficial also for throughput, idling can severely impact -throughput. One important case is random workload. Because of this -issue, BFQ tends to avoid idling as much as possible, when it is not -beneficial also for throughput. As a consequence of this behavior, and -of further issues described for the strict_guarantees tunable, -short-term service guarantees may be occasionally violated. And, in -some cases, these guarantees may be more important than guaranteeing -maximum throughput. For example, in video playing/streaming, a very -low drop rate may be more important than maximum throughput. In these -cases, consider setting the strict_guarantees parameter. - -strict_guarantees ------------------ - -If this parameter is set (default: unset), then BFQ - -- always performs idling when the in-service queue becomes empty; - -- forces the device to serve one I/O request at a time, by dispatching a - new request only if there is no outstanding request. - -In the presence of differentiated weights or I/O-request sizes, both -the above conditions are needed to guarantee that every BFQ queue -receives its allotted share of the bandwidth. The first condition is -needed for the reasons explained in the description of the slice_idle -tunable. The second condition is needed because all modern storage -devices reorder internally-queued requests, which may trivially break -the service guarantees enforced by the I/O scheduler. - -Setting strict_guarantees may evidently affect throughput. - -back_seek_max -------------- - -This specifies, given in Kbytes, the maximum "distance" for backward seeking. 
-The distance is the amount of space from the current head location to the -sectors that are backward in terms of distance. - -This parameter allows the scheduler to anticipate requests in the "backward" -direction and consider them as being the "next" if they are within this -distance from the current head location. - -back_seek_penalty ------------------ - -This parameter is used to compute the cost of backward seeking. If the -backward distance of request is just 1/back_seek_penalty from a "front" -request, then the seeking cost of two requests is considered equivalent. - -So scheduler will not bias toward one or the other request (otherwise scheduler -will bias toward front request). Default value of back_seek_penalty is 2. - -fifo_expire_async ------------------ - -This parameter is used to set the timeout of asynchronous requests. Default -value of this is 248ms. - -fifo_expire_sync ----------------- - -This parameter is used to set the timeout of synchronous requests. Default -value of this is 124ms. In case to favor synchronous requests over asynchronous -one, this value should be decreased relative to fifo_expire_async. - -low_latency ------------ - -This parameter is used to enable/disable BFQ's low latency mode. By -default, low latency mode is enabled. If enabled, interactive and soft -real-time applications are privileged and experience a lower latency, -as explained in more detail in the description of how BFQ works. - -DO NOT enable this mode if you need full control on bandwidth -distribution. In fact, if it is enabled, then BFQ automatically -increases the bandwidth share of privileged applications, as the main -means to guarantee a lower latency to them. - -timeout_sync ------------- - -Maximum amount of device time that can be given to a task (queue) once -it has been selected for service. On devices with costly seeks, -increasing this time usually increases maximum throughput. 
On the -opposite end, increasing this time coarsens the granularity of the -short-term bandwidth and latency guarantees, especially if the -following parameter is set to zero. - -max_budget ----------- - -Maximum amount of service, measured in sectors, that can be provided -to a BFQ queue once it is set in service (of course within the limits -of the above timeout). According to what said in the description of -the algorithm, larger values increase the throughput in proportion to -the percentage of sequential I/O requests issued. The price of larger -values is that they coarsen the granularity of short-term bandwidth -and latency guarantees. - -The default value is 0, which enables auto-tuning: BFQ sets max_budget -to the maximum number of sectors that can be served during -timeout_sync, according to the estimated peak rate. - -weights -------- - -Read-only parameter, used to show the weights of the currently active -BFQ queues. - - -wr_ tunables ------------- - -BFQ exports a few parameters to control/tune the behavior of -low-latency heuristics. - -wr_coeff - -Factor by which the weight of a weight-raised queue is multiplied. If -the queue is deemed soft real-time, then the weight is further -multiplied by an additional, constant factor. - -wr_max_time - -Maximum duration of a weight-raising period for an interactive task -(ms). If set to zero (default value), then this value is computed -automatically, as a function of the peak rate of the device. In any -case, when the value of this parameter is read, it always reports the -current duration, regardless of whether it has been set manually or -computed automatically. - -wr_max_softrt_rate - -Maximum service rate below which a queue is deemed to be associated -with a soft real-time application, and is then weight-raised -accordingly (sectors/sec). - -wr_min_idle_time - -Minimum idle period after which interactive weight-raising may be -reactivated for a queue (in ms). 
- -wr_rt_max_time - -Maximum weight-raising duration for soft real-time queues (in ms). The -start time from which this duration is considered is automatically -moved forward if the queue is detected to be still soft real-time -before the current soft real-time weight-raising period finishes. - -wr_min_inter_arr_async - -Minimum period between I/O request arrivals after which weight-raising -may be reactivated for an already busy async queue (in ms). - - -4. Group scheduling with BFQ -============================ - -BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely -blkio and io. In particular, BFQ supports weight-based proportional -share. To activate cgroups support, set BFQ_GROUP_IOSCHED. - -4-1 Service guarantees provided -------------------------------- - -With BFQ, proportional share means true proportional share of the -device bandwidth, according to group weights. For example, a group -with weight 200 gets twice the bandwidth, and not just twice the time, -of a group with weight 100. - -BFQ supports hierarchies (group trees) of any depth. Bandwidth is -distributed among groups and processes in the expected way: for each -group, the children of the group share the whole bandwidth of the -group in proportion to their weights. In particular, this implies -that, for each leaf group, every process of the group receives the -same share of the whole group bandwidth, unless the ioprio of the -process is modified. - -The resource-sharing guarantee for a group may partially or totally -switch from bandwidth to time, if providing bandwidth guarantees to -the group lowers the throughput too much. This switch occurs on a -per-process basis: if a process of a leaf group causes throughput loss -if served in such a way to receive its share of the bandwidth, then -BFQ switches back to just time-based proportional share for that -process. 
- -4-2 Interface -------------- - -To get proportional sharing of bandwidth with BFQ for a given device, -BFQ must of course be the active scheduler for that device. - -Within each group directory, the names of the files associated with -BFQ-specific cgroup parameters and stats begin with the "bfq." -prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for -BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group -parameter to set the weight of a group with BFQ is blkio.bfq.weight -or io.bfq.weight. - -Parameters to set ------------------ - -For each group, there is only the following parameter to set. - -weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -group inside its parent. Available values: 1..10000 (default 100). The -linear mapping between ioprio and weights, described at the beginning -of the tunable section, is still valid, but all weights higher than -IOPRIO_BE_NR*10 are mapped to ioprio 0. - -Recall that, if low-latency is set, then BFQ automatically raises the -weight of the queues associated with interactive and soft real-time -applications. Unset this tunable if you need/want to control weights. - - -[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O - Scheduler", Proceedings of the First Workshop on Mobile System - Technologies (MST-2015), May 2015. - http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf - -[2] P. Valente and M. Andreolini, "Improving Application - Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of - the 5th Annual International Systems and Storage Conference - (SYSTOR '12), June 2012. 
- Slightly extended version: - http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- - results.pdf diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 8dc4f04711fa..a29d749bbfe6 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -43,20 +43,20 @@ config IOSCHED_BFQ tristate "BFQ I/O scheduler" default n ---help--- - The BFQ I/O scheduler distributes bandwidth among all - processes according to their weights, regardless of the - device parameters and with any workload. It also guarantees - a low latency to interactive and soft real-time applications. - Details in Documentation/block/bfq-iosched.txt + The BFQ I/O scheduler tries to distribute bandwidth among + all processes according to their weights. + It aims at distributing the bandwidth as desired, independently of + the disk parameters and with any workload. It also tries to + guarantee low latency to interactive and soft real-time + applications. If compiled built-in (saying Y here), BFQ can + be configured to support hierarchical scheduling. config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" - depends on IOSCHED_BFQ && BLK_CGROUP + depends on CGROUPS && IOSCHED_BFQ=y default n ---help--- - - Enable hierarchical scheduling in BFQ, using the blkio - (cgroups-v1) or io (cgroups-v2) controller. + Enable hierarchical scheduling in BFQ, using the blkio controller. choice prompt "Default I/O scheduler" diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index bbaecd00449e..03679962d5c0 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -7,9 +7,7 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2015 Paolo Valente - * - * Copyright (C) 2016 Paolo Valente + * Copyright (C) 2010 Paolo Valente * * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ * file. 
@@ -165,6 +163,8 @@ static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) { struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); + BUG_ON(!pd); + return pd_to_bfqg(pd); } @@ -208,49 +208,59 @@ static void bfqg_put(struct bfq_group *bfqg) static void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - int op, int op_flags) + int rw) { - blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, 1); + blkg_rwstat_add(&bfqg->stats.queued, rw, 1); bfqg_stats_end_empty_time(&bfqg->stats); if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } -static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, - int op_flags) +static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) +{ + blkg_rwstat_add(&bfqg->stats.queued, rw, -1); +} + +static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) { - blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, -1); + blkg_rwstat_add(&bfqg->stats.merged, rw, 1); } -static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, - int op_flags) +static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, + uint64_t bytes, int rw) { - blkg_rwstat_add(&bfqg->stats.merged, op, op_flags, 1); + blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); + blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); + blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); } static void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) + uint64_t start_time, uint64_t io_start_time, int rw) { struct bfqg_stats *stats = &bfqg->stats; unsigned long long now = sched_clock(); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, op_flags, - now - io_start_time); + blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, 
op_flags, + blkg_rwstat_add(&stats->wait_time, rw, io_start_time - start_time); } /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { + if (!stats) + return; + /* queued stats shouldn't be cleared */ + blkg_rwstat_reset(&stats->service_bytes); + blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); blkg_stat_reset(&stats->time); + blkg_stat_reset(&stats->unaccounted_time); blkg_stat_reset(&stats->avg_queue_size_sum); blkg_stat_reset(&stats->avg_queue_size_samples); blkg_stat_reset(&stats->dequeue); @@ -260,16 +270,19 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) } /* @to += @from */ -static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) +static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) { if (!to || !from) return; /* queued stats shouldn't be cleared */ + blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); + blkg_rwstat_add_aux(&to->serviced, &from->serviced); blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); blkg_stat_add_aux(&from->time, &from->time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); @@ -298,8 +311,10 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) if (unlikely(!parent)) return; - bfqg_stats_add_aux(&parent->stats, &bfqg->stats); + bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); + bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); bfqg_stats_reset(&bfqg->stats); + bfqg_stats_reset(&bfqg->dead_stats); } static void bfq_init_entity(struct bfq_entity *entity, @@ -314,17 +329,21 @@ static void bfq_init_entity(struct bfq_entity *entity, bfqq->ioprio_class = 
bfqq->new_ioprio_class; bfqg_get(bfqg); } - entity->parent = bfqg->my_entity; /* NULL for root group */ + entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; } static void bfqg_stats_exit(struct bfqg_stats *stats) { + blkg_rwstat_exit(&stats->service_bytes); + blkg_rwstat_exit(&stats->serviced); blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->sectors); blkg_stat_exit(&stats->time); + blkg_stat_exit(&stats->unaccounted_time); blkg_stat_exit(&stats->avg_queue_size_sum); blkg_stat_exit(&stats->avg_queue_size_samples); blkg_stat_exit(&stats->dequeue); @@ -335,11 +354,15 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { - if (blkg_rwstat_init(&stats->merged, gfp) || + if (blkg_rwstat_init(&stats->service_bytes, gfp) || + blkg_rwstat_init(&stats->serviced, gfp) || + blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->sectors, gfp) || blkg_stat_init(&stats->time, gfp) || + blkg_stat_init(&stats->unaccounted_time, gfp) || blkg_stat_init(&stats->avg_queue_size_sum, gfp) || blkg_stat_init(&stats->avg_queue_size_samples, gfp) || blkg_stat_init(&stats->dequeue, gfp) || @@ -363,27 +386,11 @@ static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); } -static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -{ - struct bfq_group_data *bgd; - - bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); - if (!bgd) - return NULL; - return &bgd->pd; -} - static void bfq_cpd_init(struct blkcg_policy_data *cpd) { struct bfq_group_data *d = cpd_to_bfqgd(cpd); - d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
- CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; -} - -static void bfq_cpd_free(struct blkcg_policy_data *cpd) -{ - kfree(cpd_to_bfqgd(cpd)); + d->weight = BFQ_DEFAULT_GRP_WEIGHT; } static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) @@ -394,7 +401,8 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) if (!bfqg) return NULL; - if (bfqg_stats_init(&bfqg->stats, gfp)) { + if (bfqg_stats_init(&bfqg->stats, gfp) || + bfqg_stats_init(&bfqg->dead_stats, gfp)) { kfree(bfqg); return NULL; } @@ -402,20 +410,27 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) return &bfqg->pd; } -static void bfq_pd_init(struct blkg_policy_data *pd) +static void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) { - struct blkcg_gq *blkg; - struct bfq_group *bfqg; - struct bfq_data *bfqd; struct bfq_entity *entity; - struct bfq_group_data *d; - blkg = pd_to_blkg(pd); - BUG_ON(!blkg); - bfqg = blkg_to_bfqg(blkg); - bfqd = blkg->q->elevator->elevator_data; + BUG_ON(!parent); + BUG_ON(!bfqg); + BUG_ON(bfqg == parent); + entity = &bfqg->entity; - d = blkcg_to_bfqgd(blkg->blkcg); + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +static void bfq_pd_init(struct blkg_policy_data *pd) +{ + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + struct bfq_data *bfqd = blkg->q->elevator->elevator_data; + struct bfq_entity *entity = &bfqg->entity; + struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; @@ -433,53 +448,70 @@ static void bfq_pd_free(struct blkg_policy_data *pd) struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); + bfqg_stats_exit(&bfqg->dead_stats); + return kfree(bfqg); } -static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +/* offset delta from bfqg->stats to bfqg->dead_stats */ +static const int 
dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - + offsetof(struct bfq_group, stats); + +/* to be used by recursive prfill, sums live and dead stats recursively */ +static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) { - struct bfq_group *bfqg = pd_to_bfqg(pd); + u64 sum = 0; - bfqg_stats_reset(&bfqg->stats); + sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); + sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, + off + dead_stats_off_delta); + return sum; } -static void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) +/* to be used by recursive prfill, sums live and dead rwstats recursively */ +static struct blkg_rwstat +bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, int off) { - struct bfq_entity *entity; - - BUG_ON(!parent); - BUG_ON(!bfqg); - BUG_ON(bfqg == parent); + struct blkg_rwstat a, b; - entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; + a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); + b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, + off + dead_stats_off_delta); + blkg_rwstat_add_aux(&a, &b); + return a; } -static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, - struct blkcg *blkcg) +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct blkcg_gq *blkg; + struct bfq_group *bfqg = pd_to_bfqg(pd); - blkg = blkg_lookup(blkcg, bfqd->queue); - if (likely(blkg)) - return blkg_to_bfqg(blkg); - return NULL; + bfqg_stats_reset(&bfqg->stats); + bfqg_stats_reset(&bfqg->dead_stats); } -static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + struct blkcg *blkcg) { - struct bfq_group *bfqg, *parent; - struct bfq_entity *entity; + struct request_queue *q = bfqd->queue; + struct bfq_group *bfqg = NULL, *parent; + struct bfq_entity 
*entity = NULL; assert_spin_locked(bfqd->queue->queue_lock); - bfqg = bfq_lookup_bfqg(bfqd, blkcg); + /* avoid lookup for the common case where there's no blkcg */ + if (blkcg == &blkcg_root) { + bfqg = bfqd->root_group; + } else { + struct blkcg_gq *blkg; + + blkg = blkg_lookup_create(blkcg, q); + if (!IS_ERR(blkg)) + bfqg = blkg_to_bfqg(blkg); + else /* fallback to root_group */ + bfqg = bfqd->root_group; + } - if (unlikely(!bfqg)) - return NULL; + BUG_ON(!bfqg); /* * Update chain of bfq_groups as we might be handling a leaf group @@ -505,15 +537,11 @@ static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); -static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - bool compensate, - enum bfqq_expiration reason); - /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. * @bfqq: the queue to move. + * @entity: @bfqq's entity. * @bfqg: the group to move to. * * Move @bfqq to @bfqg, deactivating it from its old group and reactivating @@ -524,40 +552,26 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * rcu_read_lock()). */ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) + struct bfq_entity *entity, struct bfq_group *bfqg) { - struct bfq_entity *entity = &bfqq->entity; + int busy, resume; - BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); - BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) - && entity->on_st && - bfqq != bfqd->in_service_queue); - BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); - - /* If bfqq is empty, then bfq_bfqq_expire also invokes - * bfq_del_bfqq_busy, thereby removing bfqq and its entity - * from data structures related to current group. Otherwise we - * need to remove bfqq explicitly with bfq_deactivate_bfqq, as - * we do below. 
- */ - if (bfqq == bfqd->in_service_queue) - bfq_bfqq_expire(bfqd, bfqd->in_service_queue, - false, BFQ_BFQQ_PREEMPTED); + busy = bfq_bfqq_busy(bfqq); + resume = !RB_EMPTY_ROOT(&bfqq->sort_list); - BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) - && &bfq_entity_service_tree(entity)->idle != - entity->tree); + BUG_ON(resume && !entity->on_st); + BUG_ON(busy && !resume && entity->on_st && + bfqq != bfqd->in_service_queue); - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); + if (busy) { + BUG_ON(atomic_read(&bfqq->ref) < 2); - if (bfq_bfqq_busy(bfqq)) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); - else if (entity->on_st) { - BUG_ON(&bfq_entity_service_tree(entity)->idle != - entity->tree); + if (!resume) + bfq_del_bfqq_busy(bfqd, bfqq, 0); + else + bfq_deactivate_bfqq(bfqd, bfqq, 0); + } else if (entity->on_st) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } bfqg_put(bfqq_group(bfqq)); /* @@ -569,17 +583,14 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, entity->sched_data = &bfqg->sched_data; bfqg_get(bfqg); - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - if (bfq_bfqq_busy(bfqq)) { + if (busy) { bfq_pos_tree_add_move(bfqd, bfqq); - bfq_activate_bfqq(bfqd, bfqq); + if (resume) + bfq_activate_bfqq(bfqd, bfqq); } if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); - BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) - && &bfq_entity_service_tree(entity)->idle != - entity->tree); } /** @@ -606,11 +617,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, lockdep_assert_held(bfqd->queue->queue_lock); - bfqg = bfq_find_set_group(bfqd, blkcg); - - if (unlikely(!bfqg)) - bfqg = bfqd->root_group; - + bfqg = bfq_find_alloc_group(bfqd, blkcg); if (async_bfqq) { entity = &async_bfqq->entity; @@ -618,8 +625,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, bic_set_bfqq(bic, NULL, 0); bfq_log_bfqq(bfqd, async_bfqq, 
"bic_change_group: %p %d", - async_bfqq, - async_bfqq->ref); + async_bfqq, atomic_read(&async_bfqq->ref)); bfq_put_queue(async_bfqq); } } @@ -627,7 +633,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, if (sync_bfqq) { entity = &sync_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); } return bfqg; @@ -636,23 +642,25 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); + struct blkcg *blkcg; struct bfq_group *bfqg = NULL; - uint64_t serial_nr; + uint64_t id; rcu_read_lock(); - serial_nr = bio_blkcg(bio)->css.serial_nr; + blkcg = bio_blkcg(bio); + id = blkcg->css.serial_nr; + rcu_read_unlock(); /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ - if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) - goto out; + if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) + return; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); - bic->blkcg_serial_nr = serial_nr; -out: - rcu_read_unlock(); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); + BUG_ON(!bfqg); + bic->blkcg_id = id; } /** @@ -664,7 +672,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) struct bfq_entity *entity = st->first_idle; for (; entity ; entity = st->first_idle) - __bfq_deactivate_entity(entity, false); + __bfq_deactivate_entity(entity, 0); } /** @@ -678,7 +686,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); BUG_ON(!bfqq); - bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); + bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); } /** @@ -709,12 +717,11 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, } /** - * bfq_pd_offline - deactivate the entity 
associated with @pd, - * and reparent its children entities. - * @pd: descriptor of the policy going offline. + * bfq_destroy_group - destroy @bfqg. + * @bfqg: the group being destroyed. * - * blkio already grabs the queue_lock for us, so no need to use - * RCU-based magic + * Destroy @bfqg, making sure that it is not referenced from its parent. + * blkio already grabs the queue_lock for us, so no need to use RCU-based magic */ static void bfq_pd_offline(struct blkg_policy_data *pd) { @@ -769,16 +776,10 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) BUG_ON(bfqg->sched_data.next_in_service); BUG_ON(bfqg->sched_data.in_service_entity); - __bfq_deactivate_entity(entity, false); + __bfq_deactivate_entity(entity, 0); bfq_put_async_queues(bfqd, bfqg); BUG_ON(entity->tree); - /* - * @blkg is going offline and will be ignored by - * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so - * that they don't get lost. If IOs complete after this point, the - * stats for them will be lost. Oh well... 
- */ bfqg_stats_xfer_dead(bfqg); } @@ -788,35 +789,46 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); - BUG_ON(!bfqg); bfq_end_wr_async_queues(bfqd, bfqg); } bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static int bfq_io_show_weight(struct seq_file *sf, void *v) +static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, + struct cftype *cftype) { - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg *blkcg = css_to_blkcg(css); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - unsigned int val = 0; + int ret = -EINVAL; - if (bfqgd) - val = bfqgd->weight; + spin_lock_irq(&blkcg->lock); + ret = bfqgd->weight; + spin_unlock_irq(&blkcg->lock); - seq_printf(sf, "%u\n", val); + return ret; +} + +static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + + spin_lock_irq(&blkcg->lock); + seq_printf(sf, "%u\n", bfqgd->weight); + spin_unlock_irq(&blkcg->lock); return 0; } -static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, - struct cftype *cftype, - u64 val) +static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); struct blkcg_gq *blkg; - int ret = -ERANGE; + int ret = -EINVAL; if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) return ret; @@ -861,18 +873,13 @@ static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, return ret; } -static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) +static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - u64 weight; /* First unsigned long found in the file is used */ - int ret = 
kstrtoull(strim(buf), 0, &weight); - - if (ret) - return ret; - - return bfq_io_set_weight_legacy(of_css(of), NULL, weight); + return bfqio_cgroup_weight_write(of_css(of), NULL, + simple_strtoull(strim(buf), NULL, 0)); } static int bfqg_print_stat(struct seq_file *sf, void *v) @@ -892,17 +899,16 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_bfq, off); + u64 sum = bfqg_stat_pd_recursive_sum(pd, off); + return __blkg_prfill_u64(sf, pd, sum); } static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_bfq, - off); + struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); + return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -922,41 +928,6 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } -static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, - int off) -{ - u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); - - return __blkg_prfill_u64(sf, pd, sum >> 9); -} - -static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); - return 0; -} - -static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes)); - u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + - atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); - - return __blkg_prfill_u64(sf, pd, sum >> 9); -} - -static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 
0, - false); - return 0; -} - - static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -993,15 +964,38 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) return blkg_to_bfqg(bfqd->queue->root_blkg); } -static struct cftype bfq_blkcg_legacy_files[] = { +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +{ + struct bfq_group_data *bgd; + + bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); + if (!bgd) + return NULL; + return &bgd->pd; +} + +static void bfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_bfqgd(cpd)); +} + +static struct cftype bfqio_files_dfl[] = { { - .name = "bfq.weight", + .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write_u64 = bfq_io_set_weight_legacy, + .seq_show = bfqio_cgroup_weight_read_dfl, + .write = bfqio_cgroup_weight_write_dfl, }, + {} /* terminate */ +}; - /* statistics, covers only the tasks in the bfqg */ +static struct cftype bfqio_files[] = { + { + .name = "bfq.weight", + .read_u64 = bfqio_cgroup_weight_read, + .write_u64 = bfqio_cgroup_weight_write, + }, + /* statistics, cover only the tasks in the bfqg */ { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), @@ -1009,17 +1003,18 @@ static struct cftype bfq_blkcg_legacy_files[] = { }, { .name = "bfq.sectors", - .seq_show = bfqg_print_stat_sectors, + .private = offsetof(struct bfq_group, stats.sectors), + .seq_show = bfqg_print_stat, }, { .name = "bfq.io_service_bytes", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct bfq_group, stats.service_bytes), + .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_serviced", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct bfq_group, stats.serviced), + .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_service_time", @@ -1050,17 +1045,18 @@ static struct cftype bfq_blkcg_legacy_files[] = { }, { 
.name = "bfq.sectors_recursive", - .seq_show = bfqg_print_stat_sectors_recursive, + .private = offsetof(struct bfq_group, stats.sectors), + .seq_show = bfqg_print_stat_recursive, }, { .name = "bfq.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct bfq_group, stats.service_bytes), + .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct bfq_group, stats.serviced), + .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_service_time_recursive", @@ -1106,39 +1102,31 @@ static struct cftype bfq_blkcg_legacy_files[] = { .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, + { + .name = "bfq.unaccounted_time", + .private = offsetof(struct bfq_group, stats.unaccounted_time), + .seq_show = bfqg_print_stat, + }, { } /* terminate */ }; -static struct cftype bfq_blkg_files[] = { - { - .name = "bfq.weight", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write = bfq_io_set_weight, - }, - {} /* terminate */ +static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfqio_files_dfl, + .legacy_cftypes = bfqio_files, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, }; -#else /* CONFIG_BFQ_GROUP_IOSCHED */ - -static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, int op, int op_flags) { } -static inline void -bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, int op_flags) { } -static inline void -bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, int op_flags) { } 
-static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) { } -static inline void -bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) { } -static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } +#else static void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) @@ -1162,20 +1150,27 @@ bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) return bfqd->root_group; } +static void bfq_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_entity *entity, + struct bfq_group *bfqg) +{ +} + static void bfq_end_wr_async(struct bfq_data *bfqd) { bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static void bfq_disconnect_groups(struct bfq_data *bfqd) { - return bfqd->root_group; + bfq_put_async_queues(bfqd, bfqd->root_group); } -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + struct blkcg *blkcg) { - return bfqq->bfqd->root_group; + return bfqd->root_group; } static struct bfq_group * diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2a2c130df35a..cf3e9b1800c9 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1,5 +1,5 @@ /* - * Budget Fair Queueing (BFQ) I/O scheduler. + * Budget Fair Queueing (BFQ) disk scheduler. 
* * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe @@ -7,34 +7,25 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2015 Paolo Valente - * - * Copyright (C) 2016 Paolo Valente + * Copyright (C) 2010 Paolo Valente * * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ * file. * - * BFQ is a proportional-share I/O scheduler, with some extra - * low-latency capabilities. BFQ also supports full hierarchical - * scheduling through cgroups. Next paragraphs provide an introduction - * on BFQ inner workings. Details on BFQ benefits and usage can be - * found in Documentation/block/bfq-iosched.txt. - * - * BFQ is a proportional-share storage-I/O scheduling algorithm based - * on the slice-by-slice service scheme of CFQ. But BFQ assigns - * budgets, measured in number of sectors, to processes instead of - * time slices. The device is not granted to the in-service process - * for a given time slice, but until it has exhausted its assigned - * budget. This change from the time to the service domain enables BFQ - * to distribute the device throughput among processes as desired, - * without any distortion due to throughput fluctuations, or to device - * internal queueing. BFQ uses an ad hoc internal scheduler, called - * B-WF2Q+, to schedule processes according to their budgets. More - * precisely, BFQ schedules queues associated with processes. Thanks to - * the accurate policy of B-WF2Q+, BFQ can afford to assign high - * budgets to I/O-bound processes issuing sequential requests (to - * boost the throughput), and yet guarantee a low latency to - * interactive and soft real-time applications. + * BFQ is a proportional-share storage-I/O scheduling algorithm based on + * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, + * measured in number of sectors, to processes instead of time slices. 
The + * device is not granted to the in-service process for a given time slice, + * but until it has exhausted its assigned budget. This change from the time + * to the service domain allows BFQ to distribute the device throughput + * among processes as desired, without any distortion due to ZBR, workload + * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, + * called B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated to processes. Thanks to the + * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to + * I/O-bound processes issuing sequential requests (to boost the + * throughput), and yet guarantee a low latency to interactive and soft + * real-time applications. * * BFQ is described in [1], where also a reference to the initial, more * theoretical paper on BFQ can be found. The interested reader can find @@ -49,10 +40,10 @@ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) * complexity derives from the one introduced with EEVDF in [3]. * - * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O - * Scheduler", Proceedings of the First Workshop on Mobile System - * Technologies (MST-2015), May 2015. - * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf + * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness + * with the BFQ Disk I/O Scheduler'', + * Proceedings of the 5th Annual International Systems and Storage + * Conference (SYSTOR '12), June 2012. * * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf * @@ -79,23 +70,24 @@ #include "bfq.h" #include "blk.h" -/* Expiration time of sync (0) and async (1) requests, in ns. */ -static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; +/* Expiration time of sync (0) and async (1) requests, in jiffies. */ +static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; /* Maximum backwards seek, in KiB. 
*/ -static const int bfq_back_max = (16 * 1024); +static const int bfq_back_max = 16 * 1024; /* Penalty of a backwards seek, in number of sectors. */ static const int bfq_back_penalty = 2; -/* Idling period duration, in ns. */ -static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); +/* Idling period duration, in jiffies. */ +static int bfq_slice_idle = HZ / 125; /* Minimum number of assigned budgets for which stats are safe to compute. */ static const int bfq_stats_min_budgets = 194; /* Default maximum budget values, in sectors and number of requests. */ -static const int bfq_default_max_budget = (16 * 1024); +static const int bfq_default_max_budget = 16 * 1024; +static const int bfq_max_budget_async_rq = 4; /* * Async to sync throughput distribution is controlled as follows: @@ -105,28 +97,23 @@ static const int bfq_default_max_budget = (16 * 1024); static const int bfq_async_charge_factor = 10; /* Default timeout values, in jiffies, approximating CFQ defaults. */ -static const int bfq_timeout = (HZ / 8); +static const int bfq_timeout_sync = HZ / 8; +static int bfq_timeout_async = HZ / 25; struct kmem_cache *bfq_pool; -/* Below this threshold (in ns), we consider thinktime immediate. */ -#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) +/* Below this threshold (in ms), we consider thinktime immediate. */ +#define BFQ_MIN_TT 2 /* hw_tag detection: parallel requests threshold and min samples needed. 
*/ #define BFQ_HW_QUEUE_THRESHOLD 4 #define BFQ_HW_QUEUE_SAMPLES 32 -#define BFQQ_SEEK_THR (sector_t)(8 * 100) -#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) +#define BFQQ_SEEK_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) -/* Min number of samples required to perform peak-rate update */ -#define BFQ_RATE_MIN_SAMPLES 32 -/* Min observation time interval required to perform a peak-rate update (ns) */ -#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -/* Target observation time interval for a peak-rate update (ns) */ -#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC +/* Min samples used for peak rate estimation (for autotuning). */ +#define BFQ_PEAK_RATE_SAMPLES 32 /* Shift used for peak rate fixed precision calculations. */ #define BFQ_RATE_SHIFT 16 @@ -154,24 +141,16 @@ struct kmem_cache *bfq_pool; * The device's speed class is dynamically (re)detected in * bfq_update_peak_rate() every time the estimated peak rate is updated. * - * In the following definitions, R_slow[0]/R_fast[0] and - * T_slow[0]/T_fast[0] are the reference values for a slow/fast - * rotational device, whereas R_slow[1]/R_fast[1] and - * T_slow[1]/T_fast[1] are the reference values for a slow/fast - * non-rotational device. Finally, device_speed_thresh are the - * thresholds used to switch between speed classes. The reference - * rates are not the actual peak rates of the devices used as a - * reference, but slightly lower values. The reason for using these - * slightly lower values is that the peak-rate estimator tends to - * yield slightly lower values than the actual peak rate (it can yield - * the actual peak rate only if there is only one process doing I/O, - * and the process does sequential I/O). 
- * + * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] + * are the reference values for a slow/fast rotational device, whereas + * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for + * a slow/fast non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. * Both the reference peak rates and the thresholds are measured in * sectors/usec, left-shifted by BFQ_RATE_SHIFT. */ -static int R_slow[2] = {1000, 10700}; -static int R_fast[2] = {14000, 33000}; +static int R_slow[2] = {1536, 10752}; +static int R_fast[2] = {17415, 34791}; /* * To improve readability, a conversion function is used to initialize the * following arrays, which entails that they can be initialized only in a @@ -204,7 +183,10 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd); */ static int bfq_bio_sync(struct bio *bio) { - return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC); + if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) + return 1; + + return 0; } /* @@ -427,7 +409,11 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) */ static bool bfq_symmetric_scenario(struct bfq_data *bfqd) { - return !bfq_differentiated_weights(bfqd); + return +#ifdef CONFIG_BFQ_GROUP_IOSCHED + !bfqd->active_numerous_groups && +#endif + !bfq_differentiated_weights(bfqd); } /* @@ -547,19 +533,9 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, static unsigned long bfq_serv_to_charge(struct request *rq, struct bfq_queue *bfqq) { - if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) - return blk_rq_sectors(rq); - - /* - * If there are no weight-raised queues, then amplify service - * by just the async charge factor; otherwise amplify service - * by twice the async charge factor, to further reduce latency - * for weight-raised queues. 
- */ - if (bfqq->bfqd->wr_busy_queues == 0) - return blk_rq_sectors(rq) * bfq_async_charge_factor; - - return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; + return blk_rq_sectors(rq) * + (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * + bfq_async_charge_factor)); } /** @@ -600,7 +576,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, entity->budget = new_budget; bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); - bfq_requeue_bfqq(bfqd, bfqq); + bfq_activate_bfqq(bfqd, bfqq); } } @@ -614,25 +590,14 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) dur = bfqd->RT_prod; do_div(dur, bfqd->peak_rate); - /* - * Limit duration between 3 and 13 seconds. Tests show that - * higher values than 13 seconds often yield the opposite of - * the desired result, i.e., worsen responsiveness by letting - * non-interactive and non-soft-real-time applications - * preserve weight raising for a too long time interval. - * - * On the other end, lower values than 3 seconds make it - * difficult for most interactive tasks to complete their jobs - * before weight-raising finishes. - */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); - return dur; } +static unsigned int bfq_bfqq_cooperations(struct bfq_queue *bfqq) +{ + return bfqq->bic ? 
bfqq->bic->cooperations : 0; +} + static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) { @@ -640,31 +605,31 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfq_mark_bfqq_idle_window(bfqq); else bfq_clear_bfqq_idle_window(bfqq); - if (bic->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); - - bfqq->wr_coeff = bic->saved_wr_coeff; - bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; - BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); - bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); - - bfqq->wr_coeff = 1; + /* Assuming that the flag in_large_burst is already correctly set */ + if (bic->wr_time_left && bfqq->bfqd->low_latency && + !bfq_bfqq_in_large_burst(bfqq) && + bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { + /* + * Start a weight raising period with the duration given by + * the raising_time_left snapshot. + */ + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues++; + bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bic->wr_time_left; + bfqq->last_wr_start_finish = jiffies; + bfqq->entity.prio_changed = 1; } - /* make sure weight will be updated, however we got here */ - bfqq->entity.prio_changed = 1; + /* + * Clear wr_time_left to prevent bfq_bfqq_save_state() from + * getting confused about the queue's need of a weight-raising + * period. 
+ */ + bic->wr_time_left = 0; } static int bfqq_process_refs(struct bfq_queue *bfqq) @@ -674,7 +639,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) lockdep_assert_held(bfqq->bfqd->queue->queue_lock); io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; - process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; + process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; BUG_ON(process_refs < 0); return process_refs; } @@ -689,7 +654,6 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) hlist_del_init(&item->burst_list_node); hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); bfqd->burst_size = 1; - bfqd->burst_parent_entity = bfqq->entity.parent; } /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ @@ -698,10 +662,6 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* Increment burst size to take into account also bfqq */ bfqd->burst_size++; - bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); - - BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); - if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { struct bfq_queue *pos, *bfqq_item; struct hlist_node *n; @@ -711,19 +671,15 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * other to consider this burst as large. */ bfqd->large_burst = true; - bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); /* * We can now mark all queues in the burst list as * belonging to a large burst. 
*/ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, - burst_list_node) { + burst_list_node) bfq_mark_bfqq_in_large_burst(bfqq_item); - bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); - } bfq_mark_bfqq_in_large_burst(bfqq); - bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); /* * From now on, and until the current burst finishes, any @@ -735,79 +691,67 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, burst_list_node) hlist_del_init(&pos->burst_list_node); - } else /* - * Burst not yet large: add bfqq to the burst list. Do - * not increment the ref counter for bfqq, because bfqq - * is removed from the burst list before freeing bfqq - * in put_queue. - */ + } else /* burst not yet large: add bfqq to the burst list */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } /* - * If many queues belonging to the same group happen to be created - * shortly after each other, then the processes associated with these - * queues have typically a common goal. In particular, bursts of queue - * creations are usually caused by services or applications that spawn - * many parallel threads/processes. Examples are systemd during boot, - * or git grep. To help these processes get their job done as soon as - * possible, it is usually better to not grant either weight-raising - * or device idling to their queues. + * If many queues happen to become active shortly after each other, then, + * to help the processes associated to these queues get their job done as + * soon as possible, it is usually better to not grant either weight-raising + * or device idling to these queues. In this comment we describe, firstly, + * the reasons why this fact holds, and, secondly, the next function, which + * implements the main steps needed to properly mark these queues so that + * they can then be treated in a different way. 
* - * In this comment we describe, firstly, the reasons why this fact - * holds, and, secondly, the next function, which implements the main - * steps needed to properly mark these queues so that they can then be - * treated in a different way. + * As for the terminology, we say that a queue becomes active, i.e., + * switches from idle to backlogged, either when it is created (as a + * consequence of the arrival of an I/O request), or, if already existing, + * when a new request for the queue arrives while the queue is idle. + * Bursts of activations, i.e., activations of different queues occurring + * shortly after each other, are typically caused by services or applications + * that spawn or reactivate many parallel threads/processes. Examples are + * systemd during boot or git grep. * - * The above services or applications benefit mostly from a high - * throughput: the quicker the requests of the activated queues are - * cumulatively served, the sooner the target job of these queues gets - * completed. As a consequence, weight-raising any of these queues, - * which also implies idling the device for it, is almost always - * counterproductive. In most cases it just lowers throughput. + * These services or applications benefit mostly from a high throughput: + * the quicker the requests of the activated queues are cumulatively served, + * the sooner the target job of these queues gets completed. As a consequence, + * weight-raising any of these queues, which also implies idling the device + * for it, is almost always counterproductive: in most cases it just lowers + * throughput. * - * On the other hand, a burst of queue creations may be caused also by - * the start of an application that does not consist of a lot of - * parallel I/O-bound threads. In fact, with a complex application, - * several short processes may need to be executed to start-up the - * application. 
In this respect, to start an application as quickly as - * possible, the best thing to do is in any case to privilege the I/O - * related to the application with respect to all other - * I/O. Therefore, the best strategy to start as quickly as possible - * an application that causes a burst of queue creations is to - * weight-raise all the queues created during the burst. This is the + * On the other hand, a burst of activations may be also caused by the start + * of an application that does not consist in a lot of parallel I/O-bound + * threads. In fact, with a complex application, the burst may be just a + * consequence of the fact that several processes need to be executed to + * start-up the application. To start an application as quickly as possible, + * the best thing to do is to privilege the I/O related to the application + * with respect to all other I/O. Therefore, the best strategy to start as + * quickly as possible an application that causes a burst of activations is + * to weight-raise all the queues activated during the burst. This is the * exact opposite of the best strategy for the other type of bursts. * - * In the end, to take the best action for each of the two cases, the - * two types of bursts need to be distinguished. Fortunately, this - * seems relatively easy, by looking at the sizes of the bursts. In - * particular, we found a threshold such that only bursts with a - * larger size than that threshold are apparently caused by - * services or commands such as systemd or git grep. For brevity, - * hereafter we call just 'large' these bursts. BFQ *does not* - * weight-raise queues whose creation occurs in a large burst. In - * addition, for each of these queues BFQ performs or does not perform - * idling depending on which choice boosts the throughput more. The - * exact choice depends on the device and request pattern at + * In the end, to take the best action for each of the two cases, the two + * types of bursts need to be distinguished. 
Fortunately, this seems + * relatively easy to do, by looking at the sizes of the bursts. In + * particular, we found a threshold such that bursts with a larger size + * than that threshold are apparently caused only by services or commands + * such as systemd or git grep. For brevity, hereafter we call just 'large' + * these bursts. BFQ *does not* weight-raise queues whose activations occur + * in a large burst. In addition, for each of these queues BFQ performs or + * does not perform idling depending on which choice boosts the throughput + * most. The exact choice depends on the device and request pattern at * hand. * - * Unfortunately, false positives may occur while an interactive task - * is starting (e.g., an application is being started). The - * consequence is that the queues associated with the task do not - * enjoy weight raising as expected. Fortunately these false positives - * are very rare. They typically occur if some service happens to - * start doing I/O exactly when the interactive task starts. - * - * Turning back to the next function, it implements all the steps - * needed to detect the occurrence of a large burst and to properly - * mark all the queues belonging to it (so that they can then be - * treated in a different way). This goal is achieved by maintaining a - * "burst list" that holds, temporarily, the queues that belong to the - * burst in progress. The list is then used to mark these queues as - * belonging to a large burst if the burst does become large. The main - * steps are the following. + * Turning back to the next function, it implements all the steps needed + * to detect the occurrence of a large burst and to properly mark all the + * queues belonging to it (so that they can then be treated in a different + * way). This goal is achieved by maintaining a special "burst list" that + * holds, temporarily, the queues that belong to the burst in progress. 
The + * list is then used to mark these queues as belonging to a large burst if + * the burst does become large. The main steps are the following. * - * . when the very first queue is created, the queue is inserted into the + * . when the very first queue is activated, the queue is inserted into the * list (as it could be the first queue in a possible burst) * * . if the current burst has not yet become large, and a queue Q that does @@ -828,13 +772,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * * . the device enters a large-burst mode * - * . if a queue Q that does not belong to the burst is created while + * . if a queue Q that does not belong to the burst is activated while * the device is in large-burst mode and shortly after the last time * at which a queue either entered the burst list or was marked as * belonging to the current large burst, then Q is immediately marked * as belonging to a large burst. * - * . if a queue Q that does not belong to the burst is created a while + * . if a queue Q that does not belong to the burst is activated a while * later, i.e., not shortly after, than the last time at which a queue * either entered the burst list or was marked as belonging to the * current large burst, then the current burst is deemed as finished and: @@ -847,44 +791,52 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * in a possible new burst (then the burst list contains just Q * after this step). */ -static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool idle_for_long_time) { + /* + * If bfqq happened to be activated in a burst, but has been idle + * for at least as long as an interactive queue, then we assume + * that, in the overall I/O initiated in the burst, the I/O + * associated to bfqq is finished. So bfqq does not need to be + * treated as a queue belonging to a burst anymore. 
Accordingly, + * we reset bfqq's in_large_burst flag if set, and remove bfqq + * from the burst list if it's there. We do not decrement instead + * burst_size, because the fact that bfqq does not need to belong + * to the burst list any more does not invalidate the fact that + * bfqq may have been activated during the current burst. + */ + if (idle_for_long_time) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + /* * If bfqq is already in the burst list or is part of a large - * burst, or finally has just been split, then there is - * nothing else to do. + * burst, then there is nothing else to do. */ if (!hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq) || - time_is_after_eq_jiffies(bfqq->split_time + - msecs_to_jiffies(10))) + bfq_bfqq_in_large_burst(bfqq)) return; /* - * If bfqq's creation happens late enough, or bfqq belongs to - * a different group than the burst group, then the current - * burst is finished, and related data structures must be - * reset. + * If bfqq's activation happens late enough, then the current + * burst is finished, and related data structures must be reset. * - * In this respect, consider the special case where bfqq is - * the very first queue created after BFQ is selected for this - * device. In this case, last_ins_in_burst and - * burst_parent_entity are not yet significant when we get - * here. But it is easy to verify that, whether or not the - * following condition is true, bfqq will end up being - * inserted into the burst list. In particular the list will - * happen to contain only bfqq. And this is exactly what has - * to happen, as bfqq may be the first queue of the first + * In this respect, consider the special case where bfqq is the very + * first queue being activated. In this case, last_ins_in_burst is + * not yet significant when we get here. 
But it is easy to verify + * that, whether or not the following condition is true, bfqq will + * end up being inserted into the burst list. In particular the + * list will happen to contain only bfqq. And this is exactly what + * has to happen, as bfqq may be the first queue in a possible * burst. */ if (time_is_before_jiffies(bfqd->last_ins_in_burst + - bfqd->bfq_burst_interval) || - bfqq->entity.parent != bfqd->burst_parent_entity) { + bfqd->bfq_burst_interval)) { bfqd->large_burst = false; bfq_reset_burst_list(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, - "handle_burst: late activation or different group"); - goto end; + return; } /* @@ -893,9 +845,8 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * bfqq as belonging to this large burst immediately. */ if (bfqd->large_burst) { - bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); bfq_mark_bfqq_in_large_burst(bfqq); - goto end; + return; } /* @@ -904,491 +855,25 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * queue. Then we add bfqq to the burst. */ bfq_add_to_burst(bfqd, bfqq); -end: - /* - * At this point, bfqq either has been added to the current - * burst or has caused the current burst to terminate and a - * possible new burst to start. In particular, in the second - * case, bfqq has become the first queue in the possible new - * burst. In both cases last_ins_in_burst needs to be moved - * forward. 
- */ - bfqd->last_ins_in_burst = jiffies; - -} - -static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - return entity->budget - entity->service; -} - -/* - * If enough samples have been computed, return the current max budget - * stored in bfqd, which is dynamically updated according to the - * estimated disk peak rate; otherwise return the default max budget - */ -static int bfq_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget; -} - -/* - * Return min budget, which is a fraction of the current or default - * max budget (trying with 1/32) - */ -static int bfq_min_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget / 32; - else - return bfqd->bfq_max_budget / 32; -} - -static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - bool compensate, - enum bfqq_expiration reason); - -/* - * The next function, invoked after the input queue bfqq switches from - * idle to busy, updates the budget of bfqq. The function also tells - * whether the in-service queue should be expired, by returning - * true. The purpose of expiring the in-service queue is to give bfqq - * the chance to possibly preempt the in-service queue, and the reason - * for preempting the in-service queue is to achieve one of the two - * goals below. - * - * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has - * expired because it has remained idle. In particular, bfqq may have - * expired for one of the following two reasons: - * - * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and - * did not make it to issue a new request before its last request - * was served; - * - * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue - * a new request before the expiration of the idling-time. 
- * - * Even if bfqq has expired for one of the above reasons, the process - * associated with the queue may be however issuing requests greedily, - * and thus be sensitive to the bandwidth it receives (bfqq may have - * remained idle for other reasons: CPU high load, bfqq not enjoying - * idling, I/O throttling somewhere in the path from the process to - * the I/O scheduler, ...). But if, after every expiration for one of - * the above two reasons, bfqq has to wait for the service of at least - * one full budget of another queue before being served again, then - * bfqq is likely to get a much lower bandwidth or resource time than - * its reserved ones. To address this issue, two countermeasures need - * to be taken. - * - * First, the budget and the timestamps of bfqq need to be updated in - * a special way on bfqq reactivation: they need to be updated as if - * bfqq did not remain idle and did not expire. In fact, if they are - * computed as if bfqq expired and remained idle until reactivation, - * then the process associated with bfqq is treated as if, instead of - * being greedy, it stopped issuing requests when bfqq remained idle, - * and restarts issuing requests only on this reactivation. In other - * words, the scheduler does not help the process recover the "service - * hole" between bfqq expiration and reactivation. As a consequence, - * the process receives a lower bandwidth than its reserved one. In - * contrast, to recover this hole, the budget must be updated as if - * bfqq was not expired at all before this reactivation, i.e., it must - * be set to the value of the remaining budget when bfqq was - * expired. Along the same line, timestamps need to be assigned the - * value they had the last time bfqq was selected for service, i.e., - * before last expiration. Thus timestamps need to be back-shifted - * with respect to their normal computation (see [1] for more details - * on this tricky aspect). 
- * - * Secondly, to allow the process to recover the hole, the in-service - * queue must be expired too, to give bfqq the chance to preempt it - * immediately. In fact, if bfqq has to wait for a full budget of the - * in-service queue to be completed, then it may become impossible to - * let the process recover the hole, even if the back-shifted - * timestamps of bfqq are lower than those of the in-service queue. If - * this happens for most or all of the holes, then the process may not - * receive its reserved bandwidth. In this respect, it is worth noting - * that, being the service of outstanding requests unpreemptible, a - * little fraction of the holes may however be unrecoverable, thereby - * causing a little loss of bandwidth. - * - * The last important point is detecting whether bfqq does need this - * bandwidth recovery. In this respect, the next function deems the - * process associated with bfqq greedy, and thus allows it to recover - * the hole, if: 1) the process is waiting for the arrival of a new - * request (which implies that bfqq expired for one of the above two - * reasons), and 2) such a request has arrived soon. The first - * condition is controlled through the flag non_blocking_wait_rq, - * while the second through the flag arrived_in_time. If both - * conditions hold, then the function computes the budget in the - * above-described special way, and signals that the in-service queue - * should be expired. Timestamp back-shifting is done later in - * __bfq_activate_entity. - * - * 2. Reduce latency. Even if timestamps are not backshifted to let - * the process associated with bfqq recover a service hole, bfqq may - * however happen to have, after being (re)activated, a lower finish - * timestamp than the in-service queue. That is, the next budget of - * bfqq may have to be completed before the one of the in-service - * queue. 
If this is the case, then preempting the in-service queue - * allows this goal to be achieved, apart from the unpreemptible, - * outstanding requests mentioned above. - * - * Unfortunately, regardless of which of the above two goals one wants - * to achieve, service trees need first to be updated to know whether - * the in-service queue must be preempted. To have service trees - * correctly updated, the in-service queue must be expired and - * rescheduled, and bfqq must be scheduled too. This is one of the - * most costly operations (in future versions, the scheduling - * mechanism may be re-designed in such a way to make it possible to - * know whether preemption is needed without needing to update service - * trees). In addition, queue preemptions almost always cause random - * I/O, and thus loss of throughput. Because of these facts, the next - * function adopts the following simple scheme to avoid both costly - * operations and too frequent preemptions: it requests the expiration - * of the in-service queue (unconditionally) only for queues that need - * to recover a hole, or that either are weight-raised or deserve to - * be weight-raised. - */ -static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - bool arrived_in_time, - bool wr_or_deserves_wr) -{ - struct bfq_entity *entity = &bfqq->entity; - - if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { - /* - * We do not clear the flag non_blocking_wait_rq here, as - * the latter is used in bfq_activate_bfqq to signal - * that timestamps need to be back-shifted (and is - * cleared right after). - */ - - /* - * In next assignment we rely on that either - * entity->service or entity->budget are not updated - * on expiration if bfqq is empty (see - * __bfq_bfqq_recalc_budget). Thus both quantities - * remain unchanged after such an expiration, and the - * following statement therefore assigns to - * entity->budget the remaining budget on such an - * expiration. 
For clarity, entity->service is not - * updated on expiration in any case, and, in normal - * operation, is reset only when bfqq is selected for - * service (see bfq_get_next_queue). - */ - BUG_ON(bfqq->max_budget < 0); - entity->budget = min_t(unsigned long, - bfq_bfqq_budget_left(bfqq), - bfqq->max_budget); - - BUG_ON(entity->budget < 0); - return true; - } - - BUG_ON(bfqq->max_budget < 0); - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(bfqq->next_rq, bfqq)); - BUG_ON(entity->budget < 0); - - bfq_clear_bfqq_non_blocking_wait_rq(bfqq); - return wr_or_deserves_wr; -} - -static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - unsigned int old_wr_coeff, - bool wr_or_deserves_wr, - bool interactive, - bool in_burst, - bool soft_rt) -{ - if (old_wr_coeff == 1 && wr_or_deserves_wr) { - /* start a weight-raising period */ - if (interactive) { - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { - bfqq->wr_start_at_switch_to_srt = jiffies; - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - } - /* - * If needed, further reduce budget to make sure it is - * close to bfqq's backlog, so as to reduce the - * scheduling-error component due to a too large - * budget. Do not care about throughput consequences, - * but only about latency. Finally, do not assign a - * too small budget either, to avoid increasing - * latency by causing too frequent expirations. 
- */ - bfqq->entity.budget = min_t(unsigned long, - bfqq->entity.budget, - 2 * bfq_min_budget(bfqd)); - - bfq_log_bfqq(bfqd, bfqq, - "wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } else if (old_wr_coeff > 1) { - if (interactive) { /* update wr coeff and duration */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else if (in_burst) { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq-> - wr_cur_max_time)); - } else if (soft_rt) { - /* - * The application is now or still meeting the - * requirements for being deemed soft rt. We - * can then correctly and safely (re)charge - * the weight-raising duration for the - * application with the weight-raising - * duration for soft rt applications. - * - * In particular, doing this recharge now, i.e., - * before the weight-raising period for the - * application finishes, reduces the probability - * of the following negative scenario: - * 1) the weight of a soft rt application is - * raised at startup (as for any newly - * created application), - * 2) since the application is not interactive, - * at a certain time weight-raising is - * stopped for the application, - * 3) at that time the application happens to - * still have pending requests, and hence - * is destined to not have a chance to be - * deemed soft rt before these requests are - * completed (see the comments to the - * function bfq_bfqq_softrt_next_start() - * for details on soft rt detection), - * 4) these pending requests experience a high - * latency because the application is not - * weight-raised while they are pending. 
- */ - if (bfqq->wr_cur_max_time != - bfqd->bfq_wr_rt_max_time) { - bfqq->wr_start_at_switch_to_srt = - bfqq->last_wr_start_finish; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfq_log_bfqq(bfqd, bfqq, - "switching to soft_rt wr"); - } else - bfq_log_bfqq(bfqd, bfqq, - "moving forward soft_rt wr duration"); - bfqq->last_wr_start_finish = jiffies; - } - } -} - -static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - return bfqq->dispatched == 0 && - time_is_before_jiffies( - bfqq->budget_timeout + - bfqd->bfq_wr_min_idle_time); -} - -static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - int old_wr_coeff, - struct request *rq, - bool *interactive) -{ - bool soft_rt, in_burst, wr_or_deserves_wr, - bfqq_wants_to_preempt, - idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), - /* - * See the comments on - * bfq_bfqq_update_budg_for_activation for - * details on the usage of the next variable. 
- */ - arrived_in_time = ktime_get_ns() <= - RQ_BIC(rq)->ttime.last_end_request + - bfqd->bfq_slice_idle * 3; - - bfq_log_bfqq(bfqd, bfqq, - "bfq_add_request non-busy: " - "jiffies %lu, in_time %d, idle_long %d busyw %d " - "wr_coeff %u", - jiffies, arrived_in_time, - idle_for_long_time, - bfq_bfqq_non_blocking_wait_rq(bfqq), - old_wr_coeff); - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - BUG_ON(bfqq == bfqd->in_service_queue); - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, - req_op(rq), rq->cmd_flags); - - /* - * bfqq deserves to be weight-raised if: - * - it is sync, - * - it does not belong to a large burst, - * - it has been idle for enough time or is soft real-time, - * - is linked to a bfq_io_cq (it is not shared in any sense) - */ - in_burst = bfq_bfqq_in_large_burst(bfqq); - soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && - !in_burst && - time_is_before_jiffies(bfqq->soft_rt_next_start); - *interactive = - !in_burst && - idle_for_long_time; - wr_or_deserves_wr = bfqd->low_latency && - (bfqq->wr_coeff > 1 || - (bfq_bfqq_sync(bfqq) && - bfqq->bic && (*interactive || soft_rt))); - - bfq_log_bfqq(bfqd, bfqq, - "bfq_add_request: " - "in_burst %d, " - "soft_rt %d (next %lu), inter %d, bic %p", - bfq_bfqq_in_large_burst(bfqq), soft_rt, - bfqq->soft_rt_next_start, - *interactive, - bfqq->bic); - - /* - * Using the last flag, update budget and check whether bfqq - * may want to preempt the in-service queue. - */ - bfqq_wants_to_preempt = - bfq_bfqq_update_budg_for_activation(bfqd, bfqq, - arrived_in_time, - wr_or_deserves_wr); - - /* - * If bfqq happened to be activated in a burst, but has been - * idle for much more than an interactive queue, then we - * assume that, in the overall I/O initiated in the burst, the - * I/O associated with bfqq is finished. So bfqq does not need - * to be treated as a queue belonging to a burst - * anymore. 
Accordingly, we reset bfqq's in_large_burst flag - * if set, and remove bfqq from the burst list if it's - * there. We do not decrement burst_size, because the fact - * that bfqq does not need to belong to the burst list any - * more does not invalidate the fact that bfqq was created in - * a burst. - */ - if (likely(!bfq_bfqq_just_created(bfqq)) && - idle_for_long_time && - time_is_before_jiffies( - bfqq->budget_timeout + - msecs_to_jiffies(10000))) { - hlist_del_init(&bfqq->burst_list_node); - bfq_clear_bfqq_in_large_burst(bfqq); - } - - bfq_clear_bfqq_just_created(bfqq); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (arrived_in_time) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - bfq_log_bfqq(bfqd, bfqq, "requests in time %d", - bfqq->requests_within_timer); - } - - if (bfqd->low_latency) { - if (unlikely(time_is_after_jiffies(bfqq->split_time))) - /* wraparound */ - bfqq->split_time = - jiffies - bfqd->bfq_wr_min_idle_time - 1; - - if (time_is_before_jiffies(bfqq->split_time + - bfqd->bfq_wr_min_idle_time)) { - bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, - old_wr_coeff, - wr_or_deserves_wr, - *interactive, - in_burst, - soft_rt); - - if (old_wr_coeff != bfqq->wr_coeff) - bfqq->entity.prio_changed = 1; - } - } - - bfqq->last_idle_bklogged = jiffies; - bfqq->service_from_backlogged = 0; - bfq_clear_bfqq_softrt_update(bfqq); - - bfq_add_bfqq_busy(bfqd, bfqq); - - /* - * Expire in-service queue only if preemption may be needed - * for guarantees. In this respect, the function - * next_queue_may_preempt just checks a simple, necessary - * condition, and not a sufficient condition based on - * timestamps. In fact, for the latter condition to be - * evaluated, timestamps would need first to be updated, and - * this operation is quite costly (see the comments on the - * function bfq_bfqq_update_budg_for_activation). 
- */ - if (bfqd->in_service_queue && bfqq_wants_to_preempt && - bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && - next_queue_may_preempt(bfqd)) { - struct bfq_queue *in_serv = - bfqd->in_service_queue; - BUG_ON(in_serv == bfqq); - - bfq_bfqq_expire(bfqd, bfqd->in_service_queue, - false, BFQ_BFQQ_PREEMPTED); - BUG_ON(in_serv->entity.budget < 0); - } } static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_entity *entity = &bfqq->entity; struct bfq_data *bfqd = bfqq->bfqd; struct request *next_rq, *prev; - unsigned int old_wr_coeff = bfqq->wr_coeff; + unsigned long old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", - blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); - - if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ - bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, old coeff %u, w %d(%d)", - jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), - jiffies_to_msecs(bfqq->wr_cur_max_time), - bfqq->wr_coeff, - bfqq->entity.weight, bfqq->entity.orig_weight); - + bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; bfqd->queued++; elv_rb_add(&bfqq->sort_list, rq); /* - * Check if this request is a better next-to-serve candidate. + * Check if this request is a better next-serve candidate. */ prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); @@ -1401,10 +886,160 @@ static void bfq_add_request(struct request *rq) if (prev != bfqq->next_rq) bfq_pos_tree_add_move(bfqd, bfqq); - if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ - bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, - rq, &interactive); - else { + if (!bfq_bfqq_busy(bfqq)) { + bool soft_rt, coop_or_in_burst, + idle_for_long_time = time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, + rq->cmd_flags); +#endif + if (bfq_bfqq_sync(bfqq)) { + bool already_in_burst = + !hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq); + bfq_handle_burst(bfqd, bfqq, idle_for_long_time); + /* + * If bfqq was not already in the current burst, + * then, at this point, bfqq either has been + * added to the current burst or has caused the + * current burst to terminate. In particular, in + * the second case, bfqq has become the first + * queue in a possible new burst. + * In both cases last_ins_in_burst needs to be + * moved forward. + */ + if (!already_in_burst) + bfqd->last_ins_in_burst = jiffies; + } + + coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !coop_or_in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + interactive = !coop_or_in_burst && idle_for_long_time; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (time_before(jiffies, + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle)) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + } + + if (!bfqd->low_latency) + goto add_bfqq_busy; + + if (bfq_bfqq_just_split(bfqq)) + goto set_prio_changed; + + /* + * If the queue: + * - is not being boosted, + * - has been idle for enough time, + * - is not a sync queue or is linked to a bfq_io_cq (it is + * shared "for its nature" or it is not shared and 
its + * requests have not been redirected to a shared queue) + * start a weight-raising period. + */ + if (old_wr_coeff == 1 && (interactive || soft_rt) && + (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else if (coop_or_in_burst || + (bfqq->wr_cur_max_time == + bfqd->bfq_wr_rt_max_time && + !soft_rt)) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (time_before( + bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time, + jiffies + + bfqd->bfq_wr_rt_max_time) && + soft_rt) { + /* + * + * The remaining weight-raising time is lower + * than bfqd->bfq_wr_rt_max_time, which means + * that the application is enjoying weight + * raising either because deemed soft-rt in + * the near past, or because deemed interactive + * a long ago. + * In both cases, resetting now the current + * remaining weight-raising time for the + * application to the weight-raising duration + * for soft rt applications would not cause any + * latency increase for the application (as the + * new duration would be higher than the + * remaining time). + * + * In addition, the application is now meeting + * the requirements for being deemed soft rt. + * In the end we can correctly and safely + * (re)charge the weight-raising duration for + * the application with the weight-raising + * duration for soft rt applications. 
+ * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. + */ + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + } +set_prio_changed: + if (old_wr_coeff != bfqq->wr_coeff) + entity->prio_changed = 1; +add_bfqq_busy: + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + bfq_add_bfqq_busy(bfqd, bfqq); + } else { if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && time_is_before_jiffies( bfqq->last_wr_start_finish + @@ -1413,43 +1048,16 @@ static void bfq_add_request(struct request *rq) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); bfqd->wr_busy_queues++; - bfqq->entity.prio_changed = 1; + entity->prio_changed = 1; bfq_log_bfqq(bfqd, bfqq, - "non-idle wrais starting, " - "wr_max_time %u wr_busy %d", - jiffies_to_msecs(bfqq->wr_cur_max_time), - bfqd->wr_busy_queues); + "non-idle wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); } if (prev != bfqq->next_rq) bfq_updated_next_req(bfqd, bfqq); } - /* - * Assign jiffies to last_wr_start_finish in the following - * cases: - * - * . 
if bfqq is not going to be weight-raised, because, for - * non weight-raised queues, last_wr_start_finish stores the - * arrival time of the last request; as of now, this piece - * of information is used only for deciding whether to - * weight-raise async queues - * - * . if bfqq is not weight-raised, because, if bfqq is now - * switching to weight-raised, then last_wr_start_finish - * stores the time when weight-raising starts - * - * . if bfqq is interactive, because, regardless of whether - * bfqq is currently weight-raised, the weight-raising - * period must start or restart (this case is considered - * separately because it is not detected by the above - * conditions, if bfqq is already weight-raised) - * - * last_wr_start_finish has to be updated also if bfqq is soft - * real-time, because the weight-raising period is constantly - * restarted on idle-to-busy transitions for these queues, but - * this is already done in bfq_bfqq_handle_idle_busy_switch if - * needed. - */ if (bfqd->low_latency && (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) bfqq->last_wr_start_finish = jiffies; @@ -1473,24 +1081,14 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, return NULL; } -static sector_t get_sdist(sector_t last_pos, struct request *rq) -{ - sector_t sdist = 0; - - if (last_pos) { - if (last_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - last_pos; - else - sdist = last_pos - blk_rq_pos(rq); - } - - return sdist; -} - static void bfq_activate_request(struct request_queue *q, struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; + bfqd->rq_in_driver++; + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", + (unsigned long long) bfqd->last_position); } static void bfq_deactivate_request(struct request_queue *q, struct request *rq) @@ -1507,9 +1105,6 @@ static void bfq_remove_request(struct request *rq) struct bfq_data *bfqd = bfqq->bfqd; const int 
sync = rq_is_sync(rq); - BUG_ON(bfqq->entity.service > bfqq->entity.budget && - bfqq == bfqd->in_service_queue); - if (bfqq->next_rq == rq) { bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); bfq_updated_next_req(bfqd, bfqq); @@ -1523,25 +1118,8 @@ static void bfq_remove_request(struct request *rq) elv_rb_del(&bfqq->sort_list, rq); if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - BUG_ON(bfqq->entity.budget < 0); - - if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { - bfq_del_bfqq_busy(bfqd, bfqq, false); - - /* bfqq emptied. In normal operation, when - * bfqq is empty, bfqq->entity.service and - * bfqq->entity.budget must contain, - * respectively, the service received and the - * budget used last time bfqq emptied. These - * facts do not hold in this case, as at least - * this last removal occurred while bfqq is - * not in service. To avoid inconsistencies, - * reset both bfqq->entity.service and - * bfqq->entity.budget. - */ - bfqq->entity.budget = bfqq->entity.service = 0; - } - + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) + bfq_del_bfqq_busy(bfqd, bfqq, 1); /* * Remove queue from request-position tree as it is empty. 
*/ @@ -1555,8 +1133,9 @@ static void bfq_remove_request(struct request *rq) BUG_ON(bfqq->meta_pending == 0); bfqq->meta_pending--; } - bfqg_stats_update_io_remove(bfqq_group(bfqq), req_op(rq), - rq->cmd_flags); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); +#endif } static int bfq_merge(struct request_queue *q, struct request **req, @@ -1566,7 +1145,7 @@ static int bfq_merge(struct request_queue *q, struct request **req, struct request *__rq; __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq && elv_bio_merge_ok(__rq, bio)) { + if (__rq && elv_rq_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; } @@ -1611,8 +1190,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, static void bfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio_op(bio), - bio->bi_opf); + bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); } #endif @@ -1632,7 +1210,7 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, */ if (bfqq == next_bfqq && !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - next->fifo_time < rq->fifo_time) { + time_before(next->fifo_time, rq->fifo_time)) { list_del_init(&rq->queuelist); list_replace_init(&next->queuelist, &rq->queuelist); rq->fifo_time = next->fifo_time; @@ -1642,31 +1220,21 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfq_remove_request(next); - bfqg_stats_update_io_merged(bfqq_group(bfqq), req_op(next), - next->cmd_flags); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +#endif } /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { BUG_ON(!bfqq); - if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; - bfqq->last_wr_start_finish = 
jiffies; - /* - * Trigger a weight change on the next invocation of - * __bfq_entity_update_weight_prio. - */ + /* Trigger a weight change on the next activation of the queue */ bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, - "end_wr: wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", - bfqq->bfqd->wr_busy_queues); } static void bfq_end_wr_async_queues(struct bfq_data *bfqd, @@ -1709,7 +1277,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request, sector_t sector) { return abs(bfq_io_struct_pos(io_struct, request) - sector) <= - BFQQ_CLOSE_THR; + BFQQ_SEEK_THR; } static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, @@ -1831,7 +1399,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * throughput. */ bfqq->new_bfqq = new_bfqq; - new_bfqq->ref += process_refs; + atomic_add(process_refs, &new_bfqq->ref); return new_bfqq; } @@ -1862,23 +1430,9 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, } /* - * If this function returns true, then bfqq cannot be merged. The idea - * is that true cooperation happens very early after processes start - * to do I/O. Usually, late cooperations are just accidental false - * positives. In case bfqq is weight-raised, such false positives - * would evidently degrade latency guarantees for bfqq. - */ -bool wr_from_too_long(struct bfq_queue *bfqq) -{ - return bfqq->wr_coeff > 1 && - time_is_before_jiffies(bfqq->last_wr_start_finish + - msecs_to_jiffies(100)); -} - -/* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. Return - * NULL if no merge was scheduled, a pointer to the shared bfq_queue + * Attempt to schedule a merge of bfqq with the currently in-service queue + * or with a close queue among the scheduled queues. 
+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue * structure otherwise. * * The OOM queue is not allowed to participate to cooperation: in fact, since @@ -1887,18 +1441,6 @@ bool wr_from_too_long(struct bfq_queue *bfqq) * handle merging with the OOM queue would be quite complex and expensive * to maintain. Besides, in such a critical condition as an out of memory, * the benefits of queue merging may be little relevant, or even negligible. - * - * Weight-raised queues can be merged only if their weight-raising - * period has just started. In fact cooperating processes are usually - * started together. Thus, with this filter we avoid false positives - * that would jeopardize low-latency guarantees. - * - * WARNING: queue merging may impair fairness among non-weight raised - * queues, for at least two reasons: 1) the original weight of a - * merged queue may change during the merged state, 2) even being the - * weight the same, a merged queue may be bloated with many more - * requests than the ones produced by its originally-associated - * process. */ static struct bfq_queue * bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -1908,32 +1450,16 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->new_bfqq) return bfqq->new_bfqq; - - if (io_struct && wr_from_too_long(bfqq) && - likely(bfqq != &bfqd->oom_bfqq)) - bfq_log_bfqq(bfqd, bfqq, - "would have looked for coop, but bfq%d wr", - bfqq->pid); - - if (!io_struct || - wr_from_too_long(bfqq) || - unlikely(bfqq == &bfqd->oom_bfqq)) + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; - - /* If there is only one backlogged queue, don't search. */ + /* If device has only one backlogged bfq_queue, don't search. 
*/ if (bfqd->busy_queues == 1) return NULL; in_service_bfqq = bfqd->in_service_queue; - if (in_service_bfqq && in_service_bfqq != bfqq && - bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) - && likely(in_service_bfqq == &bfqd->oom_bfqq)) - bfq_log_bfqq(bfqd, bfqq, - "would have tried merge with in-service-queue, but wr"); - if (!in_service_bfqq || in_service_bfqq == bfqq || - !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || + !bfqd->in_service_bic || unlikely(in_service_bfqq == &bfqd->oom_bfqq)) goto check_scheduled; @@ -1955,15 +1481,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - if (new_bfqq && wr_from_too_long(new_bfqq) && - likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - bfq_log_bfqq(bfqd, bfqq, - "would have merged with bfq%d, but wr", - new_bfqq->pid); - - if (new_bfqq && !wr_from_too_long(new_bfqq) && - likely(new_bfqq != &bfqd->oom_bfqq) && + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && bfq_may_be_close_cooperator(bfqq, new_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); @@ -1972,25 +1490,53 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { - struct bfq_io_cq *bic = bfqq->bic; - /* * If !bfqq->bic, the queue is already shared or its requests * have already been redirected to a shared queue; both idle window * and weight raising state have already been saved. Do nothing. 
*/ - if (!bic) + if (!bfqq->bic) return; - - bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + if (bfqq->bic->wr_time_left) + /* + * This is the queue of a just-started process, and would + * deserve weight raising: we set wr_time_left to the full + * weight-raising duration to trigger weight-raising when + * and if the queue is split and the first request of the + * queue is enqueued. + */ + bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); + else if (bfqq->wr_coeff > 1) { + unsigned long wr_duration = + jiffies - bfqq->last_wr_start_finish; + /* + * It may happen that a queue's weight raising period lasts + * longer than its wr_cur_max_time, as weight raising is + * handled only when a request is enqueued or dispatched (it + * does not use any timer). If the weight raising period is + * about to end, don't save it. + */ + if (bfqq->wr_cur_max_time <= wr_duration) + bfqq->bic->wr_time_left = 0; + else + bfqq->bic->wr_time_left = + bfqq->wr_cur_max_time - wr_duration; + /* + * The bfq_queue is becoming shared or the requests of the + * process owning the queue are being redirected to a shared + * queue. Stop the weight raising period of the queue, as in + * both cases it should not be owned by an interactive or + * soft real-time application. 
+ */ + bfq_bfqq_end_wr(bfqq); + } else + bfqq->bic->wr_time_left = 0; + bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bfqq->bic->cooperations++; + bfqq->bic->failed_cooperations = 0; } static void bfq_get_bic_reference(struct bfq_queue *bfqq) @@ -2015,40 +1561,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, if (bfq_bfqq_IO_bound(bfqq)) bfq_mark_bfqq_IO_bound(new_bfqq); bfq_clear_bfqq_IO_bound(bfqq); - - /* - * If bfqq is weight-raised, then let new_bfqq inherit - * weight-raising. To reduce false positives, neglect the case - * where bfqq has just been created, but has not yet made it - * to be weight-raised (which may happen because EQM may merge - * bfqq even before bfq_add_request is executed for the first - * time for bfqq). Handling this case would however be very - * easy, thanks to the flag just_created. - */ - if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { - new_bfqq->wr_coeff = bfqq->wr_coeff; - new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; - new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; - new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - if (bfq_bfqq_busy(new_bfqq)) - bfqd->wr_busy_queues++; - new_bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqd, new_bfqq, - "wr start after merge with %d, rais_max_time %u", - bfqq->pid, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - - if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ - bfqq->wr_coeff = 1; - bfqq->entity.prio_changed = 1; - if (bfq_bfqq_busy(bfqq)) - bfqd->wr_busy_queues--; - } - - bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", - bfqd->wr_busy_queues); - /* * Grab a reference to the bic, to prevent it from being destroyed * before being possibly touched by a bfq_split_bfqq(). 
@@ -2075,8 +1587,20 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_put_queue(bfqq); } -static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - struct bio *bio) +static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + struct bfq_data *bfqd = bfqq->bfqd; + + if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { + bic->failed_cooperations++; + if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) + bic->cooperations = 0; + } +} + +static int bfq_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_io_cq *bic; @@ -2086,7 +1610,7 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * Disallow merge of a sync bio into an async request. */ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) - return false; + return 0; /* * Lookup the bfqq that this bio will be queued with. Allow @@ -2095,7 +1619,7 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, */ bic = bfq_bic_lookup(bfqd, current->io_context); if (!bic) - return false; + return 0; bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); /* @@ -2112,111 +1636,30 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * to decide whether bio and rq can be merged. */ bfqq = new_bfqq; - } + } else + bfq_bfqq_increase_failed_cooperations(bfqq); } return bfqq == RQ_BFQQ(rq); } -static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, - struct request *next) -{ - return RQ_BFQQ(rq) == RQ_BFQQ(next); -} - -/* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the throughput. - * In practice, a time-slice service scheme is used with seeky - * processes. 
- */ -static void bfq_set_budget_timeout(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - unsigned int timeout_coeff; - - if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) - timeout_coeff = 1; - else - timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - - bfqd->last_budget_start = ktime_get(); - - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout * timeout_coeff; - - bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", - jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -} - static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq) { +#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); +#endif bfq_mark_bfqq_must_alloc(bfqq); + bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; - BUG_ON(bfqq == bfqd->in_service_queue); - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - - if (time_is_before_jiffies(bfqq->last_wr_start_finish) && - bfqq->wr_coeff > 1 && - bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - time_is_before_jiffies(bfqq->budget_timeout)) { - /* - * For soft real-time queues, move the start - * of the weight-raising period forward by the - * time the queue has not received any - * service. Otherwise, a relatively long - * service delay is likely to cause the - * weight-raising period of the queue to end, - * because of the short duration of the - * weight-raising period of a soft real-time - * queue. It is worth noting that this move - * is not so dangerous for the other queues, - * because soft real-time queues are not - * greedy. - * - * To not add a further variable, we use the - * overloaded field budget_timeout to - * determine for how long the queue has not - * received service, i.e., how much time has - * elapsed since the queue expired. 
However, - * this is a little imprecise, because - * budget_timeout is set to jiffies if bfqq - * not only expires, but also remains with no - * request. - */ - if (time_after(bfqq->budget_timeout, - bfqq->last_wr_start_finish)) - bfqq->last_wr_start_finish += - jiffies - bfqq->budget_timeout; - else - bfqq->last_wr_start_finish = jiffies; - - if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { - pr_crit( - "BFQ WARNING:last %lu budget %lu jiffies %lu", - bfqq->last_wr_start_finish, - bfqq->budget_timeout, - jiffies); - pr_crit("diff %lu", jiffies - - max_t(unsigned long, - bfqq->last_wr_start_finish, - bfqq->budget_timeout)); - bfqq->last_wr_start_finish = jiffies; - } - } - - bfq_set_budget_timeout(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, "set_in_service_queue, cur-budget = %d", bfqq->entity.budget); - } else - bfq_log(bfqd, "set_in_service_queue: NULL"); + } bfqd->in_service_queue = bfqq; } @@ -2232,11 +1675,36 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) return bfqq; } +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static int bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static int bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + static void bfq_arm_slice_timer(struct bfq_data *bfqd) { struct bfq_queue *bfqq = bfqd->in_service_queue; struct bfq_io_cq *bic; - u32 sl; + unsigned long sl; BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); @@ -2260,343 +1728,59 @@ static void bfq_arm_slice_timer(struct bfq_data 
*bfqd) sl = bfqd->bfq_slice_idle; /* * Unless the queue is being weight-raised or the scenario is - * asymmetric, grant only minimum idle time if the queue - * is seeky. A long idling is preserved for a weight-raised - * queue, or, more in general, in an asymemtric scenario, - * because a long idling is needed for guaranteeing to a queue - * its reserved share of the throughput (in particular, it is - * needed if the queue has a higher weight than some other - * queue). + * asymmetric, grant only minimum idle time if the queue either + * has been seeky for long enough or has already proved to be + * constantly seeky. */ - if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && + if (bfq_sample_valid(bfqq->seek_samples) && + ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > + bfq_max_budget(bfqq->bfqd) / 8) || + bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && bfq_symmetric_scenario(bfqd)) - sl = min_t(u32, sl, BFQ_MIN_TT); - + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); + else if (bfqq->wr_coeff > 1) + sl = sl * 3; bfqd->last_idling_start = ktime_get(); - hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), - HRTIMER_MODE_REL); + mod_timer(&bfqd->idle_slice_timer, jiffies + sl); +#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); - bfq_log(bfqd, "arm idle: %ld/%ld ms", - sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -} - -/* - * In autotuning mode, max_budget is dynamically recomputed as the - * amount of sectors transferred in timeout at the estimated peak - * rate. This enables BFQ to utilize a full timeslice with a full - * budget, even if the in-service queue is served at peak rate. And - * this maximises throughput with sequential workloads. 
- */ -static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -{ - return (u64)bfqd->peak_rate * USEC_PER_MSEC * - jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -} - -/* - * Update parameters related to throughput and responsiveness, as a - * function of the estimated peak rate. See comments on - * bfq_calc_max_budget(), and on T_slow and T_fast arrays. - */ -void update_thr_responsiveness_params(struct bfq_data *bfqd) -{ - int dev_type = blk_queue_nonrot(bfqd->queue); - - if (bfqd->bfq_user_max_budget == 0) { - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd); - BUG_ON(bfqd->bfq_max_budget < 0); - bfq_log(bfqd, "new max_budget = %d", - bfqd->bfq_max_budget); - } - - if (bfqd->device_speed == BFQ_BFQD_FAST && - bfqd->peak_rate < device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_SLOW; - bfqd->RT_prod = R_slow[dev_type] * - T_slow[dev_type]; - } else if (bfqd->device_speed == BFQ_BFQD_SLOW && - bfqd->peak_rate > device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_FAST; - bfqd->RT_prod = R_fast[dev_type] * - T_fast[dev_type]; - } - - bfq_log(bfqd, -"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", - dev_type == 0 ? "ROT" : "NONROT", - bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", - bfqd->device_speed == BFQ_BFQD_FAST ? - (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : - (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, - (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> - BFQ_RATE_SHIFT); -} - -void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -{ - if (rq != NULL) { /* new rq dispatch now, reset accordingly */ - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; - bfqd->peak_rate_samples = 1; - bfqd->sequential_samples = 0; - bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = - blk_rq_sectors(rq); - } else /* no new rq dispatched, just reset the number of samples */ - bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ - - bfq_log(bfqd, - "reset_rate_computation at end, sample %u/%u tot_sects %llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched); -} - -void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -{ - u32 rate, weight, divisor; - - /* - * For the convergence property to hold (see comments on - * bfq_update_peak_rate()) and for the assessment to be - * reliable, a minimum number of samples must be present, and - * a minimum amount of time must have elapsed. If not so, do - * not compute new rate. Just reset parameters, to get ready - * for a new evaluation attempt. - */ - if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || - bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { - bfq_log(bfqd, - "update_rate_reset: only resetting, delta_first %lluus samples %d", - bfqd->delta_from_first>>10, bfqd->peak_rate_samples); - goto reset_computation; - } - - /* - * If a new request completion has occurred after last - * dispatch, then, to approximate the rate at which requests - * have been served by the device, it is more precise to - * extend the observation interval to the last completion. - */ - bfqd->delta_from_first = - max_t(u64, bfqd->delta_from_first, - bfqd->last_completion - bfqd->first_dispatch); - - BUG_ON(bfqd->delta_from_first == 0); - /* - * Rate computed in sects/usec, and not sects/nsec, for - * precision issues. 
- */ - rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC)); - - bfq_log(bfqd, -"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", - bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - rate > 20< 20M sectors/sec) - */ - if ((bfqd->peak_rate_samples > (3 * bfqd->sequential_samples)>>2 && - rate <= bfqd->peak_rate) || - rate > 20<peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - goto reset_computation; - } else { - bfq_log(bfqd, - "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - } - - /* - * We have to update the peak rate, at last! To this purpose, - * we use a low-pass filter. We compute the smoothing constant - * of the filter as a function of the 'weight' of the new - * measured rate. - * - * As can be seen in next formulas, we define this weight as a - * quantity proportional to how sequential the workload is, - * and to how long the observation time interval is. - * - * The weight runs from 0 to 8. The maximum value of the - * weight, 8, yields the minimum value for the smoothing - * constant. At this minimum value for the smoothing constant, - * the measured rate contributes for half of the next value of - * the estimated peak rate. - * - * So, the first step is to compute the weight as a function - * of how sequential the workload is. Note that the weight - * cannot reach 9, because bfqd->sequential_samples cannot - * become equal to bfqd->peak_rate_samples, which, in its - * turn, holds true because bfqd->sequential_samples is not - * incremented for the first sample. 
- */ - weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; - - /* - * Second step: further refine the weight as a function of the - * duration of the observation interval. - */ - weight = min_t(u32, 8, - div_u64(weight * bfqd->delta_from_first, - BFQ_RATE_REF_INTERVAL)); - - /* - * Divisor ranging from 10, for minimum weight, to 2, for - * maximum weight. - */ - divisor = 10 - weight; - BUG_ON(divisor == 0); - - /* - * Finally, update peak rate: - * - * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor - */ - bfqd->peak_rate *= divisor-1; - bfqd->peak_rate /= divisor; - rate /= divisor; /* smoothing constant alpha = 1/divisor */ - - bfq_log(bfqd, - "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", - divisor, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), - (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); - - BUG_ON(bfqd->peak_rate == 0); - BUG_ON(bfqd->peak_rate > 20<peak_rate += rate; - update_thr_responsiveness_params(bfqd); - BUG_ON(bfqd->peak_rate > 20<bfq_slice_idle)); } /* - * Update the read/write peak rate (the main quantity used for - * auto-tuning, see update_thr_responsiveness_params()). - * - * It is not trivial to estimate the peak rate (correctly): because of - * the presence of sw and hw queues between the scheduler and the - * device components that finally serve I/O requests, it is hard to - * say exactly when a given dispatched request is served inside the - * device, and for how long. As a consequence, it is hard to know - * precisely at what rate a given set of requests is actually served - * by the device. - * - * On the opposite end, the dispatch time of any request is trivially - * available, and, from this piece of information, the "dispatch rate" - * of requests can be immediately computed. 
So, the idea in the next - * function is to use what is known, namely request dispatch times - * (plus, when useful, request completion times), to estimate what is - * unknown, namely in-device request service rate. - * - * The main issue is that, because of the above facts, the rate at - * which a certain set of requests is dispatched over a certain time - * interval can vary greatly with respect to the rate at which the - * same requests are then served. But, since the size of any - * intermediate queue is limited, and the service scheme is lossless - * (no request is silently dropped), the following obvious convergence - * property holds: the number of requests dispatched MUST become - * closer and closer to the number of requests completed as the - * observation interval grows. This is the key property used in - * the next function to estimate the peak service rate as a function - * of the observed dispatch rate. The function assumes to be invoked - * on every request dispatch. + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the disk + * throughput (always guaranteed with a time slice scheme as in CFQ). */ -void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) +static void bfq_set_budget_timeout(struct bfq_data *bfqd) { - u64 now_ns = ktime_get_ns(); - - if (bfqd->peak_rate_samples == 0) { /* first dispatch */ - bfq_log(bfqd, - "update_peak_rate: goto reset, samples %d", - bfqd->peak_rate_samples) ; - bfq_reset_rate_computation(bfqd, rq); - goto update_last_values; /* will add one sample */ - } - - /* - * Device idle for very long: the observation interval lasting - * up to this dispatch cannot be a valid observation interval - * for computing a new peak rate (similarly to the late- - * completion event in bfq_completed_request()). 
Go to - * update_rate_and_reset to have the following three steps - * taken: - * - close the observation interval at the last (previous) - * request dispatch or completion - * - compute rate, if possible, for that observation interval - * - start a new observation interval with this dispatch - */ - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) { - bfq_log(bfqd, -"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", - (now_ns - bfqd->last_dispatch)>>10, - bfqd->peak_rate_samples) ; - goto update_rate_and_reset; - } - - /* Update sampling information */ - bfqd->peak_rate_samples++; - - if ((bfqd->rq_in_driver > 0 || - now_ns - bfqd->last_completion < BFQ_MIN_TT) - && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) - bfqd->sequential_samples++; - - bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); + struct bfq_queue *bfqq = bfqd->in_service_queue; + unsigned int timeout_coeff; - /* Reset max observed rq size every 32 dispatches */ - if (likely(bfqd->peak_rate_samples % 32)) - bfqd->last_rq_max_size = - max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; else - bfqd->last_rq_max_size = blk_rq_sectors(rq); - - bfqd->delta_from_first = now_ns - bfqd->first_dispatch; + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - bfq_log(bfqd, - "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched, - bfqd->delta_from_first>>10); + bfqd->last_budget_start = ktime_get(); - /* Target observation interval not yet reached, go on sampling */ - if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) - goto update_last_values; + bfq_clear_bfqq_budget_new(bfqq); + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; -update_rate_and_reset: - bfq_update_rate_reset(bfqd, rq); 
-update_last_values: - bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); - bfqd->last_dispatch = now_ns; - - bfq_log(bfqd, - "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", - (now_ns - bfqd->first_dispatch)>>10, - (unsigned long long) bfqd->last_position, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - bfq_log(bfqd, - "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * + timeout_coeff)); } /* - * Move request from internal lists to the dispatch list of the request queue + * Move request from internal lists to the request queue dispatch list. */ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) { + struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq = RQ_BFQQ(rq); /* @@ -2610,10 +1794,15 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) * incrementing bfqq->dispatched. 
*/ bfqq->dispatched++; - bfq_update_peak_rate(q->elevator->elevator_data, rq); - bfq_remove_request(rq); elv_dispatch_sort(q, rq); + + if (bfq_bfqq_sync(bfqq)) + bfqd->sync_flight++; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), + rq->cmd_flags); +#endif } /* @@ -2633,16 +1822,25 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) rq = rq_entry_fifo(bfqq->fifo.next); - if (ktime_get_ns() < rq->fifo_time) + if (time_before(jiffies, rq->fifo_time)) return NULL; return rq; } +static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + return entity->budget - entity->service; +} + static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { BUG_ON(bfqq != bfqd->in_service_queue); + __bfq_bfqd_reset_in_service(bfqd); + /* * If this bfqq is shared between multiple processes, check * to make sure that those processes are still issuing I/Os @@ -2653,30 +1851,20 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_mark_bfqq_split_coop(bfqq); if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - if (bfqq->dispatched == 0) - /* - * Overloading budget_timeout field to store - * the time at which the queue remains with no - * backlog and no outstanding request; used by - * the weight-raising mechanism. - */ - bfqq->budget_timeout = jiffies; - - bfq_del_bfqq_busy(bfqd, bfqq, true); + /* + * Overloading budget_timeout field to store the time + * at which the queue remains with no backlog; used by + * the weight-raising mechanism. + */ + bfqq->budget_timeout = jiffies; + bfq_del_bfqq_busy(bfqd, bfqq, 1); } else { - bfq_requeue_bfqq(bfqd, bfqq); + bfq_activate_bfqq(bfqd, bfqq); /* * Resort priority tree of potential close cooperators. 
*/ bfq_pos_tree_add_move(bfqd, bfqq); } - - /* - * All in-service entities must have been properly deactivated - * or requeued before executing the next function, which - * resets all in-service entites as no more in service. - */ - __bfq_bfqd_reset_in_service(bfqd); } /** @@ -2695,19 +1883,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, struct request *next_rq; int budget, min_budget; - BUG_ON(bfqq != bfqd->in_service_queue); - + budget = bfqq->max_budget; min_budget = bfq_min_budget(bfqd); - if (bfqq->wr_coeff == 1) - budget = bfqq->max_budget; - else /* - * Use a constant, low budget for weight-raised queues, - * to help achieve a low latency. Keep it slightly higher - * than the minimum possible budget, to cause a little - * bit fewer expirations. - */ - budget = 2 * min_budget; + BUG_ON(bfqq != bfqd->in_service_queue); bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); @@ -2716,7 +1895,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { + if (bfq_bfqq_sync(bfqq)) { switch (reason) { /* * Caveat: in all the following cases we trade latency @@ -2758,10 +1937,14 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, break; case BFQ_BFQQ_BUDGET_TIMEOUT: /* - * We double the budget here because it gives - * the chance to boost the throughput if this - * is not a seeky process (and has bumped into - * this timeout because of, e.g., ZBR). + * We double the budget here because: 1) it + * gives the chance to boost the throughput if + * this is not a seeky process (which may have + * bumped into this timeout because of, e.g., + * ZBR), 2) together with charge_full_budget + * it helps give seeky processes higher + * timestamps, and hence be served less + * frequently. 
*/ budget = min(budget * 2, bfqd->bfq_max_budget); break; @@ -2778,49 +1961,17 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, budget = min(budget * 4, bfqd->bfq_max_budget); break; case BFQ_BFQQ_NO_MORE_REQUESTS: - /* - * For queues that expire for this reason, it - * is particularly important to keep the - * budget close to the actual service they - * need. Doing so reduces the timestamp - * misalignment problem described in the - * comments in the body of - * __bfq_activate_entity. In fact, suppose - * that a queue systematically expires for - * BFQ_BFQQ_NO_MORE_REQUESTS and presents a - * new request in time to enjoy timestamp - * back-shifting. The larger the budget of the - * queue is with respect to the service the - * queue actually requests in each service - * slot, the more times the queue can be - * reactivated with the same virtual finish - * time. It follows that, even if this finish - * time is pushed to the system virtual time - * to reduce the consequent timestamp - * misalignment, the queue unjustly enjoys for - * many re-activations a lower finish time - * than all newly activated queues. - * - * The service needed by bfqq is measured - * quite precisely by bfqq->entity.service. - * Since bfqq does not enjoy device idling, - * bfqq->entity.service is equal to the number - * of sectors that the process associated with - * bfqq requested to read/write before waiting - * for request completions, or blocking for - * other reasons. - */ - budget = max_t(int, bfqq->entity.service, min_budget); - break; + /* + * Leave the budget unchanged. + */ default: return; } - } else if (!bfq_bfqq_sync(bfqq)) + } else /* - * Async queues get always the maximum possible - * budget, as for them we do not care about latency - * (in addition, their ability to dispatch is limited - * by the charging factor). + * Async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). 
*/ budget = bfqd->bfq_max_budget; @@ -2831,120 +1982,160 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); /* - * If there is still backlog, then assign a new budget, making - * sure that it is large enough for the next request. Since - * the finish time of bfqq must be kept in sync with the - * budget, be sure to call __bfq_bfqq_expire() *after* this + * Make sure that we have enough budget for the next request. + * Since the finish time of the bfqq must be kept in sync with + * the budget, be sure to call __bfq_bfqq_expire() after the * update. - * - * If there is no backlog, then no need to update the budget; - * it will be updated on the arrival of a new request. */ next_rq = bfqq->next_rq; - if (next_rq) { - BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || - reason == BFQ_BFQQ_NO_MORE_REQUESTS); + if (next_rq) bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); - BUG_ON(!bfq_bfqq_busy(bfqq)); - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - } + else + bfqq->entity.budget = bfqq->max_budget; bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", next_rq ? blk_rq_sectors(next_rq) : 0, bfqq->entity.budget); } +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) +{ + unsigned long max_budget; + + /* + * The max_budget calculated when autotuning is equal to the + * amount of sectors transfered in timeout_sync at the + * estimated peak rate. + */ + max_budget = (unsigned long)(peak_rate * 1000 * + timeout >> BFQ_RATE_SHIFT); + + return max_budget; +} + /* - * Return true if the process associated with bfqq is "slow". The slow - * flag is used, in addition to the budget timeout, to reduce the - * amount of service provided to seeky processes, and thus reduce - * their chances to lower the throughput. More details in the comments - * on the function bfq_bfqq_expire(). 
- * - * An important observation is in order: as discussed in the comments - * on the function bfq_update_peak_rate(), with devices with internal - * queues, it is hard if ever possible to know when and for how long - * an I/O request is processed by the device (apart from the trivial - * I/O pattern where a new request is dispatched only after the - * previous one has been completed). This makes it hard to evaluate - * the real rate at which the I/O requests of each bfq_queue are - * served. In fact, for an I/O scheduler like BFQ, serving a - * bfq_queue means just dispatching its requests during its service - * slot (i.e., until the budget of the queue is exhausted, or the - * queue remains idle, or, finally, a timeout fires). But, during the - * service slot of a bfq_queue, around 100 ms at most, the device may - * be even still processing requests of bfq_queues served in previous - * service slots. On the opposite end, the requests of the in-service - * bfq_queue may be completed after the service slot of the queue - * finishes. - * - * Anyway, unless more sophisticated solutions are used - * (where possible), the sum of the sizes of the requests dispatched - * during the service slot of a bfq_queue is probably the only - * approximation available for the service received by the bfq_queue - * during its service slot. And this sum is the quantity used in this - * function to evaluate the I/O speed of a process. + * In addition to updating the peak rate, checks whether the process + * is "slow", and returns 1 if so. This slow flag is used, in addition + * to the budget timeout, to reduce the amount of service provided to + * seeky processes, and hence reduce their chances to lower the + * throughput. See the code for more details. 
*/ -static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason, - unsigned long *delta_ms) +static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool compensate, enum bfqq_expiration reason) { - ktime_t delta_ktime; - u32 delta_usecs; - bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ + u64 bw, usecs, expected, timeout; + ktime_t delta; + int update = 0; - if (!bfq_bfqq_sync(bfqq)) + if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) return false; if (compensate) - delta_ktime = bfqd->last_idling_start; + delta = bfqd->last_idling_start; else - delta_ktime = ktime_get(); - delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); - delta_usecs = ktime_to_us(delta_ktime); - - /* don't trust short/unrealistic values. */ - if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { - if (blk_queue_nonrot(bfqd->queue)) - /* - * give same worst-case guarantees as idling - * for seeky - */ - *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; - else /* charge at least one seek */ - *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; - - bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs); - - return slow; + delta = ktime_get(); + delta = ktime_sub(delta, bfqd->last_budget_start); + usecs = ktime_to_us(delta); + + /* Don't trust short/unrealistic values. */ + if (usecs < 100 || usecs >= LONG_MAX) + return false; + + /* + * Calculate the bandwidth for the last slice. We use a 64 bit + * value to store the peak rate, in sectors per usec in fixed + * point math. We do so to have enough precision in the estimate + * and to avoid overflows. + */ + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; + do_div(bw, (unsigned long)usecs); + + timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + /* + * Use only long (> 20ms) intervals to filter out spikes for + * the peak rate estimation. 
+ */ + if (usecs > 20000) { + if (bw > bfqd->peak_rate || + (!BFQQ_SEEKY(bfqq) && + reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { + bfq_log(bfqd, "measured bw =%llu", bw); + /* + * To smooth oscillations use a low-pass filter with + * alpha=7/8, i.e., + * new_rate = (7/8) * old_rate + (1/8) * bw + */ + do_div(bw, 8); + if (bw == 0) + return 0; + bfqd->peak_rate *= 7; + do_div(bfqd->peak_rate, 8); + bfqd->peak_rate += bw; + update = 1; + bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); + } + + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; + + if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) + bfqd->peak_rate_samples++; + + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && + update) { + int dev_type = blk_queue_nonrot(bfqd->queue); + + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd->peak_rate, + timeout); + bfq_log(bfqd, "new max_budget=%d", + bfqd->bfq_max_budget); + } + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + } } - *delta_ms = delta_usecs / USEC_PER_MSEC; - /* - * Use only long (> 20ms) intervals to filter out excessive - * spikes in service rate estimation. + * If the process has been served for a too short time + * interval to let its possible sequential accesses prevail on + * the initial seek time needed to move the disk head on the + * first sector it requested, then give the process a chance + * and for the moment return false. */ - if (delta_usecs > 20000) { - /* - * Caveat for rotational devices: processes doing I/O - * in the slower disk zones tend to be slow(er) even - * if not seeky. 
In this respect, the estimated peak - * rate is likely to be an average over the disk - * surface. Accordingly, to not be too harsh with - * unlucky processes, a process is deemed slow only if - * its rate has been lower than half of the estimated - * peak rate. - */ - slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; - bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", - bfqq->entity.service, bfqd->bfq_max_budget); - } + if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) + return false; - bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); + /* + * A process is considered ``slow'' (i.e., seeky, so that we + * cannot treat it fairly in the service domain, as it would + * slow down too much the other processes) if, when a slice + * ends for whatever reason, it has received service at a + * rate that would not be high enough to complete the budget + * before the budget timeout expiration. + */ + expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; - return slow; + /* + * Caveat: processes doing IO in the slower disk zones will + * tend to be slow(er) even if not seeky. And the estimated + * peak rate will actually be an average over the disk + * surface. Hence, to not be too harsh with unlucky processes, + * we keep a budget/3 margin of safety before declaring a + * process slow. 
+ */ + return expected > (4 * bfqq->entity.budget) / 3; } /* @@ -3002,35 +2193,20 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - bfq_log_bfqq(bfqd, bfqq, -"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", - bfqq->service_from_backlogged, - bfqd->bfq_wr_max_softrt_rate, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate)); - return max(bfqq->last_idle_bklogged + HZ * bfqq->service_from_backlogged / bfqd->bfq_wr_max_softrt_rate, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -} - -/* - * Return the farthest future time instant according to jiffies - * macros. - */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; + jiffies + bfqq->bfqd->bfq_slice_idle + 4); } /* - * Return the farthest past time instant according to jiffies - * macros. + * Return the largest-possible time instant such that, for as long as possible, + * the current time will be lower than this time instant according to the macro + * time_is_before_jiffies(). */ -static unsigned long bfq_smallest_from_now(void) +static unsigned long bfq_infinity_from_now(unsigned long now) { - return jiffies - MAX_JIFFY_OFFSET; + return now + ULONG_MAX / 2; } /** @@ -3040,24 +2216,28 @@ static unsigned long bfq_smallest_from_now(void) * @compensate: if true, compensate for the time spent idling. * @reason: the reason causing the expiration. * - * If the process associated with bfqq does slow I/O (e.g., because it - * issues random requests), we charge bfqq with the time it has been - * in service instead of the service it has received (see - * bfq_bfqq_charge_time for details on how this goal is achieved). 
As - * a consequence, bfqq will typically get higher timestamps upon - * reactivation, and hence it will be rescheduled as if it had - * received more service than what it has actually received. In the - * end, bfqq receives less service in proportion to how slowly its - * associated process consumes its budgets (and hence how seriously it - * tends to lower the throughput). In addition, this time-charging - * strategy guarantees time fairness among slow processes. In - * contrast, if the process associated with bfqq is not slow, we - * charge bfqq exactly with the service it has received. * - * Charging time to the first type of queues and the exact service to - * the other has the effect of using the WF2Q+ policy to schedule the - * former on a timeslice basis, without violating service domain - * guarantees among the latter. + * If the process associated to the queue is slow (i.e., seeky), or in + * case of budget timeout, or, finally, if it is async, we + * artificially charge it an entire budget (independently of the + * actual service it received). As a consequence, the queue will get + * higher timestamps than the correct ones upon reactivation, and + * hence it will be rescheduled as if it had received more service + * than what it actually received. In the end, this class of processes + * will receive less service in proportion to how slowly they consume + * their budgets (and hence how seriously they tend to lower the + * throughput). + * + * In contrast, when a queue expires because it has been idling for + * too much or because it exhausted its budget, we do not touch the + * amount of service it has received. Hence when the queue will be + * reactivated and its timestamps updated, the latter will be in sync + * with the actual service received by the queue until expiration. 
+ * + * Charging a full budget to the first type of queues and the exact + * service to the others has the effect of using the WF2Q+ policy to + * schedule the former on a timeslice basis, without violating the + * service domain guarantees of the latter. */ static void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -3065,52 +2245,41 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, enum bfqq_expiration reason) { bool slow; - unsigned long delta = 0; - struct bfq_entity *entity = &bfqq->entity; BUG_ON(bfqq != bfqd->in_service_queue); /* - * Check whether the process is slow (see bfq_bfqq_is_slow). - */ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - - /* - * Increase service_from_backlogged before next statement, - * because the possible next invocation of - * bfq_bfqq_charge_time would likely inflate - * entity->service. In contrast, service_from_backlogged must - * contain real service, to enable the soft real-time - * heuristic to correctly compute the bandwidth consumed by - * bfqq. + * Update disk peak rate for autotuning and check whether the + * process is slow (see bfq_update_peak_rate). */ - bfqq->service_from_backlogged += entity->service; + slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); /* - * As above explained, charge slow (typically seeky) and - * timed-out queues with the time and not the service - * received, to favor sequential workloads. + * As above explained, 'punish' slow (i.e., seeky), timed-out + * and async queues, to favor sequential sync workloads. * - * Processes doing I/O in the slower disk zones will tend to - * be slow(er) even if not seeky. Therefore, since the - * estimated peak rate is actually an average over the disk - * surface, these processes may timeout just for bad luck. To - * avoid punishing them, do not charge time to processes that - * succeeded in consuming at least 2/3 of their budget. 
This - * allows BFQ to preserve enough elasticity to still perform - * bandwidth, and not time, distribution with little unlucky - * or quasi-sequential processes. + * Processes doing I/O in the slower disk zones will tend to be + * slow(er) even if not seeky. Hence, since the estimated peak + * rate is actually an average over the disk surface, these + * processes may timeout just for bad luck. To avoid punishing + * them we do not charge a full budget to a process that + * succeeded in consuming at least 2/3 of its budget. */ - if (bfqq->wr_coeff == 1 && - (slow || - (reason == BFQ_BFQQ_BUDGET_TIMEOUT && - bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) - bfq_bfqq_charge_time(bfqd, bfqq, delta); + if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) + bfq_bfqq_charge_full_budget(bfqq); - BUG_ON(bfqq->entity.budget < bfqq->entity.service); + bfqq->service_from_backlogged += bfqq->entity.service; + + if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && + !bfq_bfqq_constantly_seeky(bfqq)) { + bfq_mark_bfqq_constantly_seeky(bfqq); + if (!blk_queue_nonrot(bfqd->queue)) + bfqd->const_seeky_busy_in_flight_queues++; + } if (reason == BFQ_BFQQ_TOO_IDLE && - entity->service <= 2 * entity->budget / 10) + bfqq->entity.service <= 2 * bfqq->entity.budget / 10) bfq_clear_bfqq_IO_bound(bfqq); if (bfqd->low_latency && bfqq->wr_coeff == 1) @@ -3119,23 +2288,19 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && RB_EMPTY_ROOT(&bfqq->sort_list)) { /* - * If we get here, and there are no outstanding - * requests, then the request pattern is isochronous - * (see the comments on the function - * bfq_bfqq_softrt_next_start()). Thus we can compute - * soft_rt_next_start. 
If, instead, the queue still - * has outstanding requests, then we have to wait for - * the completion of all the outstanding requests to + * If we get here, and there are no outstanding requests, + * then the request pattern is isochronous (see the comments + * to the function bfq_bfqq_softrt_next_start()). Hence we + * can compute soft_rt_next_start. If, instead, the queue + * still has outstanding requests, then we have to wait + * for the completion of all the outstanding requests to * discover whether the request pattern is actually * isochronous. */ - BUG_ON(bfqd->busy_queues < 1); - if (bfqq->dispatched == 0) { + if (bfqq->dispatched == 0) bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", - bfqq->soft_rt_next_start); - } else { + else { /* * The application is still waiting for the * completion of one or more requests: @@ -3152,7 +2317,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * happened to be in the past. */ bfqq->soft_rt_next_start = - bfq_greatest_from_now(); + bfq_infinity_from_now(jiffies); /* * Schedule an update of soft_rt_next_start to when * the task may be discovered to be isochronous. @@ -3162,27 +2327,15 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, } bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", - reason, slow, bfqq->dispatched, - bfq_bfqq_idle_window(bfqq), entity->weight); + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); /* * Increase, decrease or leave budget unchanged according to * reason. 
*/ - BUG_ON(bfqq->entity.budget < bfqq->entity.service); __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); __bfq_bfqq_expire(bfqd, bfqq); - - BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && - !bfq_class_idle(bfqq)); - - if (!bfq_bfqq_busy(bfqq) && - reason != BFQ_BFQQ_BUDGET_TIMEOUT && - reason != BFQ_BFQQ_BUDGET_EXHAUSTED) - bfq_mark_bfqq_non_blocking_wait_rq(bfqq); } /* @@ -3192,17 +2345,20 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, */ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) { - return time_is_before_eq_jiffies(bfqq->budget_timeout); + if (bfq_bfqq_budget_new(bfqq) || + time_before(jiffies, bfqq->budget_timeout)) + return false; + return true; } /* - * If we expire a queue that is actively waiting (i.e., with the - * device idled) for the arrival of a new request, then we may incur - * the timestamp misalignment problem described in the body of the - * function __bfq_activate_entity. Hence we return true only if this - * condition does not hold, or if the queue is slow enough to deserve - * only to be kicked off for preserving a high throughput. - */ + * If we expire a queue that is waiting for the arrival of a new + * request, we may prevent the fictitious timestamp back-shifting that + * allows the guarantees of the queue to be preserved (see [1] for + * this tricky aspect). Hence we return true only if this condition + * does not hold, or if the queue is slow enough to deserve only to be + * kicked off for preserving a high throughput. 
+*/ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, @@ -3244,12 +2400,10 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; bool idling_boosts_thr, idling_boosts_thr_without_issues, + all_queues_seeky, on_hdd_and_not_all_queues_seeky, idling_needed_for_service_guarantees, asymmetric_scenario; - if (bfqd->strict_guarantees) - return true; - /* * The next variable takes into account the cases where idling * boosts the throughput. @@ -3312,27 +2466,74 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) bfqd->wr_busy_queues == 0; /* - * There is then a case where idling must be performed not + * There are then two cases where idling must be performed not * for throughput concerns, but to preserve service - * guarantees. + * guarantees. In the description of these cases, we say, for + * short, that a queue is sequential/random if the process + * associated to the queue issues sequential/random requests + * (in the second case the queue may be tagged as seeky or + * even constantly_seeky). * - * To introduce this case, we can note that allowing the drive - * to enqueue more than one request at a time, and hence - * delegating de facto final scheduling decisions to the - * drive's internal scheduler, entails loss of control on the - * actual request service order. In particular, the critical - * situation is when requests from different processes happen - * to be present, at the same time, in the internal queue(s) - * of the drive. In such a situation, the drive, by deciding - * the service order of the internally-queued requests, does - * determine also the actual throughput distribution among - * these processes. But the drive typically has no notion or - * concern about per-process throughput distribution, and - * makes its decisions only on a per-request basis. 
Therefore, - * the service distribution enforced by the drive's internal - * scheduler is likely to coincide with the desired - * device-throughput distribution only in a completely - * symmetric scenario where: + * To introduce the first case, we note that, since + * bfq_bfqq_idle_window(bfqq) is false if the device is + * NCQ-capable and bfqq is random (see + * bfq_update_idle_window()), then, from the above two + * assignments it follows that + * idling_boosts_thr_without_issues is false if the device is + * NCQ-capable and bfqq is random. Therefore, for this case, + * device idling would never be allowed if we used just + * idling_boosts_thr_without_issues to decide whether to allow + * it. And, beneficially, this would imply that throughput + * would always be boosted also with random I/O on NCQ-capable + * HDDs. + * + * But we must be careful on this point, to avoid an unfair + * treatment for bfqq. In fact, because of the same above + * assignments, idling_boosts_thr_without_issues is, on the + * other hand, true if 1) the device is an HDD and bfqq is + * sequential, and 2) there are no busy weight-raised + * queues. As a consequence, if we used just + * idling_boosts_thr_without_issues to decide whether to idle + * the device, then with an HDD we might easily bump into a + * scenario where queues that are sequential and I/O-bound + * would enjoy idling, whereas random queues would not. The + * latter might then get a low share of the device throughput, + * simply because the former would get many requests served + * after being set as in service, while the latter would not. + * + * To address this issue, we start by setting to true a + * sentinel variable, on_hdd_and_not_all_queues_seeky, if the + * device is rotational and not all queues with pending or + * in-flight requests are constantly seeky (i.e., there are + * active sequential queues, and bfqq might then be mistreated + * if it does not enjoy idling because it is random). 
+ */ + all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && + bfqd->busy_in_flight_queues == + bfqd->const_seeky_busy_in_flight_queues; + + on_hdd_and_not_all_queues_seeky = + !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; + + /* + * To introduce the second case where idling needs to be + * performed to preserve service guarantees, we can note that + * allowing the drive to enqueue more than one request at a + * time, and hence delegating de facto final scheduling + * decisions to the drive's internal scheduler, causes loss of + * control on the actual request service order. In particular, + * the critical situation is when requests from different + * processes happens to be present, at the same time, in the + * internal queue(s) of the drive. In such a situation, the + * drive, by deciding the service order of the + * internally-queued requests, does determine also the actual + * throughput distribution among these processes. But the + * drive typically has no notion or concern about per-process + * throughput distribution, and makes its decisions only on a + * per-request basis. Therefore, the service distribution + * enforced by the drive's internal scheduler is likely to + * coincide with the desired device-throughput distribution + * only in a completely symmetric scenario where: * (i) each of these processes must get the same throughput as * the others; * (ii) all these processes have the same I/O pattern @@ -3354,53 +2555,26 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * words, only if sub-condition (i) holds, then idling is * allowed, and the device tends to be prevented from queueing * many requests, possibly of several processes. The reason - * for not controlling also sub-condition (ii) is that we - * exploit preemption to preserve guarantees in case of - * symmetric scenarios, even if (ii) does not hold, as - * explained in the next two paragraphs. 
- * - * Even if a queue, say Q, is expired when it remains idle, Q - * can still preempt the new in-service queue if the next - * request of Q arrives soon (see the comments on - * bfq_bfqq_update_budg_for_activation). If all queues and - * groups have the same weight, this form of preemption, - * combined with the hole-recovery heuristic described in the - * comments on function bfq_bfqq_update_budg_for_activation, - * are enough to preserve a correct bandwidth distribution in - * the mid term, even without idling. In fact, even if not - * idling allows the internal queues of the device to contain - * many requests, and thus to reorder requests, we can rather - * safely assume that the internal scheduler still preserves a - * minimum of mid-term fairness. The motivation for using - * preemption instead of idling is that, by not idling, - * service guarantees are preserved without minimally - * sacrificing throughput. In other words, both a high - * throughput and its desired distribution are obtained. - * - * More precisely, this preemption-based, idleless approach - * provides fairness in terms of IOPS, and not sectors per - * second. This can be seen with a simple example. Suppose - * that there are two queues with the same weight, but that - * the first queue receives requests of 8 sectors, while the - * second queue receives requests of 1024 sectors. In - * addition, suppose that each of the two queues contains at - * most one request at a time, which implies that each queue - * always remains idle after it is served. Finally, after - * remaining idle, each queue receives very quickly a new - * request. It follows that the two queues are served - * alternatively, preempting each other if needed. This - * implies that, although both queues have the same weight, - * the queue with large requests receives a service that is - * 1024/8 times as high as the service received by the other - * queue. 
- * - * On the other hand, device idling is performed, and thus - * pure sector-domain guarantees are provided, for the - * following queues, which are likely to need stronger - * throughput guarantees: weight-raised queues, and queues - * with a higher weight than other queues. When such queues - * are active, sub-condition (i) is false, which triggers - * device idling. + * for not controlling also sub-condition (ii) is that, first, + * in the case of an HDD, the asymmetry in terms of types of + * I/O patterns is already taken in to account in the above + * sentinel variable + * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a + * flash-based device, we prefer however to privilege + * throughput (and idling lowers throughput for this type of + * devices), for the following reasons: + * 1) differently from HDDs, the service time of random + * requests is not orders of magnitudes lower than the service + * time of sequential requests; thus, even if processes doing + * sequential I/O get a preferential treatment with respect to + * others doing random I/O, the consequences are not as + * dramatic as with HDDs; + * 2) if a process doing random I/O does need strong + * throughput guarantees, it is hopefully already being + * weight-raised, or the user is likely to have assigned it a + * higher weight than the other processes (and thus + * sub-condition (i) is likely to be false, which triggers + * idling). * * According to the above considerations, the next variable is * true (only) if sub-condition (i) holds. To compute the @@ -3408,7 +2582,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * the function bfq_symmetric_scenario(), but also check * whether bfqq is being weight-raised, because * bfq_symmetric_scenario() does not take into account also - * weight-raised queues (see comments on + * weight-raised queues (see comments to * bfq_weights_tree_add()). 
* * As a side note, it is worth considering that the above @@ -3430,16 +2604,17 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * bfqq. Such a case is when bfqq became active in a burst of * queue activations. Queues that became active during a large * burst benefit only from throughput, as discussed in the - * comments on bfq_handle_burst. Thus, if bfqq became active + * comments to bfq_handle_burst. Thus, if bfqq became active * in a burst and not idling the device maximizes throughput, * then the device must no be idled, because not idling the * device provides bfqq and all other queues in the burst with - * maximum benefit. Combining this and the above case, we can - * now establish when idling is actually needed to preserve - * service guarantees. + * maximum benefit. Combining this and the two cases above, we + * can now establish when idling is actually needed to + * preserve service guarantees. */ idling_needed_for_service_guarantees = - asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); + (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && + !bfq_bfqq_in_large_burst(bfqq); /* * We have now all the components we need to compute the return @@ -3449,16 +2624,6 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * 2) idling either boosts the throughput (without issues), or * is necessary to preserve service guarantees. 
*/ - bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); - - bfq_log_bfqq(bfqd, bfqq, - "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", - bfqd->wr_busy_queues, - idling_boosts_thr_without_issues, - bfq_bfqq_IO_bound(bfqq), - idling_needed_for_service_guarantees); - return bfq_bfqq_sync(bfqq) && (idling_boosts_thr_without_issues || idling_needed_for_service_guarantees); @@ -3470,7 +2635,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * 1) the queue must remain in service and cannot be expired, and * 2) the device must be idled to wait for the possible arrival of a new * request for the queue. - * See the comments on the function bfq_bfqq_may_idle for the reasons + * See the comments to the function bfq_bfqq_may_idle for the reasons * why performing device idling is the best choice to boost the throughput * and preserve service guarantees when bfq_bfqq_may_idle itself * returns true. @@ -3500,7 +2665,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); if (bfq_may_expire_for_budg_timeout(bfqq) && - !hrtimer_active(&bfqd->idle_slice_timer) && + !timer_pending(&bfqd->idle_slice_timer) && !bfq_bfqq_must_idle(bfqq)) goto expire; @@ -3520,8 +2685,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * not disable disk idling even when a new request * arrives. */ - if (bfq_bfqq_wait_request(bfqq)) { - BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); + if (timer_pending(&bfqd->idle_slice_timer)) { /* * If we get here: 1) at least a new request * has arrived but we have not disabled the @@ -3536,8 +2700,10 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * So we disable idling. 
*/ bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + del_timer(&bfqd->idle_slice_timer); +#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_idle_time(bfqq_group(bfqq)); +#endif } goto keep_queue; } @@ -3548,7 +2714,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * for a new request, or has requests waiting for a completion and * may idle after their completion, then keep it anyway. */ - if (hrtimer_active(&bfqd->idle_slice_timer) || + if (timer_pending(&bfqd->idle_slice_timer) || (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { bfqq = NULL; goto keep_queue; @@ -3570,9 +2736,6 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) struct bfq_entity *entity = &bfqq->entity; if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ - BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - time_is_after_jiffies(bfqq->last_wr_start_finish)); - bfq_log_bfqq(bfqd, bfqq, "raising period dur %u/%u msec, old coeff %u, w %d(%d)", jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), @@ -3586,30 +2749,22 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); /* - * If the queue was activated in a burst, or too much - * time has elapsed from the beginning of this - * weight-raising period, then end weight raising. + * If the queue was activated in a burst, or + * too much time has elapsed from the beginning + * of this weight-raising period, or the queue has + * exceeded the acceptable number of cooperations, + * then end weight raising. 
*/ - if (bfq_bfqq_in_large_burst(bfqq)) - bfq_bfqq_end_wr(bfqq); - else if (time_is_before_jiffies(bfqq->last_wr_start_finish + + if (bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || + time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time)) { - if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || - time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + - bfq_wr_duration(bfqd))) - bfq_bfqq_end_wr(bfqq); - else { - /* switch back to interactive wr */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqq->last_wr_start_finish = - bfqq->wr_start_at_switch_to_srt; - BUG_ON(time_is_after_jiffies( - bfqq->last_wr_start_finish)); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqd, bfqq, - "back to interactive wr"); - } + bfqq->last_wr_start_finish = jiffies; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_bfqq_end_wr(bfqq); } } /* Update weight both if it must be raised and if it must be lowered */ @@ -3660,29 +2815,13 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, */ if (!bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); - BUG_ON(bfqq->entity.budget < bfqq->entity.service); goto expire; } - BUG_ON(bfqq->entity.budget < bfqq->entity.service); /* Finally, insert request into driver dispatch list. */ bfq_bfqq_served(bfqq, service_to_charge); - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - bfq_dispatch_insert(bfqd->queue, rq); - /* - * If weight raising has to terminate for bfqq, then next - * function causes an immediate update of bfqq's weight, - * without waiting for next activation. As a consequence, on - * expiration, bfqq will be timestamped as if has never been - * weight-raised during this service slot, even if it has - * received part or even most of the service as a - * weight-raised queue. 
This inflates bfqq's timestamps, which - * is beneficial, as bfqq is then more willing to leave the - * device immediately to possible other weight-raised queues. - */ bfq_update_wr_data(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, @@ -3698,7 +2837,9 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, bfqd->in_service_bic = RQ_BIC(rq); } - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && + dispatched >= bfqd->bfq_max_budget_async_rq) || + bfq_class_idle(bfqq))) goto expire; return dispatched; @@ -3744,8 +2885,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) st = bfq_entity_service_tree(&bfqq->entity); dispatched += __bfq_forced_dispatch_bfqq(bfqq); - bfqq->max_budget = bfq_max_budget(bfqd); + bfq_forget_idle(st); } @@ -3758,37 +2899,37 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq; + int max_dispatch; bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); - if (bfqd->busy_queues == 0) return 0; if (unlikely(force)) return bfq_forced_dispatch(bfqd); - /* - * Force device to serve one request at a time if - * strict_guarantees is true. Forcing this service scheme is - * currently the ONLY way to guarantee that the request - * service order enforced by the scheduler is respected by a - * queueing device. Otherwise the device is free even to make - * some unlucky request wait for as long as the device - * wishes. - * - * Of course, serving one request at at time may cause loss of - * throughput. 
- */ - if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) - return 0; - bfqq = bfq_select_queue(bfqd); if (!bfqq) return 0; - BUG_ON(bfqq->entity.budget < bfqq->entity.service); + if (bfq_class_idle(bfqq)) + max_dispatch = 1; + + if (!bfq_bfqq_sync(bfqq)) + max_dispatch = bfqd->bfq_max_budget_async_rq; + + if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { + if (bfqd->busy_queues > 1) + return 0; + if (bfqq->dispatched >= 4 * max_dispatch) + return 0; + } + + if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) + return 0; - BUG_ON(bfq_bfqq_wait_request(bfqq)); + bfq_clear_bfqq_wait_request(bfqq); + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); if (!bfq_dispatch_request(bfqd, bfqq)) return 0; @@ -3796,8 +2937,6 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", bfq_bfqq_sync(bfqq) ? "sync" : "async"); - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); return 1; } @@ -3809,21 +2948,23 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) */ static void bfq_put_queue(struct bfq_queue *bfqq) { + struct bfq_data *bfqd = bfqq->bfqd; #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqg = bfqq_group(bfqq); #endif - BUG_ON(bfqq->ref <= 0); + BUG_ON(atomic_read(&bfqq->ref) <= 0); - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); - bfqq->ref--; - if (bfqq->ref) + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, + atomic_read(&bfqq->ref)); + if (!atomic_dec_and_test(&bfqq->ref)) return; BUG_ON(rb_first(&bfqq->sort_list)); BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqd->in_service_queue == bfqq); if (bfq_bfqq_sync(bfqq)) /* @@ -3836,7 +2977,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) */ hlist_del_init(&bfqq->burst_list_node); - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + bfq_log_bfqq(bfqd, bfqq, 
"put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -3870,7 +3011,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_schedule_dispatch(bfqd); } - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, + atomic_read(&bfqq->ref)); bfq_put_cooperator(bfqq); @@ -3879,7 +3021,28 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) static void bfq_init_icq(struct io_cq *icq) { - icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); + struct bfq_io_cq *bic = icq_to_bic(icq); + + bic->ttime.last_end_request = jiffies; + /* + * A newly created bic indicates that the process has just + * started doing I/O, and is probably mapping into memory its + * executable and libraries: it definitely needs weight raising. + * There is however the possibility that the process performs, + * for a while, I/O close to some other process. EQM intercepts + * this behavior and may merge the queue corresponding to the + * process with some other queue, BEFORE the weight of the queue + * is raised. Merged queues are not weight-raised (they are assumed + * to belong to processes that benefit only from high throughput). + * If the merge is basically the consequence of an accident, then + * the queue will be split soon and will get back its old weight. + * It is then important to write down somewhere that this queue + * does need weight raising, even if it did not make it to get its + * weight raised before being merged. To this purpose, we overload + * the field raising_time_left and assign 1 to it, to mark the queue + * as needing weight raising. 
+ */ + bic->wr_time_left = 1; } static void bfq_exit_icq(struct io_cq *icq) @@ -3887,21 +3050,21 @@ static void bfq_exit_icq(struct io_cq *icq) struct bfq_io_cq *bic = icq_to_bic(icq); struct bfq_data *bfqd = bic_to_bfqd(bic); - if (bic_to_bfqq(bic, false)) { - bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); - bic_set_bfqq(bic, NULL, false); + if (bic->bfqq[BLK_RW_ASYNC]) { + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); + bic->bfqq[BLK_RW_ASYNC] = NULL; } - if (bic_to_bfqq(bic, true)) { + if (bic->bfqq[BLK_RW_SYNC]) { /* * If the bic is using a shared queue, put the reference * taken on the io_context when the bic started using a * shared bfq_queue. */ - if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) + if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) put_io_context(icq->ioc); - bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); - bic_set_bfqq(bic, NULL, true); + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); + bic->bfqq[BLK_RW_SYNC] = NULL; } } @@ -3909,8 +3072,8 @@ static void bfq_exit_icq(struct io_cq *icq) * Update the entity prio values; note that the new values will not * be used until the next (re)activation. 
*/ -static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - struct bfq_io_cq *bic) +static void +bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) { struct task_struct *tsk = current; int ioprio_class; @@ -3942,7 +3105,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, break; } - if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); BUG(); @@ -3950,40 +3113,45 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, - "set_next_ioprio_data: bic_class %d prio %d class %d", - ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); } static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { - struct bfq_data *bfqd = bic_to_bfqd(bic); - struct bfq_queue *bfqq; + struct bfq_data *bfqd; + struct bfq_queue *bfqq, *new_bfqq; unsigned long uninitialized_var(flags); int ioprio = bic->icq.ioc->ioprio; + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), + &flags); /* * This condition may trigger on a newly created bic, be sure to * drop the lock before returning. 
*/ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) - return; + goto out; bic->ioprio = ioprio; - bfqq = bic_to_bfqq(bic, false); + bfqq = bic->bfqq[BLK_RW_ASYNC]; if (bfqq) { - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); - bic_set_bfqq(bic, bfqq, false); - bfq_log_bfqq(bfqd, bfqq, - "check_ioprio_change: bfqq %p %d", - bfqq, bfqq->ref); + new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, + GFP_ATOMIC); + if (new_bfqq) { + bic->bfqq[BLK_RW_ASYNC] = new_bfqq; + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } } - bfqq = bic_to_bfqq(bic, true); + bfqq = bic->bfqq[BLK_RW_SYNC]; if (bfqq) bfq_set_next_ioprio_data(bfqq, bic); + +out: + bfq_put_bfqd_unlock(bfqd, &flags); } static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -3992,9 +3160,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - bfqq->ref = 0; + atomic_set(&bfqq->ref, 0); bfqq->bfqd = bfqd; if (bic) @@ -4004,7 +3171,6 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfq_class_idle(bfqq)) bfq_mark_bfqq_idle_window(bfqq); bfq_mark_bfqq_sync(bfqq); - bfq_mark_bfqq_just_created(bfqq); } else bfq_clear_bfqq_sync(bfqq); bfq_mark_bfqq_IO_bound(bfqq); @@ -4014,19 +3180,72 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->pid = pid; bfqq->wr_coeff = 1; - bfqq->last_wr_start_finish = jiffies; - bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); - bfqq->budget_timeout = bfq_smallest_from_now(); - bfqq->split_time = bfq_smallest_from_now(); - + bfqq->last_wr_start_finish = 0; /* * Set to the value for which bfqq will not be deemed as * soft rt when it becomes backlogged. 
*/ - bfqq->soft_rt_next_start = bfq_greatest_from_now(); + bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); +} + +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, + struct bio *bio, int is_sync, + struct bfq_io_cq *bic, + gfp_t gfp_mask) +{ + struct bfq_group *bfqg; + struct bfq_queue *bfqq, *new_bfqq = NULL; + struct blkcg *blkcg; + +retry: + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + bfqg = bfq_find_alloc_group(bfqd, blkcg); + /* bic always exists here */ + bfqq = bic_to_bfqq(bic, is_sync); + + /* + * Always try a new alloc if we fall back to the OOM bfqq + * originally, since it should just be a temporary situation. + */ + if (!bfqq || bfqq == &bfqd->oom_bfqq) { + bfqq = NULL; + if (new_bfqq) { + bfqq = new_bfqq; + new_bfqq = NULL; + } else if (gfpflags_allow_blocking(gfp_mask)) { + rcu_read_unlock(); + spin_unlock_irq(bfqd->queue->queue_lock); + new_bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + spin_lock_irq(bfqd->queue->queue_lock); + if (new_bfqq) + goto retry; + } else { + bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + } + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + } + } - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; + if (new_bfqq) + kmem_cache_free(bfq_pool, new_bfqq); + + rcu_read_unlock(); + + return bfqq; } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, @@ -4049,86 +3268,90 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, bool is_sync, - struct bfq_io_cq *bic) + struct bio *bio, int is_sync, + struct bfq_io_cq *bic, gfp_t gfp_mask) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); const int 
ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); struct bfq_queue **async_bfqq = NULL; - struct bfq_queue *bfqq; - struct bfq_group *bfqg; - - rcu_read_lock(); - - bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); - if (!bfqg) { - bfqq = &bfqd->oom_bfqq; - goto out; - } + struct bfq_queue *bfqq = NULL; if (!is_sync) { + struct blkcg *blkcg; + struct bfq_group *bfqg; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + rcu_read_unlock(); + bfqg = bfq_find_alloc_group(bfqd, blkcg); async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); bfqq = *async_bfqq; - if (bfqq) - goto out; } - bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO, - bfqd->queue->node); - - if (bfqq) { - bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); - bfq_init_entity(&bfqq->entity, bfqg); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { - bfqq = &bfqd->oom_bfqq; - bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); - goto out; - } + if (!bfqq) + bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); /* * Pin the queue now that it's allocated, scheduler exit will * prune it. 
*/ - if (async_bfqq) { - bfqq->ref++; + if (!is_sync && !(*async_bfqq)) { + atomic_inc(&bfqq->ref); bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", - bfqq, bfqq->ref); + bfqq, atomic_read(&bfqq->ref)); *async_bfqq = bfqq; } -out: - bfqq->ref++; - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); - rcu_read_unlock(); + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, + atomic_read(&bfqq->ref)); return bfqq; } static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_io_cq *bic) { - struct bfq_ttime *ttime = &bic->ttime; - u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; + unsigned long elapsed = jiffies - bic->ttime.last_end_request; + unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); - elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); - - ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; - ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); - ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, - ttime->ttime_samples); + bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; + bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / + bic->ttime.ttime_samples; } -static void -bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct request *rq) +static void bfq_update_io_seektime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *rq) { - bfqq->seek_history <<= 1; - bfqq->seek_history |= - get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && - (!blk_queue_nonrot(bfqd->queue) || - blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); + sector_t sdist; + u64 total; + + if (bfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - bfqq->last_request_pos; + else + sdist = bfqq->last_request_pos - blk_rq_pos(rq); + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, 
pagein, etc. + */ + if (bfqq->seek_samples == 0) /* first request, not really a seek */ + sdist = 0; + else if (bfqq->seek_samples <= 60) /* second & third seek */ + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); + + bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; + total = bfqq->seek_total + (bfqq->seek_samples/2); + do_div(total, bfqq->seek_samples); + bfqq->seek_mean = (sector_t)total; + + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, + (u64)bfqq->seek_mean); } /* @@ -4146,8 +3369,7 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, return; /* Idle window just restored, statistics are meaningless. */ - if (time_is_after_eq_jiffies(bfqq->split_time + - bfqd->bfq_wr_min_idle_time)) + if (bfq_bfqq_just_split(bfqq)) return; enable_idle = bfq_bfqq_idle_window(bfqq); @@ -4187,13 +3409,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_update_io_thinktime(bfqd, bic); bfq_update_io_seektime(bfqd, bfqq, rq); + if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { + bfq_clear_bfqq_constantly_seeky(bfqq); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || !BFQQ_SEEKY(bfqq)) bfq_update_idle_window(bfqd, bfqq, bic); + bfq_clear_bfqq_just_split(bfqq); bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), + (unsigned long long) bfqq->seek_mean); bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); @@ -4207,15 +3438,14 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * is small and the queue is not to be expired, then * just 
exit. * - * In this way, if the device is being idled to wait - * for a new request from the in-service queue, we - * avoid unplugging the device and committing the - * device to serve just a small request. On the - * contrary, we wait for the block layer to decide - * when to unplug the device: hopefully, new requests - * will be merged to this one quickly, then the device - * will be unplugged and larger requests will be - * dispatched. + * In this way, if the disk is being idled to wait for + * a new request from the in-service queue, we avoid + * unplugging the device and committing the disk to serve + * just a small request. On the contrary, we wait for + * the block layer to decide when to unplug the device: + * hopefully, new requests will be merged to this one + * quickly, then the device will be unplugged and + * larger requests will be dispatched. */ if (small_req && !budget_timeout) return; @@ -4227,8 +3457,10 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * timer. 
*/ bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + del_timer(&bfqd->idle_slice_timer); +#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_idle_time(bfqq_group(bfqq)); +#endif /* * The queue is not empty, because a new request just @@ -4272,20 +3504,28 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) */ new_bfqq->allocated[rq_data_dir(rq)]++; bfqq->allocated[rq_data_dir(rq)]--; - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); + atomic_inc(&new_bfqq->ref); bfq_put_queue(bfqq); if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); rq->elv.priv[1] = new_bfqq; bfqq = new_bfqq; - } + } else + bfq_bfqq_increase_failed_cooperations(bfqq); } bfq_add_request(rq); - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + /* + * Here a newly-created bfq_queue has already started a weight-raising + * period: clear raising_time_left to prevent bfq_bfqq_save_state() + * from assigning it a full weight-raising period. See the detailed + * comments about this field in bfq_init_icq(). 
+ */ + if (bfqq->bic) + bfqq->bic->wr_time_left = 0; + rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); @@ -4293,8 +3533,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) static void bfq_update_hw_tag(struct bfq_data *bfqd) { - bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, - bfqd->rq_in_driver); + bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, + bfqd->rq_in_driver); if (bfqd->hw_tag == 1) return; @@ -4320,85 +3560,48 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; - u64 now_ns; - u32 delta_us; + bool sync = bfq_bfqq_sync(bfqq); - bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", - blk_rq_sectors(rq)); + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", + blk_rq_sectors(rq), sync); - assert_spin_locked(bfqd->queue->queue_lock); bfq_update_hw_tag(bfqd); BUG_ON(!bfqd->rq_in_driver); BUG_ON(!bfqq->dispatched); bfqd->rq_in_driver--; bfqq->dispatched--; +#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_completion(bfqq_group(bfqq), rq_start_time_ns(rq), - rq_io_start_time_ns(rq), req_op(rq), - rq->cmd_flags); + rq_io_start_time_ns(rq), rq->cmd_flags); +#endif if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - /* - * Set budget_timeout (which we overload to store the - * time at which the queue remains with no backlog and - * no outstanding request; used by the weight-raising - * mechanism). 
- */ - bfqq->budget_timeout = jiffies; - bfq_weights_tree_remove(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } } - now_ns = ktime_get_ns(); - - RQ_BIC(rq)->ttime.last_end_request = now_ns; - - /* - * Using us instead of ns, to get a reasonable precision in - * computing rate in next check. - */ - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - - bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - (USEC_PER_SEC* - (u64)((bfqd->last_rq_max_size<>BFQ_RATE_SHIFT, - (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); - - /* - * If the request took rather long to complete, and, according - * to the maximum request size recorded, this completion latency - * implies that the request was certainly served at a very low - * rate (less than 1M sectors/sec), then the whole observation - * interval that lasts up to this time instant cannot be a - * valid time interval for computing a new peak rate. 
Invoke - * bfq_update_rate_reset to have the following three steps - * taken: - * - close the observation interval at the last (previous) - * request dispatch or completion - * - compute rate, if possible, for that observation interval - * - reset to zero samples, which will trigger a proper - * re-initialization of the observation interval on next - * dispatch - */ - if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && - (bfqd->last_rq_max_size<last_completion = now_ns; + if (sync) { + bfqd->sync_flight--; + RQ_BIC(rq)->ttime.last_end_request = jiffies; + } /* - * If we are waiting to discover whether the request pattern - * of the task associated with the queue is actually - * isochronous, and both requisites for this condition to hold - * are now satisfied, then compute soft_rt_next_start (see the - * comments on the function bfq_bfqq_softrt_next_start()). We - * schedule this delayed check when bfqq expires, if it still - * has in-flight requests. + * If we are waiting to discover whether the request pattern of the + * task associated with the queue is actually isochronous, and + * both requisites for this condition to hold are satisfied, then + * compute soft_rt_next_start (see the comments to the function + * bfq_bfqq_softrt_next_start()). */ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && RB_EMPTY_ROOT(&bfqq->sort_list)) @@ -4410,7 +3613,10 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) * or if we want to idle in case it has no pending requests. 
*/ if (bfqd->in_service_queue == bfqq) { - if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { + if (bfq_bfqq_budget_new(bfqq)) + bfq_set_budget_timeout(bfqd); + + if (bfq_bfqq_must_idle(bfqq)) { bfq_arm_slice_timer(bfqd); goto out; } else if (bfq_may_expire_for_budg_timeout(bfqq)) @@ -4440,7 +3646,7 @@ static int __bfq_may_queue(struct bfq_queue *bfqq) return ELV_MQUEUE_MAY; } -static int bfq_may_queue(struct request_queue *q, int op, int op_flags) +static int bfq_may_queue(struct request_queue *q, int rw) { struct bfq_data *bfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -4457,7 +3663,7 @@ static int bfq_may_queue(struct request_queue *q, int op, int op_flags) if (!bic) return ELV_MQUEUE_MAY; - bfqq = bic_to_bfqq(bic, rw_is_sync(op, op_flags)); + bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); if (bfqq) return __bfq_may_queue(bfqq); @@ -4481,14 +3687,14 @@ static void bfq_put_request(struct request *rq) rq->elv.priv[1] = NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", - bfqq, bfqq->ref); + bfqq, atomic_read(&bfqq->ref)); bfq_put_queue(bfqq); } } /* * Returns NULL if a new bfqq should be allocated, or the old bfqq if this - * was the last process referring to that bfqq. + * was the last process referring to said bfqq. 
*/ static struct bfq_queue * bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) @@ -4526,9 +3732,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, unsigned long flags; bool split = false; - spin_lock_irqsave(q->queue_lock, flags); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + bfq_check_ioprio_change(bic, bio); + spin_lock_irqsave(q->queue_lock, flags); + if (!bic) goto queue_fail; @@ -4537,47 +3746,23 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, new_queue: bfqq = bic_to_bfqq(bic, is_sync); if (!bfqq || bfqq == &bfqd->oom_bfqq) { - if (bfqq) - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { - bfq_log_bfqq(bfqd, bfqq, - "set_request: was_in_list %d " - "was_in_large_burst %d " - "large burst in progress %d", - bic->was_in_burst_list, - bic->saved_in_large_burst, - bfqd->large_burst); - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) { - bfq_log_bfqq(bfqd, bfqq, - "set_request: marking in " - "large burst"); + bic->saved_in_large_burst) bfq_mark_bfqq_in_large_burst(bfqq); - } else { - bfq_log_bfqq(bfqd, bfqq, - "set_request: clearing in " - "large burst"); + else { bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } - bfqq->split_time = jiffies; } } else { /* If the queue was seeky for too long, break it apart. 
*/ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); - - /* Update bic before losing reference to bfqq */ - if (bfq_bfqq_in_large_burst(bfqq)) - bic->saved_in_large_burst = true; - bfqq = bfq_split_bfqq(bic, bfqq); split = true; if (!bfqq) @@ -4586,8 +3771,9 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, } bfqq->allocated[rw]++; - bfqq->ref++; - bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, + atomic_read(&bfqq->ref)); rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; @@ -4602,6 +3788,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { bfqq->bic = bic; if (split) { + bfq_mark_bfqq_just_split(bfqq); /* * If the queue has just been split from a shared * queue, restore the idle window and the possible @@ -4611,9 +3798,6 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, } } - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -4640,10 +3824,9 @@ static void bfq_kick_queue(struct work_struct *work) * Handler of the expiration of the timer running if the in-service queue * is idling inside its time slice. 
*/ -static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) +static void bfq_idle_slice_timer(unsigned long data) { - struct bfq_data *bfqd = container_of(timer, struct bfq_data, - idle_slice_timer); + struct bfq_data *bfqd = (struct bfq_data *)data; struct bfq_queue *bfqq; unsigned long flags; enum bfqq_expiration reason; @@ -4661,8 +3844,6 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) */ if (bfqq) { bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); - bfq_clear_bfqq_wait_request(bfqq); - if (bfq_bfqq_budget_timeout(bfqq)) /* * Also here the queue can be safely expired @@ -4688,16 +3869,14 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) bfq_schedule_dispatch(bfqd); spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); - return HRTIMER_NORESTART; } static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) { - hrtimer_cancel(&bfqd->idle_slice_timer); + del_timer_sync(&bfqd->idle_slice_timer); cancel_work_sync(&bfqd->unplug_work); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED static void __bfq_put_async_bfqq(struct bfq_data *bfqd, struct bfq_queue **bfqq_ptr) { @@ -4706,9 +3885,9 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, bfq_log(bfqd, "put_async_bfqq: %p", bfqq); if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, root_group); + bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", - bfqq, bfqq->ref); + bfqq, atomic_read(&bfqq->ref)); bfq_put_queue(bfqq); *bfqq_ptr = NULL; } @@ -4730,7 +3909,6 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); } -#endif static void bfq_exit_queue(struct elevator_queue *e) { @@ -4744,13 +3922,15 @@ static void bfq_exit_queue(struct elevator_queue *e) BUG_ON(bfqd->in_service_queue); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); + bfq_deactivate_bfqq(bfqd, bfqq, 0); 
spin_unlock_irq(q->queue_lock); bfq_shutdown_timer_wq(bfqd); - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); + synchronize_rcu(); + + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); #ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_deactivate_policy(q, &blkcg_policy_bfq); @@ -4774,7 +3954,6 @@ static void bfq_init_root_group(struct bfq_group *root_group, root_group->rq_pos_tree = RB_ROOT; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - root_group->sched_data.bfq_class_idle_last_service = jiffies; } static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) @@ -4799,14 +3978,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * will not attempt to free it. */ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); - bfqd->oom_bfqq.ref++; + atomic_inc(&bfqd->oom_bfqq.ref); bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; bfqd->oom_bfqq.entity.new_weight = bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); - - /* oom_bfqq does not participate to bursts */ - bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); /* * Trigger weight initialization, according to ioprio, at the * oom_bfqq's first activation. 
The oom_bfqq's ioprio and ioprio @@ -4825,10 +4001,13 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqd->active_numerous_groups = 0; +#endif - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); + init_timer(&bfqd->idle_slice_timer); bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + bfqd->idle_slice_timer.data = (unsigned long)bfqd; bfqd->queue_weights_tree = RB_ROOT; bfqd->group_weights_tree = RB_ROOT; @@ -4848,19 +4027,21 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_back_max = bfq_back_max; bfqd->bfq_back_penalty = bfq_back_penalty; bfqd->bfq_slice_idle = bfq_slice_idle; - bfqd->bfq_timeout = bfq_timeout; + bfqd->bfq_class_idle_last_service = 0; + bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + bfqd->bfq_coop_thresh = 2; + bfqd->bfq_failed_cooperations = 7000; bfqd->bfq_requests_within_timer = 120; - bfqd->bfq_large_burst_thresh = 8; - bfqd->bfq_burst_interval = msecs_to_jiffies(180); + bfqd->bfq_large_burst_thresh = 11; + bfqd->bfq_burst_interval = msecs_to_jiffies(500); bfqd->low_latency = true; - /* - * Trade-off between responsiveness and fairness. - */ - bfqd->bfq_wr_coeff = 30; + bfqd->bfq_wr_coeff = 20; bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); bfqd->bfq_wr_max_time = 0; bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); @@ -4872,15 +4053,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * video. 
*/ bfqd->wr_busy_queues = 0; + bfqd->busy_in_flight_queues = 0; + bfqd->const_seeky_busy_in_flight_queues = 0; /* - * Begin by assuming, optimistically, that the device is a - * high-speed one, and that its peak rate is equal to 2/3 of - * the highest reference rate. + * Begin by assuming, optimistically, that the device peak rate is + * equal to the highest reference rate. */ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * T_fast[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; bfqd->device_speed = BFQ_BFQD_FAST; return 0; @@ -4906,7 +4088,7 @@ static int __init bfq_slab_setup(void) static ssize_t bfq_var_show(unsigned int var, char *page) { - return sprintf(page, "%u\n", var); + return sprintf(page, "%d\n", var); } static ssize_t bfq_var_store(unsigned long *var, const char *page, @@ -4977,21 +4159,21 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct bfq_data *bfqd = e->elevator_data; \ - u64 __data = __VAR; \ - if (__CONV == 1) \ + unsigned int __data = __VAR; \ + if (__CONV) \ __data = jiffies_to_msecs(__data); \ - else if (__CONV == 2) \ - __data = div_u64(__data, NSEC_PER_MSEC); \ return bfq_var_show(__data, (page)); \ } -SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); 
-SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); +SHOW_FUNCTION(bfq_max_budget_async_rq_show, + bfqd->bfq_max_budget_async_rq, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); @@ -5001,17 +4183,6 @@ SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); #undef SHOW_FUNCTION -#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - u64 __data = __VAR; \ - __data = div_u64(__data, NSEC_PER_USEC); \ - return bfq_var_show(__data, (page)); \ -} -USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -#undef USEC_SHOW_FUNCTION - #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t \ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ @@ -5023,22 +4194,24 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ - if (__CONV == 1) \ + if (__CONV) \ *(__PTR) = msecs_to_jiffies(__data); \ - else if (__CONV == 2) \ - *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ else \ *(__PTR) = __data; \ return ret; \ } STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, - INT_MAX, 2); + INT_MAX, 1); STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, - INT_MAX, 2); + INT_MAX, 1); STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, INT_MAX, 0); 
-STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, + 1, INT_MAX, 0); +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, + INT_MAX, 1); STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, @@ -5051,23 +4224,6 @@ STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, INT_MAX, 0); #undef STORE_FUNCTION -#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned long uninitialized_var(__data); \ - int ret = bfq_var_store(&__data, (page), count); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - *(__PTR) = (u64)__data * NSEC_PER_USEC; \ - return ret; \ -} -USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, - UINT_MAX); -#undef USEC_STORE_FUNCTION - /* do nothing for the moment */ static ssize_t bfq_weights_store(struct elevator_queue *e, const char *page, size_t count) @@ -5075,6 +4231,16 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, return count; } +static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +{ + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) + return bfq_calc_max_budget(bfqd->peak_rate, timeout); + else + return bfq_default_max_budget; +} + static ssize_t bfq_max_budget_store(struct elevator_queue *e, const char *page, size_t count) { @@ -5083,7 +4249,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, int ret = 
bfq_var_store(&__data, (page), count); if (__data == 0) - bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); else { if (__data > INT_MAX) __data = INT_MAX; @@ -5095,10 +4261,6 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, return ret; } -/* - * Leaving this name to preserve name compatibility with cfq - * parameters, but this timeout is used for both sync and async. - */ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, const char *page, size_t count) { @@ -5111,27 +4273,9 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, else if (__data > INT_MAX) __data = INT_MAX; - bfqd->bfq_timeout = msecs_to_jiffies(__data); + bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); - - return ret; -} - -static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); - - if (__data > 1) - __data = 1; - if (!bfqd->strict_guarantees && __data == 1 - && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) - bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; - - bfqd->strict_guarantees = __data; + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); return ret; } @@ -5161,10 +4305,10 @@ static struct elv_fs_entry bfq_attrs[] = { BFQ_ATTR(back_seek_max), BFQ_ATTR(back_seek_penalty), BFQ_ATTR(slice_idle), - BFQ_ATTR(slice_idle_us), BFQ_ATTR(max_budget), + BFQ_ATTR(max_budget_async_rq), BFQ_ATTR(timeout_sync), - BFQ_ATTR(strict_guarantees), + BFQ_ATTR(timeout_async), BFQ_ATTR(low_latency), BFQ_ATTR(wr_coeff), BFQ_ATTR(wr_max_time), @@ -5184,8 +4328,7 @@ static struct elevator_type iosched_bfq = { #ifdef CONFIG_BFQ_GROUP_IOSCHED .elevator_bio_merged_fn = bfq_bio_merged, #endif - .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, - 
.elevator_allow_rq_merge_fn = bfq_allow_rq_merge, + .elevator_allow_merge_fn = bfq_allow_merge, .elevator_dispatch_fn = bfq_dispatch_requests, .elevator_add_req_fn = bfq_insert_request, .elevator_activate_req_fn = bfq_activate_request, @@ -5208,28 +4351,18 @@ static struct elevator_type iosched_bfq = { .elevator_owner = THIS_MODULE, }; -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfq_blkg_files, - .legacy_cftypes = bfq_blkcg_legacy_files, - - .cpd_alloc_fn = bfq_cpd_alloc, - .cpd_init_fn = bfq_cpd_init, - .cpd_bind_fn = bfq_cpd_init, - .cpd_free_fn = bfq_cpd_free, - - .pd_alloc_fn = bfq_pd_alloc, - .pd_init_fn = bfq_pd_init, - .pd_offline_fn = bfq_pd_offline, - .pd_free_fn = bfq_pd_free, - .pd_reset_stats_fn = bfq_pd_reset_stats, -}; -#endif - static int __init bfq_init(void) { int ret; - char msg[60] = "BFQ I/O-scheduler: v8r7"; + + /* + * Can be 0 on HZ < 1000 setups. + */ + if (bfq_slice_idle == 0) + bfq_slice_idle = 1; + + if (bfq_timeout_async == 0) + bfq_timeout_async = 1; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); @@ -5242,46 +4375,27 @@ static int __init bfq_init(void) goto err_pol_unreg; /* - * Times to load large popular applications for the typical - * systems installed on the reference devices (see the - * comments before the definitions of the next two - * arrays). Actually, we use slightly slower values, as the - * estimated peak rate tends to be smaller than the actual - * peak rate. The reason for this last fact is that estimates - * are computed over much shorter time intervals than the long - * intervals typically used for benchmarking. Why? First, to - * adapt more quickly to variations. Second, because an I/O - * scheduler cannot rely on a peak-rate-evaluation workload to - * be run for a long time. 
+ * Times to load large popular applications for the typical systems + * installed on the reference devices (see the comments before the + * definitions of the two arrays). */ - T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ - T_slow[1] = msecs_to_jiffies(1000); /* actually 1.5 sec */ - T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ - T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ + T_slow[0] = msecs_to_jiffies(2600); + T_slow[1] = msecs_to_jiffies(1000); + T_fast[0] = msecs_to_jiffies(5500); + T_fast[1] = msecs_to_jiffies(2000); /* - * Thresholds that determine the switch between speed classes - * (see the comments before the definition of the array - * device_speed_thresh). These thresholds are biased towards - * transitions to the fast class. This is safer than the - * opposite bias. In fact, a wrong transition to the slow - * class results in short weight-raising periods, because the - * speed of the device then tends to be higher that the - * reference peak rate. On the opposite end, a wrong - * transition to the fast class tends to increase - * weight-raising periods, because of the opposite reason. + * Thresholds that determine the switch between speed classes (see + * the comments before the definition of the array). 
*/ - device_speed_thresh[0] = (4 * R_slow[0]) / 3; - device_speed_thresh[1] = (4 * R_slow[1]) / 3; + device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; + device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; ret = elv_register(&iosched_bfq); if (ret) goto err_pol_unreg; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - strcat(msg, " (with cgroups support)"); -#endif - pr_info("%s", msg); + pr_info("BFQ I/O-scheduler: v7r11"); return 0; diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 797bce75db01..a5ed6948471a 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -7,166 +7,28 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2015 Paolo Valente - * - * Copyright (C) 2016 Paolo Valente - */ - -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - -/** - * bfq_gt - compare two timestamps. - * @a: first ts. - * @b: second ts. - * - * Return @a > @b, dealing with wrapping correctly. - */ -static int bfq_gt(u64 a, u64 b) -{ - return (s64)(a - b) > 0; -} - -static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) -{ - struct rb_node *node = tree->rb_node; - - return rb_entry(node, struct bfq_entity, rb_node); -} - -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); - -static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - -/** - * bfq_update_next_in_service - update sd->next_in_service - * @sd: sched_data for which to perform the update. - * @new_entity: if not NULL, pointer to the entity whose activation, - * requeueing or repositionig triggered the invocation of - * this function. - * - * This function is called to update sd->next_in_service, which, in - * its turn, may change as a consequence of the insertion or - * extraction of an entity into/from one of the active trees of - * sd. 
These insertions/extractions occur as a consequence of - * activations/deactivations of entities, with some activations being - * 'true' activations, and other activations being requeueings (i.e., - * implementing the second, requeueing phase of the mechanism used to - * reposition an entity in its active tree; see comments on - * __bfq_activate_entity and __bfq_requeue_entity for details). In - * both the last two activation sub-cases, new_entity points to the - * just activated or requeued entity. - * - * Returns true if sd->next_in_service changes in such a way that - * entity->parent may become the next_in_service for its parent - * entity. + * Copyright (C) 2010 Paolo Valente */ -static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *new_entity) -{ - struct bfq_entity *next_in_service = sd->next_in_service; - struct bfq_queue *bfqq; - bool parent_sched_may_change = false; - - /* - * If this update is triggered by the activation, requeueing - * or repositiong of an entity that does not coincide with - * sd->next_in_service, then a full lookup in the active tree - * can be avoided. In fact, it is enough to check whether the - * just-modified entity has a higher priority than - * sd->next_in_service, or, even if it has the same priority - * as sd->next_in_service, is eligible and has a lower virtual - * finish time than sd->next_in_service. If this compound - * condition holds, then the new entity becomes the new - * next_in_service. Otherwise no change is needed. - */ - if (new_entity && new_entity != sd->next_in_service) { - /* - * Flag used to decide whether to replace - * sd->next_in_service with new_entity. Tentatively - * set to true, and left as true if - * sd->next_in_service is NULL. - */ - bool replace_next = true; - - /* - * If there is already a next_in_service candidate - * entity, then compare class priorities or timestamps - * to decide whether to replace sd->service_tree with - * new_entity. 
- */ - if (next_in_service) { - unsigned int new_entity_class_idx = - bfq_class_idx(new_entity); - struct bfq_service_tree *st = - sd->service_tree + new_entity_class_idx; - - /* - * For efficiency, evaluate the most likely - * sub-condition first. - */ - replace_next = - (new_entity_class_idx == - bfq_class_idx(next_in_service) - && - !bfq_gt(new_entity->start, st->vtime) - && - bfq_gt(next_in_service->finish, - new_entity->finish)) - || - new_entity_class_idx < - bfq_class_idx(next_in_service); - } - - if (replace_next) - next_in_service = new_entity; - } else /* invoked because of a deactivation: lookup needed */ - next_in_service = bfq_lookup_next_entity(sd); - - if (next_in_service) { - parent_sched_may_change = !sd->next_in_service || - bfq_update_parent_budget(next_in_service); - } - - sd->next_in_service = next_in_service; - - if (!next_in_service) - return parent_sched_may_change; - - bfqq = bfq_entity_to_bfqq(next_in_service); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_next_in_service: chosen this queue"); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(next_in_service, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "update_next_in_service: chosen this entity"); - } -#endif - return parent_sched_may_change; -} #ifdef CONFIG_BFQ_GROUP_IOSCHED -/* both next loops stop at one of the child entities of the root group */ -#define for_each_entity(entity) \ +#define for_each_entity(entity) \ for (; entity ; entity = entity->parent) #define for_each_entity_safe(entity, parent) \ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -/* - * Returns true if this budget changes may let next_in_service->parent - * become the next_in_service entity for its parent entity. 
- */ -static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd); + +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + +static void bfq_update_budget(struct bfq_entity *next_in_service) { struct bfq_entity *bfqg_entity; struct bfq_group *bfqg; struct bfq_sched_data *group_sd; - bool ret = false; BUG_ON(!next_in_service); @@ -179,68 +41,60 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) * as it must never become an in-service entity. */ bfqg_entity = bfqg->my_entity; - if (bfqg_entity) { - if (bfqg_entity->budget > next_in_service->budget) - ret = true; + if (bfqg_entity) bfqg_entity->budget = next_in_service->budget; - } - - return ret; } -/* - * This function tells whether entity stops being a candidate for next - * service, according to the following logic. - * - * This function is invoked for an entity that is about to be set in - * service. If such an entity is a queue, then the entity is no longer - * a candidate for next service (i.e, a candidate entity to serve - * after the in-service entity is expired). The function then returns - * true. - * - * In contrast, the entity could stil be a candidate for next service - * if it is not a queue, and has more than one child. In fact, even if - * one of its children is about to be set in service, other children - * may still be the next to serve. As a consequence, a non-queue - * entity is not a candidate for next-service only if it has only one - * child. And only if this condition holds, then the function returns - * true for a non-queue entity. 
- */ -static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) +static int bfq_update_next_in_service(struct bfq_sched_data *sd) { - struct bfq_group *bfqg; + struct bfq_entity *next_in_service; - if (bfq_entity_to_bfqq(entity)) - return true; + if (sd->in_service_entity) + /* will update/requeue at the end of service */ + return 0; - bfqg = container_of(entity, struct bfq_group, entity); + /* + * NOTE: this can be improved in many ways, such as returning + * 1 (and thus propagating upwards the update) only when the + * budget changes, or caching the bfqq that will be scheduled + * next from this subtree. By now we worry more about + * correctness than about performance... + */ + next_in_service = bfq_lookup_next_entity(sd, 0, NULL); + sd->next_in_service = next_in_service; - BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); - BUG_ON(bfqg->active_entities == 0); - if (bfqg->active_entities == 1) - return true; + if (next_in_service) + bfq_update_budget(next_in_service); - return false; + return 1; } -#else /* CONFIG_BFQ_GROUP_IOSCHED */ +static void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ + BUG_ON(sd->next_in_service != entity); +} +#else #define for_each_entity(entity) \ for (; entity ; entity = NULL) #define for_each_entity_safe(entity, parent) \ for (parent = NULL; entity ; entity = parent) -static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) +static int bfq_update_next_in_service(struct bfq_sched_data *sd) { - return false; + return 0; } -static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) +static void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) { - return true; } -#endif /* CONFIG_BFQ_GROUP_IOSCHED */ +static void bfq_update_budget(struct bfq_entity *next_in_service) +{ +} +#endif /* * Shift for timestamp calculations. 
This actually limits the maximum @@ -251,6 +105,18 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) */ #define WFQ_SERVICE_SHIFT 22 +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) { struct bfq_queue *bfqq = NULL; @@ -285,36 +151,20 @@ static u64 bfq_delta(unsigned long service, unsigned long weight) static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned long long start, finish, delta; BUG_ON(entity->weight == 0); entity->finish = entity->start + bfq_delta(service, entity->weight); - start = ((entity->start>>10)*1000)>>12; - finish = ((entity->finish>>10)*1000)>>12; - delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; - if (bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: serv %lu, w %d", service, entity->weight); bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: start %llu, finish %llu, delta %llu", - start, finish, delta); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "calc_finish group: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "calc_finish group: start %llu, finish %llu, delta %llu", - start, finish, delta); -#endif + entity->start, entity->finish, + bfq_delta(service, entity->weight)); } } @@ -443,26 +293,10 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) static void bfq_update_active_node(struct rb_node *node) { struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->min_start = entity->start; bfq_update_min(entity, 
node->rb_right); bfq_update_min(entity, node->rb_left); - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_active_node: new min_start %llu", - ((entity->min_start>>10)*1000)>>12); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "update_active_node: new min_start %llu", - ((entity->min_start>>10)*1000)>>12); -#endif - } } /** @@ -552,6 +386,8 @@ static void bfq_active_insert(struct bfq_service_tree *st, BUG_ON(!bfqg); BUG_ON(!bfqd); bfqg->active_entities++; + if (bfqg->active_entities == 2) + bfqd->active_numerous_groups++; } #endif } @@ -563,7 +399,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, static unsigned short bfq_ioprio_to_weight(int ioprio) { BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; + return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; } /** @@ -586,9 +422,9 @@ static void bfq_get_entity(struct bfq_entity *entity) struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); if (bfqq) { - bfqq->ref++; + atomic_inc(&bfqq->ref); bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", - bfqq, bfqq->ref); + bfqq, atomic_read(&bfqq->ref)); } } @@ -663,6 +499,10 @@ static void bfq_active_extract(struct bfq_service_tree *st, BUG_ON(!bfqd); BUG_ON(!bfqg->active_entities); bfqg->active_entities--; + if (bfqg->active_entities == 1) { + BUG_ON(!bfqd->active_numerous_groups); + bfqd->active_numerous_groups--; + } } #endif } @@ -707,12 +547,12 @@ static void bfq_forget_entity(struct bfq_service_tree *st, BUG_ON(!entity->on_st); - entity->on_st = false; + entity->on_st = 0; st->wsum -= entity->weight; if (bfqq) { sd = entity->sched_data; bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", - bfqq, bfqq->ref); + bfqq, atomic_read(&bfqq->ref)); bfq_put_queue(bfqq); } } @@ -762,7 +602,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if 
(entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int prev_weight, new_weight; + unsigned short prev_weight, new_weight; struct bfq_data *bfqd = NULL; struct rb_root *root; #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -790,10 +630,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, entity->new_weight > BFQ_MAX_WEIGHT) { pr_crit("update_weight_prio: new_weight %d\n", entity->new_weight); - if (entity->new_weight < BFQ_MIN_WEIGHT) - entity->new_weight = BFQ_MIN_WEIGHT; - else - entity->new_weight = BFQ_MAX_WEIGHT; + BUG(); } entity->orig_weight = entity->new_weight; if (bfqq) @@ -824,13 +661,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * associated with its new weight. */ if (prev_weight != new_weight) { - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "weight changed %d %d(%d %d)", - prev_weight, new_weight, - entity->orig_weight, - bfqq->wr_coeff); - root = bfqq ? &bfqd->queue_weights_tree : &bfqd->group_weights_tree; bfq_weights_tree_remove(bfqd, entity, root); @@ -877,7 +707,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st = bfq_entity_service_tree(entity); entity->service += served; - + BUG_ON(entity->service > entity->budget); BUG_ON(st->wsum == 0); st->vtime += bfq_delta(served, st->wsum); @@ -886,419 +716,170 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) #ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); #endif - st = bfq_entity_service_tree(&bfqq->entity); - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", - served, ((st->vtime>>10)*1000)>>12, st); + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); } /** - * bfq_bfqq_charge_time - charge an amount of service equivalent to the length - * of the time interval during which bfqq has been in - * service. - * @bfqd: the device + * bfq_bfqq_charge_full_budget - set the service to the entity budget. 
* @bfqq: the queue that needs a service update. - * @time_ms: the amount of time during which the queue has received service - * - * If a queue does not consume its budget fast enough, then providing - * the queue with service fairness may impair throughput, more or less - * severely. For this reason, queues that consume their budget slowly - * are provided with time fairness instead of service fairness. This - * goal is achieved through the BFQ scheduling engine, even if such an - * engine works in the service, and not in the time domain. The trick - * is charging these queues with an inflated amount of service, equal - * to the amount of service that they would have received during their - * service slot if they had been fast, i.e., if their requests had - * been dispatched at a rate equal to the estimated peak rate. * - * It is worth noting that time fairness can cause important - * distortions in terms of bandwidth distribution, on devices with - * internal queueing. The reason is that I/O requests dispatched - * during the service slot of a queue may be served after that service - * slot is finished, and may have a total processing time loosely - * correlated with the duration of the service slot. This is - * especially true for short service slots. + * When it's not possible to be fair in the service domain, because + * a queue is not consuming its budget fast enough (the meaning of + * fast depends on the timeout parameter), we charge it a full + * budget. In this way we should obtain a sort of time-domain + * fairness among all the seeky/slow queues. 
*/ -static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, - unsigned long time_ms) +static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; - int tot_serv_to_charge = entity->service; - unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); - - if (time_ms > 0 && time_ms < timeout_ms) - tot_serv_to_charge = - (bfqd->bfq_max_budget * time_ms) / timeout_ms; - if (tot_serv_to_charge < entity->service) - tot_serv_to_charge = entity->service; + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); - bfq_log_bfqq(bfqq->bfqd, bfqq, - "charge_time: %lu/%u ms, %d/%d/%d sectors", - time_ms, timeout_ms, entity->service, - tot_serv_to_charge, entity->budget); - - /* Increase budget to avoid inconsistencies */ - if (tot_serv_to_charge > entity->budget) - entity->budget = tot_serv_to_charge; - - bfq_bfqq_served(bfqq, - max_t(int, 0, tot_serv_to_charge - entity->service)); -} - -static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - struct bfq_service_tree *st, - bool backshifted) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct bfq_sched_data *sd = entity->sched_data; - - st = __bfq_entity_update_weight_prio(st, entity); - bfq_calc_finish(entity, entity->budget); - - /* - * If some queues enjoy backshifting for a while, then their - * (virtual) finish timestamps may happen to become lower and - * lower than the system virtual time. In particular, if - * these queues often happen to be idle for short time - * periods, and during such time periods other queues with - * higher timestamps happen to be busy, then the backshifted - * timestamps of the former queues can become much lower than - * the system virtual time. In fact, to serve the queues with - * higher timestamps while the ones with lower timestamps are - * idle, the system virtual time may be pushed-up to much - * higher values than the finish timestamps of the idle - * queues. 
As a consequence, the finish timestamps of all new - * or newly activated queues may end up being much larger than - * those of lucky queues with backshifted timestamps. The - * latter queues may then monopolize the device for a lot of - * time. This would simply break service guarantees. - * - * To reduce this problem, push up a little bit the - * backshifted timestamps of the queue associated with this - * entity (only a queue can happen to have the backshifted - * flag set): just enough to let the finish timestamp of the - * queue be equal to the current value of the system virtual - * time. This may introduce a little unfairness among queues - * with backshifted timestamps, but it does not break - * worst-case fairness guarantees. - * - * As a special case, if bfqq is weight-raised, push up - * timestamps much less, to keep very low the probability that - * this push up causes the backshifted finish timestamps of - * weight-raised queues to become higher than the backshifted - * finish timestamps of non weight-raised queues. - */ - if (backshifted && bfq_gt(st->vtime, entity->finish)) { - unsigned long delta = st->vtime - entity->finish; - - if (bfqq) - delta /= bfqq->wr_coeff; - - entity->start += delta; - entity->finish += delta; - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: new queue finish %llu", - ((entity->finish>>10)*1000)>>12); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "__activate_entity: new group finish %llu", - ((entity->finish>>10)*1000)>>12); -#endif - } - } - - bfq_active_insert(st, entity); - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: queue %seligible in st %p", - entity->start <= st->vtime ? 
"" : "non ", st); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "__activate_entity: group %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); -#endif - } - BUG_ON(RB_EMPTY_ROOT(&st->active)); - BUG_ON(&st->active != &sd->service_tree->active && - &st->active != &(sd->service_tree+1)->active && - &st->active != &(sd->service_tree+2)->active); + bfq_bfqq_served(bfqq, entity->budget - entity->service); } /** - * __bfq_activate_entity - handle activation of entity. + * __bfq_activate_entity - activate an entity. * @entity: the entity being activated. - * @non_blocking_wait_rq: true if entity was waiting for a request - * - * Called for a 'true' activation, i.e., if entity is not active and - * one of its children receives a new request. * - * Basically, this function updates the timestamps of entity and - * inserts entity into its active tree, ater possible extracting it - * from its idle tree. + * Called whenever an entity is activated, i.e., it is not active and one + * of its children receives a new request, or has to be reactivated due to + * budget exhaustion. It uses the current budget of the entity (and the + * service received if @entity is active) of the queue to calculate its + * timestamps. 
*/ -static void __bfq_activate_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq) +static void __bfq_activate_entity(struct bfq_entity *entity) { struct bfq_sched_data *sd = entity->sched_data; struct bfq_service_tree *st = bfq_entity_service_tree(entity); - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - bool backshifted = false; - unsigned long long min_vstart; - BUG_ON(!sd); - BUG_ON(!st); - - /* See comments on bfq_fqq_update_budg_for_activation */ - if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { - backshifted = true; - min_vstart = entity->finish; - } else - min_vstart = st->vtime; - - if (entity->tree == &st->idle) { + if (entity == sd->in_service_entity) { + BUG_ON(entity->tree); + /* + * If we are requeueing the current entity we have + * to take care of not charging to it service it has + * not received. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some + * next_in_service entity below it. We reuse the + * old start time. + */ + bfq_active_extract(st, entity); + } else if (entity->tree == &st->idle) { /* * Must be on the idle tree, bfq_idle_extract() will * check for that. */ bfq_idle_extract(st, entity); - entity->start = bfq_gt(min_vstart, entity->finish) ? - min_vstart : entity->finish; + entity->start = bfq_gt(st->vtime, entity->finish) ? + st->vtime : entity->finish; } else { /* * The finish time of the entity may be invalid, and * it is in the past for sure, otherwise the queue * would have been on the idle tree. 
*/ - entity->start = min_vstart; + entity->start = st->vtime; st->wsum += entity->weight; bfq_get_entity(entity); - BUG_ON(entity->on_st && bfqq); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (entity->on_st && !bfqq) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, - entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, - bfqg, - "activate bug, class %d in_service %p", - bfq_class_idx(entity), sd->in_service_entity); - } -#endif - BUG_ON(entity->on_st && !bfqq); - entity->on_st = true; + BUG_ON(entity->on_st); + entity->on_st = 1; } - bfq_update_fin_time_enqueue(entity, st, backshifted); -} - -/** - * __bfq_requeue_entity - handle requeueing or repositioning of an entity. - * @entity: the entity being requeued or repositioned. - * - * Requeueing is needed if this entity stops being served, which - * happens if a leaf descendant entity has expired. On the other hand, - * repositioning is needed if the next_inservice_entity for the child - * entity has changed. See the comments inside the function for - * details. - * - * Basically, this function: 1) removes entity from its active tree if - * present there, 2) updates the timestamps of entity and 3) inserts - * entity back into its active tree (in the new, right position for - * the new values of the timestamps). 
- */ -static void __bfq_requeue_entity(struct bfq_entity *entity) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - - BUG_ON(!sd); - BUG_ON(!st); - - BUG_ON(entity != sd->in_service_entity && - entity->tree != &st->active); - - if (entity == sd->in_service_entity) { - /* - * We are requeueing the current in-service entity, - * which may have to be done for one of the following - * reasons: - * - entity represents the in-service queue, and the - * in-service queue is being requeued after an - * expiration; - * - entity represents a group, and its budget has - * changed because one of its child entities has - * just been either activated or requeued for some - * reason; the timestamps of the entity need then to - * be updated, and the entity needs to be enqueued - * or repositioned accordingly. - * - * In particular, before requeueing, the start time of - * the entity must be moved forward to account for the - * service that the entity has received while in - * service. This is done by the next instructions. The - * finish time will then be updated according to this - * new value of the start time, and to the budget of - * the entity. - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; - BUG_ON(entity->tree && entity->tree != &st->active); - /* - * In addition, if the entity had more than one child - * when set in service, then was not extracted from - * the active tree. This implies that the position of - * the entity in the active tree may need to be - * changed now, because we have just updated the start - * time of the entity, and we will update its finish - * time in a moment (the requeueing is then, more - * precisely, a repositioning in this case). To - * implement this repositioning, we: 1) dequeue the - * entity here, 2) update the finish time and - * requeue the entity according to the new - * timestamps below. 
- */ - if (entity->tree) - bfq_active_extract(st, entity); - } else { /* The entity is already active, and not in service */ - /* - * In this case, this function gets called only if the - * next_in_service entity below this entity has - * changed, and this change has caused the budget of - * this entity to change, which, finally implies that - * the finish time of this entity must be - * updated. Such an update may cause the scheduling, - * i.e., the position in the active tree, of this - * entity to change. We handle this change by: 1) - * dequeueing the entity here, 2) updating the finish - * time and requeueing the entity according to the new - * timestamps below. This is the same approach as the - * non-extracted-entity sub-case above. - */ - bfq_active_extract(st, entity); - } - - bfq_update_fin_time_enqueue(entity, st, false); -} - -static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - struct bfq_sched_data *sd, - bool non_blocking_wait_rq) -{ - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - - if (sd->in_service_entity == entity || entity->tree == &st->active) - /* - * in service or already queued on the active tree, - * requeue or reposition - */ - __bfq_requeue_entity(entity); - else - /* - * Not in service and not queued on its active tree: - * the activity is idle and this is a true activation. - */ - __bfq_activate_entity(entity, non_blocking_wait_rq); + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); + bfq_active_insert(st, entity); } - /** - * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, - * and activate, requeue or reposition all ancestors - * for which such an update becomes necessary. + * bfq_activate_entity - activate an entity and its ancestors if necessary. * @entity: the entity to activate. 
- * @non_blocking_wait_rq: true if this entity was waiting for a request - * @requeue: true if this is a requeue, which implies that bfqq is - * being expired; thus ALL its ancestors stop being served and must - * therefore be requeued + * + * Activate @entity and all the entities on the path from it to the root. */ -static void bfq_activate_requeue_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq, - bool requeue) +static void bfq_activate_entity(struct bfq_entity *entity) { struct bfq_sched_data *sd; for_each_entity(entity) { - BUG_ON(!entity); - sd = entity->sched_data; - __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); - - BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && - RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && - RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); + __bfq_activate_entity(entity); - if (!bfq_update_next_in_service(sd, entity) && !requeue) { - BUG_ON(!sd->next_in_service); + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when + * the in-service entity is rescheduled. + */ break; - } - BUG_ON(!sd->next_in_service); } } /** * __bfq_deactivate_entity - deactivate an entity from its service tree. * @entity: the entity to deactivate. - * @ins_into_idle_tree: if false, the entity will not be put into the - * idle tree. + * @requeue: if false, the entity will not be put into the idle tree. + * + * Deactivate an entity, independently from its previous state. If the + * entity was not on a service tree just return, otherwise if it is on + * any scheduler tree, extract it from that tree, and if necessary + * and if the caller did not specify @requeue, put it on the idle tree. * - * Deactivates an entity, independently from its previous state. Must - * be invoked only if entity is on a service tree. Extracts the entity - * from that tree, and if necessary and allowed, puts it on the idle - * tree. 
+ * Return %1 if the caller should update the entity hierarchy, i.e., + * if the entity was in service or if it was the next_in_service for + * its sched_data; return %0 otherwise. */ -static bool __bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree) +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) { struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - bool was_in_service = entity == sd->in_service_entity; + struct bfq_service_tree *st; + int was_in_service; + int ret = 0; - if (!entity->on_st) { /* entity never activated, or already inactive */ - BUG_ON(entity == entity->sched_data->in_service_entity); - return false; - } + if (sd == NULL || !entity->on_st) /* never activated, or inactive */ + return 0; - BUG_ON(was_in_service && entity->tree && entity->tree != &st->active); + st = bfq_entity_service_tree(entity); + was_in_service = entity == sd->in_service_entity; - if (was_in_service) - bfq_calc_finish(entity, entity->service); + BUG_ON(was_in_service && entity->tree); - if (entity->tree == &st->active) + if (was_in_service) { + bfq_calc_finish(entity, entity->service); + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) bfq_active_extract(st, entity); - else if (!was_in_service && entity->tree == &st->idle) + else if (entity->tree == &st->idle) bfq_idle_extract(st, entity); else if (entity->tree) BUG(); - if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) + if (was_in_service || sd->next_in_service == entity) + ret = bfq_update_next_in_service(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) bfq_forget_entity(st, entity); else bfq_idle_insert(st, entity); - return true; + BUG_ON(sd->in_service_entity == entity); + BUG_ON(sd->next_in_service == entity); + + return ret; } /** - * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. + * bfq_deactivate_entity - deactivate an entity. 
* @entity: the entity to deactivate. - * @ins_into_idle_tree: true if the entity can be put on the idle tree + * @requeue: true if the entity can be put on the idle tree */ -static void bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree, - bool expiration) +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) { struct bfq_sched_data *sd; struct bfq_entity *parent; @@ -1306,154 +887,63 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, for_each_entity_safe(entity, parent) { sd = entity->sched_data; - BUG_ON(sd == NULL); /* - * It would mean that this is the - * root group. - */ - - BUG_ON(expiration && entity != sd->in_service_entity); - - BUG_ON(entity != sd->in_service_entity && - entity->tree == - &bfq_entity_service_tree(entity)->active && - !sd->next_in_service); - - if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { - /* - * Entity is not any tree any more, so, this - * deactivation is a no-op, and there is - * nothing to change for upper-level entities - * (in case of expiration, this can never - * happen). - */ - BUG_ON(expiration); /* - * entity cannot be already out of - * any tree - */ - return; - } - - if (sd->next_in_service == entity) + if (!__bfq_deactivate_entity(entity, requeue)) /* - * entity was the next_in_service entity, - * then, since entity has just been - * deactivated, a new one must be found. + * The parent entity is still backlogged, and + * we don't need to update it as it is still + * in service. */ - bfq_update_next_in_service(sd, NULL); + break; - if (sd->next_in_service) { + if (sd->next_in_service) /* - * The parent entity is still backlogged, - * because next_in_service is not NULL. So, no - * further upwards deactivation must be - * performed. Yet, next_in_service has - * changed. Then the schedule does need to be - * updated upwards. + * The parent entity is still backlogged and + * the budgets on the path towards the root + * need to be updated. 
*/ - BUG_ON(sd->next_in_service == entity); - break; - } - - /* - * If we get here, then the parent is no more - * backlogged and we need to propagate the - * deactivation upwards. Thus let the loop go on. - */ + goto update; /* - * Also let parent be queued into the idle tree on - * deactivation, to preserve service guarantees, and - * assuming that who invoked this function does not - * need parent entities too to be removed completely. + * If we reach there the parent is no more backlogged and + * we want to propagate the dequeue upwards. */ - ins_into_idle_tree = true; + requeue = 1; } - /* - * If the deactivation loop is fully executed, then there are - * no more entities to touch and next loop is not executed at - * all. Otherwise, requeue remaining entities if they are - * about to stop receiving service, or reposition them if this - * is not the case. - */ + return; + +update: entity = parent; for_each_entity(entity) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - /* - * Invoke __bfq_requeue_entity on entity, even if - * already active, to requeue/reposition it in the - * active tree (because sd->next_in_service has - * changed) - */ - __bfq_requeue_entity(entity); + __bfq_activate_entity(entity); sd = entity->sched_data; - BUG_ON(expiration && sd->in_service_entity != entity); - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "invoking udpdate_next for this queue"); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "invoking udpdate_next for this entity"); - } -#endif - if (!bfq_update_next_in_service(sd, entity) && - !expiration) - /* - * next_in_service unchanged or not causing - * any change in entity->parent->sd, and no - * requeueing needed for expiration: stop - * here. 
- */ + if (!bfq_update_next_in_service(sd)) break; } } /** - * bfq_calc_vtime_jump - compute the value to which the vtime should jump, - * if needed, to have at least one entity eligible. + * bfq_update_vtime - update vtime if necessary. * @st: the service tree to act upon. * - * Assumes that st is not empty. + * If necessary update the service tree vtime to have at least one + * eligible entity, skipping to its start time. Assumes that the + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, + * we may end up with reactivated processes getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. */ -static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) +static void bfq_update_vtime(struct bfq_service_tree *st) { - struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); - - if (bfq_gt(root_entity->min_start, st->vtime)) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_vtime_jump: new value %llu", - root_entity->min_start); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(root_entity, struct bfq_group, - entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "calc_vtime_jump: new value %llu", - root_entity->min_start); - } -#endif - return root_entity->min_start; - } - return st->vtime; -} + struct bfq_entity *entry; + struct rb_node *node = st->active.rb_node; -static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) -{ - if (new_value > st->vtime) { - st->vtime = new_value; + entry = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entry->min_start, st->vtime)) { + st->vtime = entry->min_start; bfq_forget_idle(st); } } @@ -1462,7 +952,6 @@ static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) * bfq_first_active_entity - find the eligible entity with * the smallest 
finish time * @st: the service tree to select from. - * @vtime: the system virtual to use as a reference for eligibility * * This function searches the first schedulable entity, starting from the * root of the tree and going on the left every time on this side there is @@ -1470,8 +959,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) * the right is followed only if a) the left subtree contains no eligible * entities and b) no eligible entity has been found yet. */ -static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, - u64 vtime) +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) { struct bfq_entity *entry, *first = NULL; struct rb_node *node = st->active.rb_node; @@ -1479,15 +967,15 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, while (node) { entry = rb_entry(node, struct bfq_entity, rb_node); left: - if (!bfq_gt(entry->start, vtime)) + if (!bfq_gt(entry->start, st->vtime)) first = entry; - BUG_ON(bfq_gt(entry->min_start, vtime)); + BUG_ON(bfq_gt(entry->min_start, st->vtime)); if (node->rb_left) { entry = rb_entry(node->rb_left, struct bfq_entity, rb_node); - if (!bfq_gt(entry->min_start, vtime)) { + if (!bfq_gt(entry->min_start, st->vtime)) { node = node->rb_left; goto left; } @@ -1505,84 +993,31 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, * __bfq_lookup_next_entity - return the first eligible entity in @st. * @st: the service tree. * - * If there is no in-service entity for the sched_data st belongs to, - * then return the entity that will be set in service if: - * 1) the parent entity this st belongs to is set in service; - * 2) no entity belonging to such parent entity undergoes a state change - * that would influence the timestamps of the entity (e.g., becomes idle, - * becomes backlogged, changes its budget, ...). 
- * - * In this first case, update the virtual time in @st too (see the - * comments on this update inside the function). - * - * In constrast, if there is an in-service entity, then return the - * entity that would be set in service if not only the above - * conditions, but also the next one held true: the currently - * in-service entity, on expiration, - * 1) gets a finish time equal to the current one, or - * 2) is not eligible any more, or - * 3) is idle. + * Update the virtual time in @st and return the first eligible entity + * it contains. */ -static struct bfq_entity * -__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service -#if 0 - , bool force -#endif - ) +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + bool force) { - struct bfq_entity *entity -#if 0 - , *new_next_in_service = NULL -#endif - ; - u64 new_vtime; - struct bfq_queue *bfqq; + struct bfq_entity *entity, *new_next_in_service = NULL; if (RB_EMPTY_ROOT(&st->active)) return NULL; - /* - * Get the value of the system virtual time for which at - * least one entity is eligible. - */ - new_vtime = bfq_calc_vtime_jump(st); + bfq_update_vtime(st); + entity = bfq_first_active_entity(st); + BUG_ON(bfq_gt(entity->start, st->vtime)); /* - * If there is no in-service entity for the sched_data this - * active tree belongs to, then push the system virtual time - * up to the value that guarantees that at least one entity is - * eligible. If, instead, there is an in-service entity, then - * do not make any such update, because there is already an - * eligible entity, namely the in-service one (even if the - * entity is not on st, because it was extracted when set in - * service). + * If the chosen entity does not match with the sched_data's + * next_in_service and we are forcedly serving the IDLE priority + * class tree, bubble up budget update. 
*/ - if (!in_service) - bfq_update_vtime(st, new_vtime); - - entity = bfq_first_active_entity(st, new_vtime); - BUG_ON(bfq_gt(entity->start, new_vtime)); - - /* Log some information */ - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__lookup_next: start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "__lookup_next: start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); + if (unlikely(force && entity != entity->sched_data->next_in_service)) { + new_next_in_service = entity; + for_each_entity(new_next_in_service) + bfq_update_budget(new_next_in_service); } -#endif - - BUG_ON(!entity); return entity; } @@ -1590,81 +1025,50 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service /** * bfq_lookup_next_entity - return the first eligible entity in @sd. * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. * - * This function is invoked when there has been a change in the trees - * for sd, and we need know what is the new next entity after this - * change. + * NOTE: since we cache the next_in_service entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with + * absolutely no effort just returning the cached next_in_service value; + * we prefer to do full lookups to test the consistency of * the data + * structures. 
*/ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd) { struct bfq_service_tree *st = sd->service_tree; - struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); - struct bfq_entity *entity = NULL; - struct bfq_queue *bfqq; - int class_idx = 0; + struct bfq_entity *entity; + int i = 0; - BUG_ON(!sd); - BUG_ON(!st); - /* - * Choose from idle class, if needed to guarantee a minimum - * bandwidth to this class (and if there is some active entity - * in idle class). This should also mitigate - * priority-inversion problems in case a low priority task is - * holding file system resources. - */ - if (time_is_before_jiffies(sd->bfq_class_idle_last_service + - BFQ_CL_IDLE_TIMEOUT)) { - if (!RB_EMPTY_ROOT(&idle_class_st->active)) - class_idx = BFQ_IOPRIO_CLASSES - 1; - /* About to be served if backlogged, or not yet backlogged */ - sd->bfq_class_idle_last_service = jiffies; - } - - /* - * Find the next entity to serve for the highest-priority - * class, unless the idle class needs to be served. 
- */ - for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { - entity = __bfq_lookup_next_entity(st + class_idx, - sd->in_service_entity); + BUG_ON(sd->in_service_entity); - if (entity) - break; + if (bfqd && + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, + true); + if (entity) { + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; + sd->next_in_service = entity; + } } - - BUG_ON(!entity && - (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || - !RB_EMPTY_ROOT(&(st+2)->active))); - - if (!entity) - return NULL; - - /* Log some information */ - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", - st + class_idx, class_idx); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "chosen from st %p %d", - st + class_idx, class_idx); + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity) { + if (extract) { + bfq_check_next_in_service(sd, entity); + bfq_active_extract(st + i, entity); + sd->in_service_entity = entity; + sd->next_in_service = NULL; + } + break; + } } -#endif return entity; } -static bool next_queue_may_preempt(struct bfq_data *bfqd) -{ - struct bfq_sched_data *sd = &bfqd->root_group->sched_data; - - return sd->next_in_service != sd->in_service_entity; -} - /* * Get next queue for service. */ @@ -1679,208 +1083,58 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) if (bfqd->busy_queues == 0) return NULL; - /* - * Traverse the path from the root to the leaf entity to - * serve. Set in service all the entities visited along the - * way. 
- */ sd = &bfqd->root_group->sched_data; for (; sd ; sd = entity->my_sched_data) { -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (entity) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, - "get_next_queue: lookup in this group"); - if (!sd->next_in_service) - pr_crit("get_next_queue: lookup in this group"); - } else { - bfq_log_bfqg(bfqd, bfqd->root_group, - "get_next_queue: lookup in root group"); - if (!sd->next_in_service) - pr_crit("get_next_queue: lookup in root group"); - } -#endif - - BUG_ON(!sd->next_in_service); - - /* - * WARNING. We are about to set the in-service entity - * to sd->next_in_service, i.e., to the (cached) value - * returned by bfq_lookup_next_entity(sd) the last - * time it was invoked, i.e., the last time when the - * service order in sd changed as a consequence of the - * activation or deactivation of an entity. In this - * respect, if we execute bfq_lookup_next_entity(sd) - * in this very moment, it may, although with low - * probability, yield a different entity than that - * pointed to by sd->next_in_service. This rare event - * happens in case there was no CLASS_IDLE entity to - * serve for sd when bfq_lookup_next_entity(sd) was - * invoked for the last time, while there is now one - * such entity. - * - * If the above event happens, then the scheduling of - * such entity in CLASS_IDLE is postponed until the - * service of the sd->next_in_service entity - * finishes. In fact, when the latter is expired, - * bfq_lookup_next_entity(sd) gets called again, - * exactly to update sd->next_in_service. - */ - - /* Make next_in_service entity become in_service_entity */ - entity = sd->next_in_service; - sd->in_service_entity = entity; - - /* - * Reset the accumulator of the amount of service that - * the entity is about to receive. 
- */ + entity = bfq_lookup_next_entity(sd, 1, bfqd); + BUG_ON(!entity); entity->service = 0; - - /* - * If entity is no longer a candidate for next - * service, then we extract it from its active tree, - * for the following reason. To further boost the - * throughput in some special case, BFQ needs to know - * which is the next candidate entity to serve, while - * there is already an entity in service. In this - * respect, to make it easy to compute/update the next - * candidate entity to serve after the current - * candidate has been set in service, there is a case - * where it is necessary to extract the current - * candidate from its service tree. Such a case is - * when the entity just set in service cannot be also - * a candidate for next service. Details about when - * this conditions holds are reported in the comments - * on the function bfq_no_longer_next_in_service() - * invoked below. - */ - if (bfq_no_longer_next_in_service(entity)) - bfq_active_extract(bfq_entity_service_tree(entity), - entity); - - /* - * For the same reason why we may have just extracted - * entity from its active tree, we may need to update - * next_in_service for the sched_data of entity too, - * regardless of whether entity has been extracted. - * In fact, even if entity has not been extracted, a - * descendant entity may get extracted. Such an event - * would cause a change in next_in_service for the - * level of the descendant entity, and thus possibly - * back to upper levels. - * - * We cannot perform the resulting needed update - * before the end of this loop, because, to know which - * is the correct next-to-serve candidate entity for - * each level, we need first to find the leaf entity - * to set in service. In fact, only after we know - * which is the next-to-serve leaf entity, we can - * discover whether the parent entity of the leaf - * entity becomes the next-to-serve, and so on. 
- */ - - /* Log some information */ - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, - "get_next_queue: this queue, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, - "get_next_queue: this entity, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); - } -#endif - } - BUG_ON(!entity); bfqq = bfq_entity_to_bfqq(entity); BUG_ON(!bfqq); - /* - * We can finally update all next-to-serve entities along the - * path from the leaf entity just set in service to the root. - */ - for_each_entity(entity) { - struct bfq_sched_data *sd = entity->sched_data; - - if(!bfq_update_next_in_service(sd, NULL)) - break; - } - return bfqq; } static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) { - struct bfq_entity *entity = &bfqd->in_service_queue->entity; - if (bfqd->in_service_bic) { put_io_context(bfqd->in_service_bic->icq.ioc); bfqd->in_service_bic = NULL; } - bfq_clear_bfqq_wait_request(bfqd->in_service_queue); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqd->in_service_queue = NULL; - - /* - * When this function is called, all in-service entities have - * been properly deactivated or requeued, so we can safely - * execute the final step: reset in_service_entity along the - * path from entity to the root. 
- */ - for_each_entity(entity) - entity->sched_data->in_service_entity = NULL; + del_timer(&bfqd->idle_slice_timer); } static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool ins_into_idle_tree, bool expiration) + int requeue) { struct bfq_entity *entity = &bfqq->entity; - bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); -} - -static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); + if (bfqq == bfqd->in_service_queue) + __bfq_bfqd_reset_in_service(bfqd); - BUG_ON(bfqq == bfqd->in_service_queue); - BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && - entity->on_st); - - bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), - false); - bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + bfq_deactivate_entity(entity, requeue); } -static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; - bfq_activate_requeue_entity(entity, false, - bfqq == bfqd->in_service_queue); + bfq_activate_entity(entity); } +#ifdef CONFIG_BFQ_GROUP_IOSCHED static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); +#endif /* * Called when the bfqq no longer has requests pending, remove it from - * the service tree. As a special case, it can be invoked during an - * expiration. + * the service tree. 
*/ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration) + int requeue) { BUG_ON(!bfq_bfqq_busy(bfqq)); BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); @@ -1892,20 +1146,27 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, BUG_ON(bfqd->busy_queues == 0); bfqd->busy_queues--; - if (!bfqq->dispatched) + if (!bfqq->dispatched) { bfq_weights_tree_remove(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + } if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; +#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_dequeue(bfqq_group(bfqq)); +#endif - BUG_ON(bfqq->entity.budget < 0); - - bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); - - BUG_ON(bfqq->entity.budget < 0); + bfq_deactivate_bfqq(bfqd, bfqq, requeue); } /* @@ -1923,11 +1184,16 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_mark_bfqq_busy(bfqq); bfqd->busy_queues++; - if (!bfqq->dispatched) + if (!bfqq->dispatched) { if (bfqq->wr_coeff == 1) bfq_weights_tree_add(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - + if (!blk_queue_nonrot(bfqd->queue)) { + bfqd->busy_in_flight_queues++; + if (bfq_bfqq_constantly_seeky(bfqq)) + bfqd->const_seeky_busy_in_flight_queues++; + } + } if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; } diff --git a/block/bfq.h b/block/bfq.h index bef8244cc03f..fcce85528377 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ v8r7 for 4.9.0: data structures and common functions prototypes. + * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. 
* * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe @@ -7,9 +7,7 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2015 Paolo Valente - * - * Copyright (C) 2016 Paolo Valente + * Copyright (C) 2010 Paolo Valente */ #ifndef _BFQ_H @@ -30,21 +28,20 @@ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_WEIGHT_LEGACY_DFL 100 +#define BFQ_DEFAULT_GRP_WEIGHT 10 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -/* - * Soft real-time applications are extremely more latency sensitive - * than interactive ones. Over-raise the weight of the former to - * privilege them against the latter. - */ -#define BFQ_SOFTRT_WEIGHT_FACTOR 100 - struct bfq_entity; /** * struct bfq_service_tree - per ioprio_class service tree. + * @active: tree for active entities (i.e., those backlogged). + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). + * @first_idle: idle entity with minimum F_i. + * @last_idle: idle entity with maximum F_i. + * @vtime: scheduler virtual time. + * @wsum: scheduler weight sum; active and idle entities contribute to it. * * Each service tree represents a B-WF2Q+ scheduler on its own. Each * ioprio_class has its own independent scheduler, and so its own @@ -52,28 +49,27 @@ struct bfq_entity; * of the containing bfqd. */ struct bfq_service_tree { - /* tree for active entities (i.e., those backlogged) */ struct rb_root active; - /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ struct rb_root idle; - struct bfq_entity *first_idle; /* idle entity with minimum F_i */ - struct bfq_entity *last_idle; /* idle entity with maximum F_i */ + struct bfq_entity *first_idle; + struct bfq_entity *last_idle; - u64 vtime; /* scheduler virtual time */ - /* scheduler weight sum; active and idle entities contribute to it */ + u64 vtime; unsigned long wsum; }; /** * struct bfq_sched_data - multi-class scheduler. + * @in_service_entity: entity in service. 
+ * @next_in_service: head-of-the-line entity in the scheduler. + * @service_tree: array of service trees, one per ioprio_class. * * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as an - * intermediate queue on a hierarchical setup. @next_in_service - * points to the active entity of the sched_data service trees that - * will be scheduled next. It is used to reduce the number of steps - * needed for each hierarchical-schedule update. + * ioprio_classes, and can be used either as a toplevel queue or as + * an intermediate queue on a hierarchical setup. + * @next_in_service points to the active entity of the sched_data + * service trees that will be scheduled next. * * The supported ioprio_classes are the same as in CFQ, in descending * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. @@ -83,32 +79,48 @@ struct bfq_service_tree { * All the fields are protected by the queue lock of the containing bfqd. */ struct bfq_sched_data { - struct bfq_entity *in_service_entity; /* entity in service */ - /* head-of-the-line entity in the scheduler (see comments above) */ + struct bfq_entity *in_service_entity; struct bfq_entity *next_in_service; - /* array of service trees, one per ioprio_class */ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; - /* last time CLASS_IDLE was served */ - unsigned long bfq_class_idle_last_service; - }; /** * struct bfq_weight_counter - counter of the number of all active entities * with a given weight. + * @weight: weight of the entities that this counter refers to. + * @num_active: number of active entities with this weight. + * @weights_node: weights tree member (see bfq_data's @queue_weights_tree + * and @group_weights_tree). 
*/ struct bfq_weight_counter { - unsigned int weight; /* weight of the entities this counter refers to */ - unsigned int num_active; /* nr of active entities with this weight */ - /* - * Weights tree member (see bfq_data's @queue_weights_tree and - * @group_weights_tree) - */ + short int weight; + unsigned int num_active; struct rb_node weights_node; }; /** * struct bfq_entity - schedulable entity. + * @rb_node: service_tree member. + * @weight_counter: pointer to the weight counter associated with this entity. + * @on_st: flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + * @finish: B-WF2Q+ finish timestamp (aka F_i). + * @start: B-WF2Q+ start timestamp (aka S_i). + * @tree: tree the entity is enqueued into; %NULL if not on a tree. + * @min_start: minimum start time of the (active) subtree rooted at + * this entity; used for O(log N) lookups into active trees. + * @service: service received during the last round of service. + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. + * @weight: weight of the queue + * @parent: parent entity, for hierarchical scheduling. + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the + * associated scheduler queue, %NULL on leaf nodes. + * @sched_data: the scheduler queue this entity belongs to. + * @ioprio: the ioprio in use. + * @new_weight: when a weight change is requested, the new weight value. + * @orig_weight: original weight, used to implement weight boosting + * @prio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. * * A bfq_entity is used to represent either a bfq_queue (leaf node in the * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each @@ -135,52 +147,27 @@ struct bfq_weight_counter { * containing bfqd. 
*/ struct bfq_entity { - struct rb_node rb_node; /* service_tree member */ - /* pointer to the weight counter associated with this entity */ + struct rb_node rb_node; struct bfq_weight_counter *weight_counter; - /* - * Flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree) or is in service. - */ - bool on_st; + int on_st; - u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ - u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ + u64 finish; + u64 start; - /* tree the entity is enqueued into; %NULL if not on a tree */ struct rb_root *tree; - /* - * minimum start time of the (active) subtree rooted at this - * entity; used for O(log N) lookups into active trees - */ u64 min_start; - /* amount of service received during the last service slot */ - int service; - - /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ - int budget; - - unsigned int weight; /* weight of the queue */ - unsigned int new_weight; /* next weight if a change is in progress */ - - /* original weight, used to implement weight boosting */ - unsigned int orig_weight; + int service, budget; + unsigned short weight, new_weight; + unsigned short orig_weight; - /* parent entity, for hierarchical scheduling */ struct bfq_entity *parent; - /* - * For non-leaf nodes in the hierarchy, the associated - * scheduler queue, %NULL on leaf nodes. - */ struct bfq_sched_data *my_sched_data; - /* the scheduler queue this entity belongs to */ struct bfq_sched_data *sched_data; - /* flag, set to request a weight, ioprio or ioprio_class change */ int prio_changed; }; @@ -188,6 +175,56 @@ struct bfq_group; /** * struct bfq_queue - leaf schedulable entity. + * @ref: reference counter. + * @bfqd: parent bfq_data. + * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. 
+ * @new_bfqq: shared bfq_queue if queue is cooperating with + * one or more other queues. + * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). + * @sort_list: sorted list of pending requests. + * @next_rq: if fifo isn't expired, next request to serve. + * @queued: nr of requests queued in @sort_list. + * @allocated: currently allocated requests. + * @meta_pending: pending metadata requests. + * @fifo: fifo list of requests in sort_list. + * @entity: entity representing this queue in the scheduler. + * @max_budget: maximum budget allowed from the feedback mechanism. + * @budget_timeout: budget expiration (in jiffies). + * @dispatched: number of requests on the dispatch list or inside driver. + * @flags: status flags. + * @bfqq_list: node for active/idle bfqq list inside our bfqd. + * @burst_list_node: node for the device's burst list. + * @seek_samples: number of seeks sampled + * @seek_total: sum of the distances of the seeks sampled + * @seek_mean: mean seek distance + * @last_request_pos: position of the last request enqueued + * @requests_within_timer: number of consecutive pairs of request completion + * and arrival, such that the queue becomes idle + * after the completion, but the next request arrives + * within an idle time slice; used only if the queue's + * IO_bound has been cleared. + * @pid: pid of the process owning the queue, used for logging purposes. 
+ * @last_wr_start_finish: start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period + * @wr_cur_max_time: current max raising time for this queue + * @soft_rt_next_start: minimum time instant such that, only if a new + * request is enqueued after this time instant in an + * idle @bfq_queue with no outstanding requests, then + * the task associated with the queue it is deemed as + * soft real-time (see the comments to the function + * bfq_bfqq_softrt_next_start()) + * @last_idle_bklogged: time of the last transition of the @bfq_queue from + * idle to backlogged + * @service_from_backlogged: cumulative service received from the @bfq_queue + * since the last transition from idle to + * backlogged + * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the + * queue is shared * * A bfq_queue is a leaf request queue; it can be associated with an * io_context or more, if it is async or shared between cooperating @@ -198,175 +235,117 @@ struct bfq_group; * All the fields are protected by the queue lock of the containing bfqd. */ struct bfq_queue { - /* reference counter */ - int ref; - /* parent bfq_data */ + atomic_t ref; struct bfq_data *bfqd; - /* current ioprio and ioprio class */ - unsigned short ioprio, ioprio_class; - /* next ioprio and ioprio class if a change is in progress */ - unsigned short new_ioprio, new_ioprio_class; + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; - /* - * Shared bfq_queue if queue is cooperating with one or more - * other queues. 
- */ + /* fields for cooperating queues handling */ struct bfq_queue *new_bfqq; - /* request-position tree member (see bfq_group's @rq_pos_tree) */ struct rb_node pos_node; - /* request-position tree root (see bfq_group's @rq_pos_tree) */ struct rb_root *pos_root; - /* sorted list of pending requests */ struct rb_root sort_list; - /* if fifo isn't expired, next request to serve */ struct request *next_rq; - /* number of sync and async requests queued */ int queued[2]; - /* number of sync and async requests currently allocated */ int allocated[2]; - /* number of pending metadata requests */ int meta_pending; - /* fifo list of requests in sort_list */ struct list_head fifo; - /* entity representing this queue in the scheduler */ struct bfq_entity entity; - /* maximum budget allowed from the feedback mechanism */ int max_budget; - /* budget expiration (in jiffies) */ unsigned long budget_timeout; - /* number of requests on the dispatch list or inside driver */ int dispatched; - unsigned int flags; /* status flags.*/ + unsigned int flags; - /* node for active/idle bfqq list inside parent bfqd */ struct list_head bfqq_list; - /* bit vector: a 1 for each seeky requests in history */ - u32 seek_history; - - /* node for the device's burst list */ struct hlist_node burst_list_node; - /* position of the last request enqueued */ + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; sector_t last_request_pos; - /* Number of consecutive pairs of request completion and - * arrival, such that the queue becomes idle after the - * completion, but the next request arrives within an idle - * time slice; used only if the queue's IO_bound flag has been - * cleared. - */ unsigned int requests_within_timer; - /* pid of the process owning the queue, used for logging purposes */ pid_t pid; - - /* - * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL - * if the queue is shared. 
- */ struct bfq_io_cq *bic; - /* current maximum weight-raising time for this queue */ + /* weight-raising fields */ unsigned long wr_cur_max_time; - /* - * Minimum time instant such that, only if a new request is - * enqueued after this time instant in an idle @bfq_queue with - * no outstanding requests, then the task associated with the - * queue it is deemed as soft real-time (see the comments on - * the function bfq_bfqq_softrt_next_start()) - */ unsigned long soft_rt_next_start; - /* - * Start time of the current weight-raising period if - * the @bfq-queue is being weight-raised, otherwise - * finish time of the last weight-raising period. - */ unsigned long last_wr_start_finish; - /* factor by which the weight of this queue is multiplied */ unsigned int wr_coeff; - /* - * Time of the last transition of the @bfq_queue from idle to - * backlogged. - */ unsigned long last_idle_bklogged; - /* - * Cumulative service received from the @bfq_queue since the - * last transition from idle to backlogged. - */ unsigned long service_from_backlogged; - /* - * Value of wr start time when switching to soft rt - */ - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ }; /** * struct bfq_ttime - per process thinktime stats. + * @ttime_total: total process thinktime + * @ttime_samples: number of thinktime samples + * @ttime_mean: average process thinktime */ struct bfq_ttime { - u64 last_end_request; /* completion time of last request */ - - u64 ttime_total; /* total process thinktime */ - unsigned long ttime_samples; /* number of thinktime samples */ - u64 ttime_mean; /* average process thinktime */ + unsigned long last_end_request; + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; }; /** * struct bfq_io_cq - per (request_queue, io_context) structure. 
+ * @icq: associated io_cq structure + * @bfqq: array of two process queues, the sync and the async + * @ttime: associated @bfq_ttime struct + * @ioprio: per (request_queue, blkcg) ioprio. + * @blkcg_id: id of the blkcg the related io_cq belongs to. + * @wr_time_left: snapshot of the time left before weight raising ends + * for the sync queue associated to this process; this + * snapshot is taken to remember this value while the weight + * raising is suspended because the queue is merged with a + * shared queue, and is used to set @raising_cur_max_time + * when the queue is split from the shared queue and its + * weight is raised again + * @saved_idle_window: same purpose as the previous field for the idle + * window + * @saved_IO_bound: same purpose as the previous two fields for the I/O + * bound classification of a queue + * @saved_in_large_burst: same purpose as the previous fields for the + * value of the field keeping the queue's belonging + * to a large burst + * @was_in_burst_list: true if the queue belonged to a burst list + * before its merge with another cooperating queue + * @cooperations: counter of consecutive successful queue merges underwent + * by any of the process' @bfq_queues + * @failed_cooperations: counter of consecutive failed queue merges of any + * of the process' @bfq_queues */ struct bfq_io_cq { - /* associated io_cq structure */ struct io_cq icq; /* must be the first member */ - /* array of two process queues, the sync and the async */ struct bfq_queue *bfqq[2]; - /* associated @bfq_ttime struct */ struct bfq_ttime ttime; - /* per (request_queue, blkcg) ioprio */ int ioprio; + #ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ + uint64_t blkcg_id; /* the current blkcg ID */ #endif - /* - * Snapshot of the idle window before merging; taken to - * remember this value while the queue is merged, so as to be - * able to restore it in case of split. 
- */ + unsigned int wr_time_left; bool saved_idle_window; - /* - * Same purpose as the previous two fields for the I/O bound - * classification of a queue. - */ bool saved_IO_bound; - /* - * Same purpose as the previous fields for the value of the - * field keeping the queue's belonging to a large burst - */ bool saved_in_large_burst; - /* - * True if the queue belonged to a burst list before its merge - * with another cooperating queue. - */ bool was_in_burst_list; - /* - * Similar to previous fields: save wr information. - */ - unsigned long saved_wr_coeff; - unsigned long saved_last_wr_start_finish; - unsigned long saved_wr_start_at_switch_to_srt; - unsigned int saved_wr_cur_max_time; + unsigned int cooperations; + unsigned int failed_cooperations; }; enum bfq_device_speed { @@ -375,232 +354,224 @@ enum bfq_device_speed { }; /** - * struct bfq_data - per-device data structure. + * struct bfq_data - per device data structure. + * @queue: request queue for the managed device. + * @root_group: root bfq_group for the device. + * @active_numerous_groups: number of bfq_groups containing more than one + * active @bfq_entity. + * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by + * weight. Used to keep track of whether all @bfq_queues + * have the same weight. The tree contains one counter + * for each distinct weight associated to some active + * and not weight-raised @bfq_queue (see the comments to + * the functions bfq_weights_tree_[add|remove] for + * further details). + * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted + * by weight. Used to keep track of whether all + * @bfq_groups have the same weight. The tree contains + * one counter for each distinct weight associated to + * some active @bfq_group (see the comments to the + * functions bfq_weights_tree_[add|remove] for further + * details). + * @busy_queues: number of bfq_queues containing requests (including the + * queue in service, even if it is idling). 
+ * @busy_in_flight_queues: number of @bfq_queues containing pending or + * in-flight requests, plus the @bfq_queue in + * service, even if idle but waiting for the + * possible arrival of its next sync request. This + * field is updated only if the device is rotational, + * but used only if the device is also NCQ-capable. + * The reason why the field is updated also for non- + * NCQ-capable rotational devices is related to the + * fact that the value of @hw_tag may be set also + * later than when busy_in_flight_queues may need to + * be incremented for the first time(s). Taking also + * this possibility into account, to avoid unbalanced + * increments/decrements, would imply more overhead + * than just updating busy_in_flight_queues + * regardless of the value of @hw_tag. + * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues + * (that is, seeky queues that expired + * for budget timeout at least once) + * containing pending or in-flight + * requests, including the in-service + * @bfq_queue if constantly seeky. This + * field is updated only if the device + * is rotational, but used only if the + * device is also NCQ-capable (see the + * comments to @busy_in_flight_queues). + * @wr_busy_queues: number of weight-raised busy @bfq_queues. + * @queued: number of queued requests. + * @rq_in_driver: number of requests dispatched and waiting for completion. + * @sync_flight: number of sync requests in the driver. + * @max_rq_in_driver: max number of reqs in driver in the last + * @hw_tag_samples completed requests. + * @hw_tag_samples: nr of samples used to calculate hw_tag. + * @hw_tag: flag set to one if the driver is showing a queueing behavior. + * @budgets_assigned: number of budgets assigned. + * @idle_slice_timer: timer set when idling for the next sequential request + * from the queue in service. + * @unplug_work: delayed work to restart dispatching on the request queue. + * @in_service_queue: bfq_queue in service. 
+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. + * @last_position: on-disk position of the last served request. + * @last_budget_start: beginning of the last budget. + * @last_idling_start: beginning of the last idle slice. + * @peak_rate: peak transfer rate observed for a budget. + * @peak_rate_samples: number of samples used to calculate @peak_rate. + * @bfq_max_budget: maximum budget allotted to a bfq_queue before + * rescheduling. + * @active_list: list of all the bfq_queues active on the device. + * @idle_list: list of all the bfq_queues idle on the device. + * @bfq_fifo_expire: timeout for async/sync requests; when it expires + * requests are served in fifo order. + * @bfq_back_penalty: weight of backward seeks wrt forward ones. + * @bfq_back_max: maximum allowed backward seek. + * @bfq_slice_idle: maximum idling time. + * @bfq_user_max_budget: user-configured max budget value + * (0 for auto-tuning). + * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to + * async queues. + * @bfq_timeout: timeout for bfq_queues to consume their budget; used to + * to prevent seeky queues to impose long latencies to well + * behaved ones (this also implies that seeky queues cannot + * receive guarantees in the service domain; after a timeout + * they are charged for the whole allocated budget, to try + * to preserve a behavior reasonably fair among them, but + * without service-domain guarantees). + * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is + * no more granted any weight-raising. + * @bfq_failed_cooperations: number of consecutive failed cooperation + * chances after which weight-raising is restored + * to a queue subject to more than bfq_coop_thresh + * queue merges. 
+ * @bfq_requests_within_timer: number of consecutive requests that must be + * issued within the idle time slice to set + * again idling to a queue which was marked as + * non-I/O-bound (see the definition of the + * IO_bound flag for further details). + * @last_ins_in_burst: last time at which a queue entered the current + * burst of queues being activated shortly after + * each other; for more details about this and the + * following parameters related to a burst of + * activations, see the comments to the function + * @bfq_handle_burst. + * @bfq_burst_interval: reference time interval used to decide whether a + * queue has been activated shortly after + * @last_ins_in_burst. + * @burst_size: number of queues in the current burst of queue activations. + * @bfq_large_burst_thresh: maximum burst size above which the current + * queue-activation burst is deemed as 'large'. + * @large_burst: true if a large queue-activation burst is in progress. + * @burst_list: head of the burst list (as for the above fields, more details + * in the comments to the function bfq_handle_burst). + * @low_latency: if set to true, low-latency heuristics are enabled. + * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised + * queue is multiplied. + * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). + * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. + * @bfq_wr_min_idle_time: minimum idle period after which weight-raising + * may be reactivated for a queue (in jiffies). + * @bfq_wr_min_inter_arr_async: minimum period between request arrivals + * after which weight-raising may be + * reactivated for an already busy queue + * (in jiffies). + * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, + * sectors per seconds. + * @RT_prod: cached value of the product R*T used for computing the maximum + * duration of the weight raising automatically. + * @device_speed: device-speed class for the low-latency heuristic. 
+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. * * All the fields are protected by the @queue lock. */ struct bfq_data { - /* request queue for the device */ struct request_queue *queue; - /* root bfq_group for the device */ struct bfq_group *root_group; - /* - * rbtree of weight counters of @bfq_queues, sorted by - * weight. Used to keep track of whether all @bfq_queues have - * the same weight. The tree contains one counter for each - * distinct weight associated to some active and not - * weight-raised @bfq_queue (see the comments to the functions - * bfq_weights_tree_[add|remove] for further details). - */ +#ifdef CONFIG_BFQ_GROUP_IOSCHED + int active_numerous_groups; +#endif + struct rb_root queue_weights_tree; - /* - * rbtree of non-queue @bfq_entity weight counters, sorted by - * weight. Used to keep track of whether all @bfq_groups have - * the same weight. The tree contains one counter for each - * distinct weight associated to some active @bfq_group (see - * the comments to the functions bfq_weights_tree_[add|remove] - * for further details). - */ struct rb_root group_weights_tree; - /* - * Number of bfq_queues containing requests (including the - * queue in service, even if it is idling). - */ int busy_queues; - /* number of weight-raised busy @bfq_queues */ + int busy_in_flight_queues; + int const_seeky_busy_in_flight_queues; int wr_busy_queues; - /* number of queued requests */ int queued; - /* number of requests dispatched and waiting for completion */ int rq_in_driver; + int sync_flight; - /* - * Maximum number of requests in driver in the last - * @hw_tag_samples completed requests. - */ int max_rq_in_driver; - /* number of samples used to calculate hw_tag */ int hw_tag_samples; - /* flag set to one if the driver is showing a queueing behavior */ int hw_tag; - /* number of budgets assigned */ int budgets_assigned; - /* - * Timer set when idling (waiting) for the next request from - * the queue in service. 
- */ - struct hrtimer idle_slice_timer; - /* delayed work to restart dispatching on the request queue */ + struct timer_list idle_slice_timer; struct work_struct unplug_work; - /* bfq_queue in service */ struct bfq_queue *in_service_queue; - /* bfq_io_cq (bic) associated with the @in_service_queue */ struct bfq_io_cq *in_service_bic; - /* on-disk position of the last served request */ sector_t last_position; - /* time of last request completion (ns) */ - u64 last_completion; - - /* time of first rq dispatch in current observation interval (ns) */ - u64 first_dispatch; - /* time of last rq dispatch in current observation interval (ns) */ - u64 last_dispatch; - - /* beginning of the last budget */ ktime_t last_budget_start; - /* beginning of the last idle slice */ ktime_t last_idling_start; - - /* number of samples in current observation interval */ int peak_rate_samples; - /* num of samples of seq dispatches in current observation interval */ - u32 sequential_samples; - /* total num of sectors transferred in current observation interval */ - u64 tot_sectors_dispatched; - /* max rq size seen during current observation interval (sectors) */ - u32 last_rq_max_size; - /* time elapsed from first dispatch in current observ. interval (us) */ - u64 delta_from_first; - /* current estimate of device peak rate */ - u32 peak_rate; - - /* maximum budget allotted to a bfq_queue before rescheduling */ + u64 peak_rate; int bfq_max_budget; - /* list of all the bfq_queues active on the device */ struct list_head active_list; - /* list of all the bfq_queues idle on the device */ struct list_head idle_list; - /* - * Timeout for async/sync requests; when it fires, requests - * are served in fifo order. 
- */ - u64 bfq_fifo_expire[2]; - /* weight of backward seeks wrt forward ones */ + unsigned int bfq_fifo_expire[2]; unsigned int bfq_back_penalty; - /* maximum allowed backward seek */ unsigned int bfq_back_max; - /* maximum idling time */ - u32 bfq_slice_idle; + unsigned int bfq_slice_idle; + u64 bfq_class_idle_last_service; - /* user-configured max budget value (0 for auto-tuning) */ int bfq_user_max_budget; - /* - * Timeout for bfq_queues to consume their budget; used to - * prevent seeky queues from imposing long latencies to - * sequential or quasi-sequential ones (this also implies that - * seeky queues cannot receive guarantees in the service - * domain; after a timeout they are charged for the time they - * have been in service, to preserve fairness among them, but - * without service-domain guarantees). - */ - unsigned int bfq_timeout; - - /* - * Number of consecutive requests that must be issued within - * the idle time slice to set again idling to a queue which - * was marked as non-I/O-bound (see the definition of the - * IO_bound flag for further details). - */ + int bfq_max_budget_async_rq; + unsigned int bfq_timeout[2]; + + unsigned int bfq_coop_thresh; + unsigned int bfq_failed_cooperations; unsigned int bfq_requests_within_timer; - /* - * Force device idling whenever needed to provide accurate - * service guarantees, without caring about throughput - * issues. CAVEAT: this may even increase latencies, in case - * of useless idling for processes that did stop doing I/O. - */ - bool strict_guarantees; - - /* - * Last time at which a queue entered the current burst of - * queues being activated shortly after each other; for more - * details about this and the following parameters related to - * a burst of activations, see the comments on the function - * bfq_handle_burst. - */ unsigned long last_ins_in_burst; - /* - * Reference time interval used to decide whether a queue has - * been activated shortly after @last_ins_in_burst. 
- */ unsigned long bfq_burst_interval; - /* number of queues in the current burst of queue activations */ int burst_size; - - /* common parent entity for the queues in the burst */ - struct bfq_entity *burst_parent_entity; - /* Maximum burst size above which the current queue-activation - * burst is deemed as 'large'. - */ unsigned long bfq_large_burst_thresh; - /* true if a large queue-activation burst is in progress */ bool large_burst; - /* - * Head of the burst list (as for the above fields, more - * details in the comments on the function bfq_handle_burst). - */ struct hlist_head burst_list; - /* if set to true, low-latency heuristics are enabled */ bool low_latency; - /* - * Maximum factor by which the weight of a weight-raised queue - * is multiplied. - */ + + /* parameters of the low_latency heuristics */ unsigned int bfq_wr_coeff; - /* maximum duration of a weight-raising period (jiffies) */ unsigned int bfq_wr_max_time; - - /* Maximum weight-raising duration for soft real-time processes */ unsigned int bfq_wr_rt_max_time; - /* - * Minimum idle period after which weight-raising may be - * reactivated for a queue (in jiffies). - */ unsigned int bfq_wr_min_idle_time; - /* - * Minimum period between request arrivals after which - * weight-raising may be reactivated for an already busy async - * queue (in jiffies). - */ unsigned long bfq_wr_min_inter_arr_async; - - /* Max service-rate for a soft real-time queue, in sectors/sec */ unsigned int bfq_wr_max_softrt_rate; - /* - * Cached value of the product R*T, used for computing the - * maximum duration of weight raising automatically. 
- */ u64 RT_prod; - /* device-speed class for the low-latency heuristic */ enum bfq_device_speed device_speed; - /* fallback dummy bfqq for extreme OOM conditions */ struct bfq_queue oom_bfqq; }; enum bfqq_state_flags { - BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ - BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ - BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* - * waiting for a request - * without idling the device - */ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ BFQ_BFQQ_FLAG_IO_bound, /* * bfqq has timed-out at least once * having consumed at most 2/10 of @@ -610,12 +581,17 @@ enum bfqq_state_flags { * bfqq activated in a large burst, * see comments to bfq_handle_burst. 
*/ + BFQ_BFQQ_FLAG_constantly_seeky, /* + * bfqq has proved to be slow and + * seeky until budget timeout + */ BFQ_BFQQ_FLAG_softrt_update, /* * may need softrt-next-start * update */ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ - BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ + BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ }; #define BFQ_BFQQ_FNS(name) \ @@ -632,94 +608,28 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ } -BFQ_BFQQ_FNS(just_created); BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); -BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(must_alloc); BFQ_BFQQ_FNS(fifo_expire); BFQ_BFQQ_FNS(idle_window); BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(budget_new); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); +BFQ_BFQQ_FNS(constantly_seeky); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(just_split); BFQ_BFQQ_FNS(softrt_update); #undef BFQ_BFQQ_FNS /* Logging facilities. */ -#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - pr_crit("bfq%d%c %s " fmt "\n", \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - __pbuf, ##args); \ -} while (0) - -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s " fmt "\n", __pbuf, ##args); \ -} while (0) - -#else /* CONFIG_BFQ_GROUP_IOSCHED */ - -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 
'S' : 'A', \ - ##args) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - -#endif /* CONFIG_BFQ_GROUP_IOSCHED */ - -#define bfq_log(bfqd, fmt, args...) \ - pr_crit("bfq " fmt "\n", ##args) - -#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - __pbuf, ##args); \ -} while (0) - -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -} while (0) - -#else /* CONFIG_BFQ_GROUP_IOSCHED */ - -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - ##args) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - -#endif /* CONFIG_BFQ_GROUP_IOSCHED */ +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) #define bfq_log(bfqd, fmt, args...) \ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ /* Expiration reasons. 
*/ enum bfqq_expiration { @@ -730,12 +640,15 @@ enum bfqq_expiration { BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ - BFQ_BFQQ_PREEMPTED /* preemption in progress */ }; +#ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfqg_stats { -#ifdef CONFIG_BFQ_GROUP_IOSCHED + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -744,8 +657,12 @@ struct bfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; + /* total sectors transferred */ + struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; + /* time not charged to this cgroup */ + struct blkg_stat unaccounted_time; /* sum of number of ios queued across all samples */ struct blkg_stat avg_queue_size_sum; /* count of samples taken for average */ @@ -763,10 +680,8 @@ struct bfqg_stats { uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; -#endif }; -#ifdef CONFIG_BFQ_GROUP_IOSCHED /* * struct bfq_group_data - per-blkcg storage for the blkio subsystem. * @@ -777,7 +692,7 @@ struct bfq_group_data { /* must be the first member */ struct blkcg_policy_data pd; - unsigned int weight; + unsigned short weight; }; /** @@ -797,7 +712,7 @@ struct bfq_group_data { * unused for the root group. Used to know whether there * are groups with more than one active @bfq_entity * (see the comments to the function - * bfq_bfqq_may_idle()). + * bfq_bfqq_must_not_expire()). * @rq_pos_tree: rbtree sorted by next_request position, used when * determining if two or more queues have interleaving * requests (see bfq_find_close_cooperator()). 
@@ -830,6 +745,7 @@ struct bfq_group { struct rb_root rq_pos_tree; struct bfqg_stats stats; + struct bfqg_stats dead_stats; /* stats pushed from dead children */ }; #else @@ -845,38 +761,17 @@ struct bfq_group { static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -static unsigned int bfq_class_idx(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - return bfqq ? bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS - 1; -} - static struct bfq_service_tree * bfq_entity_service_tree(struct bfq_entity *entity) { struct bfq_sched_data *sched_data = entity->sched_data; struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int idx = bfq_class_idx(entity); + unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : + BFQ_DEFAULT_GRP_CLASS; BUG_ON(idx >= BFQ_IOPRIO_CLASSES); BUG_ON(sched_data == NULL); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "entity_service_tree %p %d", - sched_data->service_tree + idx, idx); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "entity_service_tree %p %d", - sched_data->service_tree + idx, idx); - } -#endif return sched_data->service_tree + idx; } @@ -896,6 +791,47 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) return bic->icq.q->elevator->elevator_data; } +/** + * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. + * @ptr: a pointer to a bfqd. + * @flags: storage for the flags to be saved. + * + * This function allows bfqg->bfqd to be protected by the + * queue lock of the bfqd they reference; the pointer is dereferenced + * under RCU, so the storage for bfqd is assured to be safe as long + * as the RCU read side critical section does not end. After the + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be + * sure that no other writer accessed it. 
If we raced with a writer, + * the function returns NULL, with the queue unlocked, otherwise it + * returns the dereferenced pointer, with the queue locked. + */ +static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) +{ + struct bfq_data *bfqd; + + rcu_read_lock(); + bfqd = rcu_dereference(*(struct bfq_data **)ptr); + + if (bfqd != NULL) { + spin_lock_irqsave(bfqd->queue->queue_lock, *flags); + if (ptr == NULL) + printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); + else if (*ptr == bfqd) + goto out; + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); + } + + bfqd = NULL; +out: + rcu_read_unlock(); + return bfqd; +} + +static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) +{ + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); +} + #ifdef CONFIG_BFQ_GROUP_IOSCHED static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) @@ -921,13 +857,11 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); static void bfq_put_queue(struct bfq_queue *bfqq); static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, bool is_sync, - struct bfq_io_cq *bic); + struct bio *bio, int is_sync, + struct bfq_io_cq *bic, gfp_t gfp_mask); static void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -#ifdef CONFIG_BFQ_GROUP_IOSCHED static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -#endif static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); #endif /* _BFQ_H */ From e34c124a21d906907b23330e097cacf93eaa43c0 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:39 +0800 Subject: [PATCH 40/59] Revert "block, bfq: add Early Queue Merge (EQM) to BFQ-v7r11 for 4.5.0" This reverts commit 24995f0db5a4b2f5e0145dd6de496c4d09c46de5. 
--- block/bfq-cgroup.c | 5 - block/bfq-iosched.c | 685 +------------------------------------------- block/bfq.h | 66 ----- 3 files changed, 13 insertions(+), 743 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 03679962d5c0..8b08a5758565 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -440,7 +440,6 @@ static void bfq_pd_init(struct blkg_policy_data *pd) */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; - bfqg->rq_pos_tree = RB_ROOT; } static void bfq_pd_free(struct blkg_policy_data *pd) @@ -534,9 +533,6 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, return bfqg; } -static void bfq_pos_tree_add_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq); - /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. @@ -584,7 +580,6 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqg_get(bfqg); if (busy) { - bfq_pos_tree_add_move(bfqd, bfqq); if (resume) bfq_activate_bfqq(bfqd, bfqq); } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cf3e9b1800c9..85e216905a5d 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -295,72 +295,6 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } -static struct bfq_queue * -bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - sector_t sector, struct rb_node **ret_parent, - struct rb_node ***rb_link) -{ - struct rb_node **p, *parent; - struct bfq_queue *bfqq = NULL; - - parent = NULL; - p = &root->rb_node; - while (*p) { - struct rb_node **n; - - parent = *p; - bfqq = rb_entry(parent, struct bfq_queue, pos_node); - - /* - * Sort strictly based on sector. Smallest to the left, - * largest to the right. 
- */ - if (sector > blk_rq_pos(bfqq->next_rq)) - n = &(*p)->rb_right; - else if (sector < blk_rq_pos(bfqq->next_rq)) - n = &(*p)->rb_left; - else - break; - p = n; - bfqq = NULL; - } - - *ret_parent = parent; - if (rb_link) - *rb_link = p; - - bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", - (unsigned long long) sector, - bfqq ? bfqq->pid : 0); - - return bfqq; -} - -static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct rb_node **p, *parent; - struct bfq_queue *__bfqq; - - if (bfqq->pos_root) { - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } - - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) - return; - - bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; - __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, - blk_rq_pos(bfqq->next_rq), &parent, &p); - if (!__bfqq) { - rb_link_node(&bfqq->pos_node, parent, p); - rb_insert_color(&bfqq->pos_node, bfqq->pos_root); - } else - bfqq->pos_root = NULL; -} - /* * Tell whether there are active queues or groups with differentiated weights. */ @@ -593,57 +527,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } -static unsigned int bfq_bfqq_cooperations(struct bfq_queue *bfqq) -{ - return bfqq->bic ? bfqq->bic->cooperations : 0; -} - -static void -bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) -{ - if (bic->saved_idle_window) - bfq_mark_bfqq_idle_window(bfqq); - else - bfq_clear_bfqq_idle_window(bfqq); - if (bic->saved_IO_bound) - bfq_mark_bfqq_IO_bound(bfqq); - else - bfq_clear_bfqq_IO_bound(bfqq); - /* Assuming that the flag in_large_burst is already correctly set */ - if (bic->wr_time_left && bfqq->bfqd->low_latency && - !bfq_bfqq_in_large_burst(bfqq) && - bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { - /* - * Start a weight raising period with the duration given by - * the raising_time_left snapshot. 
- */ - if (bfq_bfqq_busy(bfqq)) - bfqq->bfqd->wr_busy_queues++; - bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bic->wr_time_left; - bfqq->last_wr_start_finish = jiffies; - bfqq->entity.prio_changed = 1; - } - /* - * Clear wr_time_left to prevent bfq_bfqq_save_state() from - * getting confused about the queue's need of a weight-raising - * period. - */ - bic->wr_time_left = 0; -} - -static int bfqq_process_refs(struct bfq_queue *bfqq) -{ - int process_refs, io_refs; - - lockdep_assert_held(bfqq->bfqd->queue->queue_lock); - - io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; - process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); - return process_refs; -} - /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) { @@ -880,14 +763,8 @@ static void bfq_add_request(struct request *rq) BUG_ON(!next_rq); bfqq->next_rq = next_rq; - /* - * Adjust priority tree position, if next_rq changes. 
- */ - if (prev != bfqq->next_rq) - bfq_pos_tree_add_move(bfqd, bfqq); - if (!bfq_bfqq_busy(bfqq)) { - bool soft_rt, coop_or_in_burst, + bool soft_rt, in_burst, idle_for_long_time = time_is_before_jiffies( bfqq->budget_timeout + bfqd->bfq_wr_min_idle_time); @@ -915,12 +792,11 @@ static void bfq_add_request(struct request *rq) bfqd->last_ins_in_burst = jiffies; } - coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || - bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; + in_burst = bfq_bfqq_in_large_burst(bfqq); soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && - !coop_or_in_burst && + !in_burst && time_is_before_jiffies(bfqq->soft_rt_next_start); - interactive = !coop_or_in_burst && idle_for_long_time; + interactive = !in_burst && idle_for_long_time; entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); @@ -939,9 +815,6 @@ static void bfq_add_request(struct request *rq) if (!bfqd->low_latency) goto add_bfqq_busy; - if (bfq_bfqq_just_split(bfqq)) - goto set_prio_changed; - /* * If the queue: * - is not being boosted, @@ -966,7 +839,7 @@ static void bfq_add_request(struct request *rq) } else if (old_wr_coeff > 1) { if (interactive) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else if (coop_or_in_burst || + else if (in_burst || (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && !soft_rt)) { @@ -1031,7 +904,6 @@ static void bfq_add_request(struct request *rq) bfqd->bfq_wr_rt_max_time; } } -set_prio_changed: if (old_wr_coeff != bfqq->wr_coeff) entity->prio_changed = 1; add_bfqq_busy: @@ -1174,15 +1046,6 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, bfqd->last_position); BUG_ON(!next_rq); bfqq->next_rq = next_rq; - /* - * If next_rq changes, update both the queue's budget to - * fit the new request and the queue's position in its - * rq_pos_tree. 
- */ - if (prev != bfqq->next_rq) { - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } } } @@ -1265,346 +1128,11 @@ static void bfq_end_wr(struct bfq_data *bfqd) spin_unlock_irq(bfqd->queue->queue_lock); } -static sector_t bfq_io_struct_pos(void *io_struct, bool request) -{ - if (request) - return blk_rq_pos(io_struct); - else - return ((struct bio *)io_struct)->bi_iter.bi_sector; -} - -static int bfq_rq_close_to_sector(void *io_struct, bool request, - sector_t sector) -{ - return abs(bfq_io_struct_pos(io_struct, request) - sector) <= - BFQQ_SEEK_THR; -} - -static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - sector_t sector) -{ - struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; - struct rb_node *parent, *node; - struct bfq_queue *__bfqq; - - if (RB_EMPTY_ROOT(root)) - return NULL; - - /* - * First, if we find a request starting at the end of the last - * request, choose it. - */ - __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); - if (__bfqq) - return __bfqq; - - /* - * If the exact sector wasn't found, the parent of the NULL leaf - * will contain the closest sector (rq_pos_tree sorted by - * next_request position). - */ - __bfqq = rb_entry(parent, struct bfq_queue, pos_node); - if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) - return __bfqq; - - if (blk_rq_pos(__bfqq->next_rq) < sector) - node = rb_next(&__bfqq->pos_node); - else - node = rb_prev(&__bfqq->pos_node); - if (!node) - return NULL; - - __bfqq = rb_entry(node, struct bfq_queue, pos_node); - if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) - return __bfqq; - - return NULL; -} - -static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, - struct bfq_queue *cur_bfqq, - sector_t sector) -{ - struct bfq_queue *bfqq; - - /* - * We shall notice if some of the queues are cooperating, - * e.g., working closely on the same area of the device. 
In - * that case, we can group them together and: 1) don't waste - * time idling, and 2) serve the union of their requests in - * the best possible order for throughput. - */ - bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); - if (!bfqq || bfqq == cur_bfqq) - return NULL; - - return bfqq; -} - -static struct bfq_queue * -bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -{ - int process_refs, new_process_refs; - struct bfq_queue *__bfqq; - - /* - * If there are no process references on the new_bfqq, then it is - * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain - * may have dropped their last reference (not just their last process - * reference). - */ - if (!bfqq_process_refs(new_bfqq)) - return NULL; - - /* Avoid a circular list and skip interim queue merges. */ - while ((__bfqq = new_bfqq->new_bfqq)) { - if (__bfqq == bfqq) - return NULL; - new_bfqq = __bfqq; - } - - process_refs = bfqq_process_refs(bfqq); - new_process_refs = bfqq_process_refs(new_bfqq); - /* - * If the process for the bfqq has gone away, there is no - * sense in merging the queues. - */ - if (process_refs == 0 || new_process_refs == 0) - return NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", - new_bfqq->pid); - - /* - * Merging is just a redirection: the requests of the process - * owning one of the two queues are redirected to the other queue. - * The latter queue, in its turn, is set as shared if this is the - * first time that the requests of some process are redirected to - * it. - * - * We redirect bfqq to new_bfqq and not the opposite, because we - * are in the context of the process owning bfqq, hence we have - * the io_cq of this process. So we can immediately configure this - * io_cq to redirect the requests of the process to new_bfqq. 
- * - * NOTE, even if new_bfqq coincides with the in-service queue, the - * io_cq of new_bfqq is not available, because, if the in-service - * queue is shared, bfqd->in_service_bic may not point to the - * io_cq of the in-service queue. - * Redirecting the requests of the process owning bfqq to the - * currently in-service queue is in any case the best option, as - * we feed the in-service queue with new requests close to the - * last request served and, by doing so, hopefully increase the - * throughput. - */ - bfqq->new_bfqq = new_bfqq; - atomic_add(process_refs, &new_bfqq->ref); - return new_bfqq; -} - -static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) -{ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; - - /* - * If either of the queues has already been detected as seeky, - * then merging it with the other queue is unlikely to lead to - * sequential I/O. - */ - if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) - return false; - - /* - * Interleaved I/O is known to be done by (some) applications - * only for reads, so it does not make sense to merge async - * queues. - */ - if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) - return false; - - return true; -} - -/* - * Attempt to schedule a merge of bfqq with the currently in-service queue - * or with a close queue among the scheduled queues. - * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue - * structure otherwise. - * - * The OOM queue is not allowed to participate to cooperation: in fact, since - * the requests temporarily redirected to the OOM queue could be redirected - * again to dedicated queues at any time, the state needed to correctly - * handle merging with the OOM queue would be quite complex and expensive - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. 
- */ -static struct bfq_queue * -bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - void *io_struct, bool request) -{ - struct bfq_queue *in_service_bfqq, *new_bfqq; - - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; - /* If device has only one backlogged bfq_queue, don't search. */ - if (bfqd->busy_queues == 1) - return NULL; - - in_service_bfqq = bfqd->in_service_queue; - - if (!in_service_bfqq || in_service_bfqq == bfqq || - !bfqd->in_service_bic || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - - if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { - new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); - if (new_bfqq) - return new_bfqq; - } - /* - * Check whether there is a cooperator among currently scheduled - * queues. The only thing we need is that the bio/request is not - * NULL, as we need it to establish whether a cooperator exists. - */ -check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - - if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - - return NULL; -} - -static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -{ - /* - * If !bfqq->bic, the queue is already shared or its requests - * have already been redirected to a shared queue; both idle window - * and weight raising state have already been saved. Do nothing. 
- */ - if (!bfqq->bic) - return; - if (bfqq->bic->wr_time_left) - /* - * This is the queue of a just-started process, and would - * deserve weight raising: we set wr_time_left to the full - * weight-raising duration to trigger weight-raising when - * and if the queue is split and the first request of the - * queue is enqueued. - */ - bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); - else if (bfqq->wr_coeff > 1) { - unsigned long wr_duration = - jiffies - bfqq->last_wr_start_finish; - /* - * It may happen that a queue's weight raising period lasts - * longer than its wr_cur_max_time, as weight raising is - * handled only when a request is enqueued or dispatched (it - * does not use any timer). If the weight raising period is - * about to end, don't save it. - */ - if (bfqq->wr_cur_max_time <= wr_duration) - bfqq->bic->wr_time_left = 0; - else - bfqq->bic->wr_time_left = - bfqq->wr_cur_max_time - wr_duration; - /* - * The bfq_queue is becoming shared or the requests of the - * process owning the queue are being redirected to a shared - * queue. Stop the weight raising period of the queue, as in - * both cases it should not be owned by an interactive or - * soft real-time application. - */ - bfq_bfqq_end_wr(bfqq); - } else - bfqq->bic->wr_time_left = 0; - bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); - bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bfqq->bic->cooperations++; - bfqq->bic->failed_cooperations = 0; -} - -static void bfq_get_bic_reference(struct bfq_queue *bfqq) -{ - /* - * If bfqq->bic has a non-NULL value, the bic to which it belongs - * is about to begin using a shared bfq_queue. 
- */ - if (bfqq->bic) - atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -} - -static void -bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -{ - bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", - (unsigned long) new_bfqq->pid); - /* Save weight raising and idle window of the merged queues */ - bfq_bfqq_save_state(bfqq); - bfq_bfqq_save_state(new_bfqq); - if (bfq_bfqq_IO_bound(bfqq)) - bfq_mark_bfqq_IO_bound(new_bfqq); - bfq_clear_bfqq_IO_bound(bfqq); - /* - * Grab a reference to the bic, to prevent it from being destroyed - * before being possibly touched by a bfq_split_bfqq(). - */ - bfq_get_bic_reference(bfqq); - bfq_get_bic_reference(new_bfqq); - /* - * Merge queues (that is, let bic redirect its requests to new_bfqq) - */ - bic_set_bfqq(bic, new_bfqq, 1); - bfq_mark_bfqq_coop(new_bfqq); - /* - * new_bfqq now belongs to at least two bics (it is a shared queue): - * set new_bfqq->bic to NULL. bfqq either: - * - does not belong to any bic any more, and hence bfqq->bic must - * be set to NULL, or - * - is a queue whose owning bics have already been redirected to a - * different queue, hence the queue is destined to not belong to - * any bic soon and bfqq->bic is already NULL (therefore the next - * assignment causes no harm). 
- */ - new_bfqq->bic = NULL; - bfqq->bic = NULL; - bfq_put_queue(bfqq); -} - -static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) -{ - struct bfq_io_cq *bic = bfqq->bic; - struct bfq_data *bfqd = bfqq->bfqd; - - if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { - bic->failed_cooperations++; - if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) - bic->cooperations = 0; - } -} - static int bfq_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_io_cq *bic; - struct bfq_queue *bfqq, *new_bfqq; /* * Disallow merge of a sync bio into an async request. @@ -1621,26 +1149,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, if (!bic) return 0; - bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); - /* - * We take advantage of this function to perform an early merge - * of the queues of possible cooperating processes. - */ - if (bfqq) { - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); - if (new_bfqq) { - bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); - /* - * If we get here, the bio will be queued in the - * shared queue, i.e., new_bfqq, so use new_bfqq - * to decide whether bio and rq can be merged. - */ - bfqq = new_bfqq; - } else - bfq_bfqq_increase_failed_cooperations(bfqq); - } - - return bfqq == RQ_BFQQ(rq); + return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); } static void __bfq_set_in_service_queue(struct bfq_data *bfqd, @@ -1841,15 +1350,6 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) __bfq_bfqd_reset_in_service(bfqd); - /* - * If this bfqq is shared between multiple processes, check - * to make sure that those processes are still issuing I/Os - * within the mean seek distance. If not, it may be time to - * break the queues apart again. 
- */ - if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) - bfq_mark_bfqq_split_coop(bfqq); - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { /* * Overloading budget_timeout field to store the time @@ -1858,13 +1358,8 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) */ bfqq->budget_timeout = jiffies; bfq_del_bfqq_busy(bfqd, bfqq, 1); - } else { + } else bfq_activate_bfqq(bfqd, bfqq); - /* - * Resort priority tree of potential close cooperators. - */ - bfq_pos_tree_add_move(bfqd, bfqq); - } } /** @@ -2751,12 +2246,10 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* * If the queue was activated in a burst, or * too much time has elapsed from the beginning - * of this weight-raising period, or the queue has - * exceeded the acceptable number of cooperations, - * then end weight raising. + * of this weight-raising period, then end weight + * raising. */ if (bfq_bfqq_in_large_burst(bfqq) || - bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time)) { bfqq->last_wr_start_finish = jiffies; @@ -2985,25 +2478,6 @@ static void bfq_put_queue(struct bfq_queue *bfqq) #endif } -static void bfq_put_cooperator(struct bfq_queue *bfqq) -{ - struct bfq_queue *__bfqq, *next; - - /* - * If this queue was scheduled to merge with another queue, be - * sure to drop the reference taken on that queue (and others in - * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. 
- */ - __bfqq = bfqq->new_bfqq; - while (__bfqq) { - if (__bfqq == bfqq) - break; - next = __bfqq->new_bfqq; - bfq_put_queue(__bfqq); - __bfqq = next; - } -} - static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq == bfqd->in_service_queue) { @@ -3014,8 +2488,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, atomic_read(&bfqq->ref)); - bfq_put_cooperator(bfqq); - bfq_put_queue(bfqq); } @@ -3024,25 +2496,6 @@ static void bfq_init_icq(struct io_cq *icq) struct bfq_io_cq *bic = icq_to_bic(icq); bic->ttime.last_end_request = jiffies; - /* - * A newly created bic indicates that the process has just - * started doing I/O, and is probably mapping into memory its - * executable and libraries: it definitely needs weight raising. - * There is however the possibility that the process performs, - * for a while, I/O close to some other process. EQM intercepts - * this behavior and may merge the queue corresponding to the - * process with some other queue, BEFORE the weight of the queue - * is raised. Merged queues are not weight-raised (they are assumed - * to belong to processes that benefit only from high throughput). - * If the merge is basically the consequence of an accident, then - * the queue will be split soon and will get back its old weight. - * It is then important to write down somewhere that this queue - * does need weight raising, even if it did not make it to get its - * weight raised before being merged. To this purpose, we overload - * the field raising_time_left and assign 1 to it, to mark the queue - * as needing weight raising. - */ - bic->wr_time_left = 1; } static void bfq_exit_icq(struct io_cq *icq) @@ -3056,13 +2509,6 @@ static void bfq_exit_icq(struct io_cq *icq) } if (bic->bfqq[BLK_RW_SYNC]) { - /* - * If the bic is using a shared queue, put the reference - * taken on the io_context when the bic started using a - * shared bfq_queue. 
- */ - if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) - put_io_context(icq->ioc); bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); bic->bfqq[BLK_RW_SYNC] = NULL; } @@ -3368,10 +2814,6 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) return; - /* Idle window just restored, statistics are meaningless. */ - if (bfq_bfqq_just_split(bfqq)) - return; - enable_idle = bfq_bfqq_idle_window(bfqq); if (atomic_read(&bic->icq.ioc->active_ref) == 0 || @@ -3419,7 +2861,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || !BFQQ_SEEKY(bfqq)) bfq_update_idle_window(bfqd, bfqq, bic); - bfq_clear_bfqq_just_split(bfqq); bfq_log_bfqq(bfqd, bfqq, "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", @@ -3484,47 +2925,12 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_insert_request(struct request_queue *q, struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + struct bfq_queue *bfqq = RQ_BFQQ(rq); assert_spin_locked(bfqd->queue->queue_lock); - /* - * An unplug may trigger a requeue of a request from the device - * driver: make sure we are in process context while trying to - * merge two bfq_queues. - */ - if (!in_interrupt()) { - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); - if (new_bfqq) { - if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) - new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); - /* - * Release the request's reference to the old bfqq - * and make sure one is taken to the shared queue. 
- */ - new_bfqq->allocated[rq_data_dir(rq)]++; - bfqq->allocated[rq_data_dir(rq)]--; - atomic_inc(&new_bfqq->ref); - bfq_put_queue(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); - rq->elv.priv[1] = new_bfqq; - bfqq = new_bfqq; - } else - bfq_bfqq_increase_failed_cooperations(bfqq); - } - bfq_add_request(rq); - /* - * Here a newly-created bfq_queue has already started a weight-raising - * period: clear raising_time_left to prevent bfq_bfqq_save_state() - * from assigning it a full weight-raising period. See the detailed - * comments about this field in bfq_init_icq(). - */ - if (bfqq->bic) - bfqq->bic->wr_time_left = 0; rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); @@ -3692,32 +3098,6 @@ static void bfq_put_request(struct request *rq) } } -/* - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this - * was the last process referring to said bfqq. - */ -static struct bfq_queue * -bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - - put_io_context(bic->icq.ioc); - - if (bfqq_process_refs(bfqq) == 1) { - bfqq->pid = current->pid; - bfq_clear_bfqq_coop(bfqq); - bfq_clear_bfqq_split_coop(bfqq); - return bfqq; - } - - bic_set_bfqq(bic, NULL, 1); - - bfq_put_cooperator(bfqq); - - bfq_put_queue(bfqq); - return NULL; -} - /* * Allocate bfq data structures associated with this request. 
*/ @@ -3730,7 +3110,6 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; unsigned long flags; - bool split = false; might_sleep_if(gfpflags_allow_blocking(gfp_mask)); @@ -3743,30 +3122,15 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, bfq_bic_update_cgroup(bic, bio); -new_queue: bfqq = bic_to_bfqq(bic, is_sync); if (!bfqq || bfqq == &bfqd->oom_bfqq) { bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) + if (is_sync) { + if (bfqd->large_burst) bfq_mark_bfqq_in_large_burst(bfqq); - else { + else bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } - } - } else { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); - bfqq = bfq_split_bfqq(bic, bfqq); - split = true; - if (!bfqq) - goto new_queue; } } @@ -3778,26 +3142,6 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; - /* - * If a bfq_queue has only one process reference, it is owned - * by only one bfq_io_cq: we can set the bic field of the - * bfq_queue to the address of that structure. Also, if the - * queue has just been split, mark a flag so that the - * information is available to the other scheduler hooks. - */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; - if (split) { - bfq_mark_bfqq_just_split(bfqq); - /* - * If the queue has just been split from a shared - * queue, restore the idle window and the possible - * weight raising period. 
- */ - bfq_bfqq_resume_state(bfqq, bic); - } - } - spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -3951,7 +3295,6 @@ static void bfq_init_root_group(struct bfq_group *root_group, root_group->my_entity = NULL; root_group->bfqd = bfqd; #endif - root_group->rq_pos_tree = RB_ROOT; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; } @@ -4032,8 +3375,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; - bfqd->bfq_coop_thresh = 2; - bfqd->bfq_failed_cooperations = 7000; bfqd->bfq_requests_within_timer = 120; bfqd->bfq_large_burst_thresh = 11; diff --git a/block/bfq.h b/block/bfq.h index fcce85528377..2bf54ae89ff0 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -183,8 +183,6 @@ struct bfq_group; * ioprio_class value. * @new_bfqq: shared bfq_queue if queue is cooperating with * one or more other queues. - * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). - * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). * @sort_list: sorted list of pending requests. * @next_rq: if fifo isn't expired, next request to serve. * @queued: nr of requests queued in @sort_list. @@ -306,26 +304,6 @@ struct bfq_ttime { * @ttime: associated @bfq_ttime struct * @ioprio: per (request_queue, blkcg) ioprio. * @blkcg_id: id of the blkcg the related io_cq belongs to. 
- * @wr_time_left: snapshot of the time left before weight raising ends - * for the sync queue associated to this process; this - * snapshot is taken to remember this value while the weight - * raising is suspended because the queue is merged with a - * shared queue, and is used to set @raising_cur_max_time - * when the queue is split from the shared queue and its - * weight is raised again - * @saved_idle_window: same purpose as the previous field for the idle - * window - * @saved_IO_bound: same purpose as the previous two fields for the I/O - * bound classification of a queue - * @saved_in_large_burst: same purpose as the previous fields for the - * value of the field keeping the queue's belonging - * to a large burst - * @was_in_burst_list: true if the queue belonged to a burst list - * before its merge with another cooperating queue - * @cooperations: counter of consecutive successful queue merges underwent - * by any of the process' @bfq_queues - * @failed_cooperations: counter of consecutive failed queue merges of any - * of the process' @bfq_queues */ struct bfq_io_cq { struct io_cq icq; /* must be the first member */ @@ -336,16 +314,6 @@ struct bfq_io_cq { #ifdef CONFIG_BFQ_GROUP_IOSCHED uint64_t blkcg_id; /* the current blkcg ID */ #endif - - unsigned int wr_time_left; - bool saved_idle_window; - bool saved_IO_bound; - - bool saved_in_large_burst; - bool was_in_burst_list; - - unsigned int cooperations; - unsigned int failed_cooperations; }; enum bfq_device_speed { @@ -589,9 +557,6 @@ enum bfqq_state_flags { * may need softrt-next-start * update */ - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ - BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ }; #define BFQ_BFQQ_FNS(name) \ @@ -618,9 +583,6 @@ BFQ_BFQQ_FNS(budget_new); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(constantly_seeky); -BFQ_BFQQ_FNS(coop); -BFQ_BFQQ_FNS(split_coop); -BFQ_BFQQ_FNS(just_split); 
BFQ_BFQQ_FNS(softrt_update); #undef BFQ_BFQQ_FNS @@ -713,9 +675,6 @@ struct bfq_group_data { * are groups with more than one active @bfq_entity * (see the comments to the function * bfq_bfqq_must_not_expire()). - * @rq_pos_tree: rbtree sorted by next_request position, used when - * determining if two or more queues have interleaving - * requests (see bfq_find_close_cooperator()). * * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup * there is a set of bfq_groups, each one collecting the lower-level @@ -742,8 +701,6 @@ struct bfq_group { int active_entities; - struct rb_root rq_pos_tree; - struct bfqg_stats stats; struct bfqg_stats dead_stats; /* stats pushed from dead children */ }; @@ -754,8 +711,6 @@ struct bfq_group { struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; struct bfq_queue *async_idle_bfqq; - - struct rb_root rq_pos_tree; }; #endif @@ -832,27 +787,6 @@ static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - -static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - if (!group_entity) - group_entity = &bfqq->bfqd->root_group->entity; - - return container_of(group_entity, struct bfq_group, entity); -} - -#else - -static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - return bfqq->bfqd->root_group; -} - -#endif - static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); static void bfq_put_queue(struct bfq_queue *bfqq); static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); From e6264a7992a901296a69531b9f6a99c076ba79ad Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:40 +0800 Subject: [PATCH 41/59] Revert "block: introduce the BFQ-v7r11 I/O sched for 4.5.0" This reverts commit 109bc60a495f604e66cfc7f70a79d422ec4370df. 
--- block/Kconfig.iosched | 6 +- block/bfq-cgroup.c | 1186 ------------- block/bfq-ioc.c | 36 - block/bfq-iosched.c | 3763 ----------------------------------------- block/bfq-sched.c | 1199 ------------- block/bfq.h | 801 --------- 6 files changed, 4 insertions(+), 6987 deletions(-) delete mode 100644 block/bfq-cgroup.c delete mode 100644 block/bfq-ioc.c delete mode 100644 block/bfq-iosched.c delete mode 100644 block/bfq-sched.c delete mode 100644 block/bfq.h diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index a29d749bbfe6..9e25d45a4c33 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -51,12 +51,14 @@ config IOSCHED_BFQ applications. If compiled built-in (saying Y here), BFQ can be configured to support hierarchical scheduling. -config BFQ_GROUP_IOSCHED +config CGROUP_BFQIO bool "BFQ hierarchical scheduling support" depends on CGROUPS && IOSCHED_BFQ=y default n ---help--- - Enable hierarchical scheduling in BFQ, using the blkio controller. + Enable hierarchical scheduling in BFQ, using the cgroups + filesystem interface. The name of the subsystem will be + bfqio. choice prompt "Default I/O scheduler" diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c deleted file mode 100644 index 8b08a5758565..000000000000 --- a/block/bfq-cgroup.c +++ /dev/null @@ -1,1186 +0,0 @@ -/* - * BFQ: CGROUPS support. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2010 Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ - * file. 
- */ - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - -/* bfqg stats flags */ -enum bfqg_stats_flags { - BFQG_stats_waiting = 0, - BFQG_stats_idling, - BFQG_stats_empty, -}; - -#define BFQG_FLAG_FNS(name) \ -static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -{ \ - stats->flags |= (1 << BFQG_stats_##name); \ -} \ -static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -{ \ - stats->flags &= ~(1 << BFQG_stats_##name); \ -} \ -static int bfqg_stats_##name(struct bfqg_stats *stats) \ -{ \ - return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -} \ - -BFQG_FLAG_FNS(waiting) -BFQG_FLAG_FNS(idling) -BFQG_FLAG_FNS(empty) -#undef BFQG_FLAG_FNS - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -{ - unsigned long long now; - - if (!bfqg_stats_waiting(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) - blkg_stat_add(&stats->group_wait_time, - now - stats->start_group_wait_time); - bfqg_stats_clear_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (bfqg_stats_waiting(stats)) - return; - if (bfqg == curr_bfqg) - return; - stats->start_group_wait_time = sched_clock(); - bfqg_stats_mark_waiting(stats); -} - -/* This should be called with the queue_lock held. 
*/ -static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -{ - unsigned long long now; - - if (!bfqg_stats_empty(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) - blkg_stat_add(&stats->empty_time, - now - stats->start_empty_time); - bfqg_stats_clear_empty(stats); -} - -static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -{ - blkg_stat_add(&bfqg->stats.dequeue, 1); -} - -static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (blkg_rwstat_total(&stats->queued)) - return; - - /* - * group is already marked empty. This can happen if bfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. - */ - if (bfqg_stats_empty(stats)) - return; - - stats->start_empty_time = sched_clock(); - bfqg_stats_mark_empty(stats); -} - -static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (bfqg_stats_idling(stats)) { - unsigned long long now = sched_clock(); - - if (time_after64(now, stats->start_idle_time)) - blkg_stat_add(&stats->idle_time, - now - stats->start_idle_time); - bfqg_stats_clear_idling(stats); - } -} - -static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - stats->start_idle_time = sched_clock(); - bfqg_stats_mark_idling(stats); -} - -static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - blkg_stat_add(&stats->avg_queue_size_sum, - blkg_rwstat_total(&stats->queued)); - blkg_stat_add(&stats->avg_queue_size_samples, 1); - bfqg_stats_update_group_wait_time(stats); -} - -static struct blkcg_policy blkcg_policy_bfq; - -/* - * blk-cgroup policy-related handlers - * The following functions help in converting between blk-cgroup - * internal structures and BFQ-specific structures. 
- */ - -static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct bfq_group, pd) : NULL; -} - -static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) -{ - return pd_to_blkg(&bfqg->pd); -} - -static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) -{ - struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); - - BUG_ON(!pd); - - return pd_to_bfqg(pd); -} - -/* - * bfq_group handlers - * The following functions help in navigating the bfq_group hierarchy - * by allowing to find the parent of a bfq_group or the bfq_group - * associated to a bfq_queue. - */ - -static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) -{ - struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; - - return pblkg ? blkg_to_bfqg(pblkg) : NULL; -} - -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - return group_entity ? container_of(group_entity, struct bfq_group, - entity) : - bfqq->bfqd->root_group; -} - -/* - * The following two functions handle get and put of a bfq_group by - * wrapping the related blk-cgroup hooks. 
- */ - -static void bfqg_get(struct bfq_group *bfqg) -{ - return blkg_get(bfqg_to_blkg(bfqg)); -} - -static void bfqg_put(struct bfq_group *bfqg) -{ - return blkg_put(bfqg_to_blkg(bfqg)); -} - -static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, - int rw) -{ - blkg_rwstat_add(&bfqg->stats.queued, rw, 1); - bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) - bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -} - -static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) -{ - blkg_rwstat_add(&bfqg->stats.queued, rw, -1); -} - -static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) -{ - blkg_rwstat_add(&bfqg->stats.merged, rw, 1); -} - -static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); -} - -static void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, int rw) -{ - struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); - if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, rw, - io_start_time - start_time); -} - -/* @stats = 0 */ -static void bfqg_stats_reset(struct bfqg_stats *stats) -{ - if (!stats) - return; - - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); - blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); - blkg_stat_reset(&stats->unaccounted_time); - blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); 
- blkg_stat_reset(&stats->dequeue); - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -} - -/* @to += @from */ -static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) -{ - if (!to || !from) - return; - - /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); - blkg_rwstat_add_aux(&to->serviced, &from->serviced); - blkg_rwstat_add_aux(&to->merged, &from->merged); - blkg_rwstat_add_aux(&to->service_time, &from->service_time); - blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); - blkg_stat_add_aux(&from->time, &from->time); - blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_add_aux(&to->avg_queue_size_samples, - &from->avg_queue_size_samples); - blkg_stat_add_aux(&to->dequeue, &from->dequeue); - blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); - blkg_stat_add_aux(&to->idle_time, &from->idle_time); - blkg_stat_add_aux(&to->empty_time, &from->empty_time); -} - -/* - * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' - * recursive stats can still account for the amount used by this bfqg after - * it's gone. 
- */ -static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) -{ - struct bfq_group *parent; - - if (!bfqg) /* root_group */ - return; - - parent = bfqg_parent(bfqg); - - lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); - - if (unlikely(!parent)) - return; - - bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); - bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); - bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); -} - -static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; - bfqg_get(bfqg); - } - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; -} - -static void bfqg_stats_exit(struct bfqg_stats *stats) -{ - blkg_rwstat_exit(&stats->service_bytes); - blkg_rwstat_exit(&stats->serviced); - blkg_rwstat_exit(&stats->merged); - blkg_rwstat_exit(&stats->service_time); - blkg_rwstat_exit(&stats->wait_time); - blkg_rwstat_exit(&stats->queued); - blkg_stat_exit(&stats->sectors); - blkg_stat_exit(&stats->time); - blkg_stat_exit(&stats->unaccounted_time); - blkg_stat_exit(&stats->avg_queue_size_sum); - blkg_stat_exit(&stats->avg_queue_size_samples); - blkg_stat_exit(&stats->dequeue); - blkg_stat_exit(&stats->group_wait_time); - blkg_stat_exit(&stats->idle_time); - blkg_stat_exit(&stats->empty_time); -} - -static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) -{ - if (blkg_rwstat_init(&stats->service_bytes, gfp) || - blkg_rwstat_init(&stats->serviced, gfp) || - blkg_rwstat_init(&stats->merged, gfp) || - blkg_rwstat_init(&stats->service_time, gfp) || - blkg_rwstat_init(&stats->wait_time, gfp) || - blkg_rwstat_init(&stats->queued, gfp) || - blkg_stat_init(&stats->sectors, gfp) || - blkg_stat_init(&stats->time, gfp) || - 
blkg_stat_init(&stats->unaccounted_time, gfp) || - blkg_stat_init(&stats->avg_queue_size_sum, gfp) || - blkg_stat_init(&stats->avg_queue_size_samples, gfp) || - blkg_stat_init(&stats->dequeue, gfp) || - blkg_stat_init(&stats->group_wait_time, gfp) || - blkg_stat_init(&stats->idle_time, gfp) || - blkg_stat_init(&stats->empty_time, gfp)) { - bfqg_stats_exit(stats); - return -ENOMEM; - } - - return 0; -} - -static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -{ - return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; -} - -static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -{ - return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -} - -static void bfq_cpd_init(struct blkcg_policy_data *cpd) -{ - struct bfq_group_data *d = cpd_to_bfqgd(cpd); - - d->weight = BFQ_DEFAULT_GRP_WEIGHT; -} - -static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -{ - struct bfq_group *bfqg; - - bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); - if (!bfqg) - return NULL; - - if (bfqg_stats_init(&bfqg->stats, gfp) || - bfqg_stats_init(&bfqg->dead_stats, gfp)) { - kfree(bfqg); - return NULL; - } - - return &bfqg->pd; -} - -static void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) -{ - struct bfq_entity *entity; - - BUG_ON(!parent); - BUG_ON(!bfqg); - BUG_ON(bfqg == parent); - - entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; -} - -static void bfq_pd_init(struct blkg_policy_data *pd) -{ - struct blkcg_gq *blkg = pd_to_blkg(pd); - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - struct bfq_data *bfqd = blkg->q->elevator->elevator_data; - struct bfq_entity *entity = &bfqg->entity; - struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); - - entity->orig_weight = entity->weight = entity->new_weight = d->weight; - entity->my_sched_data = &bfqg->sched_data; - bfqg->my_entity = entity; /* - * the root_group's will be set to NULL - * in bfq_init_queue() - 
*/ - bfqg->bfqd = bfqd; - bfqg->active_entities = 0; -} - -static void bfq_pd_free(struct blkg_policy_data *pd) -{ - struct bfq_group *bfqg = pd_to_bfqg(pd); - - bfqg_stats_exit(&bfqg->stats); - bfqg_stats_exit(&bfqg->dead_stats); - - return kfree(bfqg); -} - -/* offset delta from bfqg->stats to bfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - - offsetof(struct bfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -{ - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); - sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, - off + dead_stats_off_delta); - return sum; -} - -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat -bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat a, b; - - a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); - b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, - off + dead_stats_off_delta); - blkg_rwstat_add_aux(&a, &b); - return a; -} - -static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -{ - struct bfq_group *bfqg = pd_to_bfqg(pd); - - bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); -} - -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - struct request_queue *q = bfqd->queue; - struct bfq_group *bfqg = NULL, *parent; - struct bfq_entity *entity = NULL; - - assert_spin_locked(bfqd->queue->queue_lock); - - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - bfqg = bfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - bfqg = blkg_to_bfqg(blkg); - else /* fallback to root_group */ - bfqg = bfqd->root_group; - } - - 
BUG_ON(!bfqg); - - /* - * Update chain of bfq_groups as we might be handling a leaf group - * which, along with some of its relatives, has not been hooked yet - * to the private hierarchy of BFQ. - */ - entity = &bfqg->entity; - for_each_entity(entity) { - bfqg = container_of(entity, struct bfq_group, entity); - BUG_ON(!bfqg); - if (bfqg != bfqd->root_group) { - parent = bfqg_parent(bfqg); - if (!parent) - parent = bfqd->root_group; - BUG_ON(!parent); - bfq_group_set_parent(bfqg, parent); - } - } - - return bfqg; -} - -/** - * bfq_bfqq_move - migrate @bfqq to @bfqg. - * @bfqd: queue descriptor. - * @bfqq: the queue to move. - * @entity: @bfqq's entity. - * @bfqg: the group to move to. - * - * Move @bfqq to @bfqg, deactivating it from its old group and reactivating - * it on the new one. Avoid putting the entity on the old group idle tree. - * - * Must be called under the queue lock; the cgroup owning @bfqg must - * not disappear (by now this just means that we are called under - * rcu_read_lock()). - */ -static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_entity *entity, struct bfq_group *bfqg) -{ - int busy, resume; - - busy = bfq_bfqq_busy(bfqq); - resume = !RB_EMPTY_ROOT(&bfqq->sort_list); - - BUG_ON(resume && !entity->on_st); - BUG_ON(busy && !resume && entity->on_st && - bfqq != bfqd->in_service_queue); - - if (busy) { - BUG_ON(atomic_read(&bfqq->ref) < 2); - - if (!resume) - bfq_del_bfqq_busy(bfqd, bfqq, 0); - else - bfq_deactivate_bfqq(bfqd, bfqq, 0); - } else if (entity->on_st) - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - bfqg_put(bfqq_group(bfqq)); - - /* - * Here we use a reference to bfqg. We don't need a refcounter - * as the cgroup reference will not be dropped, so that its - * destroy() callback will not be invoked. 
- */ - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; - bfqg_get(bfqg); - - if (busy) { - if (resume) - bfq_activate_bfqq(bfqd, bfqq); - } - - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); -} - -/** - * __bfq_bic_change_cgroup - move @bic to @cgroup. - * @bfqd: the queue descriptor. - * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. - * - * Move bic to blkcg, assuming that bfqd->queue is locked; the caller - * has to make sure that the reference to cgroup is valid across the call. - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup - * time here, at the price of slightly more complex code. - */ -static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct blkcg *blkcg) -{ - struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); - struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_group *bfqg; - struct bfq_entity *entity; - - lockdep_assert_held(bfqd->queue->queue_lock); - - bfqg = bfq_find_alloc_group(bfqd, blkcg); - if (async_bfqq) { - entity = &async_bfqq->entity; - - if (entity->sched_data != &bfqg->sched_data) { - bic_set_bfqq(bic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, - "bic_change_group: %p %d", - async_bfqq, atomic_read(&async_bfqq->ref)); - bfq_put_queue(async_bfqq); - } - } - - if (sync_bfqq) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); - } - - return bfqg; -} - -static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -{ - struct bfq_data *bfqd = bic_to_bfqd(bic); - struct blkcg *blkcg; - struct bfq_group *bfqg = NULL; - uint64_t id; - - rcu_read_lock(); - blkcg = bio_blkcg(bio); - id = blkcg->css.serial_nr; - rcu_read_unlock(); - - /* - * Check whether blkcg has changed. 
The condition may trigger - * spuriously on a newly created cic but there's no harm. - */ - if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) - return; - - bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); - BUG_ON(!bfqg); - bic->blkcg_id = id; -} - -/** - * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. - * @st: the service tree being flushed. - */ -static void bfq_flush_idle_tree(struct bfq_service_tree *st) -{ - struct bfq_entity *entity = st->first_idle; - - for (; entity ; entity = st->first_idle) - __bfq_deactivate_entity(entity, 0); -} - -/** - * bfq_reparent_leaf_entity - move leaf entity to the root_group. - * @bfqd: the device data structure with the root group. - * @entity: the entity to move. - */ -static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(!bfqq); - bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); -} - -/** - * bfq_reparent_active_entities - move to the root group all active - * entities. - * @bfqd: the device data structure with the root group. - * @bfqg: the group to move from. - * @st: the service tree with the entities. - * - * Needs queue_lock to be taken and reference to be valid over the call. - */ -static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) -{ - struct rb_root *active = &st->active; - struct bfq_entity *entity = NULL; - - if (!RB_EMPTY_ROOT(&st->active)) - entity = bfq_entity_of(rb_first(active)); - - for (; entity ; entity = bfq_entity_of(rb_first(active))) - bfq_reparent_leaf_entity(bfqd, entity); - - if (bfqg->sched_data.in_service_entity) - bfq_reparent_leaf_entity(bfqd, - bfqg->sched_data.in_service_entity); -} - -/** - * bfq_destroy_group - destroy @bfqg. - * @bfqg: the group being destroyed. - * - * Destroy @bfqg, making sure that it is not referenced from its parent. 
- * blkio already grabs the queue_lock for us, so no need to use RCU-based magic - */ -static void bfq_pd_offline(struct blkg_policy_data *pd) -{ - struct bfq_service_tree *st; - struct bfq_group *bfqg; - struct bfq_data *bfqd; - struct bfq_entity *entity; - int i; - - BUG_ON(!pd); - bfqg = pd_to_bfqg(pd); - BUG_ON(!bfqg); - bfqd = bfqg->bfqd; - BUG_ON(bfqd && !bfqd->root_group); - - entity = bfqg->my_entity; - - if (!entity) /* root group */ - return; - - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself. - */ - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { - BUG_ON(!bfqg->sched_data.service_tree); - st = bfqg->sched_data.service_tree + i; - /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different - * cgroup from the one being destroyed now. No one else - * can access them so it's safe to act without any lock. - */ - bfq_flush_idle_tree(st); - - /* - * It may happen that some queues are still active - * (busy) upon group destruction (if the corresponding - * processes have been forced to terminate). We move - * all the leaf entities corresponding to these queues - * to the root_group. - * Also, it may happen that the group has an entity - * in service, which is disconnected from the active - * tree: it must be moved, too. - * There is no need to put the sync queues, as the - * scheduler has taken no reference. 
- */ - bfq_reparent_active_entities(bfqd, bfqg, st); - BUG_ON(!RB_EMPTY_ROOT(&st->active)); - BUG_ON(!RB_EMPTY_ROOT(&st->idle)); - } - BUG_ON(bfqg->sched_data.next_in_service); - BUG_ON(bfqg->sched_data.in_service_entity); - - __bfq_deactivate_entity(entity, 0); - bfq_put_async_queues(bfqd, bfqg); - BUG_ON(entity->tree); - - bfqg_stats_xfer_dead(bfqg); -} - -static void bfq_end_wr_async(struct bfq_data *bfqd) -{ - struct blkcg_gq *blkg; - - list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - - bfq_end_wr_async_queues(bfqd, bfqg); - } - bfq_end_wr_async_queues(bfqd, bfqd->root_group); -} - -static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, - struct cftype *cftype) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - int ret = -EINVAL; - - spin_lock_irq(&blkcg->lock); - ret = bfqgd->weight; - spin_unlock_irq(&blkcg->lock); - - return ret; -} - -static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) -{ - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - - spin_lock_irq(&blkcg->lock); - seq_printf(sf, "%u\n", bfqgd->weight); - spin_unlock_irq(&blkcg->lock); - - return 0; -} - -static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, - struct cftype *cftype, - u64 val) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - struct blkcg_gq *blkg; - int ret = -EINVAL; - - if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) - return ret; - - ret = 0; - spin_lock_irq(&blkcg->lock); - bfqgd->weight = (unsigned short)val; - hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - - if (!bfqg) - continue; - /* - * Setting the prio_changed flag of the entity - * to 1 with new_weight == weight would re-set - * the value of the weight to its ioprio mapping. 
- * Set the flag only if necessary. - */ - if ((unsigned short)val != bfqg->entity.new_weight) { - bfqg->entity.new_weight = (unsigned short)val; - /* - * Make sure that the above new value has been - * stored in bfqg->entity.new_weight before - * setting the prio_changed flag. In fact, - * this flag may be read asynchronously (in - * critical sections protected by a different - * lock than that held here), and finding this - * flag set may cause the execution of the code - * for updating parameters whose value may - * depend also on bfqg->entity.new_weight (in - * __bfq_entity_update_weight_prio). - * This barrier makes sure that the new value - * of bfqg->entity.new_weight is correctly - * seen in that code. - */ - smp_wmb(); - bfqg->entity.prio_changed = 1; - } - } - spin_unlock_irq(&blkcg->lock); - - return ret; -} - -static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - /* First unsigned long found in the file is used */ - return bfqio_cgroup_weight_write(of_css(of), NULL, - simple_strtoull(strim(buf), NULL, 0)); -} - -static int bfqg_print_stat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_bfq, seq_cft(sf)->private, false); - return 0; -} - -static int bfqg_print_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_bfq, seq_cft(sf)->private, true); - return 0; -} - -static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - u64 sum = bfqg_stat_pd_recursive_sum(pd, off); - - return __blkg_prfill_u64(sf, pd, sum); -} - -static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); - - return __blkg_prfill_rwstat(sf, pd, &sum); -} - -static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -{ 
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_stat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, false); - return 0; -} - -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, true); - return 0; -} - -static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct bfq_group *bfqg = pd_to_bfqg(pd); - u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); - u64 v = 0; - - if (samples) { - v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); - v = div64_u64(v, samples); - } - __blkg_prfill_u64(sf, pd, v); - return 0; -} - -/* print avg_queue_size */ -static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, - 0, false); - return 0; -} - -static struct bfq_group * -bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -{ - int ret; - - ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); - if (ret) - return NULL; - - return blkg_to_bfqg(bfqd->queue->root_blkg); -} - -static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -{ - struct bfq_group_data *bgd; - - bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); - if (!bgd) - return NULL; - return &bgd->pd; -} - -static void bfq_cpd_free(struct blkcg_policy_data *cpd) -{ - kfree(cpd_to_bfqgd(cpd)); -} - -static struct cftype bfqio_files_dfl[] = { - { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfqio_cgroup_weight_read_dfl, - .write = bfqio_cgroup_weight_write_dfl, - }, - {} /* terminate */ -}; - -static struct cftype bfqio_files[] = { - { - .name = "bfq.weight", - .read_u64 = bfqio_cgroup_weight_read, - .write_u64 = bfqio_cgroup_weight_write, - }, - /* statistics, cover only the tasks in the bfqg */ - { - .name = "bfq.time", - .private = offsetof(struct 
bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.sectors", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.io_service_bytes", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_serviced", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_service_time", - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_wait_time", - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_merged", - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_queued", - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, - - /* the same statictics which cover the bfqg and its descendants */ - { - .name = "bfq.time_recursive", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = "bfq.sectors_recursive", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = "bfq.io_service_bytes_recursive", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_serviced_recursive", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_service_time_recursive", - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_wait_time_recursive", - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_merged_recursive", - .private = offsetof(struct 
bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_queued_recursive", - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.avg_queue_size", - .seq_show = bfqg_print_avg_queue_size, - }, - { - .name = "bfq.group_wait_time", - .private = offsetof(struct bfq_group, stats.group_wait_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.idle_time", - .private = offsetof(struct bfq_group, stats.idle_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.empty_time", - .private = offsetof(struct bfq_group, stats.empty_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.dequeue", - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.unaccounted_time", - .private = offsetof(struct bfq_group, stats.unaccounted_time), - .seq_show = bfqg_print_stat, - }, - { } /* terminate */ -}; - -static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfqio_files_dfl, - .legacy_cftypes = bfqio_files, - - .pd_alloc_fn = bfq_pd_alloc, - .pd_init_fn = bfq_pd_init, - .pd_offline_fn = bfq_pd_offline, - .pd_free_fn = bfq_pd_free, - .pd_reset_stats_fn = bfq_pd_reset_stats, - - .cpd_alloc_fn = bfq_cpd_alloc, - .cpd_init_fn = bfq_cpd_init, - .cpd_bind_fn = bfq_cpd_init, - .cpd_free_fn = bfq_cpd_free, -}; - -#else - -static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; - } - entity->sched_data = &bfqg->sched_data; -} - -static struct bfq_group * -bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -{ - struct bfq_data *bfqd = bic_to_bfqd(bic); - - return bfqd->root_group; -} - -static void bfq_bfqq_move(struct bfq_data *bfqd, - 
struct bfq_queue *bfqq, - struct bfq_entity *entity, - struct bfq_group *bfqg) -{ -} - -static void bfq_end_wr_async(struct bfq_data *bfqd) -{ - bfq_end_wr_async_queues(bfqd, bfqd->root_group); -} - -static void bfq_disconnect_groups(struct bfq_data *bfqd) -{ - bfq_put_async_queues(bfqd, bfqd->root_group); -} - -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - return bfqd->root_group; -} - -static struct bfq_group * -bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -{ - struct bfq_group *bfqg; - int i; - - bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (!bfqg) - return NULL; - - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - - return bfqg; -} -#endif diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c deleted file mode 100644 index fb7bb8f08b75..000000000000 --- a/block/bfq-ioc.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * BFQ: I/O context handling. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2010 Paolo Valente - */ - -/** - * icq_to_bic - convert iocontext queue structure to bfq_io_cq. - * @icq: the iocontext queue. - */ -static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -{ - /* bic->icq is the first member, %NULL will convert to %NULL */ - return container_of(icq, struct bfq_io_cq, icq); -} - -/** - * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. - * @bfqd: the lookup key. - * @ioc: the io_context of the process doing I/O. - * - * Queue lock must be held. 
- */ -static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc) -{ - if (ioc) - return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); - return NULL; -} diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c deleted file mode 100644 index 85e216905a5d..000000000000 --- a/block/bfq-iosched.c +++ /dev/null @@ -1,3763 +0,0 @@ -/* - * Budget Fair Queueing (BFQ) disk scheduler. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2010 Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ - * file. - * - * BFQ is a proportional-share storage-I/O scheduling algorithm based on - * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, - * measured in number of sectors, to processes instead of time slices. The - * device is not granted to the in-service process for a given time slice, - * but until it has exhausted its assigned budget. This change from the time - * to the service domain allows BFQ to distribute the device throughput - * among processes as desired, without any distortion due to ZBR, workload - * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, - * called B-WF2Q+, to schedule processes according to their budgets. More - * precisely, BFQ schedules queues associated to processes. Thanks to the - * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to - * I/O-bound processes issuing sequential requests (to boost the - * throughput), and yet guarantee a low latency to interactive and soft - * real-time applications. - * - * BFQ is described in [1], where also a reference to the initial, more - * theoretical paper on BFQ can be found. The interested reader can find - * in the latter paper full details on the main algorithm, as well as - * formulas of the guarantees and formal proofs of all the properties. 
- * With respect to the version of BFQ presented in these papers, this - * implementation adds a few more heuristics, such as the one that - * guarantees a low latency to soft real-time applications, and a - * hierarchical extension based on H-WF2Q+. - * - * B-WF2Q+ is based on WF2Q+, that is described in [2], together with - * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) - * complexity derives from the one introduced with EEVDF in [3]. - * - * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness - * with the BFQ Disk I/O Scheduler'', - * Proceedings of the 5th Annual International Systems and Storage - * Conference (SYSTOR '12), June 2012. - * - * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf - * - * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing - * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, - * Oct 1997. - * - * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz - * - * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline - * First: A Flexible and Accurate Mechanism for Proportional Share - * Resource Allocation,'' technical report. - * - * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "bfq.h" -#include "blk.h" - -/* Expiration time of sync (0) and async (1) requests, in jiffies. */ -static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; - -/* Maximum backwards seek, in KiB. */ -static const int bfq_back_max = 16 * 1024; - -/* Penalty of a backwards seek, in number of sectors. */ -static const int bfq_back_penalty = 2; - -/* Idling period duration, in jiffies. */ -static int bfq_slice_idle = HZ / 125; - -/* Minimum number of assigned budgets for which stats are safe to compute. */ -static const int bfq_stats_min_budgets = 194; - -/* Default maximum budget values, in sectors and number of requests. 
*/ -static const int bfq_default_max_budget = 16 * 1024; -static const int bfq_max_budget_async_rq = 4; - -/* - * Async to sync throughput distribution is controlled as follows: - * when an async request is served, the entity is charged the number - * of sectors of the request, multiplied by the factor below - */ -static const int bfq_async_charge_factor = 10; - -/* Default timeout values, in jiffies, approximating CFQ defaults. */ -static const int bfq_timeout_sync = HZ / 8; -static int bfq_timeout_async = HZ / 25; - -struct kmem_cache *bfq_pool; - -/* Below this threshold (in ms), we consider thinktime immediate. */ -#define BFQ_MIN_TT 2 - -/* hw_tag detection: parallel requests threshold and min samples needed. */ -#define BFQ_HW_QUEUE_THRESHOLD 4 -#define BFQ_HW_QUEUE_SAMPLES 32 - -#define BFQQ_SEEK_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) - -/* Min samples used for peak rate estimation (for autotuning). */ -#define BFQ_PEAK_RATE_SAMPLES 32 - -/* Shift used for peak rate fixed precision calculations. */ -#define BFQ_RATE_SHIFT 16 - -/* - * By default, BFQ computes the duration of the weight raising for - * interactive applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and - * R and T are two reference parameters. - * In particular, R is the peak rate of the reference device (see below), - * and T is a reference time: given the systems that are likely to be - * installed on the reference device according to its speed class, T is - * about the maximum time needed, under BFQ and while reading two files in - * parallel, to load typical large applications on these systems. - * In practice, the slower/faster the device at hand is, the more/less it - * takes to load applications with respect to the reference device. - * Accordingly, the longer/shorter BFQ grants weight raising to interactive - * applications. 
- * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; - * . whether the device is slow, such as old or portable HDDs, as well as - * SD cards, or fast, such as newer HDDs and SSDs. - * - * The device's speed class is dynamically (re)detected in - * bfq_update_peak_rate() every time the estimated peak rate is updated. - * - * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] - * are the reference values for a slow/fast rotational device, whereas - * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for - * a slow/fast non-rotational device. Finally, device_speed_thresh are the - * thresholds used to switch between speed classes. - * Both the reference peak rates and the thresholds are measured in - * sectors/usec, left-shifted by BFQ_RATE_SHIFT. - */ -static int R_slow[2] = {1536, 10752}; -static int R_fast[2] = {17415, 34791}; -/* - * To improve readability, a conversion function is used to initialize the - * following arrays, which entails that they can be initialized only in a - * function. - */ -static int T_slow[2]; -static int T_fast[2]; -static int device_speed_thresh[2]; - -#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - -static void bfq_schedule_dispatch(struct bfq_data *bfqd); - -#include "bfq-ioc.c" -#include "bfq-sched.c" -#include "bfq-cgroup.c" - -#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) - -#define bfq_sample_valid(samples) ((samples) > 80) - -/* - * We regard a request as SYNC, if either it's a read or has the SYNC bit - * set (in which case it could also be a direct WRITE). 
- */ -static int bfq_bio_sync(struct bio *bio) -{ - if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) - return 1; - - return 0; -} - -/* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. - */ -static void bfq_schedule_dispatch(struct bfq_data *bfqd) -{ - if (bfqd->queued != 0) { - bfq_log(bfqd, "schedule dispatch"); - kblockd_schedule_work(&bfqd->unplug_work); - } -} - -/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closesr to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. - */ -static struct request *bfq_choose_req(struct bfq_data *bfqd, - struct request *rq1, - struct request *rq2, - sector_t last) -{ - sector_t s1, s2, d1 = 0, d2 = 0; - unsigned long back_max; -#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ - unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ - - if (!rq1 || rq1 == rq2) - return rq2; - if (!rq2) - return rq1; - - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) - return rq1; - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) - return rq2; - if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) - return rq1; - else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) - return rq2; - - s1 = blk_rq_pos(rq1); - s2 = blk_rq_pos(rq2); - - /* - * By definition, 1KiB is 2 sectors. - */ - back_max = bfqd->bfq_back_max * 2; - - /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. 
- */ - if (s1 >= last) - d1 = s1 - last; - else if (s1 + back_max >= last) - d1 = (last - s1) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ1_WRAP; - - if (s2 >= last) - d2 = s2 - last; - else if (s2 + back_max >= last) - d2 = (last - s2) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ2_WRAP; - - /* Found required data */ - - /* - * By doing switch() on the bit mask "wrap" we avoid having to - * check two variables for all permutations: --> faster! - */ - switch (wrap) { - case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ - if (d1 < d2) - return rq1; - else if (d2 < d1) - return rq2; - - if (s1 >= s2) - return rq1; - else - return rq2; - - case BFQ_RQ2_WRAP: - return rq1; - case BFQ_RQ1_WRAP: - return rq2; - case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ - default: - /* - * Since both rqs are wrapped, - * start with the one that's further behind head - * (--> only *one* back seek required), - * since back seek takes more time than forward. - */ - if (s1 <= s2) - return rq1; - else - return rq2; - } -} - -/* - * Tell whether there are active queues or groups with differentiated weights. - */ -static bool bfq_differentiated_weights(struct bfq_data *bfqd) -{ - /* - * For weights to differ, at least one of the trees must contain - * at least two nodes. - */ - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && - (bfqd->queue_weights_tree.rb_node->rb_left || - bfqd->queue_weights_tree.rb_node->rb_right) -#ifdef CONFIG_BFQ_GROUP_IOSCHED - ) || - (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && - (bfqd->group_weights_tree.rb_node->rb_left || - bfqd->group_weights_tree.rb_node->rb_right) -#endif - ); -} - -/* - * The following function returns true if every queue must receive the - * same share of the throughput (this condition is used when deciding - * whether idling may be disabled, see the comments in the function - * bfq_bfqq_may_idle()). 
- * - * Such a scenario occurs when: - * 1) all active queues have the same weight, - * 2) all active groups at the same level in the groups tree have the same - * weight, - * 3) all active groups at the same level in the groups tree have the same - * number of children. - * - * Unfortunately, keeping the necessary state for evaluating exactly the - * above symmetry conditions would be quite complex and time-consuming. - * Therefore this function evaluates, instead, the following stronger - * sub-conditions, for which it is much easier to maintain the needed - * state: - * 1) all active queues have the same weight, - * 2) all active groups have the same weight, - * 3) all active groups have at most one active child each. - * In particular, the last two conditions are always true if hierarchical - * support and the cgroups interface are not enabled, thus no state needs - * to be maintained in this case. - */ -static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -{ - return -#ifdef CONFIG_BFQ_GROUP_IOSCHED - !bfqd->active_numerous_groups && -#endif - !bfq_differentiated_weights(bfqd); -} - -/* - * If the weight-counter tree passed as input contains no counter for - * the weight of the input entity, then add that counter; otherwise just - * increment the existing counter. - * - * Note that weight-counter trees contain few nodes in mostly symmetric - * scenarios. For example, if all queues have the same weight, then the - * weight-counter tree for the queues may contain at most one node. - * This holds even if low_latency is on, because weight-raised queues - * are not inserted in the tree. - * In most scenarios, the rate at which nodes are created/destroyed - * should be low too. 
- */ -static void bfq_weights_tree_add(struct bfq_data *bfqd, - struct bfq_entity *entity, - struct rb_root *root) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - - /* - * Do not insert if the entity is already associated with a - * counter, which happens if: - * 1) the entity is associated with a queue, - * 2) a request arrival has caused the queue to become both - * non-weight-raised, and hence change its weight, and - * backlogged; in this respect, each of the two events - * causes an invocation of this function, - * 3) this is the invocation of this function caused by the - * second event. This second invocation is actually useless, - * and we handle this fact by exiting immediately. More - * efficient or clearer solutions might possibly be adopted. - */ - if (entity->weight_counter) - return; - - while (*new) { - struct bfq_weight_counter *__counter = container_of(*new, - struct bfq_weight_counter, - weights_node); - parent = *new; - - if (entity->weight == __counter->weight) { - entity->weight_counter = __counter; - goto inc_counter; - } - if (entity->weight < __counter->weight) - new = &((*new)->rb_left); - else - new = &((*new)->rb_right); - } - - entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), - GFP_ATOMIC); - entity->weight_counter->weight = entity->weight; - rb_link_node(&entity->weight_counter->weights_node, parent, new); - rb_insert_color(&entity->weight_counter->weights_node, root); - -inc_counter: - entity->weight_counter->num_active++; -} - -/* - * Decrement the weight counter associated with the entity, and, if the - * counter reaches 0, remove the counter from the tree. - * See the comments to the function bfq_weights_tree_add() for considerations - * about overhead. 
- */ -static void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_entity *entity, - struct rb_root *root) -{ - if (!entity->weight_counter) - return; - - BUG_ON(RB_EMPTY_ROOT(root)); - BUG_ON(entity->weight_counter->weight != entity->weight); - - BUG_ON(!entity->weight_counter->num_active); - entity->weight_counter->num_active--; - if (entity->weight_counter->num_active > 0) - goto reset_entity_pointer; - - rb_erase(&entity->weight_counter->weights_node, root); - kfree(entity->weight_counter); - -reset_entity_pointer: - entity->weight_counter = NULL; -} - -static struct request *bfq_find_next_rq(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *last) -{ - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev) - prev = rb_entry_rq(rbprev); - - if (rbnext) - next = rb_entry_rq(rbnext); - else { - rbnext = rb_first(&bfqq->sort_list); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } - - return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -} - -/* see the definition of bfq_async_charge_factor for details */ -static unsigned long bfq_serv_to_charge(struct request *rq, - struct bfq_queue *bfqq) -{ - return blk_rq_sectors(rq) * - (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * - bfq_async_charge_factor)); -} - -/** - * bfq_updated_next_req - update the queue after a new next_rq selection. - * @bfqd: the device data the queue belongs to. - * @bfqq: the queue to update. - * - * If the first request of a queue changes we make sure that the queue - * has enough budget to serve at least its first request (if the - * request has grown). We do this because if the queue has not enough - * budget for its first request, it has to go through two dispatch - * rounds to actually get it dispatched. 
- */ -static void bfq_updated_next_req(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - struct request *next_rq = bfqq->next_rq; - unsigned long new_budget; - - if (!next_rq) - return; - - if (bfqq == bfqd->in_service_queue) - /* - * In order not to break guarantees, budgets cannot be - * changed after an entity has been selected. - */ - return; - - BUG_ON(entity->tree != &st->active); - BUG_ON(entity == entity->sched_data->in_service_entity); - - new_budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - if (entity->budget != new_budget) { - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", - new_budget); - bfq_activate_bfqq(bfqd, bfqq); - } -} - -static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -{ - u64 dur; - - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - - return dur; -} - -/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_queue *item; - struct hlist_node *n; - - hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) - hlist_del_init(&item->burst_list_node); - hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); - bfqd->burst_size = 1; -} - -/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - - if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { - struct bfq_queue *pos, *bfqq_item; - struct hlist_node *n; - - /* - * Enough queues have been activated shortly after each - * other to consider this burst as large. 
- */ - bfqd->large_burst = true; - - /* - * We can now mark all queues in the burst list as - * belonging to a large burst. - */ - hlist_for_each_entry(bfqq_item, &bfqd->burst_list, - burst_list_node) - bfq_mark_bfqq_in_large_burst(bfqq_item); - bfq_mark_bfqq_in_large_burst(bfqq); - - /* - * From now on, and until the current burst finishes, any - * new queue being activated shortly after the last queue - * was inserted in the burst can be immediately marked as - * belonging to a large burst. So the burst list is not - * needed any more. Remove it. - */ - hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, - burst_list_node) - hlist_del_init(&pos->burst_list_node); - } else /* burst not yet large: add bfqq to the burst list */ - hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -} - -/* - * If many queues happen to become active shortly after each other, then, - * to help the processes associated to these queues get their job done as - * soon as possible, it is usually better to not grant either weight-raising - * or device idling to these queues. In this comment we describe, firstly, - * the reasons why this fact holds, and, secondly, the next function, which - * implements the main steps needed to properly mark these queues so that - * they can then be treated in a different way. - * - * As for the terminology, we say that a queue becomes active, i.e., - * switches from idle to backlogged, either when it is created (as a - * consequence of the arrival of an I/O request), or, if already existing, - * when a new request for the queue arrives while the queue is idle. - * Bursts of activations, i.e., activations of different queues occurring - * shortly after each other, are typically caused by services or applications - * that spawn or reactivate many parallel threads/processes. Examples are - * systemd during boot or git grep. 
- * - * These services or applications benefit mostly from a high throughput: - * the quicker the requests of the activated queues are cumulatively served, - * the sooner the target job of these queues gets completed. As a consequence, - * weight-raising any of these queues, which also implies idling the device - * for it, is almost always counterproductive: in most cases it just lowers - * throughput. - * - * On the other hand, a burst of activations may be also caused by the start - * of an application that does not consist in a lot of parallel I/O-bound - * threads. In fact, with a complex application, the burst may be just a - * consequence of the fact that several processes need to be executed to - * start-up the application. To start an application as quickly as possible, - * the best thing to do is to privilege the I/O related to the application - * with respect to all other I/O. Therefore, the best strategy to start as - * quickly as possible an application that causes a burst of activations is - * to weight-raise all the queues activated during the burst. This is the - * exact opposite of the best strategy for the other type of bursts. - * - * In the end, to take the best action for each of the two cases, the two - * types of bursts need to be distinguished. Fortunately, this seems - * relatively easy to do, by looking at the sizes of the bursts. In - * particular, we found a threshold such that bursts with a larger size - * than that threshold are apparently caused only by services or commands - * such as systemd or git grep. For brevity, hereafter we call just 'large' - * these bursts. BFQ *does not* weight-raise queues whose activations occur - * in a large burst. In addition, for each of these queues BFQ performs or - * does not perform idling depending on which choice boosts the throughput - * most. The exact choice depends on the device and request pattern at - * hand. 
- * - * Turning back to the next function, it implements all the steps needed - * to detect the occurrence of a large burst and to properly mark all the - * queues belonging to it (so that they can then be treated in a different - * way). This goal is achieved by maintaining a special "burst list" that - * holds, temporarily, the queues that belong to the burst in progress. The - * list is then used to mark these queues as belonging to a large burst if - * the burst does become large. The main steps are the following. - * - * . when the very first queue is activated, the queue is inserted into the - * list (as it could be the first queue in a possible burst) - * - * . if the current burst has not yet become large, and a queue Q that does - * not yet belong to the burst is activated shortly after the last time - * at which a new queue entered the burst list, then the function appends - * Q to the burst list - * - * . if, as a consequence of the previous step, the burst size reaches - * the large-burst threshold, then - * - * . all the queues in the burst list are marked as belonging to a - * large burst - * - * . the burst list is deleted; in fact, the burst list already served - * its purpose (keeping temporarily track of the queues in a burst, - * so as to be able to mark them as belonging to a large burst in the - * previous sub-step), and now is not needed any more - * - * . the device enters a large-burst mode - * - * . if a queue Q that does not belong to the burst is activated while - * the device is in large-burst mode and shortly after the last time - * at which a queue either entered the burst list or was marked as - * belonging to the current large burst, then Q is immediately marked - * as belonging to a large burst. - * - * . 
if a queue Q that does not belong to the burst is activated a while - * later, i.e., not shortly after, than the last time at which a queue - * either entered the burst list or was marked as belonging to the - * current large burst, then the current burst is deemed as finished and: - * - * . the large-burst mode is reset if set - * - * . the burst list is emptied - * - * . Q is inserted in the burst list, as Q may be the first queue - * in a possible new burst (then the burst list contains just Q - * after this step). - */ -static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool idle_for_long_time) -{ - /* - * If bfqq happened to be activated in a burst, but has been idle - * for at least as long as an interactive queue, then we assume - * that, in the overall I/O initiated in the burst, the I/O - * associated to bfqq is finished. So bfqq does not need to be - * treated as a queue belonging to a burst anymore. Accordingly, - * we reset bfqq's in_large_burst flag if set, and remove bfqq - * from the burst list if it's there. We do not decrement instead - * burst_size, because the fact that bfqq does not need to belong - * to the burst list any more does not invalidate the fact that - * bfqq may have been activated during the current burst. - */ - if (idle_for_long_time) { - hlist_del_init(&bfqq->burst_list_node); - bfq_clear_bfqq_in_large_burst(bfqq); - } - - /* - * If bfqq is already in the burst list or is part of a large - * burst, then there is nothing else to do. - */ - if (!hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq)) - return; - - /* - * If bfqq's activation happens late enough, then the current - * burst is finished, and related data structures must be reset. - * - * In this respect, consider the special case where bfqq is the very - * first queue being activated. In this case, last_ins_in_burst is - * not yet significant when we get here. 
But it is easy to verify - * that, whether or not the following condition is true, bfqq will - * end up being inserted into the burst list. In particular the - * list will happen to contain only bfqq. And this is exactly what - * has to happen, as bfqq may be the first queue in a possible - * burst. - */ - if (time_is_before_jiffies(bfqd->last_ins_in_burst + - bfqd->bfq_burst_interval)) { - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); - return; - } - - /* - * If we get here, then bfqq is being activated shortly after the - * last queue. So, if the current burst is also large, we can mark - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { - bfq_mark_bfqq_in_large_burst(bfqq); - return; - } - - /* - * If we get here, then a large-burst state has not yet been - * reached, but bfqq is being activated shortly after the last - * queue. Then we add bfqq to the burst. - */ - bfq_add_to_burst(bfqd, bfqq); -} - -static void bfq_add_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_entity *entity = &bfqq->entity; - struct bfq_data *bfqd = bfqq->bfqd; - struct request *next_rq, *prev; - unsigned long old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - - bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - - elv_rb_add(&bfqq->sort_list, rq); - - /* - * Check if this request is a better next-serve candidate. 
- */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(!next_rq); - bfqq->next_rq = next_rq; - - if (!bfq_bfqq_busy(bfqq)) { - bool soft_rt, in_burst, - idle_for_long_time = time_is_before_jiffies( - bfqq->budget_timeout + - bfqd->bfq_wr_min_idle_time); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, - rq->cmd_flags); -#endif - if (bfq_bfqq_sync(bfqq)) { - bool already_in_burst = - !hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq); - bfq_handle_burst(bfqd, bfqq, idle_for_long_time); - /* - * If bfqq was not already in the current burst, - * then, at this point, bfqq either has been - * added to the current burst or has caused the - * current burst to terminate. In particular, in - * the second case, bfqq has become the first - * queue in a possible new burst. - * In both cases last_ins_in_burst needs to be - * moved forward. - */ - if (!already_in_burst) - bfqd->last_ins_in_burst = jiffies; - } - - in_burst = bfq_bfqq_in_large_burst(bfqq); - soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && - !in_burst && - time_is_before_jiffies(bfqq->soft_rt_next_start); - interactive = !in_burst && idle_for_long_time; - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (time_before(jiffies, - RQ_BIC(rq)->ttime.last_end_request + - bfqd->bfq_slice_idle)) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - } - - if (!bfqd->low_latency) - goto add_bfqq_busy; - - /* - * If the queue: - * - is not being boosted, - * - has been idle for enough time, - * - is not a sync queue or is linked to a bfq_io_cq (it is - * shared "for its nature" or it is not shared and its - * requests have not been redirected to a shared queue) - * start a weight-raising 
period. - */ - if (old_wr_coeff == 1 && (interactive || soft_rt) && - (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - bfq_log_bfqq(bfqd, bfqq, - "wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } else if (old_wr_coeff > 1) { - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else if (in_burst || - (bfqq->wr_cur_max_time == - bfqd->bfq_wr_rt_max_time && - !soft_rt)) { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq-> - wr_cur_max_time)); - } else if (time_before( - bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time, - jiffies + - bfqd->bfq_wr_rt_max_time) && - soft_rt) { - /* - * - * The remaining weight-raising time is lower - * than bfqd->bfq_wr_rt_max_time, which means - * that the application is enjoying weight - * raising either because deemed soft-rt in - * the near past, or because deemed interactive - * a long ago. - * In both cases, resetting now the current - * remaining weight-raising time for the - * application to the weight-raising duration - * for soft rt applications would not cause any - * latency increase for the application (as the - * new duration would be higher than the - * remaining time). - * - * In addition, the application is now meeting - * the requirements for being deemed soft rt. - * In the end we can correctly and safely - * (re)charge the weight-raising duration for - * the application with the weight-raising - * duration for soft rt applications. 
- * - * In particular, doing this recharge now, i.e., - * before the weight-raising period for the - * application finishes, reduces the probability - * of the following negative scenario: - * 1) the weight of a soft rt application is - * raised at startup (as for any newly - * created application), - * 2) since the application is not interactive, - * at a certain time weight-raising is - * stopped for the application, - * 3) at that time the application happens to - * still have pending requests, and hence - * is destined to not have a chance to be - * deemed soft rt before these requests are - * completed (see the comments to the - * function bfq_bfqq_softrt_next_start() - * for details on soft rt detection), - * 4) these pending requests experience a high - * latency because the application is not - * weight-raised while they are pending. - */ - bfqq->last_wr_start_finish = jiffies; - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - } - } - if (old_wr_coeff != bfqq->wr_coeff) - entity->prio_changed = 1; -add_bfqq_busy: - bfqq->last_idle_bklogged = jiffies; - bfqq->service_from_backlogged = 0; - bfq_clear_bfqq_softrt_update(bfqq); - bfq_add_bfqq_busy(bfqd, bfqq); - } else { - if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && - time_is_before_jiffies( - bfqq->last_wr_start_finish + - bfqd->bfq_wr_min_inter_arr_async)) { - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - - bfqd->wr_busy_queues++; - entity->prio_changed = 1; - bfq_log_bfqq(bfqd, bfqq, - "non-idle wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - if (prev != bfqq->next_rq) - bfq_updated_next_req(bfqd, bfqq); - } - - if (bfqd->low_latency && - (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) - bfqq->last_wr_start_finish = jiffies; -} - -static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - struct bio *bio) -{ - struct task_struct *tsk = current; - struct bfq_io_cq 
*bic; - struct bfq_queue *bfqq; - - bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (!bic) - return NULL; - - bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); - - return NULL; -} - -static void bfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver++; - bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); - bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", - (unsigned long long) bfqd->last_position); -} - -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - BUG_ON(bfqd->rq_in_driver == 0); - bfqd->rq_in_driver--; -} - -static void bfq_remove_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); - bfq_updated_next_req(bfqd, bfqq); - } - - if (rq->queuelist.prev != &rq->queuelist) - list_del_init(&rq->queuelist); - BUG_ON(bfqq->queued[sync] == 0); - bfqq->queued[sync]--; - bfqd->queued--; - elv_rb_del(&bfqq->sort_list, rq); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) - bfq_del_bfqq_busy(bfqd, bfqq, 1); - /* - * Remove queue from request-position tree as it is empty. 
- */ - if (bfqq->pos_root) { - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } - } - - if (rq->cmd_flags & REQ_META) { - BUG_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -#endif -} - -static int bfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *__rq; - - __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq && elv_rq_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - - return ELEVATOR_NO_MERGE; -} - -static void bfq_merged_request(struct request_queue *q, struct request *req, - int type) -{ - if (type == ELEVATOR_FRONT_MERGE && - rb_prev(&req->rb_node) && - blk_rq_pos(req) < - blk_rq_pos(container_of(rb_prev(&req->rb_node), - struct request, rb_node))) { - struct bfq_queue *bfqq = RQ_BFQQ(req); - struct bfq_data *bfqd = bfqq->bfqd; - struct request *prev, *next_rq; - - /* Reposition request in its sort_list */ - elv_rb_del(&bfqq->sort_list, req); - elv_rb_add(&bfqq->sort_list, req); - /* Choose next request to be served for bfqq */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, - bfqd->last_position); - BUG_ON(!next_rq); - bfqq->next_rq = next_rq; - } -} - -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) -{ - bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); -} -#endif - -static void bfq_merged_requests(struct request_queue *q, struct request *rq, - struct request *next) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); - - /* - * If next and rq belong to the same bfq_queue and next is older - * than rq, then reposition rq in the fifo (by substituting next - * with rq). 
Otherwise, if next and rq belong to different - * bfq_queues, never reposition rq: in fact, we would have to - * reposition it with respect to next's position in its own fifo, - * which would most certainly be too expensive with respect to - * the benefits. - */ - if (bfqq == next_bfqq && - !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - time_before(next->fifo_time, rq->fifo_time)) { - list_del_init(&rq->queuelist); - list_replace_init(&next->queuelist, &rq->queuelist); - rq->fifo_time = next->fifo_time; - } - - if (bfqq->next_rq == next) - bfqq->next_rq = rq; - - bfq_remove_request(next); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -#endif -} - -/* Must be called with bfqq != NULL */ -static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -{ - BUG_ON(!bfqq); - if (bfq_bfqq_busy(bfqq)) - bfqq->bfqd->wr_busy_queues--; - bfqq->wr_coeff = 1; - bfqq->wr_cur_max_time = 0; - /* Trigger a weight change on the next activation of the queue */ - bfqq->entity.prio_changed = 1; -} - -static void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg) -{ - int i, j; - - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) - if (bfqg->async_bfqq[i][j]) - bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); - if (bfqg->async_idle_bfqq) - bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -} - -static void bfq_end_wr(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq; - - spin_lock_irq(bfqd->queue->queue_lock); - - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); - list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); - bfq_end_wr_async(bfqd); - - spin_unlock_irq(bfqd->queue->queue_lock); -} - -static int bfq_allow_merge(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic; - - /* - * Disallow merge of a sync bio into an async request. 
- */ - if (bfq_bio_sync(bio) && !rq_is_sync(rq)) - return 0; - - /* - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. - * Queue lock is held here. - */ - bic = bfq_bic_lookup(bfqd, current->io_context); - if (!bic) - return 0; - - return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); -} - -static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - if (bfqq) { -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -#endif - bfq_mark_bfqq_must_alloc(bfqq); - bfq_mark_bfqq_budget_new(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; - - bfq_log_bfqq(bfqd, bfqq, - "set_in_service_queue, cur-budget = %d", - bfqq->entity.budget); - } - - bfqd->in_service_queue = bfqq; -} - -/* - * Get and set a new queue for service. - */ -static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); - - __bfq_set_in_service_queue(bfqd, bfqq); - return bfqq; -} - -/* - * If enough samples have been computed, return the current max budget - * stored in bfqd, which is dynamically updated according to the - * estimated disk peak rate; otherwise return the default max budget - */ -static int bfq_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget; -} - -/* - * Return min budget, which is a fraction of the current or default - * max budget (trying with 1/32) - */ -static int bfq_min_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget / 32; - else - return bfqd->bfq_max_budget / 32; -} - -static void bfq_arm_slice_timer(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->in_service_queue; - struct bfq_io_cq *bic; - unsigned long sl; - - 
BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - /* Processes have exited, don't wait. */ - bic = bfqd->in_service_bic; - if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) - return; - - bfq_mark_bfqq_wait_request(bfqq); - - /* - * We don't want to idle for seeks, but we do want to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. So allow a little bit of time for him to submit a new rq. - * - * To prevent processes with (partly) seeky workloads from - * being too ill-treated, grant them a small fraction of the - * assigned budget before reducing the waiting time to - * BFQ_MIN_TT. This happened to help reduce latency. - */ - sl = bfqd->bfq_slice_idle; - /* - * Unless the queue is being weight-raised or the scenario is - * asymmetric, grant only minimum idle time if the queue either - * has been seeky for long enough or has already proved to be - * constantly seeky. - */ - if (bfq_sample_valid(bfqq->seek_samples) && - ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > - bfq_max_budget(bfqq->bfqd) / 8) || - bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && - bfq_symmetric_scenario(bfqd)) - sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); - else if (bfqq->wr_coeff > 1) - sl = sl * 3; - bfqd->last_idling_start = ktime_get(); - mod_timer(&bfqd->idle_slice_timer, jiffies + sl); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -#endif - bfq_log(bfqd, "arm idle: %u/%u ms", - jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); -} - -/* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the disk - * throughput (always guaranteed with a time slice scheme as in CFQ). 
- */ -static void bfq_set_budget_timeout(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->in_service_queue; - unsigned int timeout_coeff; - - if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) - timeout_coeff = 1; - else - timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - - bfqd->last_budget_start = ktime_get(); - - bfq_clear_bfqq_budget_new(bfqq); - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; - - bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", - jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * - timeout_coeff)); -} - -/* - * Move request from internal lists to the request queue dispatch list. - */ -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - /* - * For consistency, the next instruction should have been executed - * after removing the request from the queue and dispatching it. - * We execute instead this instruction before bfq_remove_request() - * (and hence introduce a temporary inconsistency), for efficiency. - * In fact, in a forced_dispatch, this prevents two counters related - * to bfqq->dispatched to risk to be uselessly decremented if bfqq - * is not in service, and then to be incremented again after - * incrementing bfqq->dispatched. - */ - bfqq->dispatched++; - bfq_remove_request(rq); - elv_dispatch_sort(q, rq); - - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight++; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), - rq->cmd_flags); -#endif -} - -/* - * Return expired entry, or NULL to just start from scratch in rbtree. 
- */ -static struct request *bfq_check_fifo(struct bfq_queue *bfqq) -{ - struct request *rq = NULL; - - if (bfq_bfqq_fifo_expire(bfqq)) - return NULL; - - bfq_mark_bfqq_fifo_expire(bfqq); - - if (list_empty(&bfqq->fifo)) - return NULL; - - rq = rq_entry_fifo(bfqq->fifo.next); - - if (time_before(jiffies, rq->fifo_time)) - return NULL; - - return rq; -} - -static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - return entity->budget - entity->service; -} - -static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfqq != bfqd->in_service_queue); - - __bfq_bfqd_reset_in_service(bfqd); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* - * Overloading budget_timeout field to store the time - * at which the queue remains with no backlog; used by - * the weight-raising mechanism. - */ - bfqq->budget_timeout = jiffies; - bfq_del_bfqq_busy(bfqd, bfqq, 1); - } else - bfq_activate_bfqq(bfqd, bfqq); -} - -/** - * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. - * @bfqd: device data. - * @bfqq: queue to update. - * @reason: reason for expiration. - * - * Handle the feedback on @bfqq budget at queue expiration. - * See the body for detailed comments. 
- */ -static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - enum bfqq_expiration reason) -{ - struct request *next_rq; - int budget, min_budget; - - budget = bfqq->max_budget; - min_budget = bfq_min_budget(bfqd); - - BUG_ON(bfqq != bfqd->in_service_queue); - - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", - budget, bfq_min_budget(bfqd)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - - if (bfq_bfqq_sync(bfqq)) { - switch (reason) { - /* - * Caveat: in all the following cases we trade latency - * for throughput. - */ - case BFQ_BFQQ_TOO_IDLE: - /* - * This is the only case where we may reduce - * the budget: if there is no request of the - * process still waiting for completion, then - * we assume (tentatively) that the timer has - * expired because the batch of requests of - * the process could have been served with a - * smaller budget. Hence, betting that - * process will behave in the same way when it - * becomes backlogged again, we reduce its - * next budget. As long as we guess right, - * this budget cut reduces the latency - * experienced by the process. - * - * However, if there are still outstanding - * requests, then the process may have not yet - * issued its next request just because it is - * still waiting for the completion of some of - * the still outstanding ones. So in this - * subcase we do not reduce its budget, on the - * contrary we increase it to possibly boost - * the throughput, as discussed in the - * comments to the BUDGET_TIMEOUT case. 
- */ - if (bfqq->dispatched > 0) /* still outstanding reqs */ - budget = min(budget * 2, bfqd->bfq_max_budget); - else { - if (budget > 5 * min_budget) - budget -= 4 * min_budget; - else - budget = min_budget; - } - break; - case BFQ_BFQQ_BUDGET_TIMEOUT: - /* - * We double the budget here because: 1) it - * gives the chance to boost the throughput if - * this is not a seeky process (which may have - * bumped into this timeout because of, e.g., - * ZBR), 2) together with charge_full_budget - * it helps give seeky processes higher - * timestamps, and hence be served less - * frequently. - */ - budget = min(budget * 2, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_BUDGET_EXHAUSTED: - /* - * The process still has backlog, and did not - * let either the budget timeout or the disk - * idling timeout expire. Hence it is not - * seeky, has a short thinktime and may be - * happy with a higher budget too. So - * definitely increase the budget of this good - * candidate to boost the disk throughput. - */ - budget = min(budget * 4, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_NO_MORE_REQUESTS: - /* - * Leave the budget unchanged. - */ - default: - return; - } - } else - /* - * Async queues get always the maximum possible budget - * (their ability to dispatch is limited by - * @bfqd->bfq_max_budget_async_rq). - */ - budget = bfqd->bfq_max_budget; - - bfqq->max_budget = budget; - - if (bfqd->budgets_assigned >= bfq_stats_min_budgets && - !bfqd->bfq_user_max_budget) - bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); - - /* - * Make sure that we have enough budget for the next request. - * Since the finish time of the bfqq must be kept in sync with - * the budget, be sure to call __bfq_bfqq_expire() after the - * update. 
- */ - next_rq = bfqq->next_rq; - if (next_rq) - bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - else - bfqq->entity.budget = bfqq->max_budget; - - bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", - next_rq ? blk_rq_sectors(next_rq) : 0, - bfqq->entity.budget); -} - -static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) -{ - unsigned long max_budget; - - /* - * The max_budget calculated when autotuning is equal to the - * amount of sectors transfered in timeout_sync at the - * estimated peak rate. - */ - max_budget = (unsigned long)(peak_rate * 1000 * - timeout >> BFQ_RATE_SHIFT); - - return max_budget; -} - -/* - * In addition to updating the peak rate, checks whether the process - * is "slow", and returns 1 if so. This slow flag is used, in addition - * to the budget timeout, to reduce the amount of service provided to - * seeky processes, and hence reduce their chances to lower the - * throughput. See the code for more details. - */ -static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason) -{ - u64 bw, usecs, expected, timeout; - ktime_t delta; - int update = 0; - - if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) - return false; - - if (compensate) - delta = bfqd->last_idling_start; - else - delta = ktime_get(); - delta = ktime_sub(delta, bfqd->last_budget_start); - usecs = ktime_to_us(delta); - - /* Don't trust short/unrealistic values. */ - if (usecs < 100 || usecs >= LONG_MAX) - return false; - - /* - * Calculate the bandwidth for the last slice. We use a 64 bit - * value to store the peak rate, in sectors per usec in fixed - * point math. We do so to have enough precision in the estimate - * and to avoid overflows. 
- */ - bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; - do_div(bw, (unsigned long)usecs); - - timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - - /* - * Use only long (> 20ms) intervals to filter out spikes for - * the peak rate estimation. - */ - if (usecs > 20000) { - if (bw > bfqd->peak_rate || - (!BFQQ_SEEKY(bfqq) && - reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { - bfq_log(bfqd, "measured bw =%llu", bw); - /* - * To smooth oscillations use a low-pass filter with - * alpha=7/8, i.e., - * new_rate = (7/8) * old_rate + (1/8) * bw - */ - do_div(bw, 8); - if (bw == 0) - return 0; - bfqd->peak_rate *= 7; - do_div(bfqd->peak_rate, 8); - bfqd->peak_rate += bw; - update = 1; - bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); - } - - update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; - - if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) - bfqd->peak_rate_samples++; - - if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && - update) { - int dev_type = blk_queue_nonrot(bfqd->queue); - - if (bfqd->bfq_user_max_budget == 0) { - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd->peak_rate, - timeout); - bfq_log(bfqd, "new max_budget=%d", - bfqd->bfq_max_budget); - } - if (bfqd->device_speed == BFQ_BFQD_FAST && - bfqd->peak_rate < device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_SLOW; - bfqd->RT_prod = R_slow[dev_type] * - T_slow[dev_type]; - } else if (bfqd->device_speed == BFQ_BFQD_SLOW && - bfqd->peak_rate > device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_FAST; - bfqd->RT_prod = R_fast[dev_type] * - T_fast[dev_type]; - } - } - } - - /* - * If the process has been served for a too short time - * interval to let its possible sequential accesses prevail on - * the initial seek time needed to move the disk head on the - * first sector it requested, then give the process a chance - * and for the moment return false. 
- */ - if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) - return false; - - /* - * A process is considered ``slow'' (i.e., seeky, so that we - * cannot treat it fairly in the service domain, as it would - * slow down too much the other processes) if, when a slice - * ends for whatever reason, it has received service at a - * rate that would not be high enough to complete the budget - * before the budget timeout expiration. - */ - expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; - - /* - * Caveat: processes doing IO in the slower disk zones will - * tend to be slow(er) even if not seeky. And the estimated - * peak rate will actually be an average over the disk - * surface. Hence, to not be too harsh with unlucky processes, - * we keep a budget/3 margin of safety before declaring a - * process slow. - */ - return expected > (4 * bfqq->entity.budget) / 3; -} - -/* - * To be deemed as soft real-time, an application must meet two - * requirements. First, the application must not require an average - * bandwidth higher than the approximate bandwidth required to playback or - * record a compressed high-definition video. - * The next function is invoked on the completion of the last request of a - * batch, to compute the next-start time instant, soft_rt_next_start, such - * that, if the next request of the application does not arrive before - * soft_rt_next_start, then the above requirement on the bandwidth is met. - * - * The second requirement is that the request pattern of the application is - * isochronous, i.e., that, after issuing a request or a batch of requests, - * the application stops issuing new requests until all its pending requests - * have been completed. After that, the application may issue a new batch, - * and so on. - * For this reason the next function is invoked to compute - * soft_rt_next_start only for applications that meet this requirement, - * whereas soft_rt_next_start is set to infinity for applications that do - * not. 
- * - * Unfortunately, even a greedy application may happen to behave in an - * isochronous way if the CPU load is high. In fact, the application may - * stop issuing requests while the CPUs are busy serving other processes, - * then restart, then stop again for a while, and so on. In addition, if - * the disk achieves a low enough throughput with the request pattern - * issued by the application (e.g., because the request pattern is random - * and/or the device is slow), then the application may meet the above - * bandwidth requirement too. To prevent such a greedy application to be - * deemed as soft real-time, a further rule is used in the computation of - * soft_rt_next_start: soft_rt_next_start must be higher than the current - * time plus the maximum time for which the arrival of a request is waited - * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. - * This filters out greedy applications, as the latter issue instead their - * next request as soon as possible after the last one has been completed - * (in contrast, when a batch of requests is completed, a soft real-time - * application spends some time processing data). - * - * Unfortunately, the last filter may easily generate false positives if - * only bfqd->bfq_slice_idle is used as a reference time interval and one - * or both the following cases occur: - * 1) HZ is so low that the duration of a jiffy is comparable to or higher - * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with - * HZ=100. - * 2) jiffies, instead of increasing at a constant rate, may stop increasing - * for a while, then suddenly 'jump' by several units to recover the lost - * increments. This seems to happen, e.g., inside virtual machines. - * To address this issue, we do not use as a reference time interval just - * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. 
In - * particular we add the minimum number of jiffies for which the filter - * seems to be quite precise also in embedded systems and KVM/QEMU virtual - * machines. - */ -static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - return max(bfqq->last_idle_bklogged + - HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate, - jiffies + bfqq->bfqd->bfq_slice_idle + 4); -} - -/* - * Return the largest-possible time instant such that, for as long as possible, - * the current time will be lower than this time instant according to the macro - * time_is_before_jiffies(). - */ -static unsigned long bfq_infinity_from_now(unsigned long now) -{ - return now + ULONG_MAX / 2; -} - -/** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. - * @bfqq: the queue to expire. - * @compensate: if true, compensate for the time spent idling. - * @reason: the reason causing the expiration. - * - * - * If the process associated to the queue is slow (i.e., seeky), or in - * case of budget timeout, or, finally, if it is async, we - * artificially charge it an entire budget (independently of the - * actual service it received). As a consequence, the queue will get - * higher timestamps than the correct ones upon reactivation, and - * hence it will be rescheduled as if it had received more service - * than what it actually received. In the end, this class of processes - * will receive less service in proportion to how slowly they consume - * their budgets (and hence how seriously they tend to lower the - * throughput). - * - * In contrast, when a queue expires because it has been idling for - * too much or because it exhausted its budget, we do not touch the - * amount of service it has received. Hence when the queue will be - * reactivated and its timestamps updated, the latter will be in sync - * with the actual service received by the queue until expiration. 
- * - * Charging a full budget to the first type of queues and the exact - * service to the others has the effect of using the WF2Q+ policy to - * schedule the former on a timeslice basis, without violating the - * service domain guarantees of the latter. - */ -static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - bool compensate, - enum bfqq_expiration reason) -{ - bool slow; - - BUG_ON(bfqq != bfqd->in_service_queue); - - /* - * Update disk peak rate for autotuning and check whether the - * process is slow (see bfq_update_peak_rate). - */ - slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); - - /* - * As above explained, 'punish' slow (i.e., seeky), timed-out - * and async queues, to favor sequential sync workloads. - * - * Processes doing I/O in the slower disk zones will tend to be - * slow(er) even if not seeky. Hence, since the estimated peak - * rate is actually an average over the disk surface, these - * processes may timeout just for bad luck. To avoid punishing - * them we do not charge a full budget to a process that - * succeeded in consuming at least 2/3 of its budget. 
- */ - if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) - bfq_bfqq_charge_full_budget(bfqq); - - bfqq->service_from_backlogged += bfqq->entity.service; - - if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && - !bfq_bfqq_constantly_seeky(bfqq)) { - bfq_mark_bfqq_constantly_seeky(bfqq); - if (!blk_queue_nonrot(bfqd->queue)) - bfqd->const_seeky_busy_in_flight_queues++; - } - - if (reason == BFQ_BFQQ_TOO_IDLE && - bfqq->entity.service <= 2 * bfqq->entity.budget / 10) - bfq_clear_bfqq_IO_bound(bfqq); - - if (bfqd->low_latency && bfqq->wr_coeff == 1) - bfqq->last_wr_start_finish = jiffies; - - if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && - RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* - * If we get here, and there are no outstanding requests, - * then the request pattern is isochronous (see the comments - * to the function bfq_bfqq_softrt_next_start()). Hence we - * can compute soft_rt_next_start. If, instead, the queue - * still has outstanding requests, then we have to wait - * for the completion of all the outstanding requests to - * discover whether the request pattern is actually - * isochronous. - */ - if (bfqq->dispatched == 0) - bfqq->soft_rt_next_start = - bfq_bfqq_softrt_next_start(bfqd, bfqq); - else { - /* - * The application is still waiting for the - * completion of one or more requests: - * prevent it from possibly being incorrectly - * deemed as soft real-time by setting its - * soft_rt_next_start to infinity. In fact, - * without this assignment, the application - * would be incorrectly deemed as soft - * real-time if: - * 1) it issued a new request before the - * completion of all its in-flight - * requests, and - * 2) at that time, its soft_rt_next_start - * happened to be in the past. - */ - bfqq->soft_rt_next_start = - bfq_infinity_from_now(jiffies); - /* - * Schedule an update of soft_rt_next_start to when - * the task may be discovered to be isochronous. 
- */ - bfq_mark_bfqq_softrt_update(bfqq); - } - } - - bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, - slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); - - /* - * Increase, decrease or leave budget unchanged according to - * reason. - */ - __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); - __bfq_bfqq_expire(bfqd, bfqq); -} - -/* - * Budget timeout is not implemented through a dedicated timer, but - * just checked on request arrivals and completions, as well as on - * idle timer expirations. - */ -static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -{ - if (bfq_bfqq_budget_new(bfqq) || - time_before(jiffies, bfqq->budget_timeout)) - return false; - return true; -} - -/* - * If we expire a queue that is waiting for the arrival of a new - * request, we may prevent the fictitious timestamp back-shifting that - * allows the guarantees of the queue to be preserved (see [1] for - * this tricky aspect). Hence we return true only if this condition - * does not hold, or if the queue is slow enough to deserve only to be - * kicked off for preserving a high throughput. -*/ -static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqq->bfqd, bfqq, - "may_budget_timeout: wait_request %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); - - return (!bfq_bfqq_wait_request(bfqq) || - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) - && - bfq_bfqq_budget_timeout(bfqq); -} - -/* - * For a queue that becomes empty, device idling is allowed only if - * this function returns true for that queue. As a consequence, since - * device idling plays a critical role for both throughput boosting - * and service guarantees, the return value of this function plays a - * critical role as well. 
- * - * In a nutshell, this function returns true only if idling is - * beneficial for throughput or, even if detrimental for throughput, - * idling is however necessary to preserve service guarantees (low - * latency, desired throughput distribution, ...). In particular, on - * NCQ-capable devices, this function tries to return false, so as to - * help keep the drives' internal queues full, whenever this helps the - * device boost the throughput without causing any service-guarantee - * issue. - * - * In more detail, the return value of this function is obtained by, - * first, computing a number of boolean variables that take into - * account throughput and service-guarantee issues, and, then, - * combining these variables in a logical expression. Most of the - * issues taken into account are not trivial. We discuss these issues - * while introducing the variables. - */ -static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -{ - struct bfq_data *bfqd = bfqq->bfqd; - bool idling_boosts_thr, idling_boosts_thr_without_issues, - all_queues_seeky, on_hdd_and_not_all_queues_seeky, - idling_needed_for_service_guarantees, - asymmetric_scenario; - - /* - * The next variable takes into account the cases where idling - * boosts the throughput. - * - * The value of the variable is computed considering, first, that - * idling is virtually always beneficial for the throughput if: - * (a) the device is not NCQ-capable, or - * (b) regardless of the presence of NCQ, the device is rotational - * and the request pattern for bfqq is I/O-bound and sequential. - * - * Secondly, and in contrast to the above item (b), idling an - * NCQ-capable flash-based device would not boost the - * throughput even with sequential I/O; rather it would lower - * the throughput in proportion to how fast the device - * is. 
Accordingly, the next variable is true if any of the - * above conditions (a) and (b) is true, and, in particular, - * happens to be false if bfqd is an NCQ-capable flash-based - * device. - */ - idling_boosts_thr = !bfqd->hw_tag || - (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && - bfq_bfqq_idle_window(bfqq)); - - /* - * The value of the next variable, - * idling_boosts_thr_without_issues, is equal to that of - * idling_boosts_thr, unless a special case holds. In this - * special case, described below, idling may cause problems to - * weight-raised queues. - * - * When the request pool is saturated (e.g., in the presence - * of write hogs), if the processes associated with - * non-weight-raised queues ask for requests at a lower rate, - * then processes associated with weight-raised queues have a - * higher probability to get a request from the pool - * immediately (or at least soon) when they need one. Thus - * they have a higher probability to actually get a fraction - * of the device throughput proportional to their high - * weight. This is especially true with NCQ-capable drives, - * which enqueue several requests in advance, and further - * reorder internally-queued requests. - * - * For this reason, we force to false the value of - * idling_boosts_thr_without_issues if there are weight-raised - * busy queues. In this case, and if bfqq is not weight-raised, - * this guarantees that the device is not idled for bfqq (if, - * instead, bfqq is weight-raised, then idling will be - * guaranteed by another variable, see below). Combined with - * the timestamping rules of BFQ (see [1] for details), this - * behavior causes bfqq, and hence any sync non-weight-raised - * queue, to get a lower number of requests served, and thus - * to ask for a lower number of requests from the request - * pool, before the busy weight-raised queues get served - * again. 
This often mitigates starvation problems in the - * presence of heavy write workloads and NCQ, thereby - * guaranteeing a higher application and system responsiveness - * in these hostile scenarios. - */ - idling_boosts_thr_without_issues = idling_boosts_thr && - bfqd->wr_busy_queues == 0; - - /* - * There are then two cases where idling must be performed not - * for throughput concerns, but to preserve service - * guarantees. In the description of these cases, we say, for - * short, that a queue is sequential/random if the process - * associated to the queue issues sequential/random requests - * (in the second case the queue may be tagged as seeky or - * even constantly_seeky). - * - * To introduce the first case, we note that, since - * bfq_bfqq_idle_window(bfqq) is false if the device is - * NCQ-capable and bfqq is random (see - * bfq_update_idle_window()), then, from the above two - * assignments it follows that - * idling_boosts_thr_without_issues is false if the device is - * NCQ-capable and bfqq is random. Therefore, for this case, - * device idling would never be allowed if we used just - * idling_boosts_thr_without_issues to decide whether to allow - * it. And, beneficially, this would imply that throughput - * would always be boosted also with random I/O on NCQ-capable - * HDDs. - * - * But we must be careful on this point, to avoid an unfair - * treatment for bfqq. In fact, because of the same above - * assignments, idling_boosts_thr_without_issues is, on the - * other hand, true if 1) the device is an HDD and bfqq is - * sequential, and 2) there are no busy weight-raised - * queues. As a consequence, if we used just - * idling_boosts_thr_without_issues to decide whether to idle - * the device, then with an HDD we might easily bump into a - * scenario where queues that are sequential and I/O-bound - * would enjoy idling, whereas random queues would not. 
The - * latter might then get a low share of the device throughput, - * simply because the former would get many requests served - * after being set as in service, while the latter would not. - * - * To address this issue, we start by setting to true a - * sentinel variable, on_hdd_and_not_all_queues_seeky, if the - * device is rotational and not all queues with pending or - * in-flight requests are constantly seeky (i.e., there are - * active sequential queues, and bfqq might then be mistreated - * if it does not enjoy idling because it is random). - */ - all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && - bfqd->busy_in_flight_queues == - bfqd->const_seeky_busy_in_flight_queues; - - on_hdd_and_not_all_queues_seeky = - !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; - - /* - * To introduce the second case where idling needs to be - * performed to preserve service guarantees, we can note that - * allowing the drive to enqueue more than one request at a - * time, and hence delegating de facto final scheduling - * decisions to the drive's internal scheduler, causes loss of - * control on the actual request service order. In particular, - * the critical situation is when requests from different - * processes happens to be present, at the same time, in the - * internal queue(s) of the drive. In such a situation, the - * drive, by deciding the service order of the - * internally-queued requests, does determine also the actual - * throughput distribution among these processes. But the - * drive typically has no notion or concern about per-process - * throughput distribution, and makes its decisions only on a - * per-request basis. 
Therefore, the service distribution - * enforced by the drive's internal scheduler is likely to - * coincide with the desired device-throughput distribution - * only in a completely symmetric scenario where: - * (i) each of these processes must get the same throughput as - * the others; - * (ii) all these processes have the same I/O pattern - * (either sequential or random). - * In fact, in such a scenario, the drive will tend to treat - * the requests of each of these processes in about the same - * way as the requests of the others, and thus to provide - * each of these processes with about the same throughput - * (which is exactly the desired throughput distribution). In - * contrast, in any asymmetric scenario, device idling is - * certainly needed to guarantee that bfqq receives its - * assigned fraction of the device throughput (see [1] for - * details). - * - * We address this issue by controlling, actually, only the - * symmetry sub-condition (i), i.e., provided that - * sub-condition (i) holds, idling is not performed, - * regardless of whether sub-condition (ii) holds. In other - * words, only if sub-condition (i) holds, then idling is - * allowed, and the device tends to be prevented from queueing - * many requests, possibly of several processes. The reason - * for not controlling also sub-condition (ii) is that, first, - * in the case of an HDD, the asymmetry in terms of types of - * I/O patterns is already taken in to account in the above - * sentinel variable - * on_hdd_and_not_all_queues_seeky. 
Secondly, in the case of a - * flash-based device, we prefer however to privilege - * throughput (and idling lowers throughput for this type of - * devices), for the following reasons: - * 1) differently from HDDs, the service time of random - * requests is not orders of magnitudes lower than the service - * time of sequential requests; thus, even if processes doing - * sequential I/O get a preferential treatment with respect to - * others doing random I/O, the consequences are not as - * dramatic as with HDDs; - * 2) if a process doing random I/O does need strong - * throughput guarantees, it is hopefully already being - * weight-raised, or the user is likely to have assigned it a - * higher weight than the other processes (and thus - * sub-condition (i) is likely to be false, which triggers - * idling). - * - * According to the above considerations, the next variable is - * true (only) if sub-condition (i) holds. To compute the - * value of this variable, we not only use the return value of - * the function bfq_symmetric_scenario(), but also check - * whether bfqq is being weight-raised, because - * bfq_symmetric_scenario() does not take into account also - * weight-raised queues (see comments to - * bfq_weights_tree_add()). - * - * As a side note, it is worth considering that the above - * device-idling countermeasures may however fail in the - * following unlucky scenario: if idling is (correctly) - * disabled in a time period during which all symmetry - * sub-conditions hold, and hence the device is allowed to - * enqueue many requests, but at some later point in time some - * sub-condition stops to hold, then it may become impossible - * to let requests be served in the desired order until all - * the requests already queued in the device have been served. 
- */ - asymmetric_scenario = bfqq->wr_coeff > 1 || - !bfq_symmetric_scenario(bfqd); - - /* - * Finally, there is a case where maximizing throughput is the - * best choice even if it may cause unfairness toward - * bfqq. Such a case is when bfqq became active in a burst of - * queue activations. Queues that became active during a large - * burst benefit only from throughput, as discussed in the - * comments to bfq_handle_burst. Thus, if bfqq became active - * in a burst and not idling the device maximizes throughput, - * then the device must no be idled, because not idling the - * device provides bfqq and all other queues in the burst with - * maximum benefit. Combining this and the two cases above, we - * can now establish when idling is actually needed to - * preserve service guarantees. - */ - idling_needed_for_service_guarantees = - (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && - !bfq_bfqq_in_large_burst(bfqq); - - /* - * We have now all the components we need to compute the return - * value of the function, which is true only if both the following - * conditions hold: - * 1) bfqq is sync, because idling make sense only for sync queues; - * 2) idling either boosts the throughput (without issues), or - * is necessary to preserve service guarantees. - */ - return bfq_bfqq_sync(bfqq) && - (idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees); -} - -/* - * If the in-service queue is empty but the function bfq_bfqq_may_idle - * returns true, then: - * 1) the queue must remain in service and cannot be expired, and - * 2) the device must be idled to wait for the possible arrival of a new - * request for the queue. - * See the comments to the function bfq_bfqq_may_idle for the reasons - * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_may_idle itself - * returns true. 
- */ -static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -{ - struct bfq_data *bfqd = bfqq->bfqd; - - return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && - bfq_bfqq_may_idle(bfqq); -} - -/* - * Select a queue for service. If we have a current queue in service, - * check whether to continue servicing it, or retrieve and set a new one. - */ -static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq; - struct request *next_rq; - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; - - bfqq = bfqd->in_service_queue; - if (!bfqq) - goto new_queue; - - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && - !timer_pending(&bfqd->idle_slice_timer) && - !bfq_bfqq_must_idle(bfqq)) - goto expire; - - next_rq = bfqq->next_rq; - /* - * If bfqq has requests queued and it has enough budget left to - * serve them, keep the queue, otherwise expire it. - */ - if (next_rq) { - if (bfq_serv_to_charge(next_rq, bfqq) > - bfq_bfqq_budget_left(bfqq)) { - reason = BFQ_BFQQ_BUDGET_EXHAUSTED; - goto expire; - } else { - /* - * The idle timer may be pending because we may - * not disable disk idling even when a new request - * arrives. - */ - if (timer_pending(&bfqd->idle_slice_timer)) { - /* - * If we get here: 1) at least a new request - * has arrived but we have not disabled the - * timer because the request was too small, - * 2) then the block layer has unplugged - * the device, causing the dispatch to be - * invoked. - * - * Since the device is unplugged, now the - * requests are probably large enough to - * provide a reasonable throughput. - * So we disable idling. - */ - bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif - } - goto keep_queue; - } - } - - /* - * No requests pending. 
However, if the in-service queue is idling - * for a new request, or has requests waiting for a completion and - * may idle after their completion, then keep it anyway. - */ - if (timer_pending(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { - bfqq = NULL; - goto keep_queue; - } - - reason = BFQ_BFQQ_NO_MORE_REQUESTS; -expire: - bfq_bfqq_expire(bfqd, bfqq, false, reason); -new_queue: - bfqq = bfq_set_in_service_queue(bfqd); - bfq_log(bfqd, "select_queue: new queue %d returned", - bfqq ? bfqq->pid : 0); -keep_queue: - return bfqq; -} - -static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ - bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, old coeff %u, w %d(%d)", - jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), - jiffies_to_msecs(bfqq->wr_cur_max_time), - bfqq->wr_coeff, - bfqq->entity.weight, bfqq->entity.orig_weight); - - BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != - entity->orig_weight * bfqq->wr_coeff); - if (entity->prio_changed) - bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); - - /* - * If the queue was activated in a burst, or - * too much time has elapsed from the beginning - * of this weight-raising period, then end weight - * raising. 
- */ - if (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time)) { - bfqq->last_wr_start_finish = jiffies; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - bfq_bfqq_end_wr(bfqq); - } - } - /* Update weight both if it must be raised and if it must be lowered */ - if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) - __bfq_entity_update_weight_prio( - bfq_entity_service_tree(entity), - entity); -} - -/* - * Dispatch one request from bfqq, moving it to the request queue - * dispatch list. - */ -static int bfq_dispatch_request(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - int dispatched = 0; - struct request *rq; - unsigned long service_to_charge; - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - - /* Follow expired path, else get first next available. */ - rq = bfq_check_fifo(bfqq); - if (!rq) - rq = bfqq->next_rq; - service_to_charge = bfq_serv_to_charge(rq, bfqq); - - if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { - /* - * This may happen if the next rq is chosen in fifo order - * instead of sector order. The budget is properly - * dimensioned to be always sufficient to serve the next - * request only if it is chosen in sector order. The reason - * is that it would be quite inefficient and little useful - * to always make sure that the budget is large enough to - * serve even the possible next rq in fifo order. - * In fact, requests are seldom served in fifo order. - * - * Expire the queue for budget exhaustion, and make sure - * that the next act_budget is enough to serve the next - * request, even if it comes from the fifo expired path. 
- */ - bfqq->next_rq = rq; - /* - * Since this dispatch is failed, make sure that - * a new one will be performed - */ - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); - goto expire; - } - - /* Finally, insert request into driver dispatch list. */ - bfq_bfqq_served(bfqq, service_to_charge); - bfq_dispatch_insert(bfqd->queue, rq); - - bfq_update_wr_data(bfqd, bfqq); - - bfq_log_bfqq(bfqd, bfqq, - "dispatched %u sec req (%llu), budg left %d", - blk_rq_sectors(rq), - (unsigned long long) blk_rq_pos(rq), - bfq_bfqq_budget_left(bfqq)); - - dispatched++; - - if (!bfqd->in_service_bic) { - atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); - bfqd->in_service_bic = RQ_BIC(rq); - } - - if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && - dispatched >= bfqd->bfq_max_budget_async_rq) || - bfq_class_idle(bfqq))) - goto expire; - - return dispatched; - -expire: - bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); - return dispatched; -} - -static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -{ - int dispatched = 0; - - while (bfqq->next_rq) { - bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); - dispatched++; - } - - BUG_ON(!list_empty(&bfqq->fifo)); - return dispatched; -} - -/* - * Drain our current requests. - * Used for barriers and when switching io schedulers on-the-fly. - */ -static int bfq_forced_dispatch(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq, *n; - struct bfq_service_tree *st; - int dispatched = 0; - - bfqq = bfqd->in_service_queue; - if (bfqq) - __bfq_bfqq_expire(bfqd, bfqq); - - /* - * Loop through classes, and be careful to leave the scheduler - * in a consistent state, as feedback mechanisms and vtime - * updates cannot be disabled during the process. 
- */ - list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { - st = bfq_entity_service_tree(&bfqq->entity); - - dispatched += __bfq_forced_dispatch_bfqq(bfqq); - bfqq->max_budget = bfq_max_budget(bfqd); - - bfq_forget_idle(st); - } - - BUG_ON(bfqd->busy_queues != 0); - - return dispatched; -} - -static int bfq_dispatch_requests(struct request_queue *q, int force) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - int max_dispatch; - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); - if (bfqd->busy_queues == 0) - return 0; - - if (unlikely(force)) - return bfq_forced_dispatch(bfqd); - - bfqq = bfq_select_queue(bfqd); - if (!bfqq) - return 0; - - if (bfq_class_idle(bfqq)) - max_dispatch = 1; - - if (!bfq_bfqq_sync(bfqq)) - max_dispatch = bfqd->bfq_max_budget_async_rq; - - if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { - if (bfqd->busy_queues > 1) - return 0; - if (bfqq->dispatched >= 4 * max_dispatch) - return 0; - } - - if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) - return 0; - - bfq_clear_bfqq_wait_request(bfqq); - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); - - if (!bfq_dispatch_request(bfqd, bfqq)) - return 0; - - bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", - bfq_bfqq_sync(bfqq) ? "sync" : "async"); - - return 1; -} - -/* - * Task holds one reference to the queue, dropped when task exits. Each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * - * Queue lock must be held here. 
- */ -static void bfq_put_queue(struct bfq_queue *bfqq) -{ - struct bfq_data *bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_group *bfqg = bfqq_group(bfqq); -#endif - - BUG_ON(atomic_read(&bfqq->ref) <= 0); - - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, - atomic_read(&bfqq->ref)); - if (!atomic_dec_and_test(&bfqq->ref)) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); - BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqd->in_service_queue == bfqq); - - if (bfq_bfqq_sync(bfqq)) - /* - * The fact that this queue is being destroyed does not - * invalidate the fact that this queue may have been - * activated during the current burst. As a consequence, - * although the queue does not exist anymore, and hence - * needs to be removed from the burst list if there, - * the burst size has not to be decremented. - */ - hlist_del_init(&bfqq->burst_list_node); - - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_put(bfqg); -#endif -} - -static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - if (bfqq == bfqd->in_service_queue) { - __bfq_bfqq_expire(bfqd, bfqq); - bfq_schedule_dispatch(bfqd); - } - - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, - atomic_read(&bfqq->ref)); - - bfq_put_queue(bfqq); -} - -static void bfq_init_icq(struct io_cq *icq) -{ - struct bfq_io_cq *bic = icq_to_bic(icq); - - bic->ttime.last_end_request = jiffies; -} - -static void bfq_exit_icq(struct io_cq *icq) -{ - struct bfq_io_cq *bic = icq_to_bic(icq); - struct bfq_data *bfqd = bic_to_bfqd(bic); - - if (bic->bfqq[BLK_RW_ASYNC]) { - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); - bic->bfqq[BLK_RW_ASYNC] = NULL; - } - - if (bic->bfqq[BLK_RW_SYNC]) { - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); - bic->bfqq[BLK_RW_SYNC] = NULL; - } -} - -/* - * Update the entity prio values; note that the 
new values will not - * be used until the next (re)activation. - */ -static void -bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) -{ - struct task_struct *tsk = current; - int ioprio_class; - - ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - switch (ioprio_class) { - default: - dev_err(bfqq->bfqd->queue->backing_dev_info.dev, - "bfq: bad prio class %d\n", ioprio_class); - case IOPRIO_CLASS_NONE: - /* - * No prio set, inherit CPU scheduling settings. - */ - bfqq->new_ioprio = task_nice_ioprio(tsk); - bfqq->new_ioprio_class = task_nice_ioclass(tsk); - break; - case IOPRIO_CLASS_RT: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->new_ioprio_class = IOPRIO_CLASS_RT; - break; - case IOPRIO_CLASS_BE: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->new_ioprio_class = IOPRIO_CLASS_BE; - break; - case IOPRIO_CLASS_IDLE: - bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; - bfq_clear_bfqq_idle_window(bfqq); - break; - } - - if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { - pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", - bfqq->new_ioprio); - BUG(); - } - - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; -} - -static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -{ - struct bfq_data *bfqd; - struct bfq_queue *bfqq, *new_bfqq; - unsigned long uninitialized_var(flags); - int ioprio = bic->icq.ioc->ioprio; - - bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), - &flags); - /* - * This condition may trigger on a newly created bic, be sure to - * drop the lock before returning. 
- */ - if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) - goto out; - - bic->ioprio = ioprio; - - bfqq = bic->bfqq[BLK_RW_ASYNC]; - if (bfqq) { - new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, - GFP_ATOMIC); - if (new_bfqq) { - bic->bfqq[BLK_RW_ASYNC] = new_bfqq; - bfq_log_bfqq(bfqd, bfqq, - "check_ioprio_change: bfqq %p %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } - } - - bfqq = bic->bfqq[BLK_RW_SYNC]; - if (bfqq) - bfq_set_next_ioprio_data(bfqq, bic); - -out: - bfq_put_bfqd_unlock(bfqd, &flags); -} - -static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_io_cq *bic, pid_t pid, int is_sync) -{ - RB_CLEAR_NODE(&bfqq->entity.rb_node); - INIT_LIST_HEAD(&bfqq->fifo); - INIT_HLIST_NODE(&bfqq->burst_list_node); - - atomic_set(&bfqq->ref, 0); - bfqq->bfqd = bfqd; - - if (bic) - bfq_set_next_ioprio_data(bfqq, bic); - - if (is_sync) { - if (!bfq_class_idle(bfqq)) - bfq_mark_bfqq_idle_window(bfqq); - bfq_mark_bfqq_sync(bfqq); - } else - bfq_clear_bfqq_sync(bfqq); - bfq_mark_bfqq_IO_bound(bfqq); - - /* Tentative initial value to trade off between thr and lat */ - bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; - bfqq->pid = pid; - - bfqq->wr_coeff = 1; - bfqq->last_wr_start_finish = 0; - /* - * Set to the value for which bfqq will not be deemed as - * soft rt when it becomes backlogged. - */ - bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); -} - -static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, - gfp_t gfp_mask) -{ - struct bfq_group *bfqg; - struct bfq_queue *bfqq, *new_bfqq = NULL; - struct blkcg *blkcg; - -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - bfqg = bfq_find_alloc_group(bfqd, blkcg); - /* bic always exists here */ - bfqq = bic_to_bfqq(bic, is_sync); - - /* - * Always try a new alloc if we fall back to the OOM bfqq - * originally, since it should just be a temporary situation. 
- */ - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = NULL; - if (new_bfqq) { - bfqq = new_bfqq; - new_bfqq = NULL; - } else if (gfpflags_allow_blocking(gfp_mask)) { - rcu_read_unlock(); - spin_unlock_irq(bfqd->queue->queue_lock); - new_bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - spin_lock_irq(bfqd->queue->queue_lock); - if (new_bfqq) - goto retry; - } else { - bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - } - - if (bfqq) { - bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); - bfq_init_entity(&bfqq->entity, bfqg); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { - bfqq = &bfqd->oom_bfqq; - bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); - } - } - - if (new_bfqq) - kmem_cache_free(bfq_pool, new_bfqq); - - rcu_read_unlock(); - - return bfqq; -} - -static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, - struct bfq_group *bfqg, - int ioprio_class, int ioprio) -{ - switch (ioprio_class) { - case IOPRIO_CLASS_RT: - return &bfqg->async_bfqq[0][ioprio]; - case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; - /* fall through */ - case IOPRIO_CLASS_BE: - return &bfqg->async_bfqq[1][ioprio]; - case IOPRIO_CLASS_IDLE: - return &bfqg->async_idle_bfqq; - default: - BUG(); - } -} - -static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, gfp_t gfp_mask) -{ - const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - struct bfq_queue **async_bfqq = NULL; - struct bfq_queue *bfqq = NULL; - - if (!is_sync) { - struct blkcg *blkcg; - struct bfq_group *bfqg; - - rcu_read_lock(); - blkcg = bio_blkcg(bio); - rcu_read_unlock(); - bfqg = bfq_find_alloc_group(bfqd, blkcg); - async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); - bfqq = *async_bfqq; - } - - if (!bfqq) - bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); - - /* - * Pin the 
queue now that it's allocated, scheduler exit will - * prune it. - */ - if (!is_sync && !(*async_bfqq)) { - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", - bfqq, atomic_read(&bfqq->ref)); - *async_bfqq = bfqq; - } - - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, - atomic_read(&bfqq->ref)); - return bfqq; -} - -static void bfq_update_io_thinktime(struct bfq_data *bfqd, - struct bfq_io_cq *bic) -{ - unsigned long elapsed = jiffies - bic->ttime.last_end_request; - unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); - - bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; - bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; - bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / - bic->ttime.ttime_samples; -} - -static void bfq_update_io_seektime(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *rq) -{ - sector_t sdist; - u64 total; - - if (bfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - bfqq->last_request_pos; - else - sdist = bfqq->last_request_pos - blk_rq_pos(rq); - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc. - */ - if (bfqq->seek_samples == 0) /* first request, not really a seek */ - sdist = 0; - else if (bfqq->seek_samples <= 60) /* second & third seek */ - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); - - bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; - bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; - total = bfqq->seek_total + (bfqq->seek_samples/2); - do_div(total, bfqq->seek_samples); - bfqq->seek_mean = (sector_t)total; - - bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, - (u64)bfqq->seek_mean); -} - -/* - * Disable idle window if the process thinks too long or seeks so much that - * it doesn't matter. 
- */ -static void bfq_update_idle_window(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_io_cq *bic) -{ - int enable_idle; - - /* Don't idle for async or idle io prio class. */ - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) - return; - - enable_idle = bfq_bfqq_idle_window(bfqq); - - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || - bfqd->bfq_slice_idle == 0 || - (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && - bfqq->wr_coeff == 1)) - enable_idle = 0; - else if (bfq_sample_valid(bic->ttime.ttime_samples)) { - if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && - bfqq->wr_coeff == 1) - enable_idle = 0; - else - enable_idle = 1; - } - bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", - enable_idle); - - if (enable_idle) - bfq_mark_bfqq_idle_window(bfqq); - else - bfq_clear_bfqq_idle_window(bfqq); -} - -/* - * Called when a new fs request (rq) is added to bfqq. Check if there's - * something we should do about it. - */ -static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct request *rq) -{ - struct bfq_io_cq *bic = RQ_BIC(rq); - - if (rq->cmd_flags & REQ_META) - bfqq->meta_pending++; - - bfq_update_io_thinktime(bfqd, bic); - bfq_update_io_seektime(bfqd, bfqq, rq); - if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { - bfq_clear_bfqq_constantly_seeky(bfqq); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, bic); - - bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), - (unsigned long long) bfqq->seek_mean); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { - bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && - blk_rq_sectors(rq) < 32; - 
bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); - - /* - * There is just this request queued: if the request - * is small and the queue is not to be expired, then - * just exit. - * - * In this way, if the disk is being idled to wait for - * a new request from the in-service queue, we avoid - * unplugging the device and committing the disk to serve - * just a small request. On the contrary, we wait for - * the block layer to decide when to unplug the device: - * hopefully, new requests will be merged to this one - * quickly, then the device will be unplugged and - * larger requests will be dispatched. - */ - if (small_req && !budget_timeout) - return; - - /* - * A large enough request arrived, or the queue is to - * be expired: in both cases disk idling is to be - * stopped, so clear wait_request flag and reset - * timer. - */ - bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif - - /* - * The queue is not empty, because a new request just - * arrived. Hence we can safely expire the queue, in - * case of budget timeout, without risking that the - * timestamps of the queue are not updated correctly. - * See [1] for more details. - */ - if (budget_timeout) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); - - /* - * Let the request rip immediately, or let a new queue be - * selected if bfqq has just been expired. 
- */ - __blk_run_queue(bfqd->queue); - } -} - -static void bfq_insert_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - assert_spin_locked(bfqd->queue->queue_lock); - - bfq_add_request(rq); - - rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -} - -static void bfq_update_hw_tag(struct bfq_data *bfqd) -{ - bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, - bfqd->rq_in_driver); - - if (bfqd->hw_tag == 1) - return; - - /* - * This sample is valid if the number of outstanding requests - * is large enough to allow a queueing behavior. Note that the - * sum is not exact, as it's not taking into account deactivated - * requests. - */ - if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) - return; - - if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) - return; - - bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; - bfqd->max_rq_in_driver = 0; - bfqd->hw_tag_samples = 0; -} - -static void bfq_completed_request(struct request_queue *q, struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - bool sync = bfq_bfqq_sync(bfqq); - - bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", - blk_rq_sectors(rq), sync); - - bfq_update_hw_tag(bfqd); - - BUG_ON(!bfqd->rq_in_driver); - BUG_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_completion(bfqq_group(bfqq), - rq_start_time_ns(rq), - rq_io_start_time_ns(rq), rq->cmd_flags); -#endif - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->busy_in_flight_queues); - bfqd->busy_in_flight_queues--; - if (bfq_bfqq_constantly_seeky(bfqq)) { - 
BUG_ON(!bfqd-> - const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } - } - - if (sync) { - bfqd->sync_flight--; - RQ_BIC(rq)->ttime.last_end_request = jiffies; - } - - /* - * If we are waiting to discover whether the request pattern of the - * task associated with the queue is actually isochronous, and - * both requisites for this condition to hold are satisfied, then - * compute soft_rt_next_start (see the comments to the function - * bfq_bfqq_softrt_next_start()). - */ - if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && - RB_EMPTY_ROOT(&bfqq->sort_list)) - bfqq->soft_rt_next_start = - bfq_bfqq_softrt_next_start(bfqd, bfqq); - - /* - * If this is the in-service queue, check if it needs to be expired, - * or if we want to idle in case it has no pending requests. - */ - if (bfqd->in_service_queue == bfqq) { - if (bfq_bfqq_budget_new(bfqq)) - bfq_set_budget_timeout(bfqd); - - if (bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); - goto out; - } else if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); - else if (RB_EMPTY_ROOT(&bfqq->sort_list) && - (bfqq->dispatched == 0 || - !bfq_bfqq_may_idle(bfqq))) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_NO_MORE_REQUESTS); - } - - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); - -out: - return; -} - -static int __bfq_may_queue(struct bfq_queue *bfqq) -{ - if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { - bfq_clear_bfqq_must_alloc(bfqq); - return ELV_MQUEUE_MUST; - } - - return ELV_MQUEUE_MAY; -} - -static int bfq_may_queue(struct request_queue *q, int rw) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; - struct bfq_io_cq *bic; - struct bfq_queue *bfqq; - - /* - * Don't force setup of a queue from here, as a call to may_queue - * does not necessarily imply that a request actually will be - * queued. 
So just lookup a possibly existing queue, or return - * 'may queue' if that fails. - */ - bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (!bic) - return ELV_MQUEUE_MAY; - - bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); - if (bfqq) - return __bfq_may_queue(bfqq); - - return ELV_MQUEUE_MAY; -} - -/* - * Queue lock held here. - */ -static void bfq_put_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - if (bfqq) { - const int rw = rq_data_dir(rq); - - BUG_ON(!bfqq->allocated[rw]); - bfqq->allocated[rw]--; - - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } -} - -/* - * Allocate bfq data structures associated with this request. - */ -static int bfq_set_request(struct request_queue *q, struct request *rq, - struct bio *bio, gfp_t gfp_mask) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); - const int rw = rq_data_dir(rq); - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - unsigned long flags; - - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); - - bfq_check_ioprio_change(bic, bio); - - spin_lock_irqsave(q->queue_lock, flags); - - if (!bic) - goto queue_fail; - - bfq_bic_update_cgroup(bic, bio); - - bfqq = bic_to_bfqq(bic, is_sync); - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); - bic_set_bfqq(bic, bfqq, is_sync); - if (is_sync) { - if (bfqd->large_burst) - bfq_mark_bfqq_in_large_burst(bfqq); - else - bfq_clear_bfqq_in_large_burst(bfqq); - } - } - - bfqq->allocated[rw]++; - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, - atomic_read(&bfqq->ref)); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; - - spin_unlock_irqrestore(q->queue_lock, flags); - - return 0; - -queue_fail: - bfq_schedule_dispatch(bfqd); - spin_unlock_irqrestore(q->queue_lock, flags); - - return 1; 
-} - -static void bfq_kick_queue(struct work_struct *work) -{ - struct bfq_data *bfqd = - container_of(work, struct bfq_data, unplug_work); - struct request_queue *q = bfqd->queue; - - spin_lock_irq(q->queue_lock); - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); -} - -/* - * Handler of the expiration of the timer running if the in-service queue - * is idling inside its time slice. - */ -static void bfq_idle_slice_timer(unsigned long data) -{ - struct bfq_data *bfqd = (struct bfq_data *)data; - struct bfq_queue *bfqq; - unsigned long flags; - enum bfqq_expiration reason; - - spin_lock_irqsave(bfqd->queue->queue_lock, flags); - - bfqq = bfqd->in_service_queue; - /* - * Theoretical race here: the in-service queue can be NULL or - * different from the queue that was idling if the timer handler - * spins on the queue_lock and a new request arrives for the - * current queue and there is a full dispatch cycle that changes - * the in-service queue. This can hardly happen, but in the worst - * case we just expire a queue too early. - */ - if (bfqq) { - bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); - if (bfq_bfqq_budget_timeout(bfqq)) - /* - * Also here the queue can be safely expired - * for budget timeout without wasting - * guarantees - */ - reason = BFQ_BFQQ_BUDGET_TIMEOUT; - else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) - /* - * The queue may not be empty upon timer expiration, - * because we may not disable the timer when the - * first request of the in-service queue arrives - * during disk idling. 
- */ - reason = BFQ_BFQQ_TOO_IDLE; - else - goto schedule_dispatch; - - bfq_bfqq_expire(bfqd, bfqq, true, reason); - } - -schedule_dispatch: - bfq_schedule_dispatch(bfqd); - - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -} - -static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -{ - del_timer_sync(&bfqd->idle_slice_timer); - cancel_work_sync(&bfqd->unplug_work); -} - -static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_queue **bfqq_ptr) -{ - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - - bfq_log(bfqd, "put_async_bfqq: %p", bfqq); - if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); - bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; - } -} - -/* - * Release all the bfqg references to its async queues. If we are - * deallocating the group these queues may still contain requests, so - * we reparent them to the root cgroup (i.e., the only one that will - * exist for sure until all the requests on a device are gone). 
- */ -static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -{ - int i, j; - - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) - __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); - - __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -} - -static void bfq_exit_queue(struct elevator_queue *e) -{ - struct bfq_data *bfqd = e->elevator_data; - struct request_queue *q = bfqd->queue; - struct bfq_queue *bfqq, *n; - - bfq_shutdown_timer_wq(bfqd); - - spin_lock_irq(q->queue_lock); - - BUG_ON(bfqd->in_service_queue); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, 0); - - spin_unlock_irq(q->queue_lock); - - bfq_shutdown_timer_wq(bfqd); - - synchronize_rcu(); - - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); -#else - kfree(bfqd->root_group); -#endif - - kfree(bfqd); -} - -static void bfq_init_root_group(struct bfq_group *root_group, - struct bfq_data *bfqd) -{ - int i; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -#endif - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -} - -static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -{ - struct bfq_data *bfqd; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - - bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); - if (!bfqd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } - eq->elevator_data = bfqd; - - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. - * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. 
- */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); - atomic_inc(&bfqd->oom_bfqq.ref); - bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; - bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; - bfqd->oom_bfqq.entity.new_weight = - bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); - /* - * Trigger weight initialization, according to ioprio, at the - * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio - * class won't be changed any more. - */ - bfqd->oom_bfqq.entity.prio_changed = 1; - - bfqd->queue = q; - - spin_lock_irq(q->queue_lock); - q->elevator = eq; - spin_unlock_irq(q->queue_lock); - - bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); - if (!bfqd->root_group) - goto out_free; - bfq_init_root_group(bfqd->root_group, bfqd); - bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqd->active_numerous_groups = 0; -#endif - - init_timer(&bfqd->idle_slice_timer); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; - bfqd->idle_slice_timer.data = (unsigned long)bfqd; - - bfqd->queue_weights_tree = RB_ROOT; - bfqd->group_weights_tree = RB_ROOT; - - INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); - - INIT_LIST_HEAD(&bfqd->active_list); - INIT_LIST_HEAD(&bfqd->idle_list); - INIT_HLIST_HEAD(&bfqd->burst_list); - - bfqd->hw_tag = -1; - - bfqd->bfq_max_budget = bfq_default_max_budget; - - bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; - bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; - bfqd->bfq_back_max = bfq_back_max; - bfqd->bfq_back_penalty = bfq_back_penalty; - bfqd->bfq_slice_idle = bfq_slice_idle; - bfqd->bfq_class_idle_last_service = 0; - bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; - bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; - bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; - - bfqd->bfq_requests_within_timer = 120; - - bfqd->bfq_large_burst_thresh = 11; - bfqd->bfq_burst_interval = msecs_to_jiffies(500); - - bfqd->low_latency = true; - - bfqd->bfq_wr_coeff = 
20; - bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_wr_max_time = 0; - bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); - bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); - bfqd->bfq_wr_max_softrt_rate = 7000; /* - * Approximate rate required - * to playback or record a - * high-definition compressed - * video. - */ - bfqd->wr_busy_queues = 0; - bfqd->busy_in_flight_queues = 0; - bfqd->const_seeky_busy_in_flight_queues = 0; - - /* - * Begin by assuming, optimistically, that the device peak rate is - * equal to the highest reference rate. - */ - bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * - T_fast[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; - bfqd->device_speed = BFQ_BFQD_FAST; - - return 0; - -out_free: - kfree(bfqd); - kobject_put(&eq->kobj); - return -ENOMEM; -} - -static void bfq_slab_kill(void) -{ - kmem_cache_destroy(bfq_pool); -} - -static int __init bfq_slab_setup(void) -{ - bfq_pool = KMEM_CACHE(bfq_queue, 0); - if (!bfq_pool) - return -ENOMEM; - return 0; -} - -static ssize_t bfq_var_show(unsigned int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static ssize_t bfq_var_store(unsigned long *var, const char *page, - size_t count) -{ - unsigned long new_val; - int ret = kstrtoul(page, 10, &new_val); - - if (ret == 0) - *var = new_val; - - return count; -} - -static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -{ - struct bfq_data *bfqd = e->elevator_data; - - return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
- jiffies_to_msecs(bfqd->bfq_wr_max_time) : - jiffies_to_msecs(bfq_wr_duration(bfqd))); -} - -static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -{ - struct bfq_queue *bfqq; - struct bfq_data *bfqd = e->elevator_data; - ssize_t num_char = 0; - - num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", - bfqd->queued); - - spin_lock_irq(bfqd->queue->queue_lock); - - num_char += sprintf(page + num_char, "Active:\n"); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { - num_char += sprintf(page + num_char, - "pid%d: weight %hu, nr_queued %d %d, ", - bfqq->pid, - bfqq->entity.weight, - bfqq->queued[0], - bfqq->queued[1]); - num_char += sprintf(page + num_char, - "dur %d/%u\n", - jiffies_to_msecs( - jiffies - - bfqq->last_wr_start_finish), - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - - num_char += sprintf(page + num_char, "Idle:\n"); - list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { - num_char += sprintf(page + num_char, - "pid%d: weight %hu, dur %d/%u\n", - bfqq->pid, - bfqq->entity.weight, - jiffies_to_msecs(jiffies - - bfqq->last_wr_start_finish), - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - - spin_unlock_irq(bfqd->queue->queue_lock); - - return num_char; -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return bfq_var_show(__data, (page)); \ -} -SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); -SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); -SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); -SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -SHOW_FUNCTION(bfq_max_budget_async_rq_show, - 
bfqd->bfq_max_budget_async_rq, 0); -SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); -SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); -SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, - 1); -SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t \ -__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned long uninitialized_var(__data); \ - int ret = bfq_var_store(&__data, (page), count); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ - return ret; \ -} -STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, - INT_MAX, 1); -STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, - INT_MAX, 1); -STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, - INT_MAX, 0); -STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, - 1, INT_MAX, 0); -STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, - INT_MAX, 1); -STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, - 1); 
-STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, - INT_MAX, 1); -STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, - &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, - INT_MAX, 0); -#undef STORE_FUNCTION - -/* do nothing for the moment */ -static ssize_t bfq_weights_store(struct elevator_queue *e, - const char *page, size_t count) -{ - return count; -} - -static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) -{ - u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - - if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) - return bfq_calc_max_budget(bfqd->peak_rate, timeout); - else - return bfq_default_max_budget; -} - -static ssize_t bfq_max_budget_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); - - if (__data == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); - else { - if (__data > INT_MAX) - __data = INT_MAX; - bfqd->bfq_max_budget = __data; - } - - bfqd->bfq_user_max_budget = __data; - - return ret; -} - -static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); - - if (__data < 1) - __data = 1; - else if (__data > INT_MAX) - __data = INT_MAX; - - bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); - if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); - - return ret; -} - -static ssize_t bfq_low_latency_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); - - if (__data > 1) - __data 
= 1; - if (__data == 0 && bfqd->low_latency != 0) - bfq_end_wr(bfqd); - bfqd->low_latency = __data; - - return ret; -} - -#define BFQ_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) - -static struct elv_fs_entry bfq_attrs[] = { - BFQ_ATTR(fifo_expire_sync), - BFQ_ATTR(fifo_expire_async), - BFQ_ATTR(back_seek_max), - BFQ_ATTR(back_seek_penalty), - BFQ_ATTR(slice_idle), - BFQ_ATTR(max_budget), - BFQ_ATTR(max_budget_async_rq), - BFQ_ATTR(timeout_sync), - BFQ_ATTR(timeout_async), - BFQ_ATTR(low_latency), - BFQ_ATTR(wr_coeff), - BFQ_ATTR(wr_max_time), - BFQ_ATTR(wr_rt_max_time), - BFQ_ATTR(wr_min_idle_time), - BFQ_ATTR(wr_min_inter_arr_async), - BFQ_ATTR(wr_max_softrt_rate), - BFQ_ATTR(weights), - __ATTR_NULL -}; - -static struct elevator_type iosched_bfq = { - .ops = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, -#ifdef CONFIG_BFQ_GROUP_IOSCHED - .elevator_bio_merged_fn = bfq_bio_merged, -#endif - .elevator_allow_merge_fn = bfq_allow_merge, - .elevator_dispatch_fn = bfq_dispatch_requests, - .elevator_add_req_fn = bfq_insert_request, - .elevator_activate_req_fn = bfq_activate_request, - .elevator_deactivate_req_fn = bfq_deactivate_request, - .elevator_completed_req_fn = bfq_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_init_icq_fn = bfq_init_icq, - .elevator_exit_icq_fn = bfq_exit_icq, - .elevator_set_req_fn = bfq_set_request, - .elevator_put_req_fn = bfq_put_request, - .elevator_may_queue_fn = bfq_may_queue, - .elevator_init_fn = bfq_init_queue, - .elevator_exit_fn = bfq_exit_queue, - }, - .icq_size = sizeof(struct bfq_io_cq), - .icq_align = __alignof__(struct bfq_io_cq), - .elevator_attrs = bfq_attrs, - .elevator_name = "bfq", - .elevator_owner = THIS_MODULE, -}; - -static int __init bfq_init(void) -{ - int ret; - - /* - * Can be 0 on HZ < 1000 setups. 
- */ - if (bfq_slice_idle == 0) - bfq_slice_idle = 1; - - if (bfq_timeout_async == 0) - bfq_timeout_async = 1; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if (ret) - return ret; -#endif - - ret = -ENOMEM; - if (bfq_slab_setup()) - goto err_pol_unreg; - - /* - * Times to load large popular applications for the typical systems - * installed on the reference devices (see the comments before the - * definitions of the two arrays). - */ - T_slow[0] = msecs_to_jiffies(2600); - T_slow[1] = msecs_to_jiffies(1000); - T_fast[0] = msecs_to_jiffies(5500); - T_fast[1] = msecs_to_jiffies(2000); - - /* - * Thresholds that determine the switch between speed classes (see - * the comments before the definition of the array). - */ - device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; - device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; - - ret = elv_register(&iosched_bfq); - if (ret) - goto err_pol_unreg; - - pr_info("BFQ I/O-scheduler: v7r11"); - - return 0; - -err_pol_unreg: -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_bfq); -#endif - return ret; -} - -static void __exit bfq_exit(void) -{ - elv_unregister(&iosched_bfq); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_bfq); -#endif - bfq_slab_kill(); -} - -module_init(bfq_init); -module_exit(bfq_exit); - -MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -MODULE_LICENSE("GPL"); diff --git a/block/bfq-sched.c b/block/bfq-sched.c deleted file mode 100644 index a5ed6948471a..000000000000 --- a/block/bfq-sched.c +++ /dev/null @@ -1,1199 +0,0 @@ -/* - * BFQ: Hierarchical B-WF2Q+ scheduler. 
- * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2010 Paolo Valente - */ - -#ifdef CONFIG_BFQ_GROUP_IOSCHED -#define for_each_entity(entity) \ - for (; entity ; entity = entity->parent) - -#define for_each_entity_safe(entity, parent) \ - for (; entity && ({ parent = entity->parent; 1; }); entity = parent) - - -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd); - -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - -static void bfq_update_budget(struct bfq_entity *next_in_service) -{ - struct bfq_entity *bfqg_entity; - struct bfq_group *bfqg; - struct bfq_sched_data *group_sd; - - BUG_ON(!next_in_service); - - group_sd = next_in_service->sched_data; - - bfqg = container_of(group_sd, struct bfq_group, sched_data); - /* - * bfq_group's my_entity field is not NULL only if the group - * is not the root group. We must not touch the root entity - * as it must never become an in-service entity. - */ - bfqg_entity = bfqg->my_entity; - if (bfqg_entity) - bfqg_entity->budget = next_in_service->budget; -} - -static int bfq_update_next_in_service(struct bfq_sched_data *sd) -{ - struct bfq_entity *next_in_service; - - if (sd->in_service_entity) - /* will update/requeue at the end of service */ - return 0; - - /* - * NOTE: this can be improved in many ways, such as returning - * 1 (and thus propagating upwards the update) only when the - * budget changes, or caching the bfqq that will be scheduled - * next from this subtree. By now we worry more about - * correctness than about performance... 
- */ - next_in_service = bfq_lookup_next_entity(sd, 0, NULL); - sd->next_in_service = next_in_service; - - if (next_in_service) - bfq_update_budget(next_in_service); - - return 1; -} - -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) -{ - BUG_ON(sd->next_in_service != entity); -} -#else -#define for_each_entity(entity) \ - for (; entity ; entity = NULL) - -#define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity ; entity = parent) - -static int bfq_update_next_in_service(struct bfq_sched_data *sd) -{ - return 0; -} - -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) -{ -} - -static void bfq_update_budget(struct bfq_entity *next_in_service) -{ -} -#endif - -/* - * Shift for timestamp calculations. This actually limits the maximum - * service allowed in one timestamp delta (small shift values increase it), - * the maximum total weight that can be used for the queues in the system - * (big shift values increase it), and the period of virtual time - * wraparounds. - */ -#define WFQ_SERVICE_SHIFT 22 - -/** - * bfq_gt - compare two timestamps. - * @a: first ts. - * @b: second ts. - * - * Return @a > @b, dealing with wrapping correctly. - */ -static int bfq_gt(u64 a, u64 b) -{ - return (s64)(a - b) > 0; -} - -static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = NULL; - - BUG_ON(!entity); - - if (!entity->my_sched_data) - bfqq = container_of(entity, struct bfq_queue, entity); - - return bfqq; -} - - -/** - * bfq_delta - map service into the virtual time domain. - * @service: amount of service. - * @weight: scale factor (weight of an entity or weight sum). - */ -static u64 bfq_delta(unsigned long service, unsigned long weight) -{ - u64 d = (u64)service << WFQ_SERVICE_SHIFT; - - do_div(d, weight); - return d; -} - -/** - * bfq_calc_finish - assign the finish time to an entity. - * @entity: the entity to act upon. 
- * @service: the service to be charged to the entity. - */ -static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(entity->weight == 0); - - entity->finish = entity->start + - bfq_delta(service, entity->weight); - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: start %llu, finish %llu, delta %llu", - entity->start, entity->finish, - bfq_delta(service, entity->weight)); - } -} - -/** - * bfq_entity_of - get an entity from a node. - * @node: the node field of the entity. - * - * Convert a node pointer to the relative entity. This is used only - * to simplify the logic of some functions and not as the generic - * conversion mechanism because, e.g., in the tree walking functions, - * the check for a %NULL value would be redundant. - */ -static struct bfq_entity *bfq_entity_of(struct rb_node *node) -{ - struct bfq_entity *entity = NULL; - - if (node) - entity = rb_entry(node, struct bfq_entity, rb_node); - - return entity; -} - -/** - * bfq_extract - remove an entity from a tree. - * @root: the tree root. - * @entity: the entity to remove. - */ -static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) -{ - BUG_ON(entity->tree != root); - - entity->tree = NULL; - rb_erase(&entity->rb_node, root); -} - -/** - * bfq_idle_extract - extract an entity from the idle tree. - * @st: the service tree of the owning @entity. - * @entity: the entity being removed. 
- */ -static void bfq_idle_extract(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *next; - - BUG_ON(entity->tree != &st->idle); - - if (entity == st->first_idle) { - next = rb_next(&entity->rb_node); - st->first_idle = bfq_entity_of(next); - } - - if (entity == st->last_idle) { - next = rb_prev(&entity->rb_node); - st->last_idle = bfq_entity_of(next); - } - - bfq_extract(&st->idle, entity); - - if (bfqq) - list_del(&bfqq->bfqq_list); -} - -/** - * bfq_insert - generic tree insertion. - * @root: tree root. - * @entity: entity to insert. - * - * This is used for the idle and the active tree, since they are both - * ordered by finish time. - */ -static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -{ - struct bfq_entity *entry; - struct rb_node **node = &root->rb_node; - struct rb_node *parent = NULL; - - BUG_ON(entity->tree); - - while (*node) { - parent = *node; - entry = rb_entry(parent, struct bfq_entity, rb_node); - - if (bfq_gt(entry->finish, entity->finish)) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&entity->rb_node, parent, node); - rb_insert_color(&entity->rb_node, root); - - entity->tree = root; -} - -/** - * bfq_update_min - update the min_start field of a entity. - * @entity: the entity to update. - * @node: one of its children. - * - * This function is called when @entity may store an invalid value for - * min_start due to updates to the active tree. The function assumes - * that the subtree rooted at @node (which may be its left or its right - * child) has a valid min_start value. 
- */ -static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) -{ - struct bfq_entity *child; - - if (node) { - child = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entity->min_start, child->min_start)) - entity->min_start = child->min_start; - } -} - -/** - * bfq_update_active_node - recalculate min_start. - * @node: the node to update. - * - * @node may have changed position or one of its children may have moved, - * this function updates its min_start value. The left and right subtrees - * are assumed to hold a correct min_start value. - */ -static void bfq_update_active_node(struct rb_node *node) -{ - struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); - - entity->min_start = entity->start; - bfq_update_min(entity, node->rb_right); - bfq_update_min(entity, node->rb_left); -} - -/** - * bfq_update_active_tree - update min_start for the whole active tree. - * @node: the starting node. - * - * @node must be the deepest modified node after an update. This function - * updates its min_start using the values held by its children, assuming - * that they did not change, and then updates all the nodes that may have - * changed in the path to the root. The only nodes that may have changed - * are the ones in the path or their siblings. 
- */ -static void bfq_update_active_tree(struct rb_node *node) -{ - struct rb_node *parent; - -up: - bfq_update_active_node(node); - - parent = rb_parent(node); - if (!parent) - return; - - if (node == parent->rb_left && parent->rb_right) - bfq_update_active_node(parent->rb_right); - else if (parent->rb_left) - bfq_update_active_node(parent->rb_left); - - node = parent; - goto up; -} - -static void bfq_weights_tree_add(struct bfq_data *bfqd, - struct bfq_entity *entity, - struct rb_root *root); - -static void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_entity *entity, - struct rb_root *root); - - -/** - * bfq_active_insert - insert an entity in the active tree of its - * group/device. - * @st: the service tree of the entity. - * @entity: the entity being inserted. - * - * The active tree is ordered by finish time, but an extra key is kept - * per each node, containing the minimum value for the start times of - * its children (and the node itself), so it's possible to search for - * the eligible node with the lowest finish time in logarithmic time. 
- */ -static void bfq_active_insert(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif - - bfq_insert(&st->active, entity); - - if (node->rb_left) - node = node->rb_left; - else if (node->rb_right) - node = node->rb_right; - - bfq_update_active_tree(node); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif - if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); - } - if (bfqg != bfqd->root_group) { - BUG_ON(!bfqg); - BUG_ON(!bfqd); - bfqg->active_entities++; - if (bfqg->active_entities == 2) - bfqd->active_numerous_groups++; - } -#endif -} - -/** - * bfq_ioprio_to_weight - calc a weight from an ioprio. - * @ioprio: the ioprio value to convert. - */ -static unsigned short bfq_ioprio_to_weight(int ioprio) -{ - BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; -} - -/** - * bfq_weight_to_ioprio - calc an ioprio from a weight. - * @weight: the weight value to convert. - * - * To preserve as much as possible the old only-ioprio user interface, - * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. - */ -static unsigned short bfq_weight_to_ioprio(int weight) -{ - BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? 
- 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; -} - -static void bfq_get_entity(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - if (bfqq) { - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); - } -} - -/** - * bfq_find_deepest - find the deepest node that an extraction can modify. - * @node: the node being removed. - * - * Do the first step of an extraction in an rb tree, looking for the - * node that will replace @node, and returning the deepest node that - * the following modifications to the tree can touch. If @node is the - * last node in the tree return %NULL. - */ -static struct rb_node *bfq_find_deepest(struct rb_node *node) -{ - struct rb_node *deepest; - - if (!node->rb_right && !node->rb_left) - deepest = rb_parent(node); - else if (!node->rb_right) - deepest = node->rb_left; - else if (!node->rb_left) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; -} - -/** - * bfq_active_extract - remove an entity from the active tree. - * @st: the service_tree containing the tree. - * @entity: the entity being removed. 
- */ -static void bfq_active_extract(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif - - node = bfq_find_deepest(&entity->rb_node); - bfq_extract(&st->active, entity); - - if (node) - bfq_update_active_tree(node); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif - if (bfqq) - list_del(&bfqq->bfqq_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_remove(bfqd, entity, - &bfqd->group_weights_tree); - } - if (bfqg != bfqd->root_group) { - BUG_ON(!bfqg); - BUG_ON(!bfqd); - BUG_ON(!bfqg->active_entities); - bfqg->active_entities--; - if (bfqg->active_entities == 1) { - BUG_ON(!bfqd->active_numerous_groups); - bfqd->active_numerous_groups--; - } - } -#endif -} - -/** - * bfq_idle_insert - insert an entity into the idle tree. - * @st: the service tree containing the tree. - * @entity: the entity to insert. - */ -static void bfq_idle_insert(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct bfq_entity *first_idle = st->first_idle; - struct bfq_entity *last_idle = st->last_idle; - - if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) - st->first_idle = entity; - if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) - st->last_idle = entity; - - bfq_insert(&st->idle, entity); - - if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -} - -/** - * bfq_forget_entity - remove an entity from the wfq trees. - * @st: the service tree. - * @entity: the entity being removed. 
- * - * Update the device status and forget everything about @entity, putting - * the device reference to it, if it is a queue. Entities belonging to - * groups are not refcounted. - */ -static void bfq_forget_entity(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct bfq_sched_data *sd; - - BUG_ON(!entity->on_st); - - entity->on_st = 0; - st->wsum -= entity->weight; - if (bfqq) { - sd = entity->sched_data; - bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } -} - -/** - * bfq_put_idle_entity - release the idle tree ref of an entity. - * @st: service tree for the entity. - * @entity: the entity being released. - */ -static void bfq_put_idle_entity(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - bfq_idle_extract(st, entity); - bfq_forget_entity(st, entity); -} - -/** - * bfq_forget_idle - update the idle tree if necessary. - * @st: the service tree to act upon. - * - * To preserve the global O(log N) complexity we only remove one entry here; - * as the idle tree will not grow indefinitely this can be done safely. - */ -static void bfq_forget_idle(struct bfq_service_tree *st) -{ - struct bfq_entity *first_idle = st->first_idle; - struct bfq_entity *last_idle = st->last_idle; - - if (RB_EMPTY_ROOT(&st->active) && last_idle && - !bfq_gt(last_idle->finish, st->vtime)) { - /* - * Forget the whole idle tree, increasing the vtime past - * the last finish time of idle entities. 
- */ - st->vtime = last_idle->finish; - } - - if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) - bfq_put_idle_entity(st, first_idle); -} - -static struct bfq_service_tree * -__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - struct bfq_entity *entity) -{ - struct bfq_service_tree *new_st = old_st; - - if (entity->prio_changed) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned short prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root *root; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd; - struct bfq_group *bfqg; -#endif - - if (bfqq) - bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - sd = entity->my_sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); - bfqd = (struct bfq_data *)bfqg->bfqd; - BUG_ON(!bfqd); - } -#endif - - BUG_ON(old_st->wsum < entity->weight); - old_st->wsum -= entity->weight; - - if (entity->new_weight != entity->orig_weight) { - if (entity->new_weight < BFQ_MIN_WEIGHT || - entity->new_weight > BFQ_MAX_WEIGHT) { - pr_crit("update_weight_prio: new_weight %d\n", - entity->new_weight); - BUG(); - } - entity->orig_weight = entity->new_weight; - if (bfqq) - bfqq->ioprio = - bfq_weight_to_ioprio(entity->orig_weight); - } - - if (bfqq) - bfqq->ioprio_class = bfqq->new_ioprio_class; - entity->prio_changed = 0; - - /* - * NOTE: here we may be changing the weight too early, - * this will cause unfairness. The correct approach - * would have required additional complexity to defer - * weight changes to the proper time instants (i.e., - * when entity->finish <= old_st->vtime). - */ - new_st = bfq_entity_service_tree(entity); - - prev_weight = entity->weight; - new_weight = entity->orig_weight * - (bfqq ? bfqq->wr_coeff : 1); - /* - * If the weight of the entity changes, remove the entity - * from its old weight counter (if there is a counter - * associated with the entity), and add it to the counter - * associated with its new weight. 
- */ - if (prev_weight != new_weight) { - root = bfqq ? &bfqd->queue_weights_tree : - &bfqd->group_weights_tree; - bfq_weights_tree_remove(bfqd, entity, root); - } - entity->weight = new_weight; - /* - * Add the entity to its weights tree only if it is - * not associated with a weight-raised queue. - */ - if (prev_weight != new_weight && - (bfqq ? bfqq->wr_coeff == 1 : 1)) - /* If we get here, root has been initialized. */ - bfq_weights_tree_add(bfqd, entity, root); - - new_st->wsum += entity->weight; - - if (new_st != old_st) - entity->start = new_st->vtime; - } - - return new_st; -} - -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -#endif - -/** - * bfq_bfqq_served - update the scheduler status after selection for - * service. - * @bfqq: the queue being served. - * @served: bytes to transfer. - * - * NOTE: this can be optimized, as the timestamps of upper level entities - * are synchronized every time a new bfqq is selected for service. By now, - * we keep it to better check consistency. - */ -static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st; - - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); - - entity->service += served; - BUG_ON(entity->service > entity->budget); - BUG_ON(st->wsum == 0); - - st->vtime += bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -#endif - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); -} - -/** - * bfq_bfqq_charge_full_budget - set the service to the entity budget. - * @bfqq: the queue that needs a service update. - * - * When it's not possible to be fair in the service domain, because - * a queue is not consuming its budget fast enough (the meaning of - * fast depends on the timeout parameter), we charge it a full - * budget. 
In this way we should obtain a sort of time-domain - * fairness among all the seeky/slow queues. - */ -static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); - - bfq_bfqq_served(bfqq, entity->budget - entity->service); -} - -/** - * __bfq_activate_entity - activate an entity. - * @entity: the entity being activated. - * - * Called whenever an entity is activated, i.e., it is not active and one - * of its children receives a new request, or has to be reactivated due to - * budget exhaustion. It uses the current budget of the entity (and the - * service received if @entity is active) of the queue to calculate its - * timestamps. - */ -static void __bfq_activate_entity(struct bfq_entity *entity) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - - if (entity == sd->in_service_entity) { - BUG_ON(entity->tree); - /* - * If we are requeueing the current entity we have - * to take care of not charging to it service it has - * not received. - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; - sd->in_service_entity = NULL; - } else if (entity->tree == &st->active) { - /* - * Requeueing an entity due to a change of some - * next_in_service entity below it. We reuse the - * old start time. - */ - bfq_active_extract(st, entity); - } else if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); - entity->start = bfq_gt(st->vtime, entity->finish) ? - st->vtime : entity->finish; - } else { - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. 
- */ - entity->start = st->vtime; - st->wsum += entity->weight; - bfq_get_entity(entity); - - BUG_ON(entity->on_st); - entity->on_st = 1; - } - - st = __bfq_entity_update_weight_prio(st, entity); - bfq_calc_finish(entity, entity->budget); - bfq_active_insert(st, entity); -} - -/** - * bfq_activate_entity - activate an entity and its ancestors if necessary. - * @entity: the entity to activate. - * - * Activate @entity and all the entities on the path from it to the root. - */ -static void bfq_activate_entity(struct bfq_entity *entity) -{ - struct bfq_sched_data *sd; - - for_each_entity(entity) { - __bfq_activate_entity(entity); - - sd = entity->sched_data; - if (!bfq_update_next_in_service(sd)) - /* - * No need to propagate the activation to the - * upper entities, as they will be updated when - * the in-service entity is rescheduled. - */ - break; - } -} - -/** - * __bfq_deactivate_entity - deactivate an entity from its service tree. - * @entity: the entity to deactivate. - * @requeue: if false, the entity will not be put into the idle tree. - * - * Deactivate an entity, independently from its previous state. If the - * entity was not on a service tree just return, otherwise if it is on - * any scheduler tree, extract it from that tree, and if necessary - * and if the caller did not specify @requeue, put it on the idle tree. - * - * Return %1 if the caller should update the entity hierarchy, i.e., - * if the entity was in service or if it was the next_in_service for - * its sched_data; return %0 otherwise. 
- */ -static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st; - int was_in_service; - int ret = 0; - - if (sd == NULL || !entity->on_st) /* never activated, or inactive */ - return 0; - - st = bfq_entity_service_tree(entity); - was_in_service = entity == sd->in_service_entity; - - BUG_ON(was_in_service && entity->tree); - - if (was_in_service) { - bfq_calc_finish(entity, entity->service); - sd->in_service_entity = NULL; - } else if (entity->tree == &st->active) - bfq_active_extract(st, entity); - else if (entity->tree == &st->idle) - bfq_idle_extract(st, entity); - else if (entity->tree) - BUG(); - - if (was_in_service || sd->next_in_service == entity) - ret = bfq_update_next_in_service(sd); - - if (!requeue || !bfq_gt(entity->finish, st->vtime)) - bfq_forget_entity(st, entity); - else - bfq_idle_insert(st, entity); - - BUG_ON(sd->in_service_entity == entity); - BUG_ON(sd->next_in_service == entity); - - return ret; -} - -/** - * bfq_deactivate_entity - deactivate an entity. - * @entity: the entity to deactivate. - * @requeue: true if the entity can be put on the idle tree - */ -static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -{ - struct bfq_sched_data *sd; - struct bfq_entity *parent; - - for_each_entity_safe(entity, parent) { - sd = entity->sched_data; - - if (!__bfq_deactivate_entity(entity, requeue)) - /* - * The parent entity is still backlogged, and - * we don't need to update it as it is still - * in service. - */ - break; - - if (sd->next_in_service) - /* - * The parent entity is still backlogged and - * the budgets on the path towards the root - * need to be updated. - */ - goto update; - - /* - * If we reach there the parent is no more backlogged and - * we want to propagate the dequeue upwards. 
- */ - requeue = 1; - } - - return; - -update: - entity = parent; - for_each_entity(entity) { - __bfq_activate_entity(entity); - - sd = entity->sched_data; - if (!bfq_update_next_in_service(sd)) - break; - } -} - -/** - * bfq_update_vtime - update vtime if necessary. - * @st: the service tree to act upon. - * - * If necessary update the service tree vtime to have at least one - * eligible entity, skipping to its start time. Assumes that the - * active tree of the device is not empty. - * - * NOTE: this hierarchical implementation updates vtimes quite often, - * we may end up with reactivated processes getting timestamps after a - * vtime skip done because we needed a ->first_active entity on some - * intermediate node. - */ -static void bfq_update_vtime(struct bfq_service_tree *st) -{ - struct bfq_entity *entry; - struct rb_node *node = st->active.rb_node; - - entry = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entry->min_start, st->vtime)) { - st->vtime = entry->min_start; - bfq_forget_idle(st); - } -} - -/** - * bfq_first_active_entity - find the eligible entity with - * the smallest finish time - * @st: the service tree to select from. - * - * This function searches the first schedulable entity, starting from the - * root of the tree and going on the left every time on this side there is - * a subtree with at least one eligible (start >= vtime) entity. The path on - * the right is followed only if a) the left subtree contains no eligible - * entities and b) no eligible entity has been found yet. 
- */ -static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) -{ - struct bfq_entity *entry, *first = NULL; - struct rb_node *node = st->active.rb_node; - - while (node) { - entry = rb_entry(node, struct bfq_entity, rb_node); -left: - if (!bfq_gt(entry->start, st->vtime)) - first = entry; - - BUG_ON(bfq_gt(entry->min_start, st->vtime)); - - if (node->rb_left) { - entry = rb_entry(node->rb_left, - struct bfq_entity, rb_node); - if (!bfq_gt(entry->min_start, st->vtime)) { - node = node->rb_left; - goto left; - } - } - if (first) - break; - node = node->rb_right; - } - - BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); - return first; -} - -/** - * __bfq_lookup_next_entity - return the first eligible entity in @st. - * @st: the service tree. - * - * Update the virtual time in @st and return the first eligible entity - * it contains. - */ -static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - bool force) -{ - struct bfq_entity *entity, *new_next_in_service = NULL; - - if (RB_EMPTY_ROOT(&st->active)) - return NULL; - - bfq_update_vtime(st); - entity = bfq_first_active_entity(st); - BUG_ON(bfq_gt(entity->start, st->vtime)); - - /* - * If the chosen entity does not match with the sched_data's - * next_in_service and we are forcedly serving the IDLE priority - * class tree, bubble up budget update. - */ - if (unlikely(force && entity != entity->sched_data->next_in_service)) { - new_next_in_service = entity; - for_each_entity(new_next_in_service) - bfq_update_budget(new_next_in_service); - } - - return entity; -} - -/** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. - * @extract: if true the returned entity will be also extracted from @sd. 
- * - * NOTE: since we cache the next_in_service entity at each level of the - * hierarchy, the complexity of the lookup can be decreased with - * absolutely no effort just returning the cached next_in_service value; - * we prefer to do full lookups to test the consistency of * the data - * structures. - */ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd) -{ - struct bfq_service_tree *st = sd->service_tree; - struct bfq_entity *entity; - int i = 0; - - BUG_ON(sd->in_service_entity); - - if (bfqd && - jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { - entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, - true); - if (entity) { - i = BFQ_IOPRIO_CLASSES - 1; - bfqd->bfq_class_idle_last_service = jiffies; - sd->next_in_service = entity; - } - } - for (; i < BFQ_IOPRIO_CLASSES; i++) { - entity = __bfq_lookup_next_entity(st + i, false); - if (entity) { - if (extract) { - bfq_check_next_in_service(sd, entity); - bfq_active_extract(st + i, entity); - sd->in_service_entity = entity; - sd->next_in_service = NULL; - } - break; - } - } - - return entity; -} - -/* - * Get next queue for service. 
- */ -static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -{ - struct bfq_entity *entity = NULL; - struct bfq_sched_data *sd; - struct bfq_queue *bfqq; - - BUG_ON(bfqd->in_service_queue); - - if (bfqd->busy_queues == 0) - return NULL; - - sd = &bfqd->root_group->sched_data; - for (; sd ; sd = entity->my_sched_data) { - entity = bfq_lookup_next_entity(sd, 1, bfqd); - BUG_ON(!entity); - entity->service = 0; - } - - bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(!bfqq); - - return bfqq; -} - -static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) -{ - if (bfqd->in_service_bic) { - put_io_context(bfqd->in_service_bic->icq.ioc); - bfqd->in_service_bic = NULL; - } - - bfqd->in_service_queue = NULL; - del_timer(&bfqd->idle_slice_timer); -} - -static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) -{ - struct bfq_entity *entity = &bfqq->entity; - - if (bfqq == bfqd->in_service_queue) - __bfq_bfqd_reset_in_service(bfqd); - - bfq_deactivate_entity(entity, requeue); -} - -static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - bfq_activate_entity(entity); -} - -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -#endif - -/* - * Called when the bfqq no longer has requests pending, remove it from - * the service tree. 
- */ -static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) -{ - BUG_ON(!bfq_bfqq_busy(bfqq)); - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - bfq_log_bfqq(bfqd, bfqq, "del from busy"); - - bfq_clear_bfqq_busy(bfqq); - - BUG_ON(bfqd->busy_queues == 0); - bfqd->busy_queues--; - - if (!bfqq->dispatched) { - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->busy_in_flight_queues); - bfqd->busy_in_flight_queues--; - if (bfq_bfqq_constantly_seeky(bfqq)) { - BUG_ON(!bfqd-> - const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } - } - if (bfqq->wr_coeff > 1) - bfqd->wr_busy_queues--; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dequeue(bfqq_group(bfqq)); -#endif - - bfq_deactivate_bfqq(bfqd, bfqq, requeue); -} - -/* - * Called when an inactive queue receives a new request. - */ -static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqq == bfqd->in_service_queue); - - bfq_log_bfqq(bfqd, bfqq, "add to busy"); - - bfq_activate_bfqq(bfqd, bfqq); - - bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; - - if (!bfqq->dispatched) { - if (bfqq->wr_coeff == 1) - bfq_weights_tree_add(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - bfqd->busy_in_flight_queues++; - if (bfq_bfqq_constantly_seeky(bfqq)) - bfqd->const_seeky_busy_in_flight_queues++; - } - } - if (bfqq->wr_coeff > 1) - bfqd->wr_busy_queues++; -} diff --git a/block/bfq.h b/block/bfq.h deleted file mode 100644 index 2bf54ae89ff0..000000000000 --- a/block/bfq.h +++ /dev/null @@ -1,801 +0,0 @@ -/* - * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. 
- * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2010 Paolo Valente - */ - -#ifndef _BFQ_H -#define _BFQ_H - -#include -#include -#include -#include -#include - -#define BFQ_IOPRIO_CLASSES 3 -#define BFQ_CL_IDLE_TIMEOUT (HZ/5) - -#define BFQ_MIN_WEIGHT 1 -#define BFQ_MAX_WEIGHT 1000 -#define BFQ_WEIGHT_CONVERSION_COEFF 10 - -#define BFQ_DEFAULT_QUEUE_IOPRIO 4 - -#define BFQ_DEFAULT_GRP_WEIGHT 10 -#define BFQ_DEFAULT_GRP_IOPRIO 0 -#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE - -struct bfq_entity; - -/** - * struct bfq_service_tree - per ioprio_class service tree. - * @active: tree for active entities (i.e., those backlogged). - * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). - * @first_idle: idle entity with minimum F_i. - * @last_idle: idle entity with maximum F_i. - * @vtime: scheduler virtual time. - * @wsum: scheduler weight sum; active and idle entities contribute to it. - * - * Each service tree represents a B-WF2Q+ scheduler on its own. Each - * ioprio_class has its own independent scheduler, and so its own - * bfq_service_tree. All the fields are protected by the queue lock - * of the containing bfqd. - */ -struct bfq_service_tree { - struct rb_root active; - struct rb_root idle; - - struct bfq_entity *first_idle; - struct bfq_entity *last_idle; - - u64 vtime; - unsigned long wsum; -}; - -/** - * struct bfq_sched_data - multi-class scheduler. - * @in_service_entity: entity in service. - * @next_in_service: head-of-the-line entity in the scheduler. - * @service_tree: array of service trees, one per ioprio_class. - * - * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as - * an intermediate queue on a hierarchical setup. - * @next_in_service points to the active entity of the sched_data - * service trees that will be scheduled next. 
- * - * The supported ioprio_classes are the same as in CFQ, in descending - * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. - * Requests from higher priority queues are served before all the - * requests from lower priority queues; among requests of the same - * queue requests are served according to B-WF2Q+. - * All the fields are protected by the queue lock of the containing bfqd. - */ -struct bfq_sched_data { - struct bfq_entity *in_service_entity; - struct bfq_entity *next_in_service; - struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -}; - -/** - * struct bfq_weight_counter - counter of the number of all active entities - * with a given weight. - * @weight: weight of the entities that this counter refers to. - * @num_active: number of active entities with this weight. - * @weights_node: weights tree member (see bfq_data's @queue_weights_tree - * and @group_weights_tree). - */ -struct bfq_weight_counter { - short int weight; - unsigned int num_active; - struct rb_node weights_node; -}; - -/** - * struct bfq_entity - schedulable entity. - * @rb_node: service_tree member. - * @weight_counter: pointer to the weight counter associated with this entity. - * @on_st: flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree). - * @finish: B-WF2Q+ finish timestamp (aka F_i). - * @start: B-WF2Q+ start timestamp (aka S_i). - * @tree: tree the entity is enqueued into; %NULL if not on a tree. - * @min_start: minimum start time of the (active) subtree rooted at - * this entity; used for O(log N) lookups into active trees. - * @service: service received during the last round of service. - * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. - * @weight: weight of the queue - * @parent: parent entity, for hierarchical scheduling. - * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the - * associated scheduler queue, %NULL on leaf nodes. 
- * @sched_data: the scheduler queue this entity belongs to. - * @ioprio: the ioprio in use. - * @new_weight: when a weight change is requested, the new weight value. - * @orig_weight: original weight, used to implement weight boosting - * @prio_changed: flag, true when the user requested a weight, ioprio or - * ioprio_class change. - * - * A bfq_entity is used to represent either a bfq_queue (leaf node in the - * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each - * entity belongs to the sched_data of the parent group in the cgroup - * hierarchy. Non-leaf entities have also their own sched_data, stored - * in @my_sched_data. - * - * Each entity stores independently its priority values; this would - * allow different weights on different devices, but this - * functionality is not exported to userspace by now. Priorities and - * weights are updated lazily, first storing the new values into the - * new_* fields, then setting the @prio_changed flag. As soon as - * there is a transition in the entity state that allows the priority - * update to take place the effective and the requested priority - * values are synchronized. - * - * Unless cgroups are used, the weight value is calculated from the - * ioprio to export the same interface as CFQ. When dealing with - * ``well-behaved'' queues (i.e., queues that do not spend too much - * time to consume their budget and have true sequential behavior, and - * when there are no external factors breaking anticipation) the - * relative weights at each level of the cgroups hierarchy should be - * guaranteed. All the fields are protected by the queue lock of the - * containing bfqd. 
- */ -struct bfq_entity { - struct rb_node rb_node; - struct bfq_weight_counter *weight_counter; - - int on_st; - - u64 finish; - u64 start; - - struct rb_root *tree; - - u64 min_start; - - int service, budget; - unsigned short weight, new_weight; - unsigned short orig_weight; - - struct bfq_entity *parent; - - struct bfq_sched_data *my_sched_data; - struct bfq_sched_data *sched_data; - - int prio_changed; -}; - -struct bfq_group; - -/** - * struct bfq_queue - leaf schedulable entity. - * @ref: reference counter. - * @bfqd: parent bfq_data. - * @new_ioprio: when an ioprio change is requested, the new ioprio value. - * @ioprio_class: the ioprio_class in use. - * @new_ioprio_class: when an ioprio_class change is requested, the new - * ioprio_class value. - * @new_bfqq: shared bfq_queue if queue is cooperating with - * one or more other queues. - * @sort_list: sorted list of pending requests. - * @next_rq: if fifo isn't expired, next request to serve. - * @queued: nr of requests queued in @sort_list. - * @allocated: currently allocated requests. - * @meta_pending: pending metadata requests. - * @fifo: fifo list of requests in sort_list. - * @entity: entity representing this queue in the scheduler. - * @max_budget: maximum budget allowed from the feedback mechanism. - * @budget_timeout: budget expiration (in jiffies). - * @dispatched: number of requests on the dispatch list or inside driver. - * @flags: status flags. - * @bfqq_list: node for active/idle bfqq list inside our bfqd. - * @burst_list_node: node for the device's burst list. 
- * @seek_samples: number of seeks sampled - * @seek_total: sum of the distances of the seeks sampled - * @seek_mean: mean seek distance - * @last_request_pos: position of the last request enqueued - * @requests_within_timer: number of consecutive pairs of request completion - * and arrival, such that the queue becomes idle - * after the completion, but the next request arrives - * within an idle time slice; used only if the queue's - * IO_bound has been cleared. - * @pid: pid of the process owning the queue, used for logging purposes. - * @last_wr_start_finish: start time of the current weight-raising period if - * the @bfq-queue is being weight-raised, otherwise - * finish time of the last weight-raising period - * @wr_cur_max_time: current max raising time for this queue - * @soft_rt_next_start: minimum time instant such that, only if a new - * request is enqueued after this time instant in an - * idle @bfq_queue with no outstanding requests, then - * the task associated with the queue it is deemed as - * soft real-time (see the comments to the function - * bfq_bfqq_softrt_next_start()) - * @last_idle_bklogged: time of the last transition of the @bfq_queue from - * idle to backlogged - * @service_from_backlogged: cumulative service received from the @bfq_queue - * since the last transition from idle to - * backlogged - * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the - * queue is shared - * - * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async or shared between cooperating - * processes. @cgroup holds a reference to the cgroup, to be sure that it - * does not disappear while a bfqq still references it (mostly to avoid - * races between request issuing and task migration followed by cgroup - * destruction). - * All the fields are protected by the queue lock of the containing bfqd. 
- */ -struct bfq_queue { - atomic_t ref; - struct bfq_data *bfqd; - - unsigned short ioprio, new_ioprio; - unsigned short ioprio_class, new_ioprio_class; - - /* fields for cooperating queues handling */ - struct bfq_queue *new_bfqq; - struct rb_node pos_node; - struct rb_root *pos_root; - - struct rb_root sort_list; - struct request *next_rq; - int queued[2]; - int allocated[2]; - int meta_pending; - struct list_head fifo; - - struct bfq_entity entity; - - int max_budget; - unsigned long budget_timeout; - - int dispatched; - - unsigned int flags; - - struct list_head bfqq_list; - - struct hlist_node burst_list_node; - - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; - sector_t last_request_pos; - - unsigned int requests_within_timer; - - pid_t pid; - struct bfq_io_cq *bic; - - /* weight-raising fields */ - unsigned long wr_cur_max_time; - unsigned long soft_rt_next_start; - unsigned long last_wr_start_finish; - unsigned int wr_coeff; - unsigned long last_idle_bklogged; - unsigned long service_from_backlogged; -}; - -/** - * struct bfq_ttime - per process thinktime stats. - * @ttime_total: total process thinktime - * @ttime_samples: number of thinktime samples - * @ttime_mean: average process thinktime - */ -struct bfq_ttime { - unsigned long last_end_request; - - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; -}; - -/** - * struct bfq_io_cq - per (request_queue, io_context) structure. - * @icq: associated io_cq structure - * @bfqq: array of two process queues, the sync and the async - * @ttime: associated @bfq_ttime struct - * @ioprio: per (request_queue, blkcg) ioprio. - * @blkcg_id: id of the blkcg the related io_cq belongs to. 
- */ -struct bfq_io_cq { - struct io_cq icq; /* must be the first member */ - struct bfq_queue *bfqq[2]; - struct bfq_ttime ttime; - int ioprio; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ -#endif -}; - -enum bfq_device_speed { - BFQ_BFQD_FAST, - BFQ_BFQD_SLOW, -}; - -/** - * struct bfq_data - per device data structure. - * @queue: request queue for the managed device. - * @root_group: root bfq_group for the device. - * @active_numerous_groups: number of bfq_groups containing more than one - * active @bfq_entity. - * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by - * weight. Used to keep track of whether all @bfq_queues - * have the same weight. The tree contains one counter - * for each distinct weight associated to some active - * and not weight-raised @bfq_queue (see the comments to - * the functions bfq_weights_tree_[add|remove] for - * further details). - * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted - * by weight. Used to keep track of whether all - * @bfq_groups have the same weight. The tree contains - * one counter for each distinct weight associated to - * some active @bfq_group (see the comments to the - * functions bfq_weights_tree_[add|remove] for further - * details). - * @busy_queues: number of bfq_queues containing requests (including the - * queue in service, even if it is idling). - * @busy_in_flight_queues: number of @bfq_queues containing pending or - * in-flight requests, plus the @bfq_queue in - * service, even if idle but waiting for the - * possible arrival of its next sync request. This - * field is updated only if the device is rotational, - * but used only if the device is also NCQ-capable. - * The reason why the field is updated also for non- - * NCQ-capable rotational devices is related to the - * fact that the value of @hw_tag may be set also - * later than when busy_in_flight_queues may need to - * be incremented for the first time(s). 
Taking also - * this possibility into account, to avoid unbalanced - * increments/decrements, would imply more overhead - * than just updating busy_in_flight_queues - * regardless of the value of @hw_tag. - * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues - * (that is, seeky queues that expired - * for budget timeout at least once) - * containing pending or in-flight - * requests, including the in-service - * @bfq_queue if constantly seeky. This - * field is updated only if the device - * is rotational, but used only if the - * device is also NCQ-capable (see the - * comments to @busy_in_flight_queues). - * @wr_busy_queues: number of weight-raised busy @bfq_queues. - * @queued: number of queued requests. - * @rq_in_driver: number of requests dispatched and waiting for completion. - * @sync_flight: number of sync requests in the driver. - * @max_rq_in_driver: max number of reqs in driver in the last - * @hw_tag_samples completed requests. - * @hw_tag_samples: nr of samples used to calculate hw_tag. - * @hw_tag: flag set to one if the driver is showing a queueing behavior. - * @budgets_assigned: number of budgets assigned. - * @idle_slice_timer: timer set when idling for the next sequential request - * from the queue in service. - * @unplug_work: delayed work to restart dispatching on the request queue. - * @in_service_queue: bfq_queue in service. - * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. - * @last_position: on-disk position of the last served request. - * @last_budget_start: beginning of the last budget. - * @last_idling_start: beginning of the last idle slice. - * @peak_rate: peak transfer rate observed for a budget. - * @peak_rate_samples: number of samples used to calculate @peak_rate. - * @bfq_max_budget: maximum budget allotted to a bfq_queue before - * rescheduling. - * @active_list: list of all the bfq_queues active on the device. - * @idle_list: list of all the bfq_queues idle on the device. 
- * @bfq_fifo_expire: timeout for async/sync requests; when it expires - * requests are served in fifo order. - * @bfq_back_penalty: weight of backward seeks wrt forward ones. - * @bfq_back_max: maximum allowed backward seek. - * @bfq_slice_idle: maximum idling time. - * @bfq_user_max_budget: user-configured max budget value - * (0 for auto-tuning). - * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to - * async queues. - * @bfq_timeout: timeout for bfq_queues to consume their budget; used to - * to prevent seeky queues to impose long latencies to well - * behaved ones (this also implies that seeky queues cannot - * receive guarantees in the service domain; after a timeout - * they are charged for the whole allocated budget, to try - * to preserve a behavior reasonably fair among them, but - * without service-domain guarantees). - * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is - * no more granted any weight-raising. - * @bfq_failed_cooperations: number of consecutive failed cooperation - * chances after which weight-raising is restored - * to a queue subject to more than bfq_coop_thresh - * queue merges. - * @bfq_requests_within_timer: number of consecutive requests that must be - * issued within the idle time slice to set - * again idling to a queue which was marked as - * non-I/O-bound (see the definition of the - * IO_bound flag for further details). - * @last_ins_in_burst: last time at which a queue entered the current - * burst of queues being activated shortly after - * each other; for more details about this and the - * following parameters related to a burst of - * activations, see the comments to the function - * @bfq_handle_burst. - * @bfq_burst_interval: reference time interval used to decide whether a - * queue has been activated shortly after - * @last_ins_in_burst. - * @burst_size: number of queues in the current burst of queue activations. 
- * @bfq_large_burst_thresh: maximum burst size above which the current - * queue-activation burst is deemed as 'large'. - * @large_burst: true if a large queue-activation burst is in progress. - * @burst_list: head of the burst list (as for the above fields, more details - * in the comments to the function bfq_handle_burst). - * @low_latency: if set to true, low-latency heuristics are enabled. - * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised - * queue is multiplied. - * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). - * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. - * @bfq_wr_min_idle_time: minimum idle period after which weight-raising - * may be reactivated for a queue (in jiffies). - * @bfq_wr_min_inter_arr_async: minimum period between request arrivals - * after which weight-raising may be - * reactivated for an already busy queue - * (in jiffies). - * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, - * sectors per seconds. - * @RT_prod: cached value of the product R*T used for computing the maximum - * duration of the weight raising automatically. - * @device_speed: device-speed class for the low-latency heuristic. - * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. - * - * All the fields are protected by the @queue lock. 
- */ -struct bfq_data { - struct request_queue *queue; - - struct bfq_group *root_group; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - int active_numerous_groups; -#endif - - struct rb_root queue_weights_tree; - struct rb_root group_weights_tree; - - int busy_queues; - int busy_in_flight_queues; - int const_seeky_busy_in_flight_queues; - int wr_busy_queues; - int queued; - int rq_in_driver; - int sync_flight; - - int max_rq_in_driver; - int hw_tag_samples; - int hw_tag; - - int budgets_assigned; - - struct timer_list idle_slice_timer; - struct work_struct unplug_work; - - struct bfq_queue *in_service_queue; - struct bfq_io_cq *in_service_bic; - - sector_t last_position; - - ktime_t last_budget_start; - ktime_t last_idling_start; - int peak_rate_samples; - u64 peak_rate; - int bfq_max_budget; - - struct list_head active_list; - struct list_head idle_list; - - unsigned int bfq_fifo_expire[2]; - unsigned int bfq_back_penalty; - unsigned int bfq_back_max; - unsigned int bfq_slice_idle; - u64 bfq_class_idle_last_service; - - int bfq_user_max_budget; - int bfq_max_budget_async_rq; - unsigned int bfq_timeout[2]; - - unsigned int bfq_coop_thresh; - unsigned int bfq_failed_cooperations; - unsigned int bfq_requests_within_timer; - - unsigned long last_ins_in_burst; - unsigned long bfq_burst_interval; - int burst_size; - unsigned long bfq_large_burst_thresh; - bool large_burst; - struct hlist_head burst_list; - - bool low_latency; - - /* parameters of the low_latency heuristics */ - unsigned int bfq_wr_coeff; - unsigned int bfq_wr_max_time; - unsigned int bfq_wr_rt_max_time; - unsigned int bfq_wr_min_idle_time; - unsigned long bfq_wr_min_inter_arr_async; - unsigned int bfq_wr_max_softrt_rate; - u64 RT_prod; - enum bfq_device_speed device_speed; - - struct bfq_queue oom_bfqq; -}; - -enum bfqq_state_flags { - BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ - BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - 
BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ - BFQ_BFQQ_FLAG_IO_bound, /* - * bfqq has timed-out at least once - * having consumed at most 2/10 of - * its budget - */ - BFQ_BFQQ_FLAG_in_large_burst, /* - * bfqq activated in a large burst, - * see comments to bfq_handle_burst. - */ - BFQ_BFQQ_FLAG_constantly_seeky, /* - * bfqq has proved to be slow and - * seeky until budget timeout - */ - BFQ_BFQQ_FLAG_softrt_update, /* - * may need softrt-next-start - * update - */ -}; - -#define BFQ_BFQQ_FNS(name) \ -static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -} \ -static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -} \ -static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -{ \ - return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -} - -BFQ_BFQQ_FNS(busy); -BFQ_BFQQ_FNS(wait_request); -BFQ_BFQQ_FNS(must_alloc); -BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(idle_window); -BFQ_BFQQ_FNS(sync); -BFQ_BFQQ_FNS(budget_new); -BFQ_BFQQ_FNS(IO_bound); -BFQ_BFQQ_FNS(in_large_burst); -BFQ_BFQQ_FNS(constantly_seeky); -BFQ_BFQQ_FNS(softrt_update); -#undef BFQ_BFQQ_FNS - -/* Logging facilities. */ -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) - -#define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) - -/* Expiration reasons. 
*/ -enum bfqq_expiration { - BFQ_BFQQ_TOO_IDLE = 0, /* - * queue has been idling for - * too long - */ - BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ - BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ - BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -}; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - -struct bfqg_stats { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ - struct blkg_rwstat service_time; - /* total time spent waiting in scheduler queue in ns */ - struct blkg_rwstat wait_time; - /* number of IOs queued up */ - struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; - /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; - /* time not charged to this cgroup */ - struct blkg_stat unaccounted_time; - /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; - /* count of samples taken for average */ - struct blkg_stat avg_queue_size_samples; - /* how many times this group has been removed from service tree */ - struct blkg_stat dequeue; - /* total time spent waiting for it to be assigned a timeslice. */ - struct blkg_stat group_wait_time; - /* time spent idling for this blkcg_gq */ - struct blkg_stat idle_time; - /* total time with empty current active q with other requests queued */ - struct blkg_stat empty_time; - /* fields after this shouldn't be cleared on stat reset */ - uint64_t start_group_wait_time; - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; -}; - -/* - * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
- * - * @ps: @blkcg_policy_storage that this structure inherits - * @weight: weight of the bfq_group - */ -struct bfq_group_data { - /* must be the first member */ - struct blkcg_policy_data pd; - - unsigned short weight; -}; - -/** - * struct bfq_group - per (device, cgroup) data structure. - * @entity: schedulable entity to insert into the parent group sched_data. - * @sched_data: own sched_data, to contain child entities (they may be - * both bfq_queues and bfq_groups). - * @bfqd: the bfq_data for the device this group acts upon. - * @async_bfqq: array of async queues for all the tasks belonging to - * the group, one queue per ioprio value per ioprio_class, - * except for the idle class that has only one queue. - * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). - * @my_entity: pointer to @entity, %NULL for the toplevel group; used - * to avoid too many special cases during group creation/ - * migration. - * @active_entities: number of active entities belonging to the group; - * unused for the root group. Used to know whether there - * are groups with more than one active @bfq_entity - * (see the comments to the function - * bfq_bfqq_must_not_expire()). - * - * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup - * there is a set of bfq_groups, each one collecting the lower-level - * entities belonging to the group that are acting on the same device. - * - * Locking works as follows: - * o @bfqd is protected by the queue lock, RCU is used to access it - * from the readers. - * o All the other fields are protected by the @bfqd queue lock. 
- */ -struct bfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - - struct bfq_entity entity; - struct bfq_sched_data sched_data; - - void *bfqd; - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; - - struct bfq_entity *my_entity; - - int active_entities; - - struct bfqg_stats stats; - struct bfqg_stats dead_stats; /* stats pushed from dead children */ -}; - -#else -struct bfq_group { - struct bfq_sched_data sched_data; - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; -}; -#endif - -static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - -static struct bfq_service_tree * -bfq_entity_service_tree(struct bfq_entity *entity) -{ - struct bfq_sched_data *sched_data = entity->sched_data; - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS; - - BUG_ON(idx >= BFQ_IOPRIO_CLASSES); - BUG_ON(sched_data == NULL); - - return sched_data->service_tree + idx; -} - -static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -{ - return bic->bfqq[is_sync]; -} - -static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, - bool is_sync) -{ - bic->bfqq[is_sync] = bfqq; -} - -static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -{ - return bic->icq.q->elevator->elevator_data; -} - -/** - * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. - * @ptr: a pointer to a bfqd. - * @flags: storage for the flags to be saved. - * - * This function allows bfqg->bfqd to be protected by the - * queue lock of the bfqd they reference; the pointer is dereferenced - * under RCU, so the storage for bfqd is assured to be safe as long - * as the RCU read side critical section does not end. After the - * bfqd->queue->queue_lock is taken the pointer is rechecked, to be - * sure that no other writer accessed it. 
If we raced with a writer, - * the function returns NULL, with the queue unlocked, otherwise it - * returns the dereferenced pointer, with the queue locked. - */ -static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) -{ - struct bfq_data *bfqd; - - rcu_read_lock(); - bfqd = rcu_dereference(*(struct bfq_data **)ptr); - - if (bfqd != NULL) { - spin_lock_irqsave(bfqd->queue->queue_lock, *flags); - if (ptr == NULL) - printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); - else if (*ptr == bfqd) - goto out; - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); - } - - bfqd = NULL; -out: - rcu_read_unlock(); - return bfqd; -} - -static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) -{ - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -} - -static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -static void bfq_put_queue(struct bfq_queue *bfqq); -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, gfp_t gfp_mask); -static void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg); -static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); - -#endif /* _BFQ_H */ From 9e3c43ae00756d99f52b89c756adb1837f9bafec Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:00:43 +0800 Subject: [PATCH 42/59] Revert "block: cgroups, kconfig, build bits for BFQ-v7r11-4.5.0" This reverts commit 4a0767b99d94be5222b747996acf2c6436dcbe35. 
--- block/Kconfig.iosched | 32 -------------------------------- block/Makefile | 1 - include/linux/blkdev.h | 2 +- 3 files changed, 1 insertion(+), 34 deletions(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 9e25d45a4c33..421bef9c4c48 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -39,27 +39,6 @@ config CFQ_GROUP_IOSCHED ---help--- Enable group IO scheduling in CFQ. -config IOSCHED_BFQ - tristate "BFQ I/O scheduler" - default n - ---help--- - The BFQ I/O scheduler tries to distribute bandwidth among - all processes according to their weights. - It aims at distributing the bandwidth as desired, independently of - the disk parameters and with any workload. It also tries to - guarantee low latency to interactive and soft real-time - applications. If compiled built-in (saying Y here), BFQ can - be configured to support hierarchical scheduling. - -config CGROUP_BFQIO - bool "BFQ hierarchical scheduling support" - depends on CGROUPS && IOSCHED_BFQ=y - default n - ---help--- - Enable hierarchical scheduling in BFQ, using the cgroups - filesystem interface. The name of the subsystem will be - bfqio. - choice prompt "Default I/O scheduler" default DEFAULT_CFQ @@ -76,16 +55,6 @@ choice config DEFAULT_NOOP bool "No-op" - config DEFAULT_BFQ - bool "BFQ" if IOSCHED_BFQ=y - help - Selects BFQ as the default I/O scheduler which will be - used by default for all block devices. - The BFQ I/O scheduler aims at distributing the bandwidth - as desired, independently of the disk parameters and with - any workload. It also tries to guarantee low latency to - interactive and soft real-time applications. 
- endchoice config DEFAULT_IOSCHED @@ -93,7 +62,6 @@ config DEFAULT_IOSCHED default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP - default "bfq" if DEFAULT_BFQ endmenu diff --git a/block/Makefile b/block/Makefile index 736e91a2ca1c..36acdd7545be 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1778f36ac1ce..9c41956dc9ca 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -45,7 +45,7 @@ struct pr_ops; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. 
*/ -#define BLKCG_MAX_POLS 3 +#define BLKCG_MAX_POLS 2 typedef void (rq_end_io_fn)(struct request *, int); From 6cf83fc4d2f7375c1540da7223c6638a295bdf05 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:39:23 +0800 Subject: [PATCH 43/59] =?UTF-8?q?KernelSU:=20=E5=BC=95=E5=85=A5KernelSU?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/kernelsu/Kconfig | 70 ++ drivers/kernelsu/LICENSE | 339 ++++++ drivers/kernelsu/Makefile | 54 + drivers/kernelsu/allowlist.c | 564 ++++++++++ drivers/kernelsu/allowlist.h | 52 + drivers/kernelsu/apk_sign.c | 387 +++++++ drivers/kernelsu/apk_sign.h | 9 + drivers/kernelsu/app_profile.c | 191 ++++ drivers/kernelsu/app_profile.h | 70 ++ drivers/kernelsu/arch.h | 121 +++ drivers/kernelsu/core_hook.c | 452 ++++++++ drivers/kernelsu/core_hook.h | 21 + drivers/kernelsu/extras.c | 219 ++++ drivers/kernelsu/feature.c | 174 ++++ drivers/kernelsu/feature.h | 39 + drivers/kernelsu/file_wrapper.c | 628 ++++++++++++ drivers/kernelsu/file_wrapper.h | 10 + drivers/kernelsu/kernel_compat.c | 173 ++++ drivers/kernelsu/kernel_compat.h | 112 ++ drivers/kernelsu/klog.h | 11 + drivers/kernelsu/kp_ksud.c | 221 ++++ drivers/kernelsu/ksu.c | 141 +++ drivers/kernelsu/ksu.h | 29 + drivers/kernelsu/ksud.c | 719 +++++++++++++ drivers/kernelsu/ksud.h | 26 + drivers/kernelsu/manager.h | 44 + drivers/kernelsu/rp_sucompat.c | 102 ++ drivers/kernelsu/selinux/rules.c | 454 +++++++++ drivers/kernelsu/selinux/selinux.c | 210 ++++ drivers/kernelsu/selinux/selinux.h | 43 + drivers/kernelsu/selinux/sepolicy.c | 1134 +++++++++++++++++++++ drivers/kernelsu/selinux/sepolicy.h | 46 + drivers/kernelsu/su_mount_ns.c | 239 +++++ drivers/kernelsu/su_mount_ns.h | 10 + drivers/kernelsu/sucompat.c | 410 ++++++++ drivers/kernelsu/sucompat.h | 
10 + drivers/kernelsu/supercalls.c | 955 +++++++++++++++++ drivers/kernelsu/supercalls.h | 166 +++ drivers/kernelsu/syscall_table_hook.c | 523 ++++++++++ drivers/kernelsu/syscall_table_hook_arm.c | 320 ++++++ drivers/kernelsu/throne_tracker.c | 415 ++++++++ drivers/kernelsu/throne_tracker.h | 43 + drivers/kernelsu/tiny_sulog.c | 124 +++ 45 files changed, 10083 insertions(+) create mode 100644 drivers/kernelsu/Kconfig create mode 100644 drivers/kernelsu/LICENSE create mode 100644 drivers/kernelsu/Makefile create mode 100644 drivers/kernelsu/allowlist.c create mode 100644 drivers/kernelsu/allowlist.h create mode 100644 drivers/kernelsu/apk_sign.c create mode 100644 drivers/kernelsu/apk_sign.h create mode 100644 drivers/kernelsu/app_profile.c create mode 100644 drivers/kernelsu/app_profile.h create mode 100644 drivers/kernelsu/arch.h create mode 100644 drivers/kernelsu/core_hook.c create mode 100644 drivers/kernelsu/core_hook.h create mode 100644 drivers/kernelsu/extras.c create mode 100644 drivers/kernelsu/feature.c create mode 100644 drivers/kernelsu/feature.h create mode 100644 drivers/kernelsu/file_wrapper.c create mode 100644 drivers/kernelsu/file_wrapper.h create mode 100644 drivers/kernelsu/kernel_compat.c create mode 100644 drivers/kernelsu/kernel_compat.h create mode 100644 drivers/kernelsu/klog.h create mode 100644 drivers/kernelsu/kp_ksud.c create mode 100644 drivers/kernelsu/ksu.c create mode 100644 drivers/kernelsu/ksu.h create mode 100644 drivers/kernelsu/ksud.c create mode 100644 drivers/kernelsu/ksud.h create mode 100644 drivers/kernelsu/manager.h create mode 100644 drivers/kernelsu/rp_sucompat.c create mode 100644 drivers/kernelsu/selinux/rules.c create mode 100644 drivers/kernelsu/selinux/selinux.c create mode 100644 drivers/kernelsu/selinux/selinux.h create mode 100644 drivers/kernelsu/selinux/sepolicy.c create mode 100644 drivers/kernelsu/selinux/sepolicy.h create mode 100644 drivers/kernelsu/su_mount_ns.c create mode 100644 
drivers/kernelsu/su_mount_ns.h create mode 100644 drivers/kernelsu/sucompat.c create mode 100644 drivers/kernelsu/sucompat.h create mode 100644 drivers/kernelsu/supercalls.c create mode 100644 drivers/kernelsu/supercalls.h create mode 100644 drivers/kernelsu/syscall_table_hook.c create mode 100644 drivers/kernelsu/syscall_table_hook_arm.c create mode 100644 drivers/kernelsu/throne_tracker.c create mode 100644 drivers/kernelsu/throne_tracker.h create mode 100644 drivers/kernelsu/tiny_sulog.c diff --git a/drivers/Kconfig b/drivers/Kconfig index 4ee7416ed53d..981778f02e56 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -216,4 +216,6 @@ source "drivers/gps/Kconfig" source "drivers/halls/Kconfig" +source "drivers/kernelsu/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 8daadc6db681..8d445b4401be 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -183,3 +183,4 @@ obj-$(CONFIG_SENSORS_SSC) += sensors/ obj-$(CONFIG_TEE) += tee/ obj-$(CONFIG_BCM_GPS_SPI_DRIVER) += gps/ obj-$(CONFIG_HALLS) += halls/ +obj-$(CONFIG_KSU) += kernelsu/ diff --git a/drivers/kernelsu/Kconfig b/drivers/kernelsu/Kconfig new file mode 100644 index 000000000000..24a27043b3fb --- /dev/null +++ b/drivers/kernelsu/Kconfig @@ -0,0 +1,70 @@ +menu "KernelSU" + +config KSU + select SECCOMP + bool "KernelSU function support" + default n + help + Enable kernel-level root privileges on Android System. + +config KSU_EXTRAS + bool "Enable custom stuff" + depends on KSU + default n + help + Custom extensions. Experimental. + Currently, only avc log spoofing is implemented. + +config KSU_KPROBES_KSUD + bool "Enable dynamic kprobes for early boot hooks" + depends on KPROBES && KRETPROBES + default n + help + Use dynamic hooks via kprobes for functions only + on early boot. Hooks are unregistered at boot complete + to reduce overhead. 
+ +config KSU_TAMPER_SYSCALL_TABLE + bool "EXPERIMENTAL: tamper sys_call_table for sucompat + sys_reboot" + depends on (ARM || ARM64) && !KSU_KRETPROBES_SUCOMPAT + default n + help + EXPERIMENTAL: use syscall table hijacking method demonstrated on zx2c4's + kernel-assisted-superuser. Replaces sys_reboot, sys_execve, sys_newfstatat, + sys_faccessat, sys_newfstat_ret manual hooks. + Tested on Linux 3.10 ~ 4.14, aarch64. + +config KSU_KRETPROBES_SUCOMPAT + bool "EXPERIMENTAL: kretprobes for sucompat" + depends on KRETPROBES + default n + help + EXPERIMENTAL: Use kretprobes to hook getname_flags, mainly for + sucompat. This method will hijack all fs-related syscalls, but + thwarts timing based detections. + +config KSU_DEBUG + bool "KernelSU debug mode" + depends on KSU + default n + help + Enable KernelSU debug mode. + +config KSU_THRONE_TRACKER_ALWAYS_THREADED + bool "Always run throne tracker in a kthread" + default n + help + Enable this option to run throne tracker in a kthread for the first + run, which happens at boot time / decryption stage. This can decrease + boot time, but can cause crowning failure on some FDE/FBEv1 setups. + If unsure, say n. + +config KSU_LSM_SECURITY_HOOKS + bool "Use LSM security hooks" + depends on KSU + default y + help + Disabling this is mostly useful for kernel > 6.8. + Make sure to implement manual hooks on security/security.c. + +endmenu diff --git a/drivers/kernelsu/LICENSE b/drivers/kernelsu/LICENSE new file mode 100644 index 000000000000..d159169d1050 --- /dev/null +++ b/drivers/kernelsu/LICENSE @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License.
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/drivers/kernelsu/Makefile b/drivers/kernelsu/Makefile new file mode 100644 index 000000000000..3890aca6522d --- /dev/null +++ b/drivers/kernelsu/Makefile @@ -0,0 +1,54 @@ +ccflags-y += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include +ccflags-y += -I$(objtree)/security/selinux + +obj-$(CONFIG_KSU) := ksu.o + +ifeq ($(shell grep -q " current_sid(void)" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) +ccflags-y += -DKSU_COMPAT_HAS_CURRENT_SID +endif + +ifeq ($(shell grep -q "struct selinux_state " $(srctree)/security/selinux/include/security.h; echo $$?),0) +ccflags-y += -DKSU_COMPAT_HAS_SELINUX_STATE +endif + +# UL, look for iterate_dir on fs/readdir.c +ifeq ($(shell grep -q "^int iterate_dir" $(srctree)/fs/readdir.c 2>/dev/null; echo $$?),0) +ccflags-y += -DKSU_HAS_ITERATE_DIR +endif + +# UL, look for read_iter on f_op struct +ifeq ($(shell grep -q "read_iter" $(srctree)/include/linux/fs.h 2>/dev/null; echo $$?),0) +ccflags-y += -DKSU_HAS_FOP_READ_ITER +endif + +# UL, look for "ext4_unregister_sysfs" on fs/ext4 +ifeq ($(shell grep -q "^extern
void ext4_unregister_sysfs" $(srctree)/fs/ext4/ext4.h 2>/dev/null; echo $$?),0) +ccflags-y += -DKSU_HAS_MODERN_EXT4 +endif + +ifeq ($(shell grep -q "selinux_inode" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) +ccflags-y += -DKSU_HAS_SELINUX_INODE +endif + +ifeq ($(shell grep -q "selinux_cred" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) +ccflags-y += -DKSU_HAS_SELINUX_CRED +endif + +ifeq ($(shell grep -q "static inline struct inode \*file_inode" $(srctree)/include/linux/fs.h; echo $$?),0) +ccflags-y += -DKSU_UL_HAS_FILE_INODE +endif + +ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) +ccflags-y += -DKSU_TYPE_VAL_TO_STRUCT +endif + +# half-assed-backport from 5.1 +ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct_array;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) +ccflags-y += -DKSU_TYPE_VAL_TO_STRUCT_ARRAY +endif + +ccflags-y += -Wno-implicit-function-declaration -Wno-strict-prototypes -Wno-int-conversion -Wno-gcc-compat -Wno-missing-prototypes +ccflags-y += -Wno-declaration-after-statement -Wno-unused-function -Wno-format -Wno-incompatible-pointer-types +ccflags-y += -Wno-unused-variable -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast + +# Keep a new line here!! 
Because someone may append config diff --git a/drivers/kernelsu/allowlist.c b/drivers/kernelsu/allowlist.c new file mode 100644 index 000000000000..5e49d30b6942 --- /dev/null +++ b/drivers/kernelsu/allowlist.c @@ -0,0 +1,564 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) +#include +#endif +#include + +#define FILE_MAGIC 0x7f4b5355 // ' KSU', u32 +#define FILE_FORMAT_VERSION 3 // u32 + +#define KSU_APP_PROFILE_PRESERVE_UID 9999 // NOBODY_UID +#define KSU_DEFAULT_SELINUX_DOMAIN "u:r:" KERNEL_SU_DOMAIN ":s0" + +static DEFINE_MUTEX(allowlist_mutex); + +// default profiles, these may be used frequently, so we cache it +static struct root_profile default_root_profile; +static struct non_root_profile default_non_root_profile; + +static int allow_list_arr[PAGE_SIZE / sizeof(int)] __read_mostly __aligned(PAGE_SIZE); +static int allow_list_pointer __read_mostly = 0; + +static void remove_uid_from_arr(uid_t uid) +{ + int i; + for (i = 0; i < allow_list_pointer; i++) { + if (allow_list_arr[i] == uid) { + int remaining = allow_list_pointer - 1 - i; + if (remaining > 0) { + memmove(&allow_list_arr[i], &allow_list_arr[i + 1], + remaining * sizeof(allow_list_arr[0])); + } + allow_list_pointer--; + allow_list_arr[allow_list_pointer] = -1; + return; + } + } +} + +static void init_default_profiles() +{ + kernel_cap_t full_cap = CAP_FULL_SET; + + default_root_profile.uid = 0; + default_root_profile.gid = 0; + default_root_profile.groups_count = 1; + default_root_profile.groups[0] = 0; + memcpy(&default_root_profile.capabilities.effective, &full_cap, + sizeof(default_root_profile.capabilities.effective)); + default_root_profile.namespaces = KSU_NS_INHERITED; + strcpy(default_root_profile.selinux_domain, KSU_DEFAULT_SELINUX_DOMAIN); + + // This means that we will umount modules by default! 
+ default_non_root_profile.umount_modules = true; +} + +struct perm_data { + struct list_head list; + struct rcu_head rcu; + struct app_profile profile; +}; + +static struct list_head allow_list; + +static uint8_t allow_list_bitmap[PAGE_SIZE] __read_mostly __aligned(PAGE_SIZE); +#define BITMAP_UID_MAX ((sizeof(allow_list_bitmap) * BITS_PER_BYTE) - 1) + +#define KERNEL_SU_ALLOWLIST "/data/adb/ksu/.allowlist" + +void ksu_persistent_allow_list(void); + +void ksu_show_allow_list(void) +{ + struct perm_data *p = NULL; + pr_info("ksu_show_allow_list\n"); + rcu_read_lock(); + list_for_each_entry_rcu (p, &allow_list, list) { + pr_info("uid :%d, allow: %d\n", p->profile.current_uid, + p->profile.allow_su); + } + rcu_read_unlock(); +} + +#ifdef CONFIG_KSU_DEBUG +static void ksu_grant_root_to_shell() +{ struct app_profile profile = { + .version = KSU_APP_PROFILE_VER, + .allow_su = true, + .current_uid = 2000, + }; + strcpy(profile.key, "com.android.shell"); + strcpy(profile.rp_config.profile.selinux_domain, + KSU_DEFAULT_SELINUX_DOMAIN); + ksu_set_app_profile(&profile); +} +#endif + +bool ksu_get_app_profile(struct app_profile *profile) +{ + struct perm_data *p = NULL; + bool found = false; + + rcu_read_lock(); + list_for_each_entry_rcu (p, &allow_list, list) { + bool uid_match = profile->current_uid == p->profile.current_uid; + if (uid_match) { + // found it, override it with ours + memcpy(profile, &p->profile, sizeof(*profile)); + found = true; + goto exit; + } + } + +exit: + rcu_read_unlock(); + return found; +} + +static inline bool forbid_system_uid(uid_t uid) +{ +#define SHELL_UID 2000 +#define SYSTEM_UID 1000 + return uid < SHELL_UID && uid != SYSTEM_UID; +} + +static bool profile_valid(struct app_profile *profile) +{ + if (!profile) { + return false; + } + + if (profile->version < KSU_APP_PROFILE_VER) { + pr_info("Unsupported profile version: %d\n", profile->version); + return false; + } + + if (profile->allow_su) { + if (profile->rp_config.profile.groups_count > 
KSU_MAX_GROUPS) { + return false; + } + + if (strlen(profile->rp_config.profile.selinux_domain) == 0) { + return false; + } + } + + return true; +} + +int ksu_set_app_profile(struct app_profile *profile) +{ + struct perm_data *p = NULL, *np; + int result = 0; + u16 count = 0; + + if (!profile_valid(profile)) { + pr_err("Failed to set app profile: invalid profile!\n"); + return -EINVAL; + } + + mutex_lock(&allowlist_mutex); + + list_for_each_entry (p, &allow_list, list) { + ++count; + // both uid and package must match, otherwise it will break multiple package with different user id + if (profile->current_uid == p->profile.current_uid && + !strcmp(profile->key, p->profile.key)) { + // found it, just override it all! + np = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL); + if (!np) { + result = -ENOMEM; + goto out_unlock; + } + memcpy(&np->profile, profile, sizeof(*profile)); + list_replace_rcu(&p->list, &np->list); + kfree_rcu(p, rcu); + goto out; + } + } + + if (unlikely(count == U16_MAX)) { + pr_err("too many app profile\n"); + result = -E2BIG; + goto out_unlock; + } + + // not found, alloc a new node! + p = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL); + if (!p) { + pr_err("ksu_set_app_profile alloc failed\n"); + result = -ENOMEM; + goto out_unlock; + } + + memcpy(&p->profile, profile, sizeof(*profile)); + if (profile->allow_su) { + pr_info("set root profile, key: %s, uid: %d, gid: %d, context: %s\n", + profile->key, profile->current_uid, + profile->rp_config.profile.gid, + profile->rp_config.profile.selinux_domain); + } else { + pr_info("set app profile, key: %s, uid: %d, umount modules: %d\n", + profile->key, profile->current_uid, + profile->nrp_config.profile.umount_modules); + } + + list_add_tail_rcu(&p->list, &allow_list); + +out: + result = 0; + + // check if the default profiles is changed, cache it to a single struct to accelerate access. 
+ if (unlikely(!strcmp(profile->key, "$"))) { + // set default non root profile + memcpy(&default_non_root_profile, &profile->nrp_config.profile, + sizeof(default_non_root_profile)); + } else if (unlikely(!strcmp(profile->key, "#"))) { + // set default root profile + // TODO: Do we really need this? + memcpy(&default_root_profile, &profile->rp_config.profile, + sizeof(default_root_profile)); + } else if (profile->current_uid <= BITMAP_UID_MAX) { + if (profile->allow_su) + allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] |= + 1 << (profile->current_uid % BITS_PER_BYTE); + else + allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] &= + ~(1 << (profile->current_uid % BITS_PER_BYTE)); + } else { + if (profile->allow_su) { + /* + * 1024 apps with uid higher than BITMAP_UID_MAX + * registered to request superuser? + */ + if (allow_list_pointer >= ARRAY_SIZE(allow_list_arr)) { + pr_err("too many apps registered\n"); + WARN_ON(1); + } else { + allow_list_arr[allow_list_pointer++] = profile->current_uid; + } + } else { + remove_uid_from_arr(profile->current_uid); + } + } + +out_unlock: + mutex_unlock(&allowlist_mutex); + return result; +} + +bool __ksu_is_allow_uid(uid_t uid) +{ + int i; + + if (forbid_system_uid(uid)) { + // do not bother going through the list if it's system + return false; + } + + if (likely(ksu_is_manager_appid_valid()) && + unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { + // manager is always allowed! + return true; + } + + if (likely(uid <= BITMAP_UID_MAX)) { + return !!(allow_list_bitmap[uid / BITS_PER_BYTE] & + (1 << (uid % BITS_PER_BYTE))); + } else { + for (i = 0; i < allow_list_pointer; i++) { + if (allow_list_arr[i] == uid) + return true; + } + } + + return false; +} + +bool __ksu_is_allow_uid_for_current(uid_t uid) +{ + if (unlikely(uid == 0)) { + // already root, but only allow our domain. 
+ return is_ksu_domain(); + } + return __ksu_is_allow_uid(uid); +} + +bool ksu_uid_should_umount(uid_t uid) +{ + struct app_profile profile = { .current_uid = uid }; + if (likely(ksu_is_manager_appid_valid()) && + unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { + // we should not umount on manager! + return false; + } + bool found = ksu_get_app_profile(&profile); + if (!found) { + // no app profile found, it must be non root app + return default_non_root_profile.umount_modules; + } + if (profile.allow_su) { + // if found and it is granted to su, we shouldn't umount for it + return false; + } else { + // found an app profile + if (profile.nrp_config.use_default) { + return default_non_root_profile.umount_modules; + } else { + return profile.nrp_config.profile.umount_modules; + } + } +} + +void ksu_get_root_profile(uid_t uid, struct root_profile *profile) +{ + struct perm_data *p = NULL; + + if (is_uid_manager(uid)) { + goto use_default; + } + + rcu_read_lock(); + list_for_each_entry_rcu (p, &allow_list, list) { + if (uid == p->profile.current_uid && p->profile.allow_su) { + if (!p->profile.rp_config.use_default) { + memcpy(profile, &p->profile.rp_config.profile, + sizeof(*profile)); + rcu_read_unlock(); + return; + } + } + } + rcu_read_unlock(); + +use_default: + // use default profile + memcpy(profile, &default_root_profile, sizeof(*profile)); +} + +bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, bool allow) +{ + struct perm_data *p = NULL; + u16 i = 0, j = 0; + rcu_read_lock(); + list_for_each_entry_rcu (p, &allow_list, list) { + // pr_info("get_allow_list uid: %d allow: %d\n", p->uid, p->allow); + if (p->profile.allow_su == allow && + !is_uid_manager(p->profile.current_uid)) { + if (j < length) { + array[j++] = p->profile.current_uid; + } + ++i; + } + } + rcu_read_unlock(); + if (out_length) { + *out_length = j; + } + if (out_total) { + *out_total = i; + } + + return true; +} + + +void ksu_persistent_allow_list_fn() 
+{ + u32 magic = FILE_MAGIC; + u32 version = FILE_FORMAT_VERSION; + struct perm_data *p = NULL; + loff_t off = 0; + + struct file *fp = ksu_filp_open_compat(KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (IS_ERR(fp)) { + pr_err("save_allow_list create file failed: %ld\n", PTR_ERR(fp)); + goto out; + } + + // store magic and version + if (ksu_kernel_write_compat(fp, &magic, sizeof(magic), &off) != sizeof(magic)) { + pr_err("save_allow_list write magic failed.\n"); + goto close_file; + } + + if (ksu_kernel_write_compat(fp, &version, sizeof(version), &off) != sizeof(version)) { + pr_err("save_allow_list write version failed.\n"); + goto close_file; + } + + list_for_each_entry (p, &allow_list, list) { + pr_info("save allow list, name: %s uid :%d, allow: %d\n", + p->profile.key, p->profile.current_uid, p->profile.allow_su); + + ksu_kernel_write_compat(fp, &p->profile, sizeof(p->profile), &off); + } + +close_file: + filp_close(fp, 0); +out: + return; +} + +// this is a bit heavier than task work / workqueue but this allows +// us to have our own context. we give it a full escaped-to-root one. +static int persistent_allow_list_pre(void *data) +{ + pr_info("ksu_persistent_allow_list_fn: pid: %d started\n", current->pid); + + // repurpose the mutex they were holding on ksu_persistent_allow_list_fn + // since all this does eventually is to call kernel_write + // we hit two birds in one stone. exclusive io + exclusive kthread + // there wont be a single instance lock, but for what we need, its finee + // we just let other threads stall. 
+ // 'mutex-trylock-fail-then-return' is detrimental here + mutex_lock(&allowlist_mutex); + + escape_to_root_forced(); // give permissions for everything + ksu_persistent_allow_list_fn(); + + mutex_unlock(&allowlist_mutex); + + pr_info("ksu_persistent_allow_list_fn: pid: %d exit\n", current->pid); + return 0; +} + +void ksu_persistent_allow_list() +{ + kthread_run(persistent_allow_list_pre, NULL, "allowlist"); +} + +// we can leave this synchronous it seems +// this can be revisited if escaping/deferring is needed. +void ksu_load_allow_list() +{ + loff_t off = 0; + ssize_t ret = 0; + struct file *fp = NULL; + u32 magic = 0; + u32 version = 0; + +#ifdef CONFIG_KSU_DEBUG + // always allow adb shell by default + ksu_grant_root_to_shell(); +#endif + + // load allowlist now! + fp = ksu_filp_open_compat(KERNEL_SU_ALLOWLIST, O_RDONLY, 0); + if (IS_ERR(fp)) { + pr_err("load_allow_list open file failed: %ld\n", PTR_ERR(fp)); + return; + } + + // verify magic; magic/version start at 0 so a failed read never logs stack garbage + if (ksu_kernel_read_compat(fp, &magic, sizeof(magic), &off) != sizeof(magic) || + magic != FILE_MAGIC) { + pr_err("allowlist file invalid: %d!\n", magic); + goto exit; + } + + if (ksu_kernel_read_compat(fp, &version, sizeof(version), &off) != sizeof(version)) { + pr_err("allowlist read version: %d failed\n", version); + goto exit; + } + + pr_info("allowlist version: %d\n", version); + + while (true) { + struct app_profile profile; + + ret = ksu_kernel_read_compat(fp, &profile, sizeof(profile), &off); + + if (ret != sizeof(profile)) { + pr_info("load_allow_list read err: %zd\n", ret); + break; + } + + pr_info("load_allow_uid, name: %s, uid: %d, allow: %d\n", profile.key, + profile.current_uid, profile.allow_su); + ksu_set_app_profile(&profile); + } + +exit: + ksu_show_allow_list(); + filp_close(fp, 0); +} + +void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *), void *data) +{ + struct perm_data *np = NULL; + struct perm_data *n = NULL; + + if (!ksu_boot_completed) { + pr_info("boot not completed, skip prune\n"); + return; + 
} + + bool modified = false; + mutex_lock(&allowlist_mutex); + list_for_each_entry_safe (np, n, &allow_list, list) { + uid_t uid = np->profile.current_uid; + char *package = np->profile.key; + // we use this uid for special cases, don't prune it! + bool is_preserved_uid = uid == KSU_APP_PROFILE_PRESERVE_UID; + if (!is_preserved_uid && !is_uid_valid(uid, package, data)) { + modified = true; + pr_info("prune uid: %d, package: %s\n", uid, package); + list_del_rcu(&np->list); + kfree_rcu(np, rcu); + if (likely(uid <= BITMAP_UID_MAX)) { + allow_list_bitmap[uid / BITS_PER_BYTE] &= + ~(1 << (uid % BITS_PER_BYTE)); + } + remove_uid_from_arr(uid); + } + } + mutex_unlock(&allowlist_mutex); + + if (modified) { + smp_mb(); + ksu_persistent_allow_list(); + } +} + +void ksu_allowlist_init(void) +{ + int i; + + BUILD_BUG_ON(sizeof(allow_list_bitmap) != PAGE_SIZE); + BUILD_BUG_ON(sizeof(allow_list_arr) != PAGE_SIZE); + + for (i = 0; i < ARRAY_SIZE(allow_list_arr); i++) + allow_list_arr[i] = -1; + + INIT_LIST_HEAD(&allow_list); + + init_default_profiles(); +} + +void ksu_allowlist_exit(void) +{ + struct perm_data *np = NULL; + struct perm_data *n = NULL; + + // free allowlist + mutex_lock(&allowlist_mutex); + list_for_each_entry_safe (np, n, &allow_list, list) { + list_del(&np->list); + kfree(np); + } + mutex_unlock(&allowlist_mutex); +} diff --git a/drivers/kernelsu/allowlist.h b/drivers/kernelsu/allowlist.h new file mode 100644 index 000000000000..7c65ab7c744e --- /dev/null +++ b/drivers/kernelsu/allowlist.h @@ -0,0 +1,52 @@ +#ifndef __KSU_H_ALLOWLIST +#define __KSU_H_ALLOWLIST + +#include +#include +#include "app_profile.h" + +#define PER_USER_RANGE 100000 +#define FIRST_APPLICATION_UID 10000 +#define LAST_APPLICATION_UID 19999 +#define FIRST_ISOLATED_UID 99000 +#define LAST_ISOLATED_UID 99999 + +void ksu_allowlist_init(void); + +void ksu_allowlist_exit(void); + +void ksu_load_allow_list(void); + +void ksu_show_allow_list(void); + +// Check if the uid is in allow list +bool 
__ksu_is_allow_uid(uid_t uid); +#define ksu_is_allow_uid(uid) unlikely(__ksu_is_allow_uid(uid)) + +// Check if the uid is in allow list, or current is ksu domain root +bool __ksu_is_allow_uid_for_current(uid_t uid); +#define ksu_is_allow_uid_for_current(uid) unlikely(__ksu_is_allow_uid_for_current(uid)) + +bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, bool allow); + +void ksu_prune_allowlist(bool (*is_uid_exist)(uid_t, char *, void *), void *data); +void ksu_persistent_allow_list(); + +bool ksu_get_app_profile(struct app_profile *); +int ksu_set_app_profile(struct app_profile *); + +bool ksu_uid_should_umount(uid_t uid); +void ksu_get_root_profile(uid_t uid, struct root_profile *); + +static inline bool is_appuid(uid_t uid) +{ + uid_t appid = uid % PER_USER_RANGE; + return appid >= FIRST_APPLICATION_UID && appid <= LAST_APPLICATION_UID; +} + +static inline bool is_isolated_process(uid_t uid) +{ + uid_t appid = uid % PER_USER_RANGE; + return appid >= FIRST_ISOLATED_UID && appid <= LAST_ISOLATED_UID; +} +#endif diff --git a/drivers/kernelsu/apk_sign.c b/drivers/kernelsu/apk_sign.c new file mode 100644 index 000000000000..697ecd81d9d9 --- /dev/null +++ b/drivers/kernelsu/apk_sign.c @@ -0,0 +1,387 @@ +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KSU_DEBUG +#include +#endif +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) +#include +#else +#include +#endif +#include + +struct sdesc { + struct shash_desc shash; + char ctx[]; +}; + +static struct sdesc *init_sdesc(struct crypto_shash *alg) +{ + struct sdesc *sdesc; + int size; + + size = sizeof(struct shash_desc) + crypto_shash_descsize(alg); + sdesc = kzalloc(size, GFP_KERNEL); + if (!sdesc) + return ERR_PTR(-ENOMEM); + sdesc->shash.tfm = alg; + return sdesc; +} + +static int calc_hash(struct crypto_shash *alg, const unsigned char *data, + unsigned int datalen, unsigned char *digest) +{ + struct sdesc *sdesc; + int ret; + + sdesc = 
init_sdesc(alg); + if (IS_ERR(sdesc)) { + pr_info("can't alloc sdesc\n"); + return PTR_ERR(sdesc); + } + + ret = crypto_shash_digest(&sdesc->shash, data, datalen, digest); + kfree(sdesc); + return ret; +} + +static int ksu_sha256(const unsigned char *data, unsigned int datalen, + unsigned char *digest) +{ + struct crypto_shash *alg; + char *hash_alg_name = "sha256"; + int ret; + + alg = crypto_alloc_shash(hash_alg_name, 0, 0); + if (IS_ERR(alg)) { + pr_info("can't alloc alg %s\n", hash_alg_name); + return PTR_ERR(alg); + } + ret = calc_hash(alg, data, datalen, digest); + crypto_free_shash(alg); + return ret; +} + +static bool check_block(struct file *fp, u32 *size4, loff_t *pos, u32 *offset, + unsigned expected_size, const char *expected_sha256) +{ + ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer-sequence length + ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer length + ksu_kernel_read_compat(fp, size4, 0x4, pos); // signed data length + + *offset += 0x4 * 3; + + ksu_kernel_read_compat(fp, size4, 0x4, pos); // digests-sequence length + + *pos += *size4; + *offset += 0x4 + *size4; + + ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificates length + ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificate length + *offset += 0x4 * 2; + + if (*size4 == expected_size) { + *offset += *size4; + +#define CERT_MAX_LENGTH 1024 + char cert[CERT_MAX_LENGTH]; + if (*size4 > CERT_MAX_LENGTH) { + pr_info("cert length overlimit\n"); + return false; + } + ksu_kernel_read_compat(fp, cert, *size4, pos); + unsigned char digest[SHA256_DIGEST_SIZE]; + if (ksu_sha256(cert, *size4, digest) < 0 ) { + pr_info("sha256 error\n"); + return false; + } + + char hash_str[SHA256_DIGEST_SIZE * 2 + 1]; + hash_str[SHA256_DIGEST_SIZE * 2] = '\0'; + + bin2hex(hash_str, digest, SHA256_DIGEST_SIZE); + pr_info("sha256: %s, expected: %s\n", hash_str, + expected_sha256); + if (strcmp(expected_sha256, hash_str) == 0) { + return true; + } + } + return false; +} + +struct 
zip_entry_header { + uint32_t signature; + uint16_t version; + uint16_t flags; + uint16_t compression; + uint16_t mod_time; + uint16_t mod_date; + uint32_t crc32; + uint32_t compressed_size; + uint32_t uncompressed_size; + uint16_t file_name_length; + uint16_t extra_field_length; +} __attribute__((packed)); + +// This is a necessary but not sufficient condition, but it is enough for us +static bool has_v1_signature_file(struct file *fp) +{ + struct zip_entry_header header; + const char MANIFEST[] = "META-INF/MANIFEST.MF"; + + loff_t pos = 0; + + while (ksu_kernel_read_compat(fp, &header, + sizeof(struct zip_entry_header), &pos) == + sizeof(struct zip_entry_header)) { + if (header.signature != 0x04034b50) { + // ZIP magic: 'PK' + return false; + } + // Read the entry file name + if (header.file_name_length == sizeof(MANIFEST) - 1) { + char fileName[sizeof(MANIFEST)]; + ksu_kernel_read_compat(fp, fileName, + header.file_name_length, &pos); + fileName[header.file_name_length] = '\0'; + + // Check if the entry matches META-INF/MANIFEST.MF + if (strncmp(MANIFEST, fileName, sizeof(MANIFEST) - 1) == + 0) { + return true; + } + } else { + // Skip the entry file name + pos += header.file_name_length; + } + + // Skip to the next entry + pos += header.extra_field_length + header.compressed_size; + } + + return false; +} + +static __always_inline bool check_v2_signature(char *path, + unsigned expected_size, + const char *expected_sha256) +{ + unsigned char buffer[0x11] = { 0 }; + u32 size4; + u64 size8, size_of_block; + + loff_t pos; + + bool v2_signing_valid = false; + int v2_signing_blocks = 0; + bool v3_signing_exist = false; + bool v3_1_signing_exist = false; + + int i; + struct path kpath; + if (kern_path(path, 0, &kpath)) + return false; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) + if (inode_is_locked(kpath.dentry->d_inode)) +#else + if (mutex_is_locked(&kpath.dentry->d_inode->i_mutex)) +#endif + { + pr_info("%s: inode is locked for %s\n", __func__, path); + 
path_put(&kpath); + return false; + } + + path_put(&kpath); + + struct file *fp = ksu_filp_open_compat(path, O_RDONLY, 0); + if (IS_ERR(fp)) { + // pr_err("open %s error.\n", path); + return false; + } + + // disable inotify for this file + fp->f_mode |= FMODE_NONOTIFY; + + // https://en.wikipedia.org/wiki/Zip_(file_format)#End_of_central_directory_record_(EOCD) + for (i = 0;; ++i) { + unsigned short n; + pos = vfs_llseek(fp, -i - 2, SEEK_END); + ksu_kernel_read_compat(fp, &n, 2, &pos); + if (n == i) { + pos -= 22; + ksu_kernel_read_compat(fp, &size4, 4, &pos); + if ((size4 ^ 0xcafebabeu) == 0xccfbf1eeu) { + break; + } + } + if (i == 0xffff) { + pr_info("error: cannot find eocd\n"); + goto clean; + } + } + + pos += 12; + // offset + ksu_kernel_read_compat(fp, &size4, 0x4, &pos); + pos = size4 - 0x18; + + ksu_kernel_read_compat(fp, &size8, 0x8, &pos); + ksu_kernel_read_compat(fp, buffer, 0x10, &pos); + // !! remove this casting to char just to strcmp + if (memcmp(buffer, "APK Sig Block 42", 16)) { + goto clean; + } + + pos = size4 - (size8 + 0x8); + ksu_kernel_read_compat(fp, &size_of_block, 0x8, &pos); + if (size_of_block != size8) { + goto clean; + } + + int loop_count = 0; + while (loop_count++ < 10) { + uint32_t id; + uint32_t offset; + ksu_kernel_read_compat(fp, &size8, 0x8, + &pos); // sequence length + if (size8 == size_of_block) { + break; + } + ksu_kernel_read_compat(fp, &id, 0x4, &pos); // id + offset = 4; + if (id == 0x7109871au) { + v2_signing_blocks++; + v2_signing_valid = + check_block(fp, &size4, &pos, &offset, + expected_size, expected_sha256); + } else if (id == 0xf05368c0u) { + // http://aospxref.com/android-14.0.0_r2/xref/frameworks/base/core/java/android/util/apk/ApkSignatureSchemeV3Verifier.java#73 + v3_signing_exist = true; + } else if (id == 0x1b93ad61u) { + // http://aospxref.com/android-14.0.0_r2/xref/frameworks/base/core/java/android/util/apk/ApkSignatureSchemeV3Verifier.java#74 + v3_1_signing_exist = true; + } else { +#ifdef 
CONFIG_KSU_DEBUG + pr_info("Unknown id: 0x%08x\n", id); +#endif + } + pos += (size8 - offset); + } + + if (v2_signing_blocks != 1) { +#ifdef CONFIG_KSU_DEBUG + pr_err("Unexpected v2 signature count: %d\n", + v2_signing_blocks); +#endif + v2_signing_valid = false; + } + + if (v2_signing_valid) { + int has_v1_signing = has_v1_signature_file(fp); + if (has_v1_signing) { + pr_err("Unexpected v1 signature scheme found!\n"); + filp_close(fp, 0); + return false; + } + } +clean: + filp_close(fp, 0); + + if (v3_signing_exist || v3_1_signing_exist) { +#ifdef CONFIG_KSU_DEBUG + pr_err("Unexpected v3 signature scheme found!\n"); +#endif + return false; + } + + return v2_signing_valid; +} + +#ifdef CONFIG_KSU_DEBUG + +int ksu_debug_manager_appid = -1; + +static int set_expected_size(const char *val, const struct kernel_param *kp) +{ + int rv = param_set_uint(val, kp); + ksu_set_manager_appid(ksu_debug_manager_appid); + pr_info("ksu_manager_appid set to %d\n", ksu_debug_manager_appid); + return rv; +} + +static struct kernel_param_ops expected_size_ops = { + .set = set_expected_size, + .get = param_get_uint, +}; + +module_param_cb(ksu_debug_manager_appid, &expected_size_ops, + &ksu_debug_manager_appid, S_IRUSR | S_IWUSR); + +#endif + +int get_pkg_from_apk_path(char *pkg, const char *path) +{ + int len = strlen(path); + if (len >= KSU_MAX_PACKAGE_NAME || len < 1) + return -1; + + const char *last_slash = NULL; + const char *second_last_slash = NULL; + + int i; + for (i = len - 1; i >= 0; i--) { + if (path[i] == '/') { + if (!last_slash) { + last_slash = &path[i]; + } else { + second_last_slash = &path[i]; + break; + } + } + } + + if (!last_slash || !second_last_slash) + return -1; + + const char *last_hyphen = strchr(second_last_slash, '-'); + if (!last_hyphen || last_hyphen > last_slash) + return -1; + + int pkg_len = last_hyphen - second_last_slash - 1; + if (pkg_len >= KSU_MAX_PACKAGE_NAME || pkg_len <= 0) + return -1; + + // Copying the package name + strncpy(pkg, 
second_last_slash + 1, pkg_len); + pkg[pkg_len] = '\0'; + + return 0; +} + +bool is_manager_apk(char *path) +{ + int tries = 0; + + while (tries++ < 10) { + if (!is_lock_held(path)) + break; + + pr_info("%s: waiting for %s\n", __func__, path); + msleep(100); + } + + // tries ends at 11 only when all 10 attempts saw the lock held; a break leaves it at 1..10 + if (tries > 10) { + pr_info("%s: timeout for %s\n", __func__, path); + return false; + } + + return check_v2_signature(path, 0x363, "4359c171f32543394cbc23ef908c4bb94cad7c8087002ba164c8230948c21549"); // dummy.keystore +} diff --git a/drivers/kernelsu/apk_sign.h b/drivers/kernelsu/apk_sign.h new file mode 100644 index 000000000000..d3a44bd207c2 --- /dev/null +++ b/drivers/kernelsu/apk_sign.h @@ -0,0 +1,9 @@ +#ifndef __KSU_H_APK_V2_SIGN +#define __KSU_H_APK_V2_SIGN + +#include + +bool is_manager_apk(char *path); +int get_pkg_from_apk_path(char *pkg, const char *path); + +#endif diff --git a/drivers/kernelsu/app_profile.c b/drivers/kernelsu/app_profile.c new file mode 100644 index 000000000000..729b6a30820f --- /dev/null +++ b/drivers/kernelsu/app_profile.c @@ -0,0 +1,191 @@ +#include +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) +#include // signal_struct +#include +#else +#include +#endif +#include +#include +#include +#include +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION (6, 7, 0) +static struct group_info root_groups = { .usage = REFCOUNT_INIT(2) }; +#else +static struct group_info root_groups = { .usage = ATOMIC_INIT(2) }; +#endif + +static void setup_groups(struct root_profile *profile, struct cred *cred) +{ + if (profile->groups_count > KSU_MAX_GROUPS) { + pr_warn("Failed to setgroups, too large group: %d!\n", + profile->uid); + return; + } + + if (profile->groups_count == 1 && profile->groups[0] == 0) { + // setgroup to root and return early. 
+ if (cred->group_info) + put_group_info(cred->group_info); + cred->group_info = get_group_info(&root_groups); + return; + } + + u32 ngroups = profile->groups_count; + struct group_info *group_info = groups_alloc(ngroups); + if (!group_info) { + pr_warn("Failed to setgroups, ENOMEM for: %d\n", profile->uid); + return; + } + + int i; + for (i = 0; i < ngroups; i++) { + gid_t gid = profile->groups[i]; + kgid_t kgid = make_kgid(current_user_ns(), gid); + if (!gid_valid(kgid)) { + pr_warn("Failed to setgroups, invalid gid: %d\n", gid); + put_group_info(group_info); + return; + } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) + group_info->gid[i] = kgid; +#else + GROUP_AT(group_info, i) = kgid; +#endif + } + + groups_sort(group_info); + set_groups(cred, group_info); + put_group_info(group_info); +} + +void disable_seccomp() +{ + +// for < 5.9 lets have free_task do it for us (put_seccomp_filter) +// we risk a double free / double decrement which isn't safe on old kernels +// I'm not even sure if this thing is needed on newer kernels +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) + struct task_struct *fake; + + fake = kmalloc(sizeof(*fake), GFP_ATOMIC); + if (!fake) { + pr_warn("failed to alloc fake task_struct\n"); + return; + } +#endif + + // Refer to kernel/seccomp.c: seccomp_set_mode_strict + // When disabling Seccomp, ensure that current->sighand->siglock is held during the operation. 
+ spin_lock_irq(&current->sighand->siglock); + + // disable seccomp +#if defined(CONFIG_GENERIC_ENTRY) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) + clear_syscall_work(SECCOMP); +#else + clear_thread_flag(TIF_SECCOMP); +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) + memcpy(fake, current, sizeof(*fake)); + atomic_set(&current->seccomp.filter_count, 0); +#endif + + current->seccomp.mode = 0; + current->seccomp.filter = NULL; + + spin_unlock_irq(&current->sighand->siglock); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) + // https://github.com/torvalds/linux/commit/bfafe5efa9754ebc991750da0bcca2a6694f3ed3#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R576-R577 + fake->flags |= PF_EXITING; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) + // https://github.com/torvalds/linux/commit/0d8315dddd2899f519fe1ca3d4d5cdaf44ea421e#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R556-R558 + fake->sighand = NULL; +#endif + + seccomp_filter_release(fake); + kfree(fake); +#endif // 5.9 +} + +static void escape_to_root(bool is_forced) +{ + struct cred *cred; + struct root_profile profile; + + cred = prepare_creds(); + if (!cred) { + pr_warn("prepare_creds failed!\n"); + return; + } + + if (!is_forced && ksu_get_uid_t(cred->euid) == 0) { + pr_warn("Already root, don't escape!\n"); + abort_creds(cred); + return; + } + + ksu_get_root_profile(ksu_get_uid_t(cred->uid), &profile); + + ksu_get_uid_t(cred->uid) = profile.uid; + ksu_get_uid_t(cred->suid) = profile.uid; + ksu_get_uid_t(cred->euid) = profile.uid; + ksu_get_uid_t(cred->fsuid) = profile.uid; + + ksu_get_uid_t(cred->gid) = profile.gid; + ksu_get_uid_t(cred->fsgid) = profile.gid; + ksu_get_uid_t(cred->sgid) = profile.gid; + ksu_get_uid_t(cred->egid) = profile.gid; + cred->securebits = 0; + + BUILD_BUG_ON(sizeof(profile.capabilities.effective) != sizeof(kernel_cap_t)); + + // setup capabilities + // we need 
CAP_DAC_READ_SEARCH because `/data/adb/ksud` is not accessible for non root process + // we add it here (as a bit mask, CAP_DAC_READ_SEARCH itself is only the bit index) but don't add it to cap_inheritable, it would be dropped automatically after exec! + u64 cap_for_ksud = profile.capabilities.effective | (1ULL << CAP_DAC_READ_SEARCH); + memcpy(&cred->cap_effective, &cap_for_ksud, sizeof(cred->cap_effective)); + memcpy(&cred->cap_permitted, &profile.capabilities.effective, sizeof(cred->cap_permitted)); + memcpy(&cred->cap_bset, &profile.capabilities.effective, sizeof(cred->cap_bset)); + + setup_groups(&profile, cred); + setup_selinux(profile.selinux_domain, cred); + + commit_creds(cred); + + if (!!current->seccomp.mode) + disable_seccomp(); + + setup_mount_ns(profile.namespaces); +} + +void escape_to_root_for_init(void) { + struct cred *cred = prepare_creds(); + if (!cred) { + pr_err("Failed to prepare init's creds!\n"); + return; + } + + setup_selinux(KERNEL_SU_CONTEXT, cred); + commit_creds(cred); +} + +void escape_with_root_profile(void) +{ + escape_to_root(false); +} + +void escape_to_root_forced(void) +{ + // I'm not really sure which permissions are needed + // its just escape to root but bypasses cred check + // which we likely already have on contexts where this will be used. + escape_to_root(true); +} diff --git a/drivers/kernelsu/app_profile.h b/drivers/kernelsu/app_profile.h new file mode 100644 index 000000000000..fcc9daed5f53 --- /dev/null +++ b/drivers/kernelsu/app_profile.h @@ -0,0 +1,70 @@ +#ifndef __KSU_H_APP_PROFILE +#define __KSU_H_APP_PROFILE + +#include + +// Forward declarations +struct cred; + +#define KSU_APP_PROFILE_VER 2 +#define KSU_MAX_PACKAGE_NAME 256 +// NGROUPS_MAX for Linux is 65535 generally, but we only support 32 groups. 
+#define KSU_MAX_GROUPS 32 +#define KSU_SELINUX_DOMAIN 64 + +struct root_profile { + int32_t uid; + int32_t gid; + + int32_t groups_count; + int32_t groups[KSU_MAX_GROUPS]; + + // kernel_cap_t is u32[2] for capabilities v3 + struct { + u64 effective; + u64 permitted; + u64 inheritable; + } capabilities; + + char selinux_domain[KSU_SELINUX_DOMAIN]; + + int32_t namespaces; +}; + +struct non_root_profile { + bool umount_modules; +}; + +struct app_profile { + // It may be utilized for backward compatibility, although we have never explicitly made any promises regarding this. + u32 version; + + // this is usually the package of the app, but can be other value for special apps + char key[KSU_MAX_PACKAGE_NAME]; + int32_t current_uid; + bool allow_su; + + union { + struct { + bool use_default; + char template_name[KSU_MAX_PACKAGE_NAME]; + + struct root_profile profile; + } rp_config; + + struct { + bool use_default; + + struct non_root_profile profile; + } nrp_config; + }; +}; + +// Escalate current process to root with the appropriate profile +void escape_with_root_profile(void); + +void escape_to_root_for_init(void); + +void escape_to_root_forced(void); + +#endif diff --git a/drivers/kernelsu/arch.h b/drivers/kernelsu/arch.h new file mode 100644 index 000000000000..569ea3d14863 --- /dev/null +++ b/drivers/kernelsu/arch.h @@ -0,0 +1,121 @@ +#ifndef __KSU_H_ARCH +#define __KSU_H_ARCH + +#include + +#if defined(__aarch64__) + +#define __PT_PARM1_REG regs[0] +#define __PT_PARM2_REG regs[1] +#define __PT_PARM3_REG regs[2] +#define __PT_SYSCALL_PARM4_REG regs[3] +#define __PT_CCALL_PARM4_REG regs[3] +#define __PT_PARM5_REG regs[4] +#define __PT_PARM6_REG regs[5] +#define __PT_RET_REG regs[30] +#define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG regs[0] +#define __PT_SP_REG sp +#define __PT_IP_REG pc + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) +#define SYS_EXECVE_SYMBOL "__arm64_sys_execve" +#define SYS_REBOOT_SYMBOL 
"__arm64_sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "__arm64_sys_newfstat" +#define SYS_FSTAT64_SYMBOL "__arm64_sys_fstat64" +#else +#define SYS_EXECVE_SYMBOL "sys_execve" +#define SYS_REBOOT_SYMBOL "sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "sys_newfstat" +#define SYS_FSTAT64_SYMBOL "sys_fstat64" +#endif + +#elif defined(__arm__) + +// https://elixir.bootlin.com/linux/v6.17-rc6/source/tools/lib/bpf/bpf_tracing.h +#define __PT_PARM1_REG uregs[0] +#define __PT_PARM2_REG uregs[1] +#define __PT_PARM3_REG uregs[2] +#define __PT_PARM4_REG uregs[3] + +// seems to work atleast on 3.0 on samsung galaxy s3 +// nfi what im doing +#define __PT_SYSCALL_PARM4_REG uregs[3] +#define __PT_CCALL_PARM4_REG uregs[3] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG uregs[4] +#define __PT_PARM6_SYSCALL_REG uregs[5] +#define __PT_PARM7_SYSCALL_REG uregs[6] + +#define __PT_RET_REG uregs[14] +#define __PT_FP_REG uregs[11] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG uregs[0] +#define __PT_SP_REG uregs[13] +#define __PT_IP_REG uregs[12] + +#define SYS_EXECVE_SYMBOL "sys_execve" +#define SYS_REBOOT_SYMBOL "sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "sys_newfstat" +#define SYS_FSTAT64_SYMBOL "sys_fstat64" + +#elif defined(__x86_64__) + +#define __PT_PARM1_REG di +#define __PT_PARM2_REG si +#define __PT_PARM3_REG dx +/* syscall uses r10 for PARM4 */ +#define __PT_SYSCALL_PARM4_REG r10 +#define __PT_CCALL_PARM4_REG cx +#define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 +#define __PT_RET_REG sp +#define __PT_FP_REG bp +#define __PT_RC_REG ax +#define __PT_SP_REG sp +#define __PT_IP_REG ip + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) +#define SYS_EXECVE_SYMBOL "__x64_sys_execve" +#define SYS_REBOOT_SYMBOL "__x64_sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "__x64_sys_newfstat" +#define SYS_FSTAT64_SYMBOL 
"__ia32_compat_sys_x86_fstat64" +#else +#define SYS_EXECVE_SYMBOL "sys_execve" +#define SYS_REBOOT_SYMBOL "sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "sys_newfstat" +#define SYS_FSTAT64_SYMBOL "sys_fstat64" +#endif + +#else +#error "Unsupported arch" +#endif + +/* allow some architecutres to override `struct pt_regs` */ +#ifndef __PT_REGS_CAST +#define __PT_REGS_CAST(x) (x) +#endif + +#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG) +#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG) +#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG) +#define PT_REGS_SYSCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_SYSCALL_PARM4_REG) +#define PT_REGS_CCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_CCALL_PARM4_REG) +#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG) +#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG) +#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG) +#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG) +#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG) +#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG) +#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG) + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) +#define PT_REAL_REGS(regs) ((struct pt_regs *)PT_REGS_PARM1(regs)) +#else +#define PT_REAL_REGS(regs) ((regs)) +#endif + +#endif diff --git a/drivers/kernelsu/core_hook.c b/drivers/kernelsu/core_hook.c new file mode 100644 index 000000000000..3d76300f7950 --- /dev/null +++ b/drivers/kernelsu/core_hook.c @@ -0,0 +1,452 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // sys_umount + +#ifdef CONFIG_KSU_LSM_SECURITY_HOOKS +#define LSM_HANDLER_TYPE static int +#else +#define LSM_HANDLER_TYPE int +#endif + +static bool ksu_kernel_umount_enabled = true; + +static int 
kernel_umount_feature_get(u64 *value) +{ + *value = ksu_kernel_umount_enabled ? 1 : 0; + return 0; +} + +static int kernel_umount_feature_set(u64 value) +{ + bool enable = value != 0; + ksu_kernel_umount_enabled = enable; + pr_info("kernel_umount: set to %d\n", enable); + return 0; +} + +static const struct ksu_feature_handler kernel_umount_handler = { + .feature_id = KSU_FEATURE_KERNEL_UMOUNT, + .name = "kernel_umount", + .get_handler = kernel_umount_feature_get, + .set_handler = kernel_umount_feature_set, +}; + +LSM_HANDLER_TYPE ksu_handle_rename(struct dentry *old_dentry, struct dentry *new_dentry) +{ + if (!current->mm) { + // skip kernel threads + return 0; + } + + kuid_t current_uid = current_uid(); + if (ksu_get_uid_t(current_uid) != 1000) { + // skip non system uid + return 0; + } + + if (!old_dentry || !new_dentry) { + return 0; + } + + // /data/system/packages.list.tmp -> /data/system/packages.list + if (strcmp(new_dentry->d_iname, "packages.list")) { + return 0; + } + + char path[128]; + char *buf = dentry_path_raw(new_dentry, path, sizeof(path)); + if (IS_ERR(buf)) { + pr_err("dentry_path_raw failed.\n"); + return 0; + } + + if (!strstr(buf, "/system/packages.list")) { + return 0; + } + pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname, + new_dentry->d_iname, buf); + + track_throne(false); + + return 0; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) +__weak int path_umount(struct path *path, int flags) +{ + char buf[256] = {0}; + int ret; + + // -1 on the size as implicit null termination + // as we zero init the thing + char *usermnt = d_path(path, buf, sizeof(buf) - 1); + if (!(usermnt && usermnt != buf)) { + ret = -ENOENT; + goto out; + } + + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) + ret = ksys_umount((char __user *)usermnt, flags); +#else + ret = (int)sys_umount((char __user *)usermnt, flags); +#endif + + set_fs(old_fs); + + // release ref here! 
user_path_at increases it + // then only cleans for itself +out: + path_put(path); + return ret; +} +#endif + +static void ksu_umount_mnt(const char *mnt, struct path *path, int flags) +{ + int err = path_umount(path, flags); + + // upstream actually has a UAF here: path->dentry after dput + // but its fine as umount always succeeds + // that code path is very cold + if (err) + pr_info("umount %s failed: %d\n", mnt, err); +} + +static void try_umount(const char *mnt, int flags) +{ + struct path path; + int err = kern_path(mnt, 0, &path); + if (err) { + return; + } + + if (path.dentry != path.mnt->mnt_root) { + // it is not root mountpoint, maybe umounted by others already. + path_put(&path); + return; + } + + ksu_umount_mnt(mnt, &path, flags); +} + +LSM_HANDLER_TYPE ksu_handle_setuid(struct cred *new, const struct cred *old) +{ + if (!new || !old) { + return 0; + } + + uid_t new_uid = ksu_get_uid_t(new->uid); + uid_t old_uid = ksu_get_uid_t(old->uid); + + // old process is not root, ignore it. + if (0 != old_uid) + return 0; + + // we dont have those new fancy things upstream has + // lets just do original thing where we disable seccomp + if (likely(ksu_is_manager_appid_valid()) && unlikely(ksu_get_manager_appid() == new_uid % PER_USER_RANGE)) { + disable_seccomp(); + pr_info("install fd for: %d\n", new_uid); + ksu_install_fd(); // install fd for ksu manager + } + + if (unlikely(ksu_is_allow_uid_for_current(new_uid))) { + disable_seccomp(); + return 0; + } + + // if there isn't any module mounted, just ignore it! + if (!ksu_module_mounted) { + return 0; + } + + if (!ksu_kernel_umount_enabled) { + return 0; + } + + if (!ksu_cred) { + return 0; + } + + // There are 5 scenarios: + // 1. Normal app: zygote -> appuid + // 2. Isolated process forked from zygote: zygote -> isolated_process + // 3. App zygote forked from zygote: zygote -> appuid + // 4. Isolated process froked from app zygote: appuid -> isolated_process (already handled by 3) + // 5. 
Isolated process froked from webview zygote (no need to handle, app cannot run custom code) + if (!is_appuid(new_uid) && !is_isolated_process(new_uid)) { + return 0; + } + + if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid)) { + return 0; + } + + // check old process's selinux context, if it is not zygote, ignore it! + // because some su apps may setuid to untrusted_app but they are in global mount namespace + // when we umount for such process, that is a disaster! + // also handle case 4 and 5 + bool is_zygote_child = is_zygote(old); + if (!is_zygote_child) { + pr_info("handle umount ignore non zygote child: %d\n", current->pid); + return 0; + } + + // umount the target mnt + pr_info("handle umount for uid: %d, pid: %d\n", new_uid, current->pid); + + const struct cred *saved = override_creds(ksu_cred); + + struct mount_entry *entry; + down_read(&mount_list_lock); + list_for_each_entry(entry, &mount_list, list) { + pr_info("%s: unmounting: %s flags 0x%x\n", __func__, entry->umountable, entry->flags); + try_umount(entry->umountable, entry->flags); + } + up_read(&mount_list_lock); + + revert_creds(saved); + + return 0; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) +static void ksu_grab_init_session_keyring(const char *filename); +#endif + +LSM_HANDLER_TYPE ksu_bprm_check(struct linux_binprm *bprm) +{ + if (likely(!ksu_execveat_hook)) + return 0; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) + ksu_grab_init_session_keyring((const char *)bprm->filename); +#endif + + ksu_handle_pre_ksud((char *)bprm->filename); + + return 0; +} + +bool ksu_vfs_read_hook __read_mostly; +static void ksu_handle_initrc(struct file *file); + +LSM_HANDLER_TYPE ksu_file_permission(struct file *file, int mask) +{ + if (likely(!ksu_vfs_read_hook)) + return 0; + + ksu_handle_initrc(file); + + return 0; +} + +#ifdef CONFIG_KSU_LSM_SECURITY_HOOKS +static int 
ksu_inode_rename(struct inode *old_inode, struct dentry *old_dentry, + struct inode *new_inode, struct dentry *new_dentry) +{ + return ksu_handle_rename(old_dentry, new_dentry); +} + +static int ksu_task_fix_setuid(struct cred *new, const struct cred *old, + int flags) +{ + return ksu_handle_setuid(new, old); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) +#include +static struct security_hook_list ksu_hooks[] = { + LSM_HOOK_INIT(inode_rename, ksu_inode_rename), + LSM_HOOK_INIT(task_fix_setuid, ksu_task_fix_setuid), + LSM_HOOK_INIT(bprm_check_security, ksu_bprm_check), + LSM_HOOK_INIT(file_permission, ksu_file_permission), +}; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) +static void ksu_lsm_hook_init(void) +{ + security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), "ksu"); +} + +#else +static void ksu_lsm_hook_init(void) +{ + security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks)); +} +#endif // < 4.11 + +#else // 4.2 + +// selinux_ops (LSM), security_operations struct tampering for ultra legacy + +extern struct security_operations selinux_ops; + +static int (*orig_inode_rename) (struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); +static int hook_inode_rename(struct inode *old_inode, struct dentry *old_dentry, + struct inode *new_inode, struct dentry *new_dentry) +{ + ksu_inode_rename(old_inode, old_dentry, new_inode, new_dentry); + return orig_inode_rename(old_inode, old_dentry, new_inode, new_dentry); +} + +static int (*orig_task_fix_setuid) (struct cred *new, const struct cred *old, int flags); +static int hook_task_fix_setuid(struct cred *new, const struct cred *old, int flags) +{ + ksu_task_fix_setuid(new, old, flags); + return orig_task_fix_setuid(new, old, flags); +} + +static int (*orig_bprm_check_security)(struct linux_binprm *bprm); +static int hook_bprm_check_security(struct linux_binprm *bprm) +{ + ksu_bprm_check(bprm); + return orig_bprm_check_security(bprm); +} + +static int 
(*orig_file_permission) (struct file *file, int mask); +static int hook_file_permission(struct file *file, int mask) +{ + + ksu_file_permission(file, mask); + return orig_file_permission(file, mask); +} + +static void ksu_lsm_hook_restore(void) +{ + struct security_operations *ops = (struct security_operations *)&selinux_ops; + + if (!ops) + return; + + if (!!strcmp((char *)ops, "selinux")) + return; + + // TODO: maybe hunt for this in memory instead of exporting + // this is the first member of the struct so it points to the struct + pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops ); + + preempt_disable(); + + if (orig_bprm_check_security) { + pr_info("%s: restoring: 0x%lx to 0x%lx\n", __func__, (long)ops->bprm_check_security, (long)orig_bprm_check_security); + ops->bprm_check_security = orig_bprm_check_security; + } + + if (orig_file_permission) { + pr_info("%s: restoring: 0x%lx to 0x%lx\n", __func__, (long)ops->file_permission, (long)orig_file_permission); + ops->file_permission = orig_file_permission; + } + + preempt_enable(); + + smp_mb(); + return; +} + +static struct task_struct *unhook_thread; + +static int execveat_hook_wait_fn(void *data) +{ +loop_start: + + msleep(1000); + + if ((volatile bool)ksu_execveat_hook) + goto loop_start; + + ksu_lsm_hook_restore(); + + return 0; +} + +static void execveat_hook_wait_thread() +{ + unhook_thread = kthread_run(execveat_hook_wait_fn, NULL, "unhook"); + if (IS_ERR(unhook_thread)) { + unhook_thread = NULL; + return; + } +} + +static void ksu_lsm_hook_init(void) +{ + struct security_operations *ops = (struct security_operations *)&selinux_ops; + + if (!ops) + return; + + if (!!strcmp((char *)ops, "selinux")) + return; + + // TODO: maybe hunt for this in memory instead of exporting + // this is the first member of the struct so it points to the struct + pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops ); + + preempt_disable(); + + 
orig_inode_rename = ops->inode_rename; + ops->inode_rename = hook_inode_rename; + + orig_task_fix_setuid = ops->task_fix_setuid; + ops->task_fix_setuid = hook_task_fix_setuid; + + orig_bprm_check_security = ops->bprm_check_security; + ops->bprm_check_security = hook_bprm_check_security; + + orig_file_permission = ops->file_permission; + ops->file_permission = hook_file_permission; + + preempt_enable(); + + smp_mb(); + + execveat_hook_wait_thread(); + return; +} + +#endif // < 4.2 + +#else +void __init ksu_lsm_hook_init(void) +{ + // nothing, no-op +} +#endif // CONFIG_KSU_LSM_SECURITY_HOOKS + +void __init ksu_core_init(void) +{ + ksu_lsm_hook_init(); + if (ksu_register_feature_handler(&kernel_umount_handler)) { + pr_err("Failed to register kernel_umount feature handler\n"); + } +} diff --git a/drivers/kernelsu/core_hook.h b/drivers/kernelsu/core_hook.h new file mode 100644 index 000000000000..af967f0a1be2 --- /dev/null +++ b/drivers/kernelsu/core_hook.h @@ -0,0 +1,21 @@ +#ifndef __KSU_H_KSU_CORE +#define __KSU_H_KSU_CORE + +#include +#include +#include + +void __init ksu_core_init(void); + +void escape_with_root_profile(void); + +// for the umount list +struct mount_entry { + char *umountable; + unsigned int flags; + struct list_head list; +}; +extern struct list_head mount_list; +extern struct rw_semaphore mount_list_lock; + +#endif diff --git a/drivers/kernelsu/extras.c b/drivers/kernelsu/extras.c new file mode 100644 index 000000000000..4181a62ba312 --- /dev/null +++ b/drivers/kernelsu/extras.c @@ -0,0 +1,219 @@ +#include +#include +#include + +// sorry for the ifdef hell +// but im too lazy to fragment this out. 
+// theres only one feature so far anyway +// - xx, 20251019 + +static u32 su_sid = 0; +static u32 priv_app_sid = 0; + +// init as disabled by default +static atomic_t disable_spoof = ATOMIC_INIT(1); + +void ksu_avc_spoof_enable(); +void ksu_avc_spoof_disable(); + +static bool ksu_avc_spoof_enabled = true; +static bool boot_completed = false; + +static int avc_spoof_feature_get(u64 *value) +{ + *value = ksu_avc_spoof_enabled ? 1 : 0; + return 0; +} + +static int avc_spoof_feature_set(u64 value) +{ + bool enable = value != 0; + + if (enable == ksu_avc_spoof_enabled) { + pr_info("avc_spoof: no need to change\n"); + return 0; + } + + ksu_avc_spoof_enabled = enable; + + if (boot_completed) { + if (enable) { + ksu_avc_spoof_enable(); + } else { + ksu_avc_spoof_disable(); + } + } + + pr_info("avc_spoof: set to %d\n", enable); + + return 0; +} + +static const struct ksu_feature_handler avc_spoof_handler = { + .feature_id = KSU_FEATURE_AVC_SPOOF, + .name = "avc_spoof", + .get_handler = avc_spoof_feature_get, + .set_handler = avc_spoof_feature_set, +}; + +static int get_sid() +{ + // dont load at all if we cant get sids + int err = security_secctx_to_secid("u:r:su:s0", strlen("u:r:su:s0"), &su_sid); + if (err) { + pr_info("avc_spoof/get_sid: su_sid not found!\n"); + return -1; + } + pr_info("avc_spoof/get_sid: su_sid: %u\n", su_sid); + + err = security_secctx_to_secid("u:r:priv_app:s0:c512,c768", strlen("u:r:priv_app:s0:c512,c768"), &priv_app_sid); + if (err) { + pr_info("avc_spoof/get_sid: priv_app_sid not found!\n"); + return -1; + } + pr_info("avc_spoof/get_sid: priv_app_sid: %u\n", priv_app_sid); + return 0; +} + +#if defined(CONFIG_KPROBES) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) +#include +#include +static struct kprobe *slow_avc_audit_kp; + +static int ksu_handle_slow_avc_audit(u32 *tsid) +{ + if (atomic_read(&disable_spoof)) + return 0; + + // if tsid is su, we just replace it + // unsure if its enough, but this is how it is aye? 
+ if (*tsid == su_sid) { + pr_info("avc_spoof/slow_avc_audit: replacing su_sid: %u with priv_app_sid: %u\n", su_sid, priv_app_sid); + *tsid = priv_app_sid; + } + + return 0; +} + +static int slow_avc_audit_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + if (atomic_read(&disable_spoof)) + return 0; + + /* + * for < 4.17 int slow_avc_audit(u32 ssid, u32 tsid + * for >= 4.17 int slow_avc_audit(struct selinux_state *state, u32 ssid, u32 tsid + * for >= 6.4 int slow_avc_audit(u32 ssid, u32 tsid + * not to mention theres also DKSU_HAS_SELINUX_STATE + * since its hard to make sure this selinux state thing + * cross crossing with 4.17 ~ 6.4's where slow_avc_audit + * changes abi (tsid in arg2 vs arg3) + */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) + u32 *tsid = (u32 *)&PT_REGS_PARM2(regs); + ksu_handle_slow_avc_audit(tsid); +#else + u32 *tsid = (u32 *)&PT_REGS_PARM3(regs); + ksu_handle_slow_avc_audit(tsid); +#endif + + return 0; +} + +// copied from upstream +static struct kprobe *init_kprobe(const char *name, + kprobe_pre_handler_t handler) +{ + struct kprobe *kp = kzalloc(sizeof(struct kprobe), GFP_KERNEL); + if (!kp) + return NULL; + kp->symbol_name = name; + kp->pre_handler = handler; + + int ret = register_kprobe(kp); + pr_info("sucompat: register_%s kprobe: %d\n", name, ret); + if (ret) { + kfree(kp); + return NULL; + } + + return kp; +} +static void destroy_kprobe(struct kprobe **kp_ptr) +{ + struct kprobe *kp = *kp_ptr; + if (!kp) + return; + unregister_kprobe(kp); + synchronize_rcu(); + kfree(kp); + *kp_ptr = NULL; +} +#else // CONFIG_KPROBES +int ksu_handle_slow_avc_audit_new(u32 tsid, u16 *tclass) +{ + if (atomic_read(&disable_spoof)) + return 0; + + if (tsid != su_sid) + return 0; + + pr_info("avc_spoof/slow_avc_audit: prevent log for sid: %u\n", su_sid); + *tclass = 0; + + return 0; +} +#endif + +void ksu_avc_spoof_disable(void) +{ +#if defined(CONFIG_KPROBES) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) + pr_info("avc_spoof/exit: 
unregister slow_avc_audit kprobe!\n"); + destroy_kprobe(&slow_avc_audit_kp); +#endif + atomic_set(&disable_spoof, 1); + pr_info("avc_spoof/exit: slow_avc_audit spoofing disabled!\n"); +} + +void ksu_avc_spoof_enable(void) +{ + int ret = get_sid(); + if (ret) { + pr_info("avc_spoof/init: sid grab fail!\n"); + return; + } + +#if defined(CONFIG_KPROBES) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) + pr_info("avc_spoof/init: register slow_avc_audit kprobe!\n"); + slow_avc_audit_kp = init_kprobe("slow_avc_audit", slow_avc_audit_pre_handler); +#endif + // once we get the sids, we can now enable the hook handler + atomic_set(&disable_spoof, 0); + + pr_info("avc_spoof/init: slow_avc_audit spoofing enabled!\n"); +} + +void ksu_avc_spoof_late_init() +{ + boot_completed = true; + + if (ksu_avc_spoof_enabled) { + ksu_avc_spoof_enable(); + } +} + +void ksu_avc_spoof_init() +{ + if (ksu_register_feature_handler(&avc_spoof_handler)) { + pr_err("Failed to register avc spoof feature handler\n"); + } +} + +void ksu_avc_spoof_exit() +{ + if (ksu_avc_spoof_enabled) { + ksu_avc_spoof_disable(); + } + ksu_unregister_feature_handler(KSU_FEATURE_AVC_SPOOF); +} diff --git a/drivers/kernelsu/feature.c b/drivers/kernelsu/feature.c new file mode 100644 index 000000000000..57600b1f234c --- /dev/null +++ b/drivers/kernelsu/feature.c @@ -0,0 +1,174 @@ +#include +#include + +static const struct ksu_feature_handler *feature_handlers[KSU_FEATURE_MAX]; + +static DEFINE_MUTEX(feature_mutex); + +int ksu_register_feature_handler(const struct ksu_feature_handler *handler) +{ + if (!handler) { + pr_err("feature: register handler is NULL\n"); + return -EINVAL; + } + + if (handler->feature_id >= KSU_FEATURE_MAX) { + pr_err("feature: invalid feature_id %u\n", handler->feature_id); + return -EINVAL; + } + + if (!handler->get_handler && !handler->set_handler) { + pr_err("feature: no handler provided for feature %u\n", + handler->feature_id); + return -EINVAL; + } + + mutex_lock(&feature_mutex); + + if 
(feature_handlers[handler->feature_id]) { + pr_warn("feature: handler for %u already registered, overwriting\n", + handler->feature_id); + } + + feature_handlers[handler->feature_id] = handler; + + pr_info("feature: registered handler for %s (id=%u)\n", + handler->name ? handler->name : "unknown", handler->feature_id); + + mutex_unlock(&feature_mutex); + return 0; +} + +int ksu_unregister_feature_handler(u32 feature_id) +{ + int ret = 0; + + if (feature_id >= KSU_FEATURE_MAX) { + pr_err("feature: invalid feature_id %u\n", feature_id); + return -EINVAL; + } + + mutex_lock(&feature_mutex); + + if (!feature_handlers[feature_id]) { + pr_warn("feature: no handler registered for %u\n", feature_id); + ret = -ENOENT; + goto out; + } + + feature_handlers[feature_id] = NULL; + + pr_info("feature: unregistered handler for id=%u\n", feature_id); + +out: + mutex_unlock(&feature_mutex); + return ret; +} + +int ksu_get_feature(u32 feature_id, u64 *value, bool *supported) +{ + int ret = 0; + const struct ksu_feature_handler *handler; + + if (feature_id >= KSU_FEATURE_MAX) { + pr_err("feature: invalid feature_id %u\n", feature_id); + return -EINVAL; + } + + if (!value || !supported) { + pr_err("feature: invalid parameters\n"); + return -EINVAL; + } + + mutex_lock(&feature_mutex); + + handler = feature_handlers[feature_id]; + + if (!handler) { + *supported = false; + *value = 0; + pr_debug("feature: feature %u not supported\n", feature_id); + goto out; + } + + *supported = true; + + if (!handler->get_handler) { + pr_warn("feature: no get_handler for feature %u\n", feature_id); + ret = -EOPNOTSUPP; + goto out; + } + + ret = handler->get_handler(value); + if (ret) { + pr_err("feature: get_handler for %u failed: %d\n", feature_id, + ret); + } + +out: + mutex_unlock(&feature_mutex); + return ret; +} + +int ksu_set_feature(u32 feature_id, u64 value) +{ + int ret = 0; + const struct ksu_feature_handler *handler; + + if (feature_id >= KSU_FEATURE_MAX) { + pr_err("feature: invalid 
feature_id %u\n", feature_id); + return -EINVAL; + } + + mutex_lock(&feature_mutex); + + handler = feature_handlers[feature_id]; + + if (!handler) { + pr_err("feature: feature %u not registered\n", feature_id); + ret = -EOPNOTSUPP; + goto out; + } + + if (!handler->set_handler) { + pr_warn("feature: no set_handler for feature %u\n", feature_id); + ret = -EOPNOTSUPP; + goto out; + } + + ret = handler->set_handler(value); + if (ret) { + pr_err("feature: set_handler for %u failed: %d\n", feature_id, + ret); + } + +out: + mutex_unlock(&feature_mutex); + return ret; +} + +void ksu_feature_init(void) +{ + int i; + + for (i = 0; i < KSU_FEATURE_MAX; i++) { + feature_handlers[i] = NULL; + } + + pr_info("feature: feature management initialized\n"); +} + +void ksu_feature_exit(void) +{ + int i; + + mutex_lock(&feature_mutex); + + for (i = 0; i < KSU_FEATURE_MAX; i++) { + feature_handlers[i] = NULL; + } + + mutex_unlock(&feature_mutex); + + pr_info("feature: feature management cleaned up\n"); +} diff --git a/drivers/kernelsu/feature.h b/drivers/kernelsu/feature.h new file mode 100644 index 000000000000..bf0fda4d3761 --- /dev/null +++ b/drivers/kernelsu/feature.h @@ -0,0 +1,39 @@ +#ifndef __KSU_H_FEATURE +#define __KSU_H_FEATURE + +#include + +enum ksu_feature_id { + KSU_FEATURE_SU_COMPAT = 0, + KSU_FEATURE_KERNEL_UMOUNT = 1, + +#ifdef CONFIG_KSU_EXTRAS // custom extensions + KSU_FEATURE_AVC_SPOOF = 10003, +#endif + + KSU_FEATURE_MAX +}; + +typedef int (*ksu_feature_get_t)(u64 *value); +typedef int (*ksu_feature_set_t)(u64 value); + +struct ksu_feature_handler { + u32 feature_id; + const char *name; + ksu_feature_get_t get_handler; + ksu_feature_set_t set_handler; +}; + +int ksu_register_feature_handler(const struct ksu_feature_handler *handler); + +int ksu_unregister_feature_handler(u32 feature_id); + +int ksu_get_feature(u32 feature_id, u64 *value, bool *supported); + +int ksu_set_feature(u32 feature_id, u64 value); + +void ksu_feature_init(void); + +void 
ksu_feature_exit(void); + +#endif // __KSU_H_FEATURE diff --git a/drivers/kernelsu/file_wrapper.c b/drivers/kernelsu/file_wrapper.c new file mode 100644 index 000000000000..e3de8d9d4eac --- /dev/null +++ b/drivers/kernelsu/file_wrapper.c @@ -0,0 +1,628 @@ +#include +#include +#include +#include +#include // kernel 3.18 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ksu_file_wrapper { + struct file *orig; + struct file_operations ops; +}; + +static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0) +#ifndef replace_fops +#define replace_fops(f, fops) \ + do { \ + struct file *__file = (f); \ + fops_put(__file->f_op); \ + BUG_ON(!(__file->f_op = (fops))); \ + } while(0) +#endif +#endif + +static int ksu_wrapper_open(struct inode *ino, struct file *fp) +{ + struct path *orig_path = fp->f_path.dentry->d_fsdata; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) + struct file *orig_file = dentry_open(orig_path, fp->f_flags, current_cred()); +#else + struct file *orig_file = dentry_open((*orig_path).dentry, (*orig_path).mnt, fp->f_flags, current_cred()); +#endif + + if (IS_ERR(orig_file)) { + return PTR_ERR(orig_file); + } + struct ksu_file_wrapper *wrapper = ksu_create_file_wrapper(orig_file); + if (IS_ERR(wrapper)) { + filp_close(orig_file, current->files); + return PTR_ERR(wrapper); + } + fp->private_data = wrapper; + const struct file_operations *new_fops = fops_get(&wrapper->ops); + replace_fops(fp, new_fops); + return 0; +} + +static const struct file_operations ksu_file_wrapper_inode_fops = { + .owner = THIS_MODULE, + .open = ksu_wrapper_open +}; + +static loff_t ksu_wrapper_llseek(struct file *fp, loff_t off, int flags) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->llseek(data->orig, off, flags); +} + +static ssize_t ksu_wrapper_read(struct file *fp, char __user *ptr, 
size_t sz, loff_t *off) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->read(orig, ptr, sz, off); +} + +static ssize_t ksu_wrapper_write(struct file *fp, const char __user *ptr, size_t sz, loff_t *off) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->write(orig, ptr, sz, off); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static ssize_t ksu_wrapper_read_iter(struct kiocb *iocb, struct iov_iter *iovi) { + struct ksu_file_wrapper* data = iocb->ki_filp->private_data; + struct file* orig = data->orig; + iocb->ki_filp = orig; + return orig->f_op->read_iter(iocb, iovi); +} + +static ssize_t ksu_wrapper_write_iter(struct kiocb *iocb, struct iov_iter *iovi) { + struct ksu_file_wrapper* data = iocb->ki_filp->private_data; + struct file* orig = data->orig; + iocb->ki_filp = orig; + return orig->f_op->write_iter(iocb, iovi); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) +static int ksu_wrapper_iopoll(struct kiocb *kiocb, struct io_comp_batch* icb, unsigned int v) { + struct ksu_file_wrapper* data = kiocb->ki_filp->private_data; + struct file* orig = data->orig; + kiocb->ki_filp = orig; + return orig->f_op->iopoll(kiocb, icb, v); +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) +static int ksu_wrapper_iopoll(struct kiocb *kiocb, bool spin) { + struct ksu_file_wrapper* data = kiocb->ki_filp->private_data; + struct file* orig = data->orig; + kiocb->ki_filp = orig; + return orig->f_op->iopoll(kiocb, spin); +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) && (LINUX_VERSION_CODE > KERNEL_VERSION(3, 11, 0) || defined(KSU_HAS_ITERATE_DIR)) +static int ksu_wrapper_iterate (struct file *fp, struct dir_context *dc) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->iterate(orig, dc); +} +#endif + +// int (*readdir) (struct file *, void *, filldir_t); +#if 
LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR) +static int ksu_wrapper_readdir(struct file *fp, void *ptr, filldir_t filler) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->readdir(orig, ptr, filler); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) +static int ksu_wrapper_iterate_shared(struct file *fp, struct dir_context *dc) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->iterate_shared(orig, dc); +} +#endif + +// typedef unsigned __bitwise __poll_t; +static unsigned __bitwise ksu_wrapper_poll(struct file *fp, struct poll_table_struct *pts) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->poll(orig, pts); +} + +static long ksu_wrapper_unlocked_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->unlocked_ioctl(orig, cmd, arg); +} + +static long ksu_wrapper_compat_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->compat_ioctl(orig, cmd, arg); +} + +static int ksu_wrapper_mmap(struct file *fp, struct vm_area_struct * vma) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->mmap(orig, vma); +} + +static int ksu_wrapper_flush(struct file *fp, fl_owner_t id) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->flush(orig, id); +} + + +static int ksu_wrapper_fsync(struct file *fp, loff_t off1, loff_t off2, int datasync) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->fsync(orig, off1, off2, datasync); +} + +static int ksu_wrapper_fasync(int 
arg, struct file *fp, int arg2) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->fasync(arg, orig, arg2); +} + +static int ksu_wrapper_lock(struct file *fp, int arg1, struct file_lock *fl) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->lock(orig, arg1, fl); +} + + +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) +static ssize_t ksu_wrapper_sendpage(struct file *fp, struct page *pg, int arg1, size_t sz, loff_t *off, int arg2) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->sendpage) { + return orig->f_op->sendpage(orig, pg, arg1, sz, off, arg2); + } + return -EINVAL; +} +#endif + +static unsigned long ksu_wrapper_get_unmapped_area(struct file *fp, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->get_unmapped_area) { + return orig->f_op->get_unmapped_area(orig, arg1, arg2, arg3, arg4); + } + return -EINVAL; +} + +// static int ksu_wrapper_check_flags(int arg) {} + +static int ksu_wrapper_flock(struct file *fp, int arg1, struct file_lock *fl) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->flock) { + return orig->f_op->flock(orig, arg1, fl); + } + return -EINVAL; +} + +static ssize_t ksu_wrapper_splice_write(struct pipe_inode_info * pii, struct file *fp, loff_t *off, size_t sz, unsigned int arg1) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->splice_write) { + return orig->f_op->splice_write(pii, orig, off, sz, arg1); + } + return -EINVAL; +} + +static ssize_t ksu_wrapper_splice_read(struct file *fp, loff_t *off, struct pipe_inode_info *pii, size_t sz, unsigned int arg1) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* 
orig = data->orig; + if (orig->f_op->splice_read) { + return orig->f_op->splice_read(orig, off, pii, sz, arg1); + } + return -EINVAL; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) +void ksu_wrapper_splice_eof(struct file *fp) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->splice_eof) { + return orig->f_op->splice_eof(orig); + } +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) +static int ksu_wrapper_setlease(struct file *fp, int arg1, struct file_lease **fl, void **p) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->setlease) { + return orig->f_op->setlease(orig, arg1, fl, p); + } + return -EINVAL; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) +static int ksu_wrapper_setlease(struct file *fp, int arg1, struct file_lock **fl, void **p) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->setlease) { + return orig->f_op->setlease(orig, arg1, fl, p); + } + return -EINVAL; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) // int (*setlease)(struct file *, long, struct file_lock **, void **); +static int ksu_wrapper_setlease(struct file *fp, long arg1, struct file_lock **fl, void **p) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->setlease) { + return orig->f_op->setlease(orig, arg1, fl, p); + } + return -EINVAL; +} +#else // int (*setlease)(struct file *, long, struct file_lock **); +static int ksu_wrapper_setlease(struct file *fp, long arg1, struct file_lock **fl) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->setlease) { + return orig->f_op->setlease(orig, arg1, fl); + } + return -EINVAL; +} +#endif + +static long ksu_wrapper_fallocate(struct file *fp, int mode, loff_t offset, loff_t len) { + struct ksu_file_wrapper* data = fp->private_data; + 
struct file* orig = data->orig; + if (orig->f_op->fallocate) { + return orig->f_op->fallocate(orig, mode, offset, len); + } + return -EINVAL; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +static void ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) { + struct ksu_file_wrapper* data = f->private_data; + struct file* orig = data->orig; + if (orig->f_op->show_fdinfo) { + orig->f_op->show_fdinfo(m, orig); + } +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +static int ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) { + struct ksu_file_wrapper* data = f->private_data; + struct file* orig = data->orig; + if (orig->f_op->show_fdinfo) { + orig->f_op->show_fdinfo(m, orig); + } + return -EINVAL; +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) +// https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/read_write.c;l=1593-1606;drc=398da7defe218d3e51b0f3bdff75147e28125b60 +static ssize_t ksu_wrapper_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, + loff_t pos_out, size_t len, unsigned int flags) { + struct ksu_file_wrapper* data = file_out->private_data; + struct file* orig = data->orig; + return orig->f_op->copy_file_range(file_in, pos_in, orig, pos_out, len, flags); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) +// no REMAP_FILE_DEDUP: use file_in +// https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/read_write.c;l=1598-1599;drc=398da7defe218d3e51b0f3bdff75147e28125b60 +// https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/remap_range.c;l=403-404;drc=398da7defe218d3e51b0f3bdff75147e28125b60 +// REMAP_FILE_DEDUP: use file_out +// https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/remap_range.c;l=483-484;drc=398da7defe218d3e51b0f3bdff75147e28125b60 +static loff_t ksu_wrapper_remap_file_range(struct file *file_in, loff_t pos_in, + 
struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { + if (remap_flags & REMAP_FILE_DEDUP) { + struct ksu_file_wrapper* data = file_out->private_data; + struct file* orig = data->orig; + return orig->f_op->remap_file_range(file_in, pos_in, orig, pos_out, len, remap_flags); + } else { + struct ksu_file_wrapper* data = file_in->private_data; + struct file* orig = data->orig; + return orig->f_op->remap_file_range(orig, pos_in, file_out, pos_out, len, remap_flags); + } +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) +static int ksu_wrapper_fadvise(struct file *fp, loff_t off1, loff_t off2, int flags) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + if (orig->f_op->fadvise) { + return orig->f_op->fadvise(orig, off1, off2, flags); + } + return -EINVAL; +} +#endif + +static void ksu_release_file_wrapper(struct ksu_file_wrapper *data); + +static int ksu_wrapper_release(struct inode *inode, struct file *filp) { + // https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/file_table.c;l=467-473;drc=3be0b283b562eabbc2b1f3bb534dc8903079bbaa + // f_op->release is called before fops_put(f_op), so we put it manually. + fops_put(filp->f_op); + // prevent it from being put again + filp->f_op = NULL; + ksu_release_file_wrapper(filp->private_data); + return 0; +} + +static struct ksu_file_wrapper* ksu_create_file_wrapper(struct file* fp) { + struct ksu_file_wrapper* p = kcalloc(1, sizeof(struct ksu_file_wrapper), GFP_KERNEL); + if (!p) { + return ERR_PTR(-ENOMEM); + } + + get_file(fp); + + p->orig = fp; + p->ops.owner = THIS_MODULE; + p->ops.llseek = fp->f_op->llseek ? ksu_wrapper_llseek : NULL; + p->ops.read = fp->f_op->read ? ksu_wrapper_read : NULL; + p->ops.write = fp->f_op->write ? ksu_wrapper_write : NULL; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) + p->ops.read_iter = fp->f_op->read_iter ? 
ksu_wrapper_read_iter : NULL; + p->ops.write_iter = fp->f_op->write_iter ? ksu_wrapper_write_iter : NULL; +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) + p->ops.iopoll = fp->f_op->iopoll ? ksu_wrapper_iopoll : NULL; +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) && (LINUX_VERSION_CODE > KERNEL_VERSION(3, 11, 0) || defined(KSU_HAS_ITERATE_DIR)) + p->ops.iterate = fp->f_op->iterate ? ksu_wrapper_iterate : NULL; +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR) + p->ops.readdir = fp->f_op->readdir ? ksu_wrapper_readdir : NULL; +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) + p->ops.iterate_shared = fp->f_op->iterate_shared ? ksu_wrapper_iterate_shared : NULL; +#endif + p->ops.poll = fp->f_op->poll ? ksu_wrapper_poll : NULL; + p->ops.unlocked_ioctl = fp->f_op->unlocked_ioctl ? ksu_wrapper_unlocked_ioctl : NULL; + p->ops.compat_ioctl = fp->f_op->compat_ioctl ? ksu_wrapper_compat_ioctl : NULL; + p->ops.mmap = fp->f_op->mmap ? ksu_wrapper_mmap : NULL; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) + p->ops.fop_flags = fp->f_op->fop_flags; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) + p->ops.mmap_supported_flags = fp->f_op->mmap_supported_flags; +#endif + p->ops.flush = fp->f_op->flush ? ksu_wrapper_flush : NULL; + p->ops.release = ksu_wrapper_release; + p->ops.fsync = fp->f_op->fsync ? ksu_wrapper_fsync : NULL; + p->ops.fasync = fp->f_op->fasync ? ksu_wrapper_fasync : NULL; + p->ops.lock = fp->f_op->lock ? ksu_wrapper_lock : NULL; +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) + p->ops.sendpage = fp->f_op->sendpage ? ksu_wrapper_sendpage : NULL; +#endif + p->ops.get_unmapped_area = fp->f_op->get_unmapped_area ? ksu_wrapper_get_unmapped_area : NULL; + p->ops.check_flags = fp->f_op->check_flags; + p->ops.flock = fp->f_op->flock ? ksu_wrapper_flock : NULL; + p->ops.splice_write = fp->f_op->splice_write ? 
ksu_wrapper_splice_write : NULL; + p->ops.splice_read = fp->f_op->splice_read ? ksu_wrapper_splice_read : NULL; + p->ops.setlease = fp->f_op->setlease ? ksu_wrapper_setlease : NULL; + p->ops.fallocate = fp->f_op->fallocate ? ksu_wrapper_fallocate : NULL; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + p->ops.show_fdinfo = fp->f_op->show_fdinfo ? ksu_wrapper_show_fdinfo : NULL; +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) + p->ops.copy_file_range = fp->f_op->copy_file_range ? ksu_wrapper_copy_file_range : NULL; +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) + p->ops.remap_file_range = fp->f_op->remap_file_range ? ksu_wrapper_remap_file_range : NULL; +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) + p->ops.fadvise = fp->f_op->fadvise ? ksu_wrapper_fadvise : NULL; +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) + p->ops.splice_eof = fp->f_op->splice_eof ? ksu_wrapper_splice_eof : NULL; +#endif + + return p; +} + +static void ksu_release_file_wrapper(struct ksu_file_wrapper *data) +{ + fput((struct file*) data->orig); + kfree(data); +} + +static char *ksu_wrapper_d_dname(struct dentry *dentry, char *buffer, + int buflen) +{ + struct path *orig_path = dentry->d_fsdata; + return d_path(orig_path, buffer, buflen); +} + +static void ksu_wrapper_d_release(struct dentry *dentry) +{ + struct path *orig_path = dentry->d_fsdata; + path_put(orig_path); + kfree(orig_path); +} + +static const struct dentry_operations ksu_file_wrapper_d_ops = { + .d_dname = ksu_wrapper_d_dname, + .d_release = ksu_wrapper_d_release +}; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) +#define ksu_anon_inode_create_getfile_compat anon_inode_create_getfile +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) +#define ksu_anon_inode_create_getfile_compat anon_inode_getfile_secure + +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) +// There is no anon_inode_create_getfile before 5.16, but it's not difficult to implement it. 
+// https://cs.android.com/android/kernel/superproject/+/common-android12-5.10:common/fs/anon_inodes.c;l=58-125;drc=0d34ce8aa78e38affbb501690bcabec4df88620e + +// Borrow kernel's anon_inode_mnt, so that we don't need to mount one by ourselves. +static struct vfsmount *anon_inode_mnt __read_mostly; + +static struct inode * +ksu_anon_inode_make_secure_inode(const char *name, const struct inode *context_inode) +{ + struct inode *inode; + + if (unlikely(!anon_inode_mnt)) { + return ERR_PTR(-ENODEV); + } + + inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); + if (IS_ERR(inode)) + return inode; + inode->i_flags &= ~S_PRIVATE; + + return inode; +} + +static struct file *ksu_anon_inode_create_getfile_compat( + const char *name, const struct file_operations *fops, void *priv, int flags, + const struct inode *context_inode) +{ + struct inode *inode; + struct file *file; + + if (fops->owner && !try_module_get(fops->owner)) + return ERR_PTR(-ENOENT); + + inode = ksu_anon_inode_make_secure_inode(name, context_inode); + if (IS_ERR(inode)) { + file = ERR_CAST(inode); + goto err; + } + + file = alloc_file_pseudo(inode, anon_inode_mnt, name, flags & (O_ACCMODE | O_NONBLOCK), fops); + if (IS_ERR(file)) + goto err_iput; + + file->f_mapping = inode->i_mapping; + + file->private_data = priv; + + return file; + +err_iput: + iput(inode); +err: + module_put(fops->owner); + return file; +} +#else +struct file * +ksu_anon_inode_create_getfile_compat(const char *name, const struct file_operations *fops, + void *priv, int flags, const struct inode *context_inode) +{ + return anon_inode_getfile(name, fops, priv, flags); +} +#endif + +int ksu_install_file_wrapper(int fd) +{ + int out_fd, ret; + struct file *orig_file = fget(fd); + if (!orig_file) { + return -EBADF; + } + + out_fd = get_unused_fd_flags(O_CLOEXEC); + if (out_fd < 0) { + ret = out_fd; + goto done; + } + + struct ksu_file_wrapper *file_wrapper_data = + ksu_create_file_wrapper(orig_file); + if (IS_ERR(file_wrapper_data)) { + ret = 
PTR_ERR(file_wrapper_data); + goto out_put_fd; + } + + struct file *wrapper_file = ksu_anon_inode_create_getfile_compat( + "[ksu_fdwrapper]", &file_wrapper_data->ops, file_wrapper_data, + orig_file->f_flags, NULL); + if (IS_ERR(wrapper_file)) { + pr_err("ksu_fdwrapper: getfile failed: %ld\n", PTR_ERR(wrapper_file)); + ret = PTR_ERR(wrapper_file); + goto out_release_wrapper; + } + + // Now do magic on inode and dentry. + // It should be safe to modify them since the file hasn't been published. + + struct inode *wrapper_inode = file_inode(wrapper_file); + // libc's stdio relies on the fstat() result of the fd to determine its buffer type. + wrapper_inode->i_mode = file_inode(orig_file)->i_mode; + struct inode_security_struct *wrapper_sec = selinux_inode(wrapper_inode); + // Use ksu_file_sid to bypass SELinux check. + // When we call `su` from terminal app, this is useful. + if (wrapper_sec) { + wrapper_sec->sid = ksu_file_sid; + } + // Install open file operation for inode. + wrapper_inode->i_fop = &ksu_file_wrapper_inode_fops; + + struct path *orig_path = kmalloc(sizeof(struct path), GFP_KERNEL); + if (!orig_path) { + ret = -ENOMEM; + goto out_put_wrapper_file; + } + *orig_path = orig_file->f_path; + path_get(orig_path); + // Some applications (such as screen) won't work if the tty's path is weird, + // Therefore, we use d_dname to spoof it to return the path to the original file. 
+ wrapper_file->f_path.dentry->d_fsdata = orig_path; + wrapper_file->f_path.dentry->d_op = &ksu_file_wrapper_d_ops; + + fd_install(out_fd, wrapper_file); + ret = out_fd; + goto done; + +out_put_wrapper_file: + fput(wrapper_file); + // file_wrapper will be released by fput + goto out_put_fd; +out_release_wrapper: + ksu_release_file_wrapper(file_wrapper_data); +out_put_fd: + put_unused_fd(out_fd); +done: + fput(orig_file); + + return ret; +} + +void ksu_file_wrapper_init(void) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) + static const struct file_operations tmp = { .owner = THIS_MODULE }; + struct file *dummy = anon_inode_getfile("dummy", &tmp, NULL, 0); + if (IS_ERR(dummy)) { + pr_err( + "file_wrapper: initialize anon_inode_mnt failed, can't get file: %ld\n", + PTR_ERR(dummy)); + return; + } + anon_inode_mnt = dummy->f_path.mnt; + if (unlikely(!anon_inode_mnt)) { + pr_err("file_wrapper: initialize anon_inode_mnt failed, got NULL\n"); + } + fput(dummy); +#endif +} diff --git a/drivers/kernelsu/file_wrapper.h b/drivers/kernelsu/file_wrapper.h new file mode 100644 index 000000000000..faae4dded301 --- /dev/null +++ b/drivers/kernelsu/file_wrapper.h @@ -0,0 +1,10 @@ +#ifndef KSU_FILE_WRAPPER_H +#define KSU_FILE_WRAPPER_H + +#include +#include + +int ksu_install_file_wrapper(int fd); +void ksu_file_wrapper_init(void); + +#endif // KSU_FILE_WRAPPER_H diff --git a/drivers/kernelsu/kernel_compat.c b/drivers/kernelsu/kernel_compat.c new file mode 100644 index 000000000000..1de8941da722 --- /dev/null +++ b/drivers/kernelsu/kernel_compat.c @@ -0,0 +1,173 @@ +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) +#include // signal_struct +#include +#else +#include +#endif +#include +#include +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) +#include +#include +#include +struct key *init_session_keyring = NULL; + +static inline int 
install_session_keyring(struct key *keyring) +{ + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = install_session_keyring_to_cred(new, keyring); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); +} + +// this is on tgcred on < 3.8 +// while we can grab that one, it seems to not actually be needed +static void ksu_grab_init_session_keyring(const char *filename) +{ + if (init_session_keyring) + return; + + if (!strstr(filename, "init")) + return; + + if (!!strcmp(current->comm, "init")) + return; + + if (!!!is_init(get_current_cred())) + return; + + // thats surely some exclamation comedy + // and now we are sure that this is the key we want + // up to 5.1, struct key __rcu *session_keyring; /* keyring inherited over fork */ + // so we need to grab this using rcu_dereference + struct key *keyring = rcu_dereference(current->cred->session_keyring); + if (!keyring) + return; + + init_session_keyring = key_get(keyring); + + pr_info("%s: init_session_keyring: 0x%p \n", __func__, init_session_keyring); + +} +struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode) +{ + // normally we only put this on ((current->flags & PF_WQ_WORKER) || (current->flags & PF_KTHREAD)) + // but in the grand scale of things, this does NOT matter. 
+ // pr_info("installing init session keyring for older kernel\n"); + if (init_session_keyring != NULL && !current_cred()->session_keyring) { + install_session_keyring(init_session_keyring); + } + return filp_open(filename, flags, mode); +} +#else +struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode) +{ + return filp_open(filename, flags, mode); +} +#endif + +ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, loff_t *pos) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + return kernel_read(p, buf, count, pos); +#else // https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L418 + mm_segment_t old_fs; + old_fs = get_fs(); + set_fs(get_ds()); + ssize_t result = vfs_read(p, (void __user *)buf, count, pos); + set_fs(old_fs); + return result; +#endif +} + +ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count, loff_t *pos) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + return kernel_write(p, buf, count, pos); +#else // https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L512 + mm_segment_t old_fs; + old_fs = get_fs(); + set_fs(get_ds()); + ssize_t res = vfs_write(p, (__force const char __user *)buf, count, pos); + set_fs(old_fs); + return res; +#endif +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) +__weak int path_mount(const char *dev_name, struct path *path, + const char *type_page, unsigned long flags, void *data_page) +{ + // 384 is enough + char buf[384] = {0}; + + // -1 on the size as implicit null termination + // as we zero init the thing + char *realpath = d_path(path, buf, sizeof(buf) - 1); + if (IS_ERR_OR_NULL(realpath) || realpath == buf) /* d_path() signals failure with an ERR_PTR, which must never reach do_mount() as a path */ + return -ENOENT; + + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + long ret = do_mount(dev_name, (const char __user *)realpath, type_page, flags, data_page); + set_fs(old_fs); + return ret; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0) +__weak long copy_from_user_nofault(void *dst, const
void __user *src, size_t size) +{ + // https://elixir.bootlin.com/linux/v5.8/source/mm/maccess.c#L205 + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + + // normally theres an access_ok check here + // but for what we use it, it will always be true. + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); + + set_fs(old_fs); + + if (ret) + return -EFAULT; + return 0; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0) +__weak long copy_from_kernel_nofault(void *dst, const void *src, size_t size) +{ + // https://elixir.bootlin.com/linux/v5.2.21/source/mm/maccess.c#L27 + long ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, + (__force const void __user *)src, size); + pagefault_enable(); + set_fs(old_fs); + + return ret ? -EFAULT : 0; +} +#endif diff --git a/drivers/kernelsu/kernel_compat.h b/drivers/kernelsu/kernel_compat.h new file mode 100644 index 000000000000..9119648121ff --- /dev/null +++ b/drivers/kernelsu/kernel_compat.h @@ -0,0 +1,112 @@ +#ifndef __KSU_H_KERNEL_COMPAT +#define __KSU_H_KERNEL_COMPAT + +#include +#include +#include +#include +#include +#include +#include + +extern struct file *ksu_filp_open_compat(const char *filename, int flags, + umode_t mode); +extern ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, + loff_t *pos); +extern ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, + size_t count, loff_t *pos); + +// for supercalls.c fd install tw +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) +#ifndef TWA_RESUME +#define TWA_RESUME 1 +#endif +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0) +__weak int close_fd(unsigned fd) +{ + return sys_close(fd); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0) +__weak int close_fd(unsigned fd) +{ + // this is ksys_close, but that shit is inline 
+ // its problematic to cascade a weak symbol for it + return __close_fd(current->files, fd); +} +#endif + +extern long copy_from_user_nofault(void *dst, const void __user *src, size_t size); + +/* + * ksu_copy_from_user_retry + * try nofault copy first, if it fails, try with plain + * paramters are the same as copy_from_user + * 0 = success + * + hot since this is reused on sucompat + */ +__attribute__((hot)) +static long ksu_copy_from_user_retry(void *to, const void __user *from, unsigned long count) +{ + long ret = copy_from_user_nofault(to, from, count); + if (likely(!ret)) + return ret; + + // we faulted! fallback to slow path + return copy_from_user(to, from, count); +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR) +struct dir_context { + const filldir_t actor; + loff_t pos; +}; + +static int iterate_dir(struct file *file, struct dir_context *ctx) +{ + return vfs_readdir(file, ctx->actor, ctx); +} +#endif // KSU_HAS_ITERATE_DIR + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +__weak char *bin2hex(char *dst, const void *src, size_t count) +{ + const unsigned char *_src = src; + while (count--) + dst = pack_hex_byte(dst, *_src++); + return dst; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) && !defined(KSU_UL_HAS_FILE_INODE) +static inline struct inode *file_inode(struct file *f) +{ + return f->f_path.dentry->d_inode; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) && !defined(KSU_HAS_SELINUX_INODE) +static inline struct inode_security_struct *selinux_inode(const struct inode *inode) +{ + return inode->i_security; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) && !defined(KSU_HAS_SELINUX_CRED) +static inline struct task_security_struct *selinux_cred(const struct cred *cred) +{ + return cred->security; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION (4, 15, 0) +__weak void groups_sort(struct group_info *group_info) +{ + return; +} +#endif + +#endif diff --git 
a/drivers/kernelsu/klog.h b/drivers/kernelsu/klog.h new file mode 100644 index 000000000000..a934027fbeeb --- /dev/null +++ b/drivers/kernelsu/klog.h @@ -0,0 +1,11 @@ +#ifndef __KSU_H_KLOG +#define __KSU_H_KLOG + +#include + +#ifdef pr_fmt +#undef pr_fmt +#define pr_fmt(fmt) "KernelSU: " fmt +#endif + +#endif diff --git a/drivers/kernelsu/kp_ksud.c b/drivers/kernelsu/kp_ksud.c new file mode 100644 index 000000000000..ece6cf12e456 --- /dev/null +++ b/drivers/kernelsu/kp_ksud.c @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static struct task_struct *unregister_thread; + +// sys_newfstat rp +// upstream: https://github.com/tiann/KernelSU/commit/df640917d11dd0eff1b34ea53ec3c0dc49667002 + +// this is a bit different from copy_from_user_retry +// here we just disable preempt and try nofault again +// we use this inside context that can't sleep +static long ksu_copy_from_user_nofault_retry(void *to, const void __user *from, unsigned long count) +{ + long ret = copy_from_user_nofault(to, from, count); + if (likely(!ret)) + return ret; + + preempt_disable(); + ret = copy_from_user_nofault(to, from, count); + preempt_enable(); + + return ret; +} + +static int sys_newfstat_handler_pre(struct kretprobe_instance *p, struct pt_regs *regs) +{ + struct pt_regs *real_regs = PT_REAL_REGS(regs); + unsigned int fd = PT_REGS_PARM1(real_regs); + void *statbuf = PT_REGS_PARM2(real_regs); + *(void **)&p->data = NULL; + + if (!is_init(get_current_cred())) + return 0; + + struct file *file = fget(fd); + if (!file) + return 0; + + if (is_init_rc(file)) { + pr_info("kp_ksud: newfstat: stat init.rc \n"); + fput(file); + *(void **)&p->data = statbuf; + return 0; + } + fput(file); + + return 0; +} + +static int sys_newfstat_handler_post(struct kretprobe_instance *p, struct pt_regs *regs) +{ + void __user *statbuf = *(void **)&p->data; + if (!statbuf) + return 0; + + void __user *st_size_ptr = statbuf + offsetof(struct stat, st_size); + long 
size, new_size; + + if (ksu_copy_from_user_nofault_retry(&size, st_size_ptr, sizeof(long))) { + pr_info("kp_ksud: newfstat: read statbuf 0x%lx failed \n", (unsigned long)st_size_ptr); + return 0; + } + + new_size = size + ksu_rc_len; + pr_info("kp_ksud: newfstat: adding ksu_rc_len: %ld -> %ld \n", size, new_size); + + // I do NOT think this matters much for now, we can use copy_to_user + // if SHTF then we backport cope_to_user_nofault + if (!copy_to_user(st_size_ptr, &new_size, sizeof(long))) + pr_info("kp_ksud: newfstat: added ksu_rc_len \n"); + else + pr_info("kp_ksud: newfstat: add ksu_rc_len failed: statbuf 0x%lx \n", (unsigned long)st_size_ptr); + + return 0; +} + +static struct kretprobe sys_newfstat_rp = { + .kp.symbol_name = SYS_NEWFSTAT_SYMBOL, + .entry_handler = sys_newfstat_handler_pre, + .handler = sys_newfstat_handler_post, + .data_size = sizeof(void *), +}; + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) +static int sys_fstat64_handler_pre(struct kretprobe_instance *p, struct pt_regs *regs) +{ + struct pt_regs *real_regs = PT_REAL_REGS(regs); + unsigned long fd = PT_REGS_PARM1(real_regs); // long, but I don't think it matters. 
+ void *statbuf = PT_REGS_PARM2(real_regs); + *(void **)&p->data = NULL; + + if (!is_init(get_current_cred())) + return 0; + + struct file *file = fget(fd); + if (!file) + return 0; + + if (is_init_rc(file)) { + pr_info("kp_ksud: fstat64: stat init.rc \n"); + fput(file); + *(void **)&p->data = statbuf; + return 0; + } + fput(file); + + return 0; +} + +static int sys_fstat64_handler_post(struct kretprobe_instance *p, struct pt_regs *regs) +{ + void __user *statbuf = *(void **)&p->data; + if (!statbuf) + return 0; + + // compat_stat + void __user *st_size_ptr = statbuf + offsetof(struct stat64, st_size); + long long size, new_size; /* wide enough for the full field copied below, also where long is 32-bit */ + + if (ksu_copy_from_user_nofault_retry(&size, st_size_ptr, sizeof(size))) { + pr_info("kp_ksud: fstat64: read statbuf 0x%lx failed \n", (unsigned long)st_size_ptr); + return 0; + } + + new_size = size + ksu_rc_len; + pr_info("kp_ksud: fstat64: adding ksu_rc_len: %lld -> %lld \n", size, new_size); + + if (!copy_to_user(st_size_ptr, &new_size, sizeof(new_size))) /* write back the same width that was read */ + pr_info("kp_ksud: fstat64: added ksu_rc_len \n"); + else + pr_info("kp_ksud: fstat64: add ksu_rc_len failed: statbuf 0x%lx \n", (unsigned long)st_size_ptr); + + return 0; +} + +static struct kretprobe sys_fstat64_rp = { + .kp.symbol_name = SYS_FSTAT64_SYMBOL, + .entry_handler = sys_fstat64_handler_pre, + .handler = sys_fstat64_handler_post, + .data_size = sizeof(void *), +}; +#endif + +#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE +// sys_reboot +extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg); + +static int sys_reboot_handler_pre(struct kprobe *p, struct pt_regs *regs) +{ + struct pt_regs *real_regs = PT_REAL_REGS(regs); + int magic1 = (int)PT_REGS_PARM1(real_regs); + int magic2 = (int)PT_REGS_PARM2(real_regs); + int cmd = (int)PT_REGS_PARM3(real_regs); + void __user **arg = (void __user **)&PT_REGS_SYSCALL_PARM4(real_regs); + + return ksu_handle_sys_reboot(magic1, magic2, cmd, arg); +} + +static struct kprobe sys_reboot_kp = { + .symbol_name =
SYS_REBOOT_SYMBOL, + .pre_handler = sys_reboot_handler_pre, +}; +#endif + +static int unregister_kprobe_function(void *data) +{ +loop_start: + + msleep(1000); + + if ((volatile bool)ksu_execveat_hook) + goto loop_start; + + pr_info("kp_ksud: unregistering kprobes...\n"); + + unregister_kretprobe(&sys_newfstat_rp); + pr_info("kp_ksud: unregister sys_newfstat_rp!\n"); + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) + unregister_kretprobe(&sys_fstat64_rp); + pr_info("kp_ksud: unregister sys_fstat64_rp!\n"); +#endif + + unregister_thread = NULL; + + return 0; +} + +static void unregister_kprobe_thread() +{ + unregister_thread = kthread_run(unregister_kprobe_function, NULL, "kprobe_unregister"); + if (IS_ERR(unregister_thread)) { + unregister_thread = NULL; + return; + } +} + +static void kp_ksud_init() +{ + +#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE + int ret = register_kprobe(&sys_reboot_kp); // dont unreg this one + pr_info("kp_ksud: sys_reboot_kp: %d\n", ret); +#endif + + int ret2 = register_kretprobe(&sys_newfstat_rp); + pr_info("kp_ksud: sys_newfstat_rp: %d\n", ret2); + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) + int ret3 = register_kretprobe(&sys_fstat64_rp); + pr_info("kp_ksud: sys_fstat64_rp: %d\n", ret3); +#endif + + unregister_kprobe_thread(); +} diff --git a/drivers/kernelsu/ksu.c b/drivers/kernelsu/ksu.c new file mode 100644 index 000000000000..d12add3c4415 --- /dev/null +++ b/drivers/kernelsu/ksu.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include +#include /* LINUX_VERSION_CODE, KERNEL_VERSION macros */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#include +#else +#include +#endif + +#define ksu_get_uid_t(x) *(unsigned int *)&(x) + +#include "allowlist.h" +#include "apk_sign.h" +#include "app_profile.h" +#include "arch.h" +#include "core_hook.h" +#include "feature.h" +#include "file_wrapper.h" +#include "kernel_compat.h" +#include "klog.h" +#include "ksud.h" +#include "ksu.h" 
+#include "manager.h" +#include "sucompat.h" +#include "supercalls.h" +#include "throne_tracker.h" +#include "su_mount_ns.h" +#include "selinux/selinux.h" +#include "selinux/sepolicy.h" + +// selinux includes +#include +#include "avc_ss.h" +#include "objsec.h" +#include "ss/services.h" +#include "ss/symtab.h" +#include "xfrm.h" +#ifndef KSU_COMPAT_USE_SELINUX_STATE +#include "avc.h" +#endif + +// unity build +#include "tiny_sulog.c" +#include "allowlist.c" +#include "app_profile.c" +#include "apk_sign.c" +#include "sucompat.c" +#include "throne_tracker.c" +#include "core_hook.c" +#include "supercalls.c" +#include "feature.c" +#include "su_mount_ns.c" +#include "ksud.c" +#include "kernel_compat.c" +#include "file_wrapper.c" + +#include "selinux/selinux.c" +#include "selinux/sepolicy.c" +#include "selinux/rules.c" + +#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE +#ifdef CONFIG_ARM64 +#include "syscall_table_hook.c" +#elif CONFIG_ARM +#include "syscall_table_hook_arm.c" +#endif +#endif + +#ifdef CONFIG_KSU_KPROBES_KSUD +#include "kp_ksud.c" +#endif + +#ifdef CONFIG_KSU_KRETPROBES_SUCOMPAT +#include "rp_sucompat.c" +#endif + +#ifdef CONFIG_KSU_EXTRAS +#include "extras.c" +#endif + +struct cred* ksu_cred; + +extern void ksu_supercalls_init(); + +int __init kernelsu_init(void) +{ +#ifdef CONFIG_KSU_DEBUG + pr_alert("*************************************************************"); + pr_alert("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **"); + pr_alert("** **"); + pr_alert("** You are running KernelSU in DEBUG mode **"); + pr_alert("** **"); + pr_alert("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **"); + pr_alert("*************************************************************"); +#endif + + ksu_cred = prepare_creds(); + if (!ksu_cred) { + pr_err("prepare cred failed!\n"); + } + + ksu_feature_init(); + + ksu_supercalls_init(); + + ksu_sucompat_init(); // so the feature is registered + + ksu_core_init(); + + ksu_allowlist_init(); + + ksu_throne_tracker_init(); + + 
ksu_ksud_init(); + + ksu_file_wrapper_init(); + +#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE + ksu_syscall_table_hook_init(); +#endif + +#ifdef CONFIG_KSU_KPROBES_KSUD + kp_ksud_init(); +#endif + +#ifdef CONFIG_KSU_EXTRAS + ksu_avc_spoof_init(); // so the feature is registered +#endif + + return 0; +} + +device_initcall(kernelsu_init); + +// MODULE_LICENSE("GPL"); +// MODULE_AUTHOR("weishu"); +// MODULE_DESCRIPTION("Android KernelSU"); diff --git a/drivers/kernelsu/ksu.h b/drivers/kernelsu/ksu.h new file mode 100644 index 000000000000..7b75ada2029e --- /dev/null +++ b/drivers/kernelsu/ksu.h @@ -0,0 +1,29 @@ +#ifndef __KSU_H_KSU +#define __KSU_H_KSU + +#include +#include + +#define KERNEL_SU_VERSION 32322 + +#define EVENT_POST_FS_DATA 1 +#define EVENT_BOOT_COMPLETED 2 +#define EVENT_MODULE_MOUNTED 3 + +static inline int startswith(char *s, char *prefix) +{ + return strncmp(s, prefix, strlen(prefix)); +} + +static inline int endswith(const char *s, const char *t) +{ + size_t slen = strlen(s); + size_t tlen = strlen(t); + if (tlen > slen) + return 1; + return strcmp(s + slen - tlen, t); +} + +extern struct cred* ksu_cred; + +#endif diff --git a/drivers/kernelsu/ksud.c b/drivers/kernelsu/ksud.c new file mode 100644 index 000000000000..40ebe997b787 --- /dev/null +++ b/drivers/kernelsu/ksud.c @@ -0,0 +1,719 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0) +#include +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#include +#else +#include +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) +#include +#endif +#include +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) +#include /* fatal_signal_pending */ +#else +#include /* fatal_signal_pending */ +#endif +#include + +bool ksu_module_mounted __read_mostly = false; +bool ksu_boot_completed __read_mostly = false; + +#ifdef CONFIG_KSU_EXTRAS +extern void ksu_avc_spoof_late_init(); 
+#else +void ksu_avc_spoof_late_init() {} +#endif + +static const char KERNEL_SU_RC[] = + "\n" + + "on post-fs-data\n" + " start logd\n" + // We should wait for the post-fs-data finish + " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " post-fs-data\n" + "\n" + + "on nonencrypted\n" + " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n" + "\n" + + "on property:vold.decrypt=trigger_restart_framework\n" + " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n" + "\n" + + "on property:sys.boot_completed=1\n" + " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " boot-completed\n" + "\n" + + "\n"; + +static void stop_vfs_read_hook(); +static void stop_execve_hook(); +static void stop_input_hook(); + +bool ksu_vfs_read_hook __read_mostly = true; +bool ksu_execveat_hook __read_mostly = true; +bool ksu_input_hook __read_mostly = true; + +void on_post_fs_data(void) +{ + static bool done = false; + if (done) { + pr_info("on_post_fs_data already done\n"); + return; + } + done = true; + pr_info("on_post_fs_data!\n"); + + ksu_load_allow_list(); + // sanity check, this may influence the performance + stop_input_hook(); +} + +#if defined(CONFIG_EXT4_FS) && ( LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0) || defined(KSU_HAS_MODERN_EXT4) ) +extern void ext4_unregister_sysfs(struct super_block *sb); +int nuke_ext4_sysfs(const char *mnt) +{ + struct path path; + int err = kern_path(mnt, 0, &path); + if (err) { + pr_err("nuke path err: %d\n", err); + return err; + } + + struct super_block *sb = path.dentry->d_inode->i_sb; + const char *name = sb->s_type->name; + if (strcmp(name, "ext4") != 0) { + pr_info("nuke but module aren't mounted\n"); + path_put(&path); + return -EINVAL; + } + + ext4_unregister_sysfs(sb); + path_put(&path); + return 0; +} +#else +int nuke_ext4_sysfs(const char* mnt) { + pr_info("%s: feature not implemented!\n", __func__); + return 0; +} +#endif + +void on_module_mounted(void) +{ + pr_info("on_module_mounted!\n"); + 
ksu_module_mounted = true; +} + +void on_boot_completed(void) +{ + ksu_boot_completed = true; + pr_info("on_boot_completed!\n"); + track_throne(true); + ksu_avc_spoof_late_init(); // slow_avc_init kp +} + +// since _ksud handler only uses argv and envp for comparisons +// this can probably work +// adapted from ksu_handle_execveat_ksud +static int ksu_handle_bprm_ksud(const char *filename, const char *argv1, const char *envp, size_t envp_len) +{ + static const char app_process[] = "/system/bin/app_process"; + static bool first_app_process = true; + + /* This applies to versions Android 10+ */ + static const char system_bin_init[] = "/system/bin/init"; + /* This applies to versions between Android 6 ~ 9 */ + static const char old_system_init[] = "/init"; + static bool init_second_stage_executed = false; + + // return early when disabled + if (!ksu_execveat_hook) + return 0; + + if (!filename) + return 0; + + // debug! remove me! + pr_info("%s: filename: %s argv1: %s envp_len: %zu\n", __func__, filename, argv1, envp_len); + +#ifdef CONFIG_KSU_DEBUG + const char *envp_n = envp; + unsigned int envc = 1; + do { + pr_info("%s: envp[%d]: %s\n", __func__, envc, envp_n); + envp_n += strlen(envp_n) + 1; + envc++; + } while (envp_n < envp + 256); +#endif + + if (init_second_stage_executed) + goto first_app_process; + + // /system/bin/init with argv1 + if (!strcmp(filename, system_bin_init) && argv1 && !strcmp(argv1, "second_stage")) { + pr_info("%s: /system/bin/init second_stage executed\n", __func__); + apply_kernelsu_rules(); + cache_sid(); + setup_ksu_cred(); + init_second_stage_executed = true; + } + + // /init with argv1 + if (!strcmp(filename, old_system_init) && argv1 && !strcmp(argv1, "--second-stage")) { + pr_info("%s: /init --second-stage executed\n", __func__); + apply_kernelsu_rules(); + cache_sid(); + setup_ksu_cred(); + init_second_stage_executed = true; + } + + if (!envp || !envp_len) + goto first_app_process; + + // /init without argv1/useless-argv1 but usable 
envp + // untested! TODO: test and debug me! + if (!init_second_stage_executed && !strcmp(filename, old_system_init)) { + + // we hunt for "INIT_SECOND_STAGE" + const char *envp_n = envp; + unsigned int envc = 1; + do { + if (strstarts(envp_n, "INIT_SECOND_STAGE")) + break; + envp_n += strlen(envp_n) + 1; + envc++; + } while (envp_n < envp + envp_len); + pr_info("%s: envp[%d]: %s\n", __func__, envc, envp_n); + + if (!strcmp(envp_n, "INIT_SECOND_STAGE=1") + || !strcmp(envp_n, "INIT_SECOND_STAGE=true") ) { + pr_info("%s: /init +envp: INIT_SECOND_STAGE executed\n", __func__); + apply_kernelsu_rules(); + cache_sid(); + setup_ksu_cred(); + init_second_stage_executed = true; + } + } + +first_app_process: + if (first_app_process && strstarts(filename, app_process)) { + first_app_process = false; + pr_info("%s: exec app_process, /data prepared, second_stage: %d\n", __func__, init_second_stage_executed); + on_post_fs_data(); + stop_execve_hook(); + } + + return 0; +} + +int ksu_handle_pre_ksud(const char *filename) +{ + if (likely(!ksu_execveat_hook)) + return 0; + + // not /system/bin/init, not /init, not /system/bin/app_process (64/32 thingy) + // return 0; + if (likely(strcmp(filename, "/system/bin/init") && strcmp(filename, "/init") + && !strstarts(filename, "/system/bin/app_process") )) + return 0; + + if (!current || !current->mm) + return 0; + + // https://elixir.bootlin.com/linux/v4.14.1/source/include/linux/mm_types.h#L429 + // unsigned long arg_start, arg_end, env_start, env_end; + unsigned long arg_start = current->mm->arg_start; + unsigned long arg_end = current->mm->arg_end; + unsigned long env_start = current->mm->env_start; + unsigned long env_end = current->mm->env_end; + + size_t arg_len = arg_end - arg_start; + size_t envp_len = env_end - env_start; + + if (arg_len <= 0 || envp_len <= 0) // this wont make sense, filter it + return 0; + +#define ARGV_MAX 32 +#define ENVP_MAX 256 + char args[ARGV_MAX]; + char envp[ENVP_MAX]; + size_t argv_copy_len = (arg_len 
> ARGV_MAX) ? ARGV_MAX : arg_len; + size_t envp_copy_len = (envp_len > ENVP_MAX) ? ENVP_MAX : envp_len; + + // we cant use strncpy on here, else it will truncate once it sees \0 + if (ksu_copy_from_user_retry(args, (void __user *)arg_start, argv_copy_len)) + return 0; + + if (ksu_copy_from_user_retry(envp, (void __user *)env_start, envp_copy_len)) + return 0; + + args[ARGV_MAX - 1] = '\0'; + envp[ENVP_MAX - 1] = '\0'; + + // we only need argv1 ! + char *argv1 = args + strlen(args) + 1; + if (argv1 >= args + argv_copy_len) // out of bounds! + argv1 = ""; + + return ksu_handle_bprm_ksud(filename, argv1, envp, envp_copy_len); +} + +static ssize_t (*orig_read)(struct file *, char __user *, size_t, loff_t *); +static ssize_t (*orig_read_iter)(struct kiocb *, struct iov_iter *); +static struct file_operations fops_proxy; +static ssize_t ksu_rc_pos = 0; +const size_t ksu_rc_len = sizeof(KERNEL_SU_RC) - 1; + +// https://cs.android.com/android/platform/superproject/main/+/main:system/core/init/parser.cpp;l=144;drc=61197364367c9e404c7da6900658f1b16c42d0da +// https://cs.android.com/android/platform/superproject/main/+/main:system/libbase/file.cpp;l=241-243;drc=61197364367c9e404c7da6900658f1b16c42d0da +// The system will read init.rc file until EOF, whenever read() returns 0, +// so we begin append ksu rc when we meet EOF. 
+ +static ssize_t read_proxy(struct file *file, char __user *buf, size_t count, loff_t *pos) +{ + ssize_t ret = 0; + size_t append_count; + if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len) + goto append_ksu_rc; + + ret = orig_read(file, buf, count, pos); + if (ret != 0 || ksu_rc_pos >= ksu_rc_len) { + return ret; + } else { + pr_info("read_proxy: orig read finished, start append rc\n"); + } +append_ksu_rc: + append_count = ksu_rc_len - ksu_rc_pos; + if (append_count > count - ret) + append_count = count - ret; + // copy_to_user returns the number of not copied + if (copy_to_user(buf + ret, KERNEL_SU_RC + ksu_rc_pos, append_count)) { + pr_info("read_proxy: append error, totally appended %ld\n", ksu_rc_pos); + } else { + pr_info("read_proxy: append %ld\n", append_count); + + ksu_rc_pos += append_count; + if (ksu_rc_pos == ksu_rc_len) { + pr_info("read_proxy: append done\n"); + } + ret += append_count; + } + + return ret; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) || defined(KSU_HAS_FOP_READ_ITER) +static ssize_t read_iter_proxy(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + size_t append_count; + if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len) + goto append_ksu_rc; + + ret = orig_read_iter(iocb, to); + if (ret != 0 || ksu_rc_pos >= ksu_rc_len) { + return ret; + } else { + pr_info("read_iter_proxy: orig read finished, start append rc\n"); + } +append_ksu_rc: + // copy_to_iter returns the number of copied bytes + append_count = copy_to_iter((void *)KERNEL_SU_RC + ksu_rc_pos, ksu_rc_len - ksu_rc_pos, to); + if (!append_count) { + pr_info("read_iter_proxy: append error, totally appended %ld\n", ksu_rc_pos); + } else { + pr_info("read_iter_proxy: append %ld\n", append_count); + + ksu_rc_pos += append_count; + if (ksu_rc_pos == ksu_rc_len) { + pr_info("read_iter_proxy: append done\n"); + } + ret += append_count; + } + return ret; +} +#endif + +static bool is_init_rc(struct file *fp) +{ + if (strcmp(current->comm, "init")) { + // we are only interest in 
`init` process + return false; + } + + if (!S_ISREG(fp->f_path.dentry->d_inode->i_mode)) { + return false; + } + + const char *short_name = fp->f_path.dentry->d_name.name; + if (strcmp(short_name, "init.rc")) { + // we are only interest `init.rc` file name file + return false; + } + char path[256] = {0}; + char *dpath = d_path(&fp->f_path, path, sizeof(path)); + + if (IS_ERR(dpath)) { + return false; + } + + if (!!strcmp(dpath, "/init.rc") && !!strcmp(dpath, "/system/etc/init/hw/init.rc")) { + return false; + } + + pr_info("%s: %s \n", __func__, dpath); + + return true; +} + +static void ksu_handle_initrc(struct file *file) +{ + if (!ksu_vfs_read_hook) { + return; + } + + if (!is_init(get_current_cred())) + return; + + if (!is_init_rc(file)) { + return; + } + + // we only process the first read + static bool rc_hooked = false; + if (rc_hooked) { + // we don't need this kprobe, unregister it! + stop_vfs_read_hook(); + return; + } + rc_hooked = true; + + // now we can sure that the init process is reading + // `/system/etc/init/init.rc` + + pr_info("read init.rc, comm: %s, rc_count: %zu\n", current->comm, ksu_rc_len); + + // Now we need to proxy the read and modify the result! + // But, we can not modify the file_operations directly, because it's in read-only memory. + // We just replace the whole file_operations with a proxy one. 
+ memcpy(&fops_proxy, file->f_op, sizeof(struct file_operations)); + orig_read = file->f_op->read; + if (orig_read) { + fops_proxy.read = read_proxy; + } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) || defined(KSU_HAS_FOP_READ_ITER) + orig_read_iter = file->f_op->read_iter; + if (orig_read_iter) { + fops_proxy.read_iter = read_iter_proxy; + } +#endif + // replace the file_operations + file->f_op = &fops_proxy; + + return; +} + +// NOTE: https://github.com/tiann/KernelSU/commit/df640917d11dd0eff1b34ea53ec3c0dc49667002 +// - added 260110, seems needed for A17 + +#define STAT_NATIVE 0 +#define STAT_STAT64 1 + +static __always_inline void ksu_common_newfstat_ret(unsigned long fd_long, void **statbuf_ptr, const int type) +{ + + if (!ksu_vfs_read_hook) { + return; + } + + if (!is_init(get_current_cred())) + return; + + struct file *file = fget(fd_long); + if (!file) + return; + + if (!is_init_rc(file)) { + fput(file); + return; + } + fput(file); + + pr_info("%s: stat init.rc \n", __func__); + + uintptr_t statbuf_ptr_local = (uintptr_t)*(void **)statbuf_ptr; + void __user *statbuf = (void __user *)statbuf_ptr_local; + if (!statbuf) + return; + + void __user *st_size_ptr; + long size, new_size; + size_t len; + + st_size_ptr = statbuf + offsetof(struct stat, st_size); + len = sizeof(long); + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) + if (type) { + st_size_ptr = statbuf + offsetof(struct stat64, st_size); + len = sizeof(long long); + } +#endif + + if (copy_from_user(&size, st_size_ptr, len)) { + pr_info("%s: read statbuf 0x%lx failed \n", __func__, (unsigned long)st_size_ptr); + return; + } + + new_size = size + ksu_rc_len; + pr_info("%s: adding ksu_rc_len: %ld -> %ld \n", __func__, size, new_size); + + if (!copy_to_user(st_size_ptr, &new_size, len)) + pr_info("%s: added ksu_rc_len \n", __func__); + else + pr_info("%s: add ksu_rc_len failed: statbuf 0x%lx \n", __func__, (unsigned long)st_size_ptr); + + return; +} + +void 
ksu_handle_newfstat_ret(unsigned int *fd, struct stat __user **statbuf_ptr) +{ + unsigned long fd_long = (unsigned long)*fd; + + // native + ksu_common_newfstat_ret(fd_long, (void **)statbuf_ptr, STAT_NATIVE); +} + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) +void ksu_handle_fstat64_ret(unsigned long *fd, struct stat64 __user **statbuf_ptr) +{ + unsigned long fd_long = (unsigned long)*fd; + + // 32-bit call uses this! + ksu_common_newfstat_ret(fd_long, (void **)statbuf_ptr, STAT_STAT64); +} +#endif + +static bool safe_mode_flag = false; +#define VOLUME_PRESS_THRESHOLD_COUNT 3 + +bool ksu_is_safe_mode() +{ + // don't need to check again, userspace may call multiple times + static bool already_checked = false; + if (already_checked) + return true; + + // stop hook first! + stop_input_hook(); + + + if (!safe_mode_flag) + return false; + + pr_info("volume keys pressed max times, safe mode detected!\n"); + already_checked = true; + return true; +} + +static void vol_detector_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) +{ + static int vol_up_cnt = 0; + static int vol_down_cnt = 0; + + if (!value) + return; + + if (type != EV_KEY) + return; + + if (code == KEY_VOLUMEDOWN) { + vol_down_cnt++; + pr_info("KEY_VOLUMEDOWN press detected!\n"); + } + + if (code == KEY_VOLUMEUP) { + vol_up_cnt++; + pr_info("KEY_VOLUMEUP press detected!\n"); + } + + pr_info("volume_pressed_count: vol_up: %d vol_down: %d\n", vol_up_cnt, vol_down_cnt); + + /* + * on upstream we call stop_input_hook() here but this is causing issues + * #1. unregistering an input handler inside the input handler is a bad meme + * #2. when I tried to defer unreg to a kthread, it also causes issues on some users? nfi. + * since unregging is done anyway on ksu_is_safe_mode() or on_post_fs_data() we just dont bother. 
+ * + */ + if (vol_up_cnt >= VOLUME_PRESS_THRESHOLD_COUNT || vol_down_cnt >= VOLUME_PRESS_THRESHOLD_COUNT) { + pr_info("volume keys pressed max times, safe mode detected!\n"); + safe_mode_flag = true; + } +} + +static int vol_detector_connect(struct input_handler *handler, struct input_dev *dev, + const struct input_device_id *id) +{ + struct input_handle *handle; + int error; + + handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->dev = dev; + handle->handler = handler; + handle->name = "ksu_handle_input"; + + error = input_register_handle(handle); + if (error) + goto err_free_handle; + + error = input_open_device(handle); + if (error) + goto err_unregister_handle; + + return 0; + +err_unregister_handle: + input_unregister_handle(handle); +err_free_handle: + kfree(handle); + return error; +} + +static const struct input_device_id vol_detector_ids[] = { + // we add key volume up so that + // 1. if you have broken volume down you get shit + // 2. we can make sure to trigger only ksu safemode, not android's safemode. 
+ { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_VOLUMEUP)] = BIT_MASK(KEY_VOLUMEUP) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_VOLUMEDOWN)] = BIT_MASK(KEY_VOLUMEDOWN) }, + }, + { } +}; + +static void vol_detector_disconnect(struct input_handle *handle) +{ + input_close_device(handle); + input_unregister_handle(handle); + kfree(handle); +} + +MODULE_DEVICE_TABLE(input, vol_detector_ids); + +static struct input_handler vol_detector_handler = { + .event = vol_detector_event, + .connect = vol_detector_connect, + .disconnect = vol_detector_disconnect, + .name = "ksu", + .id_table = vol_detector_ids, +}; + +static int vol_detector_init() +{ + pr_info("vol_detector: init\n"); + return input_register_handler(&vol_detector_handler); +} + +static int vol_detector_exit() +{ + pr_info("vol_detector: exit\n"); + input_unregister_handler(&vol_detector_handler); + return 0; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) // is_ksu_transition +u32 ksud_init_sid = 0; +u32 ksud_su_sid = 0; + +int grab_transition_sids() +{ + int error = security_secctx_to_secid("u:r:init:s0", strlen("u:r:init:s0"), &ksud_init_sid); + if (error) + return 1; + + pr_info("is_ksu_transition: got init sid: %d\n", ksud_init_sid); + + error = security_secctx_to_secid(KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &ksud_su_sid); + if (error) + return 1; + + pr_info("is_ksu_transition: got su sid: %d\n", ksud_su_sid); + + return 0; +} + +bool is_ksu_transition(const struct task_security_struct *old_tsec, + const struct task_security_struct *new_tsec) +{ + + // we don't need this hook anymore after the third ksud run, which is boot-complete. 
+ if (likely(ksu_boot_completed)) + return false; + + if (!ksud_su_sid || !ksud_init_sid) { + int ret = grab_transition_sids(); + if (ret) + return false; + } + + // if its init transitioning to su, allow it + if (old_tsec->sid == ksud_init_sid && new_tsec->sid == ksud_su_sid) { + pr_info("%s: allowing init (%d) -> su (%d)\n", __func__, ksud_init_sid, ksud_su_sid); + return true; + } + + return false; +} +#endif // is_ksu_transition + +static void stop_vfs_read_hook() +{ + ksu_vfs_read_hook = false; + pr_info("stop vfs_read_hook\n"); +} + +static void stop_execve_hook() +{ + ksu_execveat_hook = false; + pr_info("stop execve_hook\n"); +} + +static void stop_input_hook() +{ + if (!ksu_input_hook) { return; } + ksu_input_hook = false; + pr_info("stop input_hook\n"); + + vol_detector_exit(); +} + +void ksu_ksud_init() +{ + vol_detector_init(); +} + diff --git a/drivers/kernelsu/ksud.h b/drivers/kernelsu/ksud.h new file mode 100644 index 000000000000..2a2ccf265f8c --- /dev/null +++ b/drivers/kernelsu/ksud.h @@ -0,0 +1,26 @@ +#ifndef __KSU_H_KSUD +#define __KSU_H_KSUD + +#include + +#define KSUD_PATH "/data/adb/ksud" + +void ksu_ksud_init(); +void ksu_ksud_exit(); + +void on_post_fs_data(void); +void on_module_mounted(void); +void on_boot_completed(void); + +bool ksu_is_safe_mode(void); + +int nuke_ext4_sysfs(const char* mnt); + +extern bool ksu_execveat_hook __read_mostly; +extern int ksu_handle_pre_ksud(const char *filename); + +extern u32 ksu_file_sid; +extern bool ksu_module_mounted; +extern bool ksu_boot_completed; + +#endif diff --git a/drivers/kernelsu/manager.h b/drivers/kernelsu/manager.h new file mode 100644 index 000000000000..e3159c988c0c --- /dev/null +++ b/drivers/kernelsu/manager.h @@ -0,0 +1,44 @@ +#ifndef __KSU_H_KSU_MANAGER +#define __KSU_H_KSU_MANAGER + +#include +#include +#include "allowlist.h" + +#define KSU_INVALID_APPID -1 + +extern uid_t ksu_manager_appid; // DO NOT DIRECT USE + +static inline bool ksu_is_manager_appid_valid() +{ + return 
ksu_manager_appid != KSU_INVALID_APPID; +} + +static inline bool is_manager() +{ + kuid_t current_uid = current_uid(); + return unlikely(ksu_manager_appid == ksu_get_uid_t(current_uid) % PER_USER_RANGE); +} + + +static inline bool is_uid_manager(uid_t uid) +{ + return unlikely(ksu_manager_appid == uid % PER_USER_RANGE); +} + +static inline uid_t ksu_get_manager_appid() +{ + return ksu_manager_appid; +} + +static inline void ksu_set_manager_appid(uid_t appid) +{ + ksu_manager_appid = appid; +} + +static inline void ksu_invalidate_manager_uid() +{ + ksu_manager_appid = KSU_INVALID_APPID; +} + +#endif diff --git a/drivers/kernelsu/rp_sucompat.c b/drivers/kernelsu/rp_sucompat.c new file mode 100644 index 000000000000..c156e29a7bbb --- /dev/null +++ b/drivers/kernelsu/rp_sucompat.c @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(ksu_rp_sucompat_lock); + +// struct filename *getname_flags(const char __user *filename, int flags, int *empty) +// https://elixir.bootlin.com/linux/v4.9.337/source/samples/kprobes/kretprobe_example.c + +extern int ksu_getname_flags_kernel(char **kname, int flags); + +struct kretprobe *getname_rp; + +static int getname_flags_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + int *flags = (int *)ri->data; + + struct filename *ret = (struct filename *)PT_REGS_RC(regs); + if (IS_ERR(ret) || !ret || !ret->name) + return 0; + + ksu_getname_flags_kernel((char **)&ret->name, *flags); + return 0; +} + +static int getname_flags_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + int *flags = (int *)ri->data; // as per sample, we store everything on ri->data ? 
+ *flags = (int)PT_REGS_PARM2(regs); // keep a copy of arg2 + + return 0; +} + +#if 0 +static struct kretprobe getname_kretprobe = { + .kp.symbol_name = "getname_flags", + .entry_handler = getname_flags_entry_handler, + .handler = getname_flags_ret_handler, + .data_size = sizeof(int), + .maxactive = 20, +}; +#endif + +// kanged from upstrteam +// this method allows high volume register/unregister +static struct kretprobe *init_kretprobe(const char *symbol, + kretprobe_handler_t entry_handler, + kretprobe_handler_t ret_handler, + size_t data_size, + int maxactive) +{ + struct kretprobe *rp = kzalloc(sizeof(struct kretprobe), GFP_KERNEL); + if (!rp) + return NULL; + + rp->kp.symbol_name = symbol; + rp->entry_handler = entry_handler; + rp->handler = ret_handler; + rp->data_size = data_size; + rp->maxactive = maxactive; + + mutex_lock(&ksu_rp_sucompat_lock); + int ret = register_kretprobe(rp); + mutex_unlock(&ksu_rp_sucompat_lock); + if (ret) { + kfree(rp); + return NULL; + } + pr_info("rp_sucompat: planted kretprobe at %s: %p\n", rp->kp.symbol_name, rp->kp.addr); + + return rp; +} + +static void destroy_kretprobe(struct kretprobe **rp_ptr) +{ + if (!rp_ptr || !*rp_ptr) + return; + + mutex_lock(&ksu_rp_sucompat_lock); + unregister_kretprobe(*rp_ptr); + mutex_unlock(&ksu_rp_sucompat_lock); + kfree(*rp_ptr); + *rp_ptr = NULL; +} + +static void rp_sucompat_exit() +{ + pr_info("rp_sucompat: unregister getname_flags!\n"); + destroy_kretprobe(&getname_rp); +} + +static void rp_sucompat_init() +{ + pr_info("%s: register getname_flags!\n", __func__); + getname_rp = init_kretprobe("getname_flags", getname_flags_entry_handler, + getname_flags_ret_handler, sizeof(int), 20); +} diff --git a/drivers/kernelsu/selinux/rules.c b/drivers/kernelsu/selinux/rules.c new file mode 100644 index 000000000000..d0bd6d2b788e --- /dev/null +++ b/drivers/kernelsu/selinux/rules.c @@ -0,0 +1,454 @@ +#include +#include +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#define 
SELINUX_POLICY_INSTEAD_SELINUX_SS +#endif + +#define ALL NULL + + +static struct policydb *get_policydb(void) +{ + struct policydb *db; +// selinux_state does not exists before 4.19 +#ifdef KSU_COMPAT_USE_SELINUX_STATE +#ifdef SELINUX_POLICY_INSTEAD_SELINUX_SS + struct selinux_policy *policy = selinux_state.policy; + db = &policy->policydb; +#else + struct selinux_ss *ss = selinux_state.ss; + db = &ss->policydb; +#endif +#else + db = &policydb; +#endif + return db; +} + +static DEFINE_MUTEX(ksu_rules); + +void apply_kernelsu_rules() +{ + struct policydb *db; + + if (!getenforce()) { + pr_info("SELinux permissive or disabled, apply rules!\n"); + } + + mutex_lock(&ksu_rules); + + db = get_policydb(); + + ksu_permissive(db, KERNEL_SU_DOMAIN); + ksu_typeattribute(db, KERNEL_SU_DOMAIN, "mlstrustedsubject"); + ksu_typeattribute(db, KERNEL_SU_DOMAIN, "netdomain"); + ksu_typeattribute(db, KERNEL_SU_DOMAIN, "bluetoothdomain"); + + // Create unconstrained file type + ksu_type(db, KERNEL_SU_FILE, "file_type"); + ksu_typeattribute(db, KERNEL_SU_FILE, "mlstrustedobject"); + ksu_allow(db, ALL, KERNEL_SU_FILE, ALL, ALL); + + // allow all! 
+ ksu_allow(db, KERNEL_SU_DOMAIN, ALL, ALL, ALL); + + // allow us do any ioctl + if (db->policyvers >= POLICYDB_VERSION_XPERMS_IOCTL) { + ksu_allowxperm(db, KERNEL_SU_DOMAIN, ALL, "blk_file", ALL); + ksu_allowxperm(db, KERNEL_SU_DOMAIN, ALL, "fifo_file", ALL); + ksu_allowxperm(db, KERNEL_SU_DOMAIN, ALL, "chr_file", ALL); + ksu_allowxperm(db, KERNEL_SU_DOMAIN, ALL, "file", ALL); + } + + // our ksud triggered by init + ksu_allow(db, "init", KERNEL_SU_DOMAIN, ALL, ALL); + + // restored from https://github.com/tiann/KernelSU/pull/3031 + ksu_allow(db, "init", "adb_data_file", "file", ALL); + ksu_allow(db, "init", "adb_data_file", "dir", ALL); // #1289 + + // copied from Magisk rules + // suRights + ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "dir", "search"); + ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "dir", "read"); + ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "open"); + ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "read"); + ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "process", "getattr"); + ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "process", "sigchld"); + + // allowLog + ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "dir", "search"); + ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "read"); + ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "open"); + ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "getattr"); + + // dumpsys + ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fd", "use"); + ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "write"); + ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "read"); + ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "open"); + ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "getattr"); + + // bootctl + ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "dir", "search"); + ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "file", "read"); + ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "file", "open"); + ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "process", "getattr"); + + // 
Allow all binder transactions + ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "binder", ALL); + + // Allow system server kill su process + ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "getpgid"); + ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "sigkill"); + + mutex_unlock(&ksu_rules); +} + +#define MAX_SEPOL_LEN 128 + +#define CMD_NORMAL_PERM 1 +#define CMD_XPERM 2 +#define CMD_TYPE_STATE 3 +#define CMD_TYPE 4 +#define CMD_TYPE_ATTR 5 +#define CMD_ATTR 6 +#define CMD_TYPE_TRANSITION 7 +#define CMD_TYPE_CHANGE 8 +#define CMD_GENFSCON 9 + +struct sepol_data { + u32 cmd; + u32 subcmd; + u64 sepol1; + u64 sepol2; + u64 sepol3; + u64 sepol4; + u64 sepol5; + u64 sepol6; + u64 sepol7; +}; + +static int get_object(char *buf, char __user *user_object, size_t buf_sz, + char **object) +{ + if (!user_object) { + *object = ALL; + return 0; + } + + if (strncpy_from_user(buf, user_object, buf_sz) < 0) { + return -EINVAL; + } + + *object = buf; + + return 0; +} + +#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || \ + LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)) +extern int avc_ss_reset(u32 seqno); +#else +extern int avc_ss_reset(struct selinux_avc *avc, u32 seqno); +#endif +// reset avc cache table, otherwise the new rules will not take effect if already denied +static void reset_avc_cache() +{ +#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || \ + LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)) + avc_ss_reset(0); + selnl_notify_policyload(0); + selinux_status_update_policyload(0); +#else + struct selinux_avc *avc = selinux_state.avc; + avc_ss_reset(avc, 0); + selnl_notify_policyload(0); + selinux_status_update_policyload(&selinux_state, 0); +#endif + selinux_xfrm_notify_policyload(); +} + +int handle_sepolicy(unsigned long arg3, void __user *arg4) +{ + struct policydb *db; + + if (!arg4) { + return -EINVAL; + } + + if (!getenforce()) { + pr_info("SELinux permissive or disabled when handle policy!\n"); + } + + struct sepol_data data; + if (copy_from_user(&data, 
arg4, sizeof(struct sepol_data))) { + pr_err("sepol: copy sepol_data failed.\n"); + return -EINVAL; + } + + u32 cmd = data.cmd; + u32 subcmd = data.subcmd; + + mutex_lock(&ksu_rules); + + db = get_policydb(); + + int ret = -EINVAL; + if (cmd == CMD_NORMAL_PERM) { + char src_buf[MAX_SEPOL_LEN]; + char tgt_buf[MAX_SEPOL_LEN]; + char cls_buf[MAX_SEPOL_LEN]; + char perm_buf[MAX_SEPOL_LEN]; + + char *s, *t, *c, *p; + if (get_object(src_buf, (void __user *)(uintptr_t)data.sepol1, sizeof(src_buf), &s) < 0) { + pr_err("sepol: copy src failed.\n"); + goto exit; + } + + if (get_object(tgt_buf, (void __user *)(uintptr_t)data.sepol2, sizeof(tgt_buf), &t) < 0) { + pr_err("sepol: copy tgt failed.\n"); + goto exit; + } + + if (get_object(cls_buf, (void __user *)(uintptr_t)data.sepol3, sizeof(cls_buf), &c) < 0) { + pr_err("sepol: copy cls failed.\n"); + goto exit; + } + + if (get_object(perm_buf, (void __user *)(uintptr_t)data.sepol4, sizeof(perm_buf), &p) < + 0) { + pr_err("sepol: copy perm failed.\n"); + goto exit; + } + + bool success = false; + if (subcmd == 1) { + success = ksu_allow(db, s, t, c, p); + } else if (subcmd == 2) { + success = ksu_deny(db, s, t, c, p); + } else if (subcmd == 3) { + success = ksu_auditallow(db, s, t, c, p); + } else if (subcmd == 4) { + success = ksu_dontaudit(db, s, t, c, p); + } else { + pr_err("sepol: unknown subcmd: %d\n", subcmd); + } + ret = success ? 0 : -EINVAL; + + } else if (cmd == CMD_XPERM) { + char src_buf[MAX_SEPOL_LEN]; + char tgt_buf[MAX_SEPOL_LEN]; + char cls_buf[MAX_SEPOL_LEN]; + + char __maybe_unused + operation[MAX_SEPOL_LEN]; // it is always ioctl now! 
+ char perm_set[MAX_SEPOL_LEN]; + + char *s, *t, *c; + if (get_object(src_buf, (void __user *)(uintptr_t)data.sepol1, sizeof(src_buf), &s) < 0) { + pr_err("sepol: copy src failed.\n"); + goto exit; + } + if (get_object(tgt_buf, (void __user *)(uintptr_t)data.sepol2, sizeof(tgt_buf), &t) < 0) { + pr_err("sepol: copy tgt failed.\n"); + goto exit; + } + if (get_object(cls_buf, (void __user *)(uintptr_t)data.sepol3, sizeof(cls_buf), &c) < 0) { + pr_err("sepol: copy cls failed.\n"); + goto exit; + } + if (strncpy_from_user(operation, (void __user *)(uintptr_t)data.sepol4, + sizeof(operation)) < 0) { + pr_err("sepol: copy operation failed.\n"); + goto exit; + } + if (strncpy_from_user(perm_set, (void __user *)(uintptr_t)data.sepol5, sizeof(perm_set)) < + 0) { + pr_err("sepol: copy perm_set failed.\n"); + goto exit; + } + + bool success = false; + if (subcmd == 1) { + success = ksu_allowxperm(db, s, t, c, perm_set); + } else if (subcmd == 2) { + success = ksu_auditallowxperm(db, s, t, c, perm_set); + } else if (subcmd == 3) { + success = ksu_dontauditxperm(db, s, t, c, perm_set); + } else { + pr_err("sepol: unknown subcmd: %d\n", subcmd); + } + ret = success ? 
0 : -EINVAL; + } else if (cmd == CMD_TYPE_STATE) { + char src[MAX_SEPOL_LEN]; + + if (strncpy_from_user(src, (void __user *)(uintptr_t)data.sepol1, sizeof(src)) < 0) { + pr_err("sepol: copy src failed.\n"); + goto exit; + } + + bool success = false; + if (subcmd == 1) { + success = ksu_permissive(db, src); + } else if (subcmd == 2) { + success = ksu_enforce(db, src); + } else { + pr_err("sepol: unknown subcmd: %d\n", subcmd); + } + if (success) + ret = 0; + + } else if (cmd == CMD_TYPE || cmd == CMD_TYPE_ATTR) { + char type[MAX_SEPOL_LEN]; + char attr[MAX_SEPOL_LEN]; + + if (strncpy_from_user(type, (void __user *)(uintptr_t)data.sepol1, sizeof(type)) < 0) { + pr_err("sepol: copy type failed.\n"); + goto exit; + } + if (strncpy_from_user(attr, (void __user *)(uintptr_t)data.sepol2, sizeof(attr)) < 0) { + pr_err("sepol: copy attr failed.\n"); + goto exit; + } + + bool success = false; + if (cmd == CMD_TYPE) { + success = ksu_type(db, type, attr); + } else { + success = ksu_typeattribute(db, type, attr); + } + if (!success) { + pr_err("sepol: %d failed.\n", cmd); + goto exit; + } + ret = 0; + + } else if (cmd == CMD_ATTR) { + char attr[MAX_SEPOL_LEN]; + + if (strncpy_from_user(attr, (void __user *)(uintptr_t)data.sepol1, sizeof(attr)) < 0) { + pr_err("sepol: copy attr failed.\n"); + goto exit; + } + if (!ksu_attribute(db, attr)) { + pr_err("sepol: %d failed.\n", cmd); + goto exit; + } + ret = 0; + + } else if (cmd == CMD_TYPE_TRANSITION) { + char src[MAX_SEPOL_LEN]; + char tgt[MAX_SEPOL_LEN]; + char cls[MAX_SEPOL_LEN]; + char default_type[MAX_SEPOL_LEN]; + char object[MAX_SEPOL_LEN]; + + if (strncpy_from_user(src, (void __user *)(uintptr_t)data.sepol1, sizeof(src)) < 0) { + pr_err("sepol: copy src failed.\n"); + goto exit; + } + if (strncpy_from_user(tgt, (void __user *)(uintptr_t)data.sepol2, sizeof(tgt)) < 0) { + pr_err("sepol: copy tgt failed.\n"); + goto exit; + } + if (strncpy_from_user(cls, (void __user *)(uintptr_t)data.sepol3, sizeof(cls)) < 0) { + 
pr_err("sepol: copy cls failed.\n"); + goto exit; + } + if (strncpy_from_user(default_type, (void __user *)(uintptr_t)data.sepol4, + sizeof(default_type)) < 0) { + pr_err("sepol: copy default_type failed.\n"); + goto exit; + } + char *real_object; + if ((void __user *)(uintptr_t)data.sepol5 == NULL) { + real_object = NULL; + } else { + if (strncpy_from_user(object, (void __user *)(uintptr_t)data.sepol5, + sizeof(object)) < 0) { + pr_err("sepol: copy object failed.\n"); + goto exit; + } + real_object = object; + } + + bool success = ksu_type_transition(db, src, tgt, cls, + default_type, real_object); + if (success) + ret = 0; + + } else if (cmd == CMD_TYPE_CHANGE) { + char src[MAX_SEPOL_LEN]; + char tgt[MAX_SEPOL_LEN]; + char cls[MAX_SEPOL_LEN]; + char default_type[MAX_SEPOL_LEN]; + + if (strncpy_from_user(src, (void __user *)(uintptr_t)data.sepol1, sizeof(src)) < 0) { + pr_err("sepol: copy src failed.\n"); + goto exit; + } + if (strncpy_from_user(tgt, (void __user *)(uintptr_t)data.sepol2, sizeof(tgt)) < 0) { + pr_err("sepol: copy tgt failed.\n"); + goto exit; + } + if (strncpy_from_user(cls, (void __user *)(uintptr_t)data.sepol3, sizeof(cls)) < 0) { + pr_err("sepol: copy cls failed.\n"); + goto exit; + } + if (strncpy_from_user(default_type, (void __user *)(uintptr_t)data.sepol4, + sizeof(default_type)) < 0) { + pr_err("sepol: copy default_type failed.\n"); + goto exit; + } + bool success = false; + if (subcmd == 1) { + success = ksu_type_change(db, src, tgt, cls, + default_type); + } else if (subcmd == 2) { + success = ksu_type_member(db, src, tgt, cls, + default_type); + } else { + pr_err("sepol: unknown subcmd: %d\n", subcmd); + } + if (success) + ret = 0; + } else if (cmd == CMD_GENFSCON) { + char name[MAX_SEPOL_LEN]; + char path[MAX_SEPOL_LEN]; + char context[MAX_SEPOL_LEN]; + if (strncpy_from_user(name, (void __user *)(uintptr_t)data.sepol1, sizeof(name)) < 0) { + pr_err("sepol: copy name failed.\n"); + goto exit; + } + if (strncpy_from_user(path, (void 
__user *)(uintptr_t)data.sepol2, sizeof(path)) < 0) { + pr_err("sepol: copy path failed.\n"); + goto exit; + } + if (strncpy_from_user(context, (void __user *)(uintptr_t)data.sepol3, sizeof(context)) < + 0) { + pr_err("sepol: copy context failed.\n"); + goto exit; + } + + if (!ksu_genfscon(db, name, path, context)) { + pr_err("sepol: %d failed.\n", cmd); + goto exit; + } + ret = 0; + } else { + pr_err("sepol: unknown cmd: %d\n", cmd); + } + +exit: + mutex_unlock(&ksu_rules); + + // only allow and xallow needs to reset avc cache, but we cannot do that because + // we are in atomic context. so we just reset it every time. + reset_avc_cache(); + + return ret; +} diff --git a/drivers/kernelsu/selinux/selinux.c b/drivers/kernelsu/selinux/selinux.c new file mode 100644 index 000000000000..975c0ae4e05f --- /dev/null +++ b/drivers/kernelsu/selinux/selinux.c @@ -0,0 +1,210 @@ +/* + * Cached SID values for frequently checked contexts. + * These are resolved once at init and used for fast u32 comparison + * instead of expensive string operations on every check. + * + * A value of 0 means "no cached SID is available" for that context. + * This covers both the initial "not yet cached" state and any case + * where resolving the SID (e.g. via security_secctx_to_secid) failed. + * In all such cases we intentionally fall back to the slower + * string-based comparison path; this degrades performance only and + * does not cause a functional failure. 
+ */
+static u32 cached_su_sid __read_mostly = 0;
+static u32 cached_zygote_sid __read_mostly = 0;
+static u32 cached_init_sid __read_mostly = 0;
+u32 ksu_file_sid __read_mostly = 0;
+
+/*
+ * Point the SELinux security blob of @cred at the context named by @domain.
+ *
+ * The context string is resolved with security_secctx_to_secid(); on success
+ * the blob's sid is replaced and its create/keycreate/sockcreate SIDs are
+ * cleared. On failure the blob is left untouched.
+ *
+ * Returns 0 on success, -1 if @cred has no SELinux blob, or the error
+ * reported by security_secctx_to_secid().
+ */
+static int transive_to_domain(const char *domain, struct cred *cred)
+{
+    /* Initialised so the failure log below never prints stack garbage. */
+    u32 sid = 0;
+    int error;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 18, 0)
+    struct task_security_struct *tsec;
+#else
+    struct cred_security_struct *tsec;
+#endif
+    tsec = selinux_cred(cred);
+    if (!tsec) {
+        pr_err("tsec == NULL!\n");
+        return -1;
+    }
+    error = security_secctx_to_secid(domain, strlen(domain), &sid);
+    if (error) {
+        pr_info("security_secctx_to_secid %s -> sid: %d, error: %d\n", domain,
+                sid, error);
+        return error;
+    }
+
+    tsec->sid = sid;
+    tsec->create_sid = 0;
+    tsec->keycreate_sid = 0;
+    tsec->sockcreate_sid = 0;
+    return 0;
+}
+
+/* Switch @cred to @domain; a failure is only logged. */
+void setup_selinux(const char *domain, struct cred *cred)
+{
+    if (transive_to_domain(domain, cred)) {
+        pr_err("transive domain failed.\n");
+        return;
+    }
+}
+
+/* Move ksu_cred (when it is set) into KERNEL_SU_CONTEXT; a failure is only logged. */
+void setup_ksu_cred(void)
+{
+    if (ksu_cred && transive_to_domain(KERNEL_SU_CONTEXT, ksu_cred)) {
+        pr_err("setup ksu cred failed.\n");
+    }
+}
+
+/*
+ * Set the SELinux enforcing flag.
+ * Compiles to a no-op unless CONFIG_SECURITY_SELINUX_DEVELOP is enabled.
+ */
+void setenforce(bool enforce)
+{
+#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
+#ifdef KSU_COMPAT_USE_SELINUX_STATE
+    selinux_state.enforcing = enforce;
+#else
+    selinux_enforcing = enforce;
+#endif
+#endif
+}
+
+/*
+ * Report whether SELinux is enforcing.
+ * Returns false when SELinux is runtime-disabled (CONFIG_SECURITY_SELINUX_DISABLE),
+ * the enforcing flag when CONFIG_SECURITY_SELINUX_DEVELOP is enabled, and
+ * true otherwise.
+ */
+bool getenforce(void)
+{
+#ifdef CONFIG_SECURITY_SELINUX_DISABLE
+#ifdef KSU_COMPAT_USE_SELINUX_STATE
+    if (selinux_state.disabled) {
+        return false;
+    }
+#else
+    if (selinux_disabled) {
+        return false;
+    }
+#endif // KSU_COMPAT_USE_SELINUX_STATE
+#endif // CONFIG_SECURITY_SELINUX_DISABLE
+
+#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
+#ifdef KSU_COMPAT_USE_SELINUX_STATE
+    return selinux_state.enforcing;
+#else
+    return selinux_enforcing;
+#endif
+#else
+    return true;
+#endif
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 14, 0)
+struct lsm_context {
+    char *context;
+    u32 len;
+};
+
+static int __security_secid_to_secctx(u32 secid, struct lsm_context 
*cp)
+{
+    return security_secid_to_secctx(secid, &cp->context, &cp->len);
+}
+static void __security_release_secctx(struct lsm_context *cp)
+{
+    security_release_secctx(cp->context, cp->len);
+}
+#else
+#define __security_secid_to_secctx security_secid_to_secctx
+#define __security_release_secctx security_release_secctx
+#endif
+
+/*
+ * Resolve the four well-known contexts (su, zygote, init, ksu_file) to SIDs
+ * and store them in the cached_*_sid / ksu_file_sid variables.
+ * Meant to run once after the SELinux policy is loaded (post-fs-data).
+ * A context that cannot be resolved gets its cache slot reset to 0, which
+ * is_sid_match() treats as "not cached".
+ */
+void cache_sid(void)
+{
+    int rc;
+
+    rc = security_secctx_to_secid(KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &cached_su_sid);
+    if (!rc) {
+        pr_info("Cached su SID: %u\n", cached_su_sid);
+    } else {
+        pr_warn("Failed to cache kernel su domain SID: %d\n", rc);
+        cached_su_sid = 0;
+    }
+
+    rc = security_secctx_to_secid(ZYGOTE_CONTEXT, strlen(ZYGOTE_CONTEXT), &cached_zygote_sid);
+    if (!rc) {
+        pr_info("Cached zygote SID: %u\n", cached_zygote_sid);
+    } else {
+        pr_warn("Failed to cache zygote SID: %d\n", rc);
+        cached_zygote_sid = 0;
+    }
+
+    rc = security_secctx_to_secid(INIT_CONTEXT, strlen(INIT_CONTEXT), &cached_init_sid);
+    if (!rc) {
+        pr_info("Cached init SID: %u\n", cached_init_sid);
+    } else {
+        pr_warn("Failed to cache init SID: %d\n", rc);
+        cached_init_sid = 0;
+    }
+
+    rc = security_secctx_to_secid(KSU_FILE_CONTEXT, strlen(KSU_FILE_CONTEXT), &ksu_file_sid);
+    if (!rc) {
+        pr_info("Cached ksu_file SID: %u\n", ksu_file_sid);
+    } else {
+        pr_warn("Failed to cache ksu_file SID: %d\n", rc);
+        ksu_file_sid = 0;
+    }
+}
+
+/*
+ * Fast path: compare task's SID directly against cached value.
+ * Falls back to string comparison if cache is not initialized. 
+ */
+/*
+ * @cred:             credentials to inspect; NULL yields false.
+ * @cached_sid:       SID cached by cache_sid(), or 0 when none is cached.
+ * @fallback_context: context string compared against when @cached_sid is 0.
+ *
+ * Returns true when the SID stored in @cred's SELinux blob denotes the
+ * requested context.
+ */
+static bool is_sid_match(const struct cred *cred, u32 cached_sid, const char *fallback_context)
+{
+    if (!cred) {
+        return false;
+    }
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 18, 0)
+    const struct task_security_struct *tsec = selinux_cred(cred);
+#else
+    const struct cred_security_struct *tsec = selinux_cred(cred);
+#endif
+    if (!tsec) {
+        return false;
+    }
+
+    // Fast path: use cached SID if available
+    if (likely(cached_sid != 0)) {
+        return tsec->sid == cached_sid;
+    }
+
+    // Slow path fallback: string comparison (only before cache is initialized)
+    struct lsm_context ctx;
+    bool result;
+    /*
+     * Only a negative value is an error. The local wrapper for older kernels
+     * returns 0 on success, while the lsm_context based
+     * security_secid_to_secctx() of newer kernels is documented to return the
+     * context length on success, so testing for "non-zero" would reject
+     * every successful lookup there.
+     */
+    if (__security_secid_to_secctx(tsec->sid, &ctx) < 0) {
+        return false;
+    }
+    result = strncmp(fallback_context, ctx.context, ctx.len) == 0;
+    __security_release_secctx(&ctx);
+    return result;
+}
+
+/* True when @cred runs in KERNEL_SU_CONTEXT. */
+bool is_task_ksu_domain(const struct cred *cred)
+{
+    return is_sid_match(cred, cached_su_sid, KERNEL_SU_CONTEXT);
+}
+
+/* True when the current task's credentials are in KERNEL_SU_CONTEXT. */
+bool is_ksu_domain(void)
+{
+    return is_task_ksu_domain(current_cred());
+}
+
+/* True when @cred runs in ZYGOTE_CONTEXT. */
+bool is_zygote(const struct cred *cred)
+{
+    return is_sid_match(cred, cached_zygote_sid, ZYGOTE_CONTEXT);
+}
+
+/* True when @cred runs in INIT_CONTEXT. */
+bool is_init(const struct cred *cred)
+{
+    return is_sid_match(cred, cached_init_sid, INIT_CONTEXT);
+}
diff --git a/drivers/kernelsu/selinux/selinux.h b/drivers/kernelsu/selinux/selinux.h
new file mode 100644
index 000000000000..c80e6cf764f1
--- /dev/null
+++ b/drivers/kernelsu/selinux/selinux.h
@@ -0,0 +1,43 @@
+#ifndef __KSU_H_SELINUX
+#define __KSU_H_SELINUX
+
+#include "linux/types.h"
+#include "linux/version.h"
+#include "linux/cred.h"
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)) || defined(KSU_COMPAT_HAS_SELINUX_STATE)
+#define KSU_COMPAT_USE_SELINUX_STATE
+#endif
+
+// TODO: rename to "ksu"
+#define KERNEL_SU_DOMAIN "su"
+#define KERNEL_SU_FILE "ksu_file"
+
+#define KERNEL_SU_CONTEXT "u:r:" KERNEL_SU_DOMAIN ":s0"
+#define KSU_FILE_CONTEXT "u:object_r:" KERNEL_SU_FILE ":s0"
+#define ZYGOTE_CONTEXT "u:r:zygote:s0"
+#define INIT_CONTEXT "u:r:init:s0"
+
+void setup_selinux(const char *domain, struct cred *cred);
+
+void setenforce(bool enforce);
+
+bool getenforce(void); // (void): an empty "()" is not a prototype in C
+
+void cache_sid(void);
+
+bool is_task_ksu_domain(const struct cred *cred);
+
+bool is_ksu_domain(void);
+
+bool is_zygote(const struct cred *cred);
+
+bool is_init(const struct cred *cred);
+
+void apply_kernelsu_rules(void);
+
+int handle_sepolicy(unsigned long arg3, void __user *arg4);
+
+void setup_ksu_cred(void);
+
+#endif
diff --git a/drivers/kernelsu/selinux/sepolicy.c b/drivers/kernelsu/selinux/sepolicy.c
new file mode 100644
index 000000000000..32b1ac209565
--- /dev/null
+++ b/drivers/kernelsu/selinux/sepolicy.c
@@ -0,0 +1,1134 @@
+#include
+#include
+#include
+#include
+
+#define KSU_SUPPORT_ADD_TYPE
+
+/*
+ * Adaptation for Huawei HISI kernels that does not affect other kernels.
+ * HISI_SELINUX_EBITMAP_RO is the Huawei HISI kernel's ebitmap
+ * enable/disable flag, from ss/ebitmap.h.
+ */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) && \
+     LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)) || \
+    (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) && \
+     LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0))
+#ifdef HISI_SELINUX_EBITMAP_RO
+#define CONFIG_IS_HW_HISI
+#endif
+#endif
+
+//////////////////////////////////////////////////////
+// Declaration
+//////////////////////////////////////////////////////
+
+static struct avtab_node *get_avtab_node(struct policydb *db,
+					 struct avtab_key *key,
+					 struct avtab_extended_perms *xperms);
+
+static bool add_rule(struct policydb *db, const char *s, const char *t,
+		     const char *c, const char *p, int effect, bool invert);
+
+static void add_rule_raw(struct policydb *db, struct type_datum *src,
+			 struct type_datum *tgt, struct class_datum *cls,
+			 struct perm_datum *perm, int effect, bool invert);
+
+static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src,
+			       struct type_datum *tgt, struct class_datum *cls,
+			       uint16_t low, uint16_t high, int effect,
+			       bool invert);
+static bool add_xperm_rule(struct policydb *db, const char *s, const char *t,
+			   const char 
*c, const char *range, int effect,
+			   bool invert);
+
+static bool add_type_rule(struct policydb *db, const char *s, const char *t,
+			  const char *c, const char *d, int effect);
+
+static bool add_filename_trans(struct policydb *db, const char *s,
+			       const char *t, const char *c, const char *d,
+			       const char *o);
+
+static bool add_genfscon(struct policydb *db, const char *fs_name,
+			 const char *path, const char *context);
+
+static bool add_type(struct policydb *db, const char *type_name, bool attr);
+
+static bool set_type_state(struct policydb *db, const char *type_name,
+			   bool permissive);
+
+static void add_typeattribute_raw(struct policydb *db, struct type_datum *type,
+				  struct type_datum *attr);
+
+static bool add_typeattribute(struct policydb *db, const char *type,
+			      const char *attr);
+
+//////////////////////////////////////////////////////
+// Implementation
+//////////////////////////////////////////////////////
+
+// Invert is adding rules for auditdeny; in other cases, invert is removing
+// rules
+// Arguments are parenthesized so that any expression can be passed safely.
+#define strip_av(effect, invert) (((effect) == AVTAB_AUDITDENY) == !(invert))
+
+// Walk every node of a chained hash table: node_ptr is the bucket array,
+// n_slot the number of buckets, cur the cursor variable.
+// The macro expands to a declaration of `i` followed by the loops, so it can
+// be used at most once per scope and never as the unbraced body of an if/else.
+#define ksu_hash_for_each(node_ptr, n_slot, cur) \
+	int i; \
+	for (i = 0; i < (n_slot); ++i) \
+		for ((cur) = (node_ptr)[i]; (cur); (cur) = (cur)->next)
+
+// htable is a struct instead of pointer above 5.8.0:
+// https://elixir.bootlin.com/linux/v5.8-rc1/source/security/selinux/ss/symtab.h
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+#define ksu_hashtab_for_each(htab, cur) \
+	ksu_hash_for_each((htab).htable, (htab).size, cur)
+#else
+#define ksu_hashtab_for_each(htab, cur) \
+	ksu_hash_for_each((htab)->htable, (htab)->size, cur)
+#endif
+
+// symtab_search is introduced on 5.9.0:
+// https://elixir.bootlin.com/linux/v5.9-rc1/source/security/selinux/ss/symtab.h
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0)
+#define symtab_search(s, name) hashtab_search((s)->table, name)
+#define symtab_insert(s, name, datum) hashtab_insert((s)->table, name, datum)
+#endif
+
+#define avtab_for_each(avtab, 
cur) \
+	ksu_hash_for_each(avtab.htable, avtab.nslot, cur)
+// No trailing ';' above: it would become the (empty) loop body, and the
+// block written after avtab_for_each(...) would run once after the loops
+// instead of once per node.
+
+/*
+ * Find the avtab node for @key in db->te_avtab, creating it when missing.
+ *
+ * For AVTAB_XPERMS keys several nodes may share one key, so the node whose
+ * extended-permission `specified` and `driver` fields equal those of @xperms
+ * is selected; @xperms must therefore be non-NULL for such keys.
+ * A freshly created node starts with data ~0U for AVTAB_AUDITDENY, 0U for
+ * the other plain rules, and with @xperms for extended-permission rules.
+ *
+ * Returns the node, or NULL when a new node could not be inserted.
+ */
+static struct avtab_node *get_avtab_node(struct policydb *db,
+					 struct avtab_key *key,
+					 struct avtab_extended_perms *xperms)
+{
+	struct avtab_node *node;
+
+	/* AVTAB_XPERMS entries are not necessarily unique */
+	if (key->specified & AVTAB_XPERMS) {
+		bool match = false;
+		node = avtab_search_node(&db->te_avtab, key);
+		while (node) {
+			if ((node->datum.u.xperms->specified ==
+			     xperms->specified) &&
+			    (node->datum.u.xperms->driver == xperms->driver)) {
+				match = true;
+				break;
+			}
+			node = avtab_search_node_next(node, key->specified);
+		}
+		if (!match)
+			node = NULL;
+	} else {
+		node = avtab_search_node(&db->te_avtab, key);
+	}
+
+	if (!node) {
+		struct avtab_datum avdatum = {};
+		/*
+		 * AUDITDENY, aka DONTAUDIT, are &= assigned, versus |= for
+		 * others. Initialize the data accordingly.
+		 */
+		if (key->specified & AVTAB_XPERMS) {
+			avdatum.u.xperms = xperms;
+		} else {
+			avdatum.u.data =
+				key->specified == AVTAB_AUDITDENY ? 
~0U : 0U;
+		}
+		/* this is used to get the node - insertion is actually unique */
+		node = avtab_insert_nonunique(&db->te_avtab, key, &avdatum);
+		if (!node) {
+			/*
+			 * Nothing was added, so the policy size must not be
+			 * grown for a node that does not exist.
+			 */
+			pr_err("get_avtab_node: avtab insert failed\n");
+			return NULL;
+		}
+
+		/* Account for the size of the new entry in db->len. */
+		int grow_size = sizeof(struct avtab_key);
+		grow_size += sizeof(struct avtab_datum);
+		if (key->specified & AVTAB_XPERMS) {
+			grow_size += sizeof(u8);
+			grow_size += sizeof(u8);
+			grow_size += sizeof(u32) *
+				     ARRAY_SIZE(avdatum.u.xperms->perms.p);
+		}
+		db->len += grow_size;
+	}
+
+	return node;
+}
+
+/*
+ * Resolve the names @s (source type), @t (target type), @c (class) and
+ * @p (permission) and hand them to add_rule_raw().
+ *
+ * Any of the names may be NULL, in which case the corresponding datum stays
+ * NULL and add_rule_raw() expands it. A permission requires a class and is
+ * looked up in the class first, then in its common datum.
+ *
+ * Returns false when a given name does not exist in the policy.
+ */
+static bool add_rule(struct policydb *db, const char *s, const char *t,
+		     const char *c, const char *p, int effect, bool invert)
+{
+	struct type_datum *src = NULL, *tgt = NULL;
+	struct class_datum *cls = NULL;
+	struct perm_datum *perm = NULL;
+
+	if (s) {
+		src = symtab_search(&db->p_types, s);
+		if (src == NULL) {
+			pr_info("source type %s does not exist\n", s);
+			return false;
+		}
+	}
+
+	if (t) {
+		tgt = symtab_search(&db->p_types, t);
+		if (tgt == NULL) {
+			pr_info("target type %s does not exist\n", t);
+			return false;
+		}
+	}
+
+	if (c) {
+		cls = symtab_search(&db->p_classes, c);
+		if (cls == NULL) {
+			pr_info("class %s does not exist\n", c);
+			return false;
+		}
+	}
+
+	if (p) {
+		if (c == NULL) {
+			pr_info("No class is specified, cannot add perm [%s] \n",
+				p);
+			return false;
+		}
+
+		perm = symtab_search(&cls->permissions, p);
+		if (perm == NULL && cls->comdatum != NULL) {
+			perm = symtab_search(&cls->comdatum->permissions, p);
+		}
+		if (perm == NULL) {
+			pr_info("perm %s does not exist in class %s\n", p, c);
+			return false;
+		}
+	}
+	add_rule_raw(db, src, tgt, cls, perm, effect, invert);
+	return true;
+}
+
+/*
+ * Apply one access-vector rule. A NULL @src, @tgt or @cls acts as a
+ * wildcard and is expanded by recursing over the policy's types/classes;
+ * a NULL @perm stands for every permission of the class.
+ */
+static void add_rule_raw(struct policydb *db, struct type_datum *src,
+			 struct type_datum *tgt, struct class_datum *cls,
+			 struct perm_datum *perm, int effect, bool invert)
+{
+	if (src == NULL) {
+		struct hashtab_node *node;
+		if (strip_av(effect, invert)) {
+			/* Stripping permissions: visit every type. */
+			ksu_hashtab_for_each(db->p_types.table, node)
+			{
+				add_rule_raw(db,
+					     (struct type_datum *)node->datum,
+					     tgt, cls, perm, effect, invert);
+			};
+		} 
else {
+			/* Granting permissions: attributes only. */
+			ksu_hashtab_for_each(db->p_types.table, node)
+			{
+				struct type_datum *type =
+					(struct type_datum *)(node->datum);
+				if (type->attribute) {
+					add_rule_raw(db, type, tgt, cls, perm,
+						     effect, invert);
+				}
+			};
+		}
+	} else if (tgt == NULL) {
+		struct hashtab_node *node;
+		if (strip_av(effect, invert)) {
+			ksu_hashtab_for_each(db->p_types.table, node)
+			{
+				add_rule_raw(db, src,
+					     (struct type_datum *)node->datum,
+					     cls, perm, effect, invert);
+			};
+		} else {
+			ksu_hashtab_for_each(db->p_types.table, node)
+			{
+				struct type_datum *type =
+					(struct type_datum *)(node->datum);
+				if (type->attribute) {
+					add_rule_raw(db, src, type, cls, perm,
+						     effect, invert);
+				}
+			};
+		}
+	} else if (cls == NULL) {
+		struct hashtab_node *node;
+		ksu_hashtab_for_each(db->p_classes.table, node)
+		{
+			add_rule_raw(db, src, tgt,
+				     (struct class_datum *)node->datum, perm,
+				     effect, invert);
+		}
+	} else {
+		struct avtab_key key;
+		key.source_type = src->value;
+		key.target_type = tgt->value;
+		key.target_class = cls->value;
+		key.specified = effect;
+
+		struct avtab_node *node = get_avtab_node(db, &key, NULL);
+		if (!node) {
+			/* get_avtab_node() returns NULL when the insert fails. */
+			pr_err("add_rule_raw: cannot get avtab node\n");
+			return;
+		}
+		if (invert) {
+			if (perm)
+				node->datum.u.data &=
+					~(1U << (perm->value - 1));
+			else
+				node->datum.u.data = 0U;
+		} else {
+			if (perm)
+				node->datum.u.data |= 1U << (perm->value - 1);
+			else
+				node->datum.u.data = ~0U;
+		}
+	}
+}
+
+/* Driver (bits 8-15) and function (bits 0-7) parts of an ioctl command. */
+#define ioctl_driver(x) (((x) >> 8) & 0xFF)
+#define ioctl_func(x) ((x) & 0xFF)
+
+/*
+ * Bit helpers for an array of 32-bit words: bit x lives in word x >> 5.
+ * 1U keeps the shift well defined when (x & 0x1f) == 31.
+ */
+#define xperm_test(x, p) (1 & ((p)[(x) >> 5] >> ((x) & 0x1f)))
+#define xperm_set(x, p) ((p)[(x) >> 5] |= (1U << ((x) & 0x1f)))
+#define xperm_clear(x, p) ((p)[(x) >> 5] &= ~(1U << ((x) & 0x1f)))
+
+/*
+ * Apply one extended-permission (ioctl) rule for the range [@low, @high].
+ * A NULL @src or @tgt is expanded over every attribute, a NULL @cls over
+ * every class.
+ */
+static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src,
+			       struct type_datum *tgt, struct class_datum *cls,
+			       uint16_t low, uint16_t high, int effect,
+			       bool invert)
+{
+	if (src == NULL) {
+		struct hashtab_node *node;
+		ksu_hashtab_for_each(db->p_types.table, node)
+		{
+			struct type_datum *type =
+				(struct type_datum *)(node->datum);
+			if (type->attribute) {
+				
add_xperm_rule_raw(db, type, tgt, cls, low,
+						   high, effect, invert);
+			}
+		};
+	} else if (tgt == NULL) {
+		/* Wildcard target: recurse once per attribute. */
+		struct hashtab_node *entry;
+		ksu_hashtab_for_each(db->p_types.table, entry)
+		{
+			struct type_datum *candidate = entry->datum;
+			if (!candidate->attribute)
+				continue;
+			add_xperm_rule_raw(db, src, candidate, cls, low, high,
+					   effect, invert);
+		};
+	} else if (cls == NULL) {
+		/* Wildcard class: recurse once per class. */
+		struct hashtab_node *entry;
+		ksu_hashtab_for_each(db->p_classes.table, entry)
+		{
+			add_xperm_rule_raw(db, src, tgt,
+					   (struct class_datum *)(entry->datum),
+					   low, high, effect, invert);
+		};
+	} else {
+		struct avtab_key key;
+		struct avtab_extended_perms xperms;
+		struct avtab_node *node;
+		bool by_driver = ioctl_driver(low) != ioctl_driver(high);
+		int first, last, bit;
+
+		key.source_type = src->value;
+		key.target_type = tgt->value;
+		key.target_class = cls->value;
+		key.specified = effect;
+
+		/*
+		 * A range spanning several drivers is stored as a driver
+		 * bitmap; a range inside one driver as a function bitmap
+		 * for that driver.
+		 */
+		memset(&xperms, 0, sizeof(xperms));
+		if (by_driver) {
+			xperms.specified = AVTAB_XPERMS_IOCTLDRIVER;
+			xperms.driver = 0;
+			first = ioctl_driver(low);
+			last = ioctl_driver(high);
+		} else {
+			xperms.specified = AVTAB_XPERMS_IOCTLFUNCTION;
+			xperms.driver = ioctl_driver(low);
+			first = ioctl_func(low);
+			last = ioctl_func(high);
+		}
+		for (bit = first; bit <= last; ++bit) {
+			if (invert)
+				xperm_clear(bit, xperms.perms.p);
+			else
+				xperm_set(bit, xperms.perms.p);
+		}
+
+		node = get_avtab_node(db, &key, &xperms);
+		if (!node) {
+			pr_warn("add_xperm_rule_raw cannot found node!\n");
+			return;
+		}
+
+		/*
+		 * NOTE(review): a node that already carries an xperms datum is
+		 * left unchanged here, i.e. the bits computed above are not
+		 * merged into it - confirm this is intended.
+		 */
+		if (node->datum.u.xperms == NULL) {
+			node->datum.u.xperms =
+				(struct avtab_extended_perms *)(kzalloc(
+					sizeof(xperms), GFP_ATOMIC));
+			if (!node->datum.u.xperms) {
+				pr_err("alloc xperms failed\n");
+				return;
+			}
+			memcpy(node->datum.u.xperms, &xperms, sizeof(xperms));
+		}
+	}
+}
+
+static bool add_xperm_rule(struct policydb *db, const char *s, const char *t,
+			   const char *c, const 
char *range, int effect,
+			   bool invert)
+{
+	/*
+	 * Resolve the (optional) source/target/class names and the ioctl range
+	 * "<hex>" or "<hex>-<hex>", then delegate to add_xperm_rule_raw().
+	 * A NULL range means the full 0x0000-0xFFFF range.
+	 * Returns false when a name is unknown or the range cannot be parsed.
+	 */
+	struct type_datum *src = NULL, *tgt = NULL;
+	struct class_datum *cls = NULL;
+
+	if (s) {
+		src = symtab_search(&db->p_types, s);
+		if (src == NULL) {
+			pr_info("source type %s does not exist\n", s);
+			return false;
+		}
+	}
+
+	if (t) {
+		tgt = symtab_search(&db->p_types, t);
+		if (tgt == NULL) {
+			pr_info("target type %s does not exist\n", t);
+			return false;
+		}
+	}
+
+	if (c) {
+		cls = symtab_search(&db->p_classes, c);
+		if (cls == NULL) {
+			pr_info("class %s does not exist\n", c);
+			return false;
+		}
+	}
+
+	u16 low, high;
+
+	if (range) {
+		/*
+		 * sscanf() returns the number of converted fields; anything
+		 * short of that would leave low/high uninitialized.
+		 */
+		if (strchr(range, '-')) {
+			if (sscanf(range, "%hx-%hx", &low, &high) != 2) {
+				pr_info("invalid xperm range %s\n", range);
+				return false;
+			}
+		} else {
+			if (sscanf(range, "%hx", &low) != 1) {
+				pr_info("invalid xperm value %s\n", range);
+				return false;
+			}
+			high = low;
+		}
+	} else {
+		low = 0;
+		high = 0xFFFF;
+	}
+
+	add_xperm_rule_raw(db, src, tgt, cls, low, high, effect, invert);
+	return true;
+}
+
+/*
+ * Add a type rule (selected by @effect) for source @s, target @t and
+ * class @c whose resulting type is @d. All four names must exist in the
+ * policy.
+ * Returns false when a name is unknown or no avtab node could be obtained.
+ */
+static bool add_type_rule(struct policydb *db, const char *s, const char *t,
+			  const char *c, const char *d, int effect)
+{
+	struct type_datum *src, *tgt, *def;
+	struct class_datum *cls;
+
+	src = symtab_search(&db->p_types, s);
+	if (src == NULL) {
+		pr_info("source type %s does not exist\n", s);
+		return false;
+	}
+	tgt = symtab_search(&db->p_types, t);
+	if (tgt == NULL) {
+		pr_info("target type %s does not exist\n", t);
+		return false;
+	}
+	cls = symtab_search(&db->p_classes, c);
+	if (cls == NULL) {
+		pr_info("class %s does not exist\n", c);
+		return false;
+	}
+	def = symtab_search(&db->p_types, d);
+	if (def == NULL) {
+		pr_info("default type %s does not exist\n", d);
+		return false;
+	}
+
+	struct avtab_key key;
+	key.source_type = src->value;
+	key.target_type = tgt->value;
+	key.target_class = cls->value;
+	key.specified = effect;
+
+	struct avtab_node *node = get_avtab_node(db, &key, NULL);
+	if (!node) {
+		/* get_avtab_node() returns NULL when the insert fails. */
+		pr_err("add_type_rule: cannot get avtab node\n");
+		return false;
+	}
+	node->datum.u.data = def->value;
+
+	return true;
+}
+
+// 5.9.0 : static inline int hashtab_insert(struct hashtab *h, void *key, void
+// *datum, struct hashtab_key_params key_params) 5.8.0: int
+// 
hashtab_insert(struct hashtab *h, void *k, void *d);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+/*
+ * Hash callback for filename-transition keys: seeds the hash with
+ * ttype ^ tclass and folds in every byte of the name.
+ */
+static u32 filenametr_hash(const void *k)
+{
+	const struct filename_trans_key *entry = k;
+	unsigned long acc = entry->ttype ^ entry->tclass;
+	unsigned int pos;
+	unsigned char ch;
+
+	for (pos = 0; (ch = entry->name[pos]) != 0; pos++)
+		acc = partial_name_hash(ch, acc);
+
+	return acc;
+}
+
+/*
+ * Compare callback for filename-transition keys: orders by ttype, then
+ * tclass, then name; returns 0 for equal keys.
+ */
+static int filenametr_cmp(const void *k1, const void *k2)
+{
+	const struct filename_trans_key *lhs = k1;
+	const struct filename_trans_key *rhs = k2;
+	int diff = lhs->ttype - rhs->ttype;
+
+	if (diff == 0)
+		diff = lhs->tclass - rhs->tclass;
+	if (diff == 0)
+		diff = strcmp(lhs->name, rhs->name);
+
+	return diff;
+}
+
+static const struct hashtab_key_params filenametr_key_params = {
+	.hash = filenametr_hash,
+	.cmp = filenametr_cmp,
+};
+#endif
+
+/*
+ * Add a filename type transition: objects named @o of class @c created by
+ * source type @s under target type @t receive type @d.
+ * Returns false when one of the names does not exist in the policy.
+ */
+static bool add_filename_trans(struct policydb *db, const char *s,
+			       const char *t, const char *c, const char *d,
+			       const char *o)
+{
+	struct type_datum *src, *tgt, *def;
+	struct class_datum *cls;
+
+	src = symtab_search(&db->p_types, s);
+	if (src == NULL) {
+		pr_warn("source type %s does not exist\n", s);
+		return false;
+	}
+	tgt = symtab_search(&db->p_types, t);
+	if (tgt == NULL) {
+		pr_warn("target type %s does not exist\n", t);
+		return false;
+	}
+	cls = symtab_search(&db->p_classes, c);
+	if (cls == NULL) {
+		pr_warn("class %s does not exist\n", c);
+		return false;
+	}
+	def = symtab_search(&db->p_types, d);
+	if (def == NULL) {
+		pr_warn("default type %s does not exist\n", d);
+		return false;
+	}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)
+	struct filename_trans_key key;
+	key.ttype = tgt->value;
+	key.tclass = cls->value;
+	key.name = (char *)o;
+
+	struct filename_trans_datum *last = NULL;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+	struct filename_trans_datum *trans =
+		policydb_filenametr_search(db, &key);
+#else
+	struct filename_trans_datum *trans =
+		
hashtab_search(&db->filename_trans, &key);
+#endif
+	/*
+	 * Walk the chain of datums stored for this key. Stop at a datum that
+	 * already covers the source type, or at one with the wanted output
+	 * type (its source bitmap is extended below).
+	 */
+	while (trans) {
+		if (ebitmap_get_bit(&trans->stypes, src->value - 1)) {
+			// Duplicate, overwrite existing data and return
+			trans->otype = def->value;
+			return true;
+		}
+		if (trans->otype == def->value)
+			break;
+		last = trans;
+		trans = trans->next;
+	}
+
+	if (trans == NULL) {
+		trans = kcalloc(1, sizeof(*trans), GFP_ATOMIC);
+		if (!trans) {
+			pr_err("add_filename_trans: Failed to alloc datum\n");
+			return false;
+		}
+		trans->otype = def->value;
+
+		if (last) {
+			/*
+			 * The key is already hashed (inserting it again would
+			 * be rejected as a duplicate), so the new datum is
+			 * appended to the tail of the existing chain.
+			 */
+			last->next = trans;
+		} else {
+			struct filename_trans_key *new_key =
+				kzalloc(sizeof(*new_key), GFP_ATOMIC);
+			if (!new_key) {
+				pr_err("add_filename_trans: Failed to alloc new_key\n");
+				kfree(trans);
+				return false;
+			}
+			*new_key = key;
+			/* The table keeps the key, so it needs its own name copy. */
+			new_key->name = kstrdup(key.name, GFP_ATOMIC);
+			if (!new_key->name) {
+				pr_err("add_filename_trans: Failed to alloc name\n");
+				kfree(new_key);
+				kfree(trans);
+				return false;
+			}
+			if (hashtab_insert(&db->filename_trans, new_key, trans,
+					   filenametr_key_params)) {
+				pr_err("add_filename_trans: Failed to insert key\n");
+				kfree(new_key->name);
+				kfree(new_key);
+				kfree(trans);
+				return false;
+			}
+		}
+	}
+
+	db->compat_filename_trans_count++;
+	return ebitmap_set_bit(&trans->stypes, src->value - 1, 1) == 0;
+#else // < 5.7.0, has no filename_trans_key, but struct filename_trans
+
+	struct filename_trans key;
+	/* On these kernels the source type is part of the hashed key. */
+	key.stype = src->value;
+	key.ttype = tgt->value;
+	key.tclass = cls->value;
+	key.name = (char *)o;
+
+	struct filename_trans_datum *trans =
+		hashtab_search(db->filename_trans, &key);
+
+	if (trans == NULL) {
+		trans = kcalloc(1, sizeof(*trans), GFP_ATOMIC);
+		if (!trans) {
+			pr_err("add_filename_trans: Failed to alloc datum\n");
+			return false;
+		}
+		struct filename_trans *new_key =
+			kmalloc(sizeof(*new_key), GFP_ATOMIC);
+		if (!new_key) {
+			pr_err("add_filename_trans: Failed to alloc new_key\n");
+			kfree(trans);
+			return false;
+		}
+		*new_key = key;
+		/* The table keeps the key, so it needs its own name copy. */
+		new_key->name = kstrdup(key.name, GFP_ATOMIC);
+		if (!new_key->name) {
+			pr_err("add_filename_trans: Failed to alloc name\n");
+			kfree(new_key);
+			kfree(trans);
+			return false;
+		}
+		if (hashtab_insert(db->filename_trans, new_key, trans)) {
+			pr_err("add_filename_trans: Failed to insert key\n");
+			kfree(new_key->name);
+			kfree(new_key);
+			kfree(trans);
+			return false;
+		}
+	}
+	/* New entry or duplicate: the requested output type wins. */
+	trans->otype = def->value;
+
+	/*
+	 * filename_trans_ttypes is indexed by the target type value; the
+	 * kernel tests this bit before it looks the key up at all.
+	 */
+	return ebitmap_set_bit(&db->filename_trans_ttypes, tgt->value, 1) == 0;
+#endif
+}
+
+/* genfscon rules are not implemented: always reports failure. */
+static bool add_genfscon(struct policydb *db, const char *fs_name,
+			 const char *path, const char *context)
+{
+	return false;
+}
+
+// 
https://github.com/torvalds/linux/commit/590b9d576caec6b4c46bba49ed36223a399c3fc5#diff-cc9aa90e094e6e0f47bd7300db4f33cf4366b98b55d8753744f31eb69c691016R844-R845
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
+#define ksu_kvrealloc(p, new_size, _old_size) kvrealloc(p, new_size, GFP_ATOMIC)
+#else
+// https://cs.android.com/android/_/android/kernel/common/+/f5f3e54f811679761c33526e695bd296190faade
+// Some 5.10 kernel don't have this backport, so copy one.
+// Returns p itself when the buffer does not need to grow. Otherwise the data
+// is copied into a new buffer and p is freed, so the caller must replace
+// every stored copy of p with the returned pointer. NULL on allocation
+// failure (p is still valid then).
+static void *ksu_kvrealloc_compat(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+{
+	void *newp;
+
+	if (oldsize >= newsize)
+		return (void *)p;
+	newp = kvmalloc(newsize, flags);
+	if (!newp)
+		return NULL;
+	__builtin_memcpy(newp, p, oldsize); // bypass fortify_source, kasan
+	kvfree(p);
+	return newp;
+}
+#define ksu_kvrealloc(p, new_size, old_size) ksu_kvrealloc_compat(p, old_size, new_size, GFP_ATOMIC)
+#endif
+
+#ifdef KSU_SUPPORT_ADD_TYPE
+/*
+ * Grow the per-type lookup tables of @db so that they can hold type number
+ * @value, and fill slot @value - 1 with @type / @key.
+ *
+ * Every successfully reallocated table is stored back into @db right away:
+ * a realloc may already have freed the old buffer, so returning early
+ * without publishing the new pointer would leave @db pointing at freed
+ * memory. The flex_array variant builds all new arrays first and only swaps
+ * them in once nothing can fail any more.
+ *
+ * Returns false on allocation failure.
+ */
+static bool add_type_grow_arrays(struct policydb *db, u32 value,
+				 struct type_datum *type, char *key)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || defined(KSU_TYPE_VAL_TO_STRUCT)
+	struct ebitmap *new_type_attr_map_array =
+		ksu_kvrealloc(db->type_attr_map_array,
+			      value * sizeof(struct ebitmap),
+			      (value - 1) * sizeof(struct ebitmap));
+
+	if (!new_type_attr_map_array) {
+		pr_err("add_type: alloc type_attr_map_array failed\n");
+		return false;
+	}
+	db->type_attr_map_array = new_type_attr_map_array;
+	ebitmap_init(&db->type_attr_map_array[value - 1]);
+	ebitmap_set_bit(&db->type_attr_map_array[value - 1], value - 1, 1);
+
+	struct type_datum **new_type_val_to_struct =
+		ksu_kvrealloc(db->type_val_to_struct,
+			      sizeof(*db->type_val_to_struct) * value,
+			      sizeof(*db->type_val_to_struct) * (value - 1));
+
+	if (!new_type_val_to_struct) {
+		pr_err("add_type: alloc type_val_to_struct failed\n");
+		return false;
+	}
+	db->type_val_to_struct = new_type_val_to_struct;
+	db->type_val_to_struct[value - 1] = type;
+
+	char **new_val_to_name_types =
+		ksu_kvrealloc(db->sym_val_to_name[SYM_TYPES],
+			      sizeof(char *) * value,
+			      sizeof(char *) * (value - 1));
+	if (!new_val_to_name_types) {
+		pr_err("add_type: alloc val_to_name failed\n");
+		return false;
+	}
+	db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types;
+	db->sym_val_to_name[SYM_TYPES][value - 1] = key;
+
+	return true;
+
+#elif defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY)
+	struct ebitmap *new_type_attr_map_array =
+		ksu_kvrealloc(db->type_attr_map_array,
+			      value * sizeof(struct ebitmap),
+			      (value - 1) * sizeof(struct ebitmap));
+
+	if (!new_type_attr_map_array) {
+		pr_err("add_type: alloc type_attr_map_array failed\n");
+		return false;
+	}
+	db->type_attr_map_array = new_type_attr_map_array;
+	ebitmap_init(&db->type_attr_map_array[value - 1]);
+	ebitmap_set_bit(&db->type_attr_map_array[value - 1], value - 1, 1);
+
+	struct type_datum **new_type_val_to_struct =
+		ksu_kvrealloc(db->type_val_to_struct_array,
+			      sizeof(*db->type_val_to_struct_array) * value,
+			      sizeof(*db->type_val_to_struct_array) * (value - 1));
+
+	if (!new_type_val_to_struct) {
+		pr_err("add_type: alloc type_val_to_struct failed\n");
+		return false;
+	}
+	db->type_val_to_struct_array = new_type_val_to_struct;
+	db->type_val_to_struct_array[value - 1] = type;
+
+	char **new_val_to_name_types =
+		ksu_kvrealloc(db->sym_val_to_name[SYM_TYPES],
+			      sizeof(char *) * value,
+			      sizeof(char *) * (value - 1));
+	if (!new_val_to_name_types) {
+		pr_err("add_type: alloc val_to_name failed\n");
+		return false;
+	}
+	db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types;
+	db->sym_val_to_name[SYM_TYPES][value - 1] = key;
+
+	return true;
+
+#elif defined(CONFIG_IS_HW_HISI)
+	/*
+	 * Huawei use type_attr_map and type_val_to_struct.
+	 * And use ebitmap not flex_array.
+	 */
+	struct ebitmap *new_type_attr_map =
+		krealloc(db->type_attr_map, sizeof(struct ebitmap) * value,
+			 GFP_ATOMIC);
+	if (!new_type_attr_map) {
+		pr_err("add_type: alloc type_attr_map failed\n");
+		return false;
+	}
+	db->type_attr_map = new_type_attr_map;
+	ebitmap_init(&db->type_attr_map[value - 1], HISI_SELINUX_EBITMAP_RO);
+	ebitmap_set_bit(&db->type_attr_map[value - 1], value - 1, 1);
+
+	struct type_datum **new_type_val_to_struct =
+		krealloc(db->type_val_to_struct,
+			 sizeof(*db->type_val_to_struct) * value, GFP_ATOMIC);
+	if (!new_type_val_to_struct) {
+		pr_err("add_type: alloc type_val_to_struct failed\n");
+		return false;
+	}
+	db->type_val_to_struct = new_type_val_to_struct;
+	db->type_val_to_struct[value - 1] = type;
+
+	/* GFP_ATOMIC like every other allocation in this file. */
+	char **new_val_to_name_types =
+		krealloc(db->sym_val_to_name[SYM_TYPES],
+			 sizeof(char *) * value, GFP_ATOMIC);
+	if (!new_val_to_name_types) {
+		pr_err("add_type: alloc val_to_name failed\n");
+		return false;
+	}
+	db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types;
+	db->sym_val_to_name[SYM_TYPES][value - 1] = key;
+
+	return true;
+#else
+	// flex_array is not extensible, we need to create a new bigger one instead
+	struct flex_array *new_type_attr_map_array =
+		flex_array_alloc(sizeof(struct ebitmap), value,
+				 GFP_ATOMIC | __GFP_ZERO);
+
+	struct flex_array *new_type_val_to_struct =
+		flex_array_alloc(sizeof(struct type_datum *), value,
+				 GFP_ATOMIC | __GFP_ZERO);
+
+	struct flex_array *new_val_to_name_types =
+		flex_array_alloc(sizeof(char *), value,
+				 GFP_ATOMIC | __GFP_ZERO);
+
+	int j;
+	void *old_elem;
+	struct flex_array *old_fa;
+
+	if (!new_type_attr_map_array) {
+		pr_err("add_type: alloc type_attr_map_array failed\n");
+		goto err_free_new;
+	}
+
+	if (!new_type_val_to_struct) {
+		pr_err("add_type: alloc type_val_to_struct failed\n");
+		goto err_free_new;
+	}
+
+	if (!new_val_to_name_types) {
+		pr_err("add_type: alloc val_to_name failed\n");
+		goto err_free_new;
+	}
+
+	// preallocate so we don't have to worry about the put ever failing
+	if (flex_array_prealloc(new_type_attr_map_array, 0, value,
+				GFP_ATOMIC | __GFP_ZERO)) {
+		pr_err("add_type: prealloc type_attr_map_array failed\n");
+		goto err_free_new;
+	}
+
+	if (flex_array_prealloc(new_type_val_to_struct, 0, value,
+				GFP_ATOMIC | __GFP_ZERO)) {
+		pr_err("add_type: prealloc type_val_to_struct_array failed\n");
+		goto err_free_new;
+	}
+
+	if (flex_array_prealloc(new_val_to_name_types, 0, value,
+				GFP_ATOMIC | __GFP_ZERO)) {
+		pr_err("add_type: prealloc val_to_name_types failed\n");
+		goto err_free_new;
+	}
+
+	// copy the old data or pointers to new flex arrays
+	for (j = 0; j < db->type_attr_map_array->total_nr_elements; j++) {
+		old_elem = flex_array_get(db->type_attr_map_array, j);
+		if (old_elem)
+			flex_array_put(new_type_attr_map_array, j, old_elem,
+				       GFP_ATOMIC | __GFP_ZERO);
+	}
+
+	for (j = 0; j < db->type_val_to_struct_array->total_nr_elements; j++) {
+		old_elem = flex_array_get_ptr(db->type_val_to_struct_array, j);
+		if (old_elem)
+			flex_array_put_ptr(new_type_val_to_struct, j, old_elem,
+					   GFP_ATOMIC | __GFP_ZERO);
+	}
+
+	for (j = 0; j < db->symtab[SYM_TYPES].nprim; j++) {
+		old_elem =
+			flex_array_get_ptr(db->sym_val_to_name[SYM_TYPES], j);
+		if (old_elem)
+			flex_array_put_ptr(new_val_to_name_types, j, old_elem,
+					   GFP_ATOMIC | __GFP_ZERO);
+	}
+
+	// store the pointer of old flex arrays first, when assigning new ones we
+	// should free it
+	old_fa = db->type_attr_map_array;
+	db->type_attr_map_array = new_type_attr_map_array;
+	if (old_fa) {
+		flex_array_free(old_fa);
+	}
+
+	ebitmap_init(flex_array_get(db->type_attr_map_array, value - 1));
+	ebitmap_set_bit(flex_array_get(db->type_attr_map_array, value - 1),
+			value - 1, 1);
+
+	old_fa = db->type_val_to_struct_array;
+	db->type_val_to_struct_array = new_type_val_to_struct;
+	if (old_fa) {
+		flex_array_free(old_fa);
+	}
+	flex_array_put_ptr(db->type_val_to_struct_array, value - 1, type,
+			   GFP_ATOMIC | __GFP_ZERO);
+
+	old_fa = db->sym_val_to_name[SYM_TYPES];
+	db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types;
+	if (old_fa) {
+		flex_array_free(old_fa);
+	}
+	flex_array_put_ptr(db->sym_val_to_name[SYM_TYPES], value - 1, key,
+			   GFP_ATOMIC | __GFP_ZERO);
+
+	return true;
+
+err_free_new:
+	/* Nothing has been published yet: drop the half-built arrays. */
+	if (new_type_attr_map_array)
+		flex_array_free(new_type_attr_map_array);
+	if (new_type_val_to_struct)
+		flex_array_free(new_type_val_to_struct);
+	if (new_val_to_name_types)
+		flex_array_free(new_val_to_name_types);
+	return false;
+#endif
+}
+#endif /* KSU_SUPPORT_ADD_TYPE */
+
+/*
+ * Add a new type (or attribute when @attr is true) named @type_name.
+ *
+ * Returns true when the type already exists or was added, false on failure
+ * or when KSU_SUPPORT_ADD_TYPE is not defined.
+ *
+ * The lookup tables are grown first, the symbol is inserted second and
+ * p_types.nprim is only bumped last, so a failure at any step leaves the
+ * policy with its previous type count. Slots past nprim that were already
+ * filled are simply overwritten by the next attempt.
+ */
+static bool add_type(struct policydb *db, const char *type_name, bool attr)
+{
+#ifdef KSU_SUPPORT_ADD_TYPE
+	struct type_datum *type = symtab_search(&db->p_types, type_name);
+	if (type) {
+		pr_warn("Type %s already exists\n", type_name);
+		return true;
+	}
+
+	u32 value = db->p_types.nprim + 1;
+	type = kzalloc(sizeof(*type), GFP_ATOMIC);
+	if (!type) {
+		pr_err("add_type: alloc type_datum failed.\n");
+		return false;
+	}
+
+	type->primary = 1;
+	type->value = value;
+	type->attribute = attr;
+
+	char *key = kstrdup(type_name, GFP_ATOMIC);
+	if (!key) {
+		pr_err("add_type: alloc key failed.\n");
+		kfree(type);
+		return false;
+	}
+
+	if (!add_type_grow_arrays(db, value, type, key)) {
+		kfree(key);
+		kfree(type);
+		return false;
+	}
+
+	if (symtab_insert(&db->p_types, key, type)) {
+		pr_err("add_type: insert symtab failed.\n");
+		kfree(key);
+		kfree(type);
+		return false;
+	}
+	db->p_types.nprim = value;
+
+	/* Make the new type usable by every role. */
+	int i;
+	for (i = 0; i < db->p_roles.nprim; ++i) {
+		ebitmap_set_bit(&db->role_val_to_struct[i]->types, value - 1,
+				1);
+	}
+
+	return true;
+#else
+	return false;
+#endif
+}
+
+static bool set_type_state(struct policydb *db, const char *type_name,
+			   bool permissive)
+{
+	struct type_datum *type;
+	if (type_name == NULL) {
+		struct hashtab_node *node;
+		ksu_hashtab_for_each(db->p_types.table, node)
+		{
+			type = (struct type_datum *)(node->datum);
+			if (ebitmap_set_bit(&db->permissive_map, type->value,
+					    permissive))
+				pr_info("Could not set bit in permissive map\n");
+		};
+	} else {
+		type = (struct type_datum *)symtab_search(&db->p_types,
+							  type_name);
+		if (type == NULL) {
+			pr_info("type %s does not exist\n", type_name);
+			return false;
+		}
+		if (ebitmap_set_bit(&db->permissive_map, type->value,
+				    
permissive)) { + pr_info("Could not set bit in permissive map\n"); + return false; + } + } + return true; +} + +static void add_typeattribute_raw(struct policydb *db, struct type_datum *type, + struct type_datum *attr) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) || defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY) + struct ebitmap *sattr = &db->type_attr_map_array[type->value - 1]; +#elif defined(CONFIG_IS_HW_HISI) + /* + * HISI_SELINUX_EBITMAP_RO is Huawei's unique features. + */ + struct ebitmap *sattr = &db->type_attr_map[type->value - 1], + HISI_SELINUX_EBITMAP_RO; +#else + struct ebitmap *sattr = + flex_array_get(db->type_attr_map_array, type->value - 1); +#endif + ebitmap_set_bit(sattr, attr->value - 1, 1); + + struct hashtab_node *node; + struct constraint_node *n; + struct constraint_expr *e; + ksu_hashtab_for_each(db->p_classes.table, node) + { + struct class_datum *cls = (struct class_datum *)(node->datum); + for (n = cls->constraints; n; n = n->next) { + for (e = n->expr; e; e = e->next) { + if (e->expr_type == CEXPR_NAMES && + ebitmap_get_bit(&e->type_names->types, + attr->value - 1)) { + ebitmap_set_bit(&e->names, + type->value - 1, 1); + } + } + } + }; +} + +static bool add_typeattribute(struct policydb *db, const char *type, + const char *attr) +{ + struct type_datum *type_d = symtab_search(&db->p_types, type); + if (type_d == NULL) { + pr_info("type %s does not exist\n", type); + return false; + } else if (type_d->attribute) { + pr_info("type %s is an attribute\n", attr); + return false; + } + + struct type_datum *attr_d = symtab_search(&db->p_types, attr); + if (attr_d == NULL) { + pr_info("attribute %s does not exist\n", type); + return false; + } else if (!attr_d->attribute) { + pr_info("type %s is not an attribute \n", attr); + return false; + } + + add_typeattribute_raw(db, type_d, attr_d); + return true; +} + +////////////////////////////////////////////////////////////////////////// + +// Operation on types 
+bool ksu_type(struct policydb *db, const char *name, const char *attr)
+{
+	return add_type(db, name, false) && add_typeattribute(db, name, attr);
+}
+
+bool ksu_attribute(struct policydb *db, const char *name)
+{
+	return add_type(db, name, true);
+}
+
+bool ksu_permissive(struct policydb *db, const char *type)
+{
+	return set_type_state(db, type, true);
+}
+
+bool ksu_enforce(struct policydb *db, const char *type)
+{
+	return set_type_state(db, type, false);
+}
+
+bool ksu_typeattribute(struct policydb *db, const char *type, const char *attr)
+{
+	return add_typeattribute(db, type, attr);
+}
+
+bool ksu_exists(struct policydb *db, const char *type)
+{
+	return symtab_search(&db->p_types, type) != NULL;
+}
+
+// Access vector rules
+// NOTE(review): the trailing bool passed to add_rule() presumably inverts
+// (removes) the permission bits - confirm against add_rule().
+bool ksu_allow(struct policydb *db, const char *src, const char *tgt,
+	       const char *cls, const char *perm)
+{
+	return add_rule(db, src, tgt, cls, perm, AVTAB_ALLOWED, false);
+}
+
+bool ksu_deny(struct policydb *db, const char *src, const char *tgt,
+	      const char *cls, const char *perm)
+{
+	return add_rule(db, src, tgt, cls, perm, AVTAB_ALLOWED, true);
+}
+
+bool ksu_auditallow(struct policydb *db, const char *src, const char *tgt,
+		    const char *cls, const char *perm)
+{
+	return add_rule(db, src, tgt, cls, perm, AVTAB_AUDITALLOW, false);
+}
+bool ksu_dontaudit(struct policydb *db, const char *src, const char *tgt,
+		   const char *cls, const char *perm)
+{
+	return add_rule(db, src, tgt, cls, perm, AVTAB_AUDITDENY, true);
+}
+
+// Extended permissions access vector rules
+bool ksu_allowxperm(struct policydb *db, const char *src, const char *tgt,
+		    const char *cls, const char *range)
+{
+	return add_xperm_rule(db, src, tgt, cls, range, AVTAB_XPERMS_ALLOWED,
+			      false);
+}
+
+bool ksu_auditallowxperm(struct policydb *db, const char *src, const char *tgt,
+			 const char *cls, const char *range)
+{
+	return add_xperm_rule(db, src, tgt, cls, range, AVTAB_XPERMS_AUDITALLOW,
+			      false);
+}
+
+bool ksu_dontauditxperm(struct policydb *db, const char *src, const char *tgt,
+			const char *cls, const char *range)
+{
+	return add_xperm_rule(db, src, tgt, cls, range, AVTAB_XPERMS_DONTAUDIT,
+			      false);
+}
+
+// Type rules
+// A non-NULL obj selects a filename transition, otherwise a plain
+// AVTAB_TRANSITION type rule is added.
+bool ksu_type_transition(struct policydb *db, const char *src, const char *tgt,
+			 const char *cls, const char *def, const char *obj)
+{
+	if (obj) {
+		return add_filename_trans(db, src, tgt, cls, def, obj);
+	} else {
+		return add_type_rule(db, src, tgt, cls, def, AVTAB_TRANSITION);
+	}
+}
+
+bool ksu_type_change(struct policydb *db, const char *src, const char *tgt,
+		     const char *cls, const char *def)
+{
+	return add_type_rule(db, src, tgt, cls, def, AVTAB_CHANGE);
+}
+
+bool ksu_type_member(struct policydb *db, const char *src, const char *tgt,
+		     const char *cls, const char *def)
+{
+	return add_type_rule(db, src, tgt, cls, def, AVTAB_MEMBER);
+}
+
+// File system labeling
+bool ksu_genfscon(struct policydb *db, const char *fs_name, const char *path,
+		  const char *ctx)
+{
+	return add_genfscon(db, fs_name, path, ctx);
+}
diff --git a/drivers/kernelsu/selinux/sepolicy.h b/drivers/kernelsu/selinux/sepolicy.h
new file mode 100644
index 000000000000..675d1499e46d
--- /dev/null
+++ b/drivers/kernelsu/selinux/sepolicy.h
@@ -0,0 +1,46 @@
+#ifndef __KSU_H_SEPOLICY
+#define __KSU_H_SEPOLICY
+
+#include
+
+#include "ss/policydb.h"
+
+// Operation on types
+bool ksu_type(struct policydb *db, const char *name, const char *attr);
+bool ksu_attribute(struct policydb *db, const char *name);
+bool ksu_permissive(struct policydb *db, const char *type);
+bool ksu_enforce(struct policydb *db, const char *type);
+bool ksu_typeattribute(struct policydb *db, const char *type, const char *attr);
+bool ksu_exists(struct policydb *db, const char *type);
+
+// Access vector rules
+bool ksu_allow(struct policydb *db, const char *src, const char *tgt,
+	       const char *cls, const char *perm);
+bool ksu_deny(struct policydb *db, const char *src, const char *tgt,
+	      const char *cls, const char *perm);
+bool ksu_auditallow(struct policydb *db, const char *src, const char *tgt,
+		    const char *cls, const char *perm);
+bool ksu_dontaudit(struct policydb *db, const char *src, const char *tgt,
+		   const char *cls, const char *perm);
+
+// Extended permissions access vector rules
+bool ksu_allowxperm(struct policydb *db, const char *src, const char *tgt,
+		    const char *cls, const char *range);
+bool ksu_auditallowxperm(struct policydb *db, const char *src, const char *tgt,
+			 const char *cls, const char *range);
+bool ksu_dontauditxperm(struct policydb *db, const char *src, const char *tgt,
+			const char *cls, const char *range);
+
+// Type rules
+bool ksu_type_transition(struct policydb *db, const char *src, const char *tgt,
+			 const char *cls, const char *def, const char *obj);
+bool ksu_type_change(struct policydb *db, const char *src, const char *tgt,
+		     const char *cls, const char *def);
+bool ksu_type_member(struct policydb *db, const char *src, const char *tgt,
+		     const char *cls, const char *def);
+
+// File system labeling
+bool ksu_genfscon(struct policydb *db, const char *fs_name, const char *path,
+		  const char *ctx);
+
+#endif
diff --git a/drivers/kernelsu/su_mount_ns.c b/drivers/kernelsu/su_mount_ns.c
new file mode 100644
index 000000000000..185599b3890e
--- /dev/null
+++ b/drivers/kernelsu/su_mount_ns.c
@@ -0,0 +1,239 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0)
+#include
+#else
+#include
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
+#include
+#else
+#include
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)
+#include
+#else
+#include
+#endif
+#endif
+
+extern int path_mount(const char *dev_name, struct path *path,
+		      const char *type_page, unsigned long flags,
+		      void *data_page);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
+#if defined(__aarch64__)
+extern long __arm64_sys_setns(const struct pt_regs *regs);
+#elif defined(__x86_64__)
+extern long __x64_sys_setns(const struct pt_regs *regs);
+#elif defined(__arm__) // https://syscalls.mebeim.net/?table=arm/32/eabi/latest
+extern long sys_setns(const struct pt_regs *regs);
+#endif
+
+// 4.17+: call the arch syscall wrapper with a zeroed pt_regs that carries
+// fd and flags in the first two parameter slots.
+static long ksu_sys_setns(int fd, int flags)
+{
+	struct pt_regs regs;
+	memset(&regs, 0, sizeof(regs));
+
+	PT_REGS_PARM1(&regs) = fd;
+	PT_REGS_PARM2(&regs) = flags;
+
+#if defined(__aarch64__)
+	return __arm64_sys_setns(&regs);
+#elif defined(__x86_64__)
+	return __x64_sys_setns(&regs);
+#elif defined(__arm__)
+	return sys_setns(&regs);
+#else
+	return -ENOSYS;
+#endif
+}
+#else
+static long ksu_sys_setns(int fd, int flags)
+{
+	return sys_setns(fd, flags);
+}
+__weak int ksys_unshare(unsigned long unshare_flags)
+{
+	return sys_unshare(unshare_flags);
+}
+#endif
+
+// global mode , need CAP_SYS_ADMIN and CAP_SYS_CHROOT to perform setns
+//
+// Joins the mount namespace of PID 1 via setns(CLONE_NEWNS), then tries to
+// restore the previous working directory by its absolute path. Every failure
+// is logged and the function simply returns; nothing is reported to the caller.
+static void ksu_mnt_ns_global(void)
+{
+	// save current working directory as absolute path before setns
+	char *pwd_path = NULL;
+	char *pwd_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!pwd_buf) {
+		pr_warn("no mem for pwd buffer, skip restore pwd!!\n");
+		goto try_setns;
+	}
+
+	struct path saved_pwd;
+	get_fs_pwd(current->fs, &saved_pwd);
+	pwd_path = d_path(&saved_pwd, pwd_buf, PATH_MAX);
+	path_put(&saved_pwd);
+
+	if (IS_ERR(pwd_path)) {
+		if (PTR_ERR(pwd_path) == -ENAMETOOLONG) {
+			pr_warn("absolute pwd longer than: %d, skip restore pwd!!\n",
+				PATH_MAX);
+		} else {
+			pr_warn("get absolute pwd failed: %ld\n", PTR_ERR(pwd_path));
+		}
+		// do not attempt to restore from an error pointer later on
+		pwd_path = NULL;
+	}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0)
+try_setns:
+
+	rcu_read_lock();
+	// &init_task is not init, but swapper/idle, which forks the init process
+	// so we need find init process
+	struct pid *pid_struct = find_pid_ns(1, &init_pid_ns);
+	if (unlikely(!pid_struct)) {
+		rcu_read_unlock();
+		pr_warn("failed to find pid_struct for PID 1\n");
+		goto out;
+	}
+
+	struct task_struct *pid1_task = get_pid_task(pid_struct, PIDTYPE_PID);
+	rcu_read_unlock();
+	if (unlikely(!pid1_task)) {
+		pr_warn("failed to get task_struct for PID 1\n");
+		goto out;
+	}
+	struct path ns_path;
+	long ret = (long)ns_get_path(&ns_path, pid1_task, &mntns_operations);
+	put_task_struct(pid1_task);
+	if (ret) {
+		pr_warn("failed get path for init mount namespace: %ld\n", ret);
+		goto out;
+	}
+#else
+try_setns:
+	barrier(); // to shutup declaration after label
+
+	// on UL kernels we can try to just feed it with struct path of /proc/1/ns/mnt
+	// we do NOT have ns_get_path. if it works, GOOD. if it doesn't I don't care.
+
+	struct path ns_path;
+	const struct cred *saved = override_creds(ksu_cred);
+
+	// make sure to LOOKUP_FOLLOW
+	// /proc/1/ns/mnt -> 'mnt:[4026531840]'
+	long ret = kern_path("/proc/1/ns/mnt", LOOKUP_FOLLOW, &ns_path);
+	if (ret) {
+		revert_creds(saved);
+		// ret is a long, so it must be printed with %ld
+		pr_warn("kern_path /proc/1/ns/mnt fail! ret: %ld\n", ret);
+		goto out;
+	}
+	revert_creds(saved);
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
+	struct file *ns_file = dentry_open(&ns_path, O_RDONLY, ksu_cred);
+#else
+	struct file *ns_file = dentry_open(ns_path.dentry, ns_path.mnt, O_RDONLY, ksu_cred);
+#endif
+
+	path_put(&ns_path);
+	if (IS_ERR(ns_file)) {
+		pr_warn("failed open file for init mount namespace: %ld\n",
+			PTR_ERR(ns_file));
+		goto out;
+	}
+
+	int fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		pr_warn("failed to get an unused fd: %d\n", fd);
+		fput(ns_file);
+		goto out;
+	}
+
+	// the fd only exists for the duration of the setns call
+	fd_install(fd, ns_file);
+	ret = ksu_sys_setns(fd, CLONE_NEWNS);
+
+	close_fd(fd);
+
+	if (ret) {
+		pr_warn("call setns failed: %ld\n", ret);
+		goto out;
+	}
+	// try to restore working directory using absolute path after setns
+	if (pwd_path) {
+		struct path new_pwd;
+		int err = kern_path(pwd_path, 0, &new_pwd);
+		if (!err) {
+			set_fs_pwd(current->fs, &new_pwd);
+			path_put(&new_pwd);
+		} else {
+			pr_warn("restore pwd failed: %d, path: %s\n", err, pwd_path);
+		}
+	}
+out:
+	kfree(pwd_buf);
+}
+
+// individual mode , need CAP_SYS_ADMIN to perform unshare and remount
+//
+// Unshares the mount namespace and then marks the root mount recursively
+// private. A failed unshare aborts; a failed remount is only logged.
+static void ksu_mnt_ns_individual(void)
+{
+	long ret = ksys_unshare(CLONE_NEWNS);
+	if (ret) {
+		pr_warn("call ksys_unshare failed: %ld\n", ret);
+		return;
+	}
+
+	// make root mount private
+	struct path root_path;
+	get_fs_root(current->fs, &root_path);
+	int pm_ret = path_mount(NULL, &root_path, NULL, MS_PRIVATE | MS_REC, NULL);
+	path_put(&root_path);
+
+	if (pm_ret < 0) {
+		pr_err("failed to make root private, err: %d\n", pm_ret);
+	}
+}
+
+// Apply the requested mount-namespace mode (KSU_NS_*) to the current task.
+// The work runs with ksu_cred installed and the previous creds are restored
+// before returning.
+void setup_mount_ns(int32_t ns_mode)
+{
+	// inherit mode
+	if (ns_mode == KSU_NS_INHERITED) {
+		// do nothing
+		return;
+	}
+
+	if (ns_mode != KSU_NS_GLOBAL && ns_mode != KSU_NS_INDIVIDUAL) {
+		pr_warn("pid: %d ,unknown mount namespace mode: %d\n", current->pid,
+			ns_mode);
+		return;
+	}
+
+	if (!ksu_cred) {
+		pr_err("no ksu cred! skip mnt_ns magic for pid: %d.\n", current->pid);
+		return;
+	}
+
+	const struct cred *old_cred = override_creds(ksu_cred);
+	if (ns_mode == KSU_NS_GLOBAL) {
+		ksu_mnt_ns_global();
+	} else {
+		ksu_mnt_ns_individual();
+	}
+	revert_creds(old_cred);
+}
diff --git a/drivers/kernelsu/su_mount_ns.h b/drivers/kernelsu/su_mount_ns.h
new file mode 100644
index 000000000000..f118d8135c12
--- /dev/null
+++ b/drivers/kernelsu/su_mount_ns.h
@@ -0,0 +1,10 @@
+#ifndef __KSU_SU_MOUNT_NS_H
+#define __KSU_SU_MOUNT_NS_H
+
+#define KSU_NS_INHERITED 0
+#define KSU_NS_GLOBAL 1
+#define KSU_NS_INDIVIDUAL 2
+
+void setup_mount_ns(int32_t ns_mode);
+
+#endif
diff --git a/drivers/kernelsu/sucompat.c b/drivers/kernelsu/sucompat.c
new file mode 100644
index 000000000000..091df1398c5b
--- /dev/null
+++ b/drivers/kernelsu/sucompat.c
@@ -0,0 +1,410 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+#include
+#else
+#include
+#endif
+
+#define SU_PATH "/system/bin/su"
+#define SH_PATH "/system/bin/sh"
+
+static bool ksu_su_compat_enabled __read_mostly = true;
+
+// Copy len bytes from d into the current task's user stack area and return
+// the userspace address, or NULL when the copy fails.
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+static void __user *userspace_stack_buffer(const void *d, size_t len)
+{
+	/* To avoid having to mmap a page in userspace, just write below the stack
+	 * pointer. */
+	char __user *p = (void __user *)current_user_stack_pointer() - len;
+
+	return copy_to_user(p, d, len) ? NULL : p;
+}
+#else
+static void __user *userspace_stack_buffer(const void *d, size_t len)
+{
+	if (!current->mm)
+		return NULL;
+
+	volatile unsigned long start_stack = current->mm->start_stack;
+	unsigned int step = 32;
+	char __user *p = NULL;
+
+	// probe below mm->start_stack, doubling the distance (32..2048 bytes)
+	// until a copy succeeds
+	do {
+		p = (void __user *)(start_stack - step - len);
+		if (!copy_to_user(p, d, len)) {
+			/* pr_info("%s: start_stack: %lx p: %lx len: %zu\n",
+				__func__, start_stack, (unsigned long)p, len ); */
+			return p;
+		}
+		step = step + step;
+	} while (step <= 2048);
+	return NULL;
+}
+#endif
+
+// Userspace copy of SH_PATH, or NULL if it could not be written.
+static char __user *sh_user_path(void)
+{
+	static const char sh_path[] = SH_PATH;
+
+	return userspace_stack_buffer(sh_path, sizeof(sh_path));
+}
+
+// Userspace copy of KSUD_PATH, or NULL if it could not be written.
+static char __user *ksud_user_path(void)
+{
+	static const char ksud_path[] = KSUD_PATH;
+
+	return userspace_stack_buffer(ksud_path, sizeof(ksud_path));
+}
+
+// Common gate for every sucompat hook: the feature must be enabled, the
+// caller must have no seccomp mode set, its uid must be on the allow list,
+// and both the hook slot and the pointer stored in it must be non-NULL.
+__attribute__((hot))
+static __always_inline bool is_su_allowed(const void **ptr_to_check)
+{
+	barrier();
+	if (!ksu_su_compat_enabled)
+		return false;
+
+	if (likely(!!current->seccomp.mode))
+		return false;
+
+	// with seccomp check above, we can make this neutral
+	kuid_t current_uid = current_uid();
+	if (!ksu_is_allow_uid_for_current( ksu_get_uid_t(current_uid) ))
+		return false;
+
+	// first check the pointer-to-pointer
+	if (unlikely(!ptr_to_check))
+		return false;
+
+	// now dereference pointer-to-pointer to check actual pointer
+	if (unlikely(!*ptr_to_check))
+		return false;
+
+	return true;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
+// Before boot has completed, escalate init when it executes KSUD_PATH
+// (path supplied as a userspace pointer).
+static inline void sys_execve_escape_ksud(const char __user **filename_user)
+{
+	if (likely(ksu_boot_completed))
+		return;
+
+	// see if its init
+	// current_cred() borrows the task's creds; get_current_cred() would take
+	// a reference that nothing here ever drops.
+	if (!is_init(current_cred()))
+		return;
+
+	const char ksud_path[] = KSUD_PATH;
+	char path[sizeof(ksud_path)];
+
+	// see if its trying to execute ksud
+	if (ksu_copy_from_user_retry(path, *filename_user, sizeof(path)))
+		return;
+
+	if (memcmp(ksud_path, path, sizeof(path)))
+		return;
+
+	pr_info("sys_execve: escape init executing ksud with pid: %d\n", current->pid);
+
+	escape_to_root_forced(); // give this context all permissions
+
+	return;
+}
+
+// Same as above for hooks that already hold the path in kernel memory.
+static inline void kernel_execve_escape_ksud(void *filename_ptr)
+{
+	if (likely(ksu_boot_completed))
+		return;
+
+	// see if its init
+	// borrowed creds, see sys_execve_escape_ksud()
+	if (!is_init(current_cred()))
+		return;
+
+	if (likely(memcmp(filename_ptr, KSUD_PATH, sizeof(KSUD_PATH))))
+		return;
+
+	pr_info("kernel_execve: escape init executing ksud with pid: %d\n", current->pid);
+
+	escape_to_root_forced(); // give this context all permissions
+
+	return;
+}
+#else
+static inline void sys_execve_escape_ksud(const char __user **filename_user) { } // no-op
+static inline void kernel_execve_escape_ksud(void *filename_ptr) {} // no-op
+#endif
+
+// Shared body of the userspace-pointer hooks. When the user path equals
+// SU_PATH (terminator included), log it and redirect *filename_user to ksud
+// (escalate) or to sh (no escalation). Always returns 0.
+static int ksu_sucompat_user_common(const char __user **filename_user,
+				const char *syscall_name,
+				const bool escalate,
+				const uint8_t sym)
+{
+	const char su[] = SU_PATH;
+
+	char path[sizeof(su)]; // sizeof includes nullterm already!
+	if (ksu_copy_from_user_retry(path, *filename_user, sizeof(path)))
+		return 0;
+
+	// what we shouldve copied should've been preterminated!
+	// path[sizeof(path) - 1] = '\0';
+
+	if (memcmp(path, su, sizeof(su)))
+		return 0;
+
+	write_sulog(sym);
+
+	if (escalate) {
+		pr_info("%s su found\n", syscall_name);
+		*filename_user = ksud_user_path();
+		escape_with_root_profile(); // escalate !!
+	} else {
+		pr_info("%s su->sh!\n", syscall_name);
+		*filename_user = sh_user_path();
+	}
+
+	return 0;
+}
+
+// sys_faccessat
+int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode,
+			 int *__unused_flags)
+{
+	if (!is_su_allowed((const void **)filename_user))
+		return 0;
+
+	return ksu_sucompat_user_common(filename_user, "faccessat", false, 'a');
+}
+
+// sys_newfstatat, sys_fstat64
+int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags)
+{
+	if (!is_su_allowed((const void **)filename_user))
+		return 0;
+
+	return ksu_sucompat_user_common(filename_user, "newfstatat", false, 's');
+}
+
+// sys_execve, compat_sys_execve
+int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user,
+			       void *__never_use_argv, void *__never_use_envp,
+			       int *__never_use_flags)
+{
+	sys_execve_escape_ksud(filename_user);
+
+	if (!is_su_allowed((const void **)filename_user))
+		return 0;
+
+	return ksu_sucompat_user_common(filename_user, "sys_execve", true, 'x');
+}
+
+// getname_flags on fs/namei.c, this hooks ALL fs-related syscalls.
+// NOT RECOMMENDED for daily use. mostly for debugging purposes.
+int ksu_getname_flags_user(const char __user **filename_user, int flags)
+{
+	if (!is_su_allowed((const void **)filename_user))
+		return 0;
+
+	// sys_execve always calls getname, which sets flags = 0 on getname_flags
+	// we can use it to deduce if caller is likely execve
+
+	uint8_t sym = '$';
+	bool escalate = false;
+
+	if (!flags) {
+		escalate = true;
+		sym = 'x';
+	}
+
+	return ksu_sucompat_user_common(filename_user, "getname_flags", escalate, sym);
+}
+
+// Shared body of the kernel-pointer hooks. filename_ptr must point at a
+// writable kernel path buffer: when it equals SU_PATH (terminator included)
+// it is overwritten in place with KSUD_PATH (escalate) or SH_PATH.
+// Always returns 0.
+static int ksu_sucompat_kernel_common(void *filename_ptr, const char *function_name, bool escalate, const uint8_t sym)
+{
+
+	if (likely(memcmp(filename_ptr, SU_PATH, sizeof(SU_PATH))))
+		return 0;
+
+	write_sulog(sym);
+
+	if (escalate) {
+		pr_info("%s su found\n", function_name);
+		memcpy(filename_ptr, KSUD_PATH, sizeof(KSUD_PATH));
+		escape_with_root_profile();
+	} else {
+		pr_info("%s su->sh\n", function_name);
+		memcpy(filename_ptr, SH_PATH, sizeof(SH_PATH));
+	}
+	return 0;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0)
+// Return the kernel path string behind a struct filename ** hook argument,
+// or NULL when the slot is NULL, empty, or holds an error pointer.
+// NOTE(review): hook sites presumably may hand over the unchecked result of
+// getname(), i.e. an ERR_PTR - confirm against the hook placement.
+// IS_ERR_OR_NULL() comes from <linux/err.h>; presumably already reachable
+// through the header that defines struct filename - confirm.
+static inline void *ksu_filename_kname(struct filename **filename_ptr)
+{
+	if (unlikely(!filename_ptr || IS_ERR_OR_NULL(*filename_ptr)))
+		return NULL;
+
+	return (void *)(*filename_ptr)->name;
+}
+
+// for do_execveat_common / do_execve_common on >= 3.14
+// take note: struct filename **filename
+int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr,
+				 void *__never_use_argv, void *__never_use_envp,
+				 int *__never_use_flags)
+{
+	// validate before the first dereference, not after it
+	void *kname = ksu_filename_kname(filename_ptr);
+	if (unlikely(!kname))
+		return 0;
+
+	kernel_execve_escape_ksud(kname);
+
+	if (!is_su_allowed((const void **)filename_ptr))
+		return 0;
+
+	// struct filename *filename = *filename_ptr;
+	// return ksu_do_execveat_common((void *)filename->name, "do_execveat_common");
+	// nvm this, just inline
+
+	return ksu_sucompat_kernel_common(kname, "do_execveat_common", true, 'x');
+}
+
+// for compatibility to old hooks
+int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv,
+			void *envp, int *flags)
+{
+	// validate before the first dereference, not after it
+	void *kname = ksu_filename_kname(filename_ptr);
+	if (unlikely(!kname))
+		return 0;
+
+	kernel_execve_escape_ksud(kname);
+
+	if (!is_su_allowed((const void **)filename_ptr))
+		return 0;
+
+	return ksu_sucompat_kernel_common(kname, "do_execveat_common", true, 'x');
+}
+#else
+// for do_execve_common on < 3.14
+// take note: char **filename
+int ksu_legacy_execve_sucompat(const char **filename_ptr,
+				void *__never_use_argv,
+				void *__never_use_envp)
+{
+	// validate before the first dereference, not after it
+	if (unlikely(!filename_ptr || !*filename_ptr))
+		return 0;
+
+	kernel_execve_escape_ksud((void *)*filename_ptr);
+
+	if (!is_su_allowed((const void **)filename_ptr))
+		return 0;
+
+	return ksu_sucompat_kernel_common((void *)*filename_ptr, "do_execve_common", true, 'x');
+}
+#endif
+
+// vfs_statx for 5.18+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 18, 0)
+int ksu_handle_vfs_statx(void *__never_use_dfd, struct filename **filename_ptr,
+			 void *__never_use_flags, void **__never_use_stat,
+			 void *__never_use_request_mask)
+{
+	if (!is_su_allowed((const void **)filename_ptr))
+		return 0;
+
+	// is_su_allowed() only rejects NULL; also reject error pointers
+	void *kname = ksu_filename_kname(filename_ptr);
+	if (unlikely(!kname))
+		return 0;
+
+	return ksu_sucompat_kernel_common(kname, "vfs_statx", false, 's');
+}
+#endif
+
+// getname_flags on fs/namei.c, this hooks ALL fs-related syscalls.
+// put the hook right after usercopy
+// NOT RECOMMENDED for daily use. mostly for debugging purposes.
+int ksu_getname_flags_kernel(char **kname, int flags)
+{
+	if (!is_su_allowed((const void **)kname))
+		return 0;
+
+	// flags == 0 is treated as "caller is execve": escalate and log as 'x'
+	uint8_t sym = '$';
+	bool escalate = false;
+
+	if (!flags) {
+		escalate = true;
+		sym = 'x';
+	}
+
+	return ksu_sucompat_kernel_common((void *)*kname, "getname_flags", escalate, sym);
+}
+
+// Forward declarations use (void): an empty parameter list is not a
+// prototype before C23 and would disable argument checking.
+#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE
+static void syscall_table_sucompat_enable(void);
+static void syscall_table_sucompat_disable(void);
+#endif
+
+#ifdef CONFIG_KSU_KRETPROBES_SUCOMPAT
+static void rp_sucompat_exit(void);
+static void rp_sucompat_init(void);
+#endif
+
+// Install whichever optional hook backends are configured, then set the
+// global enable flag.
+static void ksu_sucompat_enable(void)
+{
+
+#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE
+	syscall_table_sucompat_enable();
+#endif
+
+#ifdef CONFIG_KSU_KRETPROBES_SUCOMPAT
+	rp_sucompat_init();
+#endif
+
+	ksu_su_compat_enabled = true;
+	pr_info("%s: hooks enabled: exec, faccessat, stat\n", __func__);
+}
+
+// Remove the configured hook backends, then clear the global enable flag.
+static void ksu_sucompat_disable(void)
+{
+
+#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE
+	syscall_table_sucompat_disable();
+#endif
+
+#ifdef CONFIG_KSU_KRETPROBES_SUCOMPAT
+	rp_sucompat_exit();
+#endif
+
+	ksu_su_compat_enabled = false;
+	pr_info("%s: hooks disabled: exec, faccessat, stat\n", __func__);
+}
+
+// Feature getter: reports 1 when su compat is enabled, 0 otherwise.
+static int su_compat_feature_get(u64 *value)
+{
+	*value = ksu_su_compat_enabled ? 1 : 0;
+	return 0;
+}
+
+// Feature setter: any non-zero value enables su compat, zero disables it.
+// Does nothing when the requested state is already active. Always returns 0.
+static int su_compat_feature_set(u64 value)
+{
+	bool enable = value != 0;
+
+	if (enable == ksu_su_compat_enabled) {
+		pr_info("su_compat: no need to change\n");
+		return 0;
+	}
+
+	if (enable) {
+		ksu_sucompat_enable();
+	} else {
+		ksu_sucompat_disable();
+	}
+
+	ksu_su_compat_enabled = enable;
+	pr_info("su_compat: set to %d\n", enable);
+
+	return 0;
+}
+
+static const struct ksu_feature_handler su_compat_handler = {
+	.feature_id = KSU_FEATURE_SU_COMPAT,
+	.name = "su_compat",
+	.get_handler = su_compat_feature_get,
+	.set_handler = su_compat_feature_set,
+};
+
+// sucompat: permitted process can execute 'su' to gain root access.
+void ksu_sucompat_init() +{ + if (ksu_register_feature_handler(&su_compat_handler)) { + pr_err("Failed to register su_compat feature handler\n"); + } +} + +void ksu_sucompat_exit() +{ + ksu_unregister_feature_handler(KSU_FEATURE_SU_COMPAT); +} diff --git a/drivers/kernelsu/sucompat.h b/drivers/kernelsu/sucompat.h new file mode 100644 index 000000000000..52c30780a7b2 --- /dev/null +++ b/drivers/kernelsu/sucompat.h @@ -0,0 +1,10 @@ +#ifndef __KSU_H_SUCOMPAT +#define __KSU_H_SUCOMPAT +#include +#include +#include + +void ksu_sucompat_init(void); +void ksu_sucompat_exit(void); + +#endif diff --git a/drivers/kernelsu/supercalls.c b/drivers/kernelsu/supercalls.c new file mode 100644 index 000000000000..3ca054dc2dde --- /dev/null +++ b/drivers/kernelsu/supercalls.c @@ -0,0 +1,955 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // utsname() and uts_sem + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) +#include // put_task_struct +#else +#include +#endif + +// Permission check functions +bool only_manager(void) +{ + return is_manager(); +} + +bool only_root(void) +{ + kuid_t current_uid = current_uid(); + return ksu_get_uid_t(current_uid) == 0; +} + +bool manager_or_root(void) +{ + kuid_t current_uid = current_uid(); + return ksu_get_uid_t(current_uid) == 0 || is_manager(); +} + +bool always_allow(void) +{ + return true; // No permission check +} + +bool allowed_for_su(void) +{ + kuid_t current_uid = current_uid(); + bool is_allowed = is_manager() || ksu_is_allow_uid_for_current(ksu_get_uid_t(current_uid)); + return is_allowed; +} + +static int do_grant_root(void __user *arg) +{ + // we already check uid above on allowed_for_su() + + write_sulog('i'); // log ioctl escalation + + kuid_t current_uid = current_uid(); + pr_info("allow root for: %d\n", ksu_get_uid_t(current_uid)); + escape_with_root_profile(); + + return 0; +} + +static uint32_t ksuver_override = 0; + +static int do_get_info(void 
__user *arg) +{ + struct ksu_get_info_cmd cmd = {.version = KERNEL_SU_VERSION, .flags = 0}; + + if (ksuver_override) + cmd.version = ksuver_override; + + if (is_manager()) { + cmd.flags |= 0x2; + } + cmd.features = KSU_FEATURE_MAX; + + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("get_version: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_report_event(void __user *arg) +{ + struct ksu_report_event_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + switch (cmd.event) { + case EVENT_POST_FS_DATA: { + static bool post_fs_data_lock = false; + if (!post_fs_data_lock) { + post_fs_data_lock = true; + pr_info("post-fs-data triggered\n"); + on_post_fs_data(); + } + break; + } + case EVENT_BOOT_COMPLETED: { + static bool boot_complete_lock = false; + if (!boot_complete_lock) { + boot_complete_lock = true; + pr_info("boot_complete triggered\n"); + on_boot_completed(); + } + break; + } + case EVENT_MODULE_MOUNTED: { + ksu_module_mounted = true; + pr_info("module mounted!\n"); + on_module_mounted(); + break; + } + default: + break; + } + + return 0; +} + +static int do_set_sepolicy(void __user *arg) +{ + struct ksu_set_sepolicy_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + return handle_sepolicy(cmd.cmd, (void __user *)cmd.arg); +} + +static int do_check_safemode(void __user *arg) +{ + struct ksu_check_safemode_cmd cmd; + + cmd.in_safe_mode = ksu_is_safe_mode(); + + if (cmd.in_safe_mode) { + pr_warn("safemode enabled!\n"); + } + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("check_safemode: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_new_get_allow_list_common(void __user *arg, bool allow) +{ + struct ksu_new_get_allow_list_cmd cmd; + int *arr = NULL; + int err = 0; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + if (cmd.count) { + arr = kmalloc(sizeof(int) * cmd.count, GFP_KERNEL); + if 
(!arr) { + return -ENOMEM; + } + } + + bool success = ksu_get_allow_list(arr, cmd.count, &cmd.count, &cmd.total_count, allow); + + if (!success) { + err = -EFAULT; + goto out; + } + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("new_get_allow_list: copy_to_user count failed\n"); + err = -EFAULT; + goto out; + } + + if (cmd.count && copy_to_user(&((struct ksu_new_get_allow_list_cmd *)arg)->uids, arr, sizeof(int) * cmd.count)) { + pr_err("new_get_allow_list: copy_to_user uids failed\n"); + err = -EFAULT; + } + +out: + if (arr) { + kfree(arr); + } + return err; +} + +static int do_new_get_deny_list(void __user *arg) +{ + return do_new_get_allow_list_common(arg, false); +} + +static int do_new_get_allow_list(void __user *arg) +{ + return do_new_get_allow_list_common(arg, true); +} + +static int do_get_allow_list_common(void __user *arg, bool allow) +{ + int *arr = NULL; + int err = 0; + u16 count; + u32 out_count; + static const u16 kSize = 128; + + arr = kmalloc(sizeof(int) * kSize, GFP_KERNEL); + if (!arr) { + return -ENOMEM; + } + + bool success = ksu_get_allow_list(arr, kSize, &count, NULL, allow); + + if (!success) { + err = -EFAULT; + goto out; + } + + out_count = count; + + if (copy_to_user(arg + offsetof(struct ksu_get_allow_list_cmd, count), + &out_count, sizeof(u32))) { + pr_err("get_allow_list: copy_to_user count failed\n"); + err = -EFAULT; + goto out; + } + + if (copy_to_user(arg, arr, sizeof(u32) * count)) { + pr_err("get_allow_list: copy_to_user uids failed\n"); + err = -EFAULT; + } + +out: + if (arr) { + kfree(arr); + } + return err; +} + +static int do_get_deny_list(void __user *arg) +{ + return do_get_allow_list_common(arg, false); +} + +static int do_get_allow_list(void __user *arg) +{ + return do_get_allow_list_common(arg, true); +} + +static int do_uid_granted_root(void __user *arg) +{ + struct ksu_uid_granted_root_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + cmd.granted = 
ksu_is_allow_uid_for_current(cmd.uid); + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("uid_granted_root: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_uid_should_umount(void __user *arg) +{ + struct ksu_uid_should_umount_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + cmd.should_umount = ksu_uid_should_umount(cmd.uid); + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("uid_should_umount: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_get_manager_appid(void __user *arg) +{ + struct ksu_get_manager_appid_cmd cmd; + + cmd.appid = ksu_get_manager_appid(); + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("get_manager_appid: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_get_app_profile(void __user *arg) +{ + struct ksu_get_app_profile_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("get_app_profile: copy_from_user failed\n"); + return -EFAULT; + } + + if (!ksu_get_app_profile(&cmd.profile)) { + return -ENOENT; + } + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("get_app_profile: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_set_app_profile(void __user *arg) +{ + struct ksu_set_app_profile_cmd cmd; + int ret; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("set_app_profile: copy_from_user failed\n"); + return -EFAULT; + } + + ret = ksu_set_app_profile(&cmd.profile); + if (!ret) { + ksu_persistent_allow_list(); + } + + return ret; +} + +static int do_get_feature(void __user *arg) +{ + struct ksu_get_feature_cmd cmd; + bool supported; + int ret; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("get_feature: copy_from_user failed\n"); + return -EFAULT; + } + + ret = ksu_get_feature(cmd.feature_id, &cmd.value, &supported); + cmd.supported = supported ? 
1 : 0; + + if (ret && supported) { + pr_err("get_feature: failed for feature %u: %d\n", cmd.feature_id, ret); + return ret; + } + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("get_feature: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_set_feature(void __user *arg) +{ + struct ksu_set_feature_cmd cmd; + int ret; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("set_feature: copy_from_user failed\n"); + return -EFAULT; + } + + ret = ksu_set_feature(cmd.feature_id, cmd.value); + if (ret) { + pr_err("set_feature: failed for feature %u: %d\n", cmd.feature_id, ret); + return ret; + } + + return 0; +} + +static int do_get_wrapper_fd(void __user *arg) { + if (!ksu_file_sid) { + return -EINVAL; + } + + struct ksu_get_wrapper_fd_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("get_wrapper_fd: copy_from_user failed\n"); + return -EFAULT; + } + + return ksu_install_file_wrapper(cmd.fd); +} + +// Get task mark status +// Returns: 1 if marked, 0 if not marked, -ESRCH if task not found +/* BRICKPORT: on this one we return 1 if seccomp is disabled and 0 if enabled */ +static int ksu_get_task_mark(pid_t pid) +{ + struct task_struct *task; + int ret = -ESRCH; + + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return ret; + } + + ret = !task->seccomp.mode; + rcu_read_unlock(); + + return ret; +} + +static int do_manage_mark(void __user *arg) +{ + struct ksu_manage_mark_cmd cmd; + int ret = 0; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("manage_mark: copy_from_user failed\n"); + return -EFAULT; + } + + switch (cmd.operation) { + case KSU_MARK_GET: { + // on this one, we return seccomp status of a pid instead + // at the very least we have partial featureset + ret = ksu_get_task_mark(cmd.pid); + if (ret < 0) { + pr_err("manage_mark: get failed for pid %d: %d\n", cmd.pid, ret); + return ret; + } + cmd.result = (u32)ret; + break; + } +#if 0 // TODO: revisit 
this sometime + case KSU_MARK_MARK: { break; } + case KSU_MARK_UNMARK: { break; } + case KSU_MARK_REFRESH: { break; } +#endif + default: { + pr_err("manage_mark: invalid operation %u\n", cmd.operation); + return -EINVAL; + } + } + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("manage_mark: copy_to_user failed\n"); + return -EFAULT; + } + + + return 0; +} +static int do_nuke_ext4_sysfs(void __user *arg) +{ + struct ksu_nuke_ext4_sysfs_cmd cmd; + char mnt[256]; + long ret; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) + return -EFAULT; + + if (!cmd.arg) + return -EINVAL; + + memset(mnt, 0, sizeof(mnt)); + + ret = strncpy_from_user(mnt, (void __user *)cmd.arg, sizeof(mnt)); + if (ret < 0) { + pr_err("nuke ext4 copy mnt failed: %ld\n", ret); + return -EFAULT; // or: return ret; + } + + if (ret == sizeof(mnt)) { // no NUL found within sizeof(mnt) bytes, path was truncated + pr_err("nuke ext4 mnt path too long\n"); + return -ENAMETOOLONG; + } + + pr_info("do_nuke_ext4_sysfs: %s\n", mnt); + + return nuke_ext4_sysfs(mnt); +} + +struct list_head mount_list = LIST_HEAD_INIT(mount_list); +DECLARE_RWSEM(mount_list_lock); + +static int add_try_umount(void __user *arg) +{ + struct mount_entry *new_entry, *entry, *tmp; + struct ksu_add_try_umount_cmd cmd; + char buf[256] = {0}; + + if (copy_from_user(&cmd, arg, sizeof cmd)) + return -EFAULT; + + switch (cmd.mode) { + case KSU_UMOUNT_WIPE: { + struct mount_entry *entry, *tmp; + down_write(&mount_list_lock); + list_for_each_entry_safe(entry, tmp, &mount_list, list) { + pr_info("wipe_umount_list: removing entry: %s\n", entry->umountable); + list_del(&entry->list); + kfree(entry->umountable); + kfree(entry); + } + up_write(&mount_list_lock); + + return 0; + } + + case KSU_UMOUNT_ADD: { + long len = strncpy_from_user(buf, (const char __user *)cmd.arg, 256); + if (len <= 0) + return -EFAULT; + + buf[sizeof(buf) - 1] = '\0'; + + new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + new_entry->umountable = kstrdup(buf, GFP_KERNEL); + if
(!new_entry->umountable) { + kfree(new_entry); + return -ENOMEM; + } + + down_write(&mount_list_lock); + + // disallow dupes + // if this gets too many, we can consider moving this whole task to a kthread + list_for_each_entry(entry, &mount_list, list) { + if (!strcmp(entry->umountable, buf)) { + pr_info("cmd_add_try_umount: %s is already here!\n", buf); + up_write(&mount_list_lock); + kfree(new_entry->umountable); + kfree(new_entry); + return -EEXIST; + } + } + + // now check flags and add + // this also serves as a null check + if (cmd.flags) + new_entry->flags = cmd.flags; + else + new_entry->flags = 0; + + // debug + list_add(&new_entry->list, &mount_list); + up_write(&mount_list_lock); + pr_info("cmd_add_try_umount: %s added!\n", buf); + + return 0; + } + + // this is just strcmp'd wipe anyway + case KSU_UMOUNT_DEL: { + long len = strncpy_from_user(buf, (const char __user *)cmd.arg, sizeof(buf) - 1); + if (len <= 0) + return -EFAULT; + + buf[sizeof(buf) - 1] = '\0'; + + down_write(&mount_list_lock); + list_for_each_entry_safe(entry, tmp, &mount_list, list) { + if (!strcmp(entry->umountable, buf)) { + pr_info("cmd_add_try_umount: entry removed: %s\n", entry->umountable); + list_del(&entry->list); + kfree(entry->umountable); + kfree(entry); + } + } + up_write(&mount_list_lock); + + return 0; + } + + // this way userspace can deduce the memory it has to prepare. + case KSU_UMOUNT_GETSIZE: { + // check for pointer first + if (!cmd.arg) + return -EFAULT; + + size_t total_size = 0; // size of list in bytes + + down_read(&mount_list_lock); + list_for_each_entry(entry, &mount_list, list) { + total_size = total_size + strlen(entry->umountable) + 1; // + 1 for \0 + } + up_read(&mount_list_lock); + + pr_info("cmd_add_try_umount: total_size: %zu\n", total_size); + + if (copy_to_user((size_t __user *)cmd.arg, &total_size, sizeof(total_size))) + return -EFAULT; + + return 0; + } + + // WARNING! this is straight up pointerwalking. 
+ // this way we dont need to redefine the ioctl defs. + // this also avoids us needing to kmalloc + // userspace have to send pointer to memory (malloc/alloca) or pointer to a VLA. + case KSU_UMOUNT_GETLIST: { + if (!cmd.arg) + return -EFAULT; + + char *user_buf = (char *)cmd.arg; + + down_read(&mount_list_lock); + list_for_each_entry(entry, &mount_list, list) { + pr_info("cmd_add_try_umount: entry: %s\n", entry->umountable); + + if (copy_to_user((char __user *)user_buf, entry->umountable, strlen(entry->umountable) + 1 )) { + up_read(&mount_list_lock); + return -EFAULT; + } + + // walk it! +1 for null terminator + user_buf = user_buf + strlen(entry->umountable) + 1; + } + up_read(&mount_list_lock); + + return 0; + } + + default: { + pr_err("cmd_add_try_umount: invalid operation %u\n", cmd.mode); + return -EINVAL; + } + + } // switch(cmd.mode) + + return 0; +} + +// IOCTL handlers mapping table +static const struct ksu_ioctl_cmd_map ksu_ioctl_handlers[] = { + { .cmd = KSU_IOCTL_GRANT_ROOT, .name = "GRANT_ROOT", .handler = do_grant_root, .perm_check = allowed_for_su }, + { .cmd = KSU_IOCTL_GET_INFO, .name = "GET_INFO", .handler = do_get_info, .perm_check = always_allow }, + { .cmd = KSU_IOCTL_REPORT_EVENT, .name = "REPORT_EVENT", .handler = do_report_event, .perm_check = only_root }, + { .cmd = KSU_IOCTL_SET_SEPOLICY, .name = "SET_SEPOLICY", .handler = do_set_sepolicy, .perm_check = only_root }, + { .cmd = KSU_IOCTL_CHECK_SAFEMODE, .name = "CHECK_SAFEMODE", .handler = do_check_safemode, .perm_check = always_allow }, + { .cmd = KSU_IOCTL_GET_ALLOW_LIST, .name = "GET_ALLOW_LIST", .handler = do_get_allow_list, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_GET_DENY_LIST, .name = "GET_DENY_LIST", .handler = do_get_deny_list, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_NEW_GET_ALLOW_LIST, .name = "NEW_GET_ALLOW_LIST", .handler = do_new_get_allow_list, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_NEW_GET_DENY_LIST, .name = "NEW_GET_DENY_LIST", 
.handler = do_new_get_deny_list, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_UID_GRANTED_ROOT, .name = "UID_GRANTED_ROOT", .handler = do_uid_granted_root, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_UID_SHOULD_UMOUNT, .name = "UID_SHOULD_UMOUNT", .handler = do_uid_should_umount, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_GET_MANAGER_APPID, .name = "GET_MANAGER_APPID", .handler = do_get_manager_appid, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_GET_APP_PROFILE, .name = "GET_APP_PROFILE", .handler = do_get_app_profile, .perm_check = only_manager }, + { .cmd = KSU_IOCTL_SET_APP_PROFILE, .name = "SET_APP_PROFILE", .handler = do_set_app_profile, .perm_check = only_manager }, + { .cmd = KSU_IOCTL_GET_FEATURE, .name = "GET_FEATURE", .handler = do_get_feature, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_SET_FEATURE, .name = "SET_FEATURE", .handler = do_set_feature, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_GET_WRAPPER_FD, .name = "GET_WRAPPER_FD", .handler = do_get_wrapper_fd, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_MANAGE_MARK, .name = "MANAGE_MARK", .handler = do_manage_mark, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_NUKE_EXT4_SYSFS, .name = "NUKE_EXT4_SYSFS", .handler = do_nuke_ext4_sysfs, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_ADD_TRY_UMOUNT, .name = "ADD_TRY_UMOUNT", .handler = add_try_umount, .perm_check = manager_or_root }, + { .cmd = 0, .name = NULL, .handler = NULL, .perm_check = NULL } // Sentinel +}; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +#include +#include + +struct ksu_install_fd_tw { + struct callback_head cb; + int __user *outp; +}; + +static void ksu_install_fd_tw_func(struct callback_head *cb) +{ + struct ksu_install_fd_tw *tw = container_of(cb, struct ksu_install_fd_tw, cb); + int fd = ksu_install_fd(); + pr_info("[%d] install ksu fd: %d\n", current->pid, fd); + + if (copy_to_user(tw->outp, &fd, sizeof(fd))) { + pr_err("install ksu fd 
reply err\n"); + close_fd(fd); + } + + kfree(tw); +} + +static int ksu_handle_fd_request(void __user *arg4) +{ + struct ksu_install_fd_tw *tw; + + tw = kzalloc(sizeof(*tw), GFP_ATOMIC); + if (!tw) + return 0; + + tw->outp = (int __user *)arg4; + tw->cb.func = ksu_install_fd_tw_func; + + if (task_work_add(current, &tw->cb, TWA_RESUME)) { + kfree(tw); + pr_warn("install fd add task_work failed\n"); + } + + return 0; +} +#else +static int ksu_handle_fd_request(void __user *arg4) +{ + int fd = ksu_install_fd(); + pr_info("[%d] install ksu fd: %d\n", current->pid, fd); + + if (copy_to_user(arg4, &fd, sizeof(fd))) { + pr_err("install ksu fd reply err\n"); + close_fd(fd); + } + + return 0; +} +#endif + +// downstream: make sure to pass arg as reference, this can allow us to extend things. +int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg) +{ + + if (magic1 != KSU_INSTALL_MAGIC1) + return 0; + + pr_info("sys_reboot: intercepted call! magic: 0x%x id: %d\n", magic1, magic2); + + // arg4 = (unsigned long)PT_REGS_SYSCALL_PARM4(real_regs); + // downstream: dereference arg as arg4 so we can be inline to upstream + void __user *arg4 = (void __user *)*arg; + + // Check if this is a request to install KSU fd + if (magic2 == KSU_INSTALL_MAGIC2) { + return ksu_handle_fd_request(arg4); + } + + // extensions + u64 reply = (u64)*arg; + + kuid_t current_uid = current_uid(); + if (ksu_get_uid_t(current_uid) != 0) + return 0; + + if (magic2 == CHANGE_MANAGER_UID) { + // only root is allowed for this command + + pr_info("sys_reboot: ksu_set_manager_appid to: %d\n", cmd); + ksu_set_manager_appid(cmd); + + if (cmd == ksu_get_manager_appid()) { + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply))) + pr_info("sys_reboot: reply fail\n"); + } + + return 0; + } + + if (magic2 == GET_SULOG_DUMP_V2) { + int ret = send_sulog_dump(*arg); + if (ret) + return 0; + + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) + return 0; + } + + if 
(magic2 == CHANGE_KSUVER) { + + pr_info("sys_reboot: ksu_change_ksuver to: %d\n", cmd); + ksuver_override = cmd; + + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) + return 0; + } + + // WARNING!!! triple ptr zone! *** + // https://wiki.c2.com/?ThreeStarProgrammer + if (magic2 == CHANGE_SPOOF_UNAME) { + + char release_buf[65]; + char version_buf[65]; + static char original_release_buf[65] = {0}; + static char original_version_buf[65] = {0}; + + // basically void * void __user * void __user *arg + void ***ppptr = (void ***)(uintptr_t)arg; + + // user pointer storage + // init this as zero so this works on 32-on-64 compat (LE) + uint64_t u_pptr = 0; + uint64_t u_ptr = 0; + + pr_info("sys_reboot: ppptr: 0x%lx \n", (uintptr_t)ppptr); + + // arg here is ***, dereference to pull out ** + if (copy_from_user(&u_pptr, (void __user *)*ppptr, sizeof(u_pptr))) + return 0; + + pr_info("sys_reboot: u_pptr: 0x%lx \n", (uintptr_t)u_pptr); + + // now we got the __user ** + // we cannot dereference this as this is __user + // we just do another copy_from_user to get it + if (copy_from_user(&u_ptr, (void __user *)u_pptr, sizeof(u_ptr))) + return 0; + + pr_info("sys_reboot: u_ptr: 0x%lx \n", (uintptr_t)u_ptr); + + // for release + if (strncpy_from_user(release_buf, (char __user *)u_ptr, sizeof(release_buf)) < 0) + return 0; + release_buf[sizeof(release_buf) - 1] = '\0'; + + // for version + if (strncpy_from_user(version_buf, (char __user *)(u_ptr + strlen(release_buf) + 1), sizeof(version_buf)) < 0) + return 0; + version_buf[sizeof(version_buf) - 1] = '\0'; + + if (original_release_buf[0] == '\0') { + struct new_utsname *u_curr = utsname(); + // we save current version as the original before modifying + strncpy(original_release_buf, u_curr->release, sizeof(original_release_buf)); + strncpy(original_version_buf, u_curr->version, sizeof(original_version_buf)); + pr_info("sys_reboot: original uname saved: %s %s\n", original_release_buf, original_version_buf); + } + + // 
so user can reset + if (!strcmp(release_buf, "default")) { + memcpy(release_buf, original_release_buf, sizeof(release_buf)); + } + if (!strcmp(version_buf, "default")) { + memcpy(version_buf, original_version_buf, sizeof(version_buf)); + } + + pr_info("sys_reboot: spoofing kernel to: %s - %s\n", release_buf, version_buf); + + struct new_utsname *u = utsname(); + + down_write(&uts_sem); + strncpy(u->release, release_buf, sizeof(u->release)); + strncpy(u->version, version_buf, sizeof(u->version)); + up_write(&uts_sem); + + // we write our confirmation on ** + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply))) + return 0; + } + + return 0; +} + +void ksu_supercalls_init(void) +{ + int i; + + pr_info("KernelSU IOCTL Commands:\n"); + for (i = 0; ksu_ioctl_handlers[i].handler; i++) { + pr_info(" %-18s = 0x%08x\n", ksu_ioctl_handlers[i].name, ksu_ioctl_handlers[i].cmd); + } + + sulog_init_heap(); // grab heap memory for sulog + +} + +void ksu_supercalls_exit(void){} + +// IOCTL dispatcher +static long anon_ksu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + int i; + + kuid_t current_uid = current_uid(); + +#ifdef CONFIG_KSU_DEBUG + pr_info("ksu ioctl: cmd=0x%x from uid=%d\n", cmd, ksu_get_uid_t(current_uid)); +#endif + + for (i = 0; ksu_ioctl_handlers[i].handler; i++) { + if (cmd == ksu_ioctl_handlers[i].cmd) { + // Check permission first + if (ksu_ioctl_handlers[i].perm_check && + !ksu_ioctl_handlers[i].perm_check()) { + pr_warn("ksu ioctl: permission denied for cmd=0x%x uid=%d\n", + cmd, ksu_get_uid_t(current_uid)); + return -EPERM; + } + // Execute handler + return ksu_ioctl_handlers[i].handler(argp); + } + } + + pr_warn("ksu ioctl: unsupported command 0x%x\n", cmd); + return -ENOTTY; +} + +// File release handler +static int anon_ksu_release(struct inode *inode, struct file *filp) +{ + pr_info("ksu fd released\n"); + return 0; +} + +// File operations structure +static const struct file_operations 
anon_ksu_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = anon_ksu_ioctl, + .compat_ioctl = anon_ksu_ioctl, + .release = anon_ksu_release, +}; + +// Install KSU fd to current process +int ksu_install_fd(void) +{ + struct file *filp; + int fd; + + // Get unused fd + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + pr_err("ksu_install_fd: failed to get unused fd\n"); + return fd; + } + + // Create anonymous inode file + filp = anon_inode_getfile("[ksu_driver]", &anon_ksu_fops, NULL, O_RDWR | O_CLOEXEC); + if (IS_ERR(filp)) { + pr_err("ksu_install_fd: failed to create anon inode file\n"); + put_unused_fd(fd); + return PTR_ERR(filp); + } + + // Install fd + fd_install(fd, filp); + + pr_info("ksu fd installed: %d for pid %d\n", fd, current->pid); + + return fd; +} diff --git a/drivers/kernelsu/supercalls.h b/drivers/kernelsu/supercalls.h new file mode 100644 index 000000000000..388df1eeaed5 --- /dev/null +++ b/drivers/kernelsu/supercalls.h @@ -0,0 +1,166 @@ +#ifndef __KSU_H_SUPERCALLS +#define __KSU_H_SUPERCALLS + +#include +#include +#include "app_profile.h" + +// Magic numbers for reboot hook to install fd +#define KSU_INSTALL_MAGIC1 0xDEADBEEF +#define KSU_INSTALL_MAGIC2 0xCAFEBABE + +// Command structures for ioctl + +struct ksu_become_daemon_cmd { + __u8 token[65]; // Input: daemon token (null-terminated) +}; + +struct ksu_get_info_cmd { + __u32 version; // Output: KERNEL_SU_VERSION + __u32 flags; // Output: flags (bit 0: MODULE mode) + __u32 features; // Output: max feature ID supported +}; + +struct ksu_report_event_cmd { + __u32 event; // Input: EVENT_POST_FS_DATA, EVENT_BOOT_COMPLETED, etc. 
+}; + +struct ksu_set_sepolicy_cmd { + __u64 cmd; // Input: sepolicy command + __aligned_u64 arg; // Input: sepolicy argument pointer +}; + +struct ksu_check_safemode_cmd { + __u8 in_safe_mode; // Output: true if in safe mode, false otherwise +}; + +// deprecated +struct ksu_get_allow_list_cmd { + __u32 uids[128]; // Output: array of allowed/denied UIDs + __u32 count; // Output: number of UIDs in array + __u8 allow; // Input: true for allow list, false for deny list +}; + +struct ksu_new_get_allow_list_cmd { + __u16 count; // Input / Output: number of UIDs in array + __u16 total_count; // Output: total number of UIDs in requested list + __u32 uids[0]; // Output: array of allowed/denied UIDs +}; + +struct ksu_uid_granted_root_cmd { + __u32 uid; // Input: target UID to check + __u8 granted; // Output: true if granted, false otherwise +}; + +struct ksu_uid_should_umount_cmd { + __u32 uid; // Input: target UID to check + __u8 should_umount; // Output: true if should umount, false otherwise +}; + +struct ksu_get_manager_appid_cmd { + __u32 appid; // Output: manager app id +}; + +struct ksu_get_app_profile_cmd { + struct app_profile profile; // Input/Output: app profile structure +}; + +struct ksu_set_app_profile_cmd { + struct app_profile profile; // Input: app profile structure +}; + +struct ksu_get_feature_cmd { + __u32 feature_id; // Input: feature ID (enum ksu_feature_id) + __u64 value; // Output: feature value/state + __u8 supported; // Output: true if feature is supported, false otherwise +}; + +struct ksu_set_feature_cmd { + __u32 feature_id; // Input: feature ID (enum ksu_feature_id) + __u64 value; // Input: feature value/state to set +}; + +struct ksu_get_wrapper_fd_cmd { + __u32 fd; // Input: userspace fd + __u32 flags; // Input: flags of userspace fd +}; + +struct ksu_manage_mark_cmd { + __u32 operation; // Input: KSU_MARK_* + __s32 pid; // Input: target pid (0 for all processes) + __u32 result; // Output: for get operation - mark status or reg_count +}; + 
+#define KSU_MARK_GET 1 +#define KSU_MARK_MARK 2 +#define KSU_MARK_UNMARK 3 +#define KSU_MARK_REFRESH 4 + +struct ksu_nuke_ext4_sysfs_cmd { + __aligned_u64 arg; // Input: mnt pointer +}; + +struct ksu_add_try_umount_cmd { + __aligned_u64 arg; // char ptr, this is the mountpoint + __u32 flags; // this is the flag we use for it + __u8 mode; // denotes what to do with it 0:wipe_list 1:add_to_list 2:delete_entry +}; + +#define KSU_UMOUNT_WIPE 0 // ignore everything and wipe list +#define KSU_UMOUNT_ADD 1 // add entry (path + flags) +#define KSU_UMOUNT_DEL 2 // delete entry, strcmp + + + +// IOCTL command definitions +#define KSU_IOCTL_GRANT_ROOT _IOC(_IOC_NONE, 'K', 1, 0) +#define KSU_IOCTL_GET_INFO _IOC(_IOC_READ, 'K', 2, 0) +#define KSU_IOCTL_REPORT_EVENT _IOC(_IOC_WRITE, 'K', 3, 0) +#define KSU_IOCTL_SET_SEPOLICY _IOC(_IOC_READ|_IOC_WRITE, 'K', 4, 0) +#define KSU_IOCTL_CHECK_SAFEMODE _IOC(_IOC_READ, 'K', 5, 0) +// deprecated +#define KSU_IOCTL_GET_ALLOW_LIST _IOC(_IOC_READ|_IOC_WRITE, 'K', 6, 0) +// deprecated +#define KSU_IOCTL_GET_DENY_LIST _IOC(_IOC_READ|_IOC_WRITE, 'K', 7, 0) +#define KSU_IOCTL_NEW_GET_ALLOW_LIST _IOWR('K', 6, struct ksu_new_get_allow_list_cmd) +#define KSU_IOCTL_NEW_GET_DENY_LIST _IOWR('K', 7, struct ksu_new_get_allow_list_cmd) +#define KSU_IOCTL_UID_GRANTED_ROOT _IOC(_IOC_READ|_IOC_WRITE, 'K', 8, 0) +#define KSU_IOCTL_UID_SHOULD_UMOUNT _IOC(_IOC_READ|_IOC_WRITE, 'K', 9, 0) +#define KSU_IOCTL_GET_MANAGER_APPID _IOC(_IOC_READ, 'K', 10, 0) +#define KSU_IOCTL_GET_APP_PROFILE _IOC(_IOC_READ|_IOC_WRITE, 'K', 11, 0) +#define KSU_IOCTL_SET_APP_PROFILE _IOC(_IOC_WRITE, 'K', 12, 0) +#define KSU_IOCTL_GET_FEATURE _IOC(_IOC_READ|_IOC_WRITE, 'K', 13, 0) +#define KSU_IOCTL_SET_FEATURE _IOC(_IOC_WRITE, 'K', 14, 0) +#define KSU_IOCTL_GET_WRAPPER_FD _IOC(_IOC_WRITE, 'K', 15, 0) +#define KSU_IOCTL_MANAGE_MARK _IOC(_IOC_READ|_IOC_WRITE, 'K', 16, 0) +#define KSU_IOCTL_NUKE_EXT4_SYSFS _IOC(_IOC_WRITE, 'K', 17, 0) +#define KSU_IOCTL_ADD_TRY_UMOUNT _IOC(_IOC_WRITE, 
'K', 18, 0) + +// IOCTL handler types +typedef int (*ksu_ioctl_handler_t)(void __user *arg); +typedef bool (*ksu_perm_check_t)(void); + +// IOCTL command mapping +struct ksu_ioctl_cmd_map { + unsigned int cmd; + const char *name; + ksu_ioctl_handler_t handler; + ksu_perm_check_t perm_check; // Permission check function +}; + +// Install KSU fd to current process +int ksu_install_fd(void); + +void ksu_supercalls_init(void); +void ksu_supercalls_exit(void); + +// extensions +#define CHANGE_MANAGER_UID 10006 +#define KSU_UMOUNT_GETSIZE 107 // get list size // shit is u8 we cant fit 10k+ on it +#define KSU_UMOUNT_GETLIST 108 // get list +#define GET_SULOG_DUMP 10009 // get sulog dump, max, last 100 escalations +#define GET_SULOG_DUMP_V2 10010 // get sulog dump, timestamped, last 250 escalations +#define CHANGE_KSUVER 10011 // change ksu version +#define CHANGE_SPOOF_UNAME 10012 // spoof uname + +#endif // __KSU_H_SUPERCALLS diff --git a/drivers/kernelsu/syscall_table_hook.c b/drivers/kernelsu/syscall_table_hook.c new file mode 100644 index 000000000000..cebb94fdb211 --- /dev/null +++ b/drivers/kernelsu/syscall_table_hook.c @@ -0,0 +1,523 @@ +#include + +#ifndef CONFIG_ARM64 +#error "only meant for ARM64" +#endif + +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/include/uapi/asm-generic/unistd.h +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd32.h +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd.h + +#define FORCE_VOLATILE(x) *(volatile typeof(x) *)&(x) + +#define __AARCH64_reboot 142 +#define __AARCH64_execve 221 +#define __AARCH64_faccessat 48 +#define __AARCH64_newfstatat 79 +#define __AARCH64_newfstat 80 + +// NOTE: CONFIG_COMPAT implies __ARCH_WANT_COMPAT_STAT64 (fstatat64, fstat64) +#define __ARMEABI_reboot 88 +#define __ARMEABI_execve 11 +#define __ARMEABI_faccessat 334 +#define __ARMEABI_fstatat64 327 +#define __ARMEABI_fstat64 197 + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 
19, 0) + +// on 4.19+ it is no longer just a void *sys_call_table[] +// it becomes syscall_fn_t sys_call_table[]; + +static syscall_fn_t aarch64_reboot = NULL; +static long hook_aarch64_reboot(const struct pt_regs *regs) +{ + int magic1 = (int)regs->regs[0]; + int magic2 = (int)regs->regs[1]; + unsigned int cmd = (unsigned int)regs->regs[2]; + void __user **arg = (void __user **)&regs->regs[3]; + + ksu_handle_sys_reboot(magic1, magic2, cmd, arg); + return aarch64_reboot(regs); +} + +static syscall_fn_t aarch64_execve = NULL; +static long hook_aarch64_execve(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)&regs->regs[0]; + + ksu_handle_execve_sucompat(NULL, filename, NULL, NULL, NULL); + return aarch64_execve(regs); +} + +static syscall_fn_t aarch64_faccessat = NULL; +static long hook_aarch64_faccessat(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)&regs->regs[1]; + + ksu_handle_faccessat(NULL, filename, NULL, NULL); + return aarch64_faccessat(regs); +} + +static syscall_fn_t aarch64_newfstatat = NULL; +static long hook_aarch64_newfstatat(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)&regs->regs[1]; + + ksu_handle_stat(NULL, filename, NULL); + return aarch64_newfstatat(regs); +} + +static syscall_fn_t aarch64_newfstat = NULL; +static long hook_aarch64_newfstat_ret(const struct pt_regs *regs) +{ + // we handle it like rp + unsigned int *fd = (unsigned int *)&regs->regs[0]; + struct stat __user **statbuf = (struct stat __user **)&regs->regs[1]; + + long ret = aarch64_newfstat(regs); + ksu_handle_newfstat_ret(fd, statbuf); + return ret; +} + +#ifdef CONFIG_COMPAT +static syscall_fn_t armeabi_reboot = NULL; +static long hook_armeabi_reboot(const struct pt_regs *regs) +{ + int magic1 = (int)regs->regs[0]; + int magic2 = (int)regs->regs[1]; + unsigned int cmd = (unsigned int)regs->regs[2]; + void __user **arg = (void __user **)&regs->regs[3]; + + ksu_handle_sys_reboot(magic1,
magic2, cmd, arg); + return armeabi_reboot(regs); +} + +static syscall_fn_t armeabi_execve = NULL; +static long hook_armeabi_execve(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)&regs->regs[0]; + + ksu_handle_execve_sucompat(NULL, filename, NULL, NULL, NULL); + return armeabi_execve(regs); +} + +static syscall_fn_t armeabi_faccessat = NULL; +static long hook_armeabi_faccessat(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)&regs->regs[1]; + + ksu_handle_faccessat(NULL, filename, NULL, NULL); + return armeabi_faccessat(regs); +} + +static syscall_fn_t armeabi_fstatat64 = NULL; +static long hook_armeabi_fstatat64(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)&regs->regs[1]; + + ksu_handle_stat(NULL, filename, NULL); + return armeabi_fstatat64(regs); +} + +static syscall_fn_t armeabi_fstat64 = NULL; +static long hook_armeabi_fstat64_ret(const struct pt_regs *regs) +{ + // we handle it like rp + unsigned long *fd = (unsigned long *)&regs->regs[0]; + struct stat64 __user **statbuf = (struct stat64 __user **)&regs->regs[1]; + + long ret = armeabi_fstat64(regs); + ksu_handle_fstat64_ret(fd, statbuf); + return ret; +} +#endif // CONFIG_COMPAT + +#else // END OF 4.19+ SYSCALL HANDLERS + +static long (*aarch64_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) = NULL; +static long hook_aarch64_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) +{ + ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); + return aarch64_reboot(magic1, magic2, cmd, arg); +} + +static long (*aarch64_execve)(const char __user * filename, + const char __user *const __user * argv, + const char __user *const __user * envp) = NULL; +static long hook_aarch64_execve(const char __user * filename, + const char __user *const __user * argv, + const char __user *const __user * envp) +{ + ksu_handle_execve_sucompat((int *)AT_FDCWD, &filename, NULL, NULL, NULL); + return
aarch64_execve(filename, argv, envp); +} + +static long (*aarch64_faccessat)(int dfd, const char __user * filename, int mode) = NULL; +static long hook_aarch64_faccessat(int dfd, const char __user * filename, int mode) +{ + ksu_handle_faccessat(&dfd, &filename, &mode, NULL); + return aarch64_faccessat(dfd, filename, mode); +} + +static long (*aarch64_newfstatat)(int dfd, const char __user * filename, struct stat __user * statbuf, int flag) = NULL; +static long hook_aarch64_newfstatat(int dfd, const char __user * filename, struct stat __user * statbuf, int flag) +{ + ksu_handle_stat(&dfd, &filename, &flag); + return aarch64_newfstatat(dfd, filename, statbuf, flag); +} + +static long (*aarch64_newfstat)(unsigned int fd, struct stat __user * statbuf) = NULL; +static long hook_aarch64_newfstat_ret(unsigned int fd, struct stat __user * statbuf) +{ + // we handle it like rp + long ret = aarch64_newfstat(fd, statbuf); + ksu_handle_newfstat_ret(&fd, &statbuf); + return ret; +} + +#ifdef CONFIG_COMPAT +extern const void *compat_sys_call_table[]; + +static long (*armeabi_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) = NULL; +static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) +{ + ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); + return armeabi_reboot(magic1, magic2, cmd, arg); +} + +static long (*armeabi_execve)(const char __user * filename, + const compat_uptr_t __user * argv, + const compat_uptr_t __user * envp) = NULL; +static long hook_armeabi_execve(const char __user * filename, + const compat_uptr_t __user * argv, + const compat_uptr_t __user * envp) +{ + ksu_handle_execve_sucompat(NULL, &filename, NULL, NULL, NULL); + return armeabi_execve(filename, argv, envp); +} + +static long (*armeabi_faccessat)(int dfd, const char __user * filename, int mode) = NULL; +static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode) +{ + ksu_handle_faccessat(&dfd, &filename, &mode, NULL); + return 
armeabi_faccessat(dfd, filename, mode); +} + +static long (*armeabi_fstatat64)(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) = NULL; +static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) +{ + ksu_handle_stat(&dfd, &filename, &flag); + return armeabi_fstatat64(dfd, filename, statbuf, flag); +} + +static long (*armeabi_fstat64)(unsigned long fd, struct stat64 __user * statbuf) = NULL; +static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf) +{ + // we handle it like rp + long ret = armeabi_fstat64(fd, statbuf); + ksu_handle_fstat64_ret(&fd, &statbuf); + return ret; +} +#endif // CONFIG_COMPAT + +#endif // SYSCALL HANDLERS + +// 'vmapping for writable' idea copied from upstream's LSM_HOOK_HACK, override_security_head +// no more "Unable to handle kernel write to read-only memory at virtual address ffffffuckyou" + +// WARNING!!! void * abuse ahead! (type-punning, pointer-hiding!) 
+// for 4.19+ old_ptr is actually syscall_fn_t *, which is just long * so we can consider this void ** +// for 4.19- old_ptr is actually void ** +// target_table is void *target_table[]; +static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) +{ + void **sctable = (void **)target_table; + void **syscall_slot_addr = &sctable[syscall_nr]; + + if (!*syscall_slot_addr) + return; + + pr_info("%s: hooking syscall #%d at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr); + + /* + * basically the trick is + * addr, say 0xffff1234, this is READ-ONLY + * align it, 0xffff0000 + * ptrdiff 0xffff1234 - 0xffff0000, 0x00001234 + * vmap 0xffff0000, say we get 0xcccc0000 , now WRITABLE + * write on 0xcccc0000 + 0x00001234 + * + */ + + // prep vmap alias + unsigned long addr = (unsigned long)syscall_slot_addr; + unsigned long base = addr & PAGE_MASK; + unsigned long offset = addr & ~PAGE_MASK; // offset_in_page + + // this is impossible for our case because the page alignment + // but be careful for other cases! + // BUG_ON(offset + len > PAGE_SIZE); + if (offset + sizeof(void *) > PAGE_SIZE) { + pr_info("%s: syscall slot crosses page boundary! 
aborting.\n", __func__); + return; + } + + // virtual mapping of a physical page + struct page *page = phys_to_page(__pa(base)); + if (!page) + return; + + // create a "writabel address" which is mapped to teh same address + void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); + if (!writable_addr) + return; + + // swap on the alias + void **target_slot = (void **)((unsigned long)writable_addr + offset); + + preempt_disable(); + local_irq_disable(); + + *(void **)old_ptr = *target_slot; + + *target_slot = new_ptr; + smp_mb(); // ^^ + + local_irq_enable(); + preempt_enable(); + + vunmap(writable_addr); + + smp_mb(); +} + +static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) +{ + void **sctable = (void **)target_table; + void **syscall_slot_addr = &sctable[syscall_nr]; + + if (!*syscall_slot_addr) + return; + + /* + * we do this to make sure that old_ptr is filled. + * we risk a dead syscall !!! + * if read_and_replace failed or we restore again, it wont be pointing to anything + * it just copies wordsize of whatever is in *old_ptr, it should fill up a wordzie atleast + * yeah it really just dummy copies machine instructions at this point. + * + * normally we use probe_kernel_address / get_kernel_nofault here but the API is + * so inconsistent across kernel versions, and since its just a dummied wrapper + * for copy_from_kernel_nofault we can do it ourselves + * + */ + + long dummy = 0; + if (copy_from_kernel_nofault((void *)&dummy, *(void **)old_ptr, sizeof(long))) + return; + + pr_info("%s: restore syscall #%d at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr); + + // prep vmap alias + unsigned long addr = (unsigned long)syscall_slot_addr; + unsigned long base = addr & PAGE_MASK; + unsigned long offset = addr & ~PAGE_MASK; // offset_in_page + + // this is impossible for our case because the page alignment + // but be careful for other cases! 
+ // BUG_ON(offset + len > PAGE_SIZE); + if (offset + sizeof(void *) > PAGE_SIZE) { + pr_info("%s: syscall slot crosses page boundary! aborting.\n", __func__); + return; + } + + // virtual mapping of a physical page + struct page *page = phys_to_page(__pa(base)); + if (!page) + return; + + // create a "writabel address" which is mapped to teh same address + void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); + if (!writable_addr) + return; + + // swap on the alias + void **target_slot = (void **)((unsigned long)writable_addr + offset); + + // check if its ours + if (*target_slot != new_ptr) { + pr_info("%s: syscall is not ours!\n", __func__); + goto out; + } + + pr_info("%s: syscall is ours! *target_slot: 0x%lx new_ptr: 0x%lx\n", __func__, (long)*target_slot, (long)new_ptr ); + + preempt_disable(); + local_irq_disable(); + + *target_slot = *(void **)old_ptr; + smp_mb(); // ^^ + + *(void **)old_ptr = NULL; // explicit reset + + local_irq_enable(); + preempt_enable(); + +out: + vunmap(writable_addr); + + smp_mb(); +} + +static int ksu_syscall_table_restore() +{ +loop_start: + + msleep(1000); + + if (FORCE_VOLATILE(ksu_vfs_read_hook)) + goto loop_start; + +#ifndef CONFIG_KSU_KPROBES_KSUD + restore_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table); + +#if defined(CONFIG_COMPAT) + restore_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table); +#endif +#endif + + return 0; +} + +static void vfs_read_hook_wait_thread() +{ + kthread_run(ksu_syscall_table_restore, NULL, "unhook"); +} + +static void ksu_syscall_table_hook_init() +{ + read_and_replace_syscall((void *)&aarch64_reboot, __AARCH64_reboot, (void *)hook_aarch64_reboot, (void *)sys_call_table); + read_and_replace_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, (void *)sys_call_table); + read_and_replace_syscall((void *)&aarch64_faccessat, 
__AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table); + read_and_replace_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table); + +#ifndef CONFIG_KSU_KPROBES_KSUD + read_and_replace_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table); +#endif + +#if defined(CONFIG_COMPAT) + read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)compat_sys_call_table); + read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)compat_sys_call_table); + read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)compat_sys_call_table); + read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table); + +#ifndef CONFIG_KSU_KPROBES_KSUD + read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table); +#endif + +#endif // COMPAT + + vfs_read_hook_wait_thread(); // start unreg kthread +} + +static void syscall_table_sucompat_enable() +{ + read_and_replace_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, (void *)sys_call_table); + read_and_replace_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table); + read_and_replace_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table); + +#if defined(CONFIG_COMPAT) + read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)compat_sys_call_table); + read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)compat_sys_call_table); + read_and_replace_syscall((void 
*)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table); +#endif + +} + +static void syscall_table_sucompat_disable() +{ + restore_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, (void *)sys_call_table); + restore_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table); + restore_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table); + +#if defined(CONFIG_COMPAT) + restore_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)compat_sys_call_table); + restore_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)compat_sys_call_table); + restore_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table); +#endif + +} + +// EOF + +#if 0 // these are kept for posterity +static int override_security_head(void *head, const void *new_head, size_t len) +{ + unsigned long base = (unsigned long)head & PAGE_MASK; + unsigned long offset = offset_in_page(head); + + // this is impossible for our case because the page alignment + // but be careful for other cases! + BUG_ON(offset + len > PAGE_SIZE); + struct page *page = phys_to_page(__pa(base)); + if (!page) { + return -EFAULT; + } + + void *addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); + if (!addr) { + return -ENOMEM; + } + local_irq_disable(); + memcpy(addr + offset, new_head, len); + local_irq_enable(); + vunmap(addr); + return 0; +} + +// normally backported on msm 3.10, provide weak +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +__weak int set_memory_ro(unsigned long addr, int numpages) { return 0; } +__weak int set_memory_rw(unsigned long addr, int numpages) { return 0; } +#endif + +// WARNING!!! void * abuse ahead! (type-punning, pointer-hiding!) 
+// old_ptr is actually void ** +// target_table is void *target_table[]; +static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) +{ + // *old_ptr = READ_ONCE(*((void **)sys_call_table + syscall_nr)); + // WRITE_ONCE(*((void **)sys_call_table + syscall_nr), new_ptr); + + // the one from zx2c4 looks like above, but the issue is that we dont have + // READ_ONCE and WRITE_ONCE on 3.x kernels, here we just force volatile everything + // since those are actually just forced-aligned-volatile-rw + + // void **syscall_addr = (void **)(sys_call_table + syscall_nr); + // sugar: *(a + b) == a[b]; , a + b == &a[b]; + + void **sctable = (void **)target_table; + void **syscall_addr = (void **)&sctable[syscall_nr]; + + // dont hook non-existing syscall + if (!FORCE_VOLATILE(*syscall_addr)) + return; + + pr_info("%s: syscall: #%d slot: 0x%lx new_ptr: 0x%lx \n", __func__, syscall_nr, *(long *)syscall_addr, (long)new_ptr); + + set_memory_rw(((unsigned long)syscall_addr & PAGE_MASK), 1); + + barrier(); + *(void **)old_ptr = FORCE_VOLATILE(*syscall_addr); + + barrier(); + preempt_disable(); + FORCE_VOLATILE(*syscall_addr) = new_ptr; + preempt_enable(); + + set_memory_ro(((unsigned long)syscall_addr & PAGE_MASK), 1); + smp_mb(); + + return; +} +#endif diff --git a/drivers/kernelsu/syscall_table_hook_arm.c b/drivers/kernelsu/syscall_table_hook_arm.c new file mode 100644 index 000000000000..9ae4ff6aaf7e --- /dev/null +++ b/drivers/kernelsu/syscall_table_hook_arm.c @@ -0,0 +1,320 @@ +#include +#include + +#ifndef CONFIG_ARM +#error "only meant for ARM" +#endif + +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/include/uapi/asm-generic/unistd.h +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd32.h +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd.h + +#define FORCE_VOLATILE(x) *(volatile typeof(x) *)&(x) + +#define __ARMEABI_reboot 88 +#define 
__ARMEABI_execve 11 +#define __ARMEABI_faccessat 334 +#define __ARMEABI_fstatat64 327 +#define __ARMEABI_fstat64 197 + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) + +// on 4.19+ its is no longer just a void *sys_call_table[] +// it becomes syscall_fn_t sys_call_table[]; + +static syscall_fn_t armeabi_reboot = NULL; +static long hook_armeabi_reboot(const struct pt_regs *regs) +{ + int magic1 = (int)regs->regs[0]; + int magic2 = (int)regs->regs[1]; + unsigned int cmd = (unsigned int)regs->regs[2]; + void __user **arg = (void __user **)®s->regs[3]; + + ksu_handle_sys_reboot(magic1, magic2, cmd, arg); + return armeabi_reboot(regs); +} + +static syscall_fn_t armeabi_execve = NULL; +static long hook_armeabi_execve(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[0]; + + ksu_handle_execve_sucompat(NULL, filename, NULL, NULL, NULL); + return armeabi_execve(regs); +} + +static syscall_fn_t armeabi_faccessat = NULL; +static long hook_armeabi_faccessat(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[1]; + + ksu_handle_faccessat(NULL, filename, NULL, NULL); + return armeabi_faccessat(regs); +} + +static syscall_fn_t armeabi_fstatat64 = NULL; +static long hook_armeabi_fstatat64(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[1]; + + ksu_handle_stat(NULL, filename, NULL); + return armeabi_fstatat64(regs); +} + +static syscall_fn_t armeabi_fstat64 = NULL; +static long hook_armeabi_fstat64_ret(const struct pt_regs *regs) +{ + // we handle it like rp + unsigned long *fd = (unsigned long *)®s->regs[0]; + struct stat64 __user **statbuf = (struct stat64 __user **)®s->regs[1]; + + long ret = armeabi_fstat64(regs); + ksu_handle_fstat64_ret(fd, statbuf); + return ret; +} + +#else // END OF 4.19+ SYSCALL HANDLERS + +static long (*armeabi_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) = NULL; +static long 
hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) +{ + ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); + return armeabi_reboot(magic1, magic2, cmd, arg); +} + +static long (*armeabi_execve)(const char __user * filename, + const char __user *const __user * argv, + const char __user *const __user * envp) = NULL; +static long hook_armeabi_execve(const char __user * filename, + const char __user *const __user * argv, + const char __user *const __user * envp) +{ + ksu_handle_execve_sucompat(NULL, &filename, NULL, NULL, NULL); + return armeabi_execve(filename, argv, envp); +} + +static long (*armeabi_faccessat)(int dfd, const char __user * filename, int mode) = NULL; +static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode) +{ + ksu_handle_faccessat(&dfd, &filename, &mode, NULL); + return armeabi_faccessat(dfd, filename, mode); +} + +static long (*armeabi_fstatat64)(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) = NULL; +static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) +{ + ksu_handle_stat(&dfd, &filename, &flag); + return armeabi_fstatat64(dfd, filename, statbuf, flag); +} + +static long (*armeabi_fstat64)(unsigned long fd, struct stat64 __user * statbuf) = NULL; +static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf) +{ + // we handle it like rp + long ret = armeabi_fstat64(fd, statbuf); + ksu_handle_fstat64_ret(&fd, &statbuf); + return ret; +} + + +#endif // SYSCALL HANDLERS + +// 'vmapping for writable' idea copied from upstream's LSM_HOOK_HACK, override_security_head +// no more "Unable to handle kernel write to read-only memory at virtual address ffffffuckyou" + +// WARNING!!! void * abuse ahead! (type-punning, pointer-hiding!) 
+// for 4.19+ old_ptr is actually syscall_fn_t *, which is just long * so we can consider this void ** +// for 4.19- old_ptr is actually void ** +// target_table is void *target_table[]; +static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) +{ + void **sctable = (void **)target_table; + void **syscall_slot_addr = &sctable[syscall_nr]; + + if (!*syscall_slot_addr) + return; + + pr_info("%s: hooking syscall #%d at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr); + + /* + * basically the trick is + * addr, say 0xffff1234, this is READ-ONLY + * align it, 0xffff0000 + * ptrdiff 0xffff1234 - 0xffff0000, 0x00001234 + * vmap 0xffff0000, say we get 0xcccc0000 , now WRITABLE + * write on 0xcccc0000 + 0x00001234 + * + */ + + // prep vmap alias + unsigned long addr = (unsigned long)syscall_slot_addr; + unsigned long base = addr & PAGE_MASK; + unsigned long offset = addr & ~PAGE_MASK; // offset_in_page + + // this is impossible for our case because the page alignment + // but be careful for other cases! + // BUG_ON(offset + len > PAGE_SIZE); + if (offset + sizeof(void *) > PAGE_SIZE) { + pr_info("%s: syscall slot crosses page boundary! 
aborting.\n", __func__); + return; + } + + // virtual mapping of a physical page + struct page *page = phys_to_page(__pa(base)); + if (!page) + return; + + // create a "writabel address" which is mapped to teh same address + void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); + if (!writable_addr) + return; + + // swap on the alias + void **target_slot = (void **)((unsigned long)writable_addr + offset); + + preempt_disable(); + local_irq_disable(); + + *(void **)old_ptr = *target_slot; + + *target_slot = new_ptr; + smp_mb(); // ^^ + + local_irq_enable(); + preempt_enable(); + + vunmap(writable_addr); + + smp_mb(); +} + +static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) +{ + void **sctable = (void **)target_table; + void **syscall_slot_addr = &sctable[syscall_nr]; + + if (!*syscall_slot_addr) + return; + + /* + * we do this to make sure that old_ptr is filled. + * we risk a dead syscall !!! + * if read_and_replace failed or we restore again, it wont be pointing to anything + * it just copies wordsize of whatever is in *old_ptr, it should fill up a wordzie atleast + * yeah it really just dummy copies machine instructions at this point. + * + * normally we use probe_kernel_address / get_kernel_nofault here but the API is + * so inconsistent across kernel versions, and since its just a dummied wrapper + * for copy_from_kernel_nofault we can do it ourselves + * + */ + + long dummy = 0; + if (copy_from_kernel_nofault((void *)&dummy, *(void **)old_ptr, sizeof(long))) + return; + + pr_info("%s: restore syscall #%d at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr); + + // prep vmap alias + unsigned long addr = (unsigned long)syscall_slot_addr; + unsigned long base = addr & PAGE_MASK; + unsigned long offset = addr & ~PAGE_MASK; // offset_in_page + + // this is impossible for our case because the page alignment + // but be careful for other cases! 
+ // BUG_ON(offset + len > PAGE_SIZE); + if (offset + sizeof(void *) > PAGE_SIZE) { + pr_info("%s: syscall slot crosses page boundary! aborting.\n", __func__); + return; + } + + // virtual mapping of a physical page + struct page *page = phys_to_page(__pa(base)); + if (!page) + return; + + // create a "writabel address" which is mapped to teh same address + void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); + if (!writable_addr) + return; + + // swap on the alias + void **target_slot = (void **)((unsigned long)writable_addr + offset); + + // check if its ours + if (*target_slot != new_ptr) { + pr_info("%s: syscall is not ours!\n", __func__); + goto out; + } + + pr_info("%s: syscall is ours! *target_slot: 0x%lx new_ptr: 0x%lx\n", __func__, (long)*target_slot, (long)new_ptr ); + + preempt_disable(); + local_irq_disable(); + + *target_slot = *(void **)old_ptr; + smp_mb(); // ^^ + + *(void **)old_ptr = NULL; // explicit reset + + local_irq_enable(); + preempt_enable(); + +out: + vunmap(writable_addr); + + smp_mb(); +} + +static int ksu_syscall_table_restore() +{ +loop_start: + + msleep(1000); + + if (FORCE_VOLATILE(ksu_vfs_read_hook)) + goto loop_start; + +#ifndef CONFIG_KSU_KPROBES_KSUD + restore_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); +#endif + + return 0; +} + +static void vfs_read_hook_wait_thread() +{ + kthread_run(ksu_syscall_table_restore, NULL, "unhook"); +} + +static void ksu_syscall_table_hook_init() +{ + + read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)sys_call_table); + read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table); + read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table); + read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void 
*)hook_armeabi_fstatat64, (void *)sys_call_table); + +#ifndef CONFIG_KSU_KPROBES_KSUD + read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); +#endif + + vfs_read_hook_wait_thread(); // start unreg kthread +} + +static void syscall_table_sucompat_enable() +{ + + read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table); + read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table); + read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table); +} + +static void syscall_table_sucompat_disable() +{ + restore_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table); + restore_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table); + restore_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table); +} + +// EOF diff --git a/drivers/kernelsu/throne_tracker.c b/drivers/kernelsu/throne_tracker.c new file mode 100644 index 000000000000..6a2503fd945b --- /dev/null +++ b/drivers/kernelsu/throne_tracker.c @@ -0,0 +1,415 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +uid_t ksu_manager_appid = KSU_INVALID_APPID; + +static struct task_struct *throne_thread = NULL; +#define SYSTEM_PACKAGES_LIST_PATH "/data/system/packages.list" + +struct uid_data { + struct list_head list; + u32 uid; + char package[KSU_MAX_PACKAGE_NAME]; +}; + +static void crown_manager(const char *apk, struct list_head *uid_data) +{ + char pkg[KSU_MAX_PACKAGE_NAME]; + if (get_pkg_from_apk_path(pkg, apk) < 0) { + pr_err("Failed to get package name from apk path: %s\n", apk); + return; + } + + pr_info("manager pkg: %s\n", pkg); + + 
struct list_head *list = (struct list_head *)uid_data; + struct uid_data *np; + + list_for_each_entry (np, list, list) { + if (strncmp(np->package, pkg, KSU_MAX_PACKAGE_NAME) == 0) { + pr_info("Crowning manager: %s(uid=%d)\n", pkg, np->uid); + ksu_set_manager_appid(np->uid); + break; + } + } +} + +#define DATA_PATH_LEN 384 // 384 is enough for /data/app//base.apk + +struct data_path { + char dirpath[DATA_PATH_LEN]; + int depth; + struct list_head list; +}; + +struct apk_path_hash { + unsigned int hash; + bool exists; + struct list_head list; +}; + +struct my_dir_context { + struct dir_context ctx; + struct list_head *data_path_list; + char *parent_dir; + void *private_data; + int depth; + int *stop; +}; +// https://docs.kernel.org/filesystems/porting.html +// filldir_t (readdir callbacks) calling conventions have changed. Instead of returning 0 or -E... it returns bool now. false means "no more" (as -E... used to) and true - "keep going" (as 0 in old calling conventions). Rationale: callers never looked at specific -E... values anyway. -> iterate_shared() instances require no changes at all, all filldir_t ones in the tree converted. 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) +#define FILLDIR_RETURN_TYPE bool +#define FILLDIR_ACTOR_CONTINUE true +#define FILLDIR_ACTOR_STOP false +#else +#define FILLDIR_RETURN_TYPE int +#define FILLDIR_ACTOR_CONTINUE 0 +#define FILLDIR_ACTOR_STOP -EINVAL +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) +#define MY_ACTOR_CTX_ARG struct dir_context *ctx +#else +#define MY_ACTOR_CTX_ARG void *ctx_void +#endif + +extern bool is_manager_apk(char *path); +FILLDIR_RETURN_TYPE my_actor(MY_ACTOR_CTX_ARG, const char *name, + int namelen, loff_t off, u64 ino, + unsigned int d_type) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) + // then pull it out of the void + struct dir_context *ctx = (struct dir_context *)ctx_void; +#endif + struct my_dir_context *my_ctx = + container_of(ctx, struct my_dir_context, ctx); + + // we put the apk path we collected here + char *candidate_path = (char *)my_ctx->private_data; + + char dirpath[DATA_PATH_LEN]; + + if (!my_ctx) { + pr_err("Invalid context\n"); + return FILLDIR_ACTOR_STOP; + } + if (my_ctx->stop && *my_ctx->stop) { + pr_info("Stop searching\n"); + return FILLDIR_ACTOR_STOP; + } + + if (!strncmp(name, "..", namelen) || !strncmp(name, ".", namelen)) + return FILLDIR_ACTOR_CONTINUE; // Skip "." and ".." 
+ + if (d_type == DT_DIR && namelen >= 8 && !strncmp(name, "vmdl", 4) && + !strncmp(name + namelen - 4, ".tmp", 4)) { + pr_info("Skipping directory: %.*s\n", namelen, name); + return FILLDIR_ACTOR_CONTINUE; // Skip staging package + } + + if (snprintf(dirpath, DATA_PATH_LEN, "%s/%.*s", my_ctx->parent_dir, + namelen, name) >= DATA_PATH_LEN) { + pr_err("Path too long: %s/%.*s\n", my_ctx->parent_dir, namelen, + name); + return FILLDIR_ACTOR_CONTINUE; + } + + if (d_type == DT_DIR && my_ctx->depth > 0 && + (my_ctx->stop && !*my_ctx->stop)) { + struct data_path *data = kzalloc(sizeof(struct data_path), GFP_ATOMIC); + + if (!data) { + pr_err("Failed to allocate memory for %s\n", dirpath); + return FILLDIR_ACTOR_CONTINUE; + } + + strncpy(data->dirpath, dirpath, DATA_PATH_LEN - 1 ); + data->depth = my_ctx->depth - 1; + list_add_tail(&data->list, my_ctx->data_path_list); + + return FILLDIR_ACTOR_CONTINUE; + } + + // now put this on candidate_path + if (d_type == DT_REG && !strncmp(name, "base.apk", 8)) { + snprintf(candidate_path, DATA_PATH_LEN, "%s/%.*s", my_ctx->parent_dir, namelen, name); + } + + return FILLDIR_ACTOR_CONTINUE; +} + +// compat: https://elixir.bootlin.com/linux/v3.9/source/include/linux/fs.h#L771 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) +#define S_MAGIC_COMPAT(x) ((x)->f_inode->i_sb->s_magic) +#else +#define S_MAGIC_COMPAT(x) ((x)->f_path.dentry->d_inode->i_sb->s_magic) +#endif + +void search_manager(const char *path, int depth, struct list_head *uid_data) +{ + int i, stop = 0; + struct list_head data_path_list; + INIT_LIST_HEAD(&data_path_list); + unsigned long data_app_magic = 0; + + // First depth + struct data_path data = { }; + strncpy(data.dirpath, path, DATA_PATH_LEN - 1 ); + data.depth = depth; + list_add_tail(&data.list, &data_path_list); + + // we put the apk path we collected here + char candidate_path[DATA_PATH_LEN]; + + for (i = depth; i >= 0; i--) { + struct data_path *pos, *n; + + list_for_each_entry_safe(pos, n, &data_path_list, 
list) { + struct my_dir_context ctx = { .ctx.actor = my_actor, + .data_path_list = &data_path_list, + .parent_dir = pos->dirpath, + .private_data = candidate_path, + .depth = pos->depth, + .stop = &stop }; + + // make sure to clean buffer on every iteration + memset(candidate_path, 0, DATA_PATH_LEN); + + if (stop) + goto skip_iterate; + + struct file *file = ksu_filp_open_compat(pos->dirpath, O_RDONLY | O_NOFOLLOW | O_DIRECTORY, 0); + if (IS_ERR(file)) { + pr_err("Failed to open directory: %s, err: %ld\n", pos->dirpath, PTR_ERR(file)); + goto skip_iterate; + } + + // grab magic on first folder, which is /data/app + if (!data_app_magic) { + if (S_MAGIC_COMPAT(file)) { + data_app_magic = S_MAGIC_COMPAT(file); + pr_info("%s: dir: %s got magic! 0x%lx\n", __func__, pos->dirpath, data_app_magic); + } else { + filp_close(file, NULL); + goto skip_iterate; + } + } + + if (S_MAGIC_COMPAT(file) != data_app_magic) { + pr_info("%s: skip: %s magic: 0x%lx expected: 0x%lx\n", __func__, pos->dirpath, S_MAGIC_COMPAT(file), data_app_magic); + filp_close(file, NULL); + goto skip_iterate; + } + + iterate_dir(file, &ctx.ctx); + filp_close(file, NULL); + + // ^ oh so thats the issue! 
+ // we were calling is_manager_apk inside iterate_dir + // now we defer file opens after iterate_dir + // this way we dont open apks while inside that + if (!strstarts(candidate_path, "/data/ap") ) + goto skip_iterate; + + bool is_manager = is_manager_apk(candidate_path); + pr_info("Found new base.apk at path: %s, is_manager: %d\n", candidate_path, is_manager); + + if (likely(!is_manager)) + goto skip_iterate; + + crown_manager(candidate_path, uid_data); + stop = 1; + +skip_iterate: + list_del(&pos->list); + if (pos != &data) + kfree(pos); + } + } + +} + +static bool is_uid_exist(uid_t uid, char *package, void *data) +{ + struct list_head *list = (struct list_head *)data; + struct uid_data *np; + + bool exist = false; + list_for_each_entry (np, list, list) { + if (np->uid == uid % PER_USER_RANGE && + strncmp(np->package, package, KSU_MAX_PACKAGE_NAME) == 0) { + exist = true; + break; + } + } + return exist; +} + +static void throne_tracker_fn(bool prune_only) +{ + struct file *fp; + int tries = 0; + + while (tries++ < 10) { + if (!is_lock_held(SYSTEM_PACKAGES_LIST_PATH)) { + fp = ksu_filp_open_compat(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); + if (!IS_ERR(fp)) + break; + } + + pr_info("%s: waiting for %s\n", __func__, SYSTEM_PACKAGES_LIST_PATH); + msleep(100); // migth as well add a delay + }; + + if (IS_ERR(fp)) { + pr_err("%s: open " SYSTEM_PACKAGES_LIST_PATH " failed: %ld\n", __func__, PTR_ERR(fp)); + return; + } else + pr_info("%s: %s found!\n", __func__, SYSTEM_PACKAGES_LIST_PATH); + + struct list_head uid_list; + INIT_LIST_HEAD(&uid_list); + + char chr = 0; + loff_t pos = 0; + loff_t line_start = 0; + char buf[KSU_MAX_PACKAGE_NAME]; + for (;;) { + ssize_t count = ksu_kernel_read_compat(fp, &chr, sizeof(chr), &pos); + if (count != sizeof(chr)) + break; + if (chr != '\n') + continue; + + count = ksu_kernel_read_compat(fp, buf, sizeof(buf), &line_start); + + struct uid_data *data = kzalloc(sizeof(struct uid_data), GFP_ATOMIC); + if (!data) { + filp_close(fp, 0); 
+ goto out; + } + + char *tmp = buf; + const char *delim = " "; + char *package = strsep(&tmp, delim); + char *uid = strsep(&tmp, delim); + if (!uid || !package) { + kfree(data); + pr_err("update_uid: package or uid is NULL!\n"); + break; + } + + u32 res; + if (kstrtou32(uid, 10, &res)) { + kfree(data); + pr_err("update_uid: uid parse err\n"); + break; + } + data->uid = res; + strncpy(data->package, package, KSU_MAX_PACKAGE_NAME); + list_add_tail(&data->list, &uid_list); + // reset line start + line_start = pos; + } + filp_close(fp, 0); + + // now update uid list + struct uid_data *np; + struct uid_data *n; + + if (prune_only) + goto prune; + + // first, check if manager_uid exist! + bool manager_exist = false; + list_for_each_entry (np, &uid_list, list) { + if (np->uid == ksu_get_manager_appid()) { + manager_exist = true; + break; + } + } + + if (!manager_exist) { + if (ksu_is_manager_appid_valid()) { + pr_info("manager is uninstalled, invalidate it!\n"); + ksu_invalidate_manager_uid(); + goto prune; + } + pr_info("Searching manager...\n"); + search_manager("/data/app", 2, &uid_list); + pr_info("Search manager finished\n"); + } + +prune: + // then prune the allowlist + ksu_prune_allowlist(is_uid_exist, &uid_list); +out: + // free uid_list + list_for_each_entry_safe (np, n, &uid_list, list) { + list_del(&np->list); + kfree(np); + } +} + +static int throne_tracker_thread(void *data) +{ + // now de-void it here + bool prune_only = (bool)data; + + pr_info("throne_tracker: pid: %d started\n", current->pid); + + // this is normally not needed, but it wont hurt + escape_to_root_forced(); + + throne_tracker_fn(prune_only); + throne_thread = NULL; + smp_mb(); + pr_info("throne_tracker: pid: %d exit!\n", current->pid); + return 0; +} + +void track_throne(bool prune_only) +{ +#ifndef CONFIG_KSU_THRONE_TRACKER_ALWAYS_THREADED + static bool throne_tracker_first_run __read_mostly = true; + if (unlikely(throne_tracker_first_run)) { + throne_tracker_fn(prune_only); + 
throne_tracker_first_run = false; + return; + } +#endif + smp_mb(); + if (throne_thread != NULL) // single instance lock + return; + + // HACK: force cast prune_only to be a void * + // this way we won't need to create a struct. + // there is only one argument anyway for track_throne() + // so yes, true or false is now a void pointer. + // reality is what I want to be. + throne_thread = kthread_run(throne_tracker_thread, (void *)prune_only, "throne_tracker"); + if (IS_ERR(throne_thread)) { + throne_thread = NULL; + return; + } +} + +void ksu_throne_tracker_init() +{ + // nothing to do +} + +void ksu_throne_tracker_exit() +{ + // nothing to do +} diff --git a/drivers/kernelsu/throne_tracker.h b/drivers/kernelsu/throne_tracker.h new file mode 100644 index 000000000000..0416de2c58a0 --- /dev/null +++ b/drivers/kernelsu/throne_tracker.h @@ -0,0 +1,43 @@ +#ifndef __KSU_H_UID_OBSERVER +#define __KSU_H_UID_OBSERVER + +void ksu_throne_tracker_init(); + +void ksu_throne_tracker_exit(); + +void track_throne(bool prune_only); + +/* + * small helper to check if lock is held + * false - file is stable + * true - file is being deleted/renamed + * possibly optional + * + */ +static bool is_lock_held(const char *path) +{ + struct path kpath; + + // kern_path returns 0 on success + if (kern_path(path, 0, &kpath)) + return true; + + // just being defensive + if (!kpath.dentry) { + path_put(&kpath); + return true; + } + + if (!spin_trylock(&kpath.dentry->d_lock)) { + pr_info("%s: lock held, bail out!\n", __func__); + path_put(&kpath); + return true; + } + // we hold it ourselves here! 
+ + spin_unlock(&kpath.dentry->d_lock); + path_put(&kpath); + return false; +} + +#endif diff --git a/drivers/kernelsu/tiny_sulog.c b/drivers/kernelsu/tiny_sulog.c new file mode 100644 index 000000000000..9379182a311d --- /dev/null +++ b/drivers/kernelsu/tiny_sulog.c @@ -0,0 +1,124 @@ +// half assed ringbuffer +// 8 bytes +struct sulog_entry { + uint32_t s_time; // uptime in seconds + uint32_t data; // uint8_t[0,1,2] = uid, basically uint24_t, uint8_t[3] = symbol +} __attribute__((packed)); + +#define SULOG_ENTRY_MAX 250 +#define SULOG_BUFSIZ SULOG_ENTRY_MAX * (sizeof (struct sulog_entry)) + +static void *sulog_buf_ptr = NULL; +static uint8_t sulog_index_next = 0; + +static DEFINE_SPINLOCK(sulog_lock); + +void sulog_init_heap() +{ + sulog_buf_ptr = kzalloc(SULOG_BUFSIZ, GFP_KERNEL); + if (!sulog_buf_ptr) + return; + + pr_info("sulog_init: allocated %lu bytes on 0x%p \n", SULOG_BUFSIZ, sulog_buf_ptr); +} + +/* + * + * boottime_s_get, get kernel uptime in seconds + * + * - handles sub 4.10 compat + * - we do this forced pointer cast to cut down on compat, pre 4.10, ktime is a union + * + * - bs handling 64-bit division on 32-bit (do_div) + * - remainder = do_div(dividend, divisor); dividend will hold the quotient + * - for 64-bit we can straight up just use divide + * + */ +static inline uint32_t boottime_s_get() +{ + ktime_t boottime_kt = ktime_get_boottime(); + +#ifdef CONFIG_64BIT + uint64_t boottime_s = *(uint64_t *)&boottime_kt / 1000000000; +#else + uint64_t boottime_s = *(uint64_t *)&boottime_kt; + do_div(boottime_s, 1000000000); +#endif + + return (uint32_t)boottime_s; +} + +void write_sulog(uint8_t sym) +{ + if (!sulog_buf_ptr) + return; + + unsigned int offset = sulog_index_next * sizeof(struct sulog_entry); + struct sulog_entry entry = {0}; + + kuid_t current_uid = current_uid(); + + // WARNING!!! this is LE only! 
+ entry.s_time = boottime_s_get(); + entry.data = (uint32_t)ksu_get_uid_t(current_uid); + *((char *)&entry.data + 3) = sym; + + // we can perform this write atomic on 64-bit + // however this still has to be locked for exclusion as theres a reader + + spin_lock(&sulog_lock); + +#ifdef CONFIG_64BIT + *(volatile uint64_t *)(sulog_buf_ptr + offset) = *(uint64_t *)&entry; +#else + __builtin_memcpy(sulog_buf_ptr + offset, &entry, sizeof(entry)); +#endif + spin_unlock(&sulog_lock); + + // move ptr for next iteration + sulog_index_next = sulog_index_next + 1; + + if (sulog_index_next >= SULOG_ENTRY_MAX) + sulog_index_next = 0; +} + +struct sulog_entry_rcv_ptr { + uint64_t index_ptr; // send index here + uint64_t buf_ptr; // send buf here + uint64_t uptime_ptr; // uptime +}; + +int send_sulog_dump(void __user *uptr) +{ + if (!sulog_buf_ptr) + return 1; + + struct sulog_entry_rcv_ptr sbuf = {0}; + + if (copy_from_user(&sbuf, uptr, sizeof(sbuf) )) + return 1; + + if (!sbuf.index_ptr || !sbuf.buf_ptr || !sbuf.uptime_ptr ) + return 1; + + // send uptime + + uint32_t uptime = boottime_s_get(); + + if (copy_to_user((void __user *)(uintptr_t)sbuf.uptime_ptr, &uptime, sizeof(uptime) )) + return 1; + + // send index + if (copy_to_user((void __user *)(uintptr_t)sbuf.index_ptr, &sulog_index_next, sizeof(sulog_index_next) )) + return 1; + + // send buffer data + spin_lock(&sulog_lock); + if (copy_to_user((void __user *)(uintptr_t)sbuf.buf_ptr, sulog_buf_ptr, SULOG_BUFSIZ )) { + spin_unlock(&sulog_lock); + return 1; + } + spin_unlock(&sulog_lock); + + return 0; +} From 38b6716539e0fbff5814d1c0c5150444221a9a73 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:44:09 +0800 Subject: [PATCH 44/59] =?UTF-8?q?arm64/configs:=20=E5=90=AF=E7=94=A8KSU?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- 
arch/arm64/configs/vendor/xiaomi/mi845_defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig index 0669b3be30e3..856a3ebfa367 100644 --- a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig +++ b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig @@ -634,3 +634,5 @@ CONFIG_SND_SOC_WCD_MBHC_ADC=y CONFIG_SND_SOC_WCD_SPI=y CONFIG_SOUNDWIRE=y CONFIG_WCD_SPI_AC=y +CONFIG_KSU=y +CONFIG_KSU_TAMPER_SYSCALL_TABLE=y From 3a7d2420393af85efedaa6637b3827097c6b372c Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:47:48 +0800 Subject: [PATCH 45/59] =?UTF-8?q?arm64/configs:=20=E4=BD=BF=E7=94=A8HZ=5F3?= =?UTF-8?q?00?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- arch/arm64/configs/vendor/xiaomi/mi845_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig index 856a3ebfa367..75383094f2c1 100644 --- a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig +++ b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig @@ -71,7 +71,7 @@ CONFIG_PCI_MSM=y CONFIG_SCHED_MC=y CONFIG_NR_CPUS=8 CONFIG_PREEMPT=y -CONFIG_HZ_100=y +CONFIG_HZ_300=y CONFIG_ANON_MIN_KBYTES=196608 CONFIG_CLEAN_LOW_KBYTES=393216 CONFIG_CLEAN_MIN_KBYTES=196608 From 7e0bc72a52cbc7659a307deca37a58d7f07fe445 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:50:01 +0800 Subject: [PATCH 46/59] =?UTF-8?q?arm64/configs:=20=E5=90=AF=E7=94=A8IP6=5F?= =?UTF-8?q?NF=5FNAT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- arch/arm64/configs/vendor/xiaomi/mi845_defconfig | 1 + 1 file changed, 1 
insertion(+) diff --git a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig index 75383094f2c1..92a628b225af 100644 --- a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig +++ b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig @@ -218,6 +218,7 @@ CONFIG_IP6_NF_IPTABLES_128=y CONFIG_IP6_NF_MATCH_RPFILTER=y CONFIG_IP6_NF_TARGET_HL=y CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_NAT=y CONFIG_IP6_NF_TARGET_REJECT=y CONFIG_IP6_NF_MANGLE=y CONFIG_IP6_NF_RAW=y From 6c4b9cb14cffdb3228b182d05d5322bc6c7114e3 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:53:55 +0800 Subject: [PATCH 47/59] =?UTF-8?q?arm64/configs:=20=E6=B7=BB=E5=8A=A0lxc.co?= =?UTF-8?q?nfig?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- arch/arm64/configs/lxc.config | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 arch/arm64/configs/lxc.config diff --git a/arch/arm64/configs/lxc.config b/arch/arm64/configs/lxc.config new file mode 100644 index 000000000000..08e7955f0cfb --- /dev/null +++ b/arch/arm64/configs/lxc.config @@ -0,0 +1,40 @@ +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y + +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_PID_NS=y +CONFIG_USER_NS=y +CONFIG_NET_NS=y + +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_PIDS=y +CONFIG_MEMCG=y +CONFIG_CPUSETS=y + +CONFIG_VETH=y +CONFIG_MACVLAN=y +CONFIG_VLAN_8021Q=y +CONFIG_BRIDGE=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_NF_NAT_IPV4=y +CONFIG_NF_NAT_IPV6=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP6_NF_TARGET_MASQUERADE=y +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_FUSE_FS=y + +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_FHANDLE=y +CONFIG_EVENTFD=y +CONFIG_EPOLL=y +CONFIG_UNIX_DIAG=y 
+CONFIG_INET_DIAG=y +CONFIG_PACKET_DIAG=y +CONFIG_NETLINK_DIAG=y From 9c2b350c5f39a3c21b925c6712f7430583ccb131 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Mon, 9 Mar 2026 20:26:11 +0800 Subject: [PATCH 48/59] =?UTF-8?q?CI:=20=E4=BC=98=E5=8C=96=E7=BC=96?= =?UTF-8?q?=E8=AF=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- .github/workflows/build.yml | 234 ++++++++++++++++++------------------ 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 948e5d193a8c..b5b4349add89 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,20 +16,11 @@ jobs: name: 编译内核 permissions: { contents: write } runs-on: ubuntu-latest - strategy: - matrix: - device: - - beryllium - - dipper - - equuleus - - perseus - - polaris - - ursa steps: - name: 安装软件包 + if: env.PACKAGES != '' env: PACKAGES: - ccache binutils-aarch64-linux-gnu binutils-arm-linux-gnueabi run: | @@ -39,7 +30,7 @@ jobs: - name: 安装make4.4.1-2 run: | curl -LSs http://ftp.debian.org/debian/pool/main/m/make-dfsg/make_4.4.1-2_amd64.deb -o make.deb - sudo apt-get install -y ./make.deb + sudo apt-get install -y -q ./make.deb rm ./make.deb - name: 同步仓库 @@ -47,158 +38,167 @@ jobs: with: path: kernel + - name: 配置Anykernel3 + run: | + git clone https://github.com/osm0sis/AnyKernel3.git --depth=1 ak3 + rm -rf .git .github README.md LICENSE + find ak3/ -name "placeholder" | xargs rm -rf + cat >ak3/anykernel.sh <anykernel.sh <> $GITHUB_OUTPUT - echo "timestamp=$(date +%s)" >> $GITHUB_OUTPUT - - - name: 下载ci管理器 - continue-on-error: true - uses: dawidd6/action-download-artifact@master - with: - repo: rsuntk/KernelSU - workflow_conclusion: success - name: manager - workflow: build-manager.yml - path: manager - check_artifacts: true - search_artifacts: true + NOW=$(date +%s) + 
TIME_STR=$(TZ='Asia/Shanghai' date -d "@$NOW" +'%Y%m%d%H%M') + echo "timestamp=$NOW" >> $GITHUB_OUTPUT + echo "time=$TIME_STR" >> $GITHUB_OUTPUT - name: 发布 + if: github.event_name == 'push' + id: release uses: softprops/action-gh-release@master with: tag_name: rel-${{ steps.time.outputs.timestamp }} name: Kernel build ${{ steps.time.outputs.time }} prerelease: ${{ startsWith(github.ref_name, 'dev/') }} - files: | - kernel/* - manager/* + files: dist/* - name: 发送Telegram通知 + if: github.event_name == 'push' continue-on-error: true - env: - COMMIT_MESSAGE: ${{ github.event.head_commit.message }} - COMMIT_URL: ${{ github.event.head_commit.url }} - RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - RELEASE_URL: ${{ github.server_url }}/${{ github.repository }}/releases/tag/rel-${{ steps.time.outputs.timestamp }} run: | - msg="*CI ${{ steps.time.outputs.time }}* - > Branch/分支: \`${{ github.ref_name }}\` - \`\`\` - $COMMIT_MESSAGE - \`\`\` - [Download/下载]($RELEASE_URL) - [Commit/提交]($COMMIT_URL) - [Run/工作流]($RUN_URL) + IDS=(${{ join(github.event.commits.*.id, ' ') }}) + MAX=6 + if [ "${#IDS[@]}" -gt "$MAX" ]; then + COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]:0:$MAX}"; echo "......")" + else + COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]}")" + fi + MSG="\ + CI ${{ steps.time.outputs.time }} +
\
+          项目: ${{ github.repository }}
+          分支: ${{ github.ref_name }}\
+          
+ 提交ID: +
$COMMIT_IDS_TEXT
\ " - curl -LSs https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \ - -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \ - -F 'message_thread_id=${{ secrets.TELEGRAM_MESSAGE_THREAD_ID }}' \ - -F 'parse_mode="markdownv2"' \ - -F "text=\"$msg\"" | tee Markdown.txt - ! ${{ startsWith(github.ref_name, 'stable/') }} || \ - curl https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \ - -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \ - -F message_id=$(jq '.result.message_id' Markdown.txt) + PREVIEW_OPTIONS="{ \ + \"url\": \"${{ steps.release.outputs.url }}\", \ + \"prefer_small_media\": true, \ + \"show_above_text\": true \ + }" + BUTTONS="{\"inline_keyboard\": [ [ \ + { \"text\": \"下载链接\", \"url\": \"${{ steps.release.outputs.url }}\" }, \ + { \"text\": \"对比差异\", \"url\": \"${{ github.event.compare }}\" } \ + ] ] }" + curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \ + -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \ + -d "message_thread_id=${{ vars.TELEGRAM_MESSAGE_THREAD_ID }}" \ + -d "parse_mode=HTML" \ + --data-urlencode "text=$MSG" \ + -d "link_preview_options=$PREVIEW_OPTIONS" \ + -d "reply_markup=$BUTTONS" \ + -o response.txt && \ + (! 
${{ startsWith(github.ref_name, 'stable/') }} || \ + curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \ + -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \ + -d "message_id=$(jq '.result.message_id' response.txt)") From e89846cc6322902291430b5831c58f71b1a3ca50 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 14 Mar 2026 18:20:29 +0800 Subject: [PATCH 49/59] =?UTF-8?q?input/fts521:=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E7=BC=A9=E8=BF=9B=E8=AD=A6=E5=91=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- drivers/input/touchscreen/fts_521/fts.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/input/touchscreen/fts_521/fts.c b/drivers/input/touchscreen/fts_521/fts.c index 4722476a4c2d..e0750f28a609 100644 --- a/drivers/input/touchscreen/fts_521/fts.c +++ b/drivers/input/touchscreen/fts_521/fts.c @@ -2723,15 +2723,15 @@ static void fts_enter_pointer_event_handler(struct fts_ts_info *info, input_report_key(info->input_dev, BTN_TOOL_FINGER, 1); /*input_report_abs(info->input_dev, ABS_MT_TRACKING_ID, touchId); */ - input_report_abs(info->input_dev, ABS_MT_POSITION_X, x); - input_report_abs(info->input_dev, ABS_MT_POSITION_Y, y); - input_report_abs(info->input_dev, ABS_MT_TOUCH_MAJOR, z); - input_report_abs(info->input_dev, ABS_MT_TOUCH_MINOR, z); - input_report_abs(info->input_dev, ABS_MT_DISTANCE, distance); + input_report_abs(info->input_dev, ABS_MT_POSITION_X, x); + input_report_abs(info->input_dev, ABS_MT_POSITION_Y, y); + input_report_abs(info->input_dev, ABS_MT_TOUCH_MAJOR, z); + input_report_abs(info->input_dev, ABS_MT_TOUCH_MINOR, z); + input_report_abs(info->input_dev, ABS_MT_DISTANCE, distance); #ifdef CONFIG_INPUT_PRESS_NDT - input_report_abs(info->input_dev, ABS_MT_PRESSURE, z); + input_report_abs(info->input_dev, ABS_MT_PRESSURE, z); 
#endif - input_sync(info->input_dev); + input_sync(info->input_dev); /* pr_info("%s: Event 0x%02x - ID[%d], (x, y, z) = (%3d, %3d, %3d) type = %d\n", __func__, *event, touchId, x, y, z, touchType); */ From 418c5b6fe9035beaf2dbabe7c50f1a6ede7e6877 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sun, 22 Mar 2026 17:33:36 +0800 Subject: [PATCH 50/59] =?UTF-8?q?=E5=90=AF=E7=94=A8BINFMT=5FMISC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- arch/arm64/configs/lxc.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/configs/lxc.config b/arch/arm64/configs/lxc.config index 08e7955f0cfb..511fdb81e629 100644 --- a/arch/arm64/configs/lxc.config +++ b/arch/arm64/configs/lxc.config @@ -38,3 +38,5 @@ CONFIG_UNIX_DIAG=y CONFIG_INET_DIAG=y CONFIG_PACKET_DIAG=y CONFIG_NETLINK_DIAG=y + +CONFIG_BINFMT_MISC=y \ No newline at end of file From 9125e8f1f7fa4c17989bed9591cd93b1466f31a0 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 4 Apr 2026 12:26:24 +0800 Subject: [PATCH 51/59] =?UTF-8?q?kbuild:=20=E4=BF=AE=E5=A4=8D=E9=9D=99?= =?UTF-8?q?=E9=BB=98=E6=A8=A1=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 64d8b31c7b5d..a2946c935d83 100644 --- a/Makefile +++ b/Makefile @@ -87,10 +87,16 @@ endif # If the user is running make -s (silent mode), suppress echoing of # commands +# make-4.0 (and later) keep single letter options in the 1st word of MAKEFLAGS. 
-ifneq ($(findstring s,$(filter-out --%,$(MAKEFLAGS))),) - quiet=silent_ - tools_silent=s +ifeq ($(filter 3.%,$(MAKE_VERSION)),) +silence:=$(findstring s,$(firstword -$(MAKEFLAGS))) +else +silence:=$(findstring s,$(filter-out --%,$(MAKEFLAGS))) +endif + +ifeq ($(silence),s) +quiet=silent_ endif export quiet Q KBUILD_VERBOSE From ce313493aa2e15c3df16cf96aae4e45afcc76bf6 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sat, 4 Apr 2026 08:07:04 +0800 Subject: [PATCH 52/59] =?UTF-8?q?arm64/configs:=20=E7=A6=81=E7=94=A8CONFIG?= =?UTF-8?q?=5FANDROID=5FPARANOID=5FNETWORK?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- arch/arm64/configs/lxc.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/configs/lxc.config b/arch/arm64/configs/lxc.config index 511fdb81e629..7c2967ce7218 100644 --- a/arch/arm64/configs/lxc.config +++ b/arch/arm64/configs/lxc.config @@ -39,4 +39,6 @@ CONFIG_INET_DIAG=y CONFIG_PACKET_DIAG=y CONFIG_NETLINK_DIAG=y -CONFIG_BINFMT_MISC=y \ No newline at end of file +CONFIG_BINFMT_MISC=y + +CONFIG_ANDROID_PARANOID_NETWORK=n \ No newline at end of file From 2d8b64937429861f3c0337c8ba90dedfa4f84f1c Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sun, 29 Mar 2026 07:59:03 +0800 Subject: [PATCH 53/59] =?UTF-8?q?KernelSU:=20=E5=90=8C=E6=AD=A5=E8=87=B3ba?= =?UTF-8?q?ckslashxx/KernelSU@554c470?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- drivers/kernelsu/Kconfig | 27 +- drivers/kernelsu/Makefile | 63 +- drivers/kernelsu/app_profile.h | 70 -- drivers/kernelsu/extras.c | 5 - drivers/kernelsu/feature/adb_root.c | 271 ++++++ drivers/kernelsu/feature/adb_root.h | 9 + drivers/kernelsu/feature/kernel_umount.c | 120 +++ .../{core_hook.h => feature/kernel_umount.h} | 12 +- drivers/kernelsu/{ => 
feature}/sucompat.c | 255 ++--- drivers/kernelsu/{ => feature}/sucompat.h | 3 - drivers/kernelsu/feature/sulog.c | 57 ++ drivers/kernelsu/feature/sulog.h | 8 + drivers/kernelsu/{ => hook}/core_hook.c | 227 +---- drivers/kernelsu/{ => hook}/kp_ksud.c | 84 +- .../{ => hook}/syscall_table_hook_arm.c | 40 +- .../syscall_table_hook_arm64.c} | 142 ++- drivers/kernelsu/{ => include}/arch.h | 7 +- drivers/kernelsu/{ => include}/klog.h | 2 - drivers/kernelsu/{ => include}/ksu.h | 5 +- drivers/kernelsu/include/uapi/app_profile.h | 61 ++ drivers/kernelsu/include/uapi/feature.h | 17 + drivers/kernelsu/include/uapi/selinux.h | 29 + drivers/kernelsu/include/uapi/sulog.h | 32 + drivers/kernelsu/include/uapi/supercall.h | 162 ++++ drivers/kernelsu/infra/event_queue.c | 393 ++++++++ drivers/kernelsu/infra/event_queue.h | 54 ++ drivers/kernelsu/{ => infra}/file_wrapper.c | 43 +- drivers/kernelsu/{ => infra}/file_wrapper.h | 3 - drivers/kernelsu/{ => infra}/su_mount_ns.c | 53 +- drivers/kernelsu/{ => infra}/su_mount_ns.h | 0 drivers/kernelsu/kernel_compat.c | 153 +-- drivers/kernelsu/kernel_compat.h | 285 ++++-- drivers/kernelsu/kernel_includes.h | 160 ++++ drivers/kernelsu/ksu.c | 125 ++- drivers/kernelsu/{ => manager}/apk_sign.c | 63 +- drivers/kernelsu/{ => manager}/apk_sign.h | 2 - .../{manager.h => manager/manager_identity.h} | 8 +- .../kernelsu/{ => manager}/throne_tracker.c | 91 +- .../kernelsu/{ => manager}/throne_tracker.h | 0 drivers/kernelsu/{ => policy}/allowlist.c | 96 +- drivers/kernelsu/{ => policy}/allowlist.h | 3 +- drivers/kernelsu/{ => policy}/app_profile.c | 78 +- drivers/kernelsu/policy/app_profile.h | 9 + drivers/kernelsu/{ => policy}/feature.c | 7 +- drivers/kernelsu/{ => policy}/feature.h | 13 - drivers/kernelsu/rp_sucompat.c | 102 -- drivers/kernelsu/{ => runtime}/ksud.c | 217 ++--- drivers/kernelsu/{ => runtime}/ksud.h | 9 +- drivers/kernelsu/selinux/rules.c | 898 ++++++++++++------ drivers/kernelsu/selinux/selinux.c | 25 +- 
drivers/kernelsu/selinux/selinux.h | 11 +- drivers/kernelsu/selinux/sepolicy.c | 513 +++++++++- drivers/kernelsu/selinux/sepolicy.h | 8 +- drivers/kernelsu/sulog/event.c | 271 ++++++ drivers/kernelsu/sulog/event.h | 17 + drivers/kernelsu/sulog/fd.c | 83 ++ drivers/kernelsu/sulog/fd.h | 8 + .../{supercalls.c => supercall/dispatch.c} | 387 ++------ drivers/kernelsu/supercall/internal.h | 14 + drivers/kernelsu/supercall/perm.c | 27 + drivers/kernelsu/supercall/supercall.c | 249 +++++ drivers/kernelsu/supercall/supercall.h | 32 + drivers/kernelsu/supercalls.h | 166 ---- drivers/kernelsu/tiny_sulog.c | 6 +- 64 files changed, 4188 insertions(+), 2202 deletions(-) delete mode 100644 drivers/kernelsu/app_profile.h create mode 100644 drivers/kernelsu/feature/adb_root.c create mode 100644 drivers/kernelsu/feature/adb_root.h create mode 100644 drivers/kernelsu/feature/kernel_umount.c rename drivers/kernelsu/{core_hook.h => feature/kernel_umount.h} (51%) rename drivers/kernelsu/{ => feature}/sucompat.c (56%) rename drivers/kernelsu/{ => feature}/sucompat.h (58%) create mode 100644 drivers/kernelsu/feature/sulog.c create mode 100644 drivers/kernelsu/feature/sulog.h rename drivers/kernelsu/{ => hook}/core_hook.c (54%) rename drivers/kernelsu/{ => hook}/kp_ksud.c (69%) rename drivers/kernelsu/{ => hook}/syscall_table_hook_arm.c (89%) rename drivers/kernelsu/{syscall_table_hook.c => hook/syscall_table_hook_arm64.c} (85%) rename drivers/kernelsu/{ => include}/arch.h (95%) rename drivers/kernelsu/{ => include}/klog.h (82%) rename drivers/kernelsu/{ => include}/ksu.h (84%) create mode 100644 drivers/kernelsu/include/uapi/app_profile.h create mode 100644 drivers/kernelsu/include/uapi/feature.h create mode 100644 drivers/kernelsu/include/uapi/selinux.h create mode 100644 drivers/kernelsu/include/uapi/sulog.h create mode 100644 drivers/kernelsu/include/uapi/supercall.h create mode 100644 drivers/kernelsu/infra/event_queue.c create mode 100644 drivers/kernelsu/infra/event_queue.h rename 
drivers/kernelsu/{ => infra}/file_wrapper.c (95%) rename drivers/kernelsu/{ => infra}/file_wrapper.h (76%) rename drivers/kernelsu/{ => infra}/su_mount_ns.c (81%) rename drivers/kernelsu/{ => infra}/su_mount_ns.h (100%) create mode 100644 drivers/kernelsu/kernel_includes.h rename drivers/kernelsu/{ => manager}/apk_sign.c (83%) rename drivers/kernelsu/{ => manager}/apk_sign.h (85%) rename drivers/kernelsu/{manager.h => manager/manager_identity.h} (84%) rename drivers/kernelsu/{ => manager}/throne_tracker.c (84%) rename drivers/kernelsu/{ => manager}/throne_tracker.h (100%) rename drivers/kernelsu/{ => policy}/allowlist.c (87%) rename drivers/kernelsu/{ => policy}/allowlist.h (96%) rename drivers/kernelsu/{ => policy}/app_profile.c (76%) create mode 100644 drivers/kernelsu/policy/app_profile.h rename drivers/kernelsu/{ => policy}/feature.c (97%) rename drivers/kernelsu/{ => policy}/feature.h (73%) delete mode 100644 drivers/kernelsu/rp_sucompat.c rename drivers/kernelsu/{ => runtime}/ksud.c (77%) rename drivers/kernelsu/{ => runtime}/ksud.h (65%) create mode 100644 drivers/kernelsu/sulog/event.c create mode 100644 drivers/kernelsu/sulog/event.h create mode 100644 drivers/kernelsu/sulog/fd.c create mode 100644 drivers/kernelsu/sulog/fd.h rename drivers/kernelsu/{supercalls.c => supercall/dispatch.c} (68%) create mode 100644 drivers/kernelsu/supercall/internal.h create mode 100644 drivers/kernelsu/supercall/perm.c create mode 100644 drivers/kernelsu/supercall/supercall.c create mode 100644 drivers/kernelsu/supercall/supercall.h delete mode 100644 drivers/kernelsu/supercalls.h diff --git a/drivers/kernelsu/Kconfig b/drivers/kernelsu/Kconfig index 24a27043b3fb..cb75564a95ae 100644 --- a/drivers/kernelsu/Kconfig +++ b/drivers/kernelsu/Kconfig @@ -1,8 +1,10 @@ menu "KernelSU" config KSU - select SECCOMP bool "KernelSU function support" + depends on !CPU_BIG_ENDIAN + depends on SECURITY_SELINUX + select SECCOMP default n help Enable kernel-level root privileges on Android 
System. @@ -18,7 +20,7 @@ config KSU_EXTRAS config KSU_KPROBES_KSUD bool "Enable dynamic kprobes for early boot hooks" depends on KPROBES && KRETPROBES - default n + default y help Use dynamic hooks via kprobes for functions only on early boot. Hooks are unregistered at boot complete @@ -26,22 +28,27 @@ config KSU_KPROBES_KSUD config KSU_TAMPER_SYSCALL_TABLE bool "EXPERIMENTAL: tamper sys_call_table for sucompat + sys_reboot" - depends on (ARM || ARM64) && !KSU_KRETPROBES_SUCOMPAT + depends on (ARM || ARM64) && !CFI_CLANG && !CFI default n help EXPERIMENTAL: use syscall table hijacking method demonstrated on zx2c4's kernel-assisted-superuser. Replaces sys_reboot, sys_execve, sys_newfstatat, sys_faccessat, sys_newfstat_ret manual hooks. - Tested on Linux 3.10 ~ 4.14, aarch64. + Personally tested on Linux 3.10 ~ 4.14, aarch64. -config KSU_KRETPROBES_SUCOMPAT - bool "EXPERIMENTAL: kretprobes for sucompat" - depends on KRETPROBES +config KSU_FEATURE_SULOG + bool "KernelSU SU Logging feature" + depends on KSU + default y + help + Build KernelSU's SU Log. + +config KSU_FEATURE_ADBROOT + bool "KernelSU ADB Root feature" + depends on KSU default n help - EXPERIMENTAL: Use kretprobes to hook getname_flags, mainly for - sucompat. This method will hijack all fs-related syscalls, but - thwarts timing based detections. + Build KernelSU's adb root feature. config KSU_DEBUG bool "KernelSU debug mode" diff --git a/drivers/kernelsu/Makefile b/drivers/kernelsu/Makefile index 3890aca6522d..ab779e4d0a5d 100644 --- a/drivers/kernelsu/Makefile +++ b/drivers/kernelsu/Makefile @@ -1,54 +1,73 @@ -ccflags-y += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include -ccflags-y += -I$(objtree)/security/selinux +# NOTE: unity build. single unit. 
obj-$(CONFIG_KSU) := ksu.o +CFLAGS_ksu.o += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include +CFLAGS_ksu.o += -I$(objtree)/security/selinux + ifeq ($(shell grep -q " current_sid(void)" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) -ccflags-y += -DKSU_COMPAT_HAS_CURRENT_SID +CFLAGS_ksu.o += -DKSU_COMPAT_HAS_CURRENT_SID endif ifeq ($(shell grep -q "struct selinux_state " $(srctree)/security/selinux/include/security.h; echo $$?),0) -ccflags-y += -DKSU_COMPAT_HAS_SELINUX_STATE +CFLAGS_ksu.o += -DKSU_COMPAT_HAS_SELINUX_STATE endif -# UL, look for iterate_dir on ‎fs/readdir.c -ifeq ($(shell grep -q "^int iterate_dir" $(srctree)/fs/readdir.c 2>/dev/null; echo $$?),0) -ccflags-y += -DKSU_HAS_ITERATE_DIR +ifeq ($(shell grep -q "^DEFINE_RWLOCK(policy_rwlock);" $(srctree)/security/selinux/ss/services.c; echo $$?),0) +CFLAGS_ksu.o += -DKSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK +endif + +ifeq ($(shell grep -q "cpus_ptr;" $(srctree)/include/linux/sched.h; echo $$?),0) +CFLAGS_ksu.o += -DKSU_COMPAT_HAS_BACKPORTED_CPUS_PTR endif # UL, look for read_iter on f_op struct ifeq ($(shell grep -q "read_iter" $(srctree)/include/linux/fs.h 2>/dev/null; echo $$?),0) -ccflags-y += -DKSU_HAS_FOP_READ_ITER +CFLAGS_ksu.o += -DKSU_HAS_FOP_READ_ITER endif -# UL, look for "ext4_unregister_sysfs" on fs/ext4 -ifeq ($(shell grep -q "^extern void ext4_unregister_sysfs" $(srctree)/fs/ext4/ext4.h 2>/dev/null; echo $$?),0) -ccflags-y += -DKSU_HAS_MODERN_EXT4 +# UL, look for iterate_dir on ‎fs/readdir.c +ifeq ($(shell grep -q "^int iterate_dir" $(srctree)/fs/readdir.c 2>/dev/null; echo $$?),0) +CFLAGS_ksu.o += -DKSU_HAS_ITERATE_DIR endif ifeq ($(shell grep -q "selinux_inode" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) -ccflags-y += -DKSU_HAS_SELINUX_INODE +CFLAGS_ksu.o += -DKSU_HAS_SELINUX_INODE endif ifeq ($(shell grep -q "selinux_cred" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) -ccflags-y += -DKSU_HAS_SELINUX_CRED -endif - -ifeq ($(shell grep -q 
"static inline struct inode \*file_inode" $(srctree)/include/linux/fs.h; echo $$?),0) -ccflags-y += -DKSU_UL_HAS_FILE_INODE +CFLAGS_ksu.o += -DKSU_HAS_SELINUX_CRED endif ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) -ccflags-y += -DKSU_TYPE_VAL_TO_STRUCT +CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT endif # half-assed-backport from 5.1 ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct_array;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) -ccflags-y += -DKSU_TYPE_VAL_TO_STRUCT_ARRAY +CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT_ARRAY endif -ccflags-y += -Wno-implicit-function-declaration -Wno-strict-prototypes -Wno-int-conversion -Wno-gcc-compat -Wno-missing-prototypes -ccflags-y += -Wno-declaration-after-statement -Wno-unused-function -Wno-format -Wno-incompatible-pointer-types -ccflags-y += -Wno-unused-variable -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast +CFLAGS_ksu.o += -Wno-implicit-function-declaration -Wno-strict-prototypes -Wno-int-conversion -Wno-missing-prototypes +CFLAGS_ksu.o += -Wno-declaration-after-statement -Wno-unused-function -Wno-format -Wno-incompatible-pointer-types +CFLAGS_ksu.o += -Wno-unused-variable -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast + +# so we can see stack use atleast, as we disable all stack safety here +CFLAGS_ksu.o += $(call cc-option, -Wframe-larger-than=1024) + +# to make sure we can use builtins +CFLAGS_REMOVE_ksu.o += -fno-builtin + +ifneq ($(CONFIG_KSU_DEBUG),y) +# strip, remove tracing / profiling +# comment out if proper backtrace is needed +CFLAGS_ksu.o += -g0 -fno-unwind-tables -fno-asynchronous-unwind-tables -fomit-frame-pointer +CFLAGS_REMOVE_ksu.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ksu.o += -pg + +# if cflags can be macro'd, this will be called 'TRUST_ME' +CFLAGS_ksu.o += -fno-stack-protector -fno-stack-check +CFLAGS_REMOVE_ksu.o += -fsanitize=shadow-call-stack +endif # CONFIG_KSU_DEBUG # Keep a new line here!! 
Because someone may append config diff --git a/drivers/kernelsu/app_profile.h b/drivers/kernelsu/app_profile.h deleted file mode 100644 index fcc9daed5f53..000000000000 --- a/drivers/kernelsu/app_profile.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef __KSU_H_APP_PROFILE -#define __KSU_H_APP_PROFILE - -#include - -// Forward declarations -struct cred; - -#define KSU_APP_PROFILE_VER 2 -#define KSU_MAX_PACKAGE_NAME 256 -// NGROUPS_MAX for Linux is 65535 generally, but we only supports 32 groups. -#define KSU_MAX_GROUPS 32 -#define KSU_SELINUX_DOMAIN 64 - -struct root_profile { - int32_t uid; - int32_t gid; - - int32_t groups_count; - int32_t groups[KSU_MAX_GROUPS]; - - // kernel_cap_t is u32[2] for capabilities v3 - struct { - u64 effective; - u64 permitted; - u64 inheritable; - } capabilities; - - char selinux_domain[KSU_SELINUX_DOMAIN]; - - int32_t namespaces; -}; - -struct non_root_profile { - bool umount_modules; -}; - -struct app_profile { - // It may be utilized for backward compatibility, although we have never explicitly made any promises regarding this. - u32 version; - - // this is usually the package of the app, but can be other value for special apps - char key[KSU_MAX_PACKAGE_NAME]; - int32_t current_uid; - bool allow_su; - - union { - struct { - bool use_default; - char template_name[KSU_MAX_PACKAGE_NAME]; - - struct root_profile profile; - } rp_config; - - struct { - bool use_default; - - struct non_root_profile profile; - } nrp_config; - }; -}; - -// Escalate current process to root with the appropriate profile -void escape_with_root_profile(void); - -void escape_to_root_for_init(void); - -void escape_to_root_forced(void); - -#endif diff --git a/drivers/kernelsu/extras.c b/drivers/kernelsu/extras.c index 4181a62ba312..642c83bfd39a 100644 --- a/drivers/kernelsu/extras.c +++ b/drivers/kernelsu/extras.c @@ -1,7 +1,3 @@ -#include -#include -#include - // sorry for the ifdef hell // but im too lazy to fragment this out. 
// theres only one feature so far anyway @@ -77,7 +73,6 @@ static int get_sid() #if defined(CONFIG_KPROBES) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) #include -#include static struct kprobe *slow_avc_audit_kp; static int ksu_handle_slow_avc_audit(u32 *tsid) diff --git a/drivers/kernelsu/feature/adb_root.c b/drivers/kernelsu/feature/adb_root.c new file mode 100644 index 000000000000..c3935cd8139a --- /dev/null +++ b/drivers/kernelsu/feature/adb_root.c @@ -0,0 +1,271 @@ +#ifdef CONFIG_KSU_FEATURE_ADBROOT + +static bool ksu_adb_root __read_mostly = false; + +static long is_exec_adbd(const char __user **filename_user) +{ + // should be bigger than `/apex/com.android.adbd/bin/adbd` + char buf[40] = { 0 }; + size_t copysize = sizeof("/apex/com.android.adbd/bin/adbd"); + + if (!!copy_from_user(buf, *filename_user, copysize)) + return 0; + + if (!!endswith(buf, "/adbd")) + return 0; + + pr_info("%s: adbd: %s \n", __func__, buf); + + return 1; +} + +static long is_libadbroot_ok() +{ + static const char kLibAdbRoot[] = "/data/adb/ksu/lib/libadbroot.so"; + struct path path; + long ret = kern_path(kLibAdbRoot, 0, &path); + if (ret < 0) { + if (ret == -ENOENT) { + pr_err("libadbroot.so not exists, skip adb root. 
Please run `ksud install`\n"); + ret = 0; + } else { + pr_err("access libadbroot.so failed: %ld, skip adb root\n", ret); + } + } else { + ret = 1; + } + path_put(&path); + return ret; +} + +// NOTE: envp is (void ***), void * const char __user * const char __user * +static long setup_ld_preload(void ***envp_arg) +{ + static const char kLdPreload[] = "LD_PRELOAD=/data/adb/ksu/lib/libadbroot.so"; + static const char kLdLibraryPath[] = "LD_LIBRARY_PATH=/data/adb/ksu/lib"; + static const size_t kReadEnvBatch = 16; + static const size_t kPtrSize = sizeof(unsigned long); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + unsigned long stackp = current_user_stack_pointer(); +#else + volatile unsigned long stackp = current->mm->start_stack; // its just a stack smash in the end, it'll work. +#endif + unsigned long envp, ld_preload_p, ld_library_path_p; + unsigned long *envp_p = (uintptr_t)envp_arg; + unsigned long *tmp_env_p = NULL, *tmp_env_p2 = NULL; + size_t env_count = 0, total_size; + long ret; + + envp = (char __user **)untagged_addr((unsigned long)*envp_p); + + ld_preload_p = stackp = ALIGN_DOWN(stackp - sizeof(kLdPreload), 8); // 2 words on 32-bit, 32-on-64 its gonna be fine dw. 
+ ret = copy_to_user(ld_preload_p, kLdPreload, sizeof(kLdPreload)); + if (ret != 0) { + pr_warn("write ld_preload when adb_root_handle_execve failed: %ld\n", ret); + return -EFAULT; + } + + ld_library_path_p = stackp = ALIGN_DOWN(stackp - sizeof(kLdLibraryPath), 8); + ret = copy_to_user(ld_library_path_p, kLdLibraryPath, sizeof(kLdLibraryPath)); + if (ret != 0) { + pr_warn("write ld_library_path when adb_root_handle_execve failed: %ld\n", ret); + return -EFAULT; + } + + for (;;) { + tmp_env_p2 = krealloc(tmp_env_p, (env_count + kReadEnvBatch + 2) * kPtrSize, GFP_KERNEL); + if (tmp_env_p2 == NULL) { + pr_err("alloc tmp env failed\n"); + ret = -ENOMEM; + goto out_release_env_p; + } + tmp_env_p = tmp_env_p2; + ret = copy_from_user(&tmp_env_p[env_count], envp + env_count * kPtrSize, kReadEnvBatch * kPtrSize); + if (ret < 0) { + pr_warn("Access envp when adb_root_handle_execve failed: %ld\n", ret); + ret = -EFAULT; + goto out_release_env_p; + } + size_t read_count = kReadEnvBatch * kPtrSize - ret; + size_t max_new_env_count = read_count / kPtrSize, new_env_count = 0; + bool meet_zero = false; + for (; new_env_count < max_new_env_count; new_env_count++) { + if (!tmp_env_p[new_env_count + env_count]) { + meet_zero = true; + break; + } + } + if (!meet_zero) { + if (read_count % kPtrSize != 0) { + pr_err("unaligned envp array!\n"); + ret = -EFAULT; + goto out_release_env_p; + } else if (ret != 0) { + pr_err("truncated envp array!\n"); + ret = -EFAULT; + goto out_release_env_p; + } + } + env_count += new_env_count; + if (meet_zero) + break; + } + + // We should have allocated enough memory + // TODO: handle existing LD_PRELOAD + tmp_env_p[env_count++] = ld_preload_p; + tmp_env_p[env_count++] = ld_library_path_p; + tmp_env_p[env_count++] = 0; + total_size = env_count * kPtrSize; + + stackp -= total_size; + ret = copy_to_user(stackp, tmp_env_p, total_size); + if (ret != 0) { + pr_err("copy new env failed: %ld\n", ret); + ret = -EFAULT; + goto out_release_env_p; + } + + *envp_p 
= stackp; + ret = 0; + +out_release_env_p: + if (tmp_env_p) { + kfree(tmp_env_p); + } + + return ret; +} + +__attribute__((cold)) +static noinline long do_ksu_adb_root_handle_execve(const char __user **filename_user, void ***envp) +{ + if (likely(!is_exec_adbd(filename_user))) + return 0; + + if (unlikely(!is_libadbroot_ok())) + return 0; + + long ret = setup_ld_preload(envp); + if (ret) + return ret; + + pr_info("escape to root for adb\n"); + escape_to_root_for_adb_root(); + escape_with_root_profile(); // why is this needed for 3.x? + return 0; +} + +// sys_execve, syscall hooks +static __always_inline long ksu_adb_root_handle_execve(const char __user **filename_user, void ***envp) +{ + if (likely(!ksu_adb_root)) + return 0; + + if (likely(!!current->seccomp.mode)) + return 0; + + do_ksu_adb_root_handle_execve(filename_user, envp); + + return 0; +} + +struct user_arg_ptr { +#ifdef CONFIG_COMPAT + bool is_compat; +#endif + union { + const char __user *const __user *native; +#ifdef CONFIG_COMPAT + const compat_uptr_t __user *compat; +#endif + } ptr; +}; + +__attribute__((cold)) +static noinline long do_ksu_adb_root_handle_execveat(char *filename, void *envp_in) +{ + if (!!endswith(filename, "/adbd")) + return 0; + + if (unlikely(!is_libadbroot_ok())) + return 0; + + struct user_arg_ptr *envp = (struct user_arg_ptr *)envp_in; + + void ***envp_addr = (void ***)&envp->ptr.native; +#ifdef CONFIG_COMPAT + if (unlikely(envp->is_compat)) + envp_addr = (void ***)&envp->ptr.compat; +#endif + + pr_info("%s: envp 0x%lx \n", __func__, (uintptr_t)*envp_addr ); + + long ret = setup_ld_preload(envp_addr); + if (ret) + return ret; + + pr_info("escape to root for adb\n"); + escape_to_root_for_adb_root(); + escape_with_root_profile(); // why is this needed? 
+ return 0; +} + +// do_execve, do_execve_common, do_execveat_common +static __always_inline long ksu_adb_root_handle_execveat(char *filename, void *envp_in) +{ + if (likely(!ksu_adb_root)) + return 0; + + if (likely(!!current->seccomp.mode)) + return 0; + + if (!filename) + return 0; + + if (!envp_in) + return 0; + + do_ksu_adb_root_handle_execveat(filename, envp_in); + + return 0; +} + +static int kernel_adb_root_feature_get(u64 *value) +{ + *value = ksu_adb_root ? 1 : 0; + return 0; +} + +static int kernel_adb_root_feature_set(u64 value) +{ + bool enable = value != 0; + if (enable) { + ksu_adb_root = true; + } else { + ksu_adb_root = false; + } + pr_info("adb_root: set to %d\n", enable); + return 0; +} + +static const struct ksu_feature_handler ksu_adb_root_handler = { + .feature_id = KSU_FEATURE_ADB_ROOT, + .name = "adb_root", + .get_handler = kernel_adb_root_feature_get, + .set_handler = kernel_adb_root_feature_set, +}; + +void __init ksu_adb_root_init(void) +{ + if (ksu_register_feature_handler(&ksu_adb_root_handler)) { + pr_err("Failed to register adb_root feature handler\n"); + } +} + +void __exit ksu_adb_root_exit(void) +{ + ksu_unregister_feature_handler(KSU_FEATURE_ADB_ROOT); +} + +#endif // CONFIG_KSU_FEATURE_ADBROOT diff --git a/drivers/kernelsu/feature/adb_root.h b/drivers/kernelsu/feature/adb_root.h new file mode 100644 index 000000000000..331148751ca5 --- /dev/null +++ b/drivers/kernelsu/feature/adb_root.h @@ -0,0 +1,9 @@ +#ifndef __KSU_H_ADB_ROOT +#define __KSU_H_ADB_ROOT + +#ifdef CONFIG_KSU_FEATURE_ADBROOT +void ksu_adb_root_init(void); +void ksu_adb_root_exit(void); +#endif + +#endif diff --git a/drivers/kernelsu/feature/kernel_umount.c b/drivers/kernelsu/feature/kernel_umount.c new file mode 100644 index 000000000000..88b6ce6cc565 --- /dev/null +++ b/drivers/kernelsu/feature/kernel_umount.c @@ -0,0 +1,120 @@ +static bool ksu_kernel_umount_enabled = true; + +static int kernel_umount_feature_get(u64 *value) +{ + *value = ksu_kernel_umount_enabled 
? 1 : 0; + return 0; +} + +static int kernel_umount_feature_set(u64 value) +{ + bool enable = value != 0; + ksu_kernel_umount_enabled = enable; + pr_info("kernel_umount: set to %d\n", enable); + return 0; +} + +static const struct ksu_feature_handler kernel_umount_handler = { + .feature_id = KSU_FEATURE_KERNEL_UMOUNT, + .name = "kernel_umount", + .get_handler = kernel_umount_feature_get, + .set_handler = kernel_umount_feature_set, +}; + +extern int path_umount(struct path *path, int flags); + +static void ksu_umount_mnt(const char *mnt, struct path *path, int flags) +{ + int err = path_umount(path, flags); + if (err) + pr_info("umount %s failed: %d\n", mnt, err); +} + +static void try_umount(const char *mnt, int flags) +{ + struct path path; + int err = kern_path(mnt, 0, &path); + if (err) { + return; + } + + if (path.dentry != path.mnt->mnt_root) { + // it is not root mountpoint, maybe umounted by others already. + path_put(&path); + return; + } + + ksu_umount_mnt(mnt, &path, flags); +} + +static inline int ksu_handle_umount(struct cred *new, const struct cred *old) +{ + uid_t new_uid = ksu_get_uid_t(new->uid); + uid_t old_uid = ksu_get_uid_t(old->uid); + + // if there isn't any module mounted, just ignore it! + if (!ksu_module_mounted) { + return 0; + } + + if (!ksu_kernel_umount_enabled) { + return 0; + } + + if (!ksu_cred) { + return 0; + } + + // There are 6 scenarios: + // 1. Normal app: zygote -> appuid + // 2. Isolated process forked from zygote: zygote -> isolated_process + // 3. App zygote forked from zygote: zygote -> appuid + // 4. Webview zygote forked from zygote: zygote -> WEBVIEW_ZYGOTE_UID (no need to handle, app cannot run custom code) + // 5. Isolated process forked from app zygote: appuid -> isolated_process (already handled by 3) + // 6. 
Isolated process forked from webview zygote (no need to handle, app cannot run custom code) + if (!is_appuid(new_uid) && !is_isolated_process(new_uid)) { + return 0; + } + + if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid)) { + return 0; + } + + // check old process's selinux context, if it is not zygote, ignore it! + // because some su apps may setuid to untrusted_app but they are in global mount namespace + // when we umount for such process, that is a disaster! + // also handle case 4 and 5 + bool is_zygote_child = is_zygote(old); + if (!is_zygote_child) { + pr_info("handle umount ignore non zygote child: %d\n", current->pid); + return 0; + } + // umount the target mnt + pr_info("handle umount for uid: %d, pid: %d\n", new_uid, current->pid); + + const struct cred *saved = override_creds(ksu_cred); + + struct mount_entry *entry; + down_read(&mount_list_lock); + list_for_each_entry (entry, &mount_list, list) { + pr_info("%s: unmounting: %s flags: 0x%x\n", __func__, entry->umountable, entry->flags); + try_umount(entry->umountable, entry->flags); + } + up_read(&mount_list_lock); + + revert_creds(saved); + + return 0; +} + +void __init ksu_kernel_umount_init(void) +{ + if (ksu_register_feature_handler(&kernel_umount_handler)) { + pr_err("Failed to register kernel_umount feature handler\n"); + } +} + +void __exit ksu_kernel_umount_exit(void) +{ + ksu_unregister_feature_handler(KSU_FEATURE_KERNEL_UMOUNT); +} diff --git a/drivers/kernelsu/core_hook.h b/drivers/kernelsu/feature/kernel_umount.h similarity index 51% rename from drivers/kernelsu/core_hook.h rename to drivers/kernelsu/feature/kernel_umount.h index af967f0a1be2..51af740d619c 100644 --- a/drivers/kernelsu/core_hook.h +++ b/drivers/kernelsu/feature/kernel_umount.h @@ -1,13 +1,5 @@ -#ifndef __KSU_H_KSU_CORE -#define __KSU_H_KSU_CORE - -#include -#include -#include - -void __init ksu_core_init(void); - -void escape_with_root_profile(void); +#ifndef __KSU_H_KERNEL_UMOUNT +#define 
__KSU_H_KERNEL_UMOUNT // for the umount list struct mount_entry { diff --git a/drivers/kernelsu/sucompat.c b/drivers/kernelsu/feature/sucompat.c similarity index 56% rename from drivers/kernelsu/sucompat.c rename to drivers/kernelsu/feature/sucompat.c index 091df1398c5b..dabfe34f2def 100644 --- a/drivers/kernelsu/sucompat.c +++ b/drivers/kernelsu/feature/sucompat.c @@ -1,19 +1,3 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -#include -#else -#include -#endif - #define SU_PATH "/system/bin/su" #define SH_PATH "/system/bin/sh" @@ -36,17 +20,21 @@ static void __user *userspace_stack_buffer(const void *d, size_t len) volatile unsigned long start_stack = current->mm->start_stack; unsigned int step = 32; - char __user *p = NULL; - do { - p = (void __user *)(start_stack - step - len); - if (!copy_to_user(p, d, len)) { - /* pr_info("%s: start_stack: %lx p: %lx len: %zu\n", - __func__, start_stack, (unsigned long)p, len ); */ - return p; - } - step = step + step; - } while (step <= 2048); +start_loop: + ; + char __user *p = (void __user *)(start_stack - step - len); + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("%s: start_stack: %lx p: %lx len: %zu\n", __func__, start_stack, (unsigned long)p, len ); + + if (!copy_to_user(p, d, len)) + return p; + + step = step + step; + + if (step <= 2048) + goto start_loop; + return NULL; } #endif @@ -68,10 +56,13 @@ static char __user *ksud_user_path(void) __attribute__((hot)) static __always_inline bool is_su_allowed(const void **ptr_to_check) { +#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE barrier(); if (!ksu_su_compat_enabled) return false; +#endif + barrier(); if (likely(!!current->seccomp.mode)) return false; @@ -92,13 +83,11 @@ static __always_inline bool is_su_allowed(const void **ptr_to_check) } #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) -static inline void sys_execve_escape_ksud(const char __user **filename_user) 
+__attribute__((cold)) +static noinline void sys_execve_escape_ksud(const char __user **filename_user) { - if (likely(ksu_boot_completed)) - return; - // see if its init - if (!is_init(get_current_cred())) + if (!is_init(current_cred())) return; const char ksud_path[] = KSUD_PATH; @@ -118,13 +107,11 @@ static inline void sys_execve_escape_ksud(const char __user **filename_user) return; } -static inline void kernel_execve_escape_ksud(void *filename_ptr) +__attribute__((cold)) +static noinline void kernel_execve_escape_ksud(void *filename_ptr) { - if (likely(ksu_boot_completed)) - return; - // see if its init - if (!is_init(get_current_cred())) + if (!is_init(current_cred())) return; if (likely(memcmp(filename_ptr, KSUD_PATH, sizeof(KSUD_PATH)))) @@ -138,10 +125,10 @@ static inline void kernel_execve_escape_ksud(void *filename_ptr) } #else static inline void sys_execve_escape_ksud(const char __user **filename_user) { } // no-op -static inline void kernel_execve_escape_ksud(void *filename_ptr) {} // no-op +static inline void kernel_execve_escape_ksud(void *filename_ptr) { } // no-op #endif -static int ksu_sucompat_user_common(const char __user **filename_user, +static noinline int ksu_sucompat_user_common(const char __user **filename_user, const char *syscall_name, const bool escalate, const uint8_t sym) @@ -160,16 +147,31 @@ static int ksu_sucompat_user_common(const char __user **filename_user, write_sulog(sym); - if (escalate) { - pr_info("%s su found\n", syscall_name); - *filename_user = ksud_user_path(); - escape_with_root_profile(); // escalate !! - } else { - pr_info("%s su->sh!\n", syscall_name); - *filename_user = sh_user_path(); - } + if (!escalate) + goto no_escalate; + +#ifdef CONFIG_KSU_FEATURE_SULOG + ksu_sulog_emit(KSU_SULOG_EVENT_SUCOMPAT, NULL, NULL, GFP_KERNEL); +#endif + if (!!escape_with_root_profile()) + return 0; + + // NOTE: we only check file existence, not exec success! 
+ struct path kpath; + if (!!kern_path("/data/adb/ksud", 0, &kpath)) + goto no_ksud; + path_put(&kpath); + pr_info("%s su->ksud!\n", syscall_name); + *filename_user = ksud_user_path(); return 0; + +no_ksud: +no_escalate: + pr_info("%s su->sh!\n", syscall_name); + *filename_user = sh_user_path(); + return 0; + } // sys_faccessat @@ -192,11 +194,15 @@ int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags) } // sys_execve, compat_sys_execve -int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, - void *__never_use_argv, void *__never_use_envp, - int *__never_use_flags) +static int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, + void *argv, void *envp, int *flags) { - sys_execve_escape_ksud(filename_user); + if (unlikely(!ksu_boot_completed)) + sys_execve_escape_ksud(filename_user); + +#ifdef CONFIG_KSU_FEATURE_ADBROOT + ksu_adb_root_handle_execve(filename_user, (void ***)envp); +#endif if (!is_su_allowed((const void **)filename_user)) return 0; @@ -204,144 +210,93 @@ int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, return ksu_sucompat_user_common(filename_user, "sys_execve", true, 'x'); } -// getname_flags on fs/namei.c, this hooks ALL fs-related syscalls. -// NOT RECOMMENDED for daily use. mostly for debugging purposes. 
-int ksu_getname_flags_user(const char __user **filename_user, int flags) +static noinline int ksu_sucompat_kernel_common(void *filename_ptr, const char *function_name, bool escalate) { - if (!is_su_allowed((const void **)filename_user)) - return 0; - // sys_execve always calls getname, which sets flags = 0 on getname_flags - // we can use it to deduce if caller is likely execve - - uint8_t sym = '$'; - bool escalate = false; - - if (!flags) { - escalate = true; - sym = 'x'; - } + if (likely(memcmp(filename_ptr, SU_PATH, sizeof(SU_PATH)))) + return 0; - return ksu_sucompat_user_common(filename_user, "getname_flags", escalate, sym); -} + // we only handle execve here after removing vfs_statx hook for >= 6.1 + write_sulog('x'); -static int ksu_sucompat_kernel_common(void *filename_ptr, const char *function_name, bool escalate, const uint8_t sym) -{ + if (!escalate) + goto no_escalate; - if (likely(memcmp(filename_ptr, SU_PATH, sizeof(SU_PATH)))) +#ifdef CONFIG_KSU_FEATURE_SULOG + ksu_sulog_emit(KSU_SULOG_EVENT_SUCOMPAT, NULL, NULL, GFP_KERNEL); +#endif + if (!!escape_with_root_profile()) return 0; - write_sulog(sym); + // NOTE: we only check file existence, not exec success! 
+ struct path kpath; + if (!!kern_path("/data/adb/ksud", 0, &kpath)) + goto no_ksud; - if (escalate) { - pr_info("%s su found\n", function_name); - memcpy(filename_ptr, KSUD_PATH, sizeof(KSUD_PATH)); - escape_with_root_profile(); - } else { - pr_info("%s su->sh\n", function_name); - memcpy(filename_ptr, SH_PATH, sizeof(SH_PATH)); - } + path_put(&kpath); + pr_info("%s su->ksud!\n", function_name); + memcpy(filename_ptr, KSUD_PATH, sizeof(KSUD_PATH)); + return 0; + +no_ksud: +no_escalate: + pr_info("%s su->sh!\n", function_name); + memcpy(filename_ptr, SH_PATH, sizeof(SH_PATH)); return 0; + } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0) // for do_execveat_common / do_execve_common on >= 3.14 // take note: struct filename **filename -int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr, - void *__never_use_argv, void *__never_use_envp, - int *__never_use_flags) +int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv, void *envp, int *flags) { - kernel_execve_escape_ksud((void *)(*filename_ptr)->name); + if (unlikely(!ksu_boot_completed)) + kernel_execve_escape_ksud((void *)(*filename_ptr)->name); +#ifdef CONFIG_KSU_FEATURE_ADBROOT + ksu_adb_root_handle_execveat((void *)(*filename_ptr)->name, envp); +#endif if (!is_su_allowed((const void **)filename_ptr)) return 0; - // struct filename *filename = *filename_ptr; - // return ksu_do_execveat_common((void *)filename->name, "do_execveat_common"); - // nvm this, just inline - - return ksu_sucompat_kernel_common((void *)(*filename_ptr)->name, "do_execveat_common", true, 'x'); + return ksu_sucompat_kernel_common((void *)(*filename_ptr)->name, "do_execveat_common", true); } - -// for compatibility to old hooks -int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv, - void *envp, int *flags) +int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr, void *argv, void *envp, int *flags) { - kernel_execve_escape_ksud((void 
*)(*filename_ptr)->name); - - if (!is_su_allowed((const void **)filename_ptr)) - return 0; - - return ksu_sucompat_kernel_common((void *)(*filename_ptr)->name, "do_execveat_common", true, 'x'); + // literally just an alias due to old hooks + return ksu_handle_execveat(fd, filename_ptr, argv, envp, flags); } #else // for do_execve_common on < 3.14 // take note: char **filename -int ksu_legacy_execve_sucompat(const char **filename_ptr, - void *__never_use_argv, - void *__never_use_envp) +int ksu_legacy_execve_sucompat(const char **filename_ptr, void *argv, void *envp) { - kernel_execve_escape_ksud((void *)*filename_ptr); - - if (!is_su_allowed((const void **)filename_ptr)) - return 0; + if (unlikely(!ksu_boot_completed)) + kernel_execve_escape_ksud((void *)*filename_ptr); - return ksu_sucompat_kernel_common((void *)*filename_ptr, "do_execve_common", true, 'x'); -} +#ifdef CONFIG_KSU_FEATURE_ADBROOT + ksu_adb_root_handle_execveat((void *)*filename_ptr, envp); #endif - -// vfs_statx for 5.18+ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 18, 0) -int ksu_handle_vfs_statx(void *__never_use_dfd, struct filename **filename_ptr, - void *__never_use_flags, void **__never_use_stat, - void *__never_use_request_mask) -{ if (!is_su_allowed((const void **)filename_ptr)) return 0; - return ksu_sucompat_kernel_common((void *)(*filename_ptr)->name, "vfs_statx", false, 's'); + return ksu_sucompat_kernel_common((void *)*filename_ptr, "do_execve_common", true); } #endif -// getname_flags on fs/namei.c, this hooks ALL fs-related syscalls. -// put the hook right after usercopy -// NOT RECOMMENDED for daily use. mostly for debugging purposes. 
-int ksu_getname_flags_kernel(char **kname, int flags) -{ - if (!is_su_allowed((const void **)kname)) - return 0; - - uint8_t sym = '$'; - bool escalate = false; - - if (!flags) { - escalate = true; - sym = 'x'; - } - - return ksu_sucompat_kernel_common((void *)*kname, "getname_flags", escalate, sym); -} - #ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE static void syscall_table_sucompat_enable(); static void syscall_table_sucompat_disable(); -#endif - -#ifdef CONFIG_KSU_KRETPROBES_SUCOMPAT -static void rp_sucompat_exit(); -static void rp_sucompat_init(); +#else +static inline void syscall_table_sucompat_enable() { } // no-op +static inline void syscall_table_sucompat_disable() { } // no-op #endif static void ksu_sucompat_enable() { -#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE syscall_table_sucompat_enable(); -#endif - -#ifdef CONFIG_KSU_KRETPROBES_SUCOMPAT - rp_sucompat_init(); -#endif ksu_su_compat_enabled = true; pr_info("%s: hooks enabled: exec, faccessat, stat\n", __func__); @@ -350,13 +305,7 @@ static void ksu_sucompat_enable() static void ksu_sucompat_disable() { -#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE syscall_table_sucompat_disable(); -#endif - -#ifdef CONFIG_KSU_KRETPROBES_SUCOMPAT - rp_sucompat_exit(); -#endif ksu_su_compat_enabled = false; pr_info("%s: hooks disabled: exec, faccessat, stat\n", __func__); @@ -397,14 +346,14 @@ static const struct ksu_feature_handler su_compat_handler = { }; // sucompat: permited process can execute 'su' to gain root access. 
-void ksu_sucompat_init() +void __init ksu_sucompat_init() { if (ksu_register_feature_handler(&su_compat_handler)) { pr_err("Failed to register su_compat feature handler\n"); } } -void ksu_sucompat_exit() +void __exit ksu_sucompat_exit() { ksu_unregister_feature_handler(KSU_FEATURE_SU_COMPAT); } diff --git a/drivers/kernelsu/sucompat.h b/drivers/kernelsu/feature/sucompat.h similarity index 58% rename from drivers/kernelsu/sucompat.h rename to drivers/kernelsu/feature/sucompat.h index 52c30780a7b2..580384ee9c6c 100644 --- a/drivers/kernelsu/sucompat.h +++ b/drivers/kernelsu/feature/sucompat.h @@ -1,8 +1,5 @@ #ifndef __KSU_H_SUCOMPAT #define __KSU_H_SUCOMPAT -#include -#include -#include void ksu_sucompat_init(void); void ksu_sucompat_exit(void); diff --git a/drivers/kernelsu/feature/sulog.c b/drivers/kernelsu/feature/sulog.c new file mode 100644 index 000000000000..9f76805ca4f6 --- /dev/null +++ b/drivers/kernelsu/feature/sulog.c @@ -0,0 +1,57 @@ +static bool ksu_sulog_enabled __read_mostly = false; + +static int sulog_feature_get(u64 *value) +{ + *value = ksu_sulog_enabled ? 
1 : 0; + return 0; +} + +static int sulog_feature_set(u64 value) +{ + bool enable = value != 0; + + ksu_sulog_enabled = enable; + pr_info("sulog: set to %d\n", enable); + return 0; +} + +static const struct ksu_feature_handler sulog_handler = { + .feature_id = KSU_FEATURE_SULOG, + .name = "sulog", + .get_handler = sulog_feature_get, + .set_handler = sulog_feature_set, +}; + +bool ksu_sulog_is_enabled(void) +{ + return ksu_sulog_enabled; +} + +void __init ksu_sulog_init(void) +{ + int ret; + + ksu_sulog_enabled = false; + + ret = ksu_register_feature_handler(&sulog_handler); + if (ret) { + pr_err("Failed to register sulog feature handler\n"); + return; + } + + ret = ksu_sulog_events_init(); + if (ret) { + pr_err("Failed to initialize sulog events: %d\n", ret); + ksu_unregister_feature_handler(KSU_FEATURE_SULOG); + return; + } + + ksu_sulog_fd_init(); +} + +void __exit ksu_sulog_exit(void) +{ + ksu_sulog_fd_exit(); + ksu_sulog_events_exit(); + ksu_unregister_feature_handler(KSU_FEATURE_SULOG); +} diff --git a/drivers/kernelsu/feature/sulog.h b/drivers/kernelsu/feature/sulog.h new file mode 100644 index 000000000000..565f59113cd0 --- /dev/null +++ b/drivers/kernelsu/feature/sulog.h @@ -0,0 +1,8 @@ +#ifndef __KSU_H_SULOG +#define __KSU_H_SULOG + +bool ksu_sulog_is_enabled(void); +void ksu_sulog_init(void); +void ksu_sulog_exit(void); + +#endif diff --git a/drivers/kernelsu/core_hook.c b/drivers/kernelsu/hook/core_hook.c similarity index 54% rename from drivers/kernelsu/core_hook.c rename to drivers/kernelsu/hook/core_hook.c index 3d76300f7950..1f203cdf44c0 100644 --- a/drivers/kernelsu/core_hook.c +++ b/drivers/kernelsu/hook/core_hook.c @@ -1,61 +1,9 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // sys_umount - #ifdef 
CONFIG_KSU_LSM_SECURITY_HOOKS #define LSM_HANDLER_TYPE static int #else #define LSM_HANDLER_TYPE int #endif -static bool ksu_kernel_umount_enabled = true; - -static int kernel_umount_feature_get(u64 *value) -{ - *value = ksu_kernel_umount_enabled ? 1 : 0; - return 0; -} - -static int kernel_umount_feature_set(u64 value) -{ - bool enable = value != 0; - ksu_kernel_umount_enabled = enable; - pr_info("kernel_umount: set to %d\n", enable); - return 0; -} - -static const struct ksu_feature_handler kernel_umount_handler = { - .feature_id = KSU_FEATURE_KERNEL_UMOUNT, - .name = "kernel_umount", - .get_handler = kernel_umount_feature_get, - .set_handler = kernel_umount_feature_set, -}; - LSM_HANDLER_TYPE ksu_handle_rename(struct dentry *old_dentry, struct dentry *new_dentry) { if (!current->mm) { @@ -96,67 +44,6 @@ LSM_HANDLER_TYPE ksu_handle_rename(struct dentry *old_dentry, struct dentry *new return 0; } -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) -__weak int path_umount(struct path *path, int flags) -{ - char buf[256] = {0}; - int ret; - - // -1 on the size as implicit null termination - // as we zero init the thing - char *usermnt = d_path(path, buf, sizeof(buf) - 1); - if (!(usermnt && usermnt != buf)) { - ret = -ENOENT; - goto out; - } - - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) - ret = ksys_umount((char __user *)usermnt, flags); -#else - ret = (int)sys_umount((char __user *)usermnt, flags); -#endif - - set_fs(old_fs); - - // release ref here! 
user_path_at increases it - // then only cleans for itself -out: - path_put(path); - return ret; -} -#endif - -static void ksu_umount_mnt(const char *mnt, struct path *path, int flags) -{ - int err = path_umount(path, flags); - - // upstream actually has a UAF here: path->dentry after dput - // but its fine as umount always succeeds - // that code path is very cold - if (err) - pr_info("umount %s failed: %d\n", mnt, err); -} - -static void try_umount(const char *mnt, int flags) -{ - struct path path; - int err = kern_path(mnt, 0, &path); - if (err) { - return; - } - - if (path.dentry != path.mnt->mnt_root) { - // it is not root mountpoint, maybe umounted by others already. - path_put(&path); - return; - } - - ksu_umount_mnt(mnt, &path, flags); -} - LSM_HANDLER_TYPE ksu_handle_setuid(struct cred *new, const struct cred *old) { if (!new || !old) { @@ -183,88 +70,32 @@ LSM_HANDLER_TYPE ksu_handle_setuid(struct cred *new, const struct cred *old) return 0; } - // if there isn't any module mounted, just ignore it! - if (!ksu_module_mounted) { - return 0; - } - - if (!ksu_kernel_umount_enabled) { - return 0; - } - - if (!ksu_cred) { - return 0; - } - - // There are 5 scenarios: - // 1. Normal app: zygote -> appuid - // 2. Isolated process forked from zygote: zygote -> isolated_process - // 3. App zygote forked from zygote: zygote -> appuid - // 4. Isolated process froked from app zygote: appuid -> isolated_process (already handled by 3) - // 5. Isolated process froked from webview zygote (no need to handle, app cannot run custom code) - if (!is_appuid(new_uid) && !is_isolated_process(new_uid)) { - return 0; - } - - if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid)) { - return 0; - } - - // check old process's selinux context, if it is not zygote, ignore it! - // because some su apps may setuid to untrusted_app but they are in global mount namespace - // when we umount for such process, that is a disaster! 
- // also handle case 4 and 5 - bool is_zygote_child = is_zygote(old); - if (!is_zygote_child) { - pr_info("handle umount ignore non zygote child: %d\n", current->pid); - return 0; - } - - // umount the target mnt - pr_info("handle umount for uid: %d, pid: %d\n", new_uid, current->pid); - - const struct cred *saved = override_creds(ksu_cred); - - struct mount_entry *entry; - down_read(&mount_list_lock); - list_for_each_entry(entry, &mount_list, list) { - pr_info("%s: unmounting: %s flags 0x%x\n", __func__, entry->umountable, entry->flags); - try_umount(entry->umountable, entry->flags); - } - up_read(&mount_list_lock); - - revert_creds(saved); - - return 0; + return ksu_handle_umount(new, old); } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) -static void ksu_grab_init_session_keyring(const char *filename); -#endif - LSM_HANDLER_TYPE ksu_bprm_check(struct linux_binprm *bprm) { +#ifdef CONFIG_KSU_FEATURE_SULOG + if (unlikely(!current->seccomp.mode)) + ksu_sulog_emit_bprm((const char *)bprm->filename); +#endif + if (likely(!ksu_execveat_hook)) return 0; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) ksu_grab_init_session_keyring((const char *)bprm->filename); -#endif - ksu_handle_pre_ksud((char *)bprm->filename); + ksu_handle_pre_ksud((const char *)bprm->filename); return 0; } -bool ksu_vfs_read_hook __read_mostly; -static void ksu_handle_initrc(struct file *file); - LSM_HANDLER_TYPE ksu_file_permission(struct file *file, int mask) { if (likely(!ksu_vfs_read_hook)) return 0; - ksu_handle_initrc(file); + ksu_install_rc_hook(file); return 0; } @@ -283,12 +114,13 @@ static int ksu_task_fix_setuid(struct cred *new, const struct cred *old, } #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) -#include static struct security_hook_list ksu_hooks[] = { LSM_HOOK_INIT(inode_rename, ksu_inode_rename), LSM_HOOK_INIT(task_fix_setuid, ksu_task_fix_setuid), 
LSM_HOOK_INIT(bprm_check_security, ksu_bprm_check), +#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) && !defined(CONFIG_KSU_KPROBES_KSUD) LSM_HOOK_INIT(file_permission, ksu_file_permission), +#endif }; #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) @@ -311,7 +143,7 @@ static void ksu_lsm_hook_init(void) extern struct security_operations selinux_ops; static int (*orig_inode_rename) (struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry); + struct inode *new_dir, struct dentry *new_dentry) = NULL; static int hook_inode_rename(struct inode *old_inode, struct dentry *old_dentry, struct inode *new_inode, struct dentry *new_dentry) { @@ -319,21 +151,21 @@ static int hook_inode_rename(struct inode *old_inode, struct dentry *old_dentry, return orig_inode_rename(old_inode, old_dentry, new_inode, new_dentry); } -static int (*orig_task_fix_setuid) (struct cred *new, const struct cred *old, int flags); +static int (*orig_task_fix_setuid) (struct cred *new, const struct cred *old, int flags) = NULL; static int hook_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { ksu_task_fix_setuid(new, old, flags); return orig_task_fix_setuid(new, old, flags); } -static int (*orig_bprm_check_security)(struct linux_binprm *bprm); +static int (*orig_bprm_check_security)(struct linux_binprm *bprm) = NULL; static int hook_bprm_check_security(struct linux_binprm *bprm) { ksu_bprm_check(bprm); return orig_bprm_check_security(bprm); } -static int (*orig_file_permission) (struct file *file, int mask); +static int (*orig_file_permission) (struct file *file, int mask) = NULL; static int hook_file_permission(struct file *file, int mask) { @@ -356,25 +188,28 @@ static void ksu_lsm_hook_restore(void) pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops ); preempt_disable(); + local_irq_disable(); +#ifndef CONFIG_KSU_FEATURE_SULOG if (orig_bprm_check_security) { pr_info("%s: restoring: 0x%lx to 0x%lx\n", 
__func__, (long)ops->bprm_check_security, (long)orig_bprm_check_security); ops->bprm_check_security = orig_bprm_check_security; } +#endif if (orig_file_permission) { pr_info("%s: restoring: 0x%lx to 0x%lx\n", __func__, (long)ops->file_permission, (long)orig_file_permission); ops->file_permission = orig_file_permission; } + smp_mb(); + + local_irq_enable(); preempt_enable(); - smp_mb(); return; } -static struct task_struct *unhook_thread; - static int execveat_hook_wait_fn(void *data) { loop_start: @@ -391,11 +226,7 @@ static int execveat_hook_wait_fn(void *data) static void execveat_hook_wait_thread() { - unhook_thread = kthread_run(execveat_hook_wait_fn, NULL, "unhook"); - if (IS_ERR(unhook_thread)) { - unhook_thread = NULL; - return; - } + kthread_run(execveat_hook_wait_fn, NULL, "unhook"); } static void ksu_lsm_hook_init(void) @@ -413,6 +244,7 @@ static void ksu_lsm_hook_init(void) pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops ); preempt_disable(); + local_irq_disable(); orig_inode_rename = ops->inode_rename; ops->inode_rename = hook_inode_rename; @@ -423,13 +255,16 @@ static void ksu_lsm_hook_init(void) orig_bprm_check_security = ops->bprm_check_security; ops->bprm_check_security = hook_bprm_check_security; +#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) && !defined(CONFIG_KSU_KPROBES_KSUD) orig_file_permission = ops->file_permission; ops->file_permission = hook_file_permission; +#endif - preempt_enable(); - smp_mb(); + local_irq_enable(); + preempt_enable(); + execveat_hook_wait_thread(); return; } @@ -437,16 +272,10 @@ static void ksu_lsm_hook_init(void) #endif // < 4.2 #else -void __init ksu_lsm_hook_init(void) -{ - // nothing, no-op -} +void __init ksu_lsm_hook_init(void) { } // nothing, no-op #endif // CONFIG_KSU_LSM_SECURITY_HOOKS void __init ksu_core_init(void) { ksu_lsm_hook_init(); - if (ksu_register_feature_handler(&kernel_umount_handler)) { - pr_err("Failed to register kernel_umount feature handler\n"); - } } 
diff --git a/drivers/kernelsu/kp_ksud.c b/drivers/kernelsu/hook/kp_ksud.c similarity index 69% rename from drivers/kernelsu/kp_ksud.c rename to drivers/kernelsu/hook/kp_ksud.c index ece6cf12e456..39c2e654eff5 100644 --- a/drivers/kernelsu/kp_ksud.c +++ b/drivers/kernelsu/hook/kp_ksud.c @@ -1,29 +1,27 @@ -#include #include -#include -#include -#include -#include -#include -#include - -static struct task_struct *unregister_thread; // sys_newfstat rp // upstream: https://github.com/tiann/KernelSU/commit/df640917d11dd0eff1b34ea53ec3c0dc49667002 // this is a bit different from copy_from_user_retry -// here we just disable preempt and try nofault again +// here we just enable preempt and try again // we use this inside context that can't sleep -static long ksu_copy_from_user_nofault_retry(void *to, const void __user *from, unsigned long count) +static __always_inline long ksu_copy_from_user_fuck_faults(void *to, const void __user *from, unsigned long count) { long ret = copy_from_user_nofault(to, from, count); if (likely(!ret)) return ret; - preempt_disable(); - ret = copy_from_user_nofault(to, from, count); - preempt_enable(); + bool got_flipped = false; + if (!preemptible()) { + preempt_enable(); + got_flipped = true; + } + + ret = copy_from_user(to, from, count); + + if (got_flipped) + preempt_disable(); return ret; } @@ -35,7 +33,7 @@ static int sys_newfstat_handler_pre(struct kretprobe_instance *p, struct pt_regs void *statbuf = PT_REGS_PARM2(real_regs); *(void **)&p->data = NULL; - if (!is_init(get_current_cred())) + if (!is_init(current_cred())) return 0; struct file *file = fget(fd); @@ -62,20 +60,20 @@ static int sys_newfstat_handler_post(struct kretprobe_instance *p, struct pt_reg void __user *st_size_ptr = statbuf + offsetof(struct stat, st_size); long size, new_size; - if (ksu_copy_from_user_nofault_retry(&size, st_size_ptr, sizeof(long))) { - pr_info("kp_ksud: newfstat: read statbuf 0x%lx failed \n", (unsigned long)st_size_ptr); + if 
(ksu_copy_from_user_fuck_faults(&size, st_size_ptr, sizeof(long))) { + pr_info("kp_ksud: sys_newfstat: read statbuf 0x%lx failed \n", (unsigned long)st_size_ptr); return 0; } new_size = size + ksu_rc_len; - pr_info("kp_ksud: newfstat: adding ksu_rc_len: %ld -> %ld \n", size, new_size); + pr_info("kp_ksud: sys_newfstat: adding ksu_rc_len: %ld -> %ld \n", size, new_size); // I do NOT think this matters much for now, we can use copy_to_user // if SHTF then we backport cope_to_user_nofault if (!copy_to_user(st_size_ptr, &new_size, sizeof(long))) - pr_info("kp_ksud: newfstat: added ksu_rc_len \n"); + pr_info("kp_ksud: sys_newfstat: added ksu_rc_len \n"); else - pr_info("kp_ksud: newfstat: add ksu_rc_len failed: statbuf 0x%lx \n", (unsigned long)st_size_ptr); + pr_info("kp_ksud: sys_newfstat: add ksu_rc_len failed: statbuf 0x%lx \n", (unsigned long)st_size_ptr); return 0; } @@ -95,10 +93,11 @@ static int sys_fstat64_handler_pre(struct kretprobe_instance *p, struct pt_regs void *statbuf = PT_REGS_PARM2(real_regs); *(void **)&p->data = NULL; - if (!is_init(get_current_cred())) + if (!is_init(current_cred())) return 0; - struct file *file = fget(fd); + // WARNING: LE-only!!! 
+ struct file *file = fget(*(unsigned int *)&fd); if (!file) return 0; @@ -123,18 +122,18 @@ static int sys_fstat64_handler_post(struct kretprobe_instance *p, struct pt_regs void __user *st_size_ptr = statbuf + offsetof(struct stat64, st_size); long size, new_size; - if (ksu_copy_from_user_nofault_retry(&size, st_size_ptr, sizeof(long long))) { - pr_info("kp_ksud: fstat64: read statbuf 0x%lx failed \n", (unsigned long)st_size_ptr); + if (ksu_copy_from_user_fuck_faults(&size, st_size_ptr, sizeof(long long))) { + pr_info("kp_ksud: sys_fstat64: read statbuf 0x%lx failed \n", (unsigned long)st_size_ptr); return 0; } new_size = size + ksu_rc_len; - pr_info("kp_ksud: fstat64: adding ksu_rc_len: %ld -> %ld \n", size, new_size); + pr_info("kp_ksud: sys_fstat64: adding ksu_rc_len: %ld -> %ld \n", size, new_size); if (!copy_to_user(st_size_ptr, &new_size, sizeof(long))) - pr_info("kp_ksud: fstat64: added ksu_rc_len \n"); + pr_info("kp_ksud: sys_fstat64: added ksu_rc_len \n"); else - pr_info("kp_ksud: fstat64: add ksu_rc_len failed: statbuf 0x%lx \n", (unsigned long)st_size_ptr); + pr_info("kp_ksud: sys_fstat64: add ksu_rc_len failed: statbuf 0x%lx \n", (unsigned long)st_size_ptr); return 0; } @@ -147,7 +146,21 @@ static struct kretprobe sys_fstat64_rp = { }; #endif -#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE +// sys_read +static int sys_read_handler_pre(struct kprobe *p, struct pt_regs *regs) +{ + struct pt_regs *real_regs = PT_REAL_REGS(regs); + unsigned int fd = (int)PT_REGS_PARM1(real_regs); + + ksu_handle_sys_read_fd(fd); + return 0; +} + +static struct kprobe sys_read_kp = { + .symbol_name = SYS_READ_SYMBOL, + .pre_handler = sys_read_handler_pre, +}; + // sys_reboot extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg); @@ -166,7 +179,6 @@ static struct kprobe sys_reboot_kp = { .symbol_name = SYS_REBOOT_SYMBOL, .pre_handler = sys_reboot_handler_pre, }; -#endif static int unregister_kprobe_function(void *data) { @@ -187,27 +199,22 @@ 
static int unregister_kprobe_function(void *data) pr_info("kp_ksud: unregister sys_fstat64_rp!\n"); #endif - unregister_thread = NULL; + unregister_kprobe(&sys_read_kp); + pr_info("kp_ksud: unregister sys_read_kp!\n"); return 0; } static void unregister_kprobe_thread() { - unregister_thread = kthread_run(unregister_kprobe_function, NULL, "kprobe_unregister"); - if (IS_ERR(unregister_thread)) { - unregister_thread = NULL; - return; - } + kthread_run(unregister_kprobe_function, NULL, "kp_unreg"); } static void kp_ksud_init() { -#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE int ret = register_kprobe(&sys_reboot_kp); // dont unreg this one pr_info("kp_ksud: sys_reboot_kp: %d\n", ret); -#endif int ret2 = register_kretprobe(&sys_newfstat_rp); pr_info("kp_ksud: sys_newfstat_rp: %d\n", ret2); @@ -217,5 +224,8 @@ static void kp_ksud_init() pr_info("kp_ksud: sys_fstat64_rp: %d\n", ret3); #endif + int ret4 = register_kprobe(&sys_read_kp); + pr_info("kp_ksud: sys_read_kp: %d\n", ret4); + unregister_kprobe_thread(); } diff --git a/drivers/kernelsu/syscall_table_hook_arm.c b/drivers/kernelsu/hook/syscall_table_hook_arm.c similarity index 89% rename from drivers/kernelsu/syscall_table_hook_arm.c rename to drivers/kernelsu/hook/syscall_table_hook_arm.c index 9ae4ff6aaf7e..6cabc75bf3f1 100644 --- a/drivers/kernelsu/syscall_table_hook_arm.c +++ b/drivers/kernelsu/hook/syscall_table_hook_arm.c @@ -1,6 +1,3 @@ -#include -#include - #ifndef CONFIG_ARM #error "only meant for ARM" #endif @@ -16,6 +13,7 @@ #define __ARMEABI_faccessat 334 #define __ARMEABI_fstatat64 327 #define __ARMEABI_fstat64 197 +#define __ARMEABI_read 3 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) @@ -38,8 +36,9 @@ static syscall_fn_t armeabi_execve = NULL; static long hook_armeabi_execve(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[0]; + void ***envp = (void ***)&regs->regs[2]; - ksu_handle_execve_sucompat(NULL, filename, NULL, NULL, NULL); +
ksu_handle_execve_sucompat(NULL, filename, NULL, envp, NULL); return armeabi_execve(regs); } @@ -73,6 +72,15 @@ static long hook_armeabi_fstat64_ret(const struct pt_regs *regs) return ret; } +static syscall_fn_t armeabi_read = NULL; +static long hook_armeabi_read(const struct pt_regs *regs) +{ + unsigned int fd = (unsigned int)regs->regs[0]; + + ksu_handle_sys_read_fd(fd); + return armeabi_read(regs); +} + #else // END OF 4.19+ SYSCALL HANDLERS static long (*armeabi_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) = NULL; @@ -89,7 +97,7 @@ static long hook_armeabi_execve(const char __user * filename, const char __user *const __user * argv, const char __user *const __user * envp) { - ksu_handle_execve_sucompat(NULL, &filename, NULL, NULL, NULL); + ksu_handle_execve_sucompat(NULL, &filename, NULL, (void ***)&envp, NULL); return armeabi_execve(filename, argv, envp); } @@ -116,6 +124,12 @@ static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * st return ret; } +static long (*armeabi_read)(unsigned int fd, char __user *buf, size_t count) = NULL; +static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count) +{ + ksu_handle_sys_read_fd(fd); + return armeabi_read(fd, buf, count); +} #endif // SYSCALL HANDLERS @@ -188,6 +202,8 @@ static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, vo smp_mb(); } +extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size); + static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) { void **sctable = (void **)target_table; @@ -275,9 +291,8 @@ static int ksu_syscall_table_restore() if (FORCE_VOLATILE(ksu_vfs_read_hook)) goto loop_start; -#ifndef CONFIG_KSU_KPROBES_KSUD restore_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); -#endif + restore_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void 
*)compat_sys_call_table); return 0; } @@ -295,26 +310,31 @@ static void ksu_syscall_table_hook_init() read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table); read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table); -#ifndef CONFIG_KSU_KPROBES_KSUD + // will be unregged read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); -#endif + read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table); vfs_read_hook_wait_thread(); // start unreg kthread } +static DEFINE_MUTEX(sucompat_toggle_mutex); + static void syscall_table_sucompat_enable() { - + mutex_lock(&sucompat_toggle_mutex); read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table); read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table); read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table); + mutex_unlock(&sucompat_toggle_mutex); } static void syscall_table_sucompat_disable() { + mutex_lock(&sucompat_toggle_mutex); restore_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table); restore_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table); restore_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table); + mutex_unlock(&sucompat_toggle_mutex); } // EOF diff --git a/drivers/kernelsu/syscall_table_hook.c b/drivers/kernelsu/hook/syscall_table_hook_arm64.c similarity index 85% rename from drivers/kernelsu/syscall_table_hook.c rename to 
drivers/kernelsu/hook/syscall_table_hook_arm64.c index cebb94fdb211..232cb16e18d8 100644 --- a/drivers/kernelsu/syscall_table_hook.c +++ b/drivers/kernelsu/hook/syscall_table_hook_arm64.c @@ -1,5 +1,3 @@ -#include - #ifndef CONFIG_ARM64 #error "only meant for ARM64" #endif @@ -15,6 +13,7 @@ #define __AARCH64_faccessat 48 #define __AARCH64_newfstatat 79 #define __AARCH64_newfstat 80 +#define __AARCH64_read 63 // NOTE: CONFIG_COMPAT implies __ARCH_WANT_COMPAT_STAT64 (fstatat64, fstat64) #define __ARMEABI_reboot 88 @@ -22,6 +21,7 @@ #define __ARMEABI_faccessat 334 #define __ARMEABI_fstatat64 327 #define __ARMEABI_fstat64 197 +#define __ARMEABI_read 3 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) @@ -44,8 +44,9 @@ static syscall_fn_t aarch64_execve = NULL; static long hook_aarch64_execve(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[0]; + void ***envp = (void ***)&regs->regs[2]; - ksu_handle_execve_sucompat(NULL, filename, NULL, NULL, NULL); + ksu_handle_execve_sucompat(NULL, filename, NULL, envp, NULL); return aarch64_execve(regs); } @@ -79,6 +80,15 @@ static long hook_aarch64_newfstat_ret(const struct pt_regs *regs) return ret; } +static syscall_fn_t aarch64_read = NULL; +static long hook_aarch64_read(const struct pt_regs *regs) +{ + unsigned int fd = (unsigned int)regs->regs[0]; + + ksu_handle_sys_read_fd(fd); + return aarch64_read(regs); +} + #ifdef CONFIG_COMPAT static syscall_fn_t armeabi_reboot = NULL; static long hook_armeabi_reboot(const struct pt_regs *regs) @@ -96,8 +106,9 @@ static syscall_fn_t armeabi_execve = NULL; static long hook_armeabi_execve(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[0]; + void ***envp = (void ***)&regs->regs[2]; - ksu_handle_execve_sucompat(NULL, filename, NULL, NULL, NULL); + ksu_handle_execve_sucompat(NULL, filename, NULL, envp, NULL); return armeabi_execve(regs); } @@ -130,6 +141,16 @@ static long hook_armeabi_fstat64_ret(const struct
pt_regs *regs) ksu_handle_fstat64_ret(fd, statbuf); return ret; } + +static syscall_fn_t armeabi_read = NULL; +static long hook_armeabi_read(const struct pt_regs *regs) +{ + unsigned int fd = (unsigned int)regs->regs[0]; + + ksu_handle_sys_read_fd(fd); + return armeabi_read(regs); +} + #endif // CONFIG_COMPAT #else // END OF 4.19+ SYSCALL HANDLERS @@ -148,7 +169,7 @@ static long hook_aarch64_execve(const char __user * filename, const char __user *const __user * argv, const char __user *const __user * envp) { - ksu_handle_execve_sucompat((int *)AT_FDCWD, &filename, NULL, NULL, NULL); + ksu_handle_execve_sucompat(NULL, &filename, NULL, (void ***)&envp, NULL); return aarch64_execve(filename, argv, envp); } @@ -175,6 +196,13 @@ static long hook_aarch64_newfstat_ret(unsigned int fd, struct stat __user * stat return ret; } +static long (*aarch64_read)(unsigned int fd, char __user *buf, size_t count) = NULL; +static long hook_aarch64_read(unsigned int fd, char __user *buf, size_t count) +{ + ksu_handle_sys_read_fd(fd); + return aarch64_read(fd, buf, count); +} + #ifdef CONFIG_COMPAT extern const void *compat_sys_call_table[]; @@ -192,7 +220,7 @@ static long hook_armeabi_execve(const char __user * filename, const compat_uptr_t __user * argv, const compat_uptr_t __user * envp) { - ksu_handle_execve_sucompat(NULL, &filename, NULL, NULL, NULL); + ksu_handle_execve_sucompat(NULL, &filename, NULL, (void ***)&envp, NULL); return armeabi_execve(filename, argv, envp); } @@ -218,6 +246,14 @@ static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * st ksu_handle_fstat64_ret(&fd, &statbuf); return ret; } + +static long (*armeabi_read)(unsigned int fd, char __user *buf, size_t count) = NULL; +static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count) +{ + ksu_handle_sys_read_fd(fd); + return armeabi_read(fd, buf, count); +} + #endif // CONFIG_COMPAT #endif // SYSCALL HANDLERS @@ -291,6 +327,8 @@ static void read_and_replace_syscall(void 
*old_ptr, unsigned long syscall_nr, vo smp_mb(); } +extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size); + static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) { void **sctable = (void **)target_table; @@ -378,12 +416,12 @@ static int ksu_syscall_table_restore() if (FORCE_VOLATILE(ksu_vfs_read_hook)) goto loop_start; -#ifndef CONFIG_KSU_KPROBES_KSUD restore_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table); + restore_syscall((void *)&aarch64_read, __AARCH64_read, (void *)hook_aarch64_read, (void *)sys_call_table); #if defined(CONFIG_COMPAT) restore_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table); -#endif + restore_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table); #endif return 0; @@ -401,9 +439,9 @@ static void ksu_syscall_table_hook_init() read_and_replace_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table); read_and_replace_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table); -#ifndef CONFIG_KSU_KPROBES_KSUD + // will be unregged read_and_replace_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table); -#endif + read_and_replace_syscall((void *)&aarch64_read, __AARCH64_read, (void *)hook_aarch64_read, (void *)sys_call_table); #if defined(CONFIG_COMPAT) read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)compat_sys_call_table); @@ -411,17 +449,21 @@ static void ksu_syscall_table_hook_init() read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)compat_sys_call_table); read_and_replace_syscall((void 
*)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table); -#ifndef CONFIG_KSU_KPROBES_KSUD + // will be unregged read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table); -#endif + read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table); #endif // COMPAT vfs_read_hook_wait_thread(); // start unreg kthread } +static DEFINE_MUTEX(sucompat_toggle_mutex); + static void syscall_table_sucompat_enable() { + mutex_lock(&sucompat_toggle_mutex); + read_and_replace_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, (void *)sys_call_table); read_and_replace_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table); read_and_replace_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table); @@ -432,10 +474,13 @@ static void syscall_table_sucompat_enable() read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table); #endif + mutex_unlock(&sucompat_toggle_mutex); } static void syscall_table_sucompat_disable() { + mutex_lock(&sucompat_toggle_mutex); + restore_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, (void *)sys_call_table); restore_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table); restore_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table); @@ -446,78 +491,7 @@ static void syscall_table_sucompat_disable() restore_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table); #endif + mutex_unlock(&sucompat_toggle_mutex); } // EOF - -#if 0 // these are kept 
for posterity -static int override_security_head(void *head, const void *new_head, size_t len) -{ - unsigned long base = (unsigned long)head & PAGE_MASK; - unsigned long offset = offset_in_page(head); - - // this is impossible for our case because the page alignment - // but be careful for other cases! - BUG_ON(offset + len > PAGE_SIZE); - struct page *page = phys_to_page(__pa(base)); - if (!page) { - return -EFAULT; - } - - void *addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); - if (!addr) { - return -ENOMEM; - } - local_irq_disable(); - memcpy(addr + offset, new_head, len); - local_irq_enable(); - vunmap(addr); - return 0; -} - -// normally backported on msm 3.10, provide weak -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) -__weak int set_memory_ro(unsigned long addr, int numpages) { return 0; } -__weak int set_memory_rw(unsigned long addr, int numpages) { return 0; } -#endif - -// WARNING!!! void * abuse ahead! (type-punning, pointer-hiding!) -// old_ptr is actually void ** -// target_table is void *target_table[]; -static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) -{ - // *old_ptr = READ_ONCE(*((void **)sys_call_table + syscall_nr)); - // WRITE_ONCE(*((void **)sys_call_table + syscall_nr), new_ptr); - - // the one from zx2c4 looks like above, but the issue is that we dont have - // READ_ONCE and WRITE_ONCE on 3.x kernels, here we just force volatile everything - // since those are actually just forced-aligned-volatile-rw - - // void **syscall_addr = (void **)(sys_call_table + syscall_nr); - // sugar: *(a + b) == a[b]; , a + b == &a[b]; - - void **sctable = (void **)target_table; - void **syscall_addr = (void **)&sctable[syscall_nr]; - - // dont hook non-existing syscall - if (!FORCE_VOLATILE(*syscall_addr)) - return; - - pr_info("%s: syscall: #%d slot: 0x%lx new_ptr: 0x%lx \n", __func__, syscall_nr, *(long *)syscall_addr, (long)new_ptr); - - set_memory_rw(((unsigned long)syscall_addr & PAGE_MASK), 
1); - - barrier(); - *(void **)old_ptr = FORCE_VOLATILE(*syscall_addr); - - barrier(); - preempt_disable(); - FORCE_VOLATILE(*syscall_addr) = new_ptr; - preempt_enable(); - - set_memory_ro(((unsigned long)syscall_addr & PAGE_MASK), 1); - smp_mb(); - - return; -} -#endif diff --git a/drivers/kernelsu/arch.h b/drivers/kernelsu/include/arch.h similarity index 95% rename from drivers/kernelsu/arch.h rename to drivers/kernelsu/include/arch.h index 569ea3d14863..78377293c5d4 100644 --- a/drivers/kernelsu/arch.h +++ b/drivers/kernelsu/include/arch.h @@ -1,8 +1,6 @@ #ifndef __KSU_H_ARCH #define __KSU_H_ARCH -#include - #if defined(__aarch64__) #define __PT_PARM1_REG regs[0] @@ -23,11 +21,13 @@ #define SYS_REBOOT_SYMBOL "__arm64_sys_reboot" #define SYS_NEWFSTAT_SYMBOL "__arm64_sys_newfstat" #define SYS_FSTAT64_SYMBOL "__arm64_sys_fstat64" +#define SYS_READ_SYMBOL "__arm64_sys_read" #else #define SYS_EXECVE_SYMBOL "sys_execve" #define SYS_REBOOT_SYMBOL "sys_reboot" #define SYS_NEWFSTAT_SYMBOL "sys_newfstat" #define SYS_FSTAT64_SYMBOL "sys_fstat64" +#define SYS_READ_SYMBOL "sys_read" #endif #elif defined(__arm__) @@ -61,6 +61,7 @@ #define SYS_REBOOT_SYMBOL "sys_reboot" #define SYS_NEWFSTAT_SYMBOL "sys_newfstat" #define SYS_FSTAT64_SYMBOL "sys_fstat64" +#define SYS_READ_SYMBOL "sys_read" #elif defined(__x86_64__) @@ -83,11 +84,13 @@ #define SYS_REBOOT_SYMBOL "__x64_sys_reboot" #define SYS_NEWFSTAT_SYMBOL "__x64_sys_newfstat" #define SYS_FSTAT64_SYMBOL "__ia32_compat_sys_x86_fstat64" +#define SYS_READ_SYMBOL "__x64_sys_read" #else #define SYS_EXECVE_SYMBOL "sys_execve" #define SYS_REBOOT_SYMBOL "sys_reboot" #define SYS_NEWFSTAT_SYMBOL "sys_newfstat" #define SYS_FSTAT64_SYMBOL "sys_fstat64" +#define SYS_READ_SYMBOL "sys_read" #endif #else diff --git a/drivers/kernelsu/klog.h b/drivers/kernelsu/include/klog.h similarity index 82% rename from drivers/kernelsu/klog.h rename to drivers/kernelsu/include/klog.h index a934027fbeeb..6de40a66680e 100644 --- a/drivers/kernelsu/klog.h
+++ b/drivers/kernelsu/include/klog.h @@ -1,8 +1,6 @@ #ifndef __KSU_H_KLOG #define __KSU_H_KLOG -#include - #ifdef pr_fmt #undef pr_fmt #define pr_fmt(fmt) "KernelSU: " fmt diff --git a/drivers/kernelsu/ksu.h b/drivers/kernelsu/include/ksu.h similarity index 84% rename from drivers/kernelsu/ksu.h rename to drivers/kernelsu/include/ksu.h index 7b75ada2029e..5bc6b6c80709 100644 --- a/drivers/kernelsu/ksu.h +++ b/drivers/kernelsu/include/ksu.h @@ -1,10 +1,7 @@ #ifndef __KSU_H_KSU #define __KSU_H_KSU -#include -#include - -#define KERNEL_SU_VERSION 32322 +#define KERNEL_SU_VERSION 32449 #define EVENT_POST_FS_DATA 1 #define EVENT_BOOT_COMPLETED 2 diff --git a/drivers/kernelsu/include/uapi/app_profile.h b/drivers/kernelsu/include/uapi/app_profile.h new file mode 100644 index 000000000000..74ce7231e223 --- /dev/null +++ b/drivers/kernelsu/include/uapi/app_profile.h @@ -0,0 +1,61 @@ +#ifndef __KSU_UAPI_APP_PROFILE_H +#define __KSU_UAPI_APP_PROFILE_H + +#define KSU_APP_PROFILE_VER 3 +#define KSU_MAX_PACKAGE_NAME 256 +/* NGROUPS_MAX for Linux is 65535 generally, but we only supports 32 groups. */ +#define KSU_MAX_GROUPS 32 +#define KSU_SELINUX_DOMAIN 64 + +struct root_profile { + __s32 uid; + __s32 gid; + + __u32 groups_count; + __s32 groups[KSU_MAX_GROUPS]; + + /* kernel_cap_t is u32[2] for capabilities v3 */ + struct { + __u64 effective; + __u64 permitted; + __u64 inheritable; + } capabilities; + + char selinux_domain[KSU_SELINUX_DOMAIN]; + + __s32 namespaces; +}; + +struct non_root_profile { + bool umount_modules; +}; + +struct app_profile { + /* + * It may be utilized for backward compatibility, although we have never + * explicitly made any promises regarding this. 
+ */ + __u32 version; + + /* this is usually the package of the app, but can be other value for special apps */ + char key[KSU_MAX_PACKAGE_NAME]; + __s32 current_uid; + bool allow_su; + + union { + struct { + bool use_default; + char template_name[KSU_MAX_PACKAGE_NAME]; + + struct root_profile profile; + } rp_config; + + struct { + bool use_default; + + struct non_root_profile profile; + } nrp_config; + }; +}; + +#endif diff --git a/drivers/kernelsu/include/uapi/feature.h b/drivers/kernelsu/include/uapi/feature.h new file mode 100644 index 000000000000..aafd7720148b --- /dev/null +++ b/drivers/kernelsu/include/uapi/feature.h @@ -0,0 +1,17 @@ +#ifndef __KSU_UAPI_FEATURE_H +#define __KSU_UAPI_FEATURE_H + +enum ksu_feature_id { + KSU_FEATURE_SU_COMPAT = 0, + KSU_FEATURE_KERNEL_UMOUNT = 1, + KSU_FEATURE_SULOG = 2, + KSU_FEATURE_ADB_ROOT = 3, + +#ifdef CONFIG_KSU_EXTRAS // custom extensions + KSU_FEATURE_AVC_SPOOF = 10003, +#endif + + KSU_FEATURE_MAX +}; + +#endif diff --git a/drivers/kernelsu/include/uapi/selinux.h b/drivers/kernelsu/include/uapi/selinux.h new file mode 100644 index 000000000000..960454f7f46a --- /dev/null +++ b/drivers/kernelsu/include/uapi/selinux.h @@ -0,0 +1,29 @@ +#ifndef __KSU_UAPI_SELINUX_H +#define __KSU_UAPI_SELINUX_H + +#define KSU_SEPOLICY_CMD_NORMAL_PERM 1 +#define KSU_SEPOLICY_CMD_XPERM 2 +#define KSU_SEPOLICY_CMD_TYPE_STATE 3 +#define KSU_SEPOLICY_CMD_TYPE 4 +#define KSU_SEPOLICY_CMD_TYPE_ATTR 5 +#define KSU_SEPOLICY_CMD_ATTR 6 +#define KSU_SEPOLICY_CMD_TYPE_TRANSITION 7 +#define KSU_SEPOLICY_CMD_TYPE_CHANGE 8 +#define KSU_SEPOLICY_CMD_GENFSCON 9 + +#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_ALLOW 1 +#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DENY 2 +#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_AUDITALLOW 3 +#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DONTAUDIT 4 + +#define KSU_SEPOLICY_SUBCMD_XPERM_ALLOW 1 +#define KSU_SEPOLICY_SUBCMD_XPERM_AUDITALLOW 2 +#define KSU_SEPOLICY_SUBCMD_XPERM_DONTAUDIT 3 + +#define KSU_SEPOLICY_SUBCMD_TYPE_STATE_PERMISSIVE 
1 +#define KSU_SEPOLICY_SUBCMD_TYPE_STATE_ENFORCE 2 + +#define KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_CHANGE 1 +#define KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_MEMBER 2 + +#endif diff --git a/drivers/kernelsu/include/uapi/sulog.h b/drivers/kernelsu/include/uapi/sulog.h new file mode 100644 index 000000000000..9453a4bd0c16 --- /dev/null +++ b/drivers/kernelsu/include/uapi/sulog.h @@ -0,0 +1,32 @@ +#ifndef __KSU_UAPI_SULOG_H +#define __KSU_UAPI_SULOG_H + +#include +#include + +#define KSU_SULOG_EVENT_VERSION 1 +#ifndef TASK_COMM_LEN +#define TASK_COMM_LEN 16 +#endif + +enum ksu_sulog_event_type { + KSU_SULOG_EVENT_ROOT_EXECVE = 1, + KSU_SULOG_EVENT_SUCOMPAT = 2, + KSU_SULOG_EVENT_IOCTL_GRANT_ROOT = 3, +}; + +struct ksu_sulog_event { + __u16 version; + __u16 event_type; + __s32 retval; + __u32 pid; + __u32 tgid; + __u32 ppid; + __u32 uid; + __u32 euid; + char comm[TASK_COMM_LEN]; + __u32 filename_len; + __u32 argv_len; +} __packed; + +#endif diff --git a/drivers/kernelsu/include/uapi/supercall.h b/drivers/kernelsu/include/uapi/supercall.h new file mode 100644 index 000000000000..dbfc5f1158bd --- /dev/null +++ b/drivers/kernelsu/include/uapi/supercall.h @@ -0,0 +1,162 @@ +#ifndef __KSU_UAPI_SUPERCALL_H +#define __KSU_UAPI_SUPERCALL_H + +/* Magic numbers for reboot hook to install fd */ +#define KSU_INSTALL_MAGIC1 0xDEADBEEF +#define KSU_INSTALL_MAGIC2 0xCAFEBABE + +struct ksu_become_daemon_cmd { + __u8 token[65]; /* Input: daemon token (null-terminated) */ +}; + +#define EVENT_POST_FS_DATA 1 +#define EVENT_BOOT_COMPLETED 2 +#define EVENT_MODULE_MOUNTED 3 + +#define KSU_GET_INFO_FLAG_LKM (1U << 0) +#define KSU_GET_INFO_FLAG_MANAGER (1U << 1) +#define KSU_GET_INFO_FLAG_LATE_LOAD (1U << 2) +#define KSU_GET_INFO_FLAG_PR_BUILD (1U << 3) + +struct ksu_get_info_cmd { + __u32 version; /* Output: KERNEL_SU_VERSION */ + __u32 flags; /* Output: KSU_GET_INFO_FLAG_* bits */ + __u32 features; /* Output: max feature ID supported */ +}; + +struct ksu_report_event_cmd { + __u32 event; /* Input: 
EVENT_POST_FS_DATA, EVENT_BOOT_COMPLETED, etc. */ +}; + +struct ksu_set_sepolicy_cmd { + __u64 data_len; /* Input: bytes of serialized command payload */ + __aligned_u64 data; /* Input: pointer to serialized payload */ +}; + +struct ksu_sepolicy_cmd_hdr { + __u32 cmd; /* Input: command type, CMD_* */ + __u32 subcmd; /* Input: command subtype */ +}; +/* + * After each ksu_sepolicy_cmd_hdr, command arguments are encoded sequentially as: + * [u32 len][len bytes][\0], where len excludes the trailing '\0'. + * len == 0 represents ALL. + * Argument count is derived from cmd: + * KSU_SEPOLICY_CMD_NORMAL_PERM=4, KSU_SEPOLICY_CMD_XPERM=5, + * KSU_SEPOLICY_CMD_TYPE_STATE=1, KSU_SEPOLICY_CMD_TYPE=2, + * KSU_SEPOLICY_CMD_TYPE_ATTR=2, KSU_SEPOLICY_CMD_ATTR=1, + * KSU_SEPOLICY_CMD_TYPE_TRANSITION=5, KSU_SEPOLICY_CMD_TYPE_CHANGE=4, + * KSU_SEPOLICY_CMD_GENFSCON=3. + */ + +struct ksu_check_safemode_cmd { + __u8 in_safe_mode; /* Output: true if in safe mode, false otherwise */ +}; + +/* deprecated */ +struct ksu_get_allow_list_cmd { + __u32 uids[128]; /* Output: array of allowed/denied UIDs */ + __u32 count; /* Output: number of UIDs in array */ + __u8 allow; /* Input: true for allow list, false for deny list */ +}; + +struct ksu_new_get_allow_list_cmd { + __u16 count; /* Input / Output: number of UIDs in array */ + __u16 total_count; /* Output: total number of UIDs in requested list */ + __u32 uids[0]; /* Output: array of allowed/denied UIDs */ +}; + +struct ksu_uid_granted_root_cmd { + __u32 uid; /* Input: target UID to check */ + __u8 granted; /* Output: true if granted, false otherwise */ +}; + +struct ksu_uid_should_umount_cmd { + __u32 uid; /* Input: target UID to check */ + __u8 should_umount; /* Output: true if should umount, false otherwise */ +}; + +struct ksu_get_manager_appid_cmd { + __u32 appid; /* Output: manager app id */ +}; + +struct ksu_get_app_profile_cmd { + struct app_profile profile; /* Input/Output: app profile structure */ +}; + +struct 
ksu_set_app_profile_cmd { + struct app_profile profile; /* Input: app profile structure */ +}; + +struct ksu_get_feature_cmd { + __u32 feature_id; /* Input: feature ID (enum ksu_feature_id) */ + __u64 value; /* Output: feature value/state */ + __u8 supported; /* Output: true if feature is supported, false otherwise */ +}; + +struct ksu_set_feature_cmd { + __u32 feature_id; /* Input: feature ID (enum ksu_feature_id) */ + __u64 value; /* Input: feature value/state to set */ +}; + +struct ksu_get_wrapper_fd_cmd { + __u32 fd; /* Input: userspace fd */ + __u32 flags; /* Input: flags of userspace fd */ +}; + +struct ksu_manage_mark_cmd { + __u32 operation; /* Input: KSU_MARK_* */ + __s32 pid; /* Input: target pid (0 for all processes) */ + __u32 result; /* Output: for get operation - mark status or reg_count */ +}; + +#define KSU_MARK_GET 1 +#define KSU_MARK_MARK 2 +#define KSU_MARK_UNMARK 3 +#define KSU_MARK_REFRESH 4 + +struct ksu_nuke_ext4_sysfs_cmd { + __aligned_u64 arg; /* Input: mnt pointer */ +}; + +struct ksu_add_try_umount_cmd { + __aligned_u64 arg; /* char ptr, this is the mountpoint */ + __u32 flags; /* this is the flag we use for it */ + __u8 mode; /* denotes what to do with it 0:wipe_list 1:add_to_list 2:delete_entry */ +}; + +struct ksu_get_sulog_fd_cmd { + __u32 flags; /* Input: reserved for future use, must be 0 */ +}; + +#define KSU_UMOUNT_WIPE 0 // ignore everything and wipe list +#define KSU_UMOUNT_ADD 1 // add entry (path + flags) +#define KSU_UMOUNT_DEL 2 // delete entry, strcmp + +// IOCTL command definitions +#define KSU_IOCTL_GRANT_ROOT _IOC(_IOC_NONE, 'K', 1, 0) +#define KSU_IOCTL_GET_INFO _IOC(_IOC_READ, 'K', 2, 0) +#define KSU_IOCTL_REPORT_EVENT _IOC(_IOC_WRITE, 'K', 3, 0) +#define KSU_IOCTL_SET_SEPOLICY _IOC(_IOC_READ|_IOC_WRITE, 'K', 4, 0) +#define KSU_IOCTL_CHECK_SAFEMODE _IOC(_IOC_READ, 'K', 5, 0) +// deprecated +#define KSU_IOCTL_GET_ALLOW_LIST _IOC(_IOC_READ|_IOC_WRITE, 'K', 6, 0) +// deprecated +#define KSU_IOCTL_GET_DENY_LIST 
_IOC(_IOC_READ|_IOC_WRITE, 'K', 7, 0) +#define KSU_IOCTL_NEW_GET_ALLOW_LIST _IOWR('K', 6, struct ksu_new_get_allow_list_cmd) +#define KSU_IOCTL_NEW_GET_DENY_LIST _IOWR('K', 7, struct ksu_new_get_allow_list_cmd) +#define KSU_IOCTL_UID_GRANTED_ROOT _IOC(_IOC_READ|_IOC_WRITE, 'K', 8, 0) +#define KSU_IOCTL_UID_SHOULD_UMOUNT _IOC(_IOC_READ|_IOC_WRITE, 'K', 9, 0) +#define KSU_IOCTL_GET_MANAGER_APPID _IOC(_IOC_READ, 'K', 10, 0) +#define KSU_IOCTL_GET_APP_PROFILE _IOC(_IOC_READ|_IOC_WRITE, 'K', 11, 0) +#define KSU_IOCTL_SET_APP_PROFILE _IOC(_IOC_WRITE, 'K', 12, 0) +#define KSU_IOCTL_GET_FEATURE _IOC(_IOC_READ|_IOC_WRITE, 'K', 13, 0) +#define KSU_IOCTL_SET_FEATURE _IOC(_IOC_WRITE, 'K', 14, 0) +#define KSU_IOCTL_GET_WRAPPER_FD _IOC(_IOC_WRITE, 'K', 15, 0) +#define KSU_IOCTL_MANAGE_MARK _IOC(_IOC_READ|_IOC_WRITE, 'K', 16, 0) +#define KSU_IOCTL_NUKE_EXT4_SYSFS _IOC(_IOC_WRITE, 'K', 17, 0) +#define KSU_IOCTL_ADD_TRY_UMOUNT _IOC(_IOC_WRITE, 'K', 18, 0) +#define KSU_IOCTL_SET_INIT_PGRP _IO('K', 19) +#define KSU_IOCTL_GET_SULOG_FD _IOW('K', 20, struct ksu_get_sulog_fd_cmd) + +#endif diff --git a/drivers/kernelsu/infra/event_queue.c b/drivers/kernelsu/infra/event_queue.c new file mode 100644 index 000000000000..333a10c0c523 --- /dev/null +++ b/drivers/kernelsu/infra/event_queue.c @@ -0,0 +1,393 @@ +struct ksu_event_queue_node { + struct list_head list; + struct ksu_event_record_hdr hdr; + __u8 payload[]; +}; + +static size_t ksu_event_queue_record_size(__u32 payload_len) +{ + return sizeof(struct ksu_event_record_hdr) + payload_len; +} + +static void ksu_event_queue_note_drop_locked(struct ksu_event_queue *queue, __u64 seq) +{ + queue->dropped_total++; + if (!queue->dropped_pending) { + queue->dropped_first_seq = seq; + } + queue->dropped_pending++; + queue->dropped_last_seq = seq; +} + +static bool ksu_event_queue_has_data_locked(const struct ksu_event_queue *queue) +{ + return queue->dropped_pending || queue->dropped_inflight || !list_empty(&queue->pending); +} + +static void 
ksu_event_queue_mark_closed(struct ksu_event_queue *queue) +{ + unsigned long irq_flags; + + spin_lock_irqsave(&queue->lock, irq_flags); + queue->closed = true; + spin_unlock_irqrestore(&queue->lock, irq_flags); +} + +void ksu_event_queue_init(struct ksu_event_queue *queue, __u32 max_queued, __u32 max_payload_len) +{ + spin_lock_init(&queue->lock); + mutex_init(&queue->read_lock); + INIT_LIST_HEAD(&queue->pending); + init_waitqueue_head(&queue->read_wait); + queue->queued = 0; + queue->max_queued = max_queued; + queue->max_payload_len = max_payload_len; + queue->next_seq = 1; + queue->dropped_total = 0; + queue->dropped_pending = 0; + queue->dropped_first_seq = 0; + queue->dropped_last_seq = 0; + queue->dropped_inflight = 0; + queue->dropped_inflight_first_seq = 0; + queue->dropped_inflight_last_seq = 0; + queue->closed = false; +} + +void ksu_event_queue_destroy(struct ksu_event_queue *queue) +{ + struct ksu_event_queue_node *node, *tmp; + unsigned long irq_flags; + + ksu_event_queue_mark_closed(queue); + wake_up_interruptible_poll(&queue->read_wait, EPOLLHUP | POLLHUP); + + mutex_lock(&queue->read_lock); + spin_lock_irqsave(&queue->lock, irq_flags); + list_for_each_entry_safe (node, tmp, &queue->pending, list) { + list_del(&node->list); + kfree(node); + } + queue->queued = 0; + queue->dropped_pending = 0; + queue->dropped_first_seq = 0; + queue->dropped_last_seq = 0; + queue->dropped_inflight = 0; + queue->dropped_inflight_first_seq = 0; + queue->dropped_inflight_last_seq = 0; + spin_unlock_irqrestore(&queue->lock, irq_flags); + mutex_unlock(&queue->read_lock); + + wake_up_interruptible_poll(&queue->read_wait, EPOLLHUP | POLLHUP); +} + +int ksu_event_queue_push(struct ksu_event_queue *queue, __u16 type, __u16 flags, const void *payload, __u32 len, gfp_t gfp) +{ + struct ksu_event_queue_node *node = NULL; + unsigned long irq_flags; + __u64 seq; + bool wake = false; + int ret = 0; + + if (len > queue->max_payload_len) { + return -EMSGSIZE; + } + + if (len && 
!payload) { + return -EINVAL; + } + + node = kmalloc(struct_size(node, payload, len), gfp); + + if (node) { + INIT_LIST_HEAD(&node->list); + node->hdr.type = type; + node->hdr.flags = flags; + node->hdr.len = len; + node->hdr.ts_ns = 0; + node->hdr.seq = 0; + + if (len) { + memcpy(node->payload, payload, len); + } + } + + spin_lock_irqsave(&queue->lock, irq_flags); + if (queue->closed) { + ret = -EPIPE; + goto out_unlock; + } + + seq = queue->next_seq++; + if (!node || (queue->max_queued && queue->queued >= queue->max_queued)) { + ksu_event_queue_note_drop_locked(queue, seq); + wake = true; + ret = node ? -ENOSPC : -ENOMEM; + goto out_unlock; + } + + node->hdr.seq = seq; + node->hdr.ts_ns = ktime_get_ns(); + list_add_tail(&node->list, &queue->pending); + queue->queued++; + wake = true; + +out_unlock: + spin_unlock_irqrestore(&queue->lock, irq_flags); + + if (ret && node) { + kfree(node); + } + + if (wake) { + wake_up_interruptible_poll(&queue->read_wait, EPOLLIN | EPOLLRDNORM); + } + + return ret; +} + +void ksu_event_queue_drop(struct ksu_event_queue *queue) +{ + unsigned long irq_flags; + __u64 seq; + + spin_lock_irqsave(&queue->lock, irq_flags); + if (queue->closed) { + spin_unlock_irqrestore(&queue->lock, irq_flags); + return; + } + + seq = queue->next_seq++; + ksu_event_queue_note_drop_locked(queue, seq); + spin_unlock_irqrestore(&queue->lock, irq_flags); + + wake_up_interruptible_poll(&queue->read_wait, EPOLLIN | EPOLLRDNORM); +} + +static int ksu_event_queue_wait_ready(struct ksu_event_queue *queue, int file_flags) +{ + int ret; + + for (;;) { + if (ksu_event_queue_has_data(queue)) { + return 0; + } + + if (READ_ONCE(queue->closed)) { + return 0; + } + + if (file_flags & O_NONBLOCK) { + return -EAGAIN; + } + + ret = wait_event_interruptible(queue->read_wait, queue->closed || ksu_event_queue_has_data(queue)); + if (ret) { + return ret; + } + } +} + +static ssize_t ksu_event_queue_read_drop(struct ksu_event_queue *queue, char __user *buf, size_t count) +{ + 
struct ksu_event_record_hdr hdr; + struct ksu_event_queue_dropped_info info; + size_t record_size = ksu_event_queue_record_size(sizeof(info)); + unsigned long irq_flags; + + spin_lock_irqsave(&queue->lock, irq_flags); + if (!queue->dropped_pending) { + spin_unlock_irqrestore(&queue->lock, irq_flags); + return 0; + } + if (count < record_size) { + spin_unlock_irqrestore(&queue->lock, irq_flags); + return -EMSGSIZE; + } + + hdr.type = KSU_EVENT_QUEUE_TYPE_DROPPED; + hdr.flags = KSU_EVENT_RECORD_FLAG_INTERNAL; + hdr.len = sizeof(info); + hdr.seq = queue->dropped_first_seq; + hdr.ts_ns = ktime_get_ns(); + + info.dropped = queue->dropped_pending; + info.first_seq = queue->dropped_first_seq; + info.last_seq = queue->dropped_last_seq; + + queue->dropped_inflight = queue->dropped_pending; + queue->dropped_inflight_first_seq = queue->dropped_first_seq; + queue->dropped_inflight_last_seq = queue->dropped_last_seq; + queue->dropped_pending = 0; + queue->dropped_first_seq = 0; + queue->dropped_last_seq = 0; + spin_unlock_irqrestore(&queue->lock, irq_flags); + + if (copy_to_user(buf, &hdr, sizeof(hdr))) { + goto out_restore; + } + + if (copy_to_user(buf + sizeof(hdr), &info, sizeof(info))) { + goto out_restore; + } + + spin_lock_irqsave(&queue->lock, irq_flags); + queue->dropped_inflight = 0; + queue->dropped_inflight_first_seq = 0; + queue->dropped_inflight_last_seq = 0; + spin_unlock_irqrestore(&queue->lock, irq_flags); + + return record_size; + +out_restore: + spin_lock_irqsave(&queue->lock, irq_flags); + if (!queue->dropped_pending) { + queue->dropped_pending = queue->dropped_inflight; + queue->dropped_first_seq = queue->dropped_inflight_first_seq; + queue->dropped_last_seq = queue->dropped_inflight_last_seq; + } else { + queue->dropped_pending += queue->dropped_inflight; + queue->dropped_first_seq = queue->dropped_inflight_first_seq; + } + queue->dropped_inflight = 0; + queue->dropped_inflight_first_seq = 0; + queue->dropped_inflight_last_seq = 0; + 
spin_unlock_irqrestore(&queue->lock, irq_flags); + + return -EFAULT; +} + +static ssize_t ksu_event_queue_read_node(struct ksu_event_queue *queue, char __user *buf, size_t count) +{ + struct ksu_event_queue_node *node; + struct list_head *first; + size_t record_size; + unsigned long irq_flags; + + spin_lock_irqsave(&queue->lock, irq_flags); + if (list_empty(&queue->pending)) { + spin_unlock_irqrestore(&queue->lock, irq_flags); + return 0; + } + + first = queue->pending.next; + node = list_entry(first, struct ksu_event_queue_node, list); + record_size = ksu_event_queue_record_size(node->hdr.len); + if (count < record_size) { + spin_unlock_irqrestore(&queue->lock, irq_flags); + return -EMSGSIZE; + } + spin_unlock_irqrestore(&queue->lock, irq_flags); + + if (copy_to_user(buf, &node->hdr, sizeof(node->hdr))) { + return -EFAULT; + } + + if (node->hdr.len && copy_to_user(buf + sizeof(node->hdr), node->payload, node->hdr.len)) { + return -EFAULT; + } + + spin_lock_irqsave(&queue->lock, irq_flags); + list_del(first); + queue->queued--; + spin_unlock_irqrestore(&queue->lock, irq_flags); + + kfree(node); + return record_size; +} + +ssize_t ksu_event_queue_read(struct ksu_event_queue *queue, char __user *buf, size_t count, int file_flags) +{ + ssize_t ret; + ssize_t copied = 0; + + if (!count) { + return 0; + } + + ret = mutex_lock_interruptible(&queue->read_lock); + if (ret) { + return ret; + } + + ret = ksu_event_queue_wait_ready(queue, file_flags); + if (ret) { + copied = ret; + goto out_unlock; + } + + while (count > 0) { + ret = ksu_event_queue_read_drop(queue, buf, count); + if (ret < 0) { + if (!copied) { + copied = ret; + } + break; + } + if (ret > 0) { + copied += ret; + buf += ret; + count -= ret; + continue; + } + + ret = ksu_event_queue_read_node(queue, buf, count); + if (ret < 0) { + if (!copied) { + copied = ret; + } + break; + } + if (ret == 0) { + break; + } + + copied += ret; + buf += ret; + count -= ret; + } + + if (!copied && READ_ONCE(queue->closed)) { + 
copied = 0; + } + +out_unlock: + mutex_unlock(&queue->read_lock); + return copied; +} + +unsigned __bitwise ksu_event_queue_poll(struct ksu_event_queue *queue, struct file *file, poll_table *wait) +{ + unsigned __bitwise mask = 0; + unsigned long irq_flags; + + poll_wait(file, &queue->read_wait, wait); + + spin_lock_irqsave(&queue->lock, irq_flags); + if (ksu_event_queue_has_data_locked(queue)) { + mask |= POLLIN | POLLRDNORM; + } + if (queue->closed) { + mask |= POLLHUP; + } + spin_unlock_irqrestore(&queue->lock, irq_flags); + + return mask; +} + +void ksu_event_queue_close(struct ksu_event_queue *queue) +{ + ksu_event_queue_mark_closed(queue); + wake_up_interruptible_poll(&queue->read_wait, EPOLLHUP | POLLHUP); +} + +bool ksu_event_queue_has_data(struct ksu_event_queue *queue) +{ + bool has_data; + unsigned long irq_flags; + + spin_lock_irqsave(&queue->lock, irq_flags); + has_data = ksu_event_queue_has_data_locked(queue); + spin_unlock_irqrestore(&queue->lock, irq_flags); + + return has_data; +} diff --git a/drivers/kernelsu/infra/event_queue.h b/drivers/kernelsu/infra/event_queue.h new file mode 100644 index 000000000000..2170f64fd8c8 --- /dev/null +++ b/drivers/kernelsu/infra/event_queue.h @@ -0,0 +1,54 @@ +#ifndef KSU_EVENT_QUEUE_H +#define KSU_EVENT_QUEUE_H + +#define KSU_EVENT_RECORD_FLAG_INTERNAL (1U << 0) +#define KSU_EVENT_QUEUE_TYPE_DROPPED ((__u16)0xFFFF) + +struct ksu_event_record_hdr { + __u16 type; + __u16 flags; + __u32 len; + __u64 seq; + __u64 ts_ns; +}; + +struct ksu_event_queue_dropped_info { + __u64 dropped; + __u64 first_seq; + __u64 last_seq; +}; + +struct ksu_event_queue { + spinlock_t lock; + /* The first implementation supports a single reader. 
*/ + struct mutex read_lock; + struct list_head pending; + wait_queue_head_t read_wait; + __u32 queued; + __u32 max_queued; + __u32 max_payload_len; + __u64 next_seq; + __u64 dropped_total; + __u64 dropped_pending; + __u64 dropped_first_seq; + __u64 dropped_last_seq; + __u64 dropped_inflight; + __u64 dropped_inflight_first_seq; + __u64 dropped_inflight_last_seq; + bool closed; +}; + +void ksu_event_queue_init(struct ksu_event_queue *queue, __u32 max_queued, __u32 max_payload_len); +void ksu_event_queue_destroy(struct ksu_event_queue *queue); + +int ksu_event_queue_push(struct ksu_event_queue *queue, __u16 type, __u16 flags, const void *payload, __u32 len, + gfp_t gfp); +void ksu_event_queue_drop(struct ksu_event_queue *queue); + +ssize_t ksu_event_queue_read(struct ksu_event_queue *queue, char __user *buf, size_t count, int file_flags); +unsigned __bitwise ksu_event_queue_poll(struct ksu_event_queue *queue, struct file *file, poll_table *wait); + +void ksu_event_queue_close(struct ksu_event_queue *queue); +bool ksu_event_queue_has_data(struct ksu_event_queue *queue); + +#endif // KSU_EVENT_QUEUE_H diff --git a/drivers/kernelsu/file_wrapper.c b/drivers/kernelsu/infra/file_wrapper.c similarity index 95% rename from drivers/kernelsu/file_wrapper.c rename to drivers/kernelsu/infra/file_wrapper.c index e3de8d9d4eac..fa91276f76fe 100644 --- a/drivers/kernelsu/file_wrapper.c +++ b/drivers/kernelsu/infra/file_wrapper.c @@ -1,19 +1,3 @@ -#include -#include -#include -#include -#include // kernel 3.18 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - struct ksu_file_wrapper { struct file *orig; struct file_operations ops; @@ -21,27 +5,10 @@ struct ksu_file_wrapper { static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0) -#ifndef replace_fops -#define replace_fops(f, fops) \ - do { \ - struct file *__file = (f); \ - fops_put(__file->f_op); \ - 
BUG_ON(!(__file->f_op = (fops))); \ - } while(0) -#endif -#endif - static int ksu_wrapper_open(struct inode *ino, struct file *fp) { struct path *orig_path = fp->f_path.dentry->d_fsdata; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) struct file *orig_file = dentry_open(orig_path, fp->f_flags, current_cred()); -#else - struct file *orig_file = dentry_open((*orig_path).dentry, (*orig_path).mnt, fp->f_flags, current_cred()); -#endif - if (IS_ERR(orig_file)) { return PTR_ERR(orig_file); } @@ -467,7 +434,6 @@ static const struct dentry_operations ksu_file_wrapper_d_ops = { #define ksu_anon_inode_create_getfile_compat anon_inode_create_getfile #elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) #define ksu_anon_inode_create_getfile_compat anon_inode_getfile_secure - #elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) // There is no anon_inode_create_getfile before 5.16, but it's not difficult to implement it. // https://cs.android.com/android/kernel/superproject/+/common-android12-5.10:common/fs/anon_inodes.c;l=58-125;drc=0d34ce8aa78e38affbb501690bcabec4df88620e @@ -525,12 +491,7 @@ static struct file *ksu_anon_inode_create_getfile_compat( return file; } #else -struct file * -ksu_anon_inode_create_getfile_compat(const char *name, const struct file_operations *fops, - void *priv, int flags, const struct inode *context_inode) -{ - return anon_inode_getfile(name, fops, priv, flags); -} +#define ksu_anon_inode_create_getfile_compat(a, b, c, d, e) anon_inode_getfile(a, b, c, d) #endif int ksu_install_file_wrapper(int fd) @@ -608,7 +569,7 @@ int ksu_install_file_wrapper(int fd) return ret; } -void ksu_file_wrapper_init(void) +void __init ksu_file_wrapper_init(void) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) static const struct file_operations tmp = { .owner = THIS_MODULE }; diff --git a/drivers/kernelsu/file_wrapper.h b/drivers/kernelsu/infra/file_wrapper.h similarity index 76% rename from 
drivers/kernelsu/file_wrapper.h rename to drivers/kernelsu/infra/file_wrapper.h index faae4dded301..ee672312b7aa 100644 --- a/drivers/kernelsu/file_wrapper.h +++ b/drivers/kernelsu/infra/file_wrapper.h @@ -1,9 +1,6 @@ #ifndef KSU_FILE_WRAPPER_H #define KSU_FILE_WRAPPER_H -#include -#include - int ksu_install_file_wrapper(int fd); void ksu_file_wrapper_init(void); diff --git a/drivers/kernelsu/su_mount_ns.c b/drivers/kernelsu/infra/su_mount_ns.c similarity index 81% rename from drivers/kernelsu/su_mount_ns.c rename to drivers/kernelsu/infra/su_mount_ns.c index 185599b3890e..7f5651d5de73 100644 --- a/drivers/kernelsu/su_mount_ns.c +++ b/drivers/kernelsu/infra/su_mount_ns.c @@ -1,36 +1,3 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0) -#include -#else -#include -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include -#else -#include -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) -#include -#else -#include -#endif -#endif - extern int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page); @@ -63,15 +30,9 @@ static long ksu_sys_setns(int fd, int flags) #endif } #else -static long ksu_sys_setns(int fd, int flags) -{ - return sys_setns(fd, flags); -} -__weak int ksys_unshare(unsigned long unshare_flags) -{ - return sys_unshare(unshare_flags); -} -#endif +#define ksu_sys_setns sys_setns +#define ksys_unshare sys_unshare +#endif // > 4.17 // global mode , need CAP_SYS_ADMIN and CAP_SYS_CHROOT to perform setns static void ksu_mnt_ns_global(void) @@ -127,11 +88,9 @@ static void ksu_mnt_ns_global(void) } #else try_setns: - barrier(); // to shutup declaration after label - + ; // on UL kernels we can try to just feed it with struct path of /proc/1/ns/mnt // we do NOT have ns_get_path. if it works, GOOD. 
if it doesn't I don't care. - struct path ns_path; const struct cred *saved = override_creds(ksu_cred); @@ -146,11 +105,7 @@ static void ksu_mnt_ns_global(void) revert_creds(saved); #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) struct file *ns_file = dentry_open(&ns_path, O_RDONLY, ksu_cred); -#else - struct file *ns_file = dentry_open(ns_path.dentry, ns_path.mnt, O_RDONLY, ksu_cred); -#endif path_put(&ns_path); if (IS_ERR(ns_file)) { diff --git a/drivers/kernelsu/su_mount_ns.h b/drivers/kernelsu/infra/su_mount_ns.h similarity index 100% rename from drivers/kernelsu/su_mount_ns.h rename to drivers/kernelsu/infra/su_mount_ns.h diff --git a/drivers/kernelsu/kernel_compat.c b/drivers/kernelsu/kernel_compat.c index 1de8941da722..52214d1b2feb 100644 --- a/drivers/kernelsu/kernel_compat.c +++ b/drivers/kernelsu/kernel_compat.c @@ -1,114 +1,3 @@ -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include // signal_struct -#include -#else -#include -#endif -#include -#include -#include - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) -#include -#include -#include -struct key *init_session_keyring = NULL; - -static inline int install_session_keyring(struct key *keyring) -{ - struct cred *new; - int ret; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - ret = install_session_keyring_to_cred(new, keyring); - if (ret < 0) { - abort_creds(new); - return ret; - } - - return commit_creds(new); -} - -// this is on tgcred on < 3.8 -// while we can grab that one, it seems to not actually be needed -static void ksu_grab_init_session_keyring(const char *filename) -{ - if (init_session_keyring) - return; - - if (!strstr(filename, "init")) - return; - - if (!!strcmp(current->comm, "init")) - return; - - if (!!!is_init(get_current_cred())) - return; - - // thats surely some exclamation comedy - // and now we are sure that this is the key we want - // up to 5.1, struct key __rcu 
*session_keyring; /* keyring inherited over fork */ - // so we need to grab this using rcu_dereference - struct key *keyring = rcu_dereference(current->cred->session_keyring); - if (!keyring) - return; - - init_session_keyring = key_get(keyring); - - pr_info("%s: init_session_keyring: 0x%p \n", __func__, init_session_keyring); - -} -struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode) -{ - // normally we only put this on ((current->flags & PF_WQ_WORKER) || (current->flags & PF_KTHREAD)) - // but in the grand scale of things, this does NOT matter. - // pr_info("installing init session keyring for older kernel\n"); - if (init_session_keyring != NULL && !current_cred()->session_keyring) { - install_session_keyring(init_session_keyring); - } - return filp_open(filename, flags, mode); -} -#else -struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode) -{ - return filp_open(filename, flags, mode); -} -#endif - -ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, loff_t *pos) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - return kernel_read(p, buf, count, pos); -#else // https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L418 - mm_segment_t old_fs; - old_fs = get_fs(); - set_fs(get_ds()); - ssize_t result = vfs_read(p, (void __user *)buf, count, pos); - set_fs(old_fs); - return result; -#endif -} - -ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count, loff_t *pos) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - return kernel_write(p, buf, count, pos); -#else // https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L512 - mm_segment_t old_fs; - old_fs = get_fs(); - set_fs(get_ds()); - ssize_t res = vfs_write(p, (__force const char __user *)buf, count, pos); - set_fs(old_fs); - return res; -#endif -} - #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) __weak int path_mount(const char *dev_name, struct path *path, const char 
*type_page, unsigned long flags, void *data_page) @@ -130,6 +19,46 @@ __weak int path_mount(const char *dev_name, struct path *path, } #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) +__weak int path_umount(struct path *path, int flags) +{ + char buf[256] = {0}; + int ret; + + // size - 1: buf is zero-initialised, so its last byte always terminates the string + // d_path() returns a pointer into buf, or an ERR_PTR() (never NULL) on failure + char *usermnt = d_path(path, buf, sizeof(buf) - 1); + if (IS_ERR(usermnt)) { + ret = PTR_ERR(usermnt); + goto out; + } + + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) + ret = ksys_umount((char __user *)usermnt, flags); +#else + ret = (int)sys_umount((char __user *)usermnt, flags); +#endif + + set_fs(old_fs); + + // release ref here! user_path_at increases it + // then only cleans for itself +out: + path_put(path); + return ret; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) || !defined(CONFIG_EXT4_FS) +__weak void ext4_unregister_sysfs(struct super_block *sb) +{ + pr_info("%s: feature not implemented!\n", __func__); +} +#endif + #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0) __weak long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { @@ -141,7 +70,7 @@ __weak long copy_from_user_nofault(void *dst, const void __user *src, size_t siz // normally theres an access_ok check here // but for what we use it, it will always be true.
- + // so we skip it pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); diff --git a/drivers/kernelsu/kernel_compat.h b/drivers/kernelsu/kernel_compat.h index 9119648121ff..a522116f1dd4 100644 --- a/drivers/kernelsu/kernel_compat.h +++ b/drivers/kernelsu/kernel_compat.h @@ -1,55 +1,174 @@ #ifndef __KSU_H_KERNEL_COMPAT #define __KSU_H_KERNEL_COMPAT -#include -#include -#include -#include -#include -#include -#include - -extern struct file *ksu_filp_open_compat(const char *filename, int flags, - umode_t mode); -extern ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, - loff_t *pos); -extern ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, - size_t count, loff_t *pos); +#define ksu_get_uid_t(x) (*(unsigned int *)&(x)) /* view x's storage as unsigned int; parenthesised so it binds as one operand */ + +#if defined(CONFIG_KEYS) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) + +extern int install_session_keyring_to_cred(struct cred *cred, struct key *keyring); +static struct key *init_session_keyring = NULL; + +bool is_init(const struct cred* cred); + +static inline int install_session_keyring(struct key *keyring) +{ + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = install_session_keyring_to_cred(new, keyring); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); +} + +// this is on tgcred on < 3.8 +// while we can grab that one, it seems to not actually be needed +__attribute__((cold)) +static noinline void ksu_grab_init_session_keyring(const char *filename) +{ + if (init_session_keyring) + return; + + if (!strstr(filename, "init")) + return; + + if (!!strcmp(current->comm, "init")) + return; + + if (!!!is_init(current_cred())) + return; + + // thats surely some exclamation comedy + // and now we are sure that this is the key we want + // up to 5.1, struct key __rcu *session_keyring; /* keyring inherited over fork */ + // so we need to grab this
using rcu_dereference + struct key *keyring = rcu_dereference(current->cred->session_keyring); + if (!keyring) + return; + + init_session_keyring = key_get(keyring); + + pr_info("%s: init_session_keyring: 0x%lx \n", __func__, (uintptr_t)init_session_keyring); +} + +static noinline struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode) +{ + // it used to be that we put this on (current->flags & PF_WQ_WORKER) + // but since things actually needing this has been offloaded to kthread + // like allowlist write, we check for that instead. + if (!(current->flags & PF_KTHREAD)) + goto filp_open; + + if (!init_session_keyring) + goto filp_open; + + if (current_cred()->session_keyring) + goto filp_open; + + install_session_keyring(init_session_keyring); + +filp_open: + return filp_open(filename, flags, mode); +} +#define filp_open ksu_filp_open_compat +#else +static inline void ksu_grab_init_session_keyring(const char *filename) {} // no-op +#endif // KEYS && ( >= 3.8 && < 5.2 ) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) +// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L418 +static noinline ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + old_fs = get_fs(); + set_fs(get_ds()); + ssize_t result = vfs_read(p, (void __user *)buf, count, pos); + set_fs(old_fs); + return result; +} +// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L512 +static noinline ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + old_fs = get_fs(); + set_fs(get_ds()); + ssize_t res = vfs_write(p, (__force const char __user *)buf, count, pos); + set_fs(old_fs); + return res; +} +#define kernel_read ksu_kernel_read_compat +#define kernel_write ksu_kernel_write_compat +#endif // < 4.14 + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) +static inline void *ksu_kvmalloc(size_t size, gfp_t flags) +{ + 
void *buf = kmalloc(size, flags); + if (!buf) + buf = vmalloc(size); + + return buf; +} + +static inline void ksu_kvfree(void *buf) +{ + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); +} +#define kvmalloc ksu_kvmalloc +#define kvfree ksu_kvfree +#endif // for supercalls.c fd install tw -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) -#ifndef TWA_RESUME +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) && !defined(TWA_RESUME) #define TWA_RESUME 1 #endif -#endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0) -__weak int close_fd(unsigned fd) -{ - return sys_close(fd); -} +// this is ksys_close, however that is spotty to use +// as 5.10 backported close_fd and rekt ksys_close +// so we use what it does internally, __close_fd +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#define close_fd(fd) __close_fd(current->files, fd) +#elif LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0) +#define close_fd sys_close #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0) -__weak int close_fd(unsigned fd) +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 0) +static inline struct file *ksu_dentry_open(const struct path *path, int flags, const struct cred *cred) { - // this is ksys_close, but that shit is inline - // its problematic to cascade a weak symbol for it - return __close_fd(current->files, fd); + return dentry_open((*path).dentry, (*path).mnt, flags, cred); } +#define dentry_open ksu_dentry_open #endif -extern long copy_from_user_nofault(void *dst, const void __user *src, size_t size); +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0) +#ifndef replace_fops +#define replace_fops(f, fops) \ + do { \ + struct file *__file = (f); \ + fops_put(__file->f_op); \ + BUG_ON(!(__file->f_op = (fops))); \ + } while(0) +#endif +#endif -/* +/** * ksu_copy_from_user_retry * try nofault copy first, if it fails, try with plain * paramters are the same as copy_from_user * 0 = 
success - * + hot since this is reused on sucompat */ -__attribute__((hot)) -static long ksu_copy_from_user_retry(void *to, const void __user *from, unsigned long count) +extern long copy_from_user_nofault(void *dst, const void __user *src, size_t size); +static __always_inline long ksu_copy_from_user_retry(void *to, const void __user *from, unsigned long count) { long ret = copy_from_user_nofault(to, from, count); if (likely(!ret)) @@ -59,17 +178,31 @@ static long ksu_copy_from_user_retry(void *to, const void __user *from, unsigned return copy_from_user(to, from, count); } -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR) -struct dir_context { - const filldir_t actor; - loff_t pos; -}; - -static int iterate_dir(struct file *file, struct dir_context *ctx) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) // caller is reponsible for sanity! +static inline void ksu_zeroed_strncpy(char *dest, const char *src, size_t count) { - return vfs_readdir(file, ctx->actor, ctx); + // this is actually faster due to dead store elimination + // count - 1 as implicit null termination + __builtin_memset(dest, 0, count); + __builtin_strncpy(dest, src, count - 1); } -#endif // KSU_HAS_ITERATE_DIR +#define strscpy ksu_zeroed_strncpy +#define strscpy_pad ksu_zeroed_strncpy +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) +#define d_is_reg(dentry) S_ISREG((dentry)->d_inode->i_mode) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 5, 0) +struct user_struct *ksu_alloc_uid(kuid_t uid) { return alloc_uid(current_user_ns(), uid); } +#define alloc_uid ksu_alloc_uid +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR) +struct dir_context { const filldir_t actor; loff_t pos; }; +#define iterate_dir(file, ctx) vfs_readdir(file, (ctx)->actor, ctx) +#endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) __weak char *bin2hex(char *dst, const void *src, size_t count) @@ -81,32 +214,68 @@ __weak char 
*bin2hex(char *dst, const void *src, size_t count) } #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) && !defined(KSU_UL_HAS_FILE_INODE) -static inline struct inode *file_inode(struct file *f) -{ - return f->f_path.dentry->d_inode; -} +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +#define file_inode(f) ((f)->f_path.dentry->d_inode) #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) && !defined(KSU_HAS_SELINUX_INODE) -static inline struct inode_security_struct *selinux_inode(const struct inode *inode) -{ - return inode->i_security; -} +#define selinux_inode(inode) ((inode)->i_security) #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) && !defined(KSU_HAS_SELINUX_CRED) -static inline struct task_security_struct *selinux_cred(const struct cred *cred) -{ - return cred->security; -} +#define selinux_cred(cred) ((cred)->security) #endif #if LINUX_VERSION_CODE < KERNEL_VERSION (4, 15, 0) -__weak void groups_sort(struct group_info *group_info) -{ - return; -} +__weak void groups_sort(struct group_info *group_info) { } // no-op +#endif + +#ifndef U16_MAX +#define U16_MAX ((u16)(~0U)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION (4, 12, 0) && !defined(EPOLLIN) +#define EPOLLIN 0x00000001 +#define EPOLLPRI 0x00000002 +#define EPOLLOUT 0x00000004 +#define EPOLLERR 0x00000008 +#define EPOLLHUP 0x00000010 +#define EPOLLRDNORM 0x00000040 +#define EPOLLRDBAND 0x00000080 +#define EPOLLWRNORM 0x00000100 +#define EPOLLWRBAND 0x00000200 +#define EPOLLMSG 0x00000400 +#define EPOLLRDHUP 0x00002000 +#endif // < 4.12 && !EPOLLIN + +#ifndef READ_ONCE +#define READ_ONCE(x) (*(const volatile typeof(x) *)&(x)) #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 15, 0) +#define task_ppid_nr(a) (pid_t)sys_getppid() +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 17, 0) +static inline u64 ksu_ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } +#define ktime_get_ns ksu_ktime_get_ns +#endif + +// WARNING: no overflow safety! 
+#ifndef struct_size +#define struct_size(p, member, n) (sizeof(*(p)) + (n) * sizeof(*(p)->member)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION (4, 12, 0) +#ifndef ALIGN_DOWN +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#endif +#endif + +#ifndef untagged_addr +#define untagged_addr(addr) (addr) +#endif + +static inline void ksu_kfree_byref(void *buf) { kfree(*(void **)buf); } + #endif diff --git a/drivers/kernelsu/kernel_includes.h b/drivers/kernelsu/kernel_includes.h new file mode 100644 index 000000000000..81d33744940e --- /dev/null +++ b/drivers/kernelsu/kernel_includes.h @@ -0,0 +1,160 @@ +#ifndef __KSU_H_KERNEL_INCLUDES +#define __KSU_H_KERNEL_INCLUDES + +// common +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// versioned / conditional + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0) +#include +#else +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) +#include +#else +#include +#endif +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0) +#include +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#include +#else +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) +#include +#else +#include 
+#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) +#include +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) +#include +#include +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) +#include +#endif + +/** + * replace common mem/str functions with builtins + * so legacy kernels get better inlining and optimized routines (with newer compielrs) + * a lot of people rice their flags (mcpu/march), this'll be a good reward for them. + * minimum that people use is gcc 4.9 for 3.x kernels, so these are fineee + * https://github.com/gcc-mirror/gcc/blob/releases/gcc-4.9/gcc/builtins.def#L562 + * + */ +#if !defined(CONFIG_FORTIFY_SOURCE) + +#define memcmp __builtin_memcmp +#define memcpy __builtin_memcpy +#define memmove __builtin_memmove +#define memset __builtin_memset +#define strchr __builtin_strchr +#define strcmp __builtin_strcmp +#define strcpy __builtin_strcpy +#define strlen __builtin_strlen +#define strncmp __builtin_strncmp +#define strncpy __builtin_strncpy +#define strstr __builtin_strstr + +#endif // !CONFIG_FORTIFY_SOURCE + +#endif // __KSU_H_KERNEL_INCLUDES diff --git a/drivers/kernelsu/ksu.c b/drivers/kernelsu/ksu.c index d12add3c4415..b4a96bf9827f 100644 --- a/drivers/kernelsu/ksu.c +++ b/drivers/kernelsu/ksu.c @@ -1,39 +1,43 @@ -#include -#include -#include -#include -#include -#include /* LINUX_VERSION_CODE, KERNEL_VERSION macros */ - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) -#include -#else -#include -#endif +#include "kernel_includes.h" + +// uapi +#include "include/uapi/app_profile.h" +#include "include/uapi/feature.h" +#include "include/uapi/selinux.h" +#include "include/uapi/supercall.h" +#include "include/uapi/sulog.h" -#define ksu_get_uid_t(x) *(unsigned int *)&(x) +// 
includes +#include "include/klog.h" +#include "include/arch.h" +#include "include/ksu.h" -#include "allowlist.h" -#include "apk_sign.h" -#include "app_profile.h" -#include "arch.h" -#include "core_hook.h" -#include "feature.h" -#include "file_wrapper.h" +// kernel compat, lite ones #include "kernel_compat.h" -#include "klog.h" -#include "ksud.h" -#include "ksu.h" -#include "manager.h" -#include "sucompat.h" -#include "supercalls.h" -#include "throne_tracker.h" -#include "su_mount_ns.h" + +#include "policy/app_profile.h" +#include "policy/allowlist.h" +#include "policy/feature.h" +#include "manager/apk_sign.h" +#include "manager/manager_identity.h" +#include "manager/throne_tracker.h" +#include "supercall/internal.h" +#include "supercall/supercall.h" +#include "infra/su_mount_ns.h" +#include "infra/file_wrapper.h" +#include "infra/event_queue.h" +#include "feature/adb_root.h" +#include "feature/kernel_umount.h" +#include "feature/sucompat.h" +#include "feature/sulog.h" +#include "runtime/ksud.h" +#include "sulog/event.h" +#include "sulog/fd.h" + #include "selinux/selinux.h" #include "selinux/sepolicy.h" // selinux includes -#include #include "avc_ss.h" #include "objsec.h" #include "ss/services.h" @@ -45,18 +49,30 @@ // unity build #include "tiny_sulog.c" -#include "allowlist.c" -#include "app_profile.c" -#include "apk_sign.c" -#include "sucompat.c" -#include "throne_tracker.c" -#include "core_hook.c" -#include "supercalls.c" -#include "feature.c" -#include "su_mount_ns.c" -#include "ksud.c" -#include "kernel_compat.c" -#include "file_wrapper.c" +#include "policy/allowlist.c" +#include "policy/app_profile.c" +#include "policy/feature.c" +#include "manager/apk_sign.c" +#include "manager/throne_tracker.c" + +#include "supercall/perm.c" +#include "supercall/dispatch.c" +#include "supercall/supercall.c" + +#include "infra/su_mount_ns.c" +#include "infra/file_wrapper.c" +#include "infra/event_queue.c" + +#include "feature/adb_root.c" +#include "feature/kernel_umount.c" 
+#include "feature/sucompat.c" +#include "feature/sulog.c" +#include "runtime/ksud.c" + +#include "sulog/event.c" +#include "sulog/fd.c" + +#include "hook/core_hook.c" // lsm #include "selinux/selinux.c" #include "selinux/sepolicy.c" @@ -64,24 +80,23 @@ #ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE #ifdef CONFIG_ARM64 -#include "syscall_table_hook.c" +#include "hook/syscall_table_hook_arm64.c" #elif CONFIG_ARM -#include "syscall_table_hook_arm.c" +#include "hook/syscall_table_hook_arm.c" #endif #endif -#ifdef CONFIG_KSU_KPROBES_KSUD -#include "kp_ksud.c" -#endif - -#ifdef CONFIG_KSU_KRETPROBES_SUCOMPAT -#include "rp_sucompat.c" +#if defined(CONFIG_KSU_KPROBES_KSUD) && !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) +#include "hook/kp_ksud.c" #endif #ifdef CONFIG_KSU_EXTRAS #include "extras.c" #endif +// __weak fn's +#include "kernel_compat.c" + struct cred* ksu_cred; extern void ksu_supercalls_init(); @@ -109,6 +124,16 @@ int __init kernelsu_init(void) ksu_sucompat_init(); // so the feature is registered + ksu_kernel_umount_init(); // so the feature is registered + +#ifdef CONFIG_KSU_FEATURE_SULOG + ksu_sulog_init(); // so the feature is registered +#endif + +#ifdef CONFIG_KSU_FEATURE_ADBROOT + ksu_adb_root_init(); // so the feature is registered +#endif + ksu_core_init(); ksu_allowlist_init(); @@ -123,7 +148,7 @@ int __init kernelsu_init(void) ksu_syscall_table_hook_init(); #endif -#ifdef CONFIG_KSU_KPROBES_KSUD +#if defined(CONFIG_KSU_KPROBES_KSUD) && !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) kp_ksud_init(); #endif diff --git a/drivers/kernelsu/apk_sign.c b/drivers/kernelsu/manager/apk_sign.c similarity index 83% rename from drivers/kernelsu/apk_sign.c rename to drivers/kernelsu/manager/apk_sign.c index 697ecd81d9d9..f79b2c9a3a00 100644 --- a/drivers/kernelsu/apk_sign.c +++ b/drivers/kernelsu/manager/apk_sign.c @@ -1,20 +1,3 @@ -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_KSU_DEBUG -#include -#endif -#include -#if LINUX_VERSION_CODE >= 
KERNEL_VERSION(5, 11, 0) -#include -#else -#include -#endif -#include - struct sdesc { struct shash_desc shash; char ctx[]; @@ -70,31 +53,34 @@ static int ksu_sha256(const unsigned char *data, unsigned int datalen, static bool check_block(struct file *fp, u32 *size4, loff_t *pos, u32 *offset, unsigned expected_size, const char *expected_sha256) { - ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer-sequence length - ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer length - ksu_kernel_read_compat(fp, size4, 0x4, pos); // signed data length + kernel_read(fp, size4, 0x4, pos); // signer-sequence length + kernel_read(fp, size4, 0x4, pos); // signer length + kernel_read(fp, size4, 0x4, pos); // signed data length *offset += 0x4 * 3; - ksu_kernel_read_compat(fp, size4, 0x4, pos); // digests-sequence length + kernel_read(fp, size4, 0x4, pos); // digests-sequence length *pos += *size4; *offset += 0x4 + *size4; - ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificates length - ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificate length + kernel_read(fp, size4, 0x4, pos); // certificates length + kernel_read(fp, size4, 0x4, pos); // certificate length *offset += 0x4 * 2; if (*size4 == expected_size) { *offset += *size4; #define CERT_MAX_LENGTH 1024 - char cert[CERT_MAX_LENGTH]; + char *cert __attribute__((__cleanup__(ksu_kfree_byref))) = kzalloc(CERT_MAX_LENGTH, GFP_KERNEL); + if (!cert) + return false; + if (*size4 > CERT_MAX_LENGTH) { pr_info("cert length overlimit\n"); return false; } - ksu_kernel_read_compat(fp, cert, *size4, pos); + kernel_read(fp, cert, *size4, pos); unsigned char digest[SHA256_DIGEST_SIZE]; if (ksu_sha256(cert, *size4, digest) < 0 ) { pr_info("sha256 error\n"); @@ -136,7 +122,7 @@ static bool has_v1_signature_file(struct file *fp) loff_t pos = 0; - while (ksu_kernel_read_compat(fp, &header, + while (kernel_read(fp, &header, sizeof(struct zip_entry_header), &pos) == sizeof(struct zip_entry_header)) { if (header.signature != 
0x04034b50) { @@ -146,7 +132,7 @@ static bool has_v1_signature_file(struct file *fp) // Read the entry file name if (header.file_name_length == sizeof(MANIFEST) - 1) { char fileName[sizeof(MANIFEST)]; - ksu_kernel_read_compat(fp, fileName, + kernel_read(fp, fileName, header.file_name_length, &pos); fileName[header.file_name_length] = '\0'; @@ -183,6 +169,7 @@ static __always_inline bool check_v2_signature(char *path, bool v3_1_signing_exist = false; int i; + struct path kpath; if (kern_path(path, 0, &kpath)) return false; @@ -200,7 +187,7 @@ static __always_inline bool check_v2_signature(char *path, path_put(&kpath); - struct file *fp = ksu_filp_open_compat(path, O_RDONLY, 0); + struct file *fp = filp_open(path, O_RDONLY, 0); if (IS_ERR(fp)) { // pr_err("open %s error.\n", path); return false; @@ -213,10 +200,10 @@ static __always_inline bool check_v2_signature(char *path, for (i = 0;; ++i) { unsigned short n; pos = vfs_llseek(fp, -i - 2, SEEK_END); - ksu_kernel_read_compat(fp, &n, 2, &pos); + kernel_read(fp, &n, 2, &pos); if (n == i) { pos -= 22; - ksu_kernel_read_compat(fp, &size4, 4, &pos); + kernel_read(fp, &size4, 4, &pos); if ((size4 ^ 0xcafebabeu) == 0xccfbf1eeu) { break; } @@ -229,18 +216,17 @@ static __always_inline bool check_v2_signature(char *path, pos += 12; // offset - ksu_kernel_read_compat(fp, &size4, 0x4, &pos); + kernel_read(fp, &size4, 0x4, &pos); pos = size4 - 0x18; - ksu_kernel_read_compat(fp, &size8, 0x8, &pos); - ksu_kernel_read_compat(fp, buffer, 0x10, &pos); - // !! 
remove this casting to char just to strcmp + kernel_read(fp, &size8, 0x8, &pos); + kernel_read(fp, buffer, 0x10, &pos); if (memcmp(buffer, "APK Sig Block 42", 16)) { goto clean; } pos = size4 - (size8 + 0x8); - ksu_kernel_read_compat(fp, &size_of_block, 0x8, &pos); + kernel_read(fp, &size_of_block, 0x8, &pos); if (size_of_block != size8) { goto clean; } @@ -249,12 +235,11 @@ static __always_inline bool check_v2_signature(char *path, while (loop_count++ < 10) { uint32_t id; uint32_t offset; - ksu_kernel_read_compat(fp, &size8, 0x8, - &pos); // sequence length + kernel_read(fp, &size8, 0x8, &pos); // sequence length if (size8 == size_of_block) { break; } - ksu_kernel_read_compat(fp, &id, 0x4, &pos); // id + kernel_read(fp, &id, 0x4, &pos); // id offset = 4; if (id == 0x7109871au) { v2_signing_blocks++; @@ -369,7 +354,7 @@ bool is_manager_apk(char *path) { int tries = 0; - while (tries++ < 10) { + while (tries++ < 10 && (current->flags & PF_KTHREAD) ) { if (!is_lock_held(path)) break; diff --git a/drivers/kernelsu/apk_sign.h b/drivers/kernelsu/manager/apk_sign.h similarity index 85% rename from drivers/kernelsu/apk_sign.h rename to drivers/kernelsu/manager/apk_sign.h index d3a44bd207c2..65b3a1e51cdd 100644 --- a/drivers/kernelsu/apk_sign.h +++ b/drivers/kernelsu/manager/apk_sign.h @@ -1,8 +1,6 @@ #ifndef __KSU_H_APK_V2_SIGN #define __KSU_H_APK_V2_SIGN -#include - bool is_manager_apk(char *path); int get_pkg_from_apk_path(char *pkg, const char *path); diff --git a/drivers/kernelsu/manager.h b/drivers/kernelsu/manager/manager_identity.h similarity index 84% rename from drivers/kernelsu/manager.h rename to drivers/kernelsu/manager/manager_identity.h index e3159c988c0c..5a7c6b2b399d 100644 --- a/drivers/kernelsu/manager.h +++ b/drivers/kernelsu/manager/manager_identity.h @@ -1,9 +1,7 @@ -#ifndef __KSU_H_KSU_MANAGER -#define __KSU_H_KSU_MANAGER +#ifndef __KSU_H_MANAGER_IDENTITY +#define __KSU_H_MANAGER_IDENTITY -#include -#include -#include "allowlist.h" +// #include 
"allowlist.h" #define KSU_INVALID_APPID -1 diff --git a/drivers/kernelsu/throne_tracker.c b/drivers/kernelsu/manager/throne_tracker.c similarity index 84% rename from drivers/kernelsu/throne_tracker.c rename to drivers/kernelsu/manager/throne_tracker.c index 6a2503fd945b..7711b0e1b95a 100644 --- a/drivers/kernelsu/throne_tracker.c +++ b/drivers/kernelsu/manager/throne_tracker.c @@ -1,17 +1,5 @@ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - uid_t ksu_manager_appid = KSU_INVALID_APPID; -static struct task_struct *throne_thread = NULL; #define SYSTEM_PACKAGES_LIST_PATH "/data/system/packages.list" struct uid_data { @@ -133,7 +121,7 @@ FILLDIR_RETURN_TYPE my_actor(MY_ACTOR_CTX_ARG, const char *name, return FILLDIR_ACTOR_CONTINUE; } - strncpy(data->dirpath, dirpath, DATA_PATH_LEN - 1 ); + strscpy(data->dirpath, dirpath, DATA_PATH_LEN); data->depth = my_ctx->depth - 1; list_add_tail(&data->list, my_ctx->data_path_list); @@ -150,9 +138,9 @@ FILLDIR_RETURN_TYPE my_actor(MY_ACTOR_CTX_ARG, const char *name, // compat: https://elixir.bootlin.com/linux/v3.9/source/include/linux/fs.h#L771 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) -#define S_MAGIC_COMPAT(x) ((x)->f_inode->i_sb->s_magic) +#define ksu_get_magic(x) ((x)->f_inode->i_sb->s_magic) #else -#define S_MAGIC_COMPAT(x) ((x)->f_path.dentry->d_inode->i_sb->s_magic) +#define ksu_get_magic(x) ((x)->f_path.dentry->d_inode->i_sb->s_magic) #endif void search_manager(const char *path, int depth, struct list_head *uid_data) @@ -163,10 +151,13 @@ void search_manager(const char *path, int depth, struct list_head *uid_data) unsigned long data_app_magic = 0; // First depth - struct data_path data = { }; - strncpy(data.dirpath, path, DATA_PATH_LEN - 1 ); - data.depth = depth; - list_add_tail(&data.list, &data_path_list); + struct data_path *data __attribute__((__cleanup__(ksu_kfree_byref))) = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + + strscpy(data->dirpath, path, 
DATA_PATH_LEN); + data->depth = depth; + list_add_tail(&data->list, &data_path_list); // we put the apk path we collected here char candidate_path[DATA_PATH_LEN]; @@ -188,7 +179,7 @@ void search_manager(const char *path, int depth, struct list_head *uid_data) if (stop) goto skip_iterate; - struct file *file = ksu_filp_open_compat(pos->dirpath, O_RDONLY | O_NOFOLLOW | O_DIRECTORY, 0); + struct file *file = filp_open(pos->dirpath, O_RDONLY | O_NOFOLLOW | O_DIRECTORY, 0); if (IS_ERR(file)) { pr_err("Failed to open directory: %s, err: %ld\n", pos->dirpath, PTR_ERR(file)); goto skip_iterate; @@ -196,8 +187,8 @@ void search_manager(const char *path, int depth, struct list_head *uid_data) // grab magic on first folder, which is /data/app if (!data_app_magic) { - if (S_MAGIC_COMPAT(file)) { - data_app_magic = S_MAGIC_COMPAT(file); + if (ksu_get_magic(file)) { + data_app_magic = ksu_get_magic(file); pr_info("%s: dir: %s got magic! 0x%lx\n", __func__, pos->dirpath, data_app_magic); } else { filp_close(file, NULL); @@ -205,8 +196,8 @@ void search_manager(const char *path, int depth, struct list_head *uid_data) } } - if (S_MAGIC_COMPAT(file) != data_app_magic) { - pr_info("%s: skip: %s magic: 0x%lx expected: 0x%lx\n", __func__, pos->dirpath, S_MAGIC_COMPAT(file), data_app_magic); + if (ksu_get_magic(file) != data_app_magic) { + pr_info("%s: skip: %s magic: 0x%lx expected: 0x%lx\n", __func__, pos->dirpath, ksu_get_magic(file), data_app_magic); filp_close(file, NULL); goto skip_iterate; } @@ -232,7 +223,7 @@ void search_manager(const char *path, int depth, struct list_head *uid_data) skip_iterate: list_del(&pos->list); - if (pos != &data) + if (pos != data) kfree(pos); } } @@ -257,12 +248,18 @@ static bool is_uid_exist(uid_t uid, char *package, void *data) static void throne_tracker_fn(bool prune_only) { - struct file *fp; + struct file *fp = NULL; int tries = 0; + if (unlikely(!(current->flags & PF_KTHREAD))) { + pr_info("%s: not a kthread! 
skip retry for: %s\n", __func__, SYSTEM_PACKAGES_LIST_PATH); + fp = filp_open(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); + goto skip_retry; + } + while (tries++ < 10) { if (!is_lock_held(SYSTEM_PACKAGES_LIST_PATH)) { - fp = ksu_filp_open_compat(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); + fp = filp_open(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); if (!IS_ERR(fp)) break; } @@ -270,7 +267,8 @@ static void throne_tracker_fn(bool prune_only) pr_info("%s: waiting for %s\n", __func__, SYSTEM_PACKAGES_LIST_PATH); msleep(100); // migth as well add a delay }; - + +skip_retry: if (IS_ERR(fp)) { pr_err("%s: open " SYSTEM_PACKAGES_LIST_PATH " failed: %ld\n", __func__, PTR_ERR(fp)); return; @@ -285,13 +283,13 @@ static void throne_tracker_fn(bool prune_only) loff_t line_start = 0; char buf[KSU_MAX_PACKAGE_NAME]; for (;;) { - ssize_t count = ksu_kernel_read_compat(fp, &chr, sizeof(chr), &pos); + ssize_t count = kernel_read(fp, &chr, sizeof(chr), &pos); if (count != sizeof(chr)) break; if (chr != '\n') continue; - count = ksu_kernel_read_compat(fp, buf, sizeof(buf), &line_start); + count = kernel_read(fp, buf, sizeof(buf), &line_start); struct uid_data *data = kzalloc(sizeof(struct uid_data), GFP_ATOMIC); if (!data) { @@ -361,6 +359,8 @@ static void throne_tracker_fn(bool prune_only) } } +static DEFINE_MUTEX(throne_tracker_mutex); + static int throne_tracker_thread(void *data) { // now de-void it here @@ -368,12 +368,20 @@ static int throne_tracker_thread(void *data) pr_info("throne_tracker: pid: %d started\n", current->pid); - // this is normally not needed, but it wont hurt - escape_to_root_forced(); + mutex_lock(&throne_tracker_mutex); + + // lessen that window where user opens manager right away, yet its not crowned + // we are async/non-blocking in these kthreads + // sched_set_fifo_low + struct sched_param param = { 0 }; + param.sched_priority = 1; + sched_setscheduler_nocheck(current, 1, ¶m); + escape_to_root_forced(); throne_tracker_fn(prune_only); - throne_thread = NULL; - 
smp_mb(); + + mutex_unlock(&throne_tracker_mutex); + pr_info("throne_tracker: pid: %d exit!\n", current->pid); return 0; } @@ -383,25 +391,16 @@ void track_throne(bool prune_only) #ifndef CONFIG_KSU_THRONE_TRACKER_ALWAYS_THREADED static bool throne_tracker_first_run __read_mostly = true; if (unlikely(throne_tracker_first_run)) { + mutex_lock(&throne_tracker_mutex); throne_tracker_fn(prune_only); + mutex_unlock(&throne_tracker_mutex); throne_tracker_first_run = false; return; } #endif - smp_mb(); - if (throne_thread != NULL) // single instance lock - return; // HACK: force cast prune_only to be a void * - // this way we won't need to create a struct. - // there is only one argument anyway for track_throne() - // so yes, true or false is now a void pointer. - // reality is what I want to be. - throne_thread = kthread_run(throne_tracker_thread, (void *)prune_only, "throne_tracker"); - if (IS_ERR(throne_thread)) { - throne_thread = NULL; - return; - } + kthread_run(throne_tracker_thread, (void *)prune_only, "thronetracker"); } void ksu_throne_tracker_init() diff --git a/drivers/kernelsu/throne_tracker.h b/drivers/kernelsu/manager/throne_tracker.h similarity index 100% rename from drivers/kernelsu/throne_tracker.h rename to drivers/kernelsu/manager/throne_tracker.h diff --git a/drivers/kernelsu/allowlist.c b/drivers/kernelsu/policy/allowlist.c similarity index 87% rename from drivers/kernelsu/allowlist.c rename to drivers/kernelsu/policy/allowlist.c index 5e49d30b6942..897c40e8e6dd 100644 --- a/drivers/kernelsu/allowlist.c +++ b/drivers/kernelsu/policy/allowlist.c @@ -1,22 +1,3 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) -#include -#endif -#include - #define FILE_MAGIC 0x7f4b5355 // ' KSU', u32 #define FILE_FORMAT_VERSION 3 // u32 @@ -93,20 +74,6 @@ void ksu_show_allow_list(void) rcu_read_unlock(); } -#ifdef 
CONFIG_KSU_DEBUG -static void ksu_grant_root_to_shell() -{ struct app_profile profile = { - .version = KSU_APP_PROFILE_VER, - .allow_su = true, - .current_uid = 2000, - }; - strcpy(profile.key, "com.android.shell"); - strcpy(profile.rp_config.profile.selinux_domain, - KSU_DEFAULT_SELINUX_DOMAIN); - ksu_set_app_profile(&profile); -} -#endif - bool ksu_get_app_profile(struct app_profile *profile) { struct perm_data *p = NULL; @@ -141,6 +108,18 @@ static bool profile_valid(struct app_profile *profile) return false; } + bool need_migrate_su_domain = false; + + if (unlikely(profile->version == 2)) { + profile->version = KSU_APP_PROFILE_VER; + need_migrate_su_domain = true; + } + + if (strnlen(profile->key, sizeof(profile->key)) >= sizeof(profile->key)) { + pr_err("invalid app_profile key\n"); + return false; + } + if (profile->version < KSU_APP_PROFILE_VER) { pr_info("Unsupported profile version: %d\n", profile->version); return false; @@ -148,10 +127,22 @@ static bool profile_valid(struct app_profile *profile) if (profile->allow_su) { if (profile->rp_config.profile.groups_count > KSU_MAX_GROUPS) { + pr_err("invalid groups_count in app_profile: %s\n", profile->key); return false; } - if (strlen(profile->rp_config.profile.selinux_domain) == 0) { + char *domain = profile->rp_config.profile.selinux_domain; + static const size_t domain_len = sizeof(profile->rp_config.profile.selinux_domain); + if (unlikely(need_migrate_su_domain)) { + if (strncmp(domain, "u:r:su:s0", domain_len) == 0) { + strscpy_pad(domain, KSU_DEFAULT_SELINUX_DOMAIN, domain_len); + pr_info("migrated profile domain: %s\n", profile->key); + } + } + size_t len = strnlen(domain, domain_len); + + if (len == 0 || len >= domain_len) { + pr_err("invalid selinux_domain in app_profile: %s\n", profile->key); return false; } } @@ -275,6 +266,9 @@ bool __ksu_is_allow_uid(uid_t uid) return true; } + if (IS_ENABLED(CONFIG_KSU_DEBUG) && unlikely(uid == SHELL_UID)) + return true; + if (likely(uid <= BITMAP_UID_MAX)) { 
return !!(allow_list_bitmap[uid / BITS_PER_BYTE] & (1 << (uid % BITS_PER_BYTE))); @@ -305,6 +299,10 @@ bool ksu_uid_should_umount(uid_t uid) // we should not umount on manager! return false; } + if (unlikely(uid == WEBVIEW_ZYGOTE_UID)) { + // we should not umount for webview zygote + return false; + } bool found = ksu_get_app_profile(&profile); if (!found) { // no app profile found, it must be non root app @@ -331,6 +329,9 @@ void ksu_get_root_profile(uid_t uid, struct root_profile *profile) goto use_default; } + if (IS_ENABLED(CONFIG_KSU_DEBUG) && unlikely(uid == SHELL_UID)) + goto use_default; + rcu_read_lock(); list_for_each_entry_rcu (p, &allow_list, list) { if (uid == p->profile.current_uid && p->profile.allow_su) { @@ -376,26 +377,26 @@ bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, } -void ksu_persistent_allow_list_fn() +static void ksu_persistent_allow_list_fn() { u32 magic = FILE_MAGIC; u32 version = FILE_FORMAT_VERSION; struct perm_data *p = NULL; loff_t off = 0; - struct file *fp = ksu_filp_open_compat(KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644); + struct file *fp = filp_open(KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (IS_ERR(fp)) { pr_err("save_allow_list create file failed: %ld\n", PTR_ERR(fp)); goto out; } // store magic and version - if (ksu_kernel_write_compat(fp, &magic, sizeof(magic), &off) != sizeof(magic)) { + if (kernel_write(fp, &magic, sizeof(magic), &off) != sizeof(magic)) { pr_err("save_allow_list write magic failed.\n"); goto close_file; } - if (ksu_kernel_write_compat(fp, &version, sizeof(version), &off) != sizeof(version)) { + if (kernel_write(fp, &version, sizeof(version), &off) != sizeof(version)) { pr_err("save_allow_list write version failed.\n"); goto close_file; } @@ -404,7 +405,7 @@ void ksu_persistent_allow_list_fn() pr_info("save allow list, name: %s uid :%d, allow: %d\n", p->profile.key, p->profile.current_uid, p->profile.allow_su); - ksu_kernel_write_compat(fp, 
&p->profile, sizeof(p->profile), &off); + kernel_write(fp, &p->profile, sizeof(p->profile), &off); } close_file: @@ -451,26 +452,21 @@ void ksu_load_allow_list() u32 magic; u32 version; -#ifdef CONFIG_KSU_DEBUG - // always allow adb shell by default - ksu_grant_root_to_shell(); -#endif - // load allowlist now! - fp = ksu_filp_open_compat(KERNEL_SU_ALLOWLIST, O_RDONLY, 0); + fp = filp_open(KERNEL_SU_ALLOWLIST, O_RDONLY, 0); if (IS_ERR(fp)) { pr_err("load_allow_list open file failed: %ld\n", PTR_ERR(fp)); return; } // verify magic - if (ksu_kernel_read_compat(fp, &magic, sizeof(magic), &off) != sizeof(magic) || + if (kernel_read(fp, &magic, sizeof(magic), &off) != sizeof(magic) || magic != FILE_MAGIC) { pr_err("allowlist file invalid: %d!\n", magic); goto exit; } - if (ksu_kernel_read_compat(fp, &version, sizeof(version), &off) != sizeof(version)) { + if (kernel_read(fp, &version, sizeof(version), &off) != sizeof(version)) { pr_err("allowlist read version: %d failed\n", version); goto exit; } @@ -480,7 +476,7 @@ void ksu_load_allow_list() while (true) { struct app_profile profile; - ret = ksu_kernel_read_compat(fp, &profile, sizeof(profile), &off); + ret = kernel_read(fp, &profile, sizeof(profile), &off); if (ret <= 0) { pr_info("load_allow_list read err: %zd\n", ret); @@ -534,7 +530,7 @@ void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *), void *data } } -void ksu_allowlist_init(void) +void __init ksu_allowlist_init(void) { int i; @@ -549,7 +545,7 @@ void ksu_allowlist_init(void) init_default_profiles(); } -void ksu_allowlist_exit(void) +void __exit ksu_allowlist_exit(void) { struct perm_data *np = NULL; struct perm_data *n = NULL; diff --git a/drivers/kernelsu/allowlist.h b/drivers/kernelsu/policy/allowlist.h similarity index 96% rename from drivers/kernelsu/allowlist.h rename to drivers/kernelsu/policy/allowlist.h index 7c65ab7c744e..5eb99182aded 100644 --- a/drivers/kernelsu/allowlist.h +++ b/drivers/kernelsu/policy/allowlist.h @@ -1,11 +1,10 @@ 
#ifndef __KSU_H_ALLOWLIST #define __KSU_H_ALLOWLIST -#include -#include #include "app_profile.h" #define PER_USER_RANGE 100000 +#define WEBVIEW_ZYGOTE_UID 1053 #define FIRST_APPLICATION_UID 10000 #define LAST_APPLICATION_UID 19999 #define FIRST_ISOLATED_UID 99000 diff --git a/drivers/kernelsu/app_profile.c b/drivers/kernelsu/policy/app_profile.c similarity index 76% rename from drivers/kernelsu/app_profile.c rename to drivers/kernelsu/policy/app_profile.c index 729b6a30820f..380c6cb59d83 100644 --- a/drivers/kernelsu/app_profile.c +++ b/drivers/kernelsu/policy/app_profile.c @@ -1,22 +1,6 @@ -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include // signal_struct -#include -#else -#include -#endif -#include -#include -#include -#include -#include - #if LINUX_VERSION_CODE >= KERNEL_VERSION (6, 7, 0) static struct group_info root_groups = { .usage = REFCOUNT_INIT(2) }; -#else +#else static struct group_info root_groups = { .usage = ATOMIC_INIT(2) }; #endif @@ -64,7 +48,7 @@ static void setup_groups(struct root_profile *profile, struct cred *cred) put_group_info(group_info); } -void disable_seccomp() +static void disable_seccomp() { // for < 5.9 lets have free_task do it for us (put_seccomp_filter) @@ -73,7 +57,7 @@ void disable_seccomp() #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) struct task_struct *fake; - fake = kmalloc(sizeof(*fake), GFP_ATOMIC); + fake = kmalloc(sizeof(*fake), GFP_KERNEL); if (!fake) { pr_warn("failed to alloc fake task_struct\n"); return; @@ -115,21 +99,22 @@ void disable_seccomp() #endif // 5.9 } -static void escape_to_root(bool is_forced) +static int escape_to_root(bool is_forced) { + int ret = 0; struct cred *cred; struct root_profile profile; + struct user_struct *new_user; cred = prepare_creds(); if (!cred) { pr_warn("prepare_creds failed!\n"); - return; + return -ENOMEM; } if (!is_forced && ksu_get_uid_t(cred->euid) == 0) { pr_warn("Already root, don't escape!\n"); - abort_creds(cred); - 
return; + goto out_abort_creds; } ksu_get_root_profile(ksu_get_uid_t(cred->uid), &profile); @@ -147,6 +132,35 @@ static void escape_to_root(bool is_forced) BUILD_BUG_ON(sizeof(profile.capabilities.effective) != sizeof(kernel_cap_t)); + /* + * Mirror the kernel set*uid path: update cred->user first, then + * cred->ucounts, before commit_creds(). commit_creds() moves + * RLIMIT_NPROC accounting based on cred->user; if uid changes while + * user/ucounts stay stale, the old charge can remain pinned to the + * previous UID. + * See kernel/sys.c:set_user() and kernel/cred.c:set_cred_ucounts() / + * commit_creds(): + * https://github.com/torvalds/linux/blob/v5.14/kernel/sys.c + * https://github.com/torvalds/linux/blob/v5.14/kernel/cred.c + */ + new_user = alloc_uid(cred->uid); + if (!new_user) { + ret = -ENOMEM; + goto out_abort_creds; + } + + free_uid(cred->user); + cred->user = new_user; + + // v5.14+ added cred->ucounts, so we must refresh it after changing uid/user: + // https://github.com/torvalds/linux/commit/905ae01c4ae2ae3df05bb141801b1db4b7d83c61#diff-ff6060da281bd9ef3f24e17b77a9b0b5b2ed2d7208bb69b29107bee69732bd31 + // on older kernels, per-UID process accounting lives in user_struct. +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0) + if (set_cred_ucounts(cred)) { + goto out_abort_creds; + } +#endif + // setup capabilities // we need CAP_DAC_READ_SEARCH becuase `/data/adb/ksud` is not accessible for non root process // we add it here but don't add it to cap_inhertiable, it would be dropped automaticly after exec! 
@@ -164,22 +178,16 @@ static void escape_to_root(bool is_forced) disable_seccomp(); setup_mount_ns(profile.namespaces); -} + return 0; -void escape_to_root_for_init(void) { - struct cred *cred = prepare_creds(); - if (!cred) { - pr_err("Failed to prepare init's creds!\n"); - return; - } - - setup_selinux(KERNEL_SU_CONTEXT, cred); - commit_creds(cred); +out_abort_creds: + abort_creds(cred); + return ret; } -void escape_with_root_profile(void) +int escape_with_root_profile(void) { - escape_to_root(false); + return escape_to_root(false); } void escape_to_root_forced(void) diff --git a/drivers/kernelsu/policy/app_profile.h b/drivers/kernelsu/policy/app_profile.h new file mode 100644 index 000000000000..747f550236d7 --- /dev/null +++ b/drivers/kernelsu/policy/app_profile.h @@ -0,0 +1,9 @@ +#ifndef __KSU_H_APP_PROFILE +#define __KSU_H_APP_PROFILE + +// Escalate current process to root with the appropriate profile +int escape_with_root_profile(void); + +void escape_to_root_forced(void); + +#endif diff --git a/drivers/kernelsu/feature.c b/drivers/kernelsu/policy/feature.c similarity index 97% rename from drivers/kernelsu/feature.c rename to drivers/kernelsu/policy/feature.c index 57600b1f234c..cf9ee4d5e0eb 100644 --- a/drivers/kernelsu/feature.c +++ b/drivers/kernelsu/policy/feature.c @@ -1,6 +1,3 @@ -#include -#include - static const struct ksu_feature_handler *feature_handlers[KSU_FEATURE_MAX]; static DEFINE_MUTEX(feature_mutex); @@ -147,7 +144,7 @@ int ksu_set_feature(u32 feature_id, u64 value) return ret; } -void ksu_feature_init(void) +void __init ksu_feature_init(void) { int i; @@ -158,7 +155,7 @@ void ksu_feature_init(void) pr_info("feature: feature management initialized\n"); } -void ksu_feature_exit(void) +void __exit ksu_feature_exit(void) { int i; diff --git a/drivers/kernelsu/feature.h b/drivers/kernelsu/policy/feature.h similarity index 73% rename from drivers/kernelsu/feature.h rename to drivers/kernelsu/policy/feature.h index bf0fda4d3761..1eb12392e617 
100644 --- a/drivers/kernelsu/feature.h +++ b/drivers/kernelsu/policy/feature.h @@ -1,19 +1,6 @@ #ifndef __KSU_H_FEATURE #define __KSU_H_FEATURE -#include - -enum ksu_feature_id { - KSU_FEATURE_SU_COMPAT = 0, - KSU_FEATURE_KERNEL_UMOUNT = 1, - -#ifdef CONFIG_KSU_EXTRAS // custom extensions - KSU_FEATURE_AVC_SPOOF = 10003, -#endif - - KSU_FEATURE_MAX -}; - typedef int (*ksu_feature_get_t)(u64 *value); typedef int (*ksu_feature_set_t)(u64 value); diff --git a/drivers/kernelsu/rp_sucompat.c b/drivers/kernelsu/rp_sucompat.c deleted file mode 100644 index c156e29a7bbb..000000000000 --- a/drivers/kernelsu/rp_sucompat.c +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_MUTEX(ksu_rp_sucompat_lock); - -// struct filename *getname_flags(const char __user *filename, int flags, int *empty) -// https://elixir.bootlin.com/linux/v4.9.337/source/samples/kprobes/kretprobe_example.c - -extern int ksu_getname_flags_kernel(char **kname, int flags); - -struct kretprobe *getname_rp; - -static int getname_flags_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - int *flags = (int *)ri->data; - - struct filename *ret = (struct filename *)PT_REGS_RC(regs); - if (IS_ERR(ret) || !ret || !ret->name) - return 0; - - ksu_getname_flags_kernel((char **)&ret->name, *flags); - return 0; -} - -static int getname_flags_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - int *flags = (int *)ri->data; // as per sample, we store everything on ri->data ? 
- *flags = (int)PT_REGS_PARM2(regs); // keep a copy of arg2 - - return 0; -} - -#if 0 -static struct kretprobe getname_kretprobe = { - .kp.symbol_name = "getname_flags", - .entry_handler = getname_flags_entry_handler, - .handler = getname_flags_ret_handler, - .data_size = sizeof(int), - .maxactive = 20, -}; -#endif - -// kanged from upstrteam -// this method allows high volume register/unregister -static struct kretprobe *init_kretprobe(const char *symbol, - kretprobe_handler_t entry_handler, - kretprobe_handler_t ret_handler, - size_t data_size, - int maxactive) -{ - struct kretprobe *rp = kzalloc(sizeof(struct kretprobe), GFP_KERNEL); - if (!rp) - return NULL; - - rp->kp.symbol_name = symbol; - rp->entry_handler = entry_handler; - rp->handler = ret_handler; - rp->data_size = data_size; - rp->maxactive = maxactive; - - mutex_lock(&ksu_rp_sucompat_lock); - int ret = register_kretprobe(rp); - mutex_unlock(&ksu_rp_sucompat_lock); - if (ret) { - kfree(rp); - return NULL; - } - pr_info("rp_sucompat: planted kretprobe at %s: %p\n", rp->kp.symbol_name, rp->kp.addr); - - return rp; -} - -static void destroy_kretprobe(struct kretprobe **rp_ptr) -{ - if (!rp_ptr || !*rp_ptr) - return; - - mutex_lock(&ksu_rp_sucompat_lock); - unregister_kretprobe(*rp_ptr); - mutex_unlock(&ksu_rp_sucompat_lock); - kfree(*rp_ptr); - *rp_ptr = NULL; -} - -static void rp_sucompat_exit() -{ - pr_info("rp_sucompat: unregister getname_flags!\n"); - destroy_kretprobe(&getname_rp); -} - -static void rp_sucompat_init() -{ - pr_info("%s: register getname_flags!\n", __func__); - getname_rp = init_kretprobe("getname_flags", getname_flags_entry_handler, - getname_flags_ret_handler, sizeof(int), 20); -} diff --git a/drivers/kernelsu/ksud.c b/drivers/kernelsu/runtime/ksud.c similarity index 77% rename from drivers/kernelsu/ksud.c rename to drivers/kernelsu/runtime/ksud.c index 40ebe997b787..fa578e50a55a 100644 --- a/drivers/kernelsu/ksud.c +++ b/drivers/kernelsu/runtime/ksud.c @@ -1,34 +1,3 @@ -#include 
-#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0) -#include -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) -#include -#else -#include -#endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) -#include -#endif -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -#include /* fatal_signal_pending */ -#else -#include /* fatal_signal_pending */ -#endif -#include - bool ksu_module_mounted __read_mostly = false; bool ksu_boot_completed __read_mostly = false; @@ -84,7 +53,6 @@ void on_post_fs_data(void) stop_input_hook(); } -#if defined(CONFIG_EXT4_FS) && ( LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0) || defined(KSU_HAS_MODERN_EXT4) ) extern void ext4_unregister_sysfs(struct super_block *sb); int nuke_ext4_sysfs(const char *mnt) { @@ -107,12 +75,6 @@ int nuke_ext4_sysfs(const char *mnt) path_put(&path); return 0; } -#else -int nuke_ext4_sysfs(const char* mnt) { - pr_info("%s: feature not implemented!\n", __func__); - return 0; -} -#endif void on_module_mounted(void) { @@ -131,7 +93,7 @@ void on_boot_completed(void) // since _ksud handler only uses argv and envp for comparisons // this can probably work // adapted from ksu_handle_execveat_ksud -static int ksu_handle_bprm_ksud(const char *filename, const char *argv1, const char *envp, size_t envp_len) +static inline int ksu_handle_bprm_ksud(const char *filename, const char *argv1, const char *envp, size_t envp_len) { static const char app_process[] = "/system/bin/app_process"; static bool first_app_process = true; @@ -152,63 +114,56 @@ static int ksu_handle_bprm_ksud(const char *filename, const char *argv1, const c // debug! remove me! 
pr_info("%s: filename: %s argv1: %s envp_len: %zu\n", __func__, filename, argv1, envp_len); -#ifdef CONFIG_KSU_DEBUG - const char *envp_n = envp; - unsigned int envc = 1; - do { - pr_info("%s: envp[%d]: %s\n", __func__, envc, envp_n); - envp_n += strlen(envp_n) + 1; - envc++; - } while (envp_n < envp + 256); -#endif - if (init_second_stage_executed) goto first_app_process; // /system/bin/init with argv1 if (!strcmp(filename, system_bin_init) && argv1 && !strcmp(argv1, "second_stage")) { pr_info("%s: /system/bin/init second_stage executed\n", __func__); + init_second_stage_executed = true; apply_kernelsu_rules(); cache_sid(); setup_ksu_cred(); - init_second_stage_executed = true; } // /init with argv1 if (!strcmp(filename, old_system_init) && argv1 && !strcmp(argv1, "--second-stage")) { pr_info("%s: /init --second-stage executed\n", __func__); + init_second_stage_executed = true; apply_kernelsu_rules(); cache_sid(); setup_ksu_cred(); - init_second_stage_executed = true; } if (!envp || !envp_len) goto first_app_process; + if (init_second_stage_executed) + goto first_app_process; + // /init without argv1/useless-argv1 but usable envp - // untested! TODO: test and debug me! - if (!init_second_stage_executed && !strcmp(filename, old_system_init)) { - - // we hunt for "INIT_SECOND_STAGE" - const char *envp_n = envp; - unsigned int envc = 1; - do { - if (strstarts(envp_n, "INIT_SECOND_STAGE")) - break; - envp_n += strlen(envp_n) + 1; - envc++; - } while (envp_n < envp + envp_len); - pr_info("%s: envp[%d]: %s\n", __func__, envc, envp_n); - - if (!strcmp(envp_n, "INIT_SECOND_STAGE=1") - || !strcmp(envp_n, "INIT_SECOND_STAGE=true") ) { - pr_info("%s: /init +envp: INIT_SECOND_STAGE executed\n", __func__); - apply_kernelsu_rules(); - cache_sid(); - setup_ksu_cred(); - init_second_stage_executed = true; - } + // we don't check filename for this as we are a step late on bprm + // the envp we see is the one before it forks. 
+ // we hunt for "INIT_SECOND_STAGE" + const char *envp_n = envp; + unsigned int envc = 1; + do { + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("%s: envp[%d]: %s\n", __func__, envc, envp_n); + + if (strstarts(envp_n, "INIT_SECOND_STAGE")) + break; + + envp_n += strlen(envp_n) + 1; + envc++; + } while (envp_n < envp + envp_len); + + if (!strcmp(envp_n, "INIT_SECOND_STAGE=1") || !strcmp(envp_n, "INIT_SECOND_STAGE=true") ) { + pr_info("%s: /init +envp: %s executed\n", __func__, envp_n); + init_second_stage_executed = true; + apply_kernelsu_rules(); + cache_sid(); + setup_ksu_cred(); } first_app_process: @@ -222,7 +177,7 @@ static int ksu_handle_bprm_ksud(const char *filename, const char *argv1, const c return 0; } -int ksu_handle_pre_ksud(const char *filename) +static noinline int ksu_handle_pre_ksud(const char *filename) { if (likely(!ksu_execveat_hook)) return 0; @@ -357,7 +312,7 @@ static bool is_init_rc(struct file *fp) return false; } - if (!S_ISREG(fp->f_path.dentry->d_inode->i_mode)) { + if (!d_is_reg(fp->f_path.dentry)) { return false; } @@ -382,13 +337,13 @@ static bool is_init_rc(struct file *fp) return true; } -static void ksu_handle_initrc(struct file *file) +__attribute__((cold)) +static noinline void ksu_install_rc_hook(struct file *file) { - if (!ksu_vfs_read_hook) { + if (likely(!ksu_vfs_read_hook)) return; - } - if (!is_init(get_current_cred())) + if (!is_init(current_cred())) return; if (!is_init_rc(file)) { @@ -429,23 +384,35 @@ static void ksu_handle_initrc(struct file *file) return; } -// NOTE: https://github.com/tiann/KernelSU/commit/df640917d11dd0eff1b34ea53ec3c0dc49667002 -// - added 260110, seems needed for A17 +// for sys_read kp / syscall table +__attribute__((cold)) +static noinline void ksu_handle_sys_read_fd(unsigned int fd) +{ + if (likely(!ksu_vfs_read_hook)) + return; -#define STAT_NATIVE 0 -#define STAT_STAT64 1 + if (!is_init(current_cred())) + return; -static __always_inline void ksu_common_newfstat_ret(unsigned long fd_long, void 
**statbuf_ptr, const int type) -{ - - if (!ksu_vfs_read_hook) { + struct file *file = fget(fd); + if (!file) { return; } + ksu_install_rc_hook(file); + fput(file); +} + +#define STAT_NATIVE 0 +#define STAT_STAT64 1 - if (!is_init(get_current_cred())) +__attribute__((cold)) +static noinline void ksu_common_newfstat_ret(unsigned int fd_int, void **statbuf_ptr, + const int type, const char *syscall_name) +{ + if (!is_init(current_cred())) return; - struct file *file = fget(fd_long); + struct file *file = fget(fd_int); if (!file) return; @@ -455,7 +422,7 @@ static __always_inline void ksu_common_newfstat_ret(unsigned long fd_long, void } fput(file); - pr_info("%s: stat init.rc \n", __func__); + pr_info("%s: stat init.rc \n", syscall_name); uintptr_t statbuf_ptr_local = (uintptr_t)*(void **)statbuf_ptr; void __user *statbuf = (void __user *)statbuf_ptr_local; @@ -477,36 +444,38 @@ static __always_inline void ksu_common_newfstat_ret(unsigned long fd_long, void #endif if (copy_from_user(&size, st_size_ptr, len)) { - pr_info("%s: read statbuf 0x%lx failed \n", __func__, (unsigned long)st_size_ptr); + pr_info("%s: read statbuf 0x%lx failed \n", syscall_name, (unsigned long)st_size_ptr); return; } new_size = size + ksu_rc_len; - pr_info("%s: adding ksu_rc_len: %ld -> %ld \n", __func__, size, new_size); + pr_info("%s: adding ksu_rc_len: %ld -> %ld \n", syscall_name, size, new_size); if (!copy_to_user(st_size_ptr, &new_size, len)) - pr_info("%s: added ksu_rc_len \n", __func__); + pr_info("%s: added ksu_rc_len \n", syscall_name); else - pr_info("%s: add ksu_rc_len failed: statbuf 0x%lx \n", __func__, (unsigned long)st_size_ptr); + pr_info("%s: add ksu_rc_len failed: statbuf 0x%lx \n", syscall_name, (unsigned long)st_size_ptr); return; } void ksu_handle_newfstat_ret(unsigned int *fd, struct stat __user **statbuf_ptr) { - unsigned long fd_long = (unsigned long)*fd; + if (likely(!ksu_vfs_read_hook)) + return; - // native - ksu_common_newfstat_ret(fd_long, (void **)statbuf_ptr, 
STAT_NATIVE); + ksu_common_newfstat_ret(*fd, (void **)statbuf_ptr, STAT_NATIVE, "sys_newfstat"); } #if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) void ksu_handle_fstat64_ret(unsigned long *fd, struct stat64 __user **statbuf_ptr) { - unsigned long fd_long = (unsigned long)*fd; - // 32-bit call uses this! - ksu_common_newfstat_ret(fd_long, (void **)statbuf_ptr, STAT_STAT64); + if (likely(!ksu_vfs_read_hook)) + return; + + // WARNING: LE-only!!! + ksu_common_newfstat_ret(*(unsigned int *)fd, (void **)statbuf_ptr, STAT_STAT64, "sys_fstat64"); } #endif @@ -523,7 +492,6 @@ bool ksu_is_safe_mode() // stop hook first! stop_input_hook(); - if (!safe_mode_flag) return false; @@ -646,51 +614,6 @@ static int vol_detector_exit() return 0; } -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) // is_ksu_transition -u32 ksud_init_sid = 0; -u32 ksud_su_sid = 0; - -int grab_transition_sids() -{ - int error = security_secctx_to_secid("u:r:init:s0", strlen("u:r:init:s0"), &ksud_init_sid); - if (error) - return 1; - - pr_info("is_ksu_transition: got init sid: %d\n", ksud_init_sid); - - error = security_secctx_to_secid(KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &ksud_su_sid); - if (error) - return 1; - - pr_info("is_ksu_transition: got su sid: %d\n", ksud_su_sid); - - return 0; -} - -bool is_ksu_transition(const struct task_security_struct *old_tsec, - const struct task_security_struct *new_tsec) -{ - - // we don't need this hook anymore after the third ksud run, which is boot-complete. 
- if (likely(ksu_boot_completed)) - return false; - - if (!ksud_su_sid || !ksud_init_sid) { - int ret = grab_transition_sids(); - if (ret) - return false; - } - - // if its init transitioning to su, allow it - if (old_tsec->sid == ksud_init_sid && new_tsec->sid == ksud_su_sid) { - pr_info("%s: allowing init (%d) -> su (%d)\n", __func__, ksud_init_sid, ksud_su_sid); - return true; - } - - return false; -} -#endif // is_ksu_transition - static void stop_vfs_read_hook() { ksu_vfs_read_hook = false; @@ -712,7 +635,7 @@ static void stop_input_hook() vol_detector_exit(); } -void ksu_ksud_init() +void __init ksu_ksud_init() { vol_detector_init(); } diff --git a/drivers/kernelsu/ksud.h b/drivers/kernelsu/runtime/ksud.h similarity index 65% rename from drivers/kernelsu/ksud.h rename to drivers/kernelsu/runtime/ksud.h index 2a2ccf265f8c..28e00fea44c6 100644 --- a/drivers/kernelsu/ksud.h +++ b/drivers/kernelsu/runtime/ksud.h @@ -1,8 +1,6 @@ #ifndef __KSU_H_KSUD #define __KSU_H_KSUD -#include - #define KSUD_PATH "/data/adb/ksud" void ksu_ksud_init(); @@ -16,8 +14,11 @@ bool ksu_is_safe_mode(void); int nuke_ext4_sysfs(const char* mnt); -extern bool ksu_execveat_hook __read_mostly; -extern int ksu_handle_pre_ksud(const char *filename); +bool ksu_execveat_hook __read_mostly; +static noinline int ksu_handle_pre_ksud(const char *filename); + +bool ksu_vfs_read_hook __read_mostly; +static noinline void ksu_install_rc_hook(struct file *file); extern u32 ksu_file_sid; extern bool ksu_module_mounted; diff --git a/drivers/kernelsu/selinux/rules.c b/drivers/kernelsu/selinux/rules.c index d0bd6d2b788e..1e3f7db26150 100644 --- a/drivers/kernelsu/selinux/rules.c +++ b/drivers/kernelsu/selinux/rules.c @@ -1,46 +1,60 @@ -#include -#include -#include - #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) #define SELINUX_POLICY_INSTEAD_SELINUX_SS #endif #define ALL NULL - -static struct policydb *get_policydb(void) -{ - struct policydb *db; -// selinux_state does not exists before 4.19 -#ifdef 
KSU_COMPAT_USE_SELINUX_STATE -#ifdef SELINUX_POLICY_INSTEAD_SELINUX_SS - struct selinux_policy *policy = selinux_state.policy; - db = &policy->policydb; +#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)) +extern int avc_ss_reset(u32 seqno); #else - struct selinux_ss *ss = selinux_state.ss; - db = &ss->policydb; +extern int avc_ss_reset(struct selinux_avc *avc, u32 seqno); #endif +// reset avc cache table, otherwise the new rules will not take effect if already denied +static void reset_avc_cache() +{ +#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)) + avc_ss_reset(0); + selnl_notify_policyload(0); + selinux_status_update_policyload(0); #else - db = &policydb; + struct selinux_avc *avc = selinux_state.avc; + avc_ss_reset(avc, 0); + selnl_notify_policyload(0); + selinux_status_update_policyload(&selinux_state, 0); #endif - return db; + selinux_xfrm_notify_policyload(); } -static DEFINE_MUTEX(ksu_rules); +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0) -void apply_kernelsu_rules() -{ - struct policydb *db; +#if defined(KSU_COMPAT_USE_SELINUX_STATE) +static struct policydb *get_policydb(void) { return &selinux_state.ss->policydb; } +#else +static struct policydb *get_policydb(void) { return &policydb; } +#endif - if (!getenforce()) { - pr_info("SELinux permissive or disabled, apply rules!\n"); - } +// rwlock +#if defined(KSU_COMPAT_USE_SELINUX_STATE) +static inline rwlock_t *ksu_get_policy_rwlock() { return &selinux_state.ss->policy_rwlock; } +#elif defined(KSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK) +static inline rwlock_t *ksu_get_policy_rwlock() { extern rwlock_t policy_rwlock; return &policy_rwlock; } +#else +static inline rwlock_t *ksu_get_policy_rwlock() { return NULL; } +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) || defined(KSU_COMPAT_HAS_BACKPORTED_CPUS_PTR) +static inline cpumask_t *ksu_get_current_cpumask_t() { return current->cpus_ptr; } +#else +static 
inline cpumask_t *ksu_get_current_cpumask_t() { return &current->cpus_allowed; } +#endif - mutex_lock(&ksu_rules); +#endif // < 5.10 - db = get_policydb(); +static int apply_kernelsu_rules_fn(void *ptr) +{ + struct policydb *db = (struct policydb *)ptr; + ksu_type(db, KERNEL_SU_DOMAIN, "domain"); ksu_permissive(db, KERNEL_SU_DOMAIN); ksu_typeattribute(db, KERNEL_SU_DOMAIN, "mlstrustedsubject"); ksu_typeattribute(db, KERNEL_SU_DOMAIN, "netdomain"); @@ -49,7 +63,7 @@ void apply_kernelsu_rules() // Create unconstrained file type ksu_type(db, KERNEL_SU_FILE, "file_type"); ksu_typeattribute(db, KERNEL_SU_FILE, "mlstrustedobject"); - ksu_allow(db, ALL, KERNEL_SU_FILE, ALL, ALL); + ksu_allow(db, "domain", KERNEL_SU_FILE, ALL, ALL); // allow all! ksu_allow(db, KERNEL_SU_DOMAIN, ALL, ALL, ALL); @@ -76,7 +90,7 @@ void apply_kernelsu_rules() ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "open"); ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "read"); ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "process", "getattr"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "process", "sigchld"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "process", "sigchld"); // allowLog ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "dir", "search"); @@ -84,12 +98,12 @@ void apply_kernelsu_rules() ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "open"); ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "getattr"); - // dumpsys - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fd", "use"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "write"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "read"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "open"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "getattr"); + // dumpsys, send fd + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fd", "use"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "write"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "read"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", 
"open"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "getattr"); // bootctl ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "dir", "search"); @@ -98,357 +112,621 @@ void apply_kernelsu_rules() ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "process", "getattr"); // Allow all binder transactions - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "binder", ALL); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "binder", ALL); // Allow system server kill su process ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "getpgid"); ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "sigkill"); - mutex_unlock(&ksu_rules); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "read"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "write"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "connectto"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "getopt"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "getattr"); + + return 0; } -#define MAX_SEPOL_LEN 128 +void apply_kernelsu_rules() +{ + struct policydb *db; + + if (!getenforce()) { + pr_info("SELinux permissive or disabled, apply rules!\n"); + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) + struct selinux_policy *pol, *old_pol = selinux_state.policy; + mutex_lock(&selinux_state.policy_mutex); + pol = ksu_dup_sepolicy(rcu_dereference_protected(old_pol, lockdep_is_held(&selinux_state.policy_mutex))); + if (!pol) { + pr_err("failed to dup selinux_policy\n"); + goto out_unlock; + } + db = &pol->policydb; + + apply_kernelsu_rules_fn((void *)db); + + rcu_assign_pointer(selinux_state.policy, pol); + synchronize_rcu(); + ksu_destroy_sepolicy(old_pol); -#define CMD_NORMAL_PERM 1 -#define CMD_XPERM 2 -#define CMD_TYPE_STATE 3 -#define CMD_TYPE 4 -#define CMD_TYPE_ATTR 5 -#define CMD_ATTR 6 -#define CMD_TYPE_TRANSITION 7 -#define CMD_TYPE_CHANGE 8 -#define CMD_GENFSCON 9 + reset_avc_cache(); +out_unlock: + 
mutex_unlock(&selinux_state.policy_mutex); +#else + + cpumask_t old_mask; + db = get_policydb(); + + rwlock_t *lock = ksu_get_policy_rwlock(); + if (!lock) + goto do_stop_machine; + + /* + * HACK: write_lock() is held with preempt enabled. DO NOT let the + * task be migrated to any other CPU than the current CPU. And since + * set_cpus_allowed_ptr() can sleep, use raw_smp_processor_id() to get + * current CPU and bypass preemption checks. + */ + cpumask_copy(&old_mask, ksu_get_current_cpumask_t()); + set_cpus_allowed_ptr(current, cpumask_of(raw_smp_processor_id())); + + pr_info("%s: type: policy_rwlock \n", __func__); + write_lock(lock); + preempt_enable(); + + // we do this dance since both kernel and userspace can trigger this + if (likely(current && current->mm)) + goto has_current_mm; + + apply_kernelsu_rules_fn((void *)db); + goto out_unlock; + +has_current_mm: + ; + // HACK: raise priority of this to the heavens + int old_policy = current->policy; + struct sched_param old_param = { .sched_priority = current->rt_priority }; + struct sched_param new_param = { .sched_priority = 50 }; + + sched_setscheduler_nocheck(current, 1, &new_param); // raise, fifo, 50 + apply_kernelsu_rules_fn((void *)db); + sched_setscheduler_nocheck(current, old_policy, &old_param); // restore + +out_unlock: + preempt_disable(); + write_unlock(lock); + set_cpus_allowed_ptr(current, &old_mask); + goto out_flush; + +do_stop_machine: + pr_info("%s: type: stop_machine()\n", __func__); + stop_machine(apply_kernelsu_rules_fn, (void *)db, NULL); + +out_flush: + smp_mb(); + reset_avc_cache(); +#endif +} + +#define KSU_SEPOLICY_MAX_BATCH_SIZE (8U * 1024U * 1024U) +#define KSU_SEPOLICY_MAX_ARGS 5 struct sepol_data { u32 cmd; u32 subcmd; - u64 sepol1; - u64 sepol2; - u64 sepol3; - u64 sepol4; - u64 sepol5; - u64 sepol6; - u64 sepol7; }; -static int get_object(char *buf, char __user *user_object, size_t buf_sz, - char **object) +struct sepol_batch_cursor { + const u8 *cur; + const u8 *end; +}; + 
+static size_t sepol_remaining(const struct sepol_batch_cursor *cursor) { - if (!user_object) { - *object = ALL; - return 0; - } + return (size_t)(cursor->end - cursor->cur); +} - if (strncpy_from_user(buf, user_object, buf_sz) < 0) { +static int sepol_read_cmd_header(struct sepol_batch_cursor *cursor, struct sepol_data *header) +{ + if (sepol_remaining(cursor) < sizeof(*header)) { return -EINVAL; } - *object = buf; + memcpy(header, cursor->cur, sizeof(*header)); + cursor->cur += sizeof(*header); return 0; } -#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || \ - LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)) -extern int avc_ss_reset(u32 seqno); -#else -extern int avc_ss_reset(struct selinux_avc *avc, u32 seqno); -#endif -// reset avc cache table, otherwise the new rules will not take effect if already denied -static void reset_avc_cache() -{ -#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || \ - LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)) - avc_ss_reset(0); - selnl_notify_policyload(0); - selinux_status_update_policyload(0); -#else - struct selinux_avc *avc = selinux_state.avc; - avc_ss_reset(avc, 0); - selnl_notify_policyload(0); - selinux_status_update_policyload(&selinux_state, 0); -#endif - selinux_xfrm_notify_policyload(); -} - -int handle_sepolicy(unsigned long arg3, void __user *arg4) +static int sepol_read_string(struct sepol_batch_cursor *cursor, const char **out) { - struct policydb *db; + u32 len; + const char *str; - if (!arg4) { + if (sepol_remaining(cursor) < sizeof(len)) { return -EINVAL; } - if (!getenforce()) { - pr_info("SELinux permissive or disabled when handle policy!\n"); + memcpy(&len, cursor->cur, sizeof(len)); + cursor->cur += sizeof(len); + + if (len >= sepol_remaining(cursor)) { + return -EINVAL; } - struct sepol_data data; - if (copy_from_user(&data, arg4, sizeof(struct sepol_data))) { - pr_err("sepol: copy sepol_data failed.\n"); + str = (const char *)cursor->cur; + if (memchr(str, '\0', len) != NULL || str[len] != '\0') { return 
-EINVAL; } - u32 cmd = data.cmd; - u32 subcmd = data.subcmd; + cursor->cur += len + 1; + if (len == 0) { + *out = ALL; + return 0; + } - mutex_lock(&ksu_rules); + *out = str; + return 0; +} - db = get_policydb(); +static int sepol_require_not_all(const char *value, const char *name) +{ + if (value != ALL) { + return 0; + } - int ret = -EINVAL; - if (cmd == CMD_NORMAL_PERM) { - char src_buf[MAX_SEPOL_LEN]; - char tgt_buf[MAX_SEPOL_LEN]; - char cls_buf[MAX_SEPOL_LEN]; - char perm_buf[MAX_SEPOL_LEN]; + pr_err("sepol: %s cannot be ALL.\n", name); + return -EINVAL; +} - char *s, *t, *c, *p; - if (get_object(src_buf, (void __user *)(uintptr_t)data.sepol1, sizeof(src_buf), &s) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } +static int sepol_expected_argc(u32 cmd) +{ + switch (cmd) { + case KSU_SEPOLICY_CMD_NORMAL_PERM: + return 4; + case KSU_SEPOLICY_CMD_XPERM: + return 5; + case KSU_SEPOLICY_CMD_TYPE_STATE: + return 1; + case KSU_SEPOLICY_CMD_TYPE: + case KSU_SEPOLICY_CMD_TYPE_ATTR: + return 2; + case KSU_SEPOLICY_CMD_ATTR: + return 1; + case KSU_SEPOLICY_CMD_TYPE_TRANSITION: + return 5; + case KSU_SEPOLICY_CMD_TYPE_CHANGE: + return 4; + case KSU_SEPOLICY_CMD_GENFSCON: + return 3; + default: + return -EINVAL; + } +} - if (get_object(tgt_buf, (void __user *)(uintptr_t)data.sepol2, sizeof(tgt_buf), &t) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; +static int apply_one_sepolicy_cmd(struct policydb *db, const struct sepol_data *header, const char **args) +{ + bool success = false; + int ret; + + switch (header->cmd) { + case KSU_SEPOLICY_CMD_NORMAL_PERM: + if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_ALLOW) { + success = ksu_allow(db, args[0], args[1], args[2], args[3]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DENY) { + success = ksu_deny(db, args[0], args[1], args[2], args[3]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_AUDITALLOW) { + success = ksu_auditallow(db, args[0], args[1], args[2], 
args[3]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DONTAUDIT) { + success = ksu_dontaudit(db, args[0], args[1], args[2], args[3]); + } else { + pr_err("sepol: unknown subcmd: %d\n", header->subcmd); } + return success ? 0 : -EINVAL; - if (get_object(cls_buf, (void __user *)(uintptr_t)data.sepol3, sizeof(cls_buf), &c) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; + case KSU_SEPOLICY_CMD_XPERM: + ret = sepol_require_not_all(args[3], "operation"); + if (ret < 0) { + return ret; } - - if (get_object(perm_buf, (void __user *)(uintptr_t)data.sepol4, sizeof(perm_buf), &p) < - 0) { - pr_err("sepol: copy perm failed.\n"); - goto exit; + ret = sepol_require_not_all(args[4], "perm_set"); + if (ret < 0) { + return ret; } - bool success = false; - if (subcmd == 1) { - success = ksu_allow(db, s, t, c, p); - } else if (subcmd == 2) { - success = ksu_deny(db, s, t, c, p); - } else if (subcmd == 3) { - success = ksu_auditallow(db, s, t, c, p); - } else if (subcmd == 4) { - success = ksu_dontaudit(db, s, t, c, p); - } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); - } - ret = success ? 0 : -EINVAL; - - } else if (cmd == CMD_XPERM) { - char src_buf[MAX_SEPOL_LEN]; - char tgt_buf[MAX_SEPOL_LEN]; - char cls_buf[MAX_SEPOL_LEN]; - - char __maybe_unused - operation[MAX_SEPOL_LEN]; // it is always ioctl now! 
- char perm_set[MAX_SEPOL_LEN]; - - char *s, *t, *c; - if (get_object(src_buf, (void __user *)(uintptr_t)data.sepol1, sizeof(src_buf), &s) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - if (get_object(tgt_buf, (void __user *)(uintptr_t)data.sepol2, sizeof(tgt_buf), &t) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; - } - if (get_object(cls_buf, (void __user *)(uintptr_t)data.sepol3, sizeof(cls_buf), &c) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; - } - if (strncpy_from_user(operation, (void __user *)(uintptr_t)data.sepol4, - sizeof(operation)) < 0) { - pr_err("sepol: copy operation failed.\n"); - goto exit; - } - if (strncpy_from_user(perm_set, (void __user *)(uintptr_t)data.sepol5, sizeof(perm_set)) < - 0) { - pr_err("sepol: copy perm_set failed.\n"); - goto exit; - } - - bool success = false; - if (subcmd == 1) { - success = ksu_allowxperm(db, s, t, c, perm_set); - } else if (subcmd == 2) { - success = ksu_auditallowxperm(db, s, t, c, perm_set); - } else if (subcmd == 3) { - success = ksu_dontauditxperm(db, s, t, c, perm_set); + if (header->subcmd == KSU_SEPOLICY_SUBCMD_XPERM_ALLOW) { + success = ksu_allowxperm(db, args[0], args[1], args[2], args[4]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_XPERM_AUDITALLOW) { + success = ksu_auditallowxperm(db, args[0], args[1], args[2], args[4]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_XPERM_DONTAUDIT) { + success = ksu_dontauditxperm(db, args[0], args[1], args[2], args[4]); } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); + pr_err("sepol: unknown subcmd: %d\n", header->subcmd); } - ret = success ? 0 : -EINVAL; - } else if (cmd == CMD_TYPE_STATE) { - char src[MAX_SEPOL_LEN]; + return success ? 
0 : -EINVAL; - if (strncpy_from_user(src, (void __user *)(uintptr_t)data.sepol1, sizeof(src)) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; + case KSU_SEPOLICY_CMD_TYPE_STATE: + ret = sepol_require_not_all(args[0], "type"); + if (ret < 0) { + return ret; } - bool success = false; - if (subcmd == 1) { - success = ksu_permissive(db, src); - } else if (subcmd == 2) { - success = ksu_enforce(db, src); + if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_STATE_PERMISSIVE) { + success = ksu_permissive(db, args[0]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_STATE_ENFORCE) { + success = ksu_enforce(db, args[0]); } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); + pr_err("sepol: unknown subcmd: %d\n", header->subcmd); } - if (success) - ret = 0; - - } else if (cmd == CMD_TYPE || cmd == CMD_TYPE_ATTR) { - char type[MAX_SEPOL_LEN]; - char attr[MAX_SEPOL_LEN]; + return success ? 0 : -EINVAL; - if (strncpy_from_user(type, (void __user *)(uintptr_t)data.sepol1, sizeof(type)) < 0) { - pr_err("sepol: copy type failed.\n"); - goto exit; + case KSU_SEPOLICY_CMD_TYPE: + case KSU_SEPOLICY_CMD_TYPE_ATTR: + ret = sepol_require_not_all(args[0], "type"); + if (ret < 0) { + return ret; } - if (strncpy_from_user(attr, (void __user *)(uintptr_t)data.sepol2, sizeof(attr)) < 0) { - pr_err("sepol: copy attr failed.\n"); - goto exit; + ret = sepol_require_not_all(args[1], "attribute"); + if (ret < 0) { + return ret; } - bool success = false; - if (cmd == CMD_TYPE) { - success = ksu_type(db, type, attr); + if (header->cmd == KSU_SEPOLICY_CMD_TYPE) { + success = ksu_type(db, args[0], args[1]); } else { - success = ksu_typeattribute(db, type, attr); + success = ksu_typeattribute(db, args[0], args[1]); } if (!success) { - pr_err("sepol: %d failed.\n", cmd); - goto exit; + pr_err("sepol: %d failed.\n", header->cmd); + return -EINVAL; } - ret = 0; + return 0; + + case KSU_SEPOLICY_CMD_ATTR: + ret = sepol_require_not_all(args[0], "attribute"); + if (ret < 0) { + return 
ret; + } + + if (!ksu_attribute(db, args[0])) { + pr_err("sepol: %d failed.\n", header->cmd); + return -EINVAL; + } + return 0; - } else if (cmd == CMD_ATTR) { - char attr[MAX_SEPOL_LEN]; + case KSU_SEPOLICY_CMD_TYPE_TRANSITION: { + const char *object = ALL; - if (strncpy_from_user(attr, (void __user *)(uintptr_t)data.sepol1, sizeof(attr)) < 0) { - pr_err("sepol: copy attr failed.\n"); - goto exit; + ret = sepol_require_not_all(args[0], "src"); + if (ret < 0) { + return ret; } - if (!ksu_attribute(db, attr)) { - pr_err("sepol: %d failed.\n", cmd); - goto exit; + ret = sepol_require_not_all(args[1], "tgt"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[2], "cls"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[3], "default_type"); + if (ret < 0) { + return ret; } - ret = 0; - } else if (cmd == CMD_TYPE_TRANSITION) { - char src[MAX_SEPOL_LEN]; - char tgt[MAX_SEPOL_LEN]; - char cls[MAX_SEPOL_LEN]; - char default_type[MAX_SEPOL_LEN]; - char object[MAX_SEPOL_LEN]; + object = args[4]; - if (strncpy_from_user(src, (void __user *)(uintptr_t)data.sepol1, sizeof(src)) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; + success = ksu_type_transition(db, args[0], args[1], args[2], args[3], object); + return success ? 
0 : -EINVAL; + } + + case KSU_SEPOLICY_CMD_TYPE_CHANGE: + ret = sepol_require_not_all(args[0], "src"); + if (ret < 0) { + return ret; } - if (strncpy_from_user(tgt, (void __user *)(uintptr_t)data.sepol2, sizeof(tgt)) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; + ret = sepol_require_not_all(args[1], "tgt"); + if (ret < 0) { + return ret; } - if (strncpy_from_user(cls, (void __user *)(uintptr_t)data.sepol3, sizeof(cls)) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; + ret = sepol_require_not_all(args[2], "cls"); + if (ret < 0) { + return ret; } - if (strncpy_from_user(default_type, (void __user *)(uintptr_t)data.sepol4, - sizeof(default_type)) < 0) { - pr_err("sepol: copy default_type failed.\n"); - goto exit; + ret = sepol_require_not_all(args[3], "default_type"); + if (ret < 0) { + return ret; } - char *real_object; - if ((void __user *)(uintptr_t)data.sepol5 == NULL) { - real_object = NULL; + + if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_CHANGE) { + success = ksu_type_change(db, args[0], args[1], args[2], args[3]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_MEMBER) { + success = ksu_type_member(db, args[0], args[1], args[2], args[3]); } else { - if (strncpy_from_user(object, (void __user *)(uintptr_t)data.sepol5, - sizeof(object)) < 0) { - pr_err("sepol: copy object failed.\n"); - goto exit; + pr_err("sepol: unknown subcmd: %d\n", header->subcmd); + } + return success ? 
0 : -EINVAL; + + case KSU_SEPOLICY_CMD_GENFSCON: + ret = sepol_require_not_all(args[0], "name"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[1], "path"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[2], "context"); + if (ret < 0) { + return ret; + } + + if (!ksu_genfscon(db, args[0], args[1], args[2])) { + pr_err("sepol: %d failed.\n", header->cmd); + return -EINVAL; + } + return 0; + + default: + pr_err("sepol: unknown cmd: %d\n", header->cmd); + return -EINVAL; + } +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +int handle_sepolicy(void __user *user_data, u64 data_len) +{ + struct selinux_policy *pol, *old_pol; + struct policydb *db; + struct sepol_batch_cursor cursor; + u8 *payload; + int ret; + int success_cmd_count; + u32 cmd_index; + + if (!user_data || !data_len) { + return -EINVAL; + } + + if (data_len > KSU_SEPOLICY_MAX_BATCH_SIZE) { + return -E2BIG; + } + + payload = kvmalloc((size_t)data_len, GFP_KERNEL); + if (!payload) { + return -ENOMEM; + } + + if (copy_from_user(payload, user_data, (size_t)data_len)) { + ret = -EFAULT; + goto out_free; + } + + if (!getenforce()) { + pr_info("SELinux permissive or disabled when handle policy!\n"); + } + + mutex_lock(&selinux_state.policy_mutex); + + old_pol = selinux_state.policy; + pol = ksu_dup_sepolicy(rcu_dereference_protected( + old_pol, lockdep_is_held(&selinux_state.policy_mutex))); + if (!pol) { + ret = -ENOMEM; + goto out_unlock; + } + db = &pol->policydb; + + cursor.cur = payload; + cursor.end = payload + (size_t)data_len; + + ret = 0; + success_cmd_count = 0; + cmd_index = 0; + while (cursor.cur < cursor.end) { + struct sepol_data header; + const char *args[KSU_SEPOLICY_MAX_ARGS] = { 0 }; + int expected_argc; + u32 arg_index; + + ret = sepol_read_cmd_header(&cursor, &header); + if (ret < 0) { + pr_err("sepol: failed to read cmd header #%u.\n", cmd_index); + goto out_drop_new_policy; + } + + expected_argc = sepol_expected_argc(header.cmd); + if 
(expected_argc < 0 || expected_argc > KSU_SEPOLICY_MAX_ARGS) { + ret = -EINVAL; + pr_err("sepol: invalid cmd header #%u.\n", cmd_index); + goto out_drop_new_policy; + } + + for (arg_index = 0; arg_index < (u32)expected_argc; arg_index++) { + ret = sepol_read_string(&cursor, &args[arg_index]); + if (ret < 0) { + pr_err("sepol: failed to read cmd #%u arg #%u.\n", cmd_index, arg_index); + goto out_drop_new_policy; } - real_object = object; - } - - bool success = ksu_type_transition(db, src, tgt, cls, - default_type, real_object); - if (success) - ret = 0; - - } else if (cmd == CMD_TYPE_CHANGE) { - char src[MAX_SEPOL_LEN]; - char tgt[MAX_SEPOL_LEN]; - char cls[MAX_SEPOL_LEN]; - char default_type[MAX_SEPOL_LEN]; - - if (strncpy_from_user(src, (void __user *)(uintptr_t)data.sepol1, sizeof(src)) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - if (strncpy_from_user(tgt, (void __user *)(uintptr_t)data.sepol2, sizeof(tgt)) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; - } - if (strncpy_from_user(cls, (void __user *)(uintptr_t)data.sepol3, sizeof(cls)) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; - } - if (strncpy_from_user(default_type, (void __user *)(uintptr_t)data.sepol4, - sizeof(default_type)) < 0) { - pr_err("sepol: copy default_type failed.\n"); - goto exit; - } - bool success = false; - if (subcmd == 1) { - success = ksu_type_change(db, src, tgt, cls, - default_type); - } else if (subcmd == 2) { - success = ksu_type_member(db, src, tgt, cls, - default_type); + } + + ret = apply_one_sepolicy_cmd(db, &header, args); + if (ret < 0) { + pr_err("sepol: cmd #%u failed, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd); } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); - } - if (success) - ret = 0; - } else if (cmd == CMD_GENFSCON) { - char name[MAX_SEPOL_LEN]; - char path[MAX_SEPOL_LEN]; - char context[MAX_SEPOL_LEN]; - if (strncpy_from_user(name, (void __user *)(uintptr_t)data.sepol1, sizeof(name)) < 0) { - 
pr_err("sepol: copy name failed.\n"); - goto exit; - } - if (strncpy_from_user(path, (void __user *)(uintptr_t)data.sepol2, sizeof(path)) < 0) { - pr_err("sepol: copy path failed.\n"); - goto exit; - } - if (strncpy_from_user(context, (void __user *)(uintptr_t)data.sepol3, sizeof(context)) < - 0) { - pr_err("sepol: copy context failed.\n"); - goto exit; - } - - if (!ksu_genfscon(db, name, path, context)) { - pr_err("sepol: %d failed.\n", cmd); - goto exit; - } - ret = 0; - } else { - pr_err("sepol: unknown cmd: %d\n", cmd); + success_cmd_count++; + } + cmd_index++; } -exit: - mutex_unlock(&ksu_rules); + rcu_assign_pointer(selinux_state.policy, pol); + synchronize_rcu(); + ksu_destroy_sepolicy(old_pol); - // only allow and xallow needs to reset avc cache, but we cannot do that because - // we are in atomic context. so we just reset it every time. reset_avc_cache(); + ret = success_cmd_count; + goto out_unlock; + +out_drop_new_policy: + ksu_destroy_sepolicy(pol); +out_unlock: + mutex_unlock(&selinux_state.policy_mutex); +out_free: + kvfree(payload); return ret; } +#else + +struct handle_sepolicy_args { + void *ctx_success_cmd_count; + void *ctx_payload; + u64 ctx_data_len; +}; + +static int handle_sepolicy_fn(void *data) +{ + struct sepol_batch_cursor cursor; + int ret = 0; + u32 cmd_index = 0; + int success_cmd_count = 0; + + struct policydb *db = get_policydb(); + struct handle_sepolicy_args *ctx = (struct handle_sepolicy_args *)data; + u8 *payload = (u8 *)ctx->ctx_payload; + u64 data_len = ctx->ctx_data_len; + + cursor.cur = payload; + cursor.end = payload + (size_t)data_len; + + while (cursor.cur < cursor.end) { + struct sepol_data header; + const char *args[KSU_SEPOLICY_MAX_ARGS] = { 0 }; + int expected_argc; + u32 arg_index; + + ret = sepol_read_cmd_header(&cursor, &header); + if (ret < 0) { + pr_err("sepol: failed to read cmd header #%u.\n", cmd_index); + goto out; + } + + expected_argc = sepol_expected_argc(header.cmd); + if (expected_argc < 0 || 
expected_argc > KSU_SEPOLICY_MAX_ARGS) { + ret = -EINVAL; + pr_err("sepol: invalid cmd header #%u.\n", cmd_index); + goto out; + } + + for (arg_index = 0; arg_index < (u32)expected_argc; arg_index++) { + ret = sepol_read_string(&cursor, &args[arg_index]); + if (ret < 0) { + pr_err("sepol: failed to read cmd #%u arg #%u.\n", cmd_index, arg_index); + goto out; + } + } + + ret = apply_one_sepolicy_cmd(db, &header, args); + if (ret < 0) + pr_err("sepol: cmd #%u failed, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd); + else { + pr_info("sepol: cmd #%u success, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd); + success_cmd_count++; + } + + cmd_index++; + } + +out: + *(int *)(ctx->ctx_success_cmd_count) = success_cmd_count; + return ret; +} + +int handle_sepolicy(void __user *user_data, u64 data_len) +{ + u8 *payload; + int ret = 0; + int success_cmd_count = 0; + cpumask_t old_mask; + + if (!user_data || !data_len) + return -EINVAL; + + if (data_len > KSU_SEPOLICY_MAX_BATCH_SIZE) + return -E2BIG; + + payload = kvmalloc((size_t)data_len, GFP_KERNEL); + if (!payload) + return -ENOMEM; + + if (copy_from_user(payload, user_data, (size_t)data_len)) { + ret = -EFAULT; + goto out_free; + } + + if (!getenforce()) { + pr_info("SELinux permissive or disabled when handle policy!\n"); + } + + struct handle_sepolicy_args ctx = { 0 }; + ctx.ctx_success_cmd_count = (void *)&success_cmd_count; + ctx.ctx_payload = (void *)payload; + ctx.ctx_data_len = (u64)data_len; + + rwlock_t *lock = ksu_get_policy_rwlock(); + if (!lock) + goto do_stop_machine; + + /* + * HACK: write_lock() is held with preempt enabled. DO NOT let the + * task be migrated to any other CPU than the current CPU. And since + * set_cpus_allowed_ptr() can sleep, use raw_smp_processor_id() to get + * current CPU and bypass preemption checks. 
+ */ + cpumask_copy(&old_mask, ksu_get_current_cpumask_t()); + set_cpus_allowed_ptr(current, cpumask_of(raw_smp_processor_id())); + + write_lock(lock); + preempt_enable(); + + if (likely(current && current->mm)) + goto has_current_mm; + + ret = handle_sepolicy_fn((void *)&ctx); + goto out_unlock; + +has_current_mm: + ; + int old_policy = current->policy; + struct sched_param old_param = { .sched_priority = current->rt_priority }; + struct sched_param new_param = { .sched_priority = 50 }; + + sched_setscheduler_nocheck(current, 1, &new_param); + ret = handle_sepolicy_fn((void *)&ctx); + sched_setscheduler_nocheck(current, old_policy, &old_param); + +out_unlock: + preempt_disable(); + write_unlock(lock); + set_cpus_allowed_ptr(current, &old_mask); + goto out_done; + +do_stop_machine: + ret = stop_machine(handle_sepolicy_fn, (void *)&ctx, NULL); + +out_done: + if (ret) + goto out_free; + + smp_mb(); + reset_avc_cache(); + ret = success_cmd_count; + +out_free: + kvfree(payload); + + return ret; +} +#endif diff --git a/drivers/kernelsu/selinux/selinux.c b/drivers/kernelsu/selinux/selinux.c index 975c0ae4e05f..d7c6a71d20c6 100644 --- a/drivers/kernelsu/selinux/selinux.c +++ b/drivers/kernelsu/selinux/selinux.c @@ -15,7 +15,7 @@ static u32 cached_zygote_sid __read_mostly = 0; static u32 cached_init_sid __read_mostly = 0; u32 ksu_file_sid __read_mostly = 0; -static int transive_to_domain(const char *domain, struct cred *cred) +static int transive_to_domain(const char *domain, struct cred *cred, bool clear_exec_sid) { u32 sid; int error; @@ -39,13 +39,16 @@ static int transive_to_domain(const char *domain, struct cred *cred) tsec->create_sid = 0; tsec->keycreate_sid = 0; tsec->sockcreate_sid = 0; + if (clear_exec_sid) { + tsec->exec_sid = 0; + } } return error; } void setup_selinux(const char *domain, struct cred *cred) { - if (transive_to_domain(domain, cred)) { + if (transive_to_domain(domain, cred, false)) { pr_err("transive domain failed.\n"); return; } @@ -53,7 +56,7 
@@ void setup_selinux(const char *domain, struct cred *cred) void setup_ksu_cred(void) { - if (ksu_cred && transive_to_domain(KERNEL_SU_CONTEXT, ksu_cred)) { + if (ksu_cred && transive_to_domain(KERNEL_SU_CONTEXT, ksu_cred, false)) { pr_err("setup ksu cred failed.\n"); } } @@ -208,3 +211,19 @@ bool is_init(const struct cred *cred) { return is_sid_match(cred, cached_init_sid, INIT_CONTEXT); } + +void escape_to_root_for_adb_root(void) +{ + struct cred *cred = prepare_creds(); + if (!cred) { + pr_err("Failed to prepare adbd's creds!\n"); + return; + } + + if (transive_to_domain(KERNEL_SU_CONTEXT, cred, true)) { + pr_err("transive domain failed.\n"); + abort_creds(cred); + return; + } + commit_creds(cred); +} diff --git a/drivers/kernelsu/selinux/selinux.h b/drivers/kernelsu/selinux/selinux.h index c80e6cf764f1..cbeac553d20a 100644 --- a/drivers/kernelsu/selinux/selinux.h +++ b/drivers/kernelsu/selinux/selinux.h @@ -1,16 +1,11 @@ #ifndef __KSU_H_SELINUX #define __KSU_H_SELINUX -#include "linux/types.h" -#include "linux/version.h" -#include "linux/cred.h" - #if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)) || defined(KSU_COMPAT_HAS_SELINUX_STATE) #define KSU_COMPAT_USE_SELINUX_STATE #endif -// TODO: rename to "ksu" -#define KERNEL_SU_DOMAIN "su" +#define KERNEL_SU_DOMAIN "ksu" #define KERNEL_SU_FILE "ksu_file" #define KERNEL_SU_CONTEXT "u:r:" KERNEL_SU_DOMAIN ":s0" @@ -36,8 +31,10 @@ bool is_init(const struct cred* cred); void apply_kernelsu_rules(); -int handle_sepolicy(unsigned long arg3, void __user *arg4); +int handle_sepolicy(void __user *user_data, u64 data_len); void setup_ksu_cred(); +void escape_to_root_for_adb_root(); + #endif diff --git a/drivers/kernelsu/selinux/sepolicy.c b/drivers/kernelsu/selinux/sepolicy.c index 32b1ac209565..45e32b8e780d 100644 --- a/drivers/kernelsu/selinux/sepolicy.c +++ b/drivers/kernelsu/selinux/sepolicy.c @@ -1,8 +1,3 @@ -#include -#include -#include -#include - #define KSU_SUPPORT_ADD_TYPE /* @@ -365,7 +360,7 @@ static void 
add_xperm_rule_raw(struct policydb *db, struct type_datum *src, if (datum->u.xperms == NULL) { datum->u.xperms = (struct avtab_extended_perms *)(kzalloc( - sizeof(xperms), GFP_ATOMIC)); + sizeof(xperms), GFP_KERNEL)); if (!datum->u.xperms) { pr_err("alloc xperms failed\n"); return; @@ -562,12 +557,12 @@ static bool add_filename_trans(struct policydb *db, const char *s, if (trans == NULL) { trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans), - GFP_ATOMIC); + GFP_KERNEL); struct filename_trans_key *new_key = (struct filename_trans_key *)kzalloc(sizeof(*new_key), - GFP_ATOMIC); + GFP_KERNEL); *new_key = key; - new_key->name = kstrdup(key.name, GFP_ATOMIC); + new_key->name = kstrdup(key.name, GFP_KERNEL); trans->next = last; trans->otype = def->value; hashtab_insert(&db->filename_trans, new_key, trans, @@ -588,20 +583,20 @@ static bool add_filename_trans(struct policydb *db, const char *s, if (trans == NULL) { trans = (struct filename_trans_datum *)kcalloc(sizeof(*trans), - 1, GFP_ATOMIC); + 1, GFP_KERNEL); if (!trans) { pr_err("add_filename_trans: Failed to alloc datum\n"); return false; } struct filename_trans *new_key = (struct filename_trans *)kmalloc(sizeof(*new_key), - GFP_ATOMIC); + GFP_KERNEL); if (!new_key) { pr_err("add_filename_trans: Failed to alloc new_key\n"); return false; } *new_key = key; - new_key->name = kstrdup(key.name, GFP_ATOMIC); + new_key->name = kstrdup(key.name, GFP_KERNEL); trans->otype = def->value; hashtab_insert(db->filename_trans, new_key, trans); } @@ -619,8 +614,8 @@ static bool add_genfscon(struct policydb *db, const char *fs_name, // https://github.com/torvalds/linux/commit/590b9d576caec6b4c46bba49ed36223a399c3fc5#diff-cc9aa90e094e6e0f47bd7300db4f33cf4366b98b55d8753744f31eb69c691016R844-R845 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) -#define ksu_kvrealloc(p, new_size, _old_size) kvrealloc(p, new_size, GFP_ATOMIC) -#else +#define ksu_kvrealloc(p, new_size, _old_size) kvrealloc(p, new_size, GFP_KERNEL) +#elif 
LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) || defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY) // https://cs.android.com/android/_/android/kernel/common/+/f5f3e54f811679761c33526e695bd296190faade // Some 5.10 kernel don't have this backport, so copy one. static void *ksu_kvrealloc_compat(const void *p, size_t oldsize, size_t newsize, gfp_t flags) @@ -636,7 +631,7 @@ static void *ksu_kvrealloc_compat(const void *p, size_t oldsize, size_t newsize, kvfree(p); return newp; } -#define ksu_kvrealloc(p, new_size, old_size) ksu_kvrealloc_compat(p, old_size, new_size, GFP_ATOMIC) +#define ksu_kvrealloc(p, new_size, old_size) ksu_kvrealloc_compat(p, old_size, new_size, GFP_KERNEL) #endif static bool add_type(struct policydb *db, const char *type_name, bool attr) @@ -650,7 +645,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) u32 value = ++db->p_types.nprim; type = (struct type_datum *)kzalloc(sizeof(struct type_datum), - GFP_ATOMIC); + GFP_KERNEL); if (!type) { pr_err("add_type: alloc type_datum failed.\n"); return false; @@ -660,7 +655,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) type->value = value; type->attribute = attr; - char *key = kstrdup(type_name, GFP_ATOMIC); + char *key = kstrdup(type_name, GFP_KERNEL); if (!key) { pr_err("add_type: alloc key failed.\n"); return false; @@ -774,12 +769,12 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) */ size_t new_size = sizeof(struct ebitmap) * db->p_types.nprim; struct ebitmap *new_type_attr_map = - (krealloc(db->type_attr_map, new_size, GFP_ATOMIC)); + (krealloc(db->type_attr_map, new_size, GFP_KERNEL)); struct type_datum **new_type_val_to_struct = krealloc(db->type_val_to_struct, sizeof(*db->type_val_to_struct) * db->p_types.nprim, - GFP_ATOMIC); + GFP_KERNEL); if (!new_type_attr_map) { pr_err("add_type: alloc type_attr_map failed\n"); @@ -821,15 +816,15 @@ static bool add_type(struct policydb *db, 
const char *type_name, bool attr) // flex_array is not extensible, we need to create a new bigger one instead struct flex_array *new_type_attr_map_array = flex_array_alloc(sizeof(struct ebitmap), db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); struct flex_array *new_type_val_to_struct = flex_array_alloc(sizeof(struct type_datum *), db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); struct flex_array *new_val_to_name_types = flex_array_alloc(sizeof(char *), db->symtab[SYM_TYPES].nprim, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); if (!new_type_attr_map_array) { pr_err("add_type: alloc type_attr_map_array failed\n"); @@ -848,20 +843,20 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) // preallocate so we don't have to worry about the put ever failing if (flex_array_prealloc(new_type_attr_map_array, 0, db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO)) { + GFP_KERNEL | __GFP_ZERO)) { pr_err("add_type: prealloc type_attr_map_array failed\n"); return false; } if (flex_array_prealloc(new_type_val_to_struct, 0, db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO)) { + GFP_KERNEL | __GFP_ZERO)) { pr_err("add_type: prealloc type_val_to_struct_array failed\n"); return false; } if (flex_array_prealloc(new_val_to_name_types, 0, db->symtab[SYM_TYPES].nprim, - GFP_ATOMIC | __GFP_ZERO)) { + GFP_KERNEL | __GFP_ZERO)) { pr_err("add_type: prealloc val_to_name_types failed\n"); return false; } @@ -873,14 +868,14 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) old_elem = flex_array_get(db->type_attr_map_array, j); if (old_elem) flex_array_put(new_type_attr_map_array, j, old_elem, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } for (j = 0; j < db->type_val_to_struct_array->total_nr_elements; j++) { old_elem = flex_array_get_ptr(db->type_val_to_struct_array, j); if (old_elem) flex_array_put_ptr(new_type_val_to_struct, j, old_elem, - GFP_ATOMIC | __GFP_ZERO); + 
GFP_KERNEL | __GFP_ZERO); } for (j = 0; j < db->symtab[SYM_TYPES].nprim; j++) { @@ -888,7 +883,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) flex_array_get_ptr(db->sym_val_to_name[SYM_TYPES], j); if (old_elem) flex_array_put_ptr(new_val_to_name_types, j, old_elem, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } // store the pointer of old flex arrays first, when assigning new ones we @@ -911,7 +906,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) flex_array_free(old_fa); } flex_array_put_ptr(db->type_val_to_struct_array, value - 1, type, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); old_fa = db->sym_val_to_name[SYM_TYPES]; db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types; @@ -919,7 +914,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) flex_array_free(old_fa); } flex_array_put_ptr(db->sym_val_to_name[SYM_TYPES], value - 1, key, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); int i; for (i = 0; i < db->p_roles.nprim; ++i) { @@ -1132,3 +1127,461 @@ bool ksu_genfscon(struct policydb *db, const char *fs_name, const char *path, { return add_genfscon(db, fs_name, path, ctx); } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#include "ss/avtab.h" +#include "ss/constraint.h" +#include "ss/ebitmap.h" +#include "ss/hashtab.h" +#include "ss/policydb.h" +#include "ss/services.h" + + +// https://github.com/torvalds/linux/commit/581646c3fb98494009671f6d347ea125bc0e663a +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 10, 0) +#define CONST_IF_6_10 const +#else +#define CONST_IF_6_10 +#endif + +// ======== begin copy ======== + +static int copy_hashtab_node(struct hashtab_node *new_node, CONST_IF_6_10 struct hashtab_node *old_node, void *data) +{ + new_node->datum = old_node->datum; + new_node->key = old_node->key; + return 0; +} + +static int destroy_hashtab_node(void *key, void *datum, void *data) +{ + // just copied pointer, no need to free + 
return 0; +} + +static int shallow_copy_hashtab(struct hashtab *new_tab, struct hashtab *old_tab) +{ + return hashtab_duplicate(new_tab, old_tab, copy_hashtab_node, destroy_hashtab_node, NULL); +} + +// ======== class_datum ======== + +static int +copy_class_datum_partially_callback(struct hashtab_node *new_node, CONST_IF_6_10 struct hashtab_node *old_node, void *data) +{ + struct policydb *db = data; + struct class_datum *cls = old_node->datum, *new_cls; + struct constraint_node *oldn, *n, *nprev = NULL; + struct constraint_expr *olde, *e, *eprev; + new_node->key = old_node->key; + new_cls = kmemdup(cls, sizeof(struct class_datum), GFP_KERNEL); + if (!new_cls) + return -ENOMEM; + new_node->datum = new_cls; + new_cls->constraints = NULL; + for (oldn = cls->constraints; oldn; oldn = oldn->next) { + n = kmemdup(oldn, sizeof(struct constraint_node), GFP_KERNEL); + if (!n) + goto out_nomem; + if (nprev) { + nprev->next = n; + } else { + new_cls->constraints = n; + } + eprev = NULL; + n->expr = NULL; + for (olde = oldn->expr; olde; olde = olde->next) { + e = kmemdup(olde, sizeof(struct constraint_expr), GFP_KERNEL); + if (!e) { + goto out_nomem; + } + if (eprev) { + eprev->next = e; + } else { + n->expr = e; + } + if (olde->expr_type == CEXPR_NAMES) { + if (ebitmap_cpy(&e->names, &olde->names) < 0) { + goto out_nomem; + } + } + eprev = e; + } + nprev = n; + } + + db->class_val_to_struct[new_cls->value - 1] = new_cls; + + return 0; +out_nomem: + return -ENOMEM; +} + +static int destroy_class_datum_partially_callback(void *key, void *datum, void *data) +{ + struct class_datum *cls = datum; + struct constraint_node *n, *nprev; + struct constraint_expr *e, *eprev; + if (cls) { + for (n = cls->constraints; n;) { + for (e = n->expr; e;) { + if (e->expr_type == CEXPR_NAMES) { + ebitmap_destroy(&e->names); + } + eprev = e; + e = e->next; + kfree(eprev); + } + nprev = n; + n = n->next; + kfree(nprev); + } + } + kfree(cls); + + return 0; +} + +static void 
free_class_datum_partially(struct policydb *db) +{ + if (db->class_val_to_struct) { + kfree(db->class_val_to_struct); + } + + if (db->p_classes.table.htable) { + hashtab_map(&db->p_classes.table, destroy_class_datum_partially_callback, NULL); + hashtab_destroy(&db->p_classes.table); + } +} + +static int copy_class_datum_partially(struct policydb *new_db, struct policydb *old_db) +{ + int ret; + u32 n = new_db->symtab[SYM_CLASSES].nprim; + struct class_datum **new_class_val_to_struct; + + new_db->class_val_to_struct = NULL; + memset(&new_db->p_classes.table, 0, sizeof(new_db->p_classes.table)); + + new_class_val_to_struct = + kcalloc(n, sizeof(struct class_datum *), GFP_KERNEL); + if (!new_class_val_to_struct) { + ret = -ENOMEM; + goto exit; + } + new_db->class_val_to_struct = new_class_val_to_struct; + + ret = hashtab_duplicate(&new_db->p_classes.table, &old_db->p_classes.table, + copy_class_datum_partially_callback, + destroy_class_datum_partially_callback, new_db); + + if (ret) { + goto exit; + } + + return 0; + +exit: + free_class_datum_partially(new_db); + return ret; +} + +// ======== avtab ======== + +static int copy_avtab(struct avtab *new_avtab, struct avtab *old_avtab) +{ + int ret, i; + struct avtab_node *n, *p; + ret = avtab_alloc_dup(new_avtab, old_avtab); + if (ret < 0) + return ret; + // avtab_alloc_dup didn't zero it + new_avtab->nel = 0; + + for (i = 0; i < old_avtab->nslot; i++) { + n = old_avtab->htable[i]; + while (n) { + p = avtab_insert_nonunique(new_avtab, &n->key, &n->datum); + if (!p) { + ret = -ENOMEM; + goto out_free; + } + n = n->next; + } + } + + return 0; + +out_free: + avtab_destroy(new_avtab); + return ret; +} + +// ======== role_datum ======== + +static int +copy_role_datum_partially_callback(struct hashtab_node *new_node, CONST_IF_6_10 struct hashtab_node *old_node, void *data) +{ + int ret = 0; + struct policydb *db = data; + struct role_datum *role = old_node->datum, *new_role; + new_role = kmemdup(role, sizeof(struct role_datum), 
GFP_KERNEL); + if (!new_role) { + ret = -ENOMEM; + goto out; + } + new_node->datum = new_role; + new_node->key = old_node->key; + + ret = ebitmap_cpy(&new_role->types, &role->types); + if (ret) { + goto out; + } + db->role_val_to_struct[role->value - 1] = new_role; + +out: + return ret; +} + +static int destroy_role_datum_partially_callback(void *key, void *datum, void *data) +{ + struct role_datum *role = datum; + if (role) { + ebitmap_destroy(&role->types); + kfree(role); + } + return 0; +} + +static void free_role_datum_partially(struct policydb *db) +{ + if (db->role_val_to_struct) { + kfree(db->role_val_to_struct); + } + if (db->p_roles.table.htable) { + hashtab_map(&db->p_roles.table, destroy_role_datum_partially_callback, NULL); + hashtab_destroy(&db->p_roles.table); + } +} + +static int copy_role_datum_partially(struct policydb *new_db, struct policydb *old_db) +{ + int ret; + struct role_datum **new_role_val_to_struct; + u32 n = old_db->p_roles.nprim; + + new_db->role_val_to_struct = NULL; + memset(&new_db->p_roles.table, 0, sizeof(new_db->p_roles.table)); + + new_role_val_to_struct = + kcalloc(n, sizeof(*new_db->role_val_to_struct), GFP_KERNEL); + if (!new_role_val_to_struct) { + ret = -ENOMEM; + goto out_free; + } + new_db->role_val_to_struct = new_role_val_to_struct; + + ret = hashtab_duplicate(&new_db->p_roles.table, &old_db->p_roles.table, + copy_role_datum_partially_callback, + destroy_role_datum_partially_callback, new_db); + if (ret) + goto out_free; + return 0; + +out_free: + free_role_datum_partially(new_db); + + return ret; +} + +// ======== type_datum ======== + +static void free_type_datum_partially(struct policydb *db) +{ + u32 sz = db->p_types.nprim, i; + if (db->type_attr_map_array) { + for (i = 0; i < sz; i++) { + ebitmap_destroy(&db->type_attr_map_array[i]); + } + + kvfree(db->type_attr_map_array); + } + + if (db->type_val_to_struct) { + kvfree(db->type_val_to_struct); + } + + if (db->sym_val_to_name[SYM_TYPES]) { + 
kvfree(db->sym_val_to_name[SYM_TYPES]); + } + + hashtab_destroy(&db->p_types.table); +} + +static int copy_type_datum_partially(struct policydb *new_db, struct policydb *old_db) +{ + int ret = -ENOMEM; + u32 sz = new_db->p_types.nprim, i; + struct ebitmap *new_type_attr_map_array; + struct type_datum **new_type_val_to_struct; + char **new_sym_val_to_name_types; + + new_db->type_attr_map_array = NULL; + new_db->type_val_to_struct = NULL; + new_db->sym_val_to_name[SYM_TYPES] = NULL; + memset(&new_db->p_types.table, 0, sizeof(new_db->p_types.table)); + + // ======== type_attr_map_array ======== + + new_type_attr_map_array = kvcalloc(sz, sizeof(struct ebitmap), GFP_KERNEL); + + if (!new_type_attr_map_array) { + goto out; + } + + new_db->type_attr_map_array = new_type_attr_map_array; + for (i = 0; i < sz; i++) { + ret = ebitmap_cpy(&new_db->type_attr_map_array[i], + &old_db->type_attr_map_array[i]); + if (ret < 0) + goto out; + } + + // ======== type_val_to_struct ======== + ret = -ENOMEM; + + new_type_val_to_struct = + kvcalloc(sz, sizeof(*new_db->type_val_to_struct), GFP_KERNEL); + if (!new_type_val_to_struct) { + goto out; + } + new_db->type_val_to_struct = new_type_val_to_struct; + memcpy(new_db->type_val_to_struct, old_db->type_val_to_struct, + sz * sizeof(*new_db->type_val_to_struct)); + + // ======== sym_val_to_name[SYM_TYPES] ======== + + new_sym_val_to_name_types = + kvcalloc(sz, sizeof(*new_db->sym_val_to_name[SYM_TYPES]), GFP_KERNEL); + if (!new_sym_val_to_name_types) + goto out; + new_db->sym_val_to_name[SYM_TYPES] = new_sym_val_to_name_types; + memcpy(new_db->sym_val_to_name[SYM_TYPES], + old_db->sym_val_to_name[SYM_TYPES], + sz * sizeof(*new_db->sym_val_to_name[SYM_TYPES])); + + // ======== p_types ======== + + ret = shallow_copy_hashtab(&new_db->p_types.table, &old_db->p_types.table); + if (ret < 0) + goto out; + + return 0; +out: + free_type_datum_partially(new_db); + return ret; +} + +// ======== permissive_map ======== + +static void 
free_permissive_map(struct policydb *db) +{ + ebitmap_destroy(&db->permissive_map); +} + +static int copy_permissive_map(struct policydb *new_db, struct policydb *old_db) +{ + // On failure, the old ebitmap is cleaned. + return ebitmap_cpy(&new_db->permissive_map, &old_db->permissive_map); +} + +// ======== filename_trans ======== + +static void free_filename_trans(struct policydb *db) +{ + hashtab_destroy(&db->filename_trans); +} + +static int copy_filename_trans(struct policydb *new_db, struct policydb *old_db) +{ + // On failure, the old hashtab is cleaned. + return shallow_copy_hashtab(&new_db->filename_trans, &old_db->filename_trans); +} + +// ======== sepolicy ======== + +void ksu_destroy_sepolicy(struct selinux_policy *pol) +{ + if (!pol) + return; + + struct policydb *db = &pol->policydb; + + free_class_datum_partially(db); + + avtab_destroy(&db->te_avtab); + + free_role_datum_partially(db); + + free_type_datum_partially(db); + + free_permissive_map(db); + + free_filename_trans(db); + + kfree(pol); +} + +struct selinux_policy *ksu_dup_sepolicy(struct selinux_policy *old_pol) +{ + int ret; + struct selinux_policy *new_pol = + kmemdup(old_pol, sizeof(*old_pol), GFP_KERNEL); + if (!new_pol) { + return NULL; + } + struct policydb *new_db = &new_pol->policydb, *old_db = &old_pol->policydb; + + ret = copy_class_datum_partially(new_db, old_db); + if (ret < 0) { + pr_err("ksu_dup_sepolicy: copy_class_datum_partially\n"); + goto out; + } + + ret = copy_avtab(&new_db->te_avtab, &old_db->te_avtab); + if (ret < 0) { + pr_err("ksu_dup_sepolicy: copy_avtab\n"); + goto out; + } + + ret = copy_role_datum_partially(new_db, old_db); + if (ret < 0) { + pr_err("ksu_dup_sepolicy: copy_role_datum_partially\n"); + goto out; + } + + ret = copy_type_datum_partially(new_db, old_db); + if (ret < 0) { + pr_err("ksu_dup_sepolicy: copy_type_datum_partially\n"); + goto out; + } + + ret = copy_permissive_map(new_db, old_db); + if (ret < 0) { + pr_err("ksu_dup_sepolicy: 
copy_permissive_map\n"); + goto out; + } + + ret = copy_filename_trans(new_db, old_db); + if (ret < 0) { + pr_err("ksu_dup_sepolicy: copy_filename_trans\n"); + goto out; + } + + return new_pol; + +out: + kfree(new_pol); + return NULL; +} +#endif diff --git a/drivers/kernelsu/selinux/sepolicy.h b/drivers/kernelsu/selinux/sepolicy.h index 675d1499e46d..8ae79e3dc3b3 100644 --- a/drivers/kernelsu/selinux/sepolicy.h +++ b/drivers/kernelsu/selinux/sepolicy.h @@ -1,10 +1,14 @@ #ifndef __KSU_H_SEPOLICY #define __KSU_H_SEPOLICY -#include - #include "ss/policydb.h" +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +struct selinux_policy *ksu_dup_sepolicy(struct selinux_policy *old_pol); + +void ksu_destroy_sepolicy(struct selinux_policy *orig); +#endif + // Operation on types bool ksu_type(struct policydb *db, const char *name, const char *attr); bool ksu_attribute(struct policydb *db, const char *name); diff --git a/drivers/kernelsu/sulog/event.c b/drivers/kernelsu/sulog/event.c new file mode 100644 index 000000000000..80a73b6fcf38 --- /dev/null +++ b/drivers/kernelsu/sulog/event.c @@ -0,0 +1,271 @@ +#define KSU_SULOG_MAX_QUEUED 256U +#define KSU_SULOG_MAX_PAYLOAD_LEN 2048U +#define KSU_SULOG_MAX_ARG_STRINGS 0x7FFFFFFF +#define KSU_SULOG_MAX_ARG_CHUNK 256U +#define KSU_SULOG_MAX_FILENAME_LEN 256U + +static struct ksu_event_queue sulog_queue; + +struct ksu_sulog_pending_event { + __u16 event_type; + void *payload; + __u32 payload_len; +}; + +struct ksu_sulog_identity { + __u32 uid; + __u32 euid; +}; + +static void ksu_sulog_fill_task_info(struct ksu_sulog_event *event, __u16 event_type, int retval) +{ + event->version = KSU_SULOG_EVENT_VERSION; + event->event_type = event_type; + event->retval = retval; + event->pid = task_pid_nr(current); + event->tgid = task_tgid_nr(current); + event->ppid = task_ppid_nr(current); + + kuid_t current_uid = current_uid(); + kuid_t current_euid = current_euid(); + + event->uid = ksu_get_uid_t(current_uid); + event->euid = 
ksu_get_uid_t(current_euid); + + get_task_comm(event->comm, current); +} + +static void ksu_sulog_set_identity(struct ksu_sulog_event *event, const struct ksu_sulog_identity *identity) +{ + if (!identity) + return; + + event->uid = identity->uid; + event->euid = identity->euid; +} + +static struct ksu_sulog_pending_event *ksu_sulog_capture(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp) +{ + struct ksu_sulog_pending_event *pending = NULL; + struct ksu_sulog_event *event; + void *payload = NULL; + __u32 payload_len; + __u32 filename_len; + __u32 argv_len; + __u32 remaining; + char *filename_buf; + bool should_skip_copy = false; + + if (!ksu_sulog_is_enabled()) + return NULL; + + if (event_type == KSU_SULOG_EVENT_IOCTL_GRANT_ROOT || event_type == KSU_SULOG_EVENT_SUCOMPAT) { + filename_len = 0; + argv_len = 0; + should_skip_copy = true; + goto alloc; + } + + if (!bprm_argv) + return NULL; + + if (!bprm_argv_len) + return NULL; + + if (bprm_argv_len <= 0) + return NULL; + +alloc: + pending = kzalloc(sizeof(*pending), gfp); + if (!pending) + goto out_drop; + + payload = kzalloc(KSU_SULOG_MAX_PAYLOAD_LEN, gfp); + if (!payload) + goto out_free_pending; + + event = payload; + ksu_sulog_fill_task_info(event, event_type, 0); + + if (should_skip_copy) + goto skip_copy; + + remaining = KSU_SULOG_MAX_PAYLOAD_LEN - sizeof(*event); + filename_buf = (char *)payload + sizeof(*event); + + size_t actual_copy_len = bprm_argv_len; + + if (bprm_argv_len > remaining - 1) + actual_copy_len = remaining - 1 ; + + memcpy(filename_buf, bprm_argv, actual_copy_len); + filename_buf[actual_copy_len] = '\0'; + + filename_len = strlen(filename_buf) + 1 ; // argv0 + null terminator + + if (actual_copy_len > filename_len) + argv_len = actual_copy_len - (filename_len); + else + argv_len = 0; + +skip_copy: + event->filename_len = filename_len; + event->argv_len = argv_len; + + payload_len = (__u32)sizeof(*event) + filename_len + argv_len; + + // unlikely + if (payload_len > 
KSU_SULOG_MAX_PAYLOAD_LEN || (__u32)sizeof(*event) > payload_len) + goto out_free_payload; + + pending->event_type = event_type; + pending->payload = payload; + pending->payload_len = payload_len; + return pending; + +out_free_payload: + kfree(payload); +out_free_pending: + kfree(pending); +out_drop: + ksu_event_queue_drop(&sulog_queue); + return NULL; +} + +static struct ksu_sulog_pending_event *ksu_sulog_capture_grant_root(const struct ksu_sulog_identity *identity, gfp_t gfp) +{ + struct ksu_sulog_pending_event *pending; + struct ksu_sulog_event *event; + + pending = ksu_sulog_capture(KSU_SULOG_EVENT_IOCTL_GRANT_ROOT, NULL, NULL, gfp); + if (!pending) + return NULL; + + event = pending->payload; + ksu_sulog_set_identity(event, identity); + return pending; +} + +int ksu_sulog_events_init(void) +{ + ksu_event_queue_init(&sulog_queue, KSU_SULOG_MAX_QUEUED, KSU_SULOG_MAX_PAYLOAD_LEN); + return 0; +} + +void ksu_sulog_events_exit(void) +{ + ksu_event_queue_destroy(&sulog_queue); +} + +static void ksu_sulog_free_pending(struct ksu_sulog_pending_event *pending) +{ + if (!pending) + return; + kfree(pending->payload); + kfree(pending); +} + +void ksu_sulog_emit_pending(struct ksu_sulog_pending_event *pending, int retval, gfp_t gfp) +{ + struct ksu_sulog_event *event; + + if (!pending) + return; + + event = pending->payload; + event->retval = retval; + ksu_event_queue_push(&sulog_queue, pending->event_type, 0, pending->payload, pending->payload_len, gfp); + ksu_sulog_free_pending(pending); +} + +int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp) +{ + if (!ksu_sulog_is_enabled()) + return 0; + + struct ksu_sulog_pending_event *pending; + struct ksu_sulog_identity identity = { + .uid = uid, + .euid = euid, + }; + + pending = ksu_sulog_capture_grant_root(&identity, gfp); + if (!pending) + return 0; + + ksu_sulog_emit_pending(pending, retval, gfp); + return 0; +} + +int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t 
gfp) +{ + if (!ksu_sulog_is_enabled()) + return 0; + + struct ksu_sulog_pending_event *pending; + + pending = ksu_sulog_capture(event_type, bprm_argv, bprm_argv_len, gfp); + if (!pending) + return 0; + + ksu_sulog_emit_pending(pending, 0, gfp); + return 0; +} + +void ksu_sulog_emit_bprm(const char *filename) +{ + if (!ksu_sulog_is_enabled()) + return; + + if (!is_ksu_domain()) + return; + + if (!current->mm) + return; + + unsigned long arg_start = current->mm->arg_start; + unsigned long arg_end = current->mm->arg_end; + size_t arg_len = arg_end - arg_start; + + if (arg_len <= 0) + return; + +#define ARGV_MAX_BPRM 128 + char args[ARGV_MAX_BPRM] = {0}; + + size_t argv_copy_len = (arg_len > ARGV_MAX_BPRM) ? ARGV_MAX_BPRM : arg_len; + + // we cant use strncpy on here, else it will truncate once it sees \0 + if (ksu_copy_from_user_retry(args, (void __user *)arg_start, argv_copy_len)) + return; + + args[argv_copy_len - 1] = '\0'; + + // we grab strlen of argv0 as that needs to be kept as \0, basically to skip it + size_t argv0_len = strnlen(args, argv_copy_len); + char *buf = args + argv0_len + 1; + +flatten: + if (buf >= args + argv_copy_len - 1) + goto flatten_done; + + int len = strlen(buf); + if (!len) + goto flatten_done; + + *(buf + len) = ' '; + buf = buf + len + 1; + + if (buf - args < argv_copy_len - argv0_len - 1) + goto flatten; + +flatten_done: + // this should look like + // /system/bin/sh\0-c sh -c id + ksu_sulog_emit(KSU_SULOG_EVENT_ROOT_EXECVE, args, argv_copy_len, GFP_KERNEL); +} + +struct ksu_event_queue *ksu_sulog_get_queue(void) +{ + return &sulog_queue; +} diff --git a/drivers/kernelsu/sulog/event.h b/drivers/kernelsu/sulog/event.h new file mode 100644 index 000000000000..bf272a7328eb --- /dev/null +++ b/drivers/kernelsu/sulog/event.h @@ -0,0 +1,17 @@ +#ifndef __KSU_H_SULOG_EVENT +#define __KSU_H_SULOG_EVENT + +struct ksu_event_queue; +struct ksu_sulog_pending_event; + +int ksu_sulog_events_init(void); +void ksu_sulog_events_exit(void); + +void 
ksu_sulog_emit_pending(struct ksu_sulog_pending_event *pending, int retval, gfp_t gfp); +int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp); + +int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp); + +struct ksu_event_queue *ksu_sulog_get_queue(void); + +#endif diff --git a/drivers/kernelsu/sulog/fd.c b/drivers/kernelsu/sulog/fd.c new file mode 100644 index 000000000000..70da685e73ea --- /dev/null +++ b/drivers/kernelsu/sulog/fd.c @@ -0,0 +1,83 @@ +static DEFINE_MUTEX(ksu_sulog_fd_lock); +static bool ksu_sulog_fd_active; + +static ssize_t ksu_sulog_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + return ksu_event_queue_read(ksu_sulog_get_queue(), buf, count, file->f_flags); +} + +static unsigned __bitwise ksu_sulog_poll(struct file *file, poll_table *wait) +{ + return ksu_event_queue_poll(ksu_sulog_get_queue(), file, wait); +} + +static int ksu_sulog_release(struct inode *inode, struct file *file) +{ + mutex_lock(&ksu_sulog_fd_lock); + ksu_sulog_fd_active = false; + mutex_unlock(&ksu_sulog_fd_lock); + + pr_info("sulog: fd released\n"); + return 0; +} + +static const struct file_operations ksu_sulog_fops = { + .owner = THIS_MODULE, + .read = ksu_sulog_read, + .poll = ksu_sulog_poll, + .release = ksu_sulog_release, + .llseek = noop_llseek, +}; + +int ksu_install_sulog_fd(void) +{ + struct file *filp; + int fd; + + mutex_lock(&ksu_sulog_fd_lock); + + if (ksu_sulog_fd_active) { + fd = -EBUSY; + goto out_unlock; + } + + if (READ_ONCE(ksu_sulog_get_queue()->closed)) { + fd = -EPIPE; + goto out_unlock; + } + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + goto out_unlock; + + filp = anon_inode_getfile("[ksu_sulog]", &ksu_sulog_fops, NULL, O_RDONLY | O_CLOEXEC); + if (IS_ERR(filp)) { + put_unused_fd(fd); + fd = PTR_ERR(filp); + goto out_unlock; + } + + ksu_sulog_fd_active = true; + fd_install(fd, filp); + pr_info("sulog: fd installed %d for pid %d\n", fd, current->pid); + 
+out_unlock: + mutex_unlock(&ksu_sulog_fd_lock); + return fd; +} + +void ksu_sulog_fd_init(void) +{ + mutex_lock(&ksu_sulog_fd_lock); + ksu_sulog_fd_active = false; + mutex_unlock(&ksu_sulog_fd_lock); +} + +void ksu_sulog_fd_exit(void) +{ + mutex_lock(&ksu_sulog_fd_lock); + ksu_sulog_fd_active = false; + mutex_unlock(&ksu_sulog_fd_lock); + + ksu_event_queue_close(ksu_sulog_get_queue()); +} diff --git a/drivers/kernelsu/sulog/fd.h b/drivers/kernelsu/sulog/fd.h new file mode 100644 index 000000000000..6a117fedc0a9 --- /dev/null +++ b/drivers/kernelsu/sulog/fd.h @@ -0,0 +1,8 @@ +#ifndef __KSU_H_SULOG_FD +#define __KSU_H_SULOG_FD + +int ksu_install_sulog_fd(void); +void ksu_sulog_fd_init(void); +void ksu_sulog_fd_exit(void); + +#endif diff --git a/drivers/kernelsu/supercalls.c b/drivers/kernelsu/supercall/dispatch.c similarity index 68% rename from drivers/kernelsu/supercalls.c rename to drivers/kernelsu/supercall/dispatch.c index 3ca054dc2dde..2d6973ee8ae0 100644 --- a/drivers/kernelsu/supercalls.c +++ b/drivers/kernelsu/supercall/dispatch.c @@ -1,79 +1,41 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // utsname() and uts_sem - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include // put_task_struct -#else -#include -#endif - -// Permission check functions -bool only_manager(void) -{ - return is_manager(); -} - -bool only_root(void) -{ - kuid_t current_uid = current_uid(); - return ksu_get_uid_t(current_uid) == 0; -} - -bool manager_or_root(void) -{ - kuid_t current_uid = current_uid(); - return ksu_get_uid_t(current_uid) == 0 || is_manager(); -} - -bool always_allow(void) -{ - return true; // No permission check -} - -bool allowed_for_su(void) -{ - kuid_t current_uid = current_uid(); - bool is_allowed = is_manager() || ksu_is_allow_uid_for_current(ksu_get_uid_t(current_uid)); - return is_allowed; -} - static int do_grant_root(void __user *arg) { + int ret; + kuid_t audit_uid = 
current_uid(); + kuid_t audit_euid = current_euid(); + // we already check uid above on allowed_for_su() write_sulog('i'); // log ioctl escalation - kuid_t current_uid = current_uid(); - pr_info("allow root for: %d\n", ksu_get_uid_t(current_uid)); - escape_with_root_profile(); + pr_info("allow root for: %d\n", ksu_get_uid_t(audit_uid)); + ret = escape_with_root_profile(); - return 0; +#ifdef CONFIG_KSU_FEATURE_SULOG + ksu_sulog_emit_grant_root(ret, ksu_get_uid_t(audit_uid), ksu_get_uid_t(audit_euid), GFP_KERNEL); +#endif + + return ret; } static uint32_t ksuver_override = 0; +static uint32_t ksuflags_override = 0; static int do_get_info(void __user *arg) { struct ksu_get_info_cmd cmd = {.version = KERNEL_SU_VERSION, .flags = 0}; - if (ksuver_override) - cmd.version = ksuver_override; - + // NOTE: we do not have LKM support so we don't bother with its flags or late-load if (is_manager()) { - cmd.flags |= 0x2; + cmd.flags |= KSU_GET_INFO_FLAG_MANAGER; } cmd.features = KSU_FEATURE_MAX; + if (ksuver_override) + cmd.version = ksuver_override; + + if (ksuflags_override) + cmd.flags = ksuflags_override; if (copy_to_user(arg, &cmd, sizeof(cmd))) { pr_err("get_version: copy_to_user failed\n"); @@ -131,7 +93,7 @@ static int do_set_sepolicy(void __user *arg) return -EFAULT; } - return handle_sepolicy(cmd.cmd, (void __user *)cmd.arg); + return handle_sepolicy((void __user *)cmd.data, cmd.data_len); } static int do_check_safemode(void __user *arg) @@ -467,6 +429,7 @@ static int do_manage_mark(void __user *arg) return 0; } + static int do_nuke_ext4_sysfs(void __user *arg) { struct ksu_nuke_ext4_sysfs_cmd cmd; @@ -591,7 +554,7 @@ static int add_try_umount(void __user *arg) return 0; } - + // this way userspace can deduce the memory it has to prepare. 
case KSU_UMOUNT_GETSIZE: { // check for pointer first @@ -640,7 +603,7 @@ static int add_try_umount(void __user *arg) return 0; } - + default: { pr_err("cmd_add_try_umount: invalid operation %u\n", cmd.mode); return -EINVAL; @@ -651,6 +614,53 @@ static int add_try_umount(void __user *arg) return 0; } +static int do_set_init_pgrp(void __user *arg) +{ + int err; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0) + struct pid *pids[PIDTYPE_MAX] = { 0 }; +#endif + write_lock_irq(&tasklist_lock); + struct task_struct *p = current->group_leader; + struct pid *init_group = task_pgrp(&init_task); + + err = -EPERM; + if (task_session(p) != task_session(&init_task)) + goto out; + + err = 0; + if (task_pgrp(p) != init_group) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0) + change_pid(pids, p, PIDTYPE_PGID, init_group); +#else + change_pid(p, PIDTYPE_PGID, init_group); +#endif + } +out: + write_unlock_irq(&tasklist_lock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0) + free_pids(pids); +#endif + return err; +} + +static int do_get_sulog_fd(void __user *arg) +{ + struct ksu_get_sulog_fd_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("get_sulog_fd: copy_from_user failed\n"); + return -EFAULT; + } + + if (cmd.flags) { + pr_err("get_sulog_fd: unsupported flags 0x%x\n", cmd.flags); + return -EINVAL; + } + + return ksu_install_sulog_fd(); +} + // IOCTL handlers mapping table static const struct ksu_ioctl_cmd_map ksu_ioctl_handlers[] = { { .cmd = KSU_IOCTL_GRANT_ROOT, .name = "GRANT_ROOT", .handler = do_grant_root, .perm_check = allowed_for_su }, @@ -673,219 +683,14 @@ static const struct ksu_ioctl_cmd_map ksu_ioctl_handlers[] = { { .cmd = KSU_IOCTL_MANAGE_MARK, .name = "MANAGE_MARK", .handler = do_manage_mark, .perm_check = manager_or_root }, { .cmd = KSU_IOCTL_NUKE_EXT4_SYSFS, .name = "NUKE_EXT4_SYSFS", .handler = do_nuke_ext4_sysfs, .perm_check = manager_or_root }, { .cmd = KSU_IOCTL_ADD_TRY_UMOUNT, .name = "ADD_TRY_UMOUNT", .handler = 
add_try_umount, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_SET_INIT_PGRP, .name = "SET_INIT_PGRP", .handler = do_set_init_pgrp, .perm_check = only_root }, + { .cmd = KSU_IOCTL_GET_SULOG_FD, .name = "GET_SULOG_FD", .handler = do_get_sulog_fd, .perm_check = only_root }, { .cmd = 0, .name = NULL, .handler = NULL, .perm_check = NULL } // Sentinel }; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) -#include -#include - -struct ksu_install_fd_tw { - struct callback_head cb; - int __user *outp; -}; - -static void ksu_install_fd_tw_func(struct callback_head *cb) -{ - struct ksu_install_fd_tw *tw = container_of(cb, struct ksu_install_fd_tw, cb); - int fd = ksu_install_fd(); - pr_info("[%d] install ksu fd: %d\n", current->pid, fd); - - if (copy_to_user(tw->outp, &fd, sizeof(fd))) { - pr_err("install ksu fd reply err\n"); - close_fd(fd); - } - - kfree(tw); -} - -static int ksu_handle_fd_request(void __user *arg4) -{ - struct ksu_install_fd_tw *tw; - - tw = kzalloc(sizeof(*tw), GFP_ATOMIC); - if (!tw) - return 0; - - tw->outp = (int __user *)arg4; - tw->cb.func = ksu_install_fd_tw_func; - - if (task_work_add(current, &tw->cb, TWA_RESUME)) { - kfree(tw); - pr_warn("install fd add task_work failed\n"); - } - - return 0; -} -#else -static int ksu_handle_fd_request(void __user *arg4) -{ - int fd = ksu_install_fd(); - pr_info("[%d] install ksu fd: %d\n", current->pid, fd); - - if (copy_to_user(arg4, &fd, sizeof(fd))) { - pr_err("install ksu fd reply err\n"); - close_fd(fd); - } - - return 0; -} -#endif - -// downstream: make sure to pass arg as reference, this can allow us to extend things. -int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg) -{ - - if (magic1 != KSU_INSTALL_MAGIC1) - return 0; - - pr_info("sys_reboot: intercepted call! 
magic: 0x%x id: %d\n", magic1, magic2); - - // arg4 = (unsigned long)PT_REGS_SYSCALL_PARM4(real_regs); - // downstream: dereference arg as arg4 so we can be inline to upstream - void __user *arg4 = (void __user *)*arg; - - // Check if this is a request to install KSU fd - if (magic2 == KSU_INSTALL_MAGIC2) { - return ksu_handle_fd_request(arg4); - } - - // extensions - u64 reply = (u64)*arg; - - kuid_t current_uid = current_uid(); - if (ksu_get_uid_t(current_uid) != 0) - return 0; - - if (magic2 == CHANGE_MANAGER_UID) { - // only root is allowed for this command - - pr_info("sys_reboot: ksu_set_manager_appid to: %d\n", cmd); - ksu_set_manager_appid(cmd); - - if (cmd == ksu_get_manager_appid()) { - if (copy_to_user((void __user *)*arg, &reply, sizeof(reply))) - pr_info("sys_reboot: reply fail\n"); - } - - return 0; - } - - if (magic2 == GET_SULOG_DUMP_V2) { - int ret = send_sulog_dump(*arg); - if (ret) - return 0; - - if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) - return 0; - } - - if (magic2 == CHANGE_KSUVER) { - - pr_info("sys_reboot: ksu_change_ksuver to: %d\n", cmd); - ksuver_override = cmd; - - if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) - return 0; - } - - // WARNING!!! triple ptr zone! 
*** - // https://wiki.c2.com/?ThreeStarProgrammer - if (magic2 == CHANGE_SPOOF_UNAME) { - - char release_buf[65]; - char version_buf[65]; - static char original_release_buf[65] = {0}; - static char original_version_buf[65] = {0}; - - // basically void * void __user * void __user *arg - void ***ppptr = (void ***)(uintptr_t)arg; - - // user pointer storage - // init this as zero so this works on 32-on-64 compat (LE) - uint64_t u_pptr = 0; - uint64_t u_ptr = 0; - - pr_info("sys_reboot: ppptr: 0x%lx \n", (uintptr_t)ppptr); - - // arg here is ***, dereference to pull out ** - if (copy_from_user(&u_pptr, (void __user *)*ppptr, sizeof(u_pptr))) - return 0; - - pr_info("sys_reboot: u_pptr: 0x%lx \n", (uintptr_t)u_pptr); - - // now we got the __user ** - // we cannot dereference this as this is __user - // we just do another copy_from_user to get it - if (copy_from_user(&u_ptr, (void __user *)u_pptr, sizeof(u_ptr))) - return 0; - - pr_info("sys_reboot: u_ptr: 0x%lx \n", (uintptr_t)u_ptr); - - // for release - if (strncpy_from_user(release_buf, (char __user *)u_ptr, sizeof(release_buf)) < 0) - return 0; - release_buf[sizeof(release_buf) - 1] = '\0'; - - // for version - if (strncpy_from_user(version_buf, (char __user *)(u_ptr + strlen(release_buf) + 1), sizeof(version_buf)) < 0) - return 0; - version_buf[sizeof(version_buf) - 1] = '\0'; - - if (original_release_buf[0] == '\0') { - struct new_utsname *u_curr = utsname(); - // we save current version as the original before modifying - strncpy(original_release_buf, u_curr->release, sizeof(original_release_buf)); - strncpy(original_version_buf, u_curr->version, sizeof(original_version_buf)); - pr_info("sys_reboot: original uname saved: %s %s\n", original_release_buf, original_version_buf); - } - - // so user can reset - if (!strcmp(release_buf, "default")) { - memcpy(release_buf, original_release_buf, sizeof(release_buf)); - } - if (!strcmp(version_buf, "default")) { - memcpy(version_buf, original_version_buf, 
sizeof(version_buf)); - } - - pr_info("sys_reboot: spoofing kernel to: %s - %s\n", release_buf, version_buf); - - struct new_utsname *u = utsname(); - - down_write(&uts_sem); - strncpy(u->release, release_buf, sizeof(u->release)); - strncpy(u->version, version_buf, sizeof(u->version)); - up_write(&uts_sem); - - // we write our confirmation on ** - if (copy_to_user((void __user *)*arg, &reply, sizeof(reply))) - return 0; - } - - return 0; -} - -void ksu_supercalls_init(void) +long ksu_supercall_handle_ioctl(unsigned int cmd, void __user *argp) { int i; - - pr_info("KernelSU IOCTL Commands:\n"); - for (i = 0; ksu_ioctl_handlers[i].handler; i++) { - pr_info(" %-18s = 0x%08x\n", ksu_ioctl_handlers[i].name, ksu_ioctl_handlers[i].cmd); - } - - sulog_init_heap(); // grab heap memory for sulog - -} - -void ksu_supercalls_exit(void){} - -// IOCTL dispatcher -static long anon_ksu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int i; - kuid_t current_uid = current_uid(); #ifdef CONFIG_KSU_DEBUG @@ -910,46 +715,14 @@ static long anon_ksu_ioctl(struct file *filp, unsigned int cmd, unsigned long ar return -ENOTTY; } -// File release handler -static int anon_ksu_release(struct inode *inode, struct file *filp) -{ - pr_info("ksu fd released\n"); - return 0; -} - -// File operations structure -static const struct file_operations anon_ksu_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = anon_ksu_ioctl, - .compat_ioctl = anon_ksu_ioctl, - .release = anon_ksu_release, -}; - -// Install KSU fd to current process -int ksu_install_fd(void) +void __init ksu_supercall_dump_commands(void) { - struct file *filp; - int fd; - - // Get unused fd - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - pr_err("ksu_install_fd: failed to get unused fd\n"); - return fd; - } + int i; - // Create anonymous inode file - filp = anon_inode_getfile("[ksu_driver]", &anon_ksu_fops, NULL, O_RDWR | O_CLOEXEC); - if (IS_ERR(filp)) { - 
pr_err("ksu_install_fd: failed to create anon inode file\n"); - put_unused_fd(fd); - return PTR_ERR(filp); + pr_info("KernelSU IOCTL Commands:\n"); + for (i = 0; ksu_ioctl_handlers[i].handler; i++) { + pr_info(" %-18s = 0x%08x\n", ksu_ioctl_handlers[i].name, ksu_ioctl_handlers[i].cmd); } - - // Install fd - fd_install(fd, filp); - - pr_info("ksu fd installed: %d for pid %d\n", fd, current->pid); - - return fd; } + +void ksu_supercall_cleanup_state(void) {} diff --git a/drivers/kernelsu/supercall/internal.h b/drivers/kernelsu/supercall/internal.h new file mode 100644 index 000000000000..5287f2e5affe --- /dev/null +++ b/drivers/kernelsu/supercall/internal.h @@ -0,0 +1,14 @@ +#ifndef __KSU_H_SUPERCALL_INTERNAL +#define __KSU_H_SUPERCALL_INTERNAL + +bool only_manager(void); +bool only_root(void); +bool manager_or_root(void); +bool always_allow(void); +bool allowed_for_su(void); + +long ksu_supercall_handle_ioctl(unsigned int cmd, void __user *argp); +void ksu_supercall_dump_commands(void); +void ksu_supercall_cleanup_state(void); + +#endif // __KSU_H_SUPERCALL_INTERNAL diff --git a/drivers/kernelsu/supercall/perm.c b/drivers/kernelsu/supercall/perm.c new file mode 100644 index 000000000000..a0191bd140c7 --- /dev/null +++ b/drivers/kernelsu/supercall/perm.c @@ -0,0 +1,27 @@ +bool only_manager(void) +{ + return is_manager(); +} + +bool only_root(void) +{ + kuid_t current_uid = current_uid(); + return ksu_get_uid_t(current_uid) == 0; +} + +bool manager_or_root(void) +{ + kuid_t current_uid = current_uid(); + return ksu_get_uid_t(current_uid) == 0 || is_manager(); +} + +bool always_allow(void) +{ + return true; +} + +bool allowed_for_su(void) +{ + kuid_t current_uid = current_uid(); + return is_manager() || ksu_is_allow_uid_for_current(ksu_get_uid_t(current_uid)); +} diff --git a/drivers/kernelsu/supercall/supercall.c b/drivers/kernelsu/supercall/supercall.c new file mode 100644 index 000000000000..a6720a489d6f --- /dev/null +++ b/drivers/kernelsu/supercall/supercall.c @@ 
-0,0 +1,249 @@ +static int anon_ksu_release(struct inode *inode, struct file *filp) +{ + pr_info("ksu fd released\n"); + return 0; +} + +static long anon_ksu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + return ksu_supercall_handle_ioctl(cmd, (void __user *)arg); +} + +// File operations structure +static const struct file_operations anon_ksu_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = anon_ksu_ioctl, + .compat_ioctl = anon_ksu_ioctl, + .release = anon_ksu_release, +}; + +// Install KSU fd to current process +int ksu_install_fd(void) +{ + struct file *filp; + int fd; + + // Get unused fd + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + pr_err("ksu_install_fd: failed to get unused fd\n"); + return fd; + } + + // Create anonymous inode file + filp = anon_inode_getfile("[ksu_driver]", &anon_ksu_fops, NULL, O_RDWR | O_CLOEXEC); + if (IS_ERR(filp)) { + pr_err("ksu_install_fd: failed to create anon inode file\n"); + put_unused_fd(fd); + return PTR_ERR(filp); + } + + // Install fd + fd_install(fd, filp); + + pr_info("ksu fd installed: %d for pid %d\n", fd, current->pid); + + return fd; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +struct ksu_install_fd_tw { + struct callback_head cb; + int __user *outp; +}; + +static void ksu_install_fd_tw_func(struct callback_head *cb) +{ + struct ksu_install_fd_tw *tw = container_of(cb, struct ksu_install_fd_tw, cb); + int fd = ksu_install_fd(); + pr_info("[%d] install ksu fd: %d\n", current->pid, fd); + + if (copy_to_user(tw->outp, &fd, sizeof(fd))) { + pr_err("install ksu fd reply err\n"); + close_fd(fd); + } + + kfree(tw); +} + +static int ksu_handle_fd_request(void __user *arg4) +{ + struct ksu_install_fd_tw *tw; + + tw = kzalloc(sizeof(*tw), GFP_ATOMIC); + if (!tw) + return 0; + + tw->outp = (int __user *)arg4; + tw->cb.func = ksu_install_fd_tw_func; + + if (task_work_add(current, &tw->cb, TWA_RESUME)) { + kfree(tw); + pr_warn("install fd add task_work failed\n"); + } + + return 0; 
+} +#else +static int ksu_handle_fd_request(void __user *arg4) +{ + int fd = ksu_install_fd(); + pr_info("[%d] install ksu fd: %d\n", current->pid, fd); + + if (copy_to_user(arg4, &fd, sizeof(fd))) { + pr_err("install ksu fd reply err\n"); + close_fd(fd); + } + + return 0; +} +#endif + +// downstream: make sure to pass arg as reference, this can allow us to extend things. +int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg) +{ + + if (magic1 != KSU_INSTALL_MAGIC1) + return 0; + + pr_info("sys_reboot: intercepted call! magic: 0x%x id: %d\n", magic1, magic2); + + // arg4 = (unsigned long)PT_REGS_SYSCALL_PARM4(real_regs); + // downstream: dereference arg as arg4 so we can be inline to upstream + void __user *arg4 = (void __user *)*arg; + + // Check if this is a request to install KSU fd + if (magic2 == KSU_INSTALL_MAGIC2) { + return ksu_handle_fd_request(arg4); + } + + // only root is allowed for these commands + kuid_t current_uid = current_uid(); + if (ksu_get_uid_t(current_uid) != 0) + return 0; + + // extensions + u64 reply = (u64)*arg; + + if (magic2 == CHANGE_MANAGER_UID) { + pr_info("sys_reboot: ksu_set_manager_appid to: %d\n", cmd); + ksu_set_manager_appid(cmd); + + if (cmd == ksu_get_manager_appid()) { + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply))) + pr_info("sys_reboot: reply fail\n"); + } + + return 0; + } + + if (magic2 == GET_SULOG_DUMP_V2) { + + int ret = send_sulog_dump(*arg); + if (ret) + return 0; + + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) + return 0; + } + + if (magic2 == CHANGE_KSUVER) { + pr_info("sys_reboot: ksu_change_ksuver to: %d\n", cmd); + ksuver_override = cmd; + + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) + return 0; + } + + // WARNING!!! triple ptr zone! 
*** + // https://wiki.c2.com/?ThreeStarProgrammer + if (magic2 == CHANGE_SPOOF_UNAME) { + + char release_buf[65]; + char version_buf[65]; + static char original_release_buf[65] = {0}; + static char original_version_buf[65] = {0}; + + // basically void * void __user * void __user *arg + void ***ppptr = (void ***)(uintptr_t)arg; + + // user pointer storage + // init this as zero so this works on 32-on-64 compat (LE) + uint64_t u_pptr = 0; + uint64_t u_ptr = 0; + + pr_info("sys_reboot: ppptr: 0x%lx \n", (uintptr_t)ppptr); + + // arg here is ***, dereference to pull out ** + if (copy_from_user(&u_pptr, (void __user *)*ppptr, sizeof(u_pptr))) + return 0; + + pr_info("sys_reboot: u_pptr: 0x%lx \n", (uintptr_t)u_pptr); + + // now we got the __user ** + // we cannot dereference this as this is __user + // we just do another copy_from_user to get it + if (copy_from_user(&u_ptr, (void __user *)u_pptr, sizeof(u_ptr))) + return 0; + + pr_info("sys_reboot: u_ptr: 0x%lx \n", (uintptr_t)u_ptr); + + // for release + if (strncpy_from_user(release_buf, (char __user *)u_ptr, sizeof(release_buf)) < 0) + return 0; + release_buf[sizeof(release_buf) - 1] = '\0'; + + // for version + if (strncpy_from_user(version_buf, (char __user *)(u_ptr + strlen(release_buf) + 1), sizeof(version_buf)) < 0) + return 0; + version_buf[sizeof(version_buf) - 1] = '\0'; + + if (original_release_buf[0] == '\0') { + struct new_utsname *u_curr = utsname(); + // we save current version as the original before modifying + strncpy(original_release_buf, u_curr->release, sizeof(original_release_buf)); + strncpy(original_version_buf, u_curr->version, sizeof(original_version_buf)); + pr_info("sys_reboot: original uname saved: %s %s\n", original_release_buf, original_version_buf); + } + + // so user can reset + if (!strcmp(release_buf, "default")) { + memcpy(release_buf, original_release_buf, sizeof(release_buf)); + } + if (!strcmp(version_buf, "default")) { + memcpy(version_buf, original_version_buf, 
sizeof(version_buf)); + } + + pr_info("sys_reboot: spoofing kernel to: %s - %s\n", release_buf, version_buf); + + struct new_utsname *u = utsname(); + + down_write(&uts_sem); + strncpy(u->release, release_buf, sizeof(u->release)); + strncpy(u->version, version_buf, sizeof(u->version)); + up_write(&uts_sem); + + // we write our confirmation on ** + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply))) + return 0; + } + + if (magic2 == CHANGE_KSUFLAGS) { + pr_info("sys_reboot: ksu_change_ksuflags to: %d\n", cmd); + ksuflags_override = cmd; + + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) + return 0; + } + + return 0; +} + +void __init ksu_supercalls_init(void) +{ + ksu_supercall_dump_commands(); + + tiny_sulog_init_heap(); // grab heap memory for sulog +} + +void __exit ksu_supercalls_exit(void) { } diff --git a/drivers/kernelsu/supercall/supercall.h b/drivers/kernelsu/supercall/supercall.h new file mode 100644 index 000000000000..1c9e5a0a27ed --- /dev/null +++ b/drivers/kernelsu/supercall/supercall.h @@ -0,0 +1,32 @@ +#ifndef __KSU_H_SUPERCALL +#define __KSU_H_SUPERCALL + +// IOCTL handler types +typedef int (*ksu_ioctl_handler_t)(void __user *arg); +typedef bool (*ksu_perm_check_t)(void); + +// IOCTL command mapping +struct ksu_ioctl_cmd_map { + unsigned int cmd; + const char *name; + ksu_ioctl_handler_t handler; + ksu_perm_check_t perm_check; // Permission check function +}; + +// Install KSU fd to current process +int ksu_install_fd(void); + +void ksu_supercalls_init(void); +void ksu_supercalls_exit(void); + +// extensions +#define CHANGE_MANAGER_UID 10006 +#define KSU_UMOUNT_GETSIZE 107 // get list size // shit is u8 we cant fit 10k+ on it +#define KSU_UMOUNT_GETLIST 108 // get list +#define GET_SULOG_DUMP 10009 // get sulog dump, max, last 100 escalations +#define GET_SULOG_DUMP_V2 10010 // get sulog dump, timestamped, last 250 escalations +#define CHANGE_KSUVER 10011 // change ksu version +#define CHANGE_SPOOF_UNAME 10012 // spoof 
uname +#define CHANGE_KSUFLAGS 10013 // change ksuflags, do the bit calc on your own, 0 + 1 + 2 + 4 + 8 blah + +#endif // __KSU_H_SUPERCALLS diff --git a/drivers/kernelsu/supercalls.h b/drivers/kernelsu/supercalls.h deleted file mode 100644 index 388df1eeaed5..000000000000 --- a/drivers/kernelsu/supercalls.h +++ /dev/null @@ -1,166 +0,0 @@ -#ifndef __KSU_H_SUPERCALLS -#define __KSU_H_SUPERCALLS - -#include -#include -#include "app_profile.h" - -// Magic numbers for reboot hook to install fd -#define KSU_INSTALL_MAGIC1 0xDEADBEEF -#define KSU_INSTALL_MAGIC2 0xCAFEBABE - -// Command structures for ioctl - -struct ksu_become_daemon_cmd { - __u8 token[65]; // Input: daemon token (null-terminated) -}; - -struct ksu_get_info_cmd { - __u32 version; // Output: KERNEL_SU_VERSION - __u32 flags; // Output: flags (bit 0: MODULE mode) - __u32 features; // Output: max feature ID supported -}; - -struct ksu_report_event_cmd { - __u32 event; // Input: EVENT_POST_FS_DATA, EVENT_BOOT_COMPLETED, etc. -}; - -struct ksu_set_sepolicy_cmd { - __u64 cmd; // Input: sepolicy command - __aligned_u64 arg; // Input: sepolicy argument pointer -}; - -struct ksu_check_safemode_cmd { - __u8 in_safe_mode; // Output: true if in safe mode, false otherwise -}; - -// deprecated -struct ksu_get_allow_list_cmd { - __u32 uids[128]; // Output: array of allowed/denied UIDs - __u32 count; // Output: number of UIDs in array - __u8 allow; // Input: true for allow list, false for deny list -}; - -struct ksu_new_get_allow_list_cmd { - __u16 count; // Input / Output: number of UIDs in array - __u16 total_count; // Output: total number of UIDs in requested list - __u32 uids[0]; // Output: array of allowed/denied UIDs -}; - -struct ksu_uid_granted_root_cmd { - __u32 uid; // Input: target UID to check - __u8 granted; // Output: true if granted, false otherwise -}; - -struct ksu_uid_should_umount_cmd { - __u32 uid; // Input: target UID to check - __u8 should_umount; // Output: true if should umount, false otherwise 
-}; - -struct ksu_get_manager_appid_cmd { - __u32 appid; // Output: manager app id -}; - -struct ksu_get_app_profile_cmd { - struct app_profile profile; // Input/Output: app profile structure -}; - -struct ksu_set_app_profile_cmd { - struct app_profile profile; // Input: app profile structure -}; - -struct ksu_get_feature_cmd { - __u32 feature_id; // Input: feature ID (enum ksu_feature_id) - __u64 value; // Output: feature value/state - __u8 supported; // Output: true if feature is supported, false otherwise -}; - -struct ksu_set_feature_cmd { - __u32 feature_id; // Input: feature ID (enum ksu_feature_id) - __u64 value; // Input: feature value/state to set -}; - -struct ksu_get_wrapper_fd_cmd { - __u32 fd; // Input: userspace fd - __u32 flags; // Input: flags of userspace fd -}; - -struct ksu_manage_mark_cmd { - __u32 operation; // Input: KSU_MARK_* - __s32 pid; // Input: target pid (0 for all processes) - __u32 result; // Output: for get operation - mark status or reg_count -}; - -#define KSU_MARK_GET 1 -#define KSU_MARK_MARK 2 -#define KSU_MARK_UNMARK 3 -#define KSU_MARK_REFRESH 4 - -struct ksu_nuke_ext4_sysfs_cmd { - __aligned_u64 arg; // Input: mnt pointer -}; - -struct ksu_add_try_umount_cmd { - __aligned_u64 arg; // char ptr, this is the mountpoint - __u32 flags; // this is the flag we use for it - __u8 mode; // denotes what to do with it 0:wipe_list 1:add_to_list 2:delete_entry -}; - -#define KSU_UMOUNT_WIPE 0 // ignore everything and wipe list -#define KSU_UMOUNT_ADD 1 // add entry (path + flags) -#define KSU_UMOUNT_DEL 2 // delete entry, strcmp - - - -// IOCTL command definitions -#define KSU_IOCTL_GRANT_ROOT _IOC(_IOC_NONE, 'K', 1, 0) -#define KSU_IOCTL_GET_INFO _IOC(_IOC_READ, 'K', 2, 0) -#define KSU_IOCTL_REPORT_EVENT _IOC(_IOC_WRITE, 'K', 3, 0) -#define KSU_IOCTL_SET_SEPOLICY _IOC(_IOC_READ|_IOC_WRITE, 'K', 4, 0) -#define KSU_IOCTL_CHECK_SAFEMODE _IOC(_IOC_READ, 'K', 5, 0) -// deprecated -#define KSU_IOCTL_GET_ALLOW_LIST _IOC(_IOC_READ|_IOC_WRITE, 'K', 
6, 0) -// deprecated -#define KSU_IOCTL_GET_DENY_LIST _IOC(_IOC_READ|_IOC_WRITE, 'K', 7, 0) -#define KSU_IOCTL_NEW_GET_ALLOW_LIST _IOWR('K', 6, struct ksu_new_get_allow_list_cmd) -#define KSU_IOCTL_NEW_GET_DENY_LIST _IOWR('K', 7, struct ksu_new_get_allow_list_cmd) -#define KSU_IOCTL_UID_GRANTED_ROOT _IOC(_IOC_READ|_IOC_WRITE, 'K', 8, 0) -#define KSU_IOCTL_UID_SHOULD_UMOUNT _IOC(_IOC_READ|_IOC_WRITE, 'K', 9, 0) -#define KSU_IOCTL_GET_MANAGER_APPID _IOC(_IOC_READ, 'K', 10, 0) -#define KSU_IOCTL_GET_APP_PROFILE _IOC(_IOC_READ|_IOC_WRITE, 'K', 11, 0) -#define KSU_IOCTL_SET_APP_PROFILE _IOC(_IOC_WRITE, 'K', 12, 0) -#define KSU_IOCTL_GET_FEATURE _IOC(_IOC_READ|_IOC_WRITE, 'K', 13, 0) -#define KSU_IOCTL_SET_FEATURE _IOC(_IOC_WRITE, 'K', 14, 0) -#define KSU_IOCTL_GET_WRAPPER_FD _IOC(_IOC_WRITE, 'K', 15, 0) -#define KSU_IOCTL_MANAGE_MARK _IOC(_IOC_READ|_IOC_WRITE, 'K', 16, 0) -#define KSU_IOCTL_NUKE_EXT4_SYSFS _IOC(_IOC_WRITE, 'K', 17, 0) -#define KSU_IOCTL_ADD_TRY_UMOUNT _IOC(_IOC_WRITE, 'K', 18, 0) - -// IOCTL handler types -typedef int (*ksu_ioctl_handler_t)(void __user *arg); -typedef bool (*ksu_perm_check_t)(void); - -// IOCTL command mapping -struct ksu_ioctl_cmd_map { - unsigned int cmd; - const char *name; - ksu_ioctl_handler_t handler; - ksu_perm_check_t perm_check; // Permission check function -}; - -// Install KSU fd to current process -int ksu_install_fd(void); - -void ksu_supercalls_init(void); -void ksu_supercalls_exit(void); - -// extensions -#define CHANGE_MANAGER_UID 10006 -#define KSU_UMOUNT_GETSIZE 107 // get list size // shit is u8 we cant fit 10k+ on it -#define KSU_UMOUNT_GETLIST 108 // get list -#define GET_SULOG_DUMP 10009 // get sulog dump, max, last 100 escalations -#define GET_SULOG_DUMP_V2 10010 // get sulog dump, timestamped, last 250 escalations -#define CHANGE_KSUVER 10011 // change ksu version -#define CHANGE_SPOOF_UNAME 10012 // spoof uname - -#endif // __KSU_H_SUPERCALLS diff --git a/drivers/kernelsu/tiny_sulog.c 
b/drivers/kernelsu/tiny_sulog.c index 9379182a311d..401f4c1c8daf 100644 --- a/drivers/kernelsu/tiny_sulog.c +++ b/drivers/kernelsu/tiny_sulog.c @@ -13,7 +13,7 @@ static uint8_t sulog_index_next = 0; static DEFINE_SPINLOCK(sulog_lock); -void sulog_init_heap() +static void tiny_sulog_init_heap() { sulog_buf_ptr = kzalloc(SULOG_BUFSIZ, GFP_KERNEL); if (!sulog_buf_ptr) @@ -48,7 +48,7 @@ static inline uint32_t boottime_s_get() return (uint32_t)boottime_s; } -void write_sulog(uint8_t sym) +static void write_sulog(uint8_t sym) { if (!sulog_buf_ptr) return; @@ -88,7 +88,7 @@ struct sulog_entry_rcv_ptr { uint64_t uptime_ptr; // uptime }; -int send_sulog_dump(void __user *uptr) +static int send_sulog_dump(void __user *uptr) { if (!sulog_buf_ptr) return 1; From 2c7237f801ebf8cd787e79bc8d385dfbabc3b60e Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sun, 12 Apr 2026 08:12:03 +0800 Subject: [PATCH 54/59] =?UTF-8?q?Revert=20"CI:=20=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E7=BC=96=E8=AF=91"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 9c2b350c5f39a3c21b925c6712f7430583ccb131. 
--- .github/workflows/build.yml | 234 ++++++++++++++++++------------------ 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b5b4349add89..948e5d193a8c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,11 +16,20 @@ jobs: name: 编译内核 permissions: { contents: write } runs-on: ubuntu-latest + strategy: + matrix: + device: + - beryllium + - dipper + - equuleus + - perseus + - polaris + - ursa steps: - name: 安装软件包 - if: env.PACKAGES != '' env: PACKAGES: + ccache binutils-aarch64-linux-gnu binutils-arm-linux-gnueabi run: | @@ -30,7 +39,7 @@ jobs: - name: 安装make4.4.1-2 run: | curl -LSs http://ftp.debian.org/debian/pool/main/m/make-dfsg/make_4.4.1-2_amd64.deb -o make.deb - sudo apt-get install -y -q ./make.deb + sudo apt-get install -y ./make.deb rm ./make.deb - name: 同步仓库 @@ -38,167 +47,158 @@ jobs: with: path: kernel - - name: 配置Anykernel3 - run: | - git clone https://github.com/osm0sis/AnyKernel3.git --depth=1 ak3 - rm -rf .git .github README.md LICENSE - find ak3/ -name "placeholder" | xargs rm -rf - cat >ak3/anykernel.sh <anykernel.sh <> $GITHUB_OUTPUT - echo "time=$TIME_STR" >> $GITHUB_OUTPUT + echo "time=$(TZ='Asia/Shanghai' date -u +'%Y%m%d%H%M')" >> $GITHUB_OUTPUT + echo "timestamp=$(date +%s)" >> $GITHUB_OUTPUT + + - name: 下载ci管理器 + continue-on-error: true + uses: dawidd6/action-download-artifact@master + with: + repo: rsuntk/KernelSU + workflow_conclusion: success + name: manager + workflow: build-manager.yml + path: manager + check_artifacts: true + search_artifacts: true - name: 发布 - if: github.event_name == 'push' - id: release uses: softprops/action-gh-release@master with: tag_name: rel-${{ steps.time.outputs.timestamp }} name: Kernel build ${{ steps.time.outputs.time }} prerelease: ${{ startsWith(github.ref_name, 'dev/') }} - files: dist/* + files: | + kernel/* + manager/* - name: 发送Telegram通知 - if: github.event_name == 'push' continue-on-error: 
true + env: + COMMIT_MESSAGE: ${{ github.event.head_commit.message }} + COMMIT_URL: ${{ github.event.head_commit.url }} + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + RELEASE_URL: ${{ github.server_url }}/${{ github.repository }}/releases/tag/rel-${{ steps.time.outputs.timestamp }} run: | - IDS=(${{ join(github.event.commits.*.id, ' ') }}) - MAX=6 - if [ "${#IDS[@]}" -gt "$MAX" ]; then - COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]:0:$MAX}"; echo "......")" - else - COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]}")" - fi - MSG="\ - CI ${{ steps.time.outputs.time }} -
\
-          项目: ${{ github.repository }}
-          分支: ${{ github.ref_name }}\
-          
- 提交ID: -
$COMMIT_IDS_TEXT
\ + msg="*CI ${{ steps.time.outputs.time }}* + > Branch/分支: \`${{ github.ref_name }}\` + \`\`\` + $COMMIT_MESSAGE + \`\`\` + [Download/下载]($RELEASE_URL) + [Commit/提交]($COMMIT_URL) + [Run/工作流]($RUN_URL) " - PREVIEW_OPTIONS="{ \ - \"url\": \"${{ steps.release.outputs.url }}\", \ - \"prefer_small_media\": true, \ - \"show_above_text\": true \ - }" - BUTTONS="{\"inline_keyboard\": [ [ \ - { \"text\": \"下载链接\", \"url\": \"${{ steps.release.outputs.url }}\" }, \ - { \"text\": \"对比差异\", \"url\": \"${{ github.event.compare }}\" } \ - ] ] }" - curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \ - -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \ - -d "message_thread_id=${{ vars.TELEGRAM_MESSAGE_THREAD_ID }}" \ - -d "parse_mode=HTML" \ - --data-urlencode "text=$MSG" \ - -d "link_preview_options=$PREVIEW_OPTIONS" \ - -d "reply_markup=$BUTTONS" \ - -o response.txt && \ - (! ${{ startsWith(github.ref_name, 'stable/') }} || \ - curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \ - -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \ - -d "message_id=$(jq '.result.message_id' response.txt)") + curl -LSs https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \ + -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \ + -F 'message_thread_id=${{ secrets.TELEGRAM_MESSAGE_THREAD_ID }}' \ + -F 'parse_mode="markdownv2"' \ + -F "text=\"$msg\"" | tee Markdown.txt + ! 
${{ startsWith(github.ref_name, 'stable/') }} || \ + curl https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \ + -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \ + -F message_id=$(jq '.result.message_id' Markdown.txt) From ecebf0043fbab17b21ef897d97a184f227e27f03 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Sun, 12 Apr 2026 08:19:56 +0800 Subject: [PATCH 55/59] =?UTF-8?q?CI:=20=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- .github/workflows/build.yml | 127 ++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 64 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 948e5d193a8c..ac16c6964dbe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -27,9 +27,9 @@ jobs: - ursa steps: - name: 安装软件包 + if: env.PACKAGES != '' env: PACKAGES: - ccache binutils-aarch64-linux-gnu binutils-arm-linux-gnueabi run: | @@ -39,7 +39,7 @@ jobs: - name: 安装make4.4.1-2 run: | curl -LSs http://ftp.debian.org/debian/pool/main/m/make-dfsg/make_4.4.1-2_amd64.deb -o make.deb - sudo apt-get install -y ./make.deb + sudo apt-get install -y -q ./make.deb rm ./make.deb - name: 同步仓库 @@ -49,17 +49,25 @@ jobs: - name: 缓存Clang id: cache-clang - uses: actions/cache@main + uses: actions/cache/restore@main with: path: clang key: clang-${{ env.AOSP_TOOLCHAIN_BRANCH }}-${{ env.AOSP_CLANG_VERSION }} - name: 下载Clang + id: download_clang if: steps.cache-clang.outputs.cache-hit != 'true' - run: - mkdir -p clang && - curl -LSs "https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86/+archive/refs/heads/${{ env.AOSP_TOOLCHAIN_BRANCH }}-release/clang-${{ env.AOSP_CLANG_VERSION }}.tar.gz" | - tar xz -C clang + run: | + mkdir -p clang + wget -c -t 10 
"https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86/+archive/refs/heads/${{ env.AOSP_TOOLCHAIN_BRANCH }}-release/clang-${{ env.AOSP_CLANG_VERSION }}.tar.gz" -O clang.tgz + tar -zxvf clang.tgz -C clang/ + + - name: 保存Clang + if: always() && steps.cache-clang.outputs.cache-hit != 'true' && steps.download_clang.outcome == 'success' + uses: actions/cache/save@main + with: + path: clang + key: clang-${{ env.AOSP_TOOLCHAIN_BRANCH }}-${{ env.AOSP_CLANG_VERSION }} - name: 缓存ccache uses: hendrikmuhs/ccache-action@main @@ -73,12 +81,12 @@ jobs: env: MAKE_ARGS: -j$(nproc --all) + O=out + LLVM=1 + LLVM_IAS=1 CC="ccache clang" LD=ld.lld ARCH=arm64 - LLVM=1 - LLVM_IAS=1 - O=out CROSS_COMPILE=aarch64-linux-gnu- CROSS_COMPILE_ARM32=arm-linux-gnueabi- CONFIG_FILES: @@ -112,31 +120,15 @@ jobs: EOF zip -qr9 Anykernel3-${{ matrix.device }}.zip * -x .git .github README.md *placeholder - - name: 打包(boot) - run: | - git clone https://android.googlesource.com/platform/system/tools/mkbootimg --depth=1 mkbootimg - cp kernel/out/arch/arm64/boot/Image.gz-dtb mkbootimg/ - cd mkbootimg - boot_url=$(curl -LSs https://download.lineageos.org/api/v2/devices/${{ matrix.device }}/builds | jq -r '.[0].files[1].url') - curl -LSs $boot_url -o boot.img - mkbootimg_args=$(./unpack_bootimg.py --out out --boot_img boot.img --format mkbootimg) - mv Image.gz-dtb out/kernel - eval "./mkbootimg.py $mkbootimg_args -o boot-lineage-${{ matrix.device }}.img" - - name: 上传文件 uses: actions/upload-artifact@main with: name: kernel-${{ matrix.device }}-ak3 path: ak3/Anykernel3-${{ matrix.device }}.zip - - name: 上传文件 - uses: actions/upload-artifact@main - with: - name: kernel-${{ matrix.device }}-boot - path: mkbootimg/boot-lineage-${{ matrix.device }}.img - release: name: 发布 + if: github.event_name == 'push' permissions: { contents: write } runs-on: ubuntu-latest needs: build @@ -151,54 +143,61 @@ jobs: - name: 获取当前时间 id: time run: | - echo "time=$(TZ='Asia/Shanghai' date -u +'%Y%m%d%H%M')" >> 
$GITHUB_OUTPUT - echo "timestamp=$(date +%s)" >> $GITHUB_OUTPUT - - - name: 下载ci管理器 - continue-on-error: true - uses: dawidd6/action-download-artifact@master - with: - repo: rsuntk/KernelSU - workflow_conclusion: success - name: manager - workflow: build-manager.yml - path: manager - check_artifacts: true - search_artifacts: true + NOW=$(date +%s) + TIME_STR=$(TZ='Asia/Shanghai' date -d "@$NOW" +'%Y%m%d%H%M') + echo "timestamp=$NOW" >> $GITHUB_OUTPUT + echo "time=$TIME_STR" >> $GITHUB_OUTPUT - name: 发布 uses: softprops/action-gh-release@master + id: release with: tag_name: rel-${{ steps.time.outputs.timestamp }} name: Kernel build ${{ steps.time.outputs.time }} prerelease: ${{ startsWith(github.ref_name, 'dev/') }} files: | kernel/* - manager/* - name: 发送Telegram通知 continue-on-error: true - env: - COMMIT_MESSAGE: ${{ github.event.head_commit.message }} - COMMIT_URL: ${{ github.event.head_commit.url }} - RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - RELEASE_URL: ${{ github.server_url }}/${{ github.repository }}/releases/tag/rel-${{ steps.time.outputs.timestamp }} run: | - msg="*CI ${{ steps.time.outputs.time }}* - > Branch/分支: \`${{ github.ref_name }}\` - \`\`\` - $COMMIT_MESSAGE - \`\`\` - [Download/下载]($RELEASE_URL) - [Commit/提交]($COMMIT_URL) - [Run/工作流]($RUN_URL) + IDS=(${{ join(github.event.commits.*.id, ' ') }}) + MAX=6 + if [ "${#IDS[@]}" -gt "$MAX" ]; then + COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]:0:$MAX}"; echo "......")" + else + COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]}")" + fi + MSG="\ + CI ${{ steps.time.outputs.time }} +
\
+          项目: ${{ github.repository }}
+          分支: ${{ github.ref_name }}\
+          
+ 提交ID: +
$COMMIT_IDS_TEXT
\ " - curl -LSs https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \ - -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \ - -F 'message_thread_id=${{ secrets.TELEGRAM_MESSAGE_THREAD_ID }}' \ - -F 'parse_mode="markdownv2"' \ - -F "text=\"$msg\"" | tee Markdown.txt - ! ${{ startsWith(github.ref_name, 'stable/') }} || \ - curl https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \ - -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \ - -F message_id=$(jq '.result.message_id' Markdown.txt) + PREVIEW_OPTIONS="{ \ + \"url\": \"${{ steps.release.outputs.url }}\", \ + \"prefer_small_media\": true, \ + \"show_above_text\": true \ + }" + BUTTONS="{\"inline_keyboard\": [ [ \ + { \"text\": \"下载链接\", \"url\": \"${{ steps.release.outputs.url }}\" }, \ + { \"text\": \"对比差异\", \"url\": \"${{ github.event.compare }}\" } \ + ] ] }" + curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \ + -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \ + -d "message_thread_id=${{ vars.TELEGRAM_MESSAGE_THREAD_ID }}" \ + -d "parse_mode=HTML" \ + --data-urlencode "text=$MSG" \ + -d "link_preview_options=$PREVIEW_OPTIONS" \ + -d "reply_markup=$BUTTONS" \ + -o response.txt && \ + (! 
${{ startsWith(github.ref_name, 'stable/') }} || \ + curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \ + -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \ + -d "message_id=$(jq '.result.message_id' response.txt)") + if [ "${{ runner.debug }}" = "1" ]; then + cat response.txt + fi From 3fb6a13c59196ebdeaab23fa9f20a197fe8b3e59 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Tue, 5 May 2026 16:32:49 +0800 Subject: [PATCH 56/59] =?UTF-8?q?CI:=20=E9=BB=98=E8=AE=A4=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0lxc.config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- .github/workflows/build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ac16c6964dbe..e35f5377f681 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -92,6 +92,7 @@ jobs: CONFIG_FILES: vendor/xiaomi/mi845_defconfig vendor/xiaomi/${{ matrix.device }}.config + lxc.config run: | export PATH=$GITHUB_WORKSPACE/clang/bin:$PATH export KBUILD_BUILD_USER=${{ github.repository_owner }} From 443feccf3a358026a2095ff48bea1c78c0018f60 Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Tue, 5 May 2026 09:00:36 +0000 Subject: [PATCH 57/59] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E8=87=B3backslashxx/Ke?= =?UTF-8?q?rnelSU@2aa0289?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- drivers/kernelsu/Kconfig | 2 +- drivers/kernelsu/Makefile | 37 +- drivers/kernelsu/extras.c | 16 +- drivers/kernelsu/feature/adb_root.c | 126 +++--- drivers/kernelsu/feature/kernel_umount.c | 21 +- drivers/kernelsu/feature/sucompat.c | 276 +++++++----- drivers/kernelsu/hook/core_hook.c | 425 ++++++++++++------ drivers/kernelsu/hook/kp_ksud.c | 179 
++------ drivers/kernelsu/hook/setuid_hook.c | 35 ++ .../kernelsu/hook/syscall_table_hook_arm.c | 54 +-- .../kernelsu/hook/syscall_table_hook_arm64.c | 84 ++-- drivers/kernelsu/include/arch.h | 12 +- drivers/kernelsu/include/ksu.h | 2 +- drivers/kernelsu/include/uapi/app_profile.h | 2 +- drivers/kernelsu/infra/file_wrapper.c | 75 +--- drivers/kernelsu/kernel_compat.c | 34 +- drivers/kernelsu/kernel_compat.h | 196 +++++--- drivers/kernelsu/kernel_includes.h | 25 +- drivers/kernelsu/ksu.c | 24 +- drivers/kernelsu/linux_hashtable.h | 243 ++++++++++ drivers/kernelsu/manager/apk_sign.c | 20 +- drivers/kernelsu/manager/manager_identity.h | 7 +- drivers/kernelsu/manager/pkg_observer.c | 89 ++++ drivers/kernelsu/manager/throne_tracker.c | 68 +-- drivers/kernelsu/manager/throne_tracker.h | 37 +- drivers/kernelsu/policy/allowlist.c | 329 +++++++------- drivers/kernelsu/policy/allowlist.h | 9 +- drivers/kernelsu/policy/app_profile.c | 74 +-- drivers/kernelsu/runtime/ksud.c | 275 +++++------- drivers/kernelsu/runtime/ksud.h | 11 +- drivers/kernelsu/runtime/ksud_escape.c | 213 +++++++++ drivers/kernelsu/runtime/ksud_escape.h | 41 ++ drivers/kernelsu/selinux/rules.c | 61 +-- drivers/kernelsu/selinux/sepolicy.c | 270 ++++++----- drivers/kernelsu/sulog/event.c | 16 +- drivers/kernelsu/sulog/event.h | 5 +- drivers/kernelsu/supercall/dispatch.c | 45 +- drivers/kernelsu/supercall/perm.c | 10 +- drivers/kernelsu/supercall/supercall.c | 53 +-- drivers/kernelsu/tiny_sulog.c | 11 +- security/selinux/ss/services.c | 4 + 41 files changed, 2140 insertions(+), 1376 deletions(-) create mode 100644 drivers/kernelsu/hook/setuid_hook.c create mode 100644 drivers/kernelsu/linux_hashtable.h create mode 100644 drivers/kernelsu/manager/pkg_observer.c create mode 100644 drivers/kernelsu/runtime/ksud_escape.c create mode 100644 drivers/kernelsu/runtime/ksud_escape.h diff --git a/drivers/kernelsu/Kconfig b/drivers/kernelsu/Kconfig index cb75564a95ae..a2a7bebe2921 100644 --- a/drivers/kernelsu/Kconfig 
+++ b/drivers/kernelsu/Kconfig @@ -46,7 +46,7 @@ config KSU_FEATURE_SULOG config KSU_FEATURE_ADBROOT bool "KernelSU ADB Root feature" depends on KSU - default n + default y help Build KernelSU's adb root feature. diff --git a/drivers/kernelsu/Makefile b/drivers/kernelsu/Makefile index ab779e4d0a5d..8ed9b3857342 100644 --- a/drivers/kernelsu/Makefile +++ b/drivers/kernelsu/Makefile @@ -13,6 +13,15 @@ ifeq ($(shell grep -q "struct selinux_state " $(srctree)/security/selinux/includ CFLAGS_ksu.o += -DKSU_COMPAT_HAS_SELINUX_STATE endif +ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) +CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT +endif + +# half-assed-backport from 5.1 +ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct_array;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) +CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT_ARRAY +endif + ifeq ($(shell grep -q "^DEFINE_RWLOCK(policy_rwlock);" $(srctree)/security/selinux/ss/services.c; echo $$?),0) CFLAGS_ksu.o += -DKSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK endif @@ -21,6 +30,10 @@ ifeq ($(shell grep -q "cpus_ptr;" $(srctree)/include/linux/sched.h; echo $$?),0) CFLAGS_ksu.o += -DKSU_COMPAT_HAS_BACKPORTED_CPUS_PTR endif +ifeq ($(shell grep -q "^struct security_operations selinux_ops" $(srctree)/security/selinux/hooks.c; echo $$?),0) +CFLAGS_ksu.o += -DKSU_HAS_EXPORTED_SELINUX_OPS +endif + # UL, look for read_iter on f_op struct ifeq ($(shell grep -q "read_iter" $(srctree)/include/linux/fs.h 2>/dev/null; echo $$?),0) CFLAGS_ksu.o += -DKSU_HAS_FOP_READ_ITER @@ -31,26 +44,10 @@ ifeq ($(shell grep -q "^int iterate_dir" $(srctree)/fs/readdir.c 2>/dev/null; ec CFLAGS_ksu.o += -DKSU_HAS_ITERATE_DIR endif -ifeq ($(shell grep -q "selinux_inode" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) -CFLAGS_ksu.o += -DKSU_HAS_SELINUX_INODE -endif - -ifeq ($(shell grep -q "selinux_cred" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) 
-CFLAGS_ksu.o += -DKSU_HAS_SELINUX_CRED -endif - -ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) -CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT -endif - -# half-assed-backport from 5.1 -ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct_array;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) -CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT_ARRAY -endif - -CFLAGS_ksu.o += -Wno-implicit-function-declaration -Wno-strict-prototypes -Wno-int-conversion -Wno-missing-prototypes -CFLAGS_ksu.o += -Wno-declaration-after-statement -Wno-unused-function -Wno-format -Wno-incompatible-pointer-types -CFLAGS_ksu.o += -Wno-unused-variable -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast +CFLAGS_ksu.o += -Wno-implicit-function-declaration -Wno-strict-prototypes -Wno-declaration-after-statement +CFLAGS_ksu.o += -Wno-int-conversion -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast +CFLAGS_ksu.o += -Wno-unused-variable -Wno-unused-function -Wno-format +CFLAGS_ksu.o += -Wno-macro-redefined # so we can see stack use atleast, as we disable all stack safety here CFLAGS_ksu.o += $(call cc-option, -Wframe-larger-than=1024) diff --git a/drivers/kernelsu/extras.c b/drivers/kernelsu/extras.c index 642c83bfd39a..e7436175102a 100644 --- a/drivers/kernelsu/extras.c +++ b/drivers/kernelsu/extras.c @@ -4,6 +4,7 @@ // - xx, 20251019 static u32 su_sid = 0; +static u32 ksu_sid = 0; static u32 priv_app_sid = 0; // init as disabled by default @@ -62,6 +63,13 @@ static int get_sid() } pr_info("avc_spoof/get_sid: su_sid: %u\n", su_sid); + err = security_secctx_to_secid("u:r:ksu:s0", strlen("u:r:ksu:s0"), &ksu_sid); + if (err) { + pr_info("avc_spoof/get_sid: ksu_sid not found!\n"); + return -1; + } + pr_info("avc_spoof/get_sid: ksu_sid: %u\n", ksu_sid); + err = security_secctx_to_secid("u:r:priv_app:s0:c512,c768", strlen("u:r:priv_app:s0:c512,c768"), &priv_app_sid); if (err) { pr_info("avc_spoof/get_sid: priv_app_sid not 
found!\n"); @@ -82,8 +90,8 @@ static int ksu_handle_slow_avc_audit(u32 *tsid) // if tsid is su, we just replace it // unsure if its enough, but this is how it is aye? - if (*tsid == su_sid) { - pr_info("avc_spoof/slow_avc_audit: replacing su_sid: %u with priv_app_sid: %u\n", su_sid, priv_app_sid); + if (*tsid == su_sid || *tsid == ksu_sid) { + pr_info("avc_spoof/slow_avc_audit: replacing tsid: %u with priv_app_sid: %u\n", *tsid, priv_app_sid); *tsid = priv_app_sid; } @@ -151,10 +159,10 @@ int ksu_handle_slow_avc_audit_new(u32 tsid, u16 *tclass) if (atomic_read(&disable_spoof)) return 0; - if (tsid != su_sid) + if (tsid != su_sid && tsid != ksu_sid) return 0; - pr_info("avc_spoof/slow_avc_audit: prevent log for sid: %u\n", su_sid); + pr_info("avc_spoof/slow_avc_audit: prevent log for sid: %u\n", tsid); *tclass = 0; return 0; diff --git a/drivers/kernelsu/feature/adb_root.c b/drivers/kernelsu/feature/adb_root.c index c3935cd8139a..125d0470e75b 100644 --- a/drivers/kernelsu/feature/adb_root.c +++ b/drivers/kernelsu/feature/adb_root.c @@ -31,6 +31,7 @@ static long is_libadbroot_ok() } else { pr_err("access libadbroot.so failed: %ld, skip adb root\n", ret); } + return ret; } else { ret = 1; } @@ -137,59 +138,57 @@ static long setup_ld_preload(void ***envp_arg) return ret; } -__attribute__((cold)) -static noinline long do_ksu_adb_root_handle_execve(const char __user **filename_user, void ***envp) +static noinline void do_ksu_adb_root_handle_execve(void *filename, void *envp_in) { + if (likely(test_thread_flag(TIF_SECCOMP))) + return; + + uid_t uid = current_euid().val; + if (uid != 0 && uid != 2000) + return; + + // filename is void * char __user * + const char __user **filename_user = (const char __user **)filename; + if (likely(!is_exec_adbd(filename_user))) - return 0; + return; if (unlikely(!is_libadbroot_ok())) - return 0; + return; - long ret = setup_ld_preload(envp); - if (ret) - return ret; + if (setup_ld_preload((void ***)envp_in)) + return; pr_info("escape to 
root for adb\n"); escape_to_root_for_adb_root(); escape_with_root_profile(); // why is this needed for 3.x? - return 0; + return; } -// sys_execve, syscall hooks -static __always_inline long ksu_adb_root_handle_execve(const char __user **filename_user, void ***envp) +static noinline void do_ksu_adb_root_handle_execveat(void *filename, void *envp_in) { - if (likely(!ksu_adb_root)) - return 0; + if (likely(test_thread_flag(TIF_SECCOMP))) + return; - if (likely(!!current->seccomp.mode)) - return 0; + uid_t uid = current_euid().val; + if (uid != 0 && uid != 2000) + return; - do_ksu_adb_root_handle_execve(filename_user, envp); - - return 0; -} + if (!filename) + return; -struct user_arg_ptr { -#ifdef CONFIG_COMPAT - bool is_compat; -#endif - union { - const char __user *const __user *native; -#ifdef CONFIG_COMPAT - const compat_uptr_t __user *compat; -#endif - } ptr; -}; + // filename is char ** + if (!*(void **)filename) + return; -__attribute__((cold)) -static noinline long do_ksu_adb_root_handle_execveat(char *filename, void *envp_in) -{ - if (!!endswith(filename, "/adbd")) - return 0; + if (!!endswith(*(char **)filename, "/adbd")) + return; if (unlikely(!is_libadbroot_ok())) - return 0; + return; + + if (!envp_in) + return; struct user_arg_ptr *envp = (struct user_arg_ptr *)envp_in; @@ -201,35 +200,46 @@ static noinline long do_ksu_adb_root_handle_execveat(char *filename, void *envp_ pr_info("%s: envp 0x%lx \n", __func__, (uintptr_t)*envp_addr ); - long ret = setup_ld_preload(envp_addr); - if (ret) - return ret; + if (setup_ld_preload(envp_addr)) + return; pr_info("escape to root for adb\n"); escape_to_root_for_adb_root(); escape_with_root_profile(); // why is this needed? 
- return 0; + return; } -// do_execve, do_execve_common, do_execveat_common -static __always_inline long ksu_adb_root_handle_execveat(char *filename, void *envp_in) -{ - if (likely(!ksu_adb_root)) - return 0; - - if (likely(!!current->seccomp.mode)) - return 0; +#ifdef KSU_CAN_USE_JUMP_LABEL // see kernel_compat.h - if (!filename) - return 0; - - if (!envp_in) - return 0; +DEFINE_STATIC_KEY_FALSE(ksu_adb_root_key); - do_ksu_adb_root_handle_execveat(filename, envp_in); +static inline void ksu_adb_root_handle_execve(void *filename, void *envp_in) +{ + if (static_branch_unlikely(&ksu_adb_root_key)) + do_ksu_adb_root_handle_execve(filename, envp_in); +} +static inline void ksu_adb_root_handle_execveat(void *filename, void *envp_in) +{ + if (static_branch_unlikely(&ksu_adb_root_key)) + do_ksu_adb_root_handle_execveat(filename, envp_in); +} - return 0; +static inline void ksu_static_branch_enable() { static_branch_enable(&ksu_adb_root_key); smp_mb(); } +static inline void ksu_static_branch_disable() { static_branch_disable(&ksu_adb_root_key); smp_mb(); } +#else /* ! 
KSU_CAN_USE_JUMP_LABEL */ +static inline void ksu_adb_root_handle_execve(void *filename, void *envp_in) +{ + if (unlikely(ksu_adb_root)) + do_ksu_adb_root_handle_execve(filename, envp_in); } +static inline void ksu_adb_root_handle_execveat(void *filename, void *envp_in) +{ + if (unlikely(ksu_adb_root)) + do_ksu_adb_root_handle_execveat(filename, envp_in); +} +static inline void ksu_static_branch_enable() { } // no-op +static inline void ksu_static_branch_disable() { } // no-op +#endif // KSU_CAN_USE_JUMP_LABEL static int kernel_adb_root_feature_get(u64 *value) { @@ -240,10 +250,18 @@ static int kernel_adb_root_feature_get(u64 *value) static int kernel_adb_root_feature_set(u64 value) { bool enable = value != 0; + + // prevent double enable / double disable + // as old api does ref inc / dec, its a 'lil risky + if (enable == ksu_adb_root) + return 0; + if (enable) { ksu_adb_root = true; + ksu_static_branch_enable(); } else { ksu_adb_root = false; + ksu_static_branch_disable(); } pr_info("adb_root: set to %d\n", enable); return 0; diff --git a/drivers/kernelsu/feature/kernel_umount.c b/drivers/kernelsu/feature/kernel_umount.c index 88b6ce6cc565..f5d399657852 100644 --- a/drivers/kernelsu/feature/kernel_umount.c +++ b/drivers/kernelsu/feature/kernel_umount.c @@ -1,4 +1,4 @@ -static bool ksu_kernel_umount_enabled = true; +static bool ksu_kernel_umount_enabled __read_mostly = true; static int kernel_umount_feature_get(u64 *value) { @@ -23,7 +23,7 @@ static const struct ksu_feature_handler kernel_umount_handler = { extern int path_umount(struct path *path, int flags); -static void ksu_umount_mnt(const char *mnt, struct path *path, int flags) +static inline void ksu_umount_mnt(const char *mnt, struct path *path, int flags) { int err = path_umount(path, flags); if (err) @@ -52,18 +52,15 @@ static inline int ksu_handle_umount(struct cred *new, const struct cred *old) uid_t new_uid = ksu_get_uid_t(new->uid); uid_t old_uid = ksu_get_uid_t(old->uid); - // if there isn't any 
module mounted, just ignore it! - if (!ksu_module_mounted) { + if (!ksu_kernel_umount_enabled) return 0; - } - if (!ksu_kernel_umount_enabled) { + // if there isn't any module mounted, just ignore it! + if (!ksu_module_mounted) return 0; - } - if (!ksu_cred) { + if (!ksu_cred) return 0; - } // There are 6 scenarios: // 1. Normal app: zygote -> appuid @@ -72,13 +69,11 @@ static inline int ksu_handle_umount(struct cred *new, const struct cred *old) // 4. Webview zygote forked from zygote: zygote -> WEBVIEW_ZYGOTE_UID (no need to handle, app cannot run custom code) // 5. Isolated process forked from app zygote: appuid -> isolated_process (already handled by 3) // 6. Isolated process forked from webview zygote (no need to handle, app cannot run custom code) - if (!is_appuid(new_uid) && !is_isolated_process(new_uid)) { + if (!is_appuid(new_uid) && !is_isolated_process(new_uid)) return 0; - } - if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid)) { + if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid)) return 0; - } // check old process's selinux context, if it is not zygote, ignore it! 
// because some su apps may setuid to untrusted_app but they are in global mount namespace diff --git a/drivers/kernelsu/feature/sucompat.c b/drivers/kernelsu/feature/sucompat.c index dabfe34f2def..192ef8a38a5d 100644 --- a/drivers/kernelsu/feature/sucompat.c +++ b/drivers/kernelsu/feature/sucompat.c @@ -1,3 +1,9 @@ +#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE +#define SUCOMPAT_HOOK_TYPE static __always_inline int +#else +#define SUCOMPAT_HOOK_TYPE int +#endif + #define SU_PATH "/system/bin/su" #define SH_PATH "/system/bin/sh" @@ -53,24 +59,64 @@ static char __user *ksud_user_path(void) return userspace_stack_buffer(ksud_path, sizeof(ksud_path)); } +#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) && defined(KSU_CAN_USE_JUMP_LABEL) +DEFINE_STATIC_KEY_TRUE(ksud_sucompat_key); +static inline void ksu_sucompat_enable_branch() +{ + pr_info("su_compat: enable sucompat branches\n"); + static_branch_enable(&ksud_sucompat_key); + smp_mb(); +} +static inline void ksu_sucompat_disable_branch() +{ + pr_info("su_compat: remove sucompat branches\n"); + static_branch_disable(&ksud_sucompat_key); + smp_mb(); +} +#else +static inline void ksu_sucompat_enable_branch() { } // no-op +static inline void ksu_sucompat_disable_branch() { } // no-op +#endif + __attribute__((hot)) static __always_inline bool is_su_allowed(const void **ptr_to_check) { #ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE - barrier(); +#ifdef KSU_CAN_USE_JUMP_LABEL + // read as: if not 'likely' disabled + if (!!!static_branch_likely(&ksud_sucompat_key)) + return false; +#else if (!ksu_su_compat_enabled) return false; +#endif // KSU_CAN_USE_JUMP_LABEL #endif - barrier(); - if (likely(!!current->seccomp.mode)) + if (likely(test_thread_flag(TIF_SECCOMP))) return false; - // with seccomp check above, we can make this neutral - kuid_t current_uid = current_uid(); - if (!ksu_is_allow_uid_for_current( ksu_get_uid_t(current_uid) )) + // see seccomp check above + // so if its root but not ksu domain, deny, see 
__ksu_is_allow_uid_for_current + // actually, we can likely skip this step? + uid_t uid = current_uid().val; + if (!!uid) + goto uid_check; + + if (!is_ksu_domain()) + return false; + goto check_ptr; + + // NOTE: shell has its seccomp disabled, so we only need to check for this thing + // short-circuit if not shell! as we allow apps on setuid lsm by disabling seccomp +uid_check: + if (likely(uid != 2000)) + goto check_ptr; + + // use internal function, not the macro + if (!__ksu_is_allow_uid(uid)) return false; +check_ptr: // first check the pointer-to-pointer if (unlikely(!ptr_to_check)) return false; @@ -82,68 +128,69 @@ static __always_inline bool is_su_allowed(const void **ptr_to_check) return true; } -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) -__attribute__((cold)) -static noinline void sys_execve_escape_ksud(const char __user **filename_user) +static __always_inline void ksu_sucompat_user_common(const char __user **filename_user, + const char *syscall_name, + const bool escalate, + const uint8_t sym) { - // see if its init - if (!is_init(current_cred())) - return; - - const char ksud_path[] = KSUD_PATH; - char path[sizeof(ksud_path)]; + uintptr_t buf; + const char su[] = SU_PATH; - // see if its trying to execute ksud - if (ksu_copy_from_user_retry(path, *filename_user, sizeof(path))) + // sugar prep + uintptr_t *su_p = (uintptr_t *)su; + uintptr_t __user *fn_p = (uintptr_t *)*(char **)filename_user; + + // assert /system/bin/su\0 = 15 bytes. + BUILD_BUG_ON(sizeof(su) > 16); // compielr might to pad + BUILD_BUG_ON(sizeof(su) < 15); + + /* + * it seems this is actually the slowest part, we peek last word first to speed it up + * NOTE: get_user rets EFAULT on err, so if we are copying a pointer + * that goes to nothing, we also detect that and ret fast + * + * first read overreads, reading 8 bytes, "bin/su\0?" / 4 bytes, "su\0?" 
when we only need 7/3 + * but this is fine as we are guaranteed alignment, hardware provides trailing garbeg + * if it is specially crafted and hits a page guard, we just get EFAULT anyway + * + * on 64-bit we do this in 2 word compare, 4 on 32-bit + * + * we can do some bitmasking 0xFFFFFF blah blah to do that tail compare (7 or 3 bytes), + * but hot damn I hate that shit, lets just have __builtin_memcmp do it for us + * + */ + +#ifdef CONFIG_64BIT + if (get_user(buf, &fn_p[1])) return; - if (memcmp(ksud_path, path, sizeof(path))) + if (likely(!!__builtin_memcmp(&buf, su + sizeof(uintptr_t), sizeof(su) - sizeof(uintptr_t) ))) + return; +#else + if (get_user(buf, &fn_p[3])) return; - pr_info("sys_execve: escape init executing ksud with pid: %d\n", current->pid); - - escape_to_root_forced(); // give this context all permissions - - return; -} + if (likely(!!__builtin_memcmp(&buf, su + (3 * sizeof(uintptr_t)), sizeof(su) - (3 * sizeof(uintptr_t)) ))) + return; -__attribute__((cold)) -static noinline void kernel_execve_escape_ksud(void *filename_ptr) -{ - // see if its init - if (!is_init(current_cred())) + if (unlikely(get_user(buf, &fn_p[2]))) return; - if (likely(memcmp(filename_ptr, KSUD_PATH, sizeof(KSUD_PATH)))) + if (buf != su_p[2]) return; - pr_info("kernel_execve: escape init executing ksud with pid: %d\n", current->pid); + if (unlikely(get_user(buf, &fn_p[1]))) + return; - escape_to_root_forced(); // give this context all permissions - - return; -} -#else -static inline void sys_execve_escape_ksud(const char __user **filename_user) { } // no-op -static inline void kernel_execve_escape_ksud(void *filename_ptr) { } // no-op + if (unlikely(buf != su_p[1])) + return; #endif + // last word + if (unlikely(get_user(buf, &fn_p[0]))) + return; -static noinline int ksu_sucompat_user_common(const char __user **filename_user, - const char *syscall_name, - const bool escalate, - const uint8_t sym) -{ - const char su[] = SU_PATH; - - char path[sizeof(su)]; // sizeof 
includes nullterm already! - if (ksu_copy_from_user_retry(path, *filename_user, sizeof(path))) - return 0; - - // what we shouldve copied should've been preterminated! - // path[sizeof(path) - 1] = '\0'; - - if (memcmp(path, su, sizeof(su))) - return 0; + if (unlikely(buf != su_p[0])) + return; write_sulog(sym); @@ -154,7 +201,7 @@ static noinline int ksu_sucompat_user_common(const char __user **filename_user, ksu_sulog_emit(KSU_SULOG_EVENT_SUCOMPAT, NULL, NULL, GFP_KERNEL); #endif if (!!escape_with_root_profile()) - return 0; + return; // NOTE: we only check file existence, not exec success! struct path kpath; @@ -164,69 +211,100 @@ static noinline int ksu_sucompat_user_common(const char __user **filename_user, path_put(&kpath); pr_info("%s su->ksud!\n", syscall_name); *filename_user = ksud_user_path(); - return 0; + return; no_ksud: no_escalate: pr_info("%s su->sh!\n", syscall_name); *filename_user = sh_user_path(); - return 0; + return; } // sys_faccessat -int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, - int *__unused_flags) +SUCOMPAT_HOOK_TYPE ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, int *__unused_flags) { if (!is_su_allowed((const void **)filename_user)) return 0; - return ksu_sucompat_user_common(filename_user, "faccessat", false, 'a'); + ksu_sucompat_user_common(filename_user, "faccessat", false, 'a'); + return 0; } // sys_newfstatat, sys_fstat64 -int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags) +SUCOMPAT_HOOK_TYPE ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags) { if (!is_su_allowed((const void **)filename_user)) return 0; - return ksu_sucompat_user_common(filename_user, "newfstatat", false, 's'); + ksu_sucompat_user_common(filename_user, "newfstatat", false, 's'); + return 0; } // sys_execve, compat_sys_execve -static int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, - void *argv, void *envp, int *flags) 
+SUCOMPAT_HOOK_TYPE ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, void *argv, void *envp, int *flags) { - if (unlikely(!ksu_boot_completed)) - sys_execve_escape_ksud(filename_user); + sys_execve_escape_ksud((void *)filename_user); #ifdef CONFIG_KSU_FEATURE_ADBROOT - ksu_adb_root_handle_execve(filename_user, (void ***)envp); + ksu_adb_root_handle_execve((void *)filename_user, (void *)envp); #endif if (!is_su_allowed((const void **)filename_user)) return 0; - return ksu_sucompat_user_common(filename_user, "sys_execve", true, 'x'); + ksu_sucompat_user_common(filename_user, "sys_execve", true, 'x'); + return 0; } -static noinline int ksu_sucompat_kernel_common(void *filename_ptr, const char *function_name, bool escalate) +#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE +static __always_inline void ksu_sucompat_kernel_common(void **filename_ptr, void *argv, void *envp, const char *function_name) { + kernel_execve_escape_ksud((void *)filename_ptr); - if (likely(memcmp(filename_ptr, SU_PATH, sizeof(SU_PATH)))) - return 0; +#ifdef CONFIG_KSU_FEATURE_ADBROOT + ksu_adb_root_handle_execveat((void *)filename_ptr, (void *)envp); +#endif + + if (!is_su_allowed((const void **)filename_ptr)) + return; + + // it seems this is actually the slowest part, we peek last word first to speed it up + // sugar prep + const char su[] = SU_PATH; + uintptr_t *su_p = (uintptr_t *)su; + uintptr_t *fn_p = (uintptr_t *)*(char **)filename_ptr; + + // assert /system/bin/su\0 = 15 bytes. + BUILD_BUG_ON(sizeof(su) > 16); // compielr might to pad + BUILD_BUG_ON(sizeof(su) < 15); + + // getname_flags pads this so nothing to worry about, dereference with confidence! 
+#ifdef CONFIG_64BIT + if (likely(!!__builtin_memcmp(&fn_p[1], &su_p[1], sizeof(su) - sizeof(uintptr_t) ))) + return; +#else + if (likely(!!__builtin_memcmp(&fn_p[3], &su_p[3], sizeof(su) - (3 * sizeof(uintptr_t)) ))) + return; + + if (fn_p[2] != su_p[2]) + return; + + if (fn_p[1] != su_p[1]) + return; +#endif + + if (unlikely(fn_p[0] != su_p[0])) + return; // we only handle execve here after removing vfs_statx hook for >= 6.1 write_sulog('x'); - if (!escalate) - goto no_escalate; - #ifdef CONFIG_KSU_FEATURE_SULOG ksu_sulog_emit(KSU_SULOG_EVENT_SUCOMPAT, NULL, NULL, GFP_KERNEL); #endif if (!!escape_with_root_profile()) - return 0; + return; // NOTE: we only check file existence, not exec success! struct path kpath; @@ -235,55 +313,35 @@ static noinline int ksu_sucompat_kernel_common(void *filename_ptr, const char *f path_put(&kpath); pr_info("%s su->ksud!\n", function_name); - memcpy(filename_ptr, KSUD_PATH, sizeof(KSUD_PATH)); - return 0; + memcpy(*filename_ptr, KSUD_PATH, sizeof(KSUD_PATH)); + return; no_ksud: -no_escalate: pr_info("%s su->sh!\n", function_name); - memcpy(filename_ptr, SH_PATH, sizeof(SH_PATH)); - return 0; - + memcpy(*filename_ptr, SH_PATH, sizeof(SH_PATH)); + return; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0) -// for do_execveat_common / do_execve_common on >= 3.14 -// take note: struct filename **filename +// take note: struct filename **filename, for do_execveat_common / do_execve_common on >= 3.14 int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv, void *envp, int *flags) { - if (unlikely(!ksu_boot_completed)) - kernel_execve_escape_ksud((void *)(*filename_ptr)->name); - -#ifdef CONFIG_KSU_FEATURE_ADBROOT - ksu_adb_root_handle_execveat((void *)(*filename_ptr)->name, envp); -#endif - if (!is_su_allowed((const void **)filename_ptr)) + struct filename *filename = *filename_ptr; + if (IS_ERR(filename)) // see getname_flags return 0; - return ksu_sucompat_kernel_common((void *)(*filename_ptr)->name, 
"do_execveat_common", true); -} -int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr, void *argv, void *envp, int *flags) -{ - // literally just an alias due to old hooks - return ksu_handle_execveat(fd, filename_ptr, argv, envp, flags); + ksu_sucompat_kernel_common((void **)&filename->name, argv, envp, "do_execveat_common"); + return 0; } #else -// for do_execve_common on < 3.14 -// take note: char **filename +// take note: char **filename, for do_execve_common on < 3.14 int ksu_legacy_execve_sucompat(const char **filename_ptr, void *argv, void *envp) { - if (unlikely(!ksu_boot_completed)) - kernel_execve_escape_ksud((void *)*filename_ptr); - -#ifdef CONFIG_KSU_FEATURE_ADBROOT - ksu_adb_root_handle_execveat((void *)*filename_ptr, envp); -#endif - if (!is_su_allowed((const void **)filename_ptr)) - return 0; - - return ksu_sucompat_kernel_common((void *)*filename_ptr, "do_execve_common", true); + ksu_sucompat_kernel_common((void **)filename_ptr, argv, envp, "do_execve_common"); + return 0; } #endif +#endif // CONFIG_KSU_TAMPER_SYSCALL_TABLE #ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE static void syscall_table_sucompat_enable(); @@ -296,6 +354,7 @@ static inline void syscall_table_sucompat_disable() { } // no-op static void ksu_sucompat_enable() { + ksu_sucompat_enable_branch(); syscall_table_sucompat_enable(); ksu_su_compat_enabled = true; @@ -305,6 +364,7 @@ static void ksu_sucompat_enable() static void ksu_sucompat_disable() { + ksu_sucompat_disable_branch(); syscall_table_sucompat_disable(); ksu_su_compat_enabled = false; diff --git a/drivers/kernelsu/hook/core_hook.c b/drivers/kernelsu/hook/core_hook.c index 1f203cdf44c0..2e12d00edef1 100644 --- a/drivers/kernelsu/hook/core_hook.c +++ b/drivers/kernelsu/hook/core_hook.c @@ -4,121 +4,56 @@ #define LSM_HANDLER_TYPE int #endif -LSM_HANDLER_TYPE ksu_handle_rename(struct dentry *old_dentry, struct dentry *new_dentry) +LSM_HANDLER_TYPE ksu_inode_rename(struct inode *old_inode, struct dentry 
*old_dentry, + struct inode *new_inode, struct dentry *new_dentry) { - if (!current->mm) { - // skip kernel threads - return 0; - } - - kuid_t current_uid = current_uid(); - if (ksu_get_uid_t(current_uid) != 1000) { - // skip non system uid - return 0; - } - - if (!old_dentry || !new_dentry) { - return 0; - } - - // /data/system/packages.list.tmp -> /data/system/packages.list - if (strcmp(new_dentry->d_iname, "packages.list")) { - return 0; - } - - char path[128]; - char *buf = dentry_path_raw(new_dentry, path, sizeof(path)); - if (IS_ERR(buf)) { - pr_err("dentry_path_raw failed.\n"); - return 0; - } - - if (!strstr(buf, "/system/packages.list")) { - return 0; - } - pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname, - new_dentry->d_iname, buf); - - track_throne(false); - + ksu_rename_observer(old_dentry, new_dentry); return 0; } -LSM_HANDLER_TYPE ksu_handle_setuid(struct cred *new, const struct cred *old) +LSM_HANDLER_TYPE ksu_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { - if (!new || !old) { - return 0; - } - - uid_t new_uid = ksu_get_uid_t(new->uid); - uid_t old_uid = ksu_get_uid_t(old->uid); - - // old process is not root, ignore it. 
- if (0 != old_uid) - return 0; - - // we dont have those new fancy things upstream has - // lets just do original thing where we disable seccomp - if (likely(ksu_is_manager_appid_valid()) && unlikely(ksu_get_manager_appid() == new_uid % PER_USER_RANGE)) { - disable_seccomp(); - pr_info("install fd for: %d\n", new_uid); - ksu_install_fd(); // install fd for ksu manager - } + // see sys_setresuid + if (flags == LSM_SETID_RES) + ksu_handle_setresuid_cred(new, old); - if (unlikely(ksu_is_allow_uid_for_current(new_uid))) { - disable_seccomp(); - return 0; - } - - return ksu_handle_umount(new, old); + return 0; } LSM_HANDLER_TYPE ksu_bprm_check(struct linux_binprm *bprm) { + #ifdef CONFIG_KSU_FEATURE_SULOG - if (unlikely(!current->seccomp.mode)) - ksu_sulog_emit_bprm((const char *)bprm->filename); + ksu_sulog_emit_bprm((const char *)bprm->filename); #endif - if (likely(!ksu_execveat_hook)) - return 0; - - ksu_grab_init_session_keyring((const char *)bprm->filename); - - ksu_handle_pre_ksud((const char *)bprm->filename); - return 0; } LSM_HANDLER_TYPE ksu_file_permission(struct file *file, int mask) { - if (likely(!ksu_vfs_read_hook)) - return 0; - - ksu_install_rc_hook(file); +#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) +#ifdef KSU_CAN_USE_JUMP_LABEL + if (static_branch_likely(&ksud_vfs_read_key)) + ksu_install_rc_hook(file); +#else + if (unlikely(ksu_vfs_read_hook)) + ksu_install_rc_hook(file); +#endif +#endif return 0; } #ifdef CONFIG_KSU_LSM_SECURITY_HOOKS -static int ksu_inode_rename(struct inode *old_inode, struct dentry *old_dentry, - struct inode *new_inode, struct dentry *new_dentry) -{ - return ksu_handle_rename(old_dentry, new_dentry); -} - -static int ksu_task_fix_setuid(struct cred *new, const struct cred *old, - int flags) -{ - return ksu_handle_setuid(new, old); -} - #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) -static struct security_hook_list ksu_hooks[] = { +static struct security_hook_list ksu_hooks[] __ro_after_init = { 
LSM_HOOK_INIT(inode_rename, ksu_inode_rename), LSM_HOOK_INIT(task_fix_setuid, ksu_task_fix_setuid), +#ifdef CONFIG_KSU_FEATURE_SULOG LSM_HOOK_INIT(bprm_check_security, ksu_bprm_check), -#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) && !defined(CONFIG_KSU_KPROBES_KSUD) +#endif +#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) LSM_HOOK_INIT(file_permission, ksu_file_permission), #endif }; @@ -128,19 +63,18 @@ static void ksu_lsm_hook_init(void) { security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), "ksu"); } - #else static void ksu_lsm_hook_init(void) { security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks)); } -#endif // < 4.11 +#endif -#else // 4.2 +#else /* < 4.2, LSM */ // selinux_ops (LSM), security_operations struct tampering for ultra legacy -extern struct security_operations selinux_ops; +static uintptr_t selinux_ops_addr = NULL; static int (*orig_inode_rename) (struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) = NULL; @@ -173,78 +107,261 @@ static int hook_file_permission(struct file *file, int mask) return orig_file_permission(file, mask); } -static void ksu_lsm_hook_restore(void) +static inline bool verify_selinux_cred_free(void *fn_ptr) { - struct security_operations *ops = (struct security_operations *)&selinux_ops; + bool success = false; - if (!ops) - return; + if (!fn_ptr) + return false; - if (!!strcmp((char *)ops, "selinux")) - return; + // ref: https://elixir.bootlin.com/linux/v3.18.140/source/security/selinux/hooks.c#L3474 + void (*selinux_cred_free_fn)(struct cred *) = fn_ptr; - // TODO: maybe hunt for this in memory instead of exporting - // this is the first member of the struct so it points to the struct - pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops ); + struct cred dummy_cred; - preempt_disable(); - local_irq_disable(); + // explicitly set it to NULL + // make sure this happens! + // #1. it wont trigger BUG_ON + // #2. 
this way it will kfree(NULL), which does nothing + *(volatile void **)&dummy_cred.security = NULL; + barrier(); -#ifndef CONFIG_KSU_FEATURE_SULOG - if (orig_bprm_check_security) { - pr_info("%s: restoring: 0x%lx to 0x%lx\n", __func__, (long)ops->bprm_check_security, (long)orig_bprm_check_security); - ops->bprm_check_security = orig_bprm_check_security; - } -#endif + selinux_cred_free_fn(&dummy_cred); - if (orig_file_permission) { - pr_info("%s: restoring: 0x%lx to 0x%lx\n", __func__, (long)ops->file_permission, (long)orig_file_permission); - ops->file_permission = orig_file_permission; - } + // check if selinux_cred_free is successful + if ((unsigned long)*(volatile void **)&dummy_cred.security == 0x7UL) + success = true; - smp_mb(); + pr_info("selinux_cred_free: 0x%lx cred->security: 0x%lx success: %d\n", (unsigned long)fn_ptr, (unsigned long)dummy_cred.security, success); - local_irq_enable(); - preempt_enable(); - - return; + return success; } -static int execveat_hook_wait_fn(void *data) +// we should see a lot of pointers that is inside stext && etext +// basically we check for "pointer density" +static inline bool is_selinux_ops_valid(uintptr_t addr) { -loop_start: + extern char _stext[], _etext[]; + int total_slots = sizeof(struct security_operations) / sizeof(void *); + int valid_ptr = 0; + int i = 0; - msleep(1000); + uintptr_t member_ptr = 0; + uintptr_t current_slot_addr; - if ((volatile bool)ksu_execveat_hook) - goto loop_start; + // we will be off by one or off by two due to sizeof("selinux") + // thats 8 bytes, on 32 bit, this is two pointers worth, not a big deal - ksu_lsm_hook_restore(); +density_verify_start: + current_slot_addr = addr + (i * sizeof(void *)); - return 0; + member_ptr = 0; + if (copy_from_kernel_nofault(&member_ptr, (void *)current_slot_addr, sizeof(uintptr_t) )) + goto next_iter; // if it fails, just try next slot + + // give up early + if (!valid_ptr && i >= 20) + return false; + + // pr_info("%s: member_ptr: 0x%lx \n", __func__, 
(long)member_ptr); + if (member_ptr >= (uintptr_t)_stext && member_ptr <= (uintptr_t)_etext) + valid_ptr++; + +next_iter: + i++; + if (i < total_slots) + goto density_verify_start; + + pr_info("%s: density: valid: %lu slots: %lu \n", __func__, valid_ptr, total_slots); + + // maybe increase to 75% or something? + return (valid_ptr > (total_slots / 2)); } -static void execveat_hook_wait_thread() +static inline bool check_candidate(uintptr_t addr) { - kthread_run(execveat_hook_wait_fn, NULL, "unhook"); + struct security_operations *candidate = (struct security_operations *)addr; + + char char_buf[sizeof("selinux")] = { 0 }; + + if (copy_from_kernel_nofault(char_buf, (void *)addr, sizeof("selinux") )) + return false; + + if (!!memcmp(char_buf, "selinux", sizeof("selinux"))) + return false; + + // candidate found! + pr_info("%s: candidate selinux_ops at 0x%lx\n", __func__, (long)addr); + + // check ptr density + if (!is_selinux_ops_valid(addr)) + return false; + + if (!candidate->cred_free) + return false; + +#ifdef CONFIG_KALLSYMS // not always available, can also fail, but it wont hurt to try. + uintptr_t ksym_ptr = (uintptr_t)kallsyms_lookup_name("selinux_cred_free"); + if (unlikely(ksym_ptr != (uintptr_t)candidate->cred_free)) + goto test_fn; + + pr_info("%s: selinux_cred_free found via ksym_lookup: 0x%lx probe_result: 0x%lx \n", __func__, (long)ksym_ptr, (long)candidate->cred_free); + return true; + +test_fn: +#endif + + pr_info("%s: candidate selinux_cred_free at 0x%lx\n", __func__, (long)candidate->cred_free); + return verify_selinux_cred_free((void *)candidate->cred_free); } -static void ksu_lsm_hook_init(void) +/** + * we do this in blocks of sequential 10k pointers. + * 10k pointers up, 10k pointers down + * this is predictable, more cache friendly, no trashing. + * + * one up, one down oscillating scan isn't as friendly to teh cahce. + * once ptrdiff of up vs down is larger than L1, it will be trashy. 
+ * + */ +static noinline void *hunt_for_selinux_ops(void *heuristic_ptr) { - struct security_operations *ops = (struct security_operations *)&selinux_ops; + uintptr_t anchor = (uintptr_t)heuristic_ptr; + uintptr_t curr; + unsigned long iter_count = 0; + unsigned long max_index = 10000; // max number of pointers to test, one way + unsigned long i = 0; + + uintptr_t start = anchor - max_index * sizeof(void *); + uintptr_t end = anchor + max_index * sizeof(void *); + pr_info("%s: scan range: 0x%lx - 0x%lx anchor: 0x%lx\n", __func__, (long)start, (long)end, (long)anchor); + +scan_up: + if (i >= max_index) { + i = 1; + goto scan_down; + } + + curr = anchor + (i * sizeof(void *)); + i++; + iter_count++; + + if (check_candidate(curr)) + goto found; + + goto scan_up; + +scan_down: + if (i >= max_index) + goto not_found; + + curr = anchor - (i * sizeof(void *)); + i++; + iter_count++; + + if (check_candidate(curr)) + goto found; + + goto scan_down; + +found: + pr_info("%s: found selinux_ops at 0x%lx iter_count: %lu \n", __func__, curr, iter_count); + return (void *)curr; + +not_found: + pr_info("%s: selinux_ops not found in range! iter_count: %lu \n", __func__, iter_count); + return NULL; +} + +static inline void set_selinux_ops() +{ + extern int selinux_enabled; + extern struct security_class_mapping secclass_map[]; + extern struct list_head crypto_alg_list; + extern unsigned int avc_cache_threshold; + + struct security_operations *ops = NULL; + +// if user exports selinux_ops, we just go for it! +#ifdef KSU_HAS_EXPORTED_SELINUX_OPS + extern struct security_operations selinux_ops; + if (!ops) + ops = (struct security_operations *)&selinux_ops; +#endif + +// not always available, can also fail, but it wont hurt to try. 
+#ifdef CONFIG_KALLSYMS + if (!ops) + ops = (struct security_operations *)kallsyms_lookup_name("selinux_ops"); +#endif + +#ifdef CONFIG_KEYS + extern struct key_user root_key_user; + if (!ops) + ops = (struct security_operations *)hunt_for_selinux_ops((void *)&root_key_user); +#endif + + if (!ops) + ops = (struct security_operations *)hunt_for_selinux_ops((void *)&avc_cache_threshold); + + if (!ops) + ops = (struct security_operations *)hunt_for_selinux_ops((void *)&crypto_alg_list); + + if (!ops) + ops = (struct security_operations *)hunt_for_selinux_ops((void *)&selinux_enabled); + + if (!ops) + ops = (struct security_operations *)hunt_for_selinux_ops((void *)&secclass_map); if (!ops) return; + selinux_ops_addr = (uintptr_t)ops; +} + +// stop_machine +static int ksu_unregister_lsm_hook(void *data) +{ + struct security_operations *ops = (struct security_operations *)selinux_ops_addr; + + if (orig_file_permission) { + pr_info("%s: restoring file_permission 0x%lx -> 0x%lx\n", __func__, (long)ops->file_permission, (long)orig_file_permission); + ops->file_permission = orig_file_permission; + } + + return 0; +} + +static int ksu_lsm_hook_restore(void *data) +{ + struct security_operations *ops = (struct security_operations *)selinux_ops_addr; + if (!ops) + return 0; + if (!!strcmp((char *)ops, "selinux")) - return; + return 0; + +loop_start: + + msleep(1000); + + if (*(volatile bool *)&ksu_vfs_read_hook) + goto loop_start; - // TODO: maybe hunt for this in memory instead of exporting - // this is the first member of the struct so it points to the struct pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops ); - preempt_disable(); - local_irq_disable(); + stop_machine(ksu_unregister_lsm_hook, NULL, NULL); + + return 0; +} + +// stop_machine +static int ksu_register_lsm_hook(void *data) +{ + struct security_operations *ops = (struct security_operations *)selinux_ops_addr; + + orig_bprm_set_creds = ops->bprm_set_creds; + ops->bprm_set_creds 
= hook_bprm_set_creds; orig_inode_rename = ops->inode_rename; ops->inode_rename = hook_inode_rename; @@ -252,27 +369,53 @@ static void ksu_lsm_hook_init(void) orig_task_fix_setuid = ops->task_fix_setuid; ops->task_fix_setuid = hook_task_fix_setuid; +#ifdef CONFIG_KSU_FEATURE_SULOG orig_bprm_check_security = ops->bprm_check_security; ops->bprm_check_security = hook_bprm_check_security; +#endif -#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) && !defined(CONFIG_KSU_KPROBES_KSUD) +#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) orig_file_permission = ops->file_permission; ops->file_permission = hook_file_permission; #endif - smp_mb(); + return 0; +} + +static void ksu_lsm_hook_init(void) +{ + set_selinux_ops(); + + struct security_operations *ops = (struct security_operations *)selinux_ops_addr; + if (!ops) + return; + + if (!!strcmp((char *)ops, "selinux")) + return; + + pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops ); - local_irq_enable(); - preempt_enable(); + stop_machine(ksu_register_lsm_hook, NULL, NULL); - execveat_hook_wait_thread(); + kthread_run(ksu_lsm_hook_restore, NULL, "unhook"); return; } #endif // < 4.2 -#else -void __init ksu_lsm_hook_init(void) { } // nothing, no-op +#else /* ! CONFIG_KSU_LSM_SECURITY_HOOKS */ +// TEMP hooks, remove this in a month. 
+int ksu_handle_setuid(struct cred *new, const struct cred *old) +{ + ksu_handle_setresuid_cred(new, old); + return 0; +} +int ksu_handle_rename(struct dentry *old_dentry, struct dentry *new_dentry) +{ + ksu_rename_observer(old_dentry, new_dentry); + return 0; +} +static inline void ksu_lsm_hook_init(void) { } // nothing, no-op #endif // CONFIG_KSU_LSM_SECURITY_HOOKS void __init ksu_core_init(void) diff --git a/drivers/kernelsu/hook/kp_ksud.c b/drivers/kernelsu/hook/kp_ksud.c index 39c2e654eff5..1eb73dba0b6e 100644 --- a/drivers/kernelsu/hook/kp_ksud.c +++ b/drivers/kernelsu/hook/kp_ksud.c @@ -3,77 +3,25 @@ // sys_newfstat rp // upstream: https://github.com/tiann/KernelSU/commit/df640917d11dd0eff1b34ea53ec3c0dc49667002 -// this is a bit different from copy_from_user_retry -// here we just enable preempt and try again -// we use this inside context that can't sleep -static __always_inline long ksu_copy_from_user_fuck_faults(void *to, const void __user *from, unsigned long count) -{ - long ret = copy_from_user_nofault(to, from, count); - if (likely(!ret)) - return ret; - - bool got_flipped = false; - if (!preemptible()) { - preempt_enable(); - got_flipped = true; - } - - ret = copy_from_user(to, from, count); - - if (got_flipped) - preempt_disable(); - - return ret; -} - static int sys_newfstat_handler_pre(struct kretprobe_instance *p, struct pt_regs *regs) { struct pt_regs *real_regs = PT_REAL_REGS(regs); - unsigned int fd = PT_REGS_PARM1(real_regs); - void *statbuf = PT_REGS_PARM2(real_regs); - *(void **)&p->data = NULL; - - if (!is_init(current_cred())) - return 0; - - struct file *file = fget(fd); - if (!file) - return 0; - if (is_init_rc(file)) { - pr_info("kp_ksud: newfstat: stat init.rc \n"); - fput(file); - *(void **)&p->data = statbuf; - return 0; - } - fput(file); + // grab ptr on entry + uintptr_t *arg = (uintptr_t *)p->data; + arg[0] = (uintptr_t)PT_REGS_PARM1(regs); + arg[1] = (uintptr_t)PT_REGS_PARM2(regs); return 0; } static int 
sys_newfstat_handler_post(struct kretprobe_instance *p, struct pt_regs *regs) { - void __user *statbuf = *(void **)&p->data; - if (!statbuf) - return 0; + uintptr_t *arg = (uintptr_t *)p->data; + unsigned int fd = (unsigned int)arg[0]; + struct stat __user *statbuf = (struct stat __user *)arg[1]; - void __user *st_size_ptr = statbuf + offsetof(struct stat, st_size); - long size, new_size; - - if (ksu_copy_from_user_fuck_faults(&size, st_size_ptr, sizeof(long))) { - pr_info("kp_ksud: sys_newfstat: read statbuf 0x%lx failed \n", (unsigned long)st_size_ptr); - return 0; - } - - new_size = size + ksu_rc_len; - pr_info("kp_ksud: sys_newfstat: adding ksu_rc_len: %ld -> %ld \n", size, new_size); - - // I do NOT think this matters much for now, we can use copy_to_user - // if SHTF then we backport cope_to_user_nofault - if (!copy_to_user(st_size_ptr, &new_size, sizeof(long))) - pr_info("kp_ksud: sys_newfstat: added ksu_rc_len \n"); - else - pr_info("kp_ksud: sys_newfstat: add ksu_rc_len failed: statbuf 0x%lx \n", (unsigned long)st_size_ptr); + ksu_handle_newfstat_ret(&fd, &statbuf); return 0; } @@ -82,58 +30,29 @@ static struct kretprobe sys_newfstat_rp = { .kp.symbol_name = SYS_NEWFSTAT_SYMBOL, .entry_handler = sys_newfstat_handler_pre, .handler = sys_newfstat_handler_post, - .data_size = sizeof(void *), + .data_size = sizeof(uintptr_t) * 2, // int + ptr, should fit }; #if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) static int sys_fstat64_handler_pre(struct kretprobe_instance *p, struct pt_regs *regs) { struct pt_regs *real_regs = PT_REAL_REGS(regs); - unsigned long fd = PT_REGS_PARM1(real_regs); // long, but I don't think it matters. - void *statbuf = PT_REGS_PARM2(real_regs); - *(void **)&p->data = NULL; - if (!is_init(current_cred())) - return 0; - - // WARNING: LE-only!!! 
- struct file *file = fget(*(unsigned int *)&fd); - if (!file) - return 0; - - if (is_init_rc(file)) { - pr_info("kp_ksud: fstat64: stat init.rc \n"); - fput(file); - *(void **)&p->data = statbuf; - return 0; - } - fput(file); + // grab ptr on entry + uintptr_t *arg = (uintptr_t *)p->data; + arg[0] = (uintptr_t)PT_REGS_PARM1(regs); + arg[1] = (uintptr_t)PT_REGS_PARM2(regs); return 0; } static int sys_fstat64_handler_post(struct kretprobe_instance *p, struct pt_regs *regs) { - void __user *statbuf = *(void **)&p->data; - if (!statbuf) - return 0; - - // compat_stat - void __user *st_size_ptr = statbuf + offsetof(struct stat64, st_size); - long size, new_size; - - if (ksu_copy_from_user_fuck_faults(&size, st_size_ptr, sizeof(long long))) { - pr_info("kp_ksud: sys_fstat64: read statbuf 0x%lx failed \n", (unsigned long)st_size_ptr); - return 0; - } - - new_size = size + ksu_rc_len; - pr_info("kp_ksud: sys_fstat64: adding ksu_rc_len: %ld -> %ld \n", size, new_size); + uintptr_t *arg = (uintptr_t *)p->data; + unsigned long fd = (unsigned long)arg[0]; + struct stat64 __user *statbuf = (struct stat64 __user *)arg[1]; - if (!copy_to_user(st_size_ptr, &new_size, sizeof(long))) - pr_info("kp_ksud: sys_fstat64: added ksu_rc_len \n"); - else - pr_info("kp_ksud: sys_fstat64: add ksu_rc_len failed: statbuf 0x%lx \n", (unsigned long)st_size_ptr); + ksu_handle_fstat64_ret(&fd, &statbuf); return 0; } @@ -142,37 +61,44 @@ static struct kretprobe sys_fstat64_rp = { .kp.symbol_name = SYS_FSTAT64_SYMBOL, .entry_handler = sys_fstat64_handler_pre, .handler = sys_fstat64_handler_post, - .data_size = sizeof(void *), + .data_size = sizeof(uintptr_t) * 2, // long + ptr, should fit }; #endif -// sys_read -static int sys_read_handler_pre(struct kprobe *p, struct pt_regs *regs) -{ - struct pt_regs *real_regs = PT_REAL_REGS(regs); - unsigned int fd = (int)PT_REGS_PARM1(real_regs); - - ksu_handle_sys_read_fd(fd); - return 0; -} - -static struct kprobe sys_read_kp = { - .symbol_name = 
SYS_READ_SYMBOL, - .pre_handler = sys_read_handler_pre, -}; - // sys_reboot -extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg); - static int sys_reboot_handler_pre(struct kprobe *p, struct pt_regs *regs) { struct pt_regs *real_regs = PT_REAL_REGS(regs); - int magic1 = (int)PT_REGS_PARM1(real_regs); + int *magic1 = (int *)&PT_REGS_PARM1(real_regs); // ptr so we can mutate this int magic2 = (int)PT_REGS_PARM2(real_regs); int cmd = (int)PT_REGS_PARM3(real_regs); void __user **arg = (void __user **)&PT_REGS_SYSCALL_PARM4(real_regs); - return ksu_handle_sys_reboot(magic1, magic2, cmd, arg); + if (*magic1 != KSU_INSTALL_MAGIC1) + return 0; + + // HACK: flip preempt status inside kp + // checking not really needed but its cool + bool got_flipped = false; + if (likely(!preemptible())) { + preempt_enable(); + got_flipped = true; + } + + // jack priority in illeggal state + int old_nice = task_nice(current); + set_user_nice(current, -10); + + ksu_handle_sys_reboot(*magic1, magic2, cmd, arg); + set_user_nice(current, old_nice); + + if (got_flipped) + preempt_disable(); + + // to prevent double hooking + *magic1 = 0; + + return 0; } static struct kprobe sys_reboot_kp = { @@ -182,11 +108,13 @@ static struct kprobe sys_reboot_kp = { static int unregister_kprobe_function(void *data) { + set_user_nice(current, 19); // low prio + loop_start: msleep(1000); - if ((volatile bool)ksu_execveat_hook) + if (*(volatile bool *)&ksu_vfs_read_hook) goto loop_start; pr_info("kp_ksud: unregistering kprobes...\n"); @@ -199,17 +127,9 @@ static int unregister_kprobe_function(void *data) pr_info("kp_ksud: unregister sys_fstat64_rp!\n"); #endif - unregister_kprobe(&sys_read_kp); - pr_info("kp_ksud: unregister sys_read_kp!\n"); - return 0; } -static void unregister_kprobe_thread() -{ - kthread_run(unregister_kprobe_function, NULL, "kp_unreg"); -} - static void kp_ksud_init() { @@ -224,8 +144,5 @@ static void kp_ksud_init() pr_info("kp_ksud: sys_fstat64_rp: 
%d\n", ret3); #endif - int ret4 = register_kprobe(&sys_read_kp); - pr_info("kp_ksud: sys_read_kp: %d\n", ret4); - - unregister_kprobe_thread(); + kthread_run(unregister_kprobe_function, NULL, "kp_unreg"); } diff --git a/drivers/kernelsu/hook/setuid_hook.c b/drivers/kernelsu/hook/setuid_hook.c new file mode 100644 index 000000000000..2c0aeab247ae --- /dev/null +++ b/drivers/kernelsu/hook/setuid_hook.c @@ -0,0 +1,35 @@ +static __always_inline void ksu_handle_setresuid_cred(struct cred *new, const struct cred *old) +{ + if (!new || !old) + return; + + uid_t new_uid = ksu_get_uid_t(new->uid); + uid_t old_uid = ksu_get_uid_t(old->uid); + + // old process is not root, ignore it. + if (unlikely(!!old_uid)) + return; + + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("handle_setresuid from %d to %d\n", old_uid, new_uid); + + // we dont have those new fancy things upstream has + // lets just do the original thing where we disable seccomp + if (unlikely(is_uid_manager(new_uid))) + goto install_ksu_fd; + + if (ksu_is_allow_uid_for_current(new_uid)) + goto kill_seccomp; + + // Handle kernel umount + ksu_handle_umount(new, old); + return; + +install_ksu_fd: + pr_info("install fd for manager: %d\n", new_uid); + ksu_install_fd(); + +kill_seccomp: + disable_seccomp(); + return; +} diff --git a/drivers/kernelsu/hook/syscall_table_hook_arm.c b/drivers/kernelsu/hook/syscall_table_hook_arm.c index 6cabc75bf3f1..105e2adbfe01 100644 --- a/drivers/kernelsu/hook/syscall_table_hook_arm.c +++ b/drivers/kernelsu/hook/syscall_table_hook_arm.c @@ -6,8 +6,6 @@ // ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd32.h // ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd.h -#define FORCE_VOLATILE(x) *(volatile typeof(x) *)&(x) - #define __ARMEABI_reboot 88 #define __ARMEABI_execve 11 #define __ARMEABI_faccessat 334 @@ -20,7 +18,7 @@ // on 4.19+ its is no longer just a void *sys_call_table[] // it becomes syscall_fn_t sys_call_table[]; 
-static syscall_fn_t armeabi_reboot = NULL; +static syscall_fn_t armeabi_reboot __read_mostly = NULL; static long hook_armeabi_reboot(const struct pt_regs *regs) { int magic1 = (int)regs->regs[0]; @@ -32,7 +30,8 @@ static long hook_armeabi_reboot(const struct pt_regs *regs) return armeabi_reboot(regs); } -static syscall_fn_t armeabi_execve = NULL; +static syscall_fn_t armeabi_execve __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_execve(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[0]; @@ -42,7 +41,8 @@ static long hook_armeabi_execve(const struct pt_regs *regs) return armeabi_execve(regs); } -static syscall_fn_t armeabi_faccessat = NULL; +static syscall_fn_t armeabi_faccessat __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_faccessat(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[1]; @@ -51,7 +51,8 @@ static long hook_armeabi_faccessat(const struct pt_regs *regs) return armeabi_faccessat(regs); } -static syscall_fn_t armeabi_fstatat64 = NULL; +static syscall_fn_t armeabi_fstatat64 __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_fstatat64(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[1]; @@ -60,7 +61,8 @@ static long hook_armeabi_fstatat64(const struct pt_regs *regs) return armeabi_fstatat64(regs); } -static syscall_fn_t armeabi_fstat64 = NULL; +static syscall_fn_t armeabi_fstat64 __read_mostly = NULL; +__attribute__((cold)) static long hook_armeabi_fstat64_ret(const struct pt_regs *regs) { // we handle it like rp @@ -72,7 +74,8 @@ static long hook_armeabi_fstat64_ret(const struct pt_regs *regs) return ret; } -static syscall_fn_t armeabi_read = NULL; +static syscall_fn_t armeabi_read __read_mostly = NULL; +__attribute__((cold)) static long hook_armeabi_read(const struct pt_regs *regs) { unsigned int fd = (unsigned int)regs->regs[0]; @@ -83,7 +86,7 @@ static long
hook_armeabi_read(const struct pt_regs *regs) #else // END OF 4.19+ SYSCALL HANDLERS -static long (*armeabi_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) = NULL; +static long (*armeabi_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) __read_mostly = NULL; static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) { ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); @@ -92,7 +95,8 @@ static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void _ static long (*armeabi_execve)(const char __user * filename, const char __user *const __user * argv, - const char __user *const __user * envp) = NULL; + const char __user *const __user * envp) __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_execve(const char __user * filename, const char __user *const __user * argv, const char __user *const __user * envp) @@ -101,21 +105,24 @@ static long hook_armeabi_execve(const char __user * filename, return armeabi_execve(filename, argv, envp); } -static long (*armeabi_faccessat)(int dfd, const char __user * filename, int mode) = NULL; +static long (*armeabi_faccessat)(int dfd, const char __user * filename, int mode) __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode) { ksu_handle_faccessat(&dfd, &filename, &mode, NULL); return armeabi_faccessat(dfd, filename, mode); } -static long (*armeabi_fstatat64)(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) = NULL; +static long (*armeabi_fstatat64)(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) { ksu_handle_stat(&dfd, &filename, &flag); return armeabi_fstatat64(dfd, filename, statbuf, flag); } -static long (*armeabi_fstat64)(unsigned long 
fd, struct stat64 __user * statbuf) = NULL; +static long (*armeabi_fstat64)(unsigned long fd, struct stat64 __user * statbuf) __read_mostly = NULL; +__attribute__((cold)) static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf) { // we handle it like rp @@ -124,7 +131,8 @@ static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * st return ret; } -static long (*armeabi_read)(unsigned int fd, char __user *buf, size_t count) = NULL; +static long (*armeabi_read)(unsigned int fd, char __user *buf, size_t count) __read_mostly = NULL; +__attribute__((cold)) static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count) { ksu_handle_sys_read_fd(fd); @@ -202,8 +210,6 @@ static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, vo smp_mb(); } -extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size); - static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) { void **sctable = (void **)target_table; @@ -284,24 +290,21 @@ static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_p static int ksu_syscall_table_restore() { + set_user_nice(current, 19); // low prio + loop_start: msleep(1000); - if (FORCE_VOLATILE(ksu_vfs_read_hook)) + if (*(volatile bool *)&ksu_vfs_read_hook) goto loop_start; restore_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); - restore_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table); + restore_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)sys_call_table); return 0; } -static void vfs_read_hook_wait_thread() -{ - kthread_run(ksu_syscall_table_restore, NULL, "unhook"); -} - static void ksu_syscall_table_hook_init() { @@ -312,9 +315,10 @@ static void ksu_syscall_table_hook_init() // will be unregged read_and_replace_syscall((void 
*)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); - read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table); + read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)sys_call_table); - vfs_read_hook_wait_thread(); // start unreg kthread + // start unreg kthread + kthread_run(ksu_syscall_table_restore, NULL, "unhook"); } static DEFINE_MUTEX(sucompat_toggle_mutex); diff --git a/drivers/kernelsu/hook/syscall_table_hook_arm64.c b/drivers/kernelsu/hook/syscall_table_hook_arm64.c index 232cb16e18d8..34e316d01105 100644 --- a/drivers/kernelsu/hook/syscall_table_hook_arm64.c +++ b/drivers/kernelsu/hook/syscall_table_hook_arm64.c @@ -6,8 +6,6 @@ // ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd32.h // ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd.h -#define FORCE_VOLATILE(x) *(volatile typeof(x) *)&(x) - #define __AARCH64_reboot 142 #define __AARCH64_execve 221 #define __AARCH64_faccessat 48 @@ -28,7 +26,7 @@ // on 4.19+ its is no longer just a void *sys_call_table[] // it becomes syscall_fn_t sys_call_table[]; -static syscall_fn_t aarch64_reboot = NULL; +static syscall_fn_t aarch64_reboot __read_mostly = NULL; static long hook_aarch64_reboot(const struct pt_regs *regs) { int magic1 = (int)regs->regs[0]; @@ -40,7 +38,8 @@ static long hook_aarch64_reboot(const struct pt_regs *regs) return aarch64_reboot(regs); } -static syscall_fn_t aarch64_execve = NULL; +static syscall_fn_t aarch64_execve __read_mostly = NULL; +__attribute__((hot)) static long hook_aarch64_execve(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[0]; @@ -50,7 +49,8 @@ static long hook_aarch64_execve(const struct pt_regs *regs) return aarch64_execve(regs); } -static syscall_fn_t aarch64_faccessat = NULL; +static syscall_fn_t
aarch64_faccessat __read_mostly = NULL; +__attribute__((hot)) static long hook_aarch64_faccessat(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[1]; @@ -59,7 +59,8 @@ static long hook_aarch64_faccessat(const struct pt_regs *regs) return aarch64_faccessat(regs); } -static syscall_fn_t aarch64_newfstatat = NULL; +static syscall_fn_t aarch64_newfstatat __read_mostly = NULL; +__attribute__((hot)) static long hook_aarch64_newfstatat(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[1]; @@ -68,7 +69,8 @@ static long hook_aarch64_newfstatat(const struct pt_regs *regs) return aarch64_newfstatat(regs); } -static syscall_fn_t aarch64_newfstat = NULL; +static syscall_fn_t aarch64_newfstat __read_mostly = NULL; +__attribute__((cold)) static long hook_aarch64_newfstat_ret(const struct pt_regs *regs) { // we handle it like rp @@ -80,7 +82,8 @@ static long hook_aarch64_newfstat_ret(const struct pt_regs *regs) return ret; } -static syscall_fn_t aarch64_read = NULL; +static syscall_fn_t aarch64_read __read_mostly = NULL; +__attribute__((cold)) static long hook_aarch64_read(const struct pt_regs *regs) { unsigned int fd = (unsigned int)regs->regs[0]; @@ -90,7 +93,7 @@ static long hook_aarch64_read(const struct pt_regs *regs) } #ifdef CONFIG_COMPAT -static syscall_fn_t armeabi_reboot = NULL; +static syscall_fn_t armeabi_reboot __read_mostly = NULL; static long hook_armeabi_reboot(const struct pt_regs *regs) { int magic1 = (int)regs->regs[0]; @@ -102,7 +105,8 @@ static long hook_armeabi_reboot(const struct pt_regs *regs) return armeabi_reboot(regs); } -static syscall_fn_t armeabi_execve = NULL; +static syscall_fn_t armeabi_execve __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_execve(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[0]; @@ -112,7 +116,8 @@ static long hook_armeabi_execve(const struct pt_regs *regs) return
armeabi_execve(regs); } -static syscall_fn_t armeabi_faccessat = NULL; +static syscall_fn_t armeabi_faccessat __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_faccessat(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[1]; @@ -121,7 +126,8 @@ static long hook_armeabi_faccessat(const struct pt_regs *regs) return armeabi_faccessat(regs); } -static syscall_fn_t armeabi_fstatat64 = NULL; +static syscall_fn_t armeabi_fstatat64 __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_fstatat64(const struct pt_regs *regs) { const char __user **filename = (const char __user **)&regs->regs[1]; @@ -130,7 +136,8 @@ static long hook_armeabi_fstatat64(const struct pt_regs *regs) return armeabi_fstatat64(regs); } -static syscall_fn_t armeabi_fstat64 = NULL; +static syscall_fn_t armeabi_fstat64 __read_mostly = NULL; +__attribute__((cold)) static long hook_armeabi_fstat64_ret(const struct pt_regs *regs) { // we handle it like rp @@ -142,7 +149,8 @@ static long hook_armeabi_fstat64_ret(const struct pt_regs *regs) return ret; } -static syscall_fn_t armeabi_read = NULL; +static syscall_fn_t armeabi_read __read_mostly = NULL; +__attribute__((cold)) static long hook_armeabi_read(const struct pt_regs *regs) { unsigned int fd = (unsigned int)regs->regs[0]; @@ -155,7 +163,7 @@ static long hook_armeabi_read(const struct pt_regs *regs) #else // END OF 4.19+ SYSCALL HANDLERS -static long (*aarch64_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) = NULL; +static long (*aarch64_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) __read_mostly = NULL; static long hook_aarch64_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) { ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); @@ -164,7 +172,8 @@ static long hook_aarch64_reboot(int magic1, int magic2, unsigned int cmd, void _ static long (*aarch64_execve)(const char __user * filename, const char __user *const __user * argv,
- const char __user *const __user * envp) = NULL; + const char __user *const __user * envp) __read_mostly = NULL; +__attribute__((hot)) static long hook_aarch64_execve(const char __user * filename, const char __user *const __user * argv, const char __user *const __user * envp) @@ -173,21 +182,24 @@ static long hook_aarch64_execve(const char __user * filename, return aarch64_execve(filename, argv, envp); } -static long (*aarch64_faccessat)(int dfd, const char __user * filename, int mode) = NULL; +static long (*aarch64_faccessat)(int dfd, const char __user * filename, int mode) __read_mostly = NULL; +__attribute__((hot)) static long hook_aarch64_faccessat(int dfd, const char __user * filename, int mode) { ksu_handle_faccessat(&dfd, &filename, &mode, NULL); return aarch64_faccessat(dfd, filename, mode); } -static long (*aarch64_newfstatat)(int dfd, const char __user * filename, struct stat __user * statbuf, int flag) = NULL; +static long (*aarch64_newfstatat)(int dfd, const char __user * filename, struct stat __user * statbuf, int flag) __read_mostly = NULL; +__attribute__((hot)) static long hook_aarch64_newfstatat(int dfd, const char __user * filename, struct stat __user * statbuf, int flag) { ksu_handle_stat(&dfd, &filename, &flag); return aarch64_newfstatat(dfd, filename, statbuf, flag); } -static long (*aarch64_newfstat)(unsigned int fd, struct stat __user * statbuf) = NULL; +static long (*aarch64_newfstat)(unsigned int fd, struct stat __user * statbuf) __read_mostly = NULL; +__attribute__((cold)) static long hook_aarch64_newfstat_ret(unsigned int fd, struct stat __user * statbuf) { // we handle it like rp @@ -196,7 +208,8 @@ static long hook_aarch64_newfstat_ret(unsigned int fd, struct stat __user * stat return ret; } -static long (*aarch64_read)(unsigned int fd, char __user *buf, size_t count) = NULL; +static long (*aarch64_read)(unsigned int fd, char __user *buf, size_t count) __read_mostly = NULL; +__attribute__((cold)) static long hook_aarch64_read(unsigned 
int fd, char __user *buf, size_t count) { ksu_handle_sys_read_fd(fd); @@ -206,7 +219,7 @@ static long hook_aarch64_read(unsigned int fd, char __user *buf, size_t count) #ifdef CONFIG_COMPAT extern const void *compat_sys_call_table[]; -static long (*armeabi_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) = NULL; +static long (*armeabi_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) __read_mostly = NULL; static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) { ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); @@ -215,7 +228,8 @@ static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void _ static long (*armeabi_execve)(const char __user * filename, const compat_uptr_t __user * argv, - const compat_uptr_t __user * envp) = NULL; + const compat_uptr_t __user * envp) __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_execve(const char __user * filename, const compat_uptr_t __user * argv, const compat_uptr_t __user * envp) @@ -224,21 +238,24 @@ static long hook_armeabi_execve(const char __user * filename, return armeabi_execve(filename, argv, envp); } -static long (*armeabi_faccessat)(int dfd, const char __user * filename, int mode) = NULL; +static long (*armeabi_faccessat)(int dfd, const char __user * filename, int mode) __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode) { ksu_handle_faccessat(&dfd, &filename, &mode, NULL); return armeabi_faccessat(dfd, filename, mode); } -static long (*armeabi_fstatat64)(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) = NULL; +static long (*armeabi_fstatat64)(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) __read_mostly = NULL; +__attribute__((hot)) static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) { 
ksu_handle_stat(&dfd, &filename, &flag); return armeabi_fstatat64(dfd, filename, statbuf, flag); } -static long (*armeabi_fstat64)(unsigned long fd, struct stat64 __user * statbuf) = NULL; +static long (*armeabi_fstat64)(unsigned long fd, struct stat64 __user * statbuf) __read_mostly = NULL; +__attribute__((cold)) static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf) { // we handle it like rp @@ -247,7 +264,8 @@ static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * st return ret; } -static long (*armeabi_read)(unsigned int fd, char __user *buf, size_t count) = NULL; +static long (*armeabi_read)(unsigned int fd, char __user *buf, size_t count) __read_mostly = NULL; +__attribute__((cold)) static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count) { ksu_handle_sys_read_fd(fd); @@ -327,8 +345,6 @@ static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, vo smp_mb(); } -extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size); - static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) { void **sctable = (void **)target_table; @@ -409,11 +425,13 @@ static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_p static int ksu_syscall_table_restore() { + set_user_nice(current, 19); // low prio + loop_start: msleep(1000); - if (FORCE_VOLATILE(ksu_vfs_read_hook)) + if (*(volatile bool *)&ksu_vfs_read_hook) goto loop_start; restore_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table); @@ -427,11 +445,6 @@ static int ksu_syscall_table_restore() return 0; } -static void vfs_read_hook_wait_thread() -{ - kthread_run(ksu_syscall_table_restore, NULL, "unhook"); -} - static void ksu_syscall_table_hook_init() { read_and_replace_syscall((void *)&aarch64_reboot, __AARCH64_reboot, (void *)hook_aarch64_reboot, (void *)sys_call_table); @@ -455,7 
+468,8 @@ static void ksu_syscall_table_hook_init() #endif // COMPAT - vfs_read_hook_wait_thread(); // start unreg kthread + // start unreg kthread + kthread_run(ksu_syscall_table_restore, NULL, "unhook"); } static DEFINE_MUTEX(sucompat_toggle_mutex); diff --git a/drivers/kernelsu/include/arch.h b/drivers/kernelsu/include/arch.h index 78377293c5d4..c80db6632efa 100644 --- a/drivers/kernelsu/include/arch.h +++ b/drivers/kernelsu/include/arch.h @@ -22,12 +22,16 @@ #define SYS_NEWFSTAT_SYMBOL "__arm64_sys_newfstat" #define SYS_FSTAT64_SYMBOL "__arm64_sys_fstat64" #define SYS_READ_SYMBOL "__arm64_sys_read" +#define SYS_NEWFSTATAT_SYMBOL "__arm64_sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "__arm64_sys_faccessat" #else #define SYS_EXECVE_SYMBOL "sys_execve" #define SYS_REBOOT_SYMBOL "sys_reboot" #define SYS_NEWFSTAT_SYMBOL "sys_newfstat" #define SYS_FSTAT64_SYMBOL "sys_fstat64" #define SYS_READ_SYMBOL "sys_read" +#define SYS_NEWFSTATAT_SYMBOL "sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "sys_faccessat" #endif #elif defined(__arm__) @@ -62,6 +66,8 @@ #define SYS_NEWFSTAT_SYMBOL "sys_newfstat" #define SYS_FSTAT64_SYMBOL "sys_fstat64" #define SYS_READ_SYMBOL "sys_read" +#define SYS_NEWFSTATAT_SYMBOL "sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "sys_faccessat" #elif defined(__x86_64__) @@ -84,13 +90,17 @@ #define SYS_REBOOT_SYMBOL "__x64_sys_reboot" #define SYS_NEWFSTAT_SYMBOL "__x64_sys_newfstat" #define SYS_FSTAT64_SYMBOL "__ia32_compat_sys_x86_fstat64" -#define SYS_NEWFSTAT_SYMBOL "__x64_sys_read" +#define SYS_NEWFSTAT_SYMBOL "__x64_sys_newfstat" +#define SYS_NEWFSTATAT_SYMBOL "__x64_sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "__x64_sys_faccessat" #else #define SYS_EXECVE_SYMBOL "sys_execve" #define SYS_REBOOT_SYMBOL "sys_reboot" #define SYS_NEWFSTAT_SYMBOL "sys_newfstat" #define SYS_FSTAT64_SYMBOL "sys_fstat64" #define SYS_READ_SYMBOL "sys_read" +#define SYS_NEWFSTATAT_SYMBOL "sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "sys_faccessat" #endif #else diff --git 
a/drivers/kernelsu/include/ksu.h b/drivers/kernelsu/include/ksu.h index 5bc6b6c80709..17a524097535 100644 --- a/drivers/kernelsu/include/ksu.h +++ b/drivers/kernelsu/include/ksu.h @@ -1,7 +1,7 @@ #ifndef __KSU_H_KSU #define __KSU_H_KSU -#define KERNEL_SU_VERSION 32449 +#define KERNEL_SU_VERSION 32481 #define EVENT_POST_FS_DATA 1 #define EVENT_BOOT_COMPLETED 2 diff --git a/drivers/kernelsu/include/uapi/app_profile.h b/drivers/kernelsu/include/uapi/app_profile.h index 74ce7231e223..7aa29e0f6293 100644 --- a/drivers/kernelsu/include/uapi/app_profile.h +++ b/drivers/kernelsu/include/uapi/app_profile.h @@ -39,7 +39,7 @@ struct app_profile { /* this is usually the package of the app, but can be other value for special apps */ char key[KSU_MAX_PACKAGE_NAME]; - __s32 current_uid; + __s32 curr_uid; bool allow_su; union { diff --git a/drivers/kernelsu/infra/file_wrapper.c b/drivers/kernelsu/infra/file_wrapper.c index fa91276f76fe..98bb2539073a 100644 --- a/drivers/kernelsu/infra/file_wrapper.c +++ b/drivers/kernelsu/infra/file_wrapper.c @@ -434,62 +434,6 @@ static const struct dentry_operations ksu_file_wrapper_d_ops = { #define ksu_anon_inode_create_getfile_compat anon_inode_create_getfile #elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) #define ksu_anon_inode_create_getfile_compat anon_inode_getfile_secure -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) -// There is no anon_inode_create_getfile before 5.16, but it's not difficult to implement it. -// https://cs.android.com/android/kernel/superproject/+/common-android12-5.10:common/fs/anon_inodes.c;l=58-125;drc=0d34ce8aa78e38affbb501690bcabec4df88620e - -// Borrow kernel's anon_inode_mnt, so that we don't need to mount one by ourselves. 
-static struct vfsmount *anon_inode_mnt __read_mostly; - -static struct inode * -ksu_anon_inode_make_secure_inode(const char *name, const struct inode *context_inode) -{ - struct inode *inode; - - if (unlikely(!anon_inode_mnt)) { - return ERR_PTR(-ENODEV); - } - - inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); - if (IS_ERR(inode)) - return inode; - inode->i_flags &= ~S_PRIVATE; - - return inode; -} - -static struct file *ksu_anon_inode_create_getfile_compat( - const char *name, const struct file_operations *fops, void *priv, int flags, - const struct inode *context_inode) -{ - struct inode *inode; - struct file *file; - - if (fops->owner && !try_module_get(fops->owner)) - return ERR_PTR(-ENOENT); - - inode = ksu_anon_inode_make_secure_inode(name, context_inode); - if (IS_ERR(inode)) { - file = ERR_CAST(inode); - goto err; - } - - file = alloc_file_pseudo(inode, anon_inode_mnt, name, flags & (O_ACCMODE | O_NONBLOCK), fops); - if (IS_ERR(file)) - goto err_iput; - - file->f_mapping = inode->i_mapping; - - file->private_data = priv; - - return file; - -err_iput: - iput(inode); -err: - module_put(fops->owner); - return file; -} #else #define ksu_anon_inode_create_getfile_compat(a, b, c, d, e) anon_inode_getfile(a, b, c, d) #endif @@ -569,21 +513,4 @@ int ksu_install_file_wrapper(int fd) return ret; } -void __init ksu_file_wrapper_init(void) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) - static const struct file_operations tmp = { .owner = THIS_MODULE }; - struct file *dummy = anon_inode_getfile("dummy", &tmp, NULL, 0); - if (IS_ERR(dummy)) { - pr_err( - "file_wrapper: initialize anon_inode_mnt failed, can't get file: %ld\n", - PTR_ERR(dummy)); - return; - } - anon_inode_mnt = dummy->f_path.mnt; - if (unlikely(!anon_inode_mnt)) { - pr_err("file_wrapper: initialize anon_inode_mnt failed, got NULL\n"); - } - fput(dummy); -#endif -} +void __init ksu_file_wrapper_init(void) { } diff --git 
a/drivers/kernelsu/kernel_compat.c b/drivers/kernelsu/kernel_compat.c index 52214d1b2feb..26f4a9471de5 100644 --- a/drivers/kernelsu/kernel_compat.c +++ b/drivers/kernelsu/kernel_compat.c @@ -52,10 +52,21 @@ __weak int path_umount(struct path *path, int flags) } #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) || !defined(CONFIG_EXT4_FS) -__weak void ext4_unregister_sysfs(struct super_block *sb) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0) +__weak long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { - pr_info("%s: feature not implemented!\n", __func__); + // https://elixir.bootlin.com/linux/v5.2.21/source/mm/maccess.c#L27 + long ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, + (__force const void __user *)src, size); + pagefault_enable(); + set_fs(old_fs); + + return ret ? -EFAULT : 0; } #endif @@ -83,20 +94,9 @@ __weak long copy_from_user_nofault(void *dst, const void __user *src, size_t siz } #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0) -__weak long copy_from_kernel_nofault(void *dst, const void *src, size_t size) +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) || !defined(CONFIG_EXT4_FS) +__weak void ext4_unregister_sysfs(struct super_block *sb) { - // https://elixir.bootlin.com/linux/v5.2.21/source/mm/maccess.c#L27 - long ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_from_user_inatomic(dst, - (__force const void __user *)src, size); - pagefault_enable(); - set_fs(old_fs); - - return ret ? 
-EFAULT : 0; + pr_info("%s: feature not implemented!\n", __func__); } #endif diff --git a/drivers/kernelsu/kernel_compat.h b/drivers/kernelsu/kernel_compat.h index a522116f1dd4..147efae61ccf 100644 --- a/drivers/kernelsu/kernel_compat.h +++ b/drivers/kernelsu/kernel_compat.h @@ -1,10 +1,7 @@ #ifndef __KSU_H_KERNEL_COMPAT #define __KSU_H_KERNEL_COMPAT -#define ksu_get_uid_t(x) *(unsigned int *)&(x) - -#if defined(CONFIG_KEYS) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) - +#if defined(CONFIG_KEYS) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) extern int install_session_keyring_to_cred(struct cred *cred, struct key *keyring); static struct key *init_session_keyring = NULL; @@ -28,16 +25,19 @@ static inline int install_session_keyring(struct key *keyring) return commit_creds(new); } -// this is on tgcred on < 3.8 -// while we can grab that one, it seems to not actually be needed +// up to 5.1, struct key __rcu *session_keyring; /* keyring inherited over fork */ +// so we need to grab this using rcu_dereference +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +static inline struct key *ksu_get_current_session_keyring() { return rcu_dereference(current->cred->session_keyring); } +#else +static inline struct key *ksu_get_current_session_keyring() { return rcu_dereference(current->cred->tgcred->session_keyring); } +#endif + __attribute__((cold)) -static noinline void ksu_grab_init_session_keyring(const char *filename) +static noinline void ksu_grab_init_session_keyring() { if (init_session_keyring) return; - - if (!strstr(filename, "init")) - return; if (!!strcmp(current->comm, "init")) return; @@ -45,11 +45,8 @@ static noinline void ksu_grab_init_session_keyring(const char *filename) if (!!!is_init(current_cred())) return; - // thats surely some exclamation comedy - // and now we are sure that this is the key we want - // up to 5.1, struct key __rcu *session_keyring; /* keyring inherited over fork */ - // so we need 
to grab this using rcu_dereference - struct key *keyring = rcu_dereference(current->cred->session_keyring); + // now we are sure that this is the key we want + struct key *keyring = ksu_get_current_session_keyring(); if (!keyring) return; @@ -65,13 +62,15 @@ static noinline struct file *ksu_filp_open_compat(const char *filename, int flag // like allowlist write, we check for that instead. if (!(current->flags & PF_KTHREAD)) goto filp_open; - - if (!init_session_keyring) - goto filp_open; - if (current_cred()->session_keyring) + if (!!ksu_get_current_session_keyring()) + goto filp_open; + + if (!!!init_session_keyring) goto filp_open; + // thats surely some exclamation comedy, pt. 2 + // now we are sure that we need to install init keyring to current install_session_keyring(init_session_keyring); filp_open: @@ -79,33 +78,22 @@ static noinline struct file *ksu_filp_open_compat(const char *filename, int flag } #define filp_open ksu_filp_open_compat #else -static inline void ksu_grab_init_session_keyring(const char *filename) {} // no-op -#endif // KEYS && ( >= 3.8 && < 5.2 ) +static inline void ksu_grab_init_session_keyring() {} // no-op +#endif // KEYS && < 5.2 -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) -// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L418 -static noinline ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, loff_t *pos) -{ - mm_segment_t old_fs; - old_fs = get_fs(); - set_fs(get_ds()); - ssize_t result = vfs_read(p, (void __user *)buf, count, pos); - set_fs(old_fs); - return result; -} -// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L512 -static noinline ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count, loff_t *pos) -{ - mm_segment_t old_fs; - old_fs = get_fs(); - set_fs(get_ds()); - ssize_t res = vfs_write(p, (__force const char __user *)buf, count, pos); - set_fs(old_fs); - return res; -} -#define kernel_read ksu_kernel_read_compat -#define 
kernel_write ksu_kernel_write_compat -#endif // < 4.14 +#ifndef __ro_after_init +#define __ro_after_init +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) +#define d_inode(dentry) ((dentry)->d_inode) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0) && defined(CONFIG_ARM64) +#ifndef TIF_SECCOMP +#define TIF_SECCOMP 11 +#endif +#endif #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) static inline void *ksu_kvmalloc(size_t size, gfp_t flags) @@ -161,6 +149,56 @@ static inline struct file *ksu_dentry_open(const struct path *path, int flags, c #endif #endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0) && defined(CONFIG_JUMP_LABEL) +#define KSU_CAN_USE_JUMP_LABEL + +// https://elixir.bootlin.com/linux/v3.10.108/source/include/linux/jump_label.h#L211 +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) +static inline void ksu_static_key_enable(struct static_key *key) +{ + int count = atomic_read(&key->enabled); + if (!count) + static_key_slow_inc(key); +} + +static inline void ksu_static_key_disable(struct static_key *key) +{ + int count = atomic_read(&key->enabled); + if (count) + static_key_slow_dec(key); +} + +#define static_branch_enable(k) ksu_static_key_enable(k) +#define static_branch_disable(k) ksu_static_key_disable(k) + +#define static_branch_unlikely(k) static_key_false(k) +#define static_branch_likely(k) static_key_true(k) + +#ifndef DEFINE_STATIC_KEY_FALSE +#define DEFINE_STATIC_KEY_FALSE(k) struct static_key k = STATIC_KEY_INIT_FALSE +#endif + +#ifndef DEFINE_STATIC_KEY_TRUE +#define DEFINE_STATIC_KEY_TRUE(k) struct static_key k = STATIC_KEY_INIT_TRUE +#endif + +#endif // < 4.3 +#endif // >= 3.4 && CONFIG_JUMP_LABEL + +struct user_arg_ptr { +#ifdef CONFIG_COMPAT + bool is_compat; +#endif + union { + const char __user *const __user *native; +#ifdef CONFIG_COMPAT + const compat_uptr_t __user *compat; +#endif + } ptr; +}; + +extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size); + /** * ksu_copy_from_user_retry 
* try nofault copy first, if it fails, try with plain @@ -186,10 +224,13 @@ static inline void ksu_zeroed_strncpy(char *dest, const char *src, size_t count) __builtin_memset(dest, 0, count); __builtin_strncpy(dest, src, count - 1); } -#define strscpy ksu_zeroed_strncpy #define strscpy_pad ksu_zeroed_strncpy #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) +#define strscpy ksu_zeroed_strncpy +#endif + #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) #define d_is_reg(dentry) S_ISREG((dentry)->d_inode->i_mode) #endif @@ -218,11 +259,8 @@ __weak char *bin2hex(char *dst, const void *src, size_t count) #define file_inode(f) ((f)->f_path.dentry->d_inode) #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) && !defined(KSU_HAS_SELINUX_INODE) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) && !defined(CONFIG_LSM) #define selinux_inode(inode) ((inode)->i_security) -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) && !defined(KSU_HAS_SELINUX_CRED) #define selinux_cred(cred) ((cred)->security) #endif @@ -276,6 +314,62 @@ static inline u64 ksu_ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } #define untagged_addr(addr) (addr) #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) +// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L418 +static noinline ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + old_fs = get_fs(); + set_fs(get_ds()); + ssize_t result = vfs_read(p, (void __user *)buf, count, pos); + set_fs(old_fs); + return result; +} +// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L512 +static noinline ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + old_fs = get_fs(); + set_fs(get_ds()); + ssize_t res = vfs_write(p, (__force const char __user *)buf, count, pos); + set_fs(old_fs); + return res; +} +#define kernel_read ksu_kernel_read_compat +#define kernel_write 
ksu_kernel_write_compat +#endif // < 4.14 + static inline void ksu_kfree_byref(void *buf) { kfree(*(void **)buf); } -#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 9, 0) +// hashtable.h, list.h, rculist.h +// ref: https://github.com/torvalds/linux/commit/b67bfe0d42cac56c512dd5da4b1b347a23f4b70a +#include "linux_hashtable.h" +static inline int __must_check ksu_kref_get_unless_zero(struct kref *kref) +{ + return atomic_add_unless(&kref->refcount, 1, 0); +} +#define kref_get_unless_zero ksu_kref_get_unless_zero +#endif // < 3.9 + +/** + * kver agnostic workaround for < 3.14's CONFIG_UIDGID_STRICT_TYPE_CHECKS=n + * + * - force dereferences an unsigned int (uid_t) + * - redefines current_uid / current_euid macros + * + * ref + * - https://elixir.bootlin.com/linux/v3.13/source/include/linux/uidgid.h + * - https://elixir.bootlin.com/linux/v3.13/source/include/linux/cred.h#L331 + */ +#define ksu_get_uid_t(x) *(unsigned int *)&(x) + +#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 14, 0) +#undef current_uid +#undef current_euid +typedef struct { uid_t val; } ksu_kuid_t; +static inline ksu_kuid_t current_uid() { return *(ksu_kuid_t *)(&current_cred()->uid); } +static inline ksu_kuid_t current_euid() { return *(ksu_kuid_t *)(&current_cred()->euid); } +#endif // < 3.14 + +#endif // __KSU_H_KERNEL_COMPAT diff --git a/drivers/kernelsu/kernel_includes.h b/drivers/kernelsu/kernel_includes.h index 81d33744940e..c3ea6cb0db09 100644 --- a/drivers/kernelsu/kernel_includes.h +++ b/drivers/kernelsu/kernel_includes.h @@ -29,8 +29,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -70,6 +72,10 @@ // versioned / conditional +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) +#include +#endif + #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0) #include #endif @@ -125,6 +131,10 @@ #include #endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0) +#include +#endif + #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) #include #endif @@ -141,20 +151,29 @@ 
* https://github.com/gcc-mirror/gcc/blob/releases/gcc-4.9/gcc/builtins.def#L562 * */ -#if !defined(CONFIG_FORTIFY_SOURCE) +#if !defined(CONFIG_KSU_DEBUG) +#define memchr __builtin_memchr #define memcmp __builtin_memcmp #define memcpy __builtin_memcpy #define memmove __builtin_memmove #define memset __builtin_memset +#define strcasecmp __builtin_strcasecmp +#define strcat __builtin_strcat #define strchr __builtin_strchr #define strcmp __builtin_strcmp #define strcpy __builtin_strcpy +#define strcspn __builtin_strcspn #define strlen __builtin_strlen +#define strncasecmp __builtin_strncasecmp +#define strncat __builtin_strncat #define strncmp __builtin_strncmp #define strncpy __builtin_strncpy -#define strstr __builtin_strstr +#define strpbrk __builtin_strpbrk +#define strrchr __builtin_strrchr +#define strspn __builtin_strspn +//#define strstr __builtin_strstr -#endif // !CONFIG_FORTIFY_SOURCE +#endif // !CONFIG_KSU_DEBUG #endif // __KSU_H_KERNEL_INCLUDES diff --git a/drivers/kernelsu/ksu.c b/drivers/kernelsu/ksu.c index b4a96bf9827f..d7d979a0ee0b 100644 --- a/drivers/kernelsu/ksu.c +++ b/drivers/kernelsu/ksu.c @@ -12,6 +12,16 @@ #include "include/arch.h" #include "include/ksu.h" +// selinux includes +#include "avc_ss.h" +#include "objsec.h" +#include "ss/services.h" +#include "ss/symtab.h" +#include "xfrm.h" +#ifndef KSU_COMPAT_USE_SELINUX_STATE +#include "avc.h" +#endif + // kernel compat, lite ones #include "kernel_compat.h" @@ -31,28 +41,20 @@ #include "feature/sucompat.h" #include "feature/sulog.h" #include "runtime/ksud.h" +#include "runtime/ksud_escape.h" #include "sulog/event.h" #include "sulog/fd.h" #include "selinux/selinux.h" #include "selinux/sepolicy.h" -// selinux includes -#include "avc_ss.h" -#include "objsec.h" -#include "ss/services.h" -#include "ss/symtab.h" -#include "xfrm.h" -#ifndef KSU_COMPAT_USE_SELINUX_STATE -#include "avc.h" -#endif - // unity build #include "tiny_sulog.c" #include "policy/allowlist.c" #include "policy/app_profile.c" 
#include "policy/feature.c" #include "manager/apk_sign.c" +#include "manager/pkg_observer.c" #include "manager/throne_tracker.c" #include "supercall/perm.c" @@ -68,10 +70,12 @@ #include "feature/sucompat.c" #include "feature/sulog.c" #include "runtime/ksud.c" +#include "runtime/ksud_escape.c" #include "sulog/event.c" #include "sulog/fd.c" +#include "hook/setuid_hook.c" #include "hook/core_hook.c" // lsm #include "selinux/selinux.c" diff --git a/drivers/kernelsu/linux_hashtable.h b/drivers/kernelsu/linux_hashtable.h new file mode 100644 index 000000000000..3d4516102bee --- /dev/null +++ b/drivers/kernelsu/linux_hashtable.h @@ -0,0 +1,243 @@ +/* + * Statically sized hash table implementation + * (C) 2012 Sasha Levin + */ + +#ifndef _LINUX_HASHTABLE_H +#define _LINUX_HASHTABLE_H + +#include +#include +#include +#include +#include + +#define DEFINE_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] = \ + { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } + +#define DECLARE_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] + +#define HASH_SIZE(name) (ARRAY_SIZE(name)) +#define HASH_BITS(name) ilog2(HASH_SIZE(name)) + +/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ +#define hash_min(val, bits) \ + (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) + +static inline void __hash_init(struct hlist_head *ht, unsigned int sz) +{ + unsigned int i; + + for (i = 0; i < sz; i++) + INIT_HLIST_HEAD(&ht[i]); +} + +/** + * hash_init - initialize a hash table + * @hashtable: hashtable to be initialized + * + * Calculates the size of the hashtable from the given parameter, otherwise + * same as hash_init_size. + * + * This has to be a macro since HASH_BITS() will not work on pointers since + * it calculates the size during preprocessing. 
+ */ +#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) + +/** + * hash_add - add an object to a hashtable + * @hashtable: hashtable to add to + * @node: the &struct hlist_node of the object to be added + * @key: the key of the object to be added + */ +#define hash_add(hashtable, node, key) \ + hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) + +/** + * hash_add_rcu - add an object to a rcu enabled hashtable + * @hashtable: hashtable to add to + * @node: the &struct hlist_node of the object to be added + * @key: the key of the object to be added + */ +#define hash_add_rcu(hashtable, node, key) \ + hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) + +/** + * hash_hashed - check whether an object is in any hashtable + * @node: the &struct hlist_node of the object to be checked + */ +static inline bool hash_hashed(struct hlist_node *node) +{ + return !hlist_unhashed(node); +} + +static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) +{ + unsigned int i; + + for (i = 0; i < sz; i++) + if (!hlist_empty(&ht[i])) + return false; + + return true; +} + +/** + * hash_empty - check whether a hashtable is empty + * @hashtable: hashtable to check + * + * This has to be a macro since HASH_BITS() will not work on pointers since + * it calculates the size during preprocessing. 
+ */ +#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) + +/** + * hash_del - remove an object from a hashtable + * @node: &struct hlist_node of the object to remove + */ +static inline void hash_del(struct hlist_node *node) +{ + hlist_del_init(node); +} + +/** + * hash_del_rcu - remove an object from a rcu enabled hashtable + * @node: &struct hlist_node of the object to remove + */ +static inline void hash_del_rcu(struct hlist_node *node) +{ + hlist_del_init_rcu(node); +} + +#undef hlist_entry_safe +#undef hlist_for_each_entry_rcu +#undef hlist_for_each_entry +#undef hlist_for_each_entry_safe + +#define hlist_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); ____ptr ? hlist_entry(____ptr, type, member) : NULL; }) + +/** + * hlist_for_each_entry_rcu - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define hlist_for_each_entry_rcu(pos, head, member) \ + for (pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\ + typeof(*(pos)), member); \ + pos; \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ + &(pos)->member)), typeof(*(pos)), member)) +/** + * hlist_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(pos, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +/** + * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. 
+ * @n: another &struct hlist_node to use as temporary storage + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ + pos && ({ n = pos->member.next; 1; }); \ + pos = hlist_entry_safe(n, typeof(*pos), member)) + +#undef hash_for_each +#undef hash_for_each_rcu +#undef hash_for_each_safe +#undef hash_for_each_possible +#undef hash_for_each_possible_rcu + +/** + * hash_for_each - iterate over a hashtable + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each(name, bkt, obj, member) \ + for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ + (bkt)++)\ + hlist_for_each_entry(obj, &name[bkt], member) + +/** + * hash_for_each_rcu - iterate over a rcu enabled hashtable + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each_rcu(name, bkt, obj, member) \ + for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ + (bkt)++)\ + hlist_for_each_entry_rcu(obj, &name[bkt], member) + +/** + * hash_for_each_safe - iterate over a hashtable safe against removal of + * hash entry + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @tmp: a &struct used for temporary storage + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each_safe(name, bkt, tmp, obj, member) \ + for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ + (bkt)++)\ + hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) + +/** + * hash_for_each_possible - 
iterate over all possible objects hashing to the + * same bucket + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible(name, obj, member, key) \ + hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member) + +/** + * hash_for_each_possible_rcu - iterate over all possible objects hashing to the + * same bucket in an rcu enabled hashtable + * in a rcu enabled hashtable + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible_rcu(name, obj, member, key) \ + hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ + member) + +/** + * hash_for_each_possible_safe - iterate over all possible objects hashing to the + * same bucket safe against removals + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @tmp: a &struct used for temporary storage + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible_safe(name, obj, tmp, member, key) \ + hlist_for_each_entry_safe(obj, tmp,\ + &name[hash_min(key, HASH_BITS(name))], member) + + +#endif diff --git a/drivers/kernelsu/manager/apk_sign.c b/drivers/kernelsu/manager/apk_sign.c index f79b2c9a3a00..b5965842b5e2 100644 --- a/drivers/kernelsu/manager/apk_sign.c +++ b/drivers/kernelsu/manager/apk_sign.c @@ -352,21 +352,7 @@ int get_pkg_from_apk_path(char *pkg, const char *path) bool is_manager_apk(char *path) { - int tries = 0; - - while (tries++ < 10 && (current->flags & PF_KTHREAD) ) { - if (!is_lock_held(path)) - break; - - pr_info("%s: waiting for %s\n", __func__, path); - msleep(100); - } - - // let it go, if retry 
fails, check_v2_signature will fail to open it anyway - if (tries == 10) { - pr_info("%s: timeout for %s\n", __func__, path); - return false; - } - - return check_v2_signature(path, 0x363, "4359c171f32543394cbc23ef908c4bb94cad7c8087002ba164c8230948c21549"); // dummy.keystore + return (check_v2_signature(path, 0x363, "4359c171f32543394cbc23ef908c4bb94cad7c8087002ba164c8230948c21549") // dummy.keystore + || check_v2_signature(path, 0x033b, "c371061b19d8c7d7d6133c6a9bafe198fa944e50c1b31c9d8daa8d7f1fc2d2d6") // kernelsu official + ); } diff --git a/drivers/kernelsu/manager/manager_identity.h b/drivers/kernelsu/manager/manager_identity.h index 5a7c6b2b399d..0891a6a6f571 100644 --- a/drivers/kernelsu/manager/manager_identity.h +++ b/drivers/kernelsu/manager/manager_identity.h @@ -4,6 +4,7 @@ // #include "allowlist.h" #define KSU_INVALID_APPID -1 +#define KSU_PER_USER_RANGE 100000 extern uid_t ksu_manager_appid; // DO NOT DIRECT USE @@ -14,14 +15,12 @@ static inline bool ksu_is_manager_appid_valid() static inline bool is_manager() { - kuid_t current_uid = current_uid(); - return unlikely(ksu_manager_appid == ksu_get_uid_t(current_uid) % PER_USER_RANGE); + return unlikely(ksu_manager_appid == current_uid().val % KSU_PER_USER_RANGE); } - static inline bool is_uid_manager(uid_t uid) { - return unlikely(ksu_manager_appid == uid % PER_USER_RANGE); + return unlikely(ksu_manager_appid == uid % KSU_PER_USER_RANGE); } static inline uid_t ksu_get_manager_appid() diff --git a/drivers/kernelsu/manager/pkg_observer.c b/drivers/kernelsu/manager/pkg_observer.c new file mode 100644 index 000000000000..3956b517e08f --- /dev/null +++ b/drivers/kernelsu/manager/pkg_observer.c @@ -0,0 +1,89 @@ +/** + * ! this is on inode_rename, NOT fsnotify + * we have access to LSM and overhead is way lower. + * we watch one file, check ifs on the same parent inode. + * a few int compare and a ptr compare. thats it. + * as for throne tracker, we just async it by hand + * by offloading it to a kthread. 
+ */ + +static uintptr_t system_dir_inode_ptr = 0; + +__attribute__((cold)) +static noinline void ksu_grab_data_system_inode() +{ + struct path path; + int ret = kern_path("/data/system", LOOKUP_FOLLOW, &path); + if (ret) { + pr_info("renameat: /data/system not ready? ret: (%d)\n", ret); + return; + } + + system_dir_inode_ptr = (uintptr_t)d_inode(path.dentry); + pr_info("renameat: cached /data/system d_inode: 0x%lx\n", system_dir_inode_ptr); + path_put(&path); +} + +static inline void ksu_rename_observer(struct dentry *old_dentry, struct dentry *new_dentry) +{ + // skip kernel threads + if (!current->mm) + return; + + if (!old_dentry || !new_dentry) + return; + + // skip non system uid + if (likely(current_uid().val != 1000)) + return; + + // HASH_LEN_DECLARE see dcache.h + if (likely(new_dentry->d_name.len != sizeof("packages.list") - 1 )) + return; + + // /data/system/packages.list.tmp -> /data/system/packages.list + if (likely(!!__builtin_memcmp(new_dentry->d_iname, "packages.list", sizeof("packages.list") - 1 ))) + return; + + // cache dir inode, we try to go for fast path, lockless + if (unlikely(!system_dir_inode_ptr)) + ksu_grab_data_system_inode(); + + if (unlikely(!system_dir_inode_ptr)) + goto slow_path; + + if (unlikely(!new_dentry->d_parent || !new_dentry->d_parent->d_inode)) + goto slow_path; + + /* + * fallback to slow path, but this should NOT change unless someone overlays /data/system + * but then again maybe https://github.com/tiann/KernelSU/pull/2633#discussion_r2141740346 + * but /data is casefolded, overlaying is really really unlikely + * we self heal this thing, so on next run, it will try to grab d inode again + * alternatively we can use packages.list inode change as trigger too, however, + * we need to save last state. more writes. 
+ */ + if (unlikely((uintptr_t)new_dentry->d_parent->d_inode != system_dir_inode_ptr)) + goto slow_path; + + pr_info("renameat: %s -> %s, /data/system d_inode: 0x%lx \n", old_dentry->d_iname, new_dentry->d_iname, system_dir_inode_ptr); + track_throne(false); + return; + +slow_path: + system_dir_inode_ptr = 0; // reset cached inode + + char path[128] = { 0 }; + char *buf = dentry_path_raw(new_dentry, path, sizeof(path) - 1); + if (IS_ERR(buf)) { + pr_err("dentry_path_raw failed.\n"); + return; + } + + if (!strstr(buf, "/system/packages.list")) + return; + + pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname, new_dentry->d_iname, buf); + track_throne(false); + return; +} diff --git a/drivers/kernelsu/manager/throne_tracker.c b/drivers/kernelsu/manager/throne_tracker.c index 7711b0e1b95a..e23d1cf852e2 100644 --- a/drivers/kernelsu/manager/throne_tracker.c +++ b/drivers/kernelsu/manager/throne_tracker.c @@ -8,7 +8,7 @@ struct uid_data { char package[KSU_MAX_PACKAGE_NAME]; }; -static void crown_manager(const char *apk, struct list_head *uid_data) +static __always_inline void crown_manager(const char *apk, struct list_head *uid_data) { char pkg[KSU_MAX_PACKAGE_NAME]; if (get_pkg_from_apk_path(pkg, apk) < 0) { @@ -114,7 +114,7 @@ FILLDIR_RETURN_TYPE my_actor(MY_ACTOR_CTX_ARG, const char *name, if (d_type == DT_DIR && my_ctx->depth > 0 && (my_ctx->stop && !*my_ctx->stop)) { - struct data_path *data = kzalloc(sizeof(struct data_path), GFP_ATOMIC); + struct data_path *data = kzalloc(sizeof(struct data_path), GFP_KERNEL); if (!data) { pr_err("Failed to allocate memory for %s\n", dirpath); @@ -129,7 +129,7 @@ FILLDIR_RETURN_TYPE my_actor(MY_ACTOR_CTX_ARG, const char *name, } // now put this on candidate_path - if (d_type == DT_REG && !strncmp(name, "base.apk", 8)) { + if (d_type == DT_REG && namelen == 8 && !memcmp(name, "base.apk", 8)) { snprintf(candidate_path, DATA_PATH_LEN, "%s/%.*s", my_ctx->parent_dir, namelen, name); } @@ -248,32 +248,11 @@ static bool 
is_uid_exist(uid_t uid, char *package, void *data) static void throne_tracker_fn(bool prune_only) { - struct file *fp = NULL; - int tries = 0; - - if (unlikely(!(current->flags & PF_KTHREAD))) { - pr_info("%s: not a kthread! skip retry for: %s\n", __func__, SYSTEM_PACKAGES_LIST_PATH); - fp = filp_open(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); - goto skip_retry; - } - - while (tries++ < 10) { - if (!is_lock_held(SYSTEM_PACKAGES_LIST_PATH)) { - fp = filp_open(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); - if (!IS_ERR(fp)) - break; - } - - pr_info("%s: waiting for %s\n", __func__, SYSTEM_PACKAGES_LIST_PATH); - msleep(100); // migth as well add a delay - }; - -skip_retry: + struct file *fp = filp_open(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); if (IS_ERR(fp)) { pr_err("%s: open " SYSTEM_PACKAGES_LIST_PATH " failed: %ld\n", __func__, PTR_ERR(fp)); return; - } else - pr_info("%s: %s found!\n", __func__, SYSTEM_PACKAGES_LIST_PATH); + } struct list_head uid_list; INIT_LIST_HEAD(&uid_list); @@ -289,9 +268,13 @@ static void throne_tracker_fn(bool prune_only) if (chr != '\n') continue; - count = kernel_read(fp, buf, sizeof(buf), &line_start); + count = kernel_read(fp, buf, sizeof(buf) - 1, &line_start); + if (count <= 0) { + break; + } + buf[count] = '\0'; - struct uid_data *data = kzalloc(sizeof(struct uid_data), GFP_ATOMIC); + struct uid_data *data = kzalloc(sizeof(struct uid_data), GFP_KERNEL); if (!data) { filp_close(fp, 0); goto out; @@ -370,12 +353,29 @@ static int throne_tracker_thread(void *data) mutex_lock(&throne_tracker_mutex); +test_tmp: + if (!is_file_existing("/data/system/packages.list.tmp")) + goto test_list; + + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("throne_tracker: rename not finished! retry!\n"); + + msleep(20); // yield + goto test_tmp; + +test_list: + if (is_file_stable(SYSTEM_PACKAGES_LIST_PATH)) + goto start_tt; + + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("throne_tracker: rename not finished! 
retry!\n"); + + msleep(20); // yield + goto test_list; + +start_tt: // lessen that window where user opens manager right away, yet its not crowned - // we are async/non-blocking in these kthreads - // sched_set_fifo_low - struct sched_param param = { 0 }; - param.sched_priority = 1; - sched_setscheduler_nocheck(current, 1, ¶m); + set_user_nice(current, -10); escape_to_root_forced(); throne_tracker_fn(prune_only); @@ -400,7 +400,7 @@ void track_throne(bool prune_only) #endif // HACK: force cast prune_only to be a void * - kthread_run(throne_tracker_thread, (void *)prune_only, "thronetracker"); + kthread_run(throne_tracker_thread, (void *)prune_only, "ksu_throne"); } void ksu_throne_tracker_init() diff --git a/drivers/kernelsu/manager/throne_tracker.h b/drivers/kernelsu/manager/throne_tracker.h index 0416de2c58a0..48beebcf8fd9 100644 --- a/drivers/kernelsu/manager/throne_tracker.h +++ b/drivers/kernelsu/manager/throne_tracker.h @@ -8,36 +8,53 @@ void ksu_throne_tracker_exit(); void track_throne(bool prune_only); /* - * small helper to check if lock is held - * false - file is stable - * true - file is being deleted/renamed - * possibly optional + * small helper to check if file exists + * true - file exists + * false - file does NOT exist * */ -static bool is_lock_held(const char *path) +static inline bool is_file_existing(const char *path) +{ + struct path kpath; + + if (!!kern_path(path, 0, &kpath)) + return false; + + path_put(&kpath); + return true; +} + +/* + * small helper to check if file is stable + * note: if we can hold d_lock ourselves, file is stable + * true - file is stable + * false - file is deleted / being deleted/renamed + * + */ +static bool is_file_stable(const char *path) { struct path kpath; // kern_path returns 0 on success if (kern_path(path, 0, &kpath)) - return true; + return false; // just being defensive if (!kpath.dentry) { path_put(&kpath); - return true; + return false; } if (!spin_trylock(&kpath.dentry->d_lock)) { - pr_info("%s: lock 
held, bail out!\n", __func__); + pr_info("%s: lock held for %s, bail out!\n", __func__, path); path_put(&kpath); - return true; + return false; } // we hold it ourselves here! spin_unlock(&kpath.dentry->d_lock); path_put(&kpath); - return false; + return true; } #endif diff --git a/drivers/kernelsu/policy/allowlist.c b/drivers/kernelsu/policy/allowlist.c index 897c40e8e6dd..f793935f955b 100644 --- a/drivers/kernelsu/policy/allowlist.c +++ b/drivers/kernelsu/policy/allowlist.c @@ -10,27 +10,7 @@ static DEFINE_MUTEX(allowlist_mutex); static struct root_profile default_root_profile; static struct non_root_profile default_non_root_profile; -static int allow_list_arr[PAGE_SIZE / sizeof(int)] __read_mostly __aligned(PAGE_SIZE); -static int allow_list_pointer __read_mostly = 0; - -static void remove_uid_from_arr(uid_t uid) -{ - int i; - for (i = 0; i < allow_list_pointer; i++) { - if (allow_list_arr[i] == uid) { - int remaining = allow_list_pointer - 1 - i; - if (remaining > 0) { - memmove(&allow_list_arr[i], &allow_list_arr[i + 1], - remaining * sizeof(allow_list_arr[0])); - } - allow_list_pointer--; - allow_list_arr[allow_list_pointer] = -1; - return; - } - } -} - -static void init_default_profiles() +static void __init init_default_profiles() { kernel_cap_t full_cap = CAP_FULL_SET; @@ -48,15 +28,16 @@ static void init_default_profiles() } struct perm_data { - struct list_head list; + struct hlist_node list; struct rcu_head rcu; + struct kref ref; struct app_profile profile; }; -static struct list_head allow_list; - -static uint8_t allow_list_bitmap[PAGE_SIZE] __read_mostly __aligned(PAGE_SIZE); -#define BITMAP_UID_MAX ((sizeof(allow_list_bitmap) * BITS_PER_BYTE) - 1) +// protected by rcu +#define ALLOW_LIST_BITS 8 +static DEFINE_HASHTABLE(allow_list, ALLOW_LIST_BITS); +static u16 allow_list_count = 0; #define KERNEL_SU_ALLOWLIST "/data/adb/ksu/.allowlist" @@ -64,35 +45,39 @@ void ksu_persistent_allow_list(void); void ksu_show_allow_list(void) { + int i; struct 
perm_data *p = NULL; pr_info("ksu_show_allow_list\n"); rcu_read_lock(); - list_for_each_entry_rcu (p, &allow_list, list) { - pr_info("uid :%d, allow: %d\n", p->profile.current_uid, - p->profile.allow_su); + hash_for_each_rcu (allow_list, i, p, list) { + pr_info("uid :%d, allow: %d\n", p->profile.curr_uid, p->profile.allow_su); } rcu_read_unlock(); } -bool ksu_get_app_profile(struct app_profile *profile) +struct app_profile *ksu_get_app_profile(uid_t uid) { struct perm_data *p = NULL; - bool found = false; + bool found; - rcu_read_lock(); - list_for_each_entry_rcu (p, &allow_list, list) { - bool uid_match = profile->current_uid == p->profile.current_uid; - if (uid_match) { +retry: + found = false; + hash_for_each_possible_rcu (allow_list, p, list, uid) { + if (uid == p->profile.curr_uid) { // found it, override it with ours - memcpy(profile, &p->profile, sizeof(*profile)); found = true; - goto exit; + break; } } -exit: - rcu_read_unlock(); - return found; + if (!found) + return NULL; + + if (!kref_get_unless_zero(&p->ref)) { + goto retry; + } + + return &p->profile; } static inline bool forbid_system_uid(uid_t uid) @@ -150,100 +135,87 @@ static bool profile_valid(struct app_profile *profile) return true; } +static void release_perm_data(struct kref *ref) +{ + struct perm_data *p = container_of(ref, struct perm_data, ref); + kfree_rcu(p, rcu); +} + +static void put_perm_data(struct perm_data *data) +{ + kref_put(&data->ref, release_perm_data); +} + int ksu_set_app_profile(struct app_profile *profile) { - struct perm_data *p = NULL, *np; + struct perm_data *p, *np; int result = 0; - u16 count = 0; if (!profile_valid(profile)) { pr_err("Failed to set app profile: invalid profile!\n"); return -EINVAL; } + // only allow default non root profile + if (unlikely(profile->curr_uid == KSU_APP_PROFILE_PRESERVE_UID && strcmp(profile->key, "$") != 0)) { + return -EINVAL; + } + mutex_lock(&allowlist_mutex); - list_for_each_entry (p, &allow_list, list) { - ++count; - // both uid 
and package must match, otherwise it will break multiple package with different user id - if (profile->current_uid == p->profile.current_uid && - !strcmp(profile->key, p->profile.key)) { + hash_for_each_possible (allow_list, p, list, profile->curr_uid) { + if (profile->curr_uid == p->profile.curr_uid) { + if (strcmp(profile->key, p->profile.key) != 0) { + pr_warn("ksu_set_app_profile: key changed: uid=%d orig=%s new=%s\n", profile->curr_uid, p->profile.key, + profile->key); + } // found it, just override it all! np = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL); if (!np) { result = -ENOMEM; goto out_unlock; } + kref_init(&np->ref); memcpy(&np->profile, profile, sizeof(*profile)); - list_replace_rcu(&p->list, &np->list); - kfree_rcu(p, rcu); + hlist_replace_rcu(&p->list, &np->list); + put_perm_data(p); goto out; } } - if (unlikely(count == U16_MAX)) { + if (unlikely(allow_list_count == U16_MAX)) { pr_err("too many app profile\n"); result = -E2BIG; goto out_unlock; } // not found, alloc a new node! 
- p = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL); - if (!p) { + np = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL); + if (!np) { pr_err("ksu_set_app_profile alloc failed\n"); result = -ENOMEM; goto out_unlock; } - memcpy(&p->profile, profile, sizeof(*profile)); + kref_init(&np->ref); + memcpy(&np->profile, profile, sizeof(*profile)); if (profile->allow_su) { - pr_info("set root profile, key: %s, uid: %d, gid: %d, context: %s\n", - profile->key, profile->current_uid, - profile->rp_config.profile.gid, - profile->rp_config.profile.selinux_domain); + pr_info("set root profile, key: %s, uid: %d, gid: %d, context: %s\n", profile->key, profile->curr_uid, + profile->rp_config.profile.gid, profile->rp_config.profile.selinux_domain); } else { - pr_info("set app profile, key: %s, uid: %d, umount modules: %d\n", - profile->key, profile->current_uid, + pr_info("set app profile, key: %s, uid: %d, umount modules: %d\n", profile->key, profile->curr_uid, profile->nrp_config.profile.umount_modules); } - list_add_tail_rcu(&p->list, &allow_list); + hash_add_rcu(allow_list, &np->list, np->profile.curr_uid); + ++allow_list_count; out: result = 0; - // check if the default profiles is changed, cache it to a single struct to accelerate access. - if (unlikely(!strcmp(profile->key, "$"))) { + if (unlikely(profile->curr_uid == KSU_APP_PROFILE_PRESERVE_UID)) { // set default non root profile - memcpy(&default_non_root_profile, &profile->nrp_config.profile, - sizeof(default_non_root_profile)); - } else if (unlikely(!strcmp(profile->key, "#"))) { - // set default root profile - // TODO: Do we really need this? 
- memcpy(&default_root_profile, &profile->rp_config.profile, - sizeof(default_root_profile)); - } else if (profile->current_uid <= BITMAP_UID_MAX) { - if (profile->allow_su) - allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] |= - 1 << (profile->current_uid % BITS_PER_BYTE); - else - allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] &= - ~(1 << (profile->current_uid % BITS_PER_BYTE)); - } else { - if (profile->allow_su) { - /* - * 1024 apps with uid higher than BITMAP_UID_MAX - * registered to request superuser? - */ - if (allow_list_pointer >= ARRAY_SIZE(allow_list_arr)) { - pr_err("too many apps registered\n"); - WARN_ON(1); - } else { - allow_list_arr[allow_list_pointer++] = profile->current_uid; - } - } else { - remove_uid_from_arr(profile->current_uid); - } + default_non_root_profile.umount_modules = profile->nrp_config.profile.umount_modules; } out_unlock: @@ -253,15 +225,14 @@ int ksu_set_app_profile(struct app_profile *profile) bool __ksu_is_allow_uid(uid_t uid) { - int i; + struct perm_data *p; if (forbid_system_uid(uid)) { // do not bother going through the list if it's system return false; } - if (likely(ksu_is_manager_appid_valid()) && - unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { + if (unlikely(is_uid_manager(uid))) { // manager is always allowed! 
return true; } @@ -269,15 +240,14 @@ bool __ksu_is_allow_uid(uid_t uid) if (IS_ENABLED(CONFIG_KSU_DEBUG) && unlikely(uid == SHELL_UID)) return true; - if (likely(uid <= BITMAP_UID_MAX)) { - return !!(allow_list_bitmap[uid / BITS_PER_BYTE] & - (1 << (uid % BITS_PER_BYTE))); - } else { - for (i = 0; i < allow_list_pointer; i++) { - if (allow_list_arr[i] == uid) - return true; + rcu_read_lock(); + hash_for_each_possible_rcu (allow_list, p, list, uid) { + if (uid == p->profile.curr_uid && p->profile.allow_su) { + rcu_read_unlock(); + return true; } } + rcu_read_unlock(); return false; } @@ -293,9 +263,9 @@ bool __ksu_is_allow_uid_for_current(uid_t uid) bool ksu_uid_should_umount(uid_t uid) { - struct app_profile profile = { .current_uid = uid }; - if (likely(ksu_is_manager_appid_valid()) && - unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { + struct app_profile *profile; + bool res; + if (likely(ksu_is_manager_appid_valid()) && unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { // we should not umount on manager! 
return false; } @@ -303,28 +273,42 @@ bool ksu_uid_should_umount(uid_t uid) // we should not umount for webview zygote return false; } - bool found = ksu_get_app_profile(&profile); - if (!found) { + + rcu_read_lock(); + profile = ksu_get_app_profile(uid); + if (!profile) { // no app profile found, it must be non root app - return default_non_root_profile.umount_modules; - } - if (profile.allow_su) { + res = default_non_root_profile.umount_modules; + } else if (profile->allow_su) { // if found and it is granted to su, we shouldn't umount for it - return false; + res = false; } else { // found an app profile - if (profile.nrp_config.use_default) { - return default_non_root_profile.umount_modules; + if (profile->nrp_config.use_default) { + res = default_non_root_profile.umount_modules; } else { - return profile.nrp_config.profile.umount_modules; + res = profile->nrp_config.profile.umount_modules; } } + rcu_read_unlock(); + + if (profile) + ksu_put_app_profile(profile); + return res; +} + +void ksu_put_app_profile(struct app_profile *profile) +{ + struct perm_data *p = container_of(profile, struct perm_data, profile); + put_perm_data(p); } -void ksu_get_root_profile(uid_t uid, struct root_profile *profile) +struct root_profile *ksu_get_root_profile(uid_t uid) { struct perm_data *p = NULL; + struct root_profile *res; + rcu_read_lock(); if (is_uid_manager(uid)) { goto use_default; } @@ -332,35 +316,48 @@ void ksu_get_root_profile(uid_t uid, struct root_profile *profile) if (IS_ENABLED(CONFIG_KSU_DEBUG) && unlikely(uid == SHELL_UID)) goto use_default; - rcu_read_lock(); - list_for_each_entry_rcu (p, &allow_list, list) { - if (uid == p->profile.current_uid && p->profile.allow_su) { +retry: + res = NULL; + hash_for_each_possible_rcu (allow_list, p, list, uid) { + if (uid == p->profile.curr_uid && p->profile.allow_su) { if (!p->profile.rp_config.use_default) { - memcpy(profile, &p->profile.rp_config.profile, - sizeof(*profile)); - rcu_read_unlock(); - return; + if 
(!kref_get_unless_zero(&p->ref)) { + goto retry; + } + res = &p->profile.rp_config.profile; } + break; } } + + if (unlikely(!res)) { + use_default: + res = &default_root_profile; + } + rcu_read_unlock(); + return res; +} -use_default: - // use default profile - memcpy(profile, &default_root_profile, sizeof(*profile)); +void ksu_put_root_profile(struct root_profile *profile) +{ + if (likely(profile == &default_root_profile)) + return; + struct perm_data *p = container_of(profile, struct perm_data, profile.rp_config.profile); + put_perm_data(p); } bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, bool allow) { struct perm_data *p = NULL; u16 i = 0, j = 0; + int iter; rcu_read_lock(); - list_for_each_entry_rcu (p, &allow_list, list) { + hash_for_each_rcu (allow_list, iter, p, list) { // pr_info("get_allow_list uid: %d allow: %d\n", p->uid, p->allow); - if (p->profile.allow_su == allow && - !is_uid_manager(p->profile.current_uid)) { + if (p->profile.allow_su == allow && !is_uid_manager(p->profile.curr_uid)) { if (j < length) { - array[j++] = p->profile.current_uid; + array[j++] = p->profile.curr_uid; } ++i; } @@ -376,13 +373,13 @@ bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, return true; } - -static void ksu_persistent_allow_list_fn() +static void do_persistent_allow_list() { u32 magic = FILE_MAGIC; u32 version = FILE_FORMAT_VERSION; struct perm_data *p = NULL; loff_t off = 0; + int i; struct file *fp = filp_open(KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (IS_ERR(fp)) { @@ -401,9 +398,9 @@ static void ksu_persistent_allow_list_fn() goto close_file; } - list_for_each_entry (p, &allow_list, list) { - pr_info("save allow list, name: %s uid :%d, allow: %d\n", - p->profile.key, p->profile.current_uid, p->profile.allow_su); + hash_for_each (allow_list, i, p, list) { + pr_info("save allow list, name: %s uid :%d, allow: %d\n", p->profile.key, p->profile.curr_uid, + p->profile.allow_su); 
kernel_write(fp, &p->profile, sizeof(p->profile), &off); } @@ -418,22 +415,24 @@ static void ksu_persistent_allow_list_fn() // us to have our own context. we give it a full escaped-to-root one. static int persistent_allow_list_pre(void *data) { - pr_info("ksu_persistent_allow_list_fn: pid: %d started\n", current->pid); - - // repurpose the mutex they were holding on ksu_persistent_allow_list_fn - // since all this does eventually is to call kernel_write - // we hit two birds in one stone. exclusive io + exclusive kthread - // there wont be a single instance lock, but for what we need, its finee - // we just let other threads stall. - // 'mutex-trylock-fail-then-return' is detrimental here + pr_info("do_persistent_allow_list: pid: %d started\n", current->pid); + + /** + * repurpose the mutex they were holding on ksu_persistent_allow_list_fn + * since all this does eventually is to call kernel_write + * we hit two birds in one stone. exclusive io + exclusive kthread + * there wont be a single instance lock, but for what we need, its finee + * we just let other threads stall. + * 'mutex-trylock-fail-then-return' is detrimental here + */ mutex_lock(&allowlist_mutex); escape_to_root_forced(); // give permissions for everything - ksu_persistent_allow_list_fn(); + do_persistent_allow_list(); mutex_unlock(&allowlist_mutex); - pr_info("ksu_persistent_allow_list_fn: pid: %d exit\n", current->pid); + pr_info("do_persistent_allow_list: pid: %d exit\n", current->pid); return 0; } @@ -442,8 +441,6 @@ void ksu_persistent_allow_list() kthread_run(persistent_allow_list_pre, NULL, "allowlist"); } -// we can leave this synchronous it seems -// this can be revisited if escaping/deferring is needed. 
void ksu_load_allow_list() { loff_t off = 0; @@ -460,8 +457,7 @@ void ksu_load_allow_list() } // verify magic - if (kernel_read(fp, &magic, sizeof(magic), &off) != sizeof(magic) || - magic != FILE_MAGIC) { + if (kernel_read(fp, &magic, sizeof(magic), &off) != sizeof(magic) || magic != FILE_MAGIC) { pr_err("allowlist file invalid: %d!\n", magic); goto exit; } @@ -483,8 +479,7 @@ void ksu_load_allow_list() break; } - pr_info("load_allow_uid, name: %s, uid: %d, allow: %d\n", profile.key, - profile.current_uid, profile.allow_su); + pr_info("load_allow_uid, name: %s, uid: %d, allow: %d\n", profile.key, profile.curr_uid, profile.allow_su); ksu_set_app_profile(&profile); } @@ -496,7 +491,8 @@ void ksu_load_allow_list() void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *), void *data) { struct perm_data *np = NULL; - struct perm_data *n = NULL; + struct hlist_node *tmp; + int i; if (!ksu_boot_completed) { pr_info("boot not completed, skip prune\n"); @@ -505,21 +501,17 @@ void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *), void *data bool modified = false; mutex_lock(&allowlist_mutex); - list_for_each_entry_safe (np, n, &allow_list, list) { - uid_t uid = np->profile.current_uid; + hash_for_each_safe (allow_list, i, tmp, np, list) { + uid_t uid = np->profile.curr_uid; char *package = np->profile.key; // we use this uid for special cases, don't prune it! 
bool is_preserved_uid = uid == KSU_APP_PROFILE_PRESERVE_UID; if (!is_preserved_uid && !is_uid_valid(uid, package, data)) { modified = true; pr_info("prune uid: %d, package: %s\n", uid, package); - list_del_rcu(&np->list); - kfree_rcu(np, rcu); - if (likely(uid <= BITMAP_UID_MAX)) { - allow_list_bitmap[uid / BITS_PER_BYTE] &= - ~(1 << (uid % BITS_PER_BYTE)); - } - remove_uid_from_arr(uid); + hlist_del_rcu(&np->list); + put_perm_data(np); + --allow_list_count; } } mutex_unlock(&allowlist_mutex); @@ -532,29 +524,20 @@ void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *), void *data void __init ksu_allowlist_init(void) { - int i; - - BUILD_BUG_ON(sizeof(allow_list_bitmap) != PAGE_SIZE); - BUILD_BUG_ON(sizeof(allow_list_arr) != PAGE_SIZE); - - for (i = 0; i < ARRAY_SIZE(allow_list_arr); i++) - allow_list_arr[i] = -1; - - INIT_LIST_HEAD(&allow_list); - init_default_profiles(); } void __exit ksu_allowlist_exit(void) { struct perm_data *np = NULL; - struct perm_data *n = NULL; + struct hlist_node *tmp; + int i; // free allowlist mutex_lock(&allowlist_mutex); - list_for_each_entry_safe (np, n, &allow_list, list) { - list_del(&np->list); - kfree(np); + hash_for_each_safe (allow_list, i, tmp, np, list) { + hlist_del(&np->list); + put_perm_data(np); } mutex_unlock(&allowlist_mutex); } diff --git a/drivers/kernelsu/policy/allowlist.h b/drivers/kernelsu/policy/allowlist.h index 5eb99182aded..59809cc7ccd3 100644 --- a/drivers/kernelsu/policy/allowlist.h +++ b/drivers/kernelsu/policy/allowlist.h @@ -31,11 +31,16 @@ bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, void ksu_prune_allowlist(bool (*is_uid_exist)(uid_t, char *, void *), void *data); void ksu_persistent_allow_list(); -bool ksu_get_app_profile(struct app_profile *); +// should be called with rcu read lock +struct app_profile *ksu_get_app_profile(uid_t uid); +// only used to put the app_profile returned by ksu_get_app_profile +void ksu_put_app_profile(struct app_profile *); 
int ksu_set_app_profile(struct app_profile *); bool ksu_uid_should_umount(uid_t uid); -void ksu_get_root_profile(uid_t uid, struct root_profile *); +struct root_profile *ksu_get_root_profile(uid_t uid); +// only used to put the root_profile returned by ksu_get_root_profile +void ksu_put_root_profile(struct root_profile *); static inline bool is_appuid(uid_t uid) { diff --git a/drivers/kernelsu/policy/app_profile.c b/drivers/kernelsu/policy/app_profile.c index 380c6cb59d83..7abdbcdf8f40 100644 --- a/drivers/kernelsu/policy/app_profile.c +++ b/drivers/kernelsu/policy/app_profile.c @@ -48,13 +48,9 @@ static void setup_groups(struct root_profile *profile, struct cred *cred) put_group_info(group_info); } -static void disable_seccomp() -{ - -// for < 5.9 lets have free_task do it for us (put_seccomp_filter) -// we risk a double free / double decrement which isn't safe on old kernels -// I'm not even sure if this thing is needed on newer kernels #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) +static void disable_seccomp(void) +{ struct task_struct *fake; fake = kmalloc(sizeof(*fake), GFP_KERNEL); @@ -62,12 +58,10 @@ static void disable_seccomp() pr_warn("failed to alloc fake task_struct\n"); return; } -#endif // Refer to kernel/seccomp.c: seccomp_set_mode_strict // When disabling Seccomp, ensure that current->sighand->siglock is held during the operation. 
spin_lock_irq(¤t->sighand->siglock); - // disable seccomp #if defined(CONFIG_GENERIC_ENTRY) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) clear_syscall_work(SECCOMP); @@ -75,17 +69,13 @@ static void disable_seccomp() clear_thread_flag(TIF_SECCOMP); #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) memcpy(fake, current, sizeof(*fake)); - atomic_set(¤t->seccomp.filter_count, 0); -#endif current->seccomp.mode = 0; current->seccomp.filter = NULL; - + atomic_set(¤t->seccomp.filter_count, 0); spin_unlock_irq(¤t->sighand->siglock); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) // https://github.com/torvalds/linux/commit/bfafe5efa9754ebc991750da0bcca2a6694f3ed3#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R576-R577 fake->flags |= PF_EXITING; @@ -96,14 +86,31 @@ static void disable_seccomp() seccomp_filter_release(fake); kfree(fake); -#endif // 5.9 } +#else /* ! LINUX_VERSION_CODE < 5.9 */ +/* + * for < 5.9 lets have free_task do it for us (put_seccomp_filter) + * we risk a double free / double decrement which isn't safe on old kernels + * I'm not even sure if this thing is needed on newer kernels + * + */ +static void disable_seccomp(void) +{ + spin_lock_irq(¤t->sighand->siglock); + + clear_thread_flag(TIF_SECCOMP); + current->seccomp.mode = 0; + current->seccomp.filter = NULL; + + spin_unlock_irq(¤t->sighand->siglock); +} +#endif // 5.9 static int escape_to_root(bool is_forced) { int ret = 0; struct cred *cred; - struct root_profile profile; + struct root_profile *profile = NULL; struct user_struct *new_user; cred = prepare_creds(); @@ -117,20 +124,20 @@ static int escape_to_root(bool is_forced) goto out_abort_creds; } - ksu_get_root_profile(ksu_get_uid_t(cred->uid), &profile); + profile = ksu_get_root_profile(ksu_get_uid_t(cred->uid)); - ksu_get_uid_t(cred->uid) = profile.uid; - ksu_get_uid_t(cred->suid) = profile.uid; - ksu_get_uid_t(cred->euid) = profile.uid; - 
ksu_get_uid_t(cred->fsuid) = profile.uid; + ksu_get_uid_t(cred->uid) = profile->uid; + ksu_get_uid_t(cred->suid) = profile->uid; + ksu_get_uid_t(cred->euid) = profile->uid; + ksu_get_uid_t(cred->fsuid) = profile->uid; - ksu_get_uid_t(cred->gid) = profile.gid; - ksu_get_uid_t(cred->fsgid) = profile.gid; - ksu_get_uid_t(cred->sgid) = profile.gid; - ksu_get_uid_t(cred->egid) = profile.gid; + ksu_get_uid_t(cred->gid) = profile->gid; + ksu_get_uid_t(cred->fsgid) = profile->gid; + ksu_get_uid_t(cred->sgid) = profile->gid; + ksu_get_uid_t(cred->egid) = profile->gid; cred->securebits = 0; - BUILD_BUG_ON(sizeof(profile.capabilities.effective) != sizeof(kernel_cap_t)); + BUILD_BUG_ON(sizeof(profile->capabilities.effective) != sizeof(kernel_cap_t)); /* * Mirror the kernel set*uid path: update cred->user first, then @@ -164,23 +171,26 @@ static int escape_to_root(bool is_forced) // setup capabilities // we need CAP_DAC_READ_SEARCH becuase `/data/adb/ksud` is not accessible for non root process // we add it here but don't add it to cap_inhertiable, it would be dropped automaticly after exec! 
- u64 cap_for_ksud = profile.capabilities.effective | CAP_DAC_READ_SEARCH; + u64 cap_for_ksud = profile->capabilities.effective | CAP_DAC_READ_SEARCH; memcpy(&cred->cap_effective, &cap_for_ksud, sizeof(cred->cap_effective)); - memcpy(&cred->cap_permitted, &profile.capabilities.effective, sizeof(cred->cap_permitted)); - memcpy(&cred->cap_bset, &profile.capabilities.effective, sizeof(cred->cap_bset)); + memcpy(&cred->cap_permitted, &profile->capabilities.effective, sizeof(cred->cap_permitted)); + memcpy(&cred->cap_bset, &profile->capabilities.effective, sizeof(cred->cap_bset)); - setup_groups(&profile, cred); - setup_selinux(profile.selinux_domain, cred); + setup_groups(profile, cred); + setup_selinux(profile->selinux_domain, cred); commit_creds(cred); - if (!!current->seccomp.mode) + if (test_thread_flag(TIF_SECCOMP)) disable_seccomp(); - setup_mount_ns(profile.namespaces); + setup_mount_ns(profile->namespaces); + ksu_put_root_profile(profile); return 0; out_abort_creds: + if (profile) + ksu_put_root_profile(profile); abort_creds(cred); return ret; } diff --git a/drivers/kernelsu/runtime/ksud.c b/drivers/kernelsu/runtime/ksud.c index fa578e50a55a..c912dad30a42 100644 --- a/drivers/kernelsu/runtime/ksud.c +++ b/drivers/kernelsu/runtime/ksud.c @@ -1,12 +1,3 @@ -bool ksu_module_mounted __read_mostly = false; -bool ksu_boot_completed __read_mostly = false; - -#ifdef CONFIG_KSU_EXTRAS -extern void ksu_avc_spoof_late_init(); -#else -void ksu_avc_spoof_late_init() {} -#endif - static const char KERNEL_SU_RC[] = "\n" @@ -31,12 +22,24 @@ static const char KERNEL_SU_RC[] = "\n"; static void stop_vfs_read_hook(); -static void stop_execve_hook(); static void stop_input_hook(); -bool ksu_vfs_read_hook __read_mostly = true; -bool ksu_execveat_hook __read_mostly = true; -bool ksu_input_hook __read_mostly = true; +static bool ksu_module_mounted __read_mostly = false; +static bool ksu_boot_completed __read_mostly = false; +static bool ksu_vfs_read_hook __read_mostly = true; +static 
bool ksu_input_hook __read_mostly = true; + +#ifdef KSU_CAN_USE_JUMP_LABEL +DEFINE_STATIC_KEY_TRUE(ksud_vfs_read_key); +static inline void ksu_disable_vfs_read_branch() +{ + pr_info("vfs_read_hook: remove vfs_read branches\n"); + static_branch_disable(&ksud_vfs_read_key); + smp_mb(); +} +#else +static inline void ksu_disable_vfs_read_branch() { } // no-op +#endif void on_post_fs_data(void) { @@ -76,6 +79,12 @@ int nuke_ext4_sysfs(const char *mnt) return 0; } +#ifdef CONFIG_KSU_EXTRAS +extern void ksu_avc_spoof_late_init(); +#else +void ksu_avc_spoof_late_init() {} +#endif + void on_module_mounted(void) { pr_info("on_module_mounted!\n"); @@ -84,151 +93,14 @@ void on_module_mounted(void) void on_boot_completed(void) { + ksud_escape_exit(); + ksu_boot_completed = true; pr_info("on_boot_completed!\n"); track_throne(true); ksu_avc_spoof_late_init(); // slow_avc_init kp } -// since _ksud handler only uses argv and envp for comparisons -// this can probably work -// adapted from ksu_handle_execveat_ksud -static inline int ksu_handle_bprm_ksud(const char *filename, const char *argv1, const char *envp, size_t envp_len) -{ - static const char app_process[] = "/system/bin/app_process"; - static bool first_app_process = true; - - /* This applies to versions Android 10+ */ - static const char system_bin_init[] = "/system/bin/init"; - /* This applies to versions between Android 6 ~ 9 */ - static const char old_system_init[] = "/init"; - static bool init_second_stage_executed = false; - - // return early when disabled - if (!ksu_execveat_hook) - return 0; - - if (!filename) - return 0; - - // debug! remove me! 
- pr_info("%s: filename: %s argv1: %s envp_len: %zu\n", __func__, filename, argv1, envp_len); - - if (init_second_stage_executed) - goto first_app_process; - - // /system/bin/init with argv1 - if (!strcmp(filename, system_bin_init) && argv1 && !strcmp(argv1, "second_stage")) { - pr_info("%s: /system/bin/init second_stage executed\n", __func__); - init_second_stage_executed = true; - apply_kernelsu_rules(); - cache_sid(); - setup_ksu_cred(); - } - - // /init with argv1 - if (!strcmp(filename, old_system_init) && argv1 && !strcmp(argv1, "--second-stage")) { - pr_info("%s: /init --second-stage executed\n", __func__); - init_second_stage_executed = true; - apply_kernelsu_rules(); - cache_sid(); - setup_ksu_cred(); - } - - if (!envp || !envp_len) - goto first_app_process; - - if (init_second_stage_executed) - goto first_app_process; - - // /init without argv1/useless-argv1 but usable envp - // we don't check filename for this as we are a step late on bprm - // the envp we see is the one before it forks. 
- // we hunt for "INIT_SECOND_STAGE" - const char *envp_n = envp; - unsigned int envc = 1; - do { - if (IS_ENABLED(CONFIG_KSU_DEBUG)) - pr_info("%s: envp[%d]: %s\n", __func__, envc, envp_n); - - if (strstarts(envp_n, "INIT_SECOND_STAGE")) - break; - - envp_n += strlen(envp_n) + 1; - envc++; - } while (envp_n < envp + envp_len); - - if (!strcmp(envp_n, "INIT_SECOND_STAGE=1") || !strcmp(envp_n, "INIT_SECOND_STAGE=true") ) { - pr_info("%s: /init +envp: %s executed\n", __func__, envp_n); - init_second_stage_executed = true; - apply_kernelsu_rules(); - cache_sid(); - setup_ksu_cred(); - } - -first_app_process: - if (first_app_process && strstarts(filename, app_process)) { - first_app_process = false; - pr_info("%s: exec app_process, /data prepared, second_stage: %d\n", __func__, init_second_stage_executed); - on_post_fs_data(); - stop_execve_hook(); - } - - return 0; -} - -static noinline int ksu_handle_pre_ksud(const char *filename) -{ - if (likely(!ksu_execveat_hook)) - return 0; - - // not /system/bin/init, not /init, not /system/bin/app_process (64/32 thingy) - // return 0; - if (likely(strcmp(filename, "/system/bin/init") && strcmp(filename, "/init") - && !strstarts(filename, "/system/bin/app_process") )) - return 0; - - if (!current || !current->mm) - return 0; - - // https://elixir.bootlin.com/linux/v4.14.1/source/include/linux/mm_types.h#L429 - // unsigned long arg_start, arg_end, env_start, env_end; - unsigned long arg_start = current->mm->arg_start; - unsigned long arg_end = current->mm->arg_end; - unsigned long env_start = current->mm->env_start; - unsigned long env_end = current->mm->env_end; - - size_t arg_len = arg_end - arg_start; - size_t envp_len = env_end - env_start; - - if (arg_len <= 0 || envp_len <= 0) // this wont make sense, filter it - return 0; - -#define ARGV_MAX 32 -#define ENVP_MAX 256 - char args[ARGV_MAX]; - char envp[ENVP_MAX]; - size_t argv_copy_len = (arg_len > ARGV_MAX) ? 
ARGV_MAX : arg_len; - size_t envp_copy_len = (envp_len > ENVP_MAX) ? ENVP_MAX : envp_len; - - // we cant use strncpy on here, else it will truncate once it sees \0 - if (ksu_copy_from_user_retry(args, (void __user *)arg_start, argv_copy_len)) - return 0; - - if (ksu_copy_from_user_retry(envp, (void __user *)env_start, envp_copy_len)) - return 0; - - args[ARGV_MAX - 1] = '\0'; - envp[ENVP_MAX - 1] = '\0'; - - // we only need argv1 ! - char *argv1 = args + strlen(args) + 1; - if (argv1 >= args + argv_copy_len) // out of bounds! - argv1 = ""; - - return ksu_handle_bprm_ksud(filename, argv1, envp, envp_copy_len); -} - static ssize_t (*orig_read)(struct file *, char __user *, size_t, loff_t *); static ssize_t (*orig_read_iter)(struct kiocb *, struct iov_iter *); static struct file_operations fops_proxy; @@ -340,9 +212,6 @@ static bool is_init_rc(struct file *fp) __attribute__((cold)) static noinline void ksu_install_rc_hook(struct file *file) { - if (likely(!ksu_vfs_read_hook)) - return; - if (!is_init(current_cred())) return; @@ -359,6 +228,14 @@ static noinline void ksu_install_rc_hook(struct file *file) } rc_hooked = true; + // since we already have domains, selinux is initialized, we can apply rules and shit + // https://github.com/LineageOS/android_system_core_old/blob/ecbcdafc3/init/init.cpp#L669 + pr_info("%s: init.rc second stage, fp: 0x%lx \n", __func__, (uintptr_t)file); + apply_kernelsu_rules(); + cache_sid(); + setup_ksu_cred(); + ksu_grab_init_session_keyring(); + // now we can sure that the init process is reading // `/system/etc/init/init.rc` @@ -443,9 +320,19 @@ static noinline void ksu_common_newfstat_ret(unsigned int fd_int, void **statbuf } #endif - if (copy_from_user(&size, st_size_ptr, len)) { + // we do this for kretprobe's reusability + // this is pretty short, so nbd + bool got_flipped = false; + if (!preemptible()) { + preempt_enable(); + got_flipped = true; + } + int old_nice = task_nice(current); + set_user_nice(current, -20); + + if 
(ksu_copy_from_user_retry(&size, st_size_ptr, len)) { pr_info("%s: read statbuf 0x%lx failed \n", syscall_name, (unsigned long)st_size_ptr); - return; + goto out; } new_size = size + ksu_rc_len; @@ -456,26 +343,35 @@ static noinline void ksu_common_newfstat_ret(unsigned int fd_int, void **statbuf else pr_info("%s: add ksu_rc_len failed: statbuf 0x%lx \n", syscall_name, (unsigned long)st_size_ptr); +out: + set_user_nice(current, old_nice); + if (got_flipped) + preempt_disable(); + return; } void ksu_handle_newfstat_ret(unsigned int *fd, struct stat __user **statbuf_ptr) { - if (likely(!ksu_vfs_read_hook)) - return; - - ksu_common_newfstat_ret(*fd, (void **)statbuf_ptr, STAT_NATIVE, "sys_newfstat"); +#ifdef KSU_CAN_USE_JUMP_LABEL + if (static_branch_likely(&ksud_vfs_read_key)) + ksu_common_newfstat_ret(*fd, (void **)statbuf_ptr, STAT_NATIVE, "sys_newfstat"); +#else + if (unlikely(ksu_vfs_read_hook)) + ksu_common_newfstat_ret(*fd, (void **)statbuf_ptr, STAT_NATIVE, "sys_newfstat"); +#endif } #if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) void ksu_handle_fstat64_ret(unsigned long *fd, struct stat64 __user **statbuf_ptr) { - - if (likely(!ksu_vfs_read_hook)) - return; - - // WARNING: LE-only!!! - ksu_common_newfstat_ret(*(unsigned int *)fd, (void **)statbuf_ptr, STAT_STAT64, "sys_fstat64"); +#ifdef KSU_CAN_USE_JUMP_LABEL + if (static_branch_likely(&ksud_vfs_read_key)) + ksu_common_newfstat_ret(*(unsigned int *)fd, (void **)statbuf_ptr, STAT_STAT64, "sys_fstat64"); // WARNING: LE-only!!! +#else + if (unlikely(ksu_vfs_read_hook)) + ksu_common_newfstat_ret(*(unsigned int *)fd, (void **)statbuf_ptr, STAT_STAT64, "sys_fstat64"); // WARNING: LE-only!!! 
+#endif } #endif @@ -614,16 +510,50 @@ static int vol_detector_exit() return 0; } +// we do this so that if theres no ksud to call on_post_fs_data/ksu_is_safe_mode/on_boot_completed +// there will be no input handler / extra execve branch that stays around +// 60s is more than enough time from second_stage to decrypt/post_fs_data +// if theres no ksud that does that, we trigger the closing of hooks ourselves +static int ksu_hook_watchdog(void *data) +{ + unsigned int i = 0; + + set_user_nice(current, 19); // low prio + pr_info("%s: kthread init!\n", __func__); + +start: + if (!*(volatile bool *)&ksu_input_hook) + goto bail; + + msleep(5000); + + i++; + + if (i < 12) + goto start; + + // if this path gets triggerred, it means theres no ksud + pr_info("%s: ksud probably absent, closing hooks!\n", __func__); + + // close down input hook + stop_input_hook(); + + // close down ksud escape + ksud_escape_exit(); + ksu_boot_completed = true; + +bail: + pr_info("%s: kthread exit!\n", __func__); + return 0; +} + static void stop_vfs_read_hook() { ksu_vfs_read_hook = false; pr_info("stop vfs_read_hook\n"); -} + ksu_disable_vfs_read_branch(); -static void stop_execve_hook() -{ - ksu_execveat_hook = false; - pr_info("stop execve_hook\n"); + kthread_run(ksu_hook_watchdog, NULL, "watchdog"); } static void stop_input_hook() @@ -637,6 +567,7 @@ static void stop_input_hook() void __init ksu_ksud_init() { + ksud_escape_init(); vol_detector_init(); } diff --git a/drivers/kernelsu/runtime/ksud.h b/drivers/kernelsu/runtime/ksud.h index 28e00fea44c6..4461843407c3 100644 --- a/drivers/kernelsu/runtime/ksud.h +++ b/drivers/kernelsu/runtime/ksud.h @@ -14,14 +14,13 @@ bool ksu_is_safe_mode(void); int nuke_ext4_sysfs(const char* mnt); -bool ksu_execveat_hook __read_mostly; -static noinline int ksu_handle_pre_ksud(const char *filename); - -bool ksu_vfs_read_hook __read_mostly; static noinline void ksu_install_rc_hook(struct file *file); extern u32 ksu_file_sid; -extern bool ksu_module_mounted; 
-extern bool ksu_boot_completed; + +static bool ksu_module_mounted __read_mostly; +static bool ksu_boot_completed __read_mostly; +static bool ksu_vfs_read_hook __read_mostly; +static bool ksu_input_hook __read_mostly; #endif diff --git a/drivers/kernelsu/runtime/ksud_escape.c b/drivers/kernelsu/runtime/ksud_escape.c new file mode 100644 index 000000000000..974d5859eece --- /dev/null +++ b/drivers/kernelsu/runtime/ksud_escape.c @@ -0,0 +1,213 @@ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) +#if defined(CONFIG_KRETPROBES) +#include +static u32 cached_su_sid __read_mostly; +static u32 cached_init_sid __read_mostly; + +// int security_bounded_transition(u32 old_sid, u32 new_sid) +static int bounded_transition_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + // grab sids on entry + u32 *sid = (u32 *)ri->data; + sid[0] = PT_REGS_PARM1(regs); // old_sid + sid[1] = PT_REGS_PARM2(regs); // new_sid + + return 0; +} + +static int bounded_transition_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + u32 *sid = (u32 *)ri->data; + u32 old_sid = sid[0]; + u32 new_sid = sid[1]; + + if (!cached_su_sid) + return 0; + + // so if old sid is 'init' and trying to transition to a new sid of 'ksu' + // force the function to return 0 + if (old_sid == cached_init_sid && new_sid == cached_su_sid) { + pr_info("security_bounded_transition: allowing init (%d) -> ksu (%d) \n", old_sid, new_sid); + PT_REGS_RC(regs) = 0; // make the original func return 0 + } + + return 0; +} + +static struct kretprobe bounded_transition_rp = { + .kp.symbol_name = "security_bounded_transition", + .handler = bounded_transition_ret_handler, + .entry_handler = bounded_transition_entry_handler, + .data_size = sizeof(u32) * 2, // need to keep 2x u32's, one per sid + .maxactive = 20, +}; + +static int kp_ksud_transition_unregister(void *data) +{ + msleep(1000); + + unregister_kretprobe(&bounded_transition_rp); + 
pr_info("kp_ksud: unregister rp: security_bounded_transition\n"); + return 0; +} + +static void kp_ksud_transition_routine_start() +{ + static bool already_ran = false; + if (already_ran) + return; + + int ret = register_kretprobe(&bounded_transition_rp); + pr_info("kp_ksud: register rp: security_bounded_transition ret: %d\n", ret); + + already_ran = true; +} +#else +__attribute__((cold)) static noinline void sys_execve_escape_ksud_internal(void *filename) +{ +#ifdef KSU_CAN_USE_JUMP_LABEL + if (ksu_boot_completed) { + pr_info("sys_execve: boot completed, remove escape branch\n"); + static_branch_disable(&ksud_escape_key); + smp_mb(); + return; + } +#endif + + // see if its init + if (!is_init(current_cred())) + return; + + const char ksud_path[] = KSUD_PATH; + char path[sizeof(ksud_path)]; + + // filename is void * char __user * + const char __user **filename_user = (const char __user **)filename; + + // see if its trying to execute ksud + if (ksu_copy_from_user_retry(path, *filename_user, sizeof(path))) + return; + + if (likely(!!memcmp(ksud_path, path, sizeof(path)))) + return; + + pr_info("sys_execve: escape init executing %s with pid: %d\n", path, current->pid); + escape_to_root_forced(); // give this context all permissions + return; +} + +__attribute__((cold)) static noinline void kernel_execve_escape_ksud_internal(void *filename) +{ +#ifdef KSU_CAN_USE_JUMP_LABEL + if (ksu_boot_completed) { + pr_info("kernel_execve: boot completed, remove escape branch\n"); + static_branch_disable(&ksud_escape_key); + smp_mb(); + return; + } +#endif + // filename is void ** + void **filename_ptr = (void **)filename; + + // see if its init + if (!is_init(current_cred())) + return; + + if (!*filename_ptr) + return; + + if (likely(!!memcmp(*filename_ptr, KSUD_PATH, sizeof(KSUD_PATH)))) + return; + + pr_info("kernel_execve: escape init executing %s with pid: %d\n", *(const char **)filename_ptr, current->pid); + escape_to_root_forced(); // give this context all permissions + 
return; +} +#endif // KRETPROBES +#endif // < 4.14 && >= 4.2 + +// UL bprm_set_creds handling +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0) +static uintptr_t selinux_ops_addr; +static int (*orig_bprm_set_creds)(struct linux_binprm *bprm) = NULL; + +static int ksu_unregister_bprm_set_creds(void *data) +{ + struct security_operations *ops = (struct security_operations *)selinux_ops_addr; + if (orig_bprm_set_creds) { + pr_info("%s: restoring: bprm_set_creds 0x%lx -> 0x%lx\n", __func__, (long)ops->bprm_set_creds, (long)orig_bprm_set_creds); + ops->bprm_set_creds = orig_bprm_set_creds; + } + + return 0; +} + +static int hook_bprm_set_creds(struct linux_binprm *bprm) +{ + if (ksu_boot_completed) + goto unreg_bprm_set_creds; + + if (!is_init(current_cred())) + goto bprm_set_creds; + + if (!bprm->filename) + goto bprm_set_creds; + + if (!!strcmp(bprm->filename, "/data/adb/ksud")) + goto bprm_set_creds; + + struct task_security_struct *old_tsec = current_security(); + struct task_security_struct *new_tsec = bprm->cred->security; + + if (!(old_tsec->exec_sid)) + goto bprm_set_creds; + + // we copy what selinux was doing + // ref: https://elixir.bootlin.com/linux/v3.0.101/source/security/selinux/hooks.c#L1971 + + /* Default to the current task SID. */ + new_tsec->sid = old_tsec->sid; + new_tsec->osid = old_tsec->sid; + + /* Reset fs, key, and sock SIDs on execve. */ + new_tsec->create_sid = 0; + new_tsec->keycreate_sid = 0; + new_tsec->sockcreate_sid = 0; + + new_tsec->sid = old_tsec->exec_sid; + /* Reset exec SID on execve. 
*/ + new_tsec->exec_sid = 0; + + pr_info("bprm_set_creds: allow init executing %s with pid: %d\n", bprm->filename, current->pid); + return 0; + +unreg_bprm_set_creds: + stop_machine(ksu_unregister_bprm_set_creds, NULL, NULL); + +bprm_set_creds: + return orig_bprm_set_creds(bprm); + + +} +#endif + +static void ksud_escape_init() +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) && defined(CONFIG_KRETPROBES) + kp_ksud_transition_routine_start(); +#endif +} + +static void ksud_escape_exit() +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) && defined(CONFIG_KRETPROBES) + static bool already_ran = false; + if (already_ran) + return; + + already_ran = true; + + kthread_run(kp_ksud_transition_unregister, NULL, "rp_unhook"); +#endif + +} diff --git a/drivers/kernelsu/runtime/ksud_escape.h b/drivers/kernelsu/runtime/ksud_escape.h new file mode 100644 index 000000000000..13ba5b9a5145 --- /dev/null +++ b/drivers/kernelsu/runtime/ksud_escape.h @@ -0,0 +1,41 @@ +#ifndef __KSU_H_KSUD_ESCAPE +#define __KSU_H_KSUD_ESCAPE + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) && !defined(CONFIG_KRETPROBES) +__attribute__((cold)) static noinline void sys_execve_escape_ksud_internal(void *filename); +__attribute__((cold)) static noinline void kernel_execve_escape_ksud_internal(void *filename); + +#ifdef KSU_CAN_USE_JUMP_LABEL +DEFINE_STATIC_KEY_TRUE(ksud_escape_key); +static inline void sys_execve_escape_ksud(void *filename) +{ + if (static_branch_likely(&ksud_escape_key)) + sys_execve_escape_ksud_internal(filename); +} +static inline void kernel_execve_escape_ksud(void *filename) +{ + if (static_branch_likely(&ksud_escape_key)) + kernel_execve_escape_ksud_internal(filename); +} +#else +static inline void sys_execve_escape_ksud(void *filename) +{ + if (unlikely(!ksu_boot_completed)) + sys_execve_escape_ksud_internal(filename); +} 
+static inline void kernel_execve_escape_ksud(void *filename) +{ + if (unlikely(!ksu_boot_completed)) + kernel_execve_escape_ksud_internal(filename); +} +#endif + +#else +static inline void sys_execve_escape_ksud(void *filename) { } // no-op +static inline void kernel_execve_escape_ksud(void *filename) { } // no-op +#endif // < 4.14 && >= 4.2 && !KRETPROBES + +static void ksud_escape_init(); +static void ksud_escape_exit(); + +#endif // __KSU_H_KSUD_ESCAPE diff --git a/drivers/kernelsu/selinux/rules.c b/drivers/kernelsu/selinux/rules.c index 1e3f7db26150..30e19b7236f7 100644 --- a/drivers/kernelsu/selinux/rules.c +++ b/drivers/kernelsu/selinux/rules.c @@ -38,12 +38,29 @@ static struct policydb *get_policydb(void) { return &policydb; } static inline rwlock_t *ksu_get_policy_rwlock() { return &selinux_state.ss->policy_rwlock; } #elif defined(KSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK) static inline rwlock_t *ksu_get_policy_rwlock() { extern rwlock_t policy_rwlock; return &policy_rwlock; } +#elif defined(CONFIG_KALLSYMS) +static noinline rwlock_t *ksu_get_policy_rwlock() +{ + static bool already_ran = false; + + static rwlock_t *policy_rwlock_ksym = NULL; + + if (likely(already_ran)) + return policy_rwlock_ksym; + + policy_rwlock_ksym = (rwlock_t *)kallsyms_lookup_name("policy_rwlock"); + if (policy_rwlock_ksym) + pr_info("apply_kernelsu_rules: policy_rwlock: 0x%lx via ksym\n", (uintptr_t)policy_rwlock_ksym); + + already_ran = true; + return policy_rwlock_ksym; +} #else static inline rwlock_t *ksu_get_policy_rwlock() { return NULL; } #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) || defined(KSU_COMPAT_HAS_BACKPORTED_CPUS_PTR) -static inline cpumask_t *ksu_get_current_cpumask_t() { return current->cpus_ptr; } +static inline const cpumask_t *ksu_get_current_cpumask_t() { return current->cpus_ptr; } #else static inline cpumask_t *ksu_get_current_cpumask_t() { return ¤t->cpus_allowed; } #endif @@ -156,7 +173,6 @@ void apply_kernelsu_rules() 
mutex_unlock(&selinux_state.policy_mutex); #else - cpumask_t old_mask; db = get_policydb(); rwlock_t *lock = ksu_get_policy_rwlock(); @@ -169,6 +185,7 @@ void apply_kernelsu_rules() * set_cpus_allowed_ptr() can sleep, use raw_smp_processor_id() to get * current CPU and bypass preemption checks. */ + cpumask_t old_mask; cpumask_copy(&old_mask, ksu_get_current_cpumask_t()); set_cpus_allowed_ptr(current, cpumask_of(raw_smp_processor_id())); @@ -176,25 +193,8 @@ void apply_kernelsu_rules() write_lock(lock); preempt_enable(); - // we do this dance since both kernel and userspace can trigger this - if (likely(current && current->mm)) - goto has_current_mm; - apply_kernelsu_rules_fn((void *)db); - goto out_unlock; - -has_current_mm: - ; - // HACK: raise priority of this to the heavens - int old_policy = current->policy; - struct sched_param old_param = { .sched_priority = current->rt_priority }; - struct sched_param new_param = { .sched_priority = 50 }; - sched_setscheduler_nocheck(current, 1, &new_param); // raise, fifo, 50 - apply_kernelsu_rules_fn((void *)db); - sched_setscheduler_nocheck(current, old_policy, &old_param); // restore - -out_unlock: preempt_disable(); write_unlock(lock); set_cpus_allowed_ptr(current, &old_mask); @@ -649,7 +649,6 @@ int handle_sepolicy(void __user *user_data, u64 data_len) u8 *payload; int ret = 0; int success_cmd_count = 0; - cpumask_t old_mask; if (!user_data || !data_len) return -EINVAL; @@ -679,35 +678,15 @@ int handle_sepolicy(void __user *user_data, u64 data_len) if (!lock) goto do_stop_machine; - /* - * HACK: write_lock() is held with preempt enabled. DO NOT let the - * task be migrated to any other CPU than the current CPU. And since - * set_cpus_allowed_ptr() can sleep, use raw_smp_processor_id() to get - * current CPU and bypass preemption checks. 
- */ + cpumask_t old_mask; cpumask_copy(&old_mask, ksu_get_current_cpumask_t()); set_cpus_allowed_ptr(current, cpumask_of(raw_smp_processor_id())); write_lock(lock); preempt_enable(); - if (likely(current && current->mm)) - goto has_current_mm; - ret = handle_sepolicy_fn((void *)&ctx); - goto out_unlock; - -has_current_mm: - ; - int old_policy = current->policy; - struct sched_param old_param = { .sched_priority = current->rt_priority }; - struct sched_param new_param = { .sched_priority = 50 }; - sched_setscheduler_nocheck(current, 1, &new_param); - ret = handle_sepolicy_fn((void *)&ctx); - sched_setscheduler_nocheck(current, old_policy, &old_param); - -out_unlock: preempt_disable(); write_unlock(lock); set_cpus_allowed_ptr(current, &old_mask); diff --git a/drivers/kernelsu/selinux/sepolicy.c b/drivers/kernelsu/selinux/sepolicy.c index 45e32b8e780d..9593d8c7fa83 100644 --- a/drivers/kernelsu/selinux/sepolicy.c +++ b/drivers/kernelsu/selinux/sepolicy.c @@ -18,45 +18,38 @@ // Declaration ////////////////////////////////////////////////////// -static struct avtab_node *get_avtab_node(struct policydb *db, - struct avtab_key *key, - struct avtab_extended_perms *xperms); +static struct avtab_node *get_avtab_node(struct policydb *db, struct avtab_key *key, + struct avtab_extended_perms *xperms); -static bool add_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *p, int effect, bool invert); +static bool is_redundant_avtab_node(struct avtab_node *node); -static void add_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - struct perm_datum *perm, int effect, bool invert); +static bool remove_avtab_node(struct policydb *db, struct avtab_node *node); -static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - uint16_t low, uint16_t high, int effect, - bool invert); -static bool add_xperm_rule(struct policydb *db, const 
char *s, const char *t, - const char *c, const char *range, int effect, - bool invert); +static bool add_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *p, int effect, + bool invert); -static bool add_type_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *d, int effect); +static bool add_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, struct class_datum *cls, + struct perm_datum *perm, int effect, bool invert); -static bool add_filename_trans(struct policydb *db, const char *s, - const char *t, const char *c, const char *d, - const char *o); +static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, + struct class_datum *cls, uint16_t low, uint16_t high, int effect, bool invert); +static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *range, + int effect, bool invert); -static bool add_genfscon(struct policydb *db, const char *fs_name, - const char *path, const char *context); +static bool add_type_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *d, int effect); + +static bool add_filename_trans(struct policydb *db, const char *s, const char *t, const char *c, const char *d, + const char *o); + +static bool add_genfscon(struct policydb *db, const char *fs_name, const char *path, const char *context); static bool add_type(struct policydb *db, const char *type_name, bool attr); -static bool set_type_state(struct policydb *db, const char *type_name, - bool permissive); +static bool set_type_state(struct policydb *db, const char *type_name, bool permissive); -static void add_typeattribute_raw(struct policydb *db, struct type_datum *type, - struct type_datum *attr); +static void add_typeattribute_raw(struct policydb *db, struct type_datum *type, struct type_datum *attr); -static bool add_typeattribute(struct policydb *db, const char *type, - const 
char *attr); +static bool add_typeattribute(struct policydb *db, const char *type, const char *attr); ////////////////////////////////////////////////////// // Implementation @@ -74,11 +67,9 @@ static bool add_typeattribute(struct policydb *db, const char *type, // htable is a struct instead of pointer above 5.8.0: // https://elixir.bootlin.com/linux/v5.8-rc1/source/security/selinux/ss/symtab.h #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) -#define ksu_hashtab_for_each(htab, cur) \ - ksu_hash_for_each(htab.htable, htab.size, cur) +#define ksu_hashtab_for_each(htab, cur) ksu_hash_for_each(htab.htable, htab.size, cur) #else -#define ksu_hashtab_for_each(htab, cur) \ - ksu_hash_for_each(htab->htable, htab->size, cur) +#define ksu_hashtab_for_each(htab, cur) ksu_hash_for_each(htab->htable, htab->size, cur) #endif // symtab_search is introduced on 5.9.0: @@ -88,8 +79,7 @@ static bool add_typeattribute(struct policydb *db, const char *type, #define symtab_insert(s, name, datum) hashtab_insert((s)->table, name, datum) #endif -#define avtab_for_each(avtab, cur) \ - ksu_hash_for_each(avtab.htable, avtab.nslot, cur); +#define avtab_for_each(avtab, cur) ksu_hash_for_each(avtab.htable, avtab.nslot, cur); static struct avtab_node *get_avtab_node(struct policydb *db, struct avtab_key *key, @@ -130,6 +120,8 @@ static struct avtab_node *get_avtab_node(struct policydb *db, } /* this is used to get the node - insertion is actually unique */ node = avtab_insert_nonunique(&db->te_avtab, key, &avdatum); + if (!node) + return NULL; int grow_size = sizeof(struct avtab_key); grow_size += sizeof(struct avtab_datum); @@ -145,8 +137,93 @@ static struct avtab_node *get_avtab_node(struct policydb *db, return node; } -static bool add_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *p, int effect, bool invert) +static bool is_redundant_avtab_node(struct avtab_node *node) +{ + if (node->key.specified & AVTAB_XPERMS) + return node->datum.u.xperms == NULL; + 
if (!(node->key.specified & AVTAB_AV)) + return false; + if (node->key.specified & AVTAB_AUDITDENY) + return node->datum.u.data == ~0U; + return node->datum.u.data == 0U; +} + +// 4.1, https://github.com/torvalds/linux/commit/ba39db6e0519aa8362dbda6523ceb69349a18dc3 +// 5.1, https://github.com/torvalds/linux/commit/acdf52d97f824019888422842757013b37441dd1 +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) || LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) || defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY) +static inline struct avtab_node *avtab_get_slot(struct avtab *ab, int i) +{ + // htable is ** + // struct avtab_node **htable; + return ab->htable[i]; +} +static inline void avtab_set_slot(struct avtab *ab, int i, struct avtab_node *node) +{ + ab->htable[i] = node; +} +#else +static inline struct avtab_node *avtab_get_slot(struct avtab *ab, int i) +{ + // htable is ** + // this can ret NULL! + struct avtab_node **p = flex_array_get(ab->htable, i); + if (!p) + return NULL; + + return *p; +} +static inline void avtab_set_slot(struct avtab *ab, int i, struct avtab_node *node) +{ + flex_array_put_ptr(ab->htable, i, node, GFP_KERNEL | __GFP_ZERO); +} +#endif + +static bool remove_avtab_node(struct policydb *db, struct avtab_node *node) +{ + int i; + int ret; + int shrink_size = sizeof(struct avtab_key) + sizeof(struct avtab_datum); + struct avtab removed = {}; + struct avtab_node *n; + struct avtab_node *prev; + + ret = avtab_alloc(&removed, 1); + if (ret < 0) + return false; + + for (i = 0; i < db->te_avtab.nslot; i++) { + prev = NULL; + for (n = avtab_get_slot(&db->te_avtab, i); n; prev = n, n = n->next) { + if (n != node) + continue; + + if (prev) + prev->next = n->next; + else + avtab_set_slot(&db->te_avtab, i, n->next); + + if (db->te_avtab.nel > 0) + db->te_avtab.nel--; + + if ((n->key.specified & AVTAB_XPERMS) && n->datum.u.xperms) { + shrink_size += sizeof(u8) + sizeof(u8) + sizeof(u32) * ARRAY_SIZE(n->datum.u.xperms->perms.p); + } + 
n->next = NULL; + avtab_set_slot(&removed, 0, n); + removed.nel = 1; + avtab_destroy(&removed); + if (db->len >= shrink_size) + db->len -= shrink_size; + return true; + } + } + + avtab_destroy(&removed); + return false; +} + +static bool add_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *p, int effect, + bool invert) { struct type_datum *src = NULL, *tgt = NULL; struct class_datum *cls = NULL; @@ -192,31 +269,27 @@ static bool add_rule(struct policydb *db, const char *s, const char *t, return false; } } - add_rule_raw(db, src, tgt, cls, perm, effect, invert); - return true; + return add_rule_raw(db, src, tgt, cls, perm, effect, invert); } -static void add_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - struct perm_datum *perm, int effect, bool invert) +static bool add_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, struct class_datum *cls, + struct perm_datum *perm, int effect, bool invert) { + bool success = true; + if (src == NULL) { struct hashtab_node *node; if (strip_av(effect, invert)) { ksu_hashtab_for_each(db->p_types.table, node) { - add_rule_raw(db, - (struct type_datum *)node->datum, - tgt, cls, perm, effect, invert); + success &= add_rule_raw(db, (struct type_datum *)node->datum, tgt, cls, perm, effect, invert); }; } else { ksu_hashtab_for_each(db->p_types.table, node) { - struct type_datum *type = - (struct type_datum *)(node->datum); + struct type_datum *type = (struct type_datum *)(node->datum); if (type->attribute) { - add_rule_raw(db, type, tgt, cls, perm, - effect, invert); + success &= add_rule_raw(db, type, tgt, cls, perm, effect, invert); } }; } @@ -225,18 +298,14 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src, if (strip_av(effect, invert)) { ksu_hashtab_for_each(db->p_types.table, node) { - add_rule_raw(db, src, - (struct type_datum *)node->datum, - cls, perm, effect, invert); + success &= 
add_rule_raw(db, src, (struct type_datum *)node->datum, cls, perm, effect, invert); }; } else { ksu_hashtab_for_each(db->p_types.table, node) { - struct type_datum *type = - (struct type_datum *)(node->datum); + struct type_datum *type = (struct type_datum *)(node->datum); if (type->attribute) { - add_rule_raw(db, src, type, cls, perm, - effect, invert); + success &= add_rule_raw(db, src, type, cls, perm, effect, invert); } }; } @@ -244,22 +313,30 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src, struct hashtab_node *node; ksu_hashtab_for_each(db->p_classes.table, node) { - add_rule_raw(db, src, tgt, - (struct class_datum *)node->datum, perm, - effect, invert); + success &= add_rule_raw(db, src, tgt, (struct class_datum *)node->datum, perm, effect, invert); } } else { struct avtab_key key; + struct avtab_node *node; + key.source_type = src->value; key.target_type = tgt->value; key.target_class = cls->value; key.specified = effect; - struct avtab_node *node = get_avtab_node(db, &key, NULL); + if (invert && effect != AVTAB_AUDITDENY) { + node = avtab_search_node(&db->te_avtab, &key); + if (!node) + return true; + } else { + node = get_avtab_node(db, &key, NULL); + if (!node) + return false; + } + if (invert) { if (perm) - node->datum.u.data &= - ~(1U << (perm->value - 1)); + node->datum.u.data &= ~(1U << (perm->value - 1)); else node->datum.u.data = 0U; } else { @@ -268,7 +345,11 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src, else node->datum.u.data = ~0U; } + if (is_redundant_avtab_node(node)) + return remove_avtab_node(db, node); } + + return success; } #define ioctl_driver(x) (x >> 8 & 0xFF) @@ -278,40 +359,32 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src, #define xperm_set(x, p) (p[x >> 5] |= (1 << (x & 0x1f))) #define xperm_clear(x, p) (p[x >> 5] &= ~(1 << (x & 0x1f))) -static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum 
*cls, - uint16_t low, uint16_t high, int effect, - bool invert) +static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, + struct class_datum *cls, uint16_t low, uint16_t high, int effect, bool invert) { if (src == NULL) { struct hashtab_node *node; ksu_hashtab_for_each(db->p_types.table, node) { - struct type_datum *type = - (struct type_datum *)(node->datum); + struct type_datum *type = (struct type_datum *)(node->datum); if (type->attribute) { - add_xperm_rule_raw(db, type, tgt, cls, low, - high, effect, invert); + add_xperm_rule_raw(db, type, tgt, cls, low, high, effect, invert); } }; } else if (tgt == NULL) { struct hashtab_node *node; ksu_hashtab_for_each(db->p_types.table, node) { - struct type_datum *type = - (struct type_datum *)(node->datum); + struct type_datum *type = (struct type_datum *)(node->datum); if (type->attribute) { - add_xperm_rule_raw(db, src, type, cls, low, - high, effect, invert); + add_xperm_rule_raw(db, src, type, cls, low, high, effect, invert); } }; } else if (cls == NULL) { struct hashtab_node *node; ksu_hashtab_for_each(db->p_classes.table, node) { - add_xperm_rule_raw(db, src, tgt, - (struct class_datum *)(node->datum), - low, high, effect, invert); + add_xperm_rule_raw(db, src, tgt, (struct class_datum *)(node->datum), low, high, effect, invert); }; } else { struct avtab_key key; @@ -334,8 +407,7 @@ static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, } int i; if (xperms.specified == AVTAB_XPERMS_IOCTLDRIVER) { - for (i = ioctl_driver(low); i <= ioctl_driver(high); - ++i) { + for (i = ioctl_driver(low); i <= ioctl_driver(high); ++i) { if (invert) xperm_clear(i, xperms.perms.p); else @@ -358,9 +430,7 @@ static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, datum = &node->datum; if (datum->u.xperms == NULL) { - datum->u.xperms = - (struct avtab_extended_perms *)(kzalloc( - sizeof(xperms), GFP_KERNEL)); + datum->u.xperms = (struct 
avtab_extended_perms *)(kzalloc(sizeof(xperms), GFP_ATOMIC)); if (!datum->u.xperms) { pr_err("alloc xperms failed\n"); return; @@ -370,9 +440,8 @@ static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, } } -static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *range, int effect, - bool invert) +static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *range, + int effect, bool invert) { struct type_datum *src = NULL, *tgt = NULL; struct class_datum *cls = NULL; @@ -419,8 +488,7 @@ static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, return true; } -static bool add_type_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *d, int effect) +static bool add_type_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *d, int effect) { struct type_datum *src, *tgt, *def; struct class_datum *cls; @@ -453,6 +521,8 @@ static bool add_type_rule(struct policydb *db, const char *s, const char *t, key.specified = effect; struct avtab_node *node = get_avtab_node(db, &key, NULL); + if (!node) + return false; node->datum.u.data = def->value; return true; @@ -537,11 +607,9 @@ static bool add_filename_trans(struct policydb *db, const char *s, struct filename_trans_datum *last = NULL; #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) - struct filename_trans_datum *trans = - policydb_filenametr_search(db, &key); + struct filename_trans_datum *trans = policydb_filenametr_search(db, &key); #else - struct filename_trans_datum *trans = - hashtab_search(&db->filename_trans, &key); + struct filename_trans_datum *trans = hashtab_search(&db->filename_trans, &key); #endif while (trans) { if (ebitmap_get_bit(&trans->stypes, src->value - 1)) { @@ -556,17 +624,13 @@ static bool add_filename_trans(struct policydb *db, const char *s, } if (trans == NULL) { - trans = (struct filename_trans_datum 
*)kcalloc(1, sizeof(*trans), - GFP_KERNEL); - struct filename_trans_key *new_key = - (struct filename_trans_key *)kzalloc(sizeof(*new_key), - GFP_KERNEL); + trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans), GFP_KERNEL); + struct filename_trans_key *new_key = (struct filename_trans_key *)kzalloc(sizeof(*new_key), GFP_KERNEL); *new_key = key; new_key->name = kstrdup(key.name, GFP_KERNEL); trans->next = last; trans->otype = def->value; - hashtab_insert(&db->filename_trans, new_key, trans, - filenametr_key_params); + hashtab_insert(&db->filename_trans, new_key, trans, filenametr_key_params); } db->compat_filename_trans_count++; @@ -582,15 +646,13 @@ static bool add_filename_trans(struct policydb *db, const char *s, hashtab_search(db->filename_trans, &key); if (trans == NULL) { - trans = (struct filename_trans_datum *)kcalloc(sizeof(*trans), - 1, GFP_KERNEL); + trans = (struct filename_trans_datum *)kcalloc(sizeof(*trans), 1, GFP_KERNEL); if (!trans) { pr_err("add_filename_trans: Failed to alloc datum\n"); return false; } struct filename_trans *new_key = - (struct filename_trans *)kmalloc(sizeof(*new_key), - GFP_KERNEL); + (struct filename_trans *)kmalloc(sizeof(*new_key), GFP_KERNEL); if (!new_key) { pr_err("add_filename_trans: Failed to alloc new_key\n"); return false; @@ -601,13 +663,11 @@ static bool add_filename_trans(struct policydb *db, const char *s, hashtab_insert(db->filename_trans, new_key, trans); } - return ebitmap_set_bit(&db->filename_trans_ttypes, src->value - 1, 1) == - 0; + return ebitmap_set_bit(&db->filename_trans_ttypes, src->value - 1, 1) == 0; #endif } -static bool add_genfscon(struct policydb *db, const char *fs_name, - const char *path, const char *context) +static bool add_genfscon(struct policydb *db, const char *fs_name, const char *path, const char *context) { return false; } diff --git a/drivers/kernelsu/sulog/event.c b/drivers/kernelsu/sulog/event.c index 80a73b6fcf38..95f532979df5 100644 --- 
a/drivers/kernelsu/sulog/event.c +++ b/drivers/kernelsu/sulog/event.c @@ -25,13 +25,8 @@ static void ksu_sulog_fill_task_info(struct ksu_sulog_event *event, __u16 event_ event->pid = task_pid_nr(current); event->tgid = task_tgid_nr(current); event->ppid = task_ppid_nr(current); - - kuid_t current_uid = current_uid(); - kuid_t current_euid = current_euid(); - - event->uid = ksu_get_uid_t(current_uid); - event->euid = ksu_get_uid_t(current_euid); - + event->uid = current_uid().val; + event->euid = current_euid().val; get_task_comm(event->comm, current); } @@ -178,7 +173,7 @@ void ksu_sulog_emit_pending(struct ksu_sulog_pending_event *pending, int retval, ksu_sulog_free_pending(pending); } -int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp) +static int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp) { if (!ksu_sulog_is_enabled()) return 0; @@ -197,7 +192,7 @@ int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp) return 0; } -int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp) +static int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp) { if (!ksu_sulog_is_enabled()) return 0; @@ -212,11 +207,12 @@ int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len return 0; } -void ksu_sulog_emit_bprm(const char *filename) +static void ksu_sulog_emit_bprm(const char *filename) { if (!ksu_sulog_is_enabled()) return; + // maybe tag the process instead? 
if (!is_ksu_domain()) return; diff --git a/drivers/kernelsu/sulog/event.h b/drivers/kernelsu/sulog/event.h index bf272a7328eb..92563ded6d10 100644 --- a/drivers/kernelsu/sulog/event.h +++ b/drivers/kernelsu/sulog/event.h @@ -8,9 +8,10 @@ int ksu_sulog_events_init(void); void ksu_sulog_events_exit(void); void ksu_sulog_emit_pending(struct ksu_sulog_pending_event *pending, int retval, gfp_t gfp); -int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp); -int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp); +static int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp); +static int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp); +static void ksu_sulog_emit_bprm(const char *filename); struct ksu_event_queue *ksu_sulog_get_queue(void); diff --git a/drivers/kernelsu/supercall/dispatch.c b/drivers/kernelsu/supercall/dispatch.c index 2d6973ee8ae0..2ea7d8b4cbff 100644 --- a/drivers/kernelsu/supercall/dispatch.c +++ b/drivers/kernelsu/supercall/dispatch.c @@ -1,20 +1,19 @@ static int do_grant_root(void __user *arg) { int ret; - kuid_t audit_uid = current_uid(); - kuid_t audit_euid = current_euid(); + __u32 audit_uid = current_uid().val; + __u32 audit_euid = current_euid().val; // we already check uid above on allowed_for_su() write_sulog('i'); // log ioctl escalation - pr_info("allow root for: %d\n", ksu_get_uid_t(audit_uid)); + pr_info("allow root for: %d\n", audit_uid); ret = escape_with_root_profile(); #ifdef CONFIG_KSU_FEATURE_SULOG - ksu_sulog_emit_grant_root(ret, ksu_get_uid_t(audit_uid), ksu_get_uid_t(audit_euid), GFP_KERNEL); + ksu_sulog_emit_grant_root(ret, audit_uid, audit_euid, GFP_KERNEL); #endif - return ret; } @@ -269,23 +268,28 @@ static int do_get_manager_appid(void __user *arg) static int do_get_app_profile(void __user *arg) { - struct ksu_get_app_profile_cmd cmd; + uid_t uid; + struct app_profile *profile; + int ret = 0; - if 
(copy_from_user(&cmd, arg, sizeof(cmd))) { + if (copy_from_user(&uid, (char __user *)arg + offsetof(struct ksu_get_app_profile_cmd, profile.curr_uid), sizeof(uid_t))) { pr_err("get_app_profile: copy_from_user failed\n"); return -EFAULT; } - if (!ksu_get_app_profile(&cmd.profile)) { - return -ENOENT; - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_app_profile: copy_to_user failed\n"); - return -EFAULT; + rcu_read_lock(); + profile = ksu_get_app_profile(uid); + rcu_read_unlock(); + if (!profile) { + ret = -ENOENT; + } else { + if (copy_to_user((char __user *)arg + offsetof(struct ksu_get_app_profile_cmd, profile), profile, sizeof(struct app_profile))) { + pr_err("get_app_profile: copy_to_user failed\n"); + ret = -EFAULT; + } + ksu_put_app_profile(profile); } - - return 0; + return ret; } static int do_set_app_profile(void __user *arg) @@ -691,19 +695,16 @@ static const struct ksu_ioctl_cmd_map ksu_ioctl_handlers[] = { long ksu_supercall_handle_ioctl(unsigned int cmd, void __user *argp) { int i; - kuid_t current_uid = current_uid(); #ifdef CONFIG_KSU_DEBUG - pr_info("ksu ioctl: cmd=0x%x from uid=%d\n", cmd, ksu_get_uid_t(current_uid)); + pr_info("ksu ioctl: cmd=0x%x from uid=%d\n", cmd, current_uid().val); #endif for (i = 0; ksu_ioctl_handlers[i].handler; i++) { if (cmd == ksu_ioctl_handlers[i].cmd) { // Check permission first - if (ksu_ioctl_handlers[i].perm_check && - !ksu_ioctl_handlers[i].perm_check()) { - pr_warn("ksu ioctl: permission denied for cmd=0x%x uid=%d\n", - cmd, ksu_get_uid_t(current_uid)); + if (ksu_ioctl_handlers[i].perm_check && !ksu_ioctl_handlers[i].perm_check()) { + pr_warn("ksu ioctl: permission denied for cmd=0x%x uid=%d\n", cmd, current_uid().val); return -EPERM; } // Execute handler diff --git a/drivers/kernelsu/supercall/perm.c b/drivers/kernelsu/supercall/perm.c index a0191bd140c7..89b674885072 100644 --- a/drivers/kernelsu/supercall/perm.c +++ b/drivers/kernelsu/supercall/perm.c @@ -5,14 +5,12 @@ bool only_manager(void) 
bool only_root(void) { - kuid_t current_uid = current_uid(); - return ksu_get_uid_t(current_uid) == 0; + return current_uid().val == 0; } bool manager_or_root(void) { - kuid_t current_uid = current_uid(); - return ksu_get_uid_t(current_uid) == 0 || is_manager(); + return current_uid().val == 0 || is_manager(); } bool always_allow(void) @@ -22,6 +20,6 @@ bool always_allow(void) bool allowed_for_su(void) { - kuid_t current_uid = current_uid(); - return is_manager() || ksu_is_allow_uid_for_current(ksu_get_uid_t(current_uid)); + return is_manager() || ksu_is_allow_uid_for_current(current_uid().val); + } diff --git a/drivers/kernelsu/supercall/supercall.c b/drivers/kernelsu/supercall/supercall.c index a6720a489d6f..9bfd347d3d2c 100644 --- a/drivers/kernelsu/supercall/supercall.c +++ b/drivers/kernelsu/supercall/supercall.c @@ -46,46 +46,7 @@ int ksu_install_fd(void) return fd; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) -struct ksu_install_fd_tw { - struct callback_head cb; - int __user *outp; -}; - -static void ksu_install_fd_tw_func(struct callback_head *cb) -{ - struct ksu_install_fd_tw *tw = container_of(cb, struct ksu_install_fd_tw, cb); - int fd = ksu_install_fd(); - pr_info("[%d] install ksu fd: %d\n", current->pid, fd); - - if (copy_to_user(tw->outp, &fd, sizeof(fd))) { - pr_err("install ksu fd reply err\n"); - close_fd(fd); - } - - kfree(tw); -} - -static int ksu_handle_fd_request(void __user *arg4) -{ - struct ksu_install_fd_tw *tw; - - tw = kzalloc(sizeof(*tw), GFP_ATOMIC); - if (!tw) - return 0; - - tw->outp = (int __user *)arg4; - tw->cb.func = ksu_install_fd_tw_func; - - if (task_work_add(current, &tw->cb, TWA_RESUME)) { - kfree(tw); - pr_warn("install fd add task_work failed\n"); - } - - return 0; -} -#else -static int ksu_handle_fd_request(void __user *arg4) +static inline int ksu_handle_fd_request(void __user *arg4) { int fd = ksu_install_fd(); pr_info("[%d] install ksu fd: %d\n", current->pid, fd); @@ -97,16 +58,19 @@ static int 
ksu_handle_fd_request(void __user *arg4) return 0; } -#endif // downstream: make sure to pass arg as reference, this can allow us to extend things. int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg) { - if (magic1 != KSU_INSTALL_MAGIC1) return 0; - pr_info("sys_reboot: intercepted call! magic: 0x%x id: %d\n", magic1, magic2); + // when ternary on fmt? + // cold syscall, we can splurge xD + if (magic2 == KSU_INSTALL_MAGIC2) + pr_info("sys_reboot: magic: 0x%x id: 0x%x pid: %d comm: %s \n", magic1, magic2, current->pid, current->comm); + else + pr_info("sys_reboot: magic: 0x%x id: %d pid: %d pid: %s \n", magic1, magic2, current->pid, current->comm); // arg4 = (unsigned long)PT_REGS_SYSCALL_PARM4(real_regs); // downstream: dereference arg as arg4 so we can be inline to upstream @@ -118,8 +82,7 @@ int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user } // only root is allowed for these commands - kuid_t current_uid = current_uid(); - if (ksu_get_uid_t(current_uid) != 0) + if (current_uid().val != 0) return 0; // extensions diff --git a/drivers/kernelsu/tiny_sulog.c b/drivers/kernelsu/tiny_sulog.c index 401f4c1c8daf..1fc8a5b1e3dd 100644 --- a/drivers/kernelsu/tiny_sulog.c +++ b/drivers/kernelsu/tiny_sulog.c @@ -22,7 +22,7 @@ static void tiny_sulog_init_heap() pr_info("sulog_init: allocated %lu bytes on 0x%p \n", SULOG_BUFSIZ, sulog_buf_ptr); } -/* +/** * * boottime_s_get, get kernel uptime in seconds * @@ -55,12 +55,10 @@ static void write_sulog(uint8_t sym) unsigned int offset = sulog_index_next * sizeof(struct sulog_entry); struct sulog_entry entry = {0}; - - kuid_t current_uid = current_uid(); // WARNING!!! this is LE only! 
entry.s_time = boottime_s_get(); - entry.data = (uint32_t)ksu_get_uid_t(current_uid); + entry.data = (uint32_t)current_uid().val; *((char *)&entry.data + 3) = sym; // we can perform this write atomic on 64-bit @@ -73,13 +71,16 @@ static void write_sulog(uint8_t sym) #else __builtin_memcpy(sulog_buf_ptr + offset, &entry, sizeof(entry)); #endif - spin_unlock(&sulog_lock); // move ptr for next iteration sulog_index_next = sulog_index_next + 1; if (sulog_index_next >= SULOG_ENTRY_MAX) sulog_index_next = 0; + + spin_unlock(&sulog_lock); + + return; } struct sulog_entry_rcv_ptr { diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index b818410d2418..58e5ccf6b1e9 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -76,7 +76,11 @@ int selinux_policycap_netpeer; int selinux_policycap_openperm; int selinux_policycap_alwaysnetwork; +#ifdef CONFIG_KSU +DEFINE_RWLOCK(policy_rwlock); +#else static DEFINE_RWLOCK(policy_rwlock); +#endif static struct sidtab sidtab; struct policydb policydb; From ba46d57edbf8436d362aad9c7a6ce4adbdbcea6d Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Tue, 12 May 2026 04:15:08 +0000 Subject: [PATCH 58/59] =?UTF-8?q?KernelSU:=20=E5=90=8C=E6=AD=A5=E8=87=B3ba?= =?UTF-8?q?ckslashxx/KernelSU@1b3dade?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- drivers/kernelsu/Kconfig | 16 +- drivers/kernelsu/Makefile | 8 + drivers/kernelsu/extras.c | 222 -------- drivers/kernelsu/feature/selinux_hide.c | 404 +++++++++++++++ drivers/kernelsu/feature/selinux_hide.h | 65 +++ drivers/kernelsu/feature/sucompat.c | 2 +- drivers/kernelsu/hook/core_hook.c | 32 +- drivers/kernelsu/hook/kp_ksud.c | 9 +- .../kernelsu/hook/syscall_table_hook_arm.c | 118 +++-- .../kernelsu/hook/syscall_table_hook_arm64.c | 120 ++--- drivers/kernelsu/include/ksu.h | 2 +- 
drivers/kernelsu/include/uapi/feature.h | 5 +- drivers/kernelsu/ksu.c | 34 +- drivers/kernelsu/manager/pkg_observer.c | 35 +- drivers/kernelsu/manager/throne_tracker.c | 2 +- drivers/kernelsu/runtime/ksud.c | 10 - drivers/kernelsu/selinux/rules.c | 38 +- drivers/kernelsu/selinux/sepolicy.c | 479 ++---------------- 18 files changed, 768 insertions(+), 833 deletions(-) delete mode 100644 drivers/kernelsu/extras.c create mode 100644 drivers/kernelsu/feature/selinux_hide.c create mode 100644 drivers/kernelsu/feature/selinux_hide.h diff --git a/drivers/kernelsu/Kconfig b/drivers/kernelsu/Kconfig index a2a7bebe2921..10608831444f 100644 --- a/drivers/kernelsu/Kconfig +++ b/drivers/kernelsu/Kconfig @@ -9,14 +9,6 @@ config KSU help Enable kernel-level root privileges on Android System. -config KSU_EXTRAS - bool "Enable custom stuff" - depends on KSU - default n - help - Custom extensions. Experimental. - Currently, only avc log spoofing is implemented. - config KSU_KPROBES_KSUD bool "Enable dynamic kprobes for early boot hooks" depends on KPROBES && KRETPROBES @@ -50,6 +42,14 @@ config KSU_FEATURE_ADBROOT help Build KernelSU's adb root feature. +config KSU_FEATURE_SELINUX_HIDE + bool "KernelSU SELinux hide feature" + depends on KSU + default y + help + Build KernelSU's SELinux hide feature. + This is a dumber implementation, but it should be fine for most cases. 
+ config KSU_DEBUG bool "KernelSU debug mode" depends on KSU diff --git a/drivers/kernelsu/Makefile b/drivers/kernelsu/Makefile index 8ed9b3857342..7c2fcedc7eac 100644 --- a/drivers/kernelsu/Makefile +++ b/drivers/kernelsu/Makefile @@ -5,6 +5,11 @@ obj-$(CONFIG_KSU) := ksu.o CFLAGS_ksu.o += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include CFLAGS_ksu.o += -I$(objtree)/security/selinux +# uncommon, but wont hurt, check for 3-arg security_add_hooks +ifeq ($(shell grep -A1 "void security_add_hooks" $(srctree)/include/linux/lsm_hooks.h 2>/dev/null | grep -q lsm 2>/dev/null; echo $$?),0) +CFLAGS_ksu.o += -DKSU_COMPAT_SECURITY_ADD_HOOKS_V2 +endif + ifeq ($(shell grep -q " current_sid(void)" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) CFLAGS_ksu.o += -DKSU_COMPAT_HAS_CURRENT_SID endif @@ -49,6 +54,9 @@ CFLAGS_ksu.o += -Wno-int-conversion -Wno-int-to-pointer-cast -Wno-pointer-to-int CFLAGS_ksu.o += -Wno-unused-variable -Wno-unused-function -Wno-format CFLAGS_ksu.o += -Wno-macro-redefined +# dont be too strict +CFLAGS_REMOVE_ksu.o += -Werror + # so we can see stack use atleast, as we disable all stack safety here CFLAGS_ksu.o += $(call cc-option, -Wframe-larger-than=1024) diff --git a/drivers/kernelsu/extras.c b/drivers/kernelsu/extras.c deleted file mode 100644 index e7436175102a..000000000000 --- a/drivers/kernelsu/extras.c +++ /dev/null @@ -1,222 +0,0 @@ -// sorry for the ifdef hell -// but im too lazy to fragment this out. -// theres only one feature so far anyway -// - xx, 20251019 - -static u32 su_sid = 0; -static u32 ksu_sid = 0; -static u32 priv_app_sid = 0; - -// init as disabled by default -static atomic_t disable_spoof = ATOMIC_INIT(1); - -void ksu_avc_spoof_enable(); -void ksu_avc_spoof_disable(); - -static bool ksu_avc_spoof_enabled = true; -static bool boot_completed = false; - -static int avc_spoof_feature_get(u64 *value) -{ - *value = ksu_avc_spoof_enabled ? 
1 : 0; - return 0; -} - -static int avc_spoof_feature_set(u64 value) -{ - bool enable = value != 0; - - if (enable == ksu_avc_spoof_enabled) { - pr_info("avc_spoof: no need to change\n"); - return 0; - } - - ksu_avc_spoof_enabled = enable; - - if (boot_completed) { - if (enable) { - ksu_avc_spoof_enable(); - } else { - ksu_avc_spoof_disable(); - } - } - - pr_info("avc_spoof: set to %d\n", enable); - - return 0; -} - -static const struct ksu_feature_handler avc_spoof_handler = { - .feature_id = KSU_FEATURE_AVC_SPOOF, - .name = "avc_spoof", - .get_handler = avc_spoof_feature_get, - .set_handler = avc_spoof_feature_set, -}; - -static int get_sid() -{ - // dont load at all if we cant get sids - int err = security_secctx_to_secid("u:r:su:s0", strlen("u:r:su:s0"), &su_sid); - if (err) { - pr_info("avc_spoof/get_sid: su_sid not found!\n"); - return -1; - } - pr_info("avc_spoof/get_sid: su_sid: %u\n", su_sid); - - err = security_secctx_to_secid("u:r:ksu:s0", strlen("u:r:ksu:s0"), &ksu_sid); - if (err) { - pr_info("avc_spoof/get_sid: ksu_sid not found!\n"); - return -1; - } - pr_info("avc_spoof/get_sid: ksu_sid: %u\n", ksu_sid); - - err = security_secctx_to_secid("u:r:priv_app:s0:c512,c768", strlen("u:r:priv_app:s0:c512,c768"), &priv_app_sid); - if (err) { - pr_info("avc_spoof/get_sid: priv_app_sid not found!\n"); - return -1; - } - pr_info("avc_spoof/get_sid: priv_app_sid: %u\n", priv_app_sid); - return 0; -} - -#if defined(CONFIG_KPROBES) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) -#include -static struct kprobe *slow_avc_audit_kp; - -static int ksu_handle_slow_avc_audit(u32 *tsid) -{ - if (atomic_read(&disable_spoof)) - return 0; - - // if tsid is su, we just replace it - // unsure if its enough, but this is how it is aye? 
- if (*tsid == su_sid || *tsid == ksu_sid) { - pr_info("avc_spoof/slow_avc_audit: replacing tsid: %u with priv_app_sid: %u\n", *tsid, priv_app_sid); - *tsid = priv_app_sid; - } - - return 0; -} - -static int slow_avc_audit_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - if (atomic_read(&disable_spoof)) - return 0; - - /* - * for < 4.17 int slow_avc_audit(u32 ssid, u32 tsid - * for >= 4.17 int slow_avc_audit(struct selinux_state *state, u32 ssid, u32 tsid - * for >= 6.4 int slow_avc_audit(u32 ssid, u32 tsid - * not to mention theres also DKSU_HAS_SELINUX_STATE - * since its hard to make sure this selinux state thing - * cross crossing with 4.17 ~ 6.4's where slow_avc_audit - * changes abi (tsid in arg2 vs arg3) - */ - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) - u32 *tsid = (u32 *)&PT_REGS_PARM2(regs); - ksu_handle_slow_avc_audit(tsid); -#else - u32 *tsid = (u32 *)&PT_REGS_PARM3(regs); - ksu_handle_slow_avc_audit(tsid); -#endif - - return 0; -} - -// copied from upstream -static struct kprobe *init_kprobe(const char *name, - kprobe_pre_handler_t handler) -{ - struct kprobe *kp = kzalloc(sizeof(struct kprobe), GFP_KERNEL); - if (!kp) - return NULL; - kp->symbol_name = name; - kp->pre_handler = handler; - - int ret = register_kprobe(kp); - pr_info("sucompat: register_%s kprobe: %d\n", name, ret); - if (ret) { - kfree(kp); - return NULL; - } - - return kp; -} -static void destroy_kprobe(struct kprobe **kp_ptr) -{ - struct kprobe *kp = *kp_ptr; - if (!kp) - return; - unregister_kprobe(kp); - synchronize_rcu(); - kfree(kp); - *kp_ptr = NULL; -} -#else // CONFIG_KPROBES -int ksu_handle_slow_avc_audit_new(u32 tsid, u16 *tclass) -{ - if (atomic_read(&disable_spoof)) - return 0; - - if (tsid != su_sid && tsid != ksu_sid) - return 0; - - pr_info("avc_spoof/slow_avc_audit: prevent log for sid: %u\n", tsid); - *tclass = 0; - - return 0; -} -#endif - -void ksu_avc_spoof_disable(void) -{ -#if defined(CONFIG_KPROBES) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 
0) - pr_info("avc_spoof/exit: unregister slow_avc_audit kprobe!\n"); - destroy_kprobe(&slow_avc_audit_kp); -#endif - atomic_set(&disable_spoof, 1); - pr_info("avc_spoof/exit: slow_avc_audit spoofing disabled!\n"); -} - -void ksu_avc_spoof_enable(void) -{ - int ret = get_sid(); - if (ret) { - pr_info("avc_spoof/init: sid grab fail!\n"); - return; - } - -#if defined(CONFIG_KPROBES) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) - pr_info("avc_spoof/init: register slow_avc_audit kprobe!\n"); - slow_avc_audit_kp = init_kprobe("slow_avc_audit", slow_avc_audit_pre_handler); -#endif - // once we get the sids, we can now enable the hook handler - atomic_set(&disable_spoof, 0); - - pr_info("avc_spoof/init: slow_avc_audit spoofing enabled!\n"); -} - -void ksu_avc_spoof_late_init() -{ - boot_completed = true; - - if (ksu_avc_spoof_enabled) { - ksu_avc_spoof_enable(); - } -} - -void ksu_avc_spoof_init() -{ - if (ksu_register_feature_handler(&avc_spoof_handler)) { - pr_err("Failed to register avc spoof feature handler\n"); - } -} - -void ksu_avc_spoof_exit() -{ - if (ksu_avc_spoof_enabled) { - ksu_avc_spoof_disable(); - } - ksu_unregister_feature_handler(KSU_FEATURE_AVC_SPOOF); -} diff --git a/drivers/kernelsu/feature/selinux_hide.c b/drivers/kernelsu/feature/selinux_hide.c new file mode 100644 index 000000000000..962fadf7fa8c --- /dev/null +++ b/drivers/kernelsu/feature/selinux_hide.c @@ -0,0 +1,404 @@ +/** + * NOTE: this isnt the fullblown thing like upstream's where we straight up backport + * SELinux. This is just questionable to do when we want to support a plethora of + * non-standard kernels. + * + * While what we are doing here is kinda improper, for most cases + * this should be more than enough. + * + * this will include write_op / selinux_transaction_write spoofing and then avc spoofing. + * our goal for this one is to be self contained as much as possible + * with only one call from ksu's initcall.
+ * + */ + +// enabled by default +static bool ksu_selinux_hide_enabled __read_mostly = true; + +// sids for avc spoofing +static u32 su_sid __read_mostly = 0; +static u32 ksu_sid __read_mostly = 0; +static u32 priv_app_sid __read_mostly = 0; + +static inline int ksu_selinux_get_sids() +{ + // dont load at all if we cant get sids + int err = security_secctx_to_secid("u:r:su:s0", strlen("u:r:su:s0"), &su_sid); + if (!err) + pr_info("selinux_hide: su_sid: %u\n", su_sid); + + err = security_secctx_to_secid("u:r:ksu:s0", strlen("u:r:ksu:s0"), &ksu_sid); + if (!err) + pr_info("selinux_hide: ksu_sid: %u\n", su_sid); + + err = security_secctx_to_secid("u:r:priv_app:s0:c512,c768", strlen("u:r:priv_app:s0:c512,c768"), &priv_app_sid); + if (!err) + pr_info("selinux_hide: priv_app_sid: %u\n", su_sid); + + if (!su_sid || !ksu_sid || !priv_app_sid) + return -1; + + return 0; +} + +// deprecate in a month +int ksu_handle_slow_avc_audit_new(u32 tsid, u16 *tclass) +{ + if (!ksu_selinux_hide_enabled) + return 0; + + if (tsid != su_sid && tsid != ksu_sid) + return 0; + + pr_info("selinux_hide: prevent log for sid: %u\n", tsid); + *tclass = 0; + + return 0; +} + +void ksu_slow_avc_audit(u32 *tsid) +{ + if (!ksu_selinux_hide_enabled) + return; + + // if tsid is su, we just replace it + // unsure if its enough, but this is how it is aye? 
+ if (*tsid == su_sid || *tsid == ksu_sid) { + pr_info("selinux_hide: slow_avc_audit: replace tsid: %u with priv_app_sid: %u\n", *tsid, priv_app_sid); + *tsid = priv_app_sid; + } + + return; +} + +static inline bool ksu_should_destroy_context(char *str) +{ + if (!str) + return false; + + struct ksu_hidden_node *node; + + read_lock(&ksu_sepolicy_shitlist_lock); + list_for_each_entry(node, &ksu_sepolicy_rule_list, list) { + if (strstr(str, node->name)) { + read_unlock(&ksu_sepolicy_shitlist_lock); + return true; + } + } + read_unlock(&ksu_sepolicy_shitlist_lock); + + return false; +} + +/** + * security_setprocattr is a weird LSM on 5.4 and up, and this is normally backported + * down to 4.14 and 4.19. somehow this LSM is a one-shot. only the first to register + * is called. + * + * however this is not an issue for us on 3.x as we are hijacking selinux_ops on it + * + */ +int ksu_hide_setprocattr(const char *name, void *value, size_t size) +{ + if (!ksu_selinux_hide_enabled) + return 0; + + // only hook when seccomp is enabled + if (!test_thread_flag(TIF_SECCOMP)) + return 0; + + // only appuid + if (current_uid().val < 10000) + return 0; + + if (!size) + return 0; + + if (!name) + return 0; + + if (!!strcmp(name, "current")) + return 0; + + char *str = (char *)value; + + if (!str) + return 0; + + // to make sure its terminated + char buf[64] = { 0 }; + size_t len = (size < 63) ? size : 63; + + memcpy(buf, str, len); + + if (!ksu_should_destroy_context(buf)) + return 0; + + pr_info("block setprocattr for context: %s\n", buf); + str[1] = '1'; + + return 0; +} + +// for manual hook +void ksu_sel_write_context(struct file **file, char **buf, size_t *size) +{ + if (!ksu_selinux_hide_enabled) + return; + + // only hook when seccomp is enabled + if (!test_thread_flag(TIF_SECCOMP)) + return; + + // only appuid + if (current_uid().val < 10000) + return; + + // upstream doesnt do this, so we should also not. 
+ //if (!ksu_uid_should_umount(current_uid().val)) + // return; + + char *mbuf = *buf; + + if (!mbuf) + return; + + if (!ksu_should_destroy_context(mbuf)) + return; + + pr_info("selinux_hide: destroy: %s \n", mbuf); + mbuf[1] = '1'; + return; + +} + +#if defined(CONFIG_KPROBES) + +#include +static struct kprobe *slow_avc_audit_kp; +static struct kprobe *sel_write_context_kp; +static struct kprobe *sel_write_access_kp; + +static int slow_avc_audit_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + +#if defined(KSU_COMPAT_HAS_SELINUX_STATE) + u32 *tsid = (u32 *)&PT_REGS_PARM3(regs); +#else + u32 *tsid = (u32 *)&PT_REGS_PARM2(regs); +#endif + + ksu_slow_avc_audit(tsid); + + return 0; +} + +static int sel_write_context_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + char **buf = (char **)&PT_REGS_PARM2(regs); + + ksu_sel_write_context(NULL, buf, NULL); + return 0; +} + +// this deals with __user, this is here in case its really needed. +#if 0 +static int selinux_transaction_write_pre_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + + bool *should_destroy = (bool *)ri->data; + *should_destroy = false; + + if (!test_thread_flag(TIF_SECCOMP)) + return 0; + + if (current_uid().val < 10000) + return 0; + + if (!ksu_uid_should_umount(current_uid().val)) + return 0; + + const char __user **buf = (const char __user **)&PT_REGS_PARM2(regs); + char __user *uptr = *(char **)buf; + + char kbuf[128] = { 0 }; + + if (ksu_copy_from_user_retry(kbuf, uptr, 127)) + return 0; + + // move ptr to the next one after space + char *target = strchr(kbuf, ' '); + if (likely(target)) + target++; + else + target = kbuf; + + if (!ksu_should_destroy_context(target)) + return 0; + + pr_info("selinux_transaction_write: destroy: %s \n", kbuf); + *should_destroy = true; + + return 0; +} + +static int selinux_transaction_write_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + // if bool is true, mod PT_REGS_RC to ret EINVAL + bool *should_destroy = 
(bool *)ri->data; + + if (*should_destroy) + PT_REGS_RC(regs) = -EINVAL; + + return 0; +} + +static struct kretprobe selinux_transaction_write_rp = { + .kp.symbol_name = "selinux_transaction_write", + .handler = selinux_transaction_write_ret_handler, + .entry_handler = selinux_transaction_write_pre_handler, + .data_size = sizeof(bool), + .maxactive = 20, +}; +#endif + +// copied from upstream +static struct kprobe *init_kprobe(const char *name, kprobe_pre_handler_t handler) +{ + struct kprobe *kp = kzalloc(sizeof(struct kprobe), GFP_KERNEL); + if (!kp) + return NULL; + kp->symbol_name = name; + kp->pre_handler = handler; + + int ret = register_kprobe(kp); + pr_info("%s: register %s kprobe: %d\n", __func__, name, ret); + if (ret) { + kfree(kp); + return NULL; + } + + return kp; +} +static void destroy_kprobe(struct kprobe **kp_ptr) +{ + struct kprobe *kp = *kp_ptr; + if (!kp) + return; + unregister_kprobe(kp); + synchronize_rcu(); + kfree(kp); + *kp_ptr = NULL; +} +#endif // CONFIG_KPROBES + + +static void ksu_selinux_hide_enable() +{ + int ret = ksu_selinux_get_sids(); + if (ret) + pr_info("selinux_hide: sid grab fail!\n"); + +#if defined(CONFIG_KPROBES) + slow_avc_audit_kp = init_kprobe("slow_avc_audit", slow_avc_audit_pre_handler); + + sel_write_context_kp = init_kprobe("sel_write_context", sel_write_context_pre_handler); + sel_write_access_kp = init_kprobe("sel_write_access", sel_write_context_pre_handler); +#endif + + pr_info("selinux_hide: started! 
make sure manual hooks are in-place!\n"); + + ksu_selinux_hide_enabled = true; +} + +static void ksu_selinux_hide_disable() +{ +#if defined(CONFIG_KPROBES) + pr_info("selinux_hide: unregister slow_avc_audit kprobe!\n"); + destroy_kprobe(&slow_avc_audit_kp); + + pr_info("selinux_hide: unregister sel_write_context kprobe!\n"); + destroy_kprobe(&sel_write_context_kp); + + pr_info("selinux_hide: unregister sel_write_access kprobe!\n"); + destroy_kprobe(&sel_write_access_kp); +#endif + + pr_info("selinux_hide: closing down hooks!\n"); + + ksu_selinux_hide_enabled = false; +} + +// init kthread +static int ksu_hide_init_thread(void *data) +{ + unsigned int i = 0; + + set_user_nice(current, 19); // low prio + +start: + if (!!*(volatile bool *)&ksu_boot_completed) + goto bail; + + msleep(5000); + + i++; + + if (i < 12) + goto start; + +bail: + + ksu_add_shit_to_list(KERNEL_SU_DOMAIN); + ksu_add_shit_to_list(KERNEL_SU_FILE); + + ksu_selinux_hide_enable(); + return 0; +} + +static int selinux_hide_feature_get(u64 *value) +{ + *value = ksu_selinux_hide_enabled ? 
1 : 0; + return 0; +} + +static int selinux_hide_feature_set(u64 value) +{ + bool enable = value != 0; + int ret = 0; + + if (enable == ksu_selinux_hide_enabled) + return 0; + + pr_info("selinux_hide: set to %d\n", enable); + + if (enable) + ksu_selinux_hide_enable(); + else + ksu_selinux_hide_disable(); + + return ret; +} + +static const struct ksu_feature_handler selinux_hide_handler = { + .feature_id = KSU_FEATURE_SELINUX_HIDE, + .name = "selinux_hide", + .get_handler = selinux_hide_feature_get, + .set_handler = selinux_hide_feature_set, +}; + +void __init ksu_selinux_hide_init() +{ + // we init this on a kthread + kthread_run(ksu_hide_init_thread, NULL, "kthread"); + + if (ksu_register_feature_handler(&selinux_hide_handler)) { + pr_err("Failed to register selinux_hide feature handler\n"); + } +} + +void __exit ksu_selinux_hide_exit() +{ + ksu_unregister_feature_handler(KSU_FEATURE_SELINUX_HIDE); +} + diff --git a/drivers/kernelsu/feature/selinux_hide.h b/drivers/kernelsu/feature/selinux_hide.h new file mode 100644 index 000000000000..39c60206b9c6 --- /dev/null +++ b/drivers/kernelsu/feature/selinux_hide.h @@ -0,0 +1,65 @@ +#ifndef __KSU_H_SELINUX_HIDE +#define __KSU_H_SELINUX_HIDE + +void ksu_selinux_hide_init(); +void ksu_selinux_hide_exit(); + +// /selinux/rules.c, linked list +LIST_HEAD(ksu_sepolicy_rule_list); +DEFINE_RWLOCK(ksu_sepolicy_shitlist_lock); + +struct ksu_hidden_node { + struct list_head list; + char *name; +}; + +static void ksu_add_shit_to_list(const char *name) +{ + if (!name) + return; + + if (!strcmp(name, "zygote")) + return; + + if (!strcmp(name, "app_zygote")) + return; + + struct ksu_hidden_node *node; + size_t name_len = strlen(name); + + // check for dupes + write_lock(&ksu_sepolicy_shitlist_lock); + list_for_each_entry(node, &ksu_sepolicy_rule_list, list) { + // ":name:" + if (strlen(node->name) == (name_len + 2) && !memcmp(node->name + 1, name, name_len)) + goto unlock_list; + } + + node = kmalloc(sizeof(*node), GFP_ATOMIC); + if 
(!node) + goto unlock_list; + + // ':' + original + ':' + \0 + size_t len = strlen(name); + node->name = kmalloc(name_len + 3, GFP_ATOMIC); + if (!node->name) { + kfree(node); + goto unlock_list; + } + + node->name[0] = ':'; + memcpy(node->name + 1, name, name_len); + node->name[name_len + 1] = ':'; + node->name[name_len + 2] = '\0'; + + list_add(&node->list, &ksu_sepolicy_rule_list); + + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("%s: now tracking type: %s, padded: %s \n", __func__, name, node->name); + +unlock_list: + write_unlock(&ksu_sepolicy_shitlist_lock); + return; +} + +#endif diff --git a/drivers/kernelsu/feature/sucompat.c b/drivers/kernelsu/feature/sucompat.c index 192ef8a38a5d..174e170cb146 100644 --- a/drivers/kernelsu/feature/sucompat.c +++ b/drivers/kernelsu/feature/sucompat.c @@ -242,7 +242,7 @@ SUCOMPAT_HOOK_TYPE ksu_handle_stat(int *dfd, const char __user **filename_user, } // sys_execve, compat_sys_execve -SUCOMPAT_HOOK_TYPE ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, void *argv, void *envp, int *flags) +SUCOMPAT_HOOK_TYPE ksu_handle_execve(const char __user **filename_user, void *argv, void *envp) { sys_execve_escape_ksud((void *)filename_user); diff --git a/drivers/kernelsu/hook/core_hook.c b/drivers/kernelsu/hook/core_hook.c index 2e12d00edef1..54572c2c611f 100644 --- a/drivers/kernelsu/hook/core_hook.c +++ b/drivers/kernelsu/hook/core_hook.c @@ -58,17 +58,18 @@ static struct security_hook_list ksu_hooks[] __ro_after_init = { #endif }; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -static void ksu_lsm_hook_init(void) -{ - security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), "ksu"); -} +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) || defined(KSU_COMPAT_SECURITY_ADD_HOOKS_V2) +#define ksu_security_add_hooks security_add_hooks #else -static void ksu_lsm_hook_init(void) +#define ksu_security_add_hooks(a, b, c) security_add_hooks(a, b) +#endif + +static __init void ksu_lsm_hook_init(void) { - 
security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks)); + ksu_security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), "ksu"); + + pr_info("core_hook: initialized %d LSMs \n", ARRAY_SIZE(ksu_hooks)); } -#endif #else /* < 4.2, LSM */ @@ -76,6 +77,16 @@ static void ksu_lsm_hook_init(void) static uintptr_t selinux_ops_addr = NULL; +#ifdef CONFIG_KSU_FEATURE_SELINUX_HIDE +static int (*orig_setprocattr) (struct task_struct *p, char *name, void *value, size_t size) = NULL; +static int hook_setprocattr(struct task_struct *p, char *name, void *value, size_t size) +{ + + ksu_hide_setprocattr(name, value, size); + return orig_setprocattr(p, name, value, size); +} +#endif + static int (*orig_inode_rename) (struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) = NULL; static int hook_inode_rename(struct inode *old_inode, struct dentry *old_dentry, @@ -366,6 +377,11 @@ static int ksu_register_lsm_hook(void *data) orig_inode_rename = ops->inode_rename; ops->inode_rename = hook_inode_rename; +#ifdef CONFIG_KSU_FEATURE_SELINUX_HIDE + orig_setprocattr = ops->setprocattr; + ops->setprocattr = hook_setprocattr; +#endif + orig_task_fix_setuid = ops->task_fix_setuid; ops->task_fix_setuid = hook_task_fix_setuid; diff --git a/drivers/kernelsu/hook/kp_ksud.c b/drivers/kernelsu/hook/kp_ksud.c index 1eb73dba0b6e..24ad5c3a14b4 100644 --- a/drivers/kernelsu/hook/kp_ksud.c +++ b/drivers/kernelsu/hook/kp_ksud.c @@ -85,12 +85,7 @@ static int sys_reboot_handler_pre(struct kprobe *p, struct pt_regs *regs) got_flipped = true; } - // jack priority in illeggal state - int old_nice = task_nice(current); - set_user_nice(current, -10); - ksu_handle_sys_reboot(*magic1, magic2, cmd, arg); - set_user_nice(current, old_nice); if (got_flipped) preempt_disable(); @@ -130,9 +125,8 @@ static int unregister_kprobe_function(void *data) return 0; } -static void kp_ksud_init() +static __init int kp_ksud_init() { - int ret = register_kprobe(&sys_reboot_kp); // dont unreg this 
one pr_info("kp_ksud: sys_reboot_kp: %d\n", ret); @@ -145,4 +139,5 @@ static void kp_ksud_init() #endif kthread_run(unregister_kprobe_function, NULL, "kp_unreg"); + return 0; } diff --git a/drivers/kernelsu/hook/syscall_table_hook_arm.c b/drivers/kernelsu/hook/syscall_table_hook_arm.c index 105e2adbfe01..996b3da89a06 100644 --- a/drivers/kernelsu/hook/syscall_table_hook_arm.c +++ b/drivers/kernelsu/hook/syscall_table_hook_arm.c @@ -35,9 +35,10 @@ __attribute__((hot)) static long hook_armeabi_execve(const struct pt_regs *regs) { const char __user **filename = (const char __user **)®s->regs[0]; + void ***argv = (void ***)®s->regs[1]; void ***envp = (void ***)®s->regs[2]; - ksu_handle_execve_sucompat(NULL, filename, NULL, envp, NULL); + ksu_handle_execve(filename, argv, envp); return armeabi_execve(regs); } @@ -86,57 +87,104 @@ static long hook_armeabi_read(const struct pt_regs *regs) #else // END OF 4.19+ SYSCALL HANDLERS -static long (*armeabi_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) __read_mostly = NULL; +/** + * for legacy syscall abi, we straight up call the syscall symbol + * this is easier and maybe a little bit faster + * + */ + +extern void *sys_call_table[]; + +static uintptr_t armeabi_reboot __read_mostly = NULL; static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) { ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); - return armeabi_reboot(magic1, magic2, cmd, arg); + return sys_reboot(magic1, magic2, cmd, arg); } -static long (*armeabi_execve)(const char __user * filename, - const char __user *const __user * argv, - const char __user *const __user * envp) __read_mostly = NULL; +static uintptr_t armeabi_execve __read_mostly = NULL; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) __attribute__((hot)) static long hook_armeabi_execve(const char __user * filename, const char __user *const __user * argv, const char __user *const __user * envp) { - ksu_handle_execve_sucompat(NULL, &filename, 
NULL, (void ***)&envp, NULL); - return armeabi_execve(filename, argv, envp); + ksu_handle_execve(&filename, (void ***)&argv, (void ***)&envp); + return sys_execve(filename, argv, envp); +} + +#else /* sys_execve_oabi */ + +/** + * on 3.0 / 3.4 ARM, sys_execve sc entry accepts 3 args (r0, r1, r2) + * however, sys_execve on that version, needs 4. the kernel does this small wrapper + * where it puts sp + 8 on r3. without it, hook won't work. + * + * // arch/arm/kernel/entry-common.S + * + * sys_execve_wrapper: + * add r3, sp, #S_OFF + * b sys_execve + * ENDPROC(sys_execve_wrapper) + * + */ +#include + +__attribute__((used, noipa)) +static long hook_sys_execve(const char __user *filenamei, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) +{ + ksu_handle_execve(&filenamei, (void ***)&argv, (void ***)&envp); + return sys_execve(filenamei, argv, envp, regs); } -static long (*armeabi_faccessat)(int dfd, const char __user * filename, int mode) __read_mostly = NULL; +#define S_OFF "8" +__attribute__((naked)) +static noinline void hook_armeabi_execve() +{ + asm volatile( + "add r3, sp, #" S_OFF "\n" + "b hook_sys_execve\n" + ); +} + +#endif /* sys_execve_oabi */ + + +static uintptr_t armeabi_faccessat __read_mostly = NULL; __attribute__((hot)) static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode) { ksu_handle_faccessat(&dfd, &filename, &mode, NULL); - return armeabi_faccessat(dfd, filename, mode); + return sys_faccessat(dfd, filename, mode); } -static long (*armeabi_fstatat64)(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) __read_mostly = NULL; +static uintptr_t armeabi_fstatat64 __read_mostly = NULL; __attribute__((hot)) static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) { ksu_handle_stat(&dfd, &filename, &flag); - return armeabi_fstatat64(dfd, filename, statbuf, flag); + return 
sys_fstatat64(dfd, filename, statbuf, flag); } -static long (*armeabi_fstat64)(unsigned long fd, struct stat64 __user * statbuf) __read_mostly = NULL; +static uintptr_t armeabi_fstat64 __read_mostly = NULL; __attribute__((cold)) static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf) { // we handle it like rp - long ret = armeabi_fstat64(fd, statbuf); + long ret = sys_fstat64(fd, statbuf); ksu_handle_fstat64_ret(&fd, &statbuf); return ret; } -static long (*armeabi_read)(unsigned int fd, char __user *buf, size_t count) __read_mostly = NULL; +static uintptr_t armeabi_read __read_mostly = NULL; __attribute__((cold)) static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count) { ksu_handle_sys_read_fd(fd); - return armeabi_read(fd, buf, count); + return sys_read(fd, buf, count); } #endif // SYSCALL HANDLERS @@ -305,22 +353,6 @@ static int ksu_syscall_table_restore() return 0; } -static void ksu_syscall_table_hook_init() -{ - - read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)sys_call_table); - read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table); - read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table); - read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table); - - // will be unregged - read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); - read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)sys_call_table); - - // start unreg kthread - kthread_run(ksu_syscall_table_restore, NULL, "unhook"); -} - static DEFINE_MUTEX(sucompat_toggle_mutex); static void syscall_table_sucompat_enable() @@ -341,4 +373,26 @@ static void 
syscall_table_sucompat_disable() mutex_unlock(&sucompat_toggle_mutex); } +static __init int ksu_syscall_table_hook_init() +{ + // enable on init! + syscall_table_sucompat_enable(); + + read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)sys_call_table); + + // theres an issue on fstat64 on oabi, so lets not hook it + // this is not that much of a loss since 3.0 / 3.4 devices aren't really running A17 + // TODO: fix and handle this +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) + read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); +#endif + + read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)sys_call_table); + + // start unreg kthread + kthread_run(ksu_syscall_table_restore, NULL, "unhook"); + return 0; +} +device_initcall_sync(ksu_syscall_table_hook_init); + // EOF diff --git a/drivers/kernelsu/hook/syscall_table_hook_arm64.c b/drivers/kernelsu/hook/syscall_table_hook_arm64.c index 34e316d01105..ced382be024c 100644 --- a/drivers/kernelsu/hook/syscall_table_hook_arm64.c +++ b/drivers/kernelsu/hook/syscall_table_hook_arm64.c @@ -43,9 +43,10 @@ __attribute__((hot)) static long hook_aarch64_execve(const struct pt_regs *regs) { const char __user **filename = (const char __user **)®s->regs[0]; + void ***argv = (void ***)®s->regs[1]; void ***envp = (void ***)®s->regs[2]; - ksu_handle_execve_sucompat(NULL, filename, NULL, envp, NULL); + ksu_handle_execve(filename, argv, envp); return aarch64_execve(regs); } @@ -110,9 +111,10 @@ __attribute__((hot)) static long hook_armeabi_execve(const struct pt_regs *regs) { const char __user **filename = (const char __user **)®s->regs[0]; + void ***argv = (void ***)®s->regs[1]; void ***envp = (void ***)®s->regs[2]; - ksu_handle_execve_sucompat(NULL, filename, NULL, envp, NULL); + ksu_handle_execve(filename, argv, envp); return armeabi_execve(regs); } @@ 
-163,113 +165,115 @@ static long hook_armeabi_read(const struct pt_regs *regs) #else // END OF 4.19+ SYSCALL HANDLERS -static long (*aarch64_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) __read_mostly = NULL; +/** + * for legacy syscall abi, we straight up call the syscall symbol + * this is easier and maybe a little bit faster + * + */ + +static uintptr_t aarch64_reboot __read_mostly = NULL; static long hook_aarch64_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) { ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); - return aarch64_reboot(magic1, magic2, cmd, arg); + return sys_reboot(magic1, magic2, cmd, arg); } -static long (*aarch64_execve)(const char __user * filename, - const char __user *const __user * argv, - const char __user *const __user * envp) __read_mostly = NULL; +static uintptr_t aarch64_execve __read_mostly = NULL; __attribute__((hot)) static long hook_aarch64_execve(const char __user * filename, const char __user *const __user * argv, const char __user *const __user * envp) { - ksu_handle_execve_sucompat(NULL, &filename, NULL, (void ***)&envp, NULL); - return aarch64_execve(filename, argv, envp); + ksu_handle_execve(&filename, (void ***)&argv, (void ***)&envp); + return sys_execve(filename, argv, envp); } -static long (*aarch64_faccessat)(int dfd, const char __user * filename, int mode) __read_mostly = NULL; +static uintptr_t aarch64_faccessat __read_mostly = NULL; __attribute__((hot)) static long hook_aarch64_faccessat(int dfd, const char __user * filename, int mode) { ksu_handle_faccessat(&dfd, &filename, &mode, NULL); - return aarch64_faccessat(dfd, filename, mode); + return sys_faccessat(dfd, filename, mode); } -static long (*aarch64_newfstatat)(int dfd, const char __user * filename, struct stat __user * statbuf, int flag) __read_mostly = NULL; +static uintptr_t aarch64_newfstatat __read_mostly = NULL; __attribute__((hot)) static long hook_aarch64_newfstatat(int dfd, const char __user * filename, struct 
stat __user * statbuf, int flag) { ksu_handle_stat(&dfd, &filename, &flag); - return aarch64_newfstatat(dfd, filename, statbuf, flag); + return sys_newfstatat(dfd, filename, statbuf, flag); } -static long (*aarch64_newfstat)(unsigned int fd, struct stat __user * statbuf) __read_mostly = NULL; +static uintptr_t aarch64_newfstat __read_mostly = NULL; __attribute__((cold)) static long hook_aarch64_newfstat_ret(unsigned int fd, struct stat __user * statbuf) { // we handle it like rp - long ret = aarch64_newfstat(fd, statbuf); + long ret = sys_newfstat(fd, statbuf); ksu_handle_newfstat_ret(&fd, &statbuf); return ret; } -static long (*aarch64_read)(unsigned int fd, char __user *buf, size_t count) __read_mostly = NULL; +static uintptr_t aarch64_read __read_mostly = NULL; __attribute__((cold)) static long hook_aarch64_read(unsigned int fd, char __user *buf, size_t count) { ksu_handle_sys_read_fd(fd); - return aarch64_read(fd, buf, count); + return sys_read(fd, buf, count); } #ifdef CONFIG_COMPAT extern const void *compat_sys_call_table[]; -static long (*armeabi_reboot)(int magic1, int magic2, unsigned int cmd, void __user *arg) __read_mostly = NULL; +static uintptr_t armeabi_reboot __read_mostly = NULL; static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) { ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); - return armeabi_reboot(magic1, magic2, cmd, arg); + return sys_reboot(magic1, magic2, cmd, arg); } -static long (*armeabi_execve)(const char __user * filename, - const compat_uptr_t __user * argv, - const compat_uptr_t __user * envp) __read_mostly = NULL; +static uintptr_t armeabi_execve __read_mostly = NULL; __attribute__((hot)) static long hook_armeabi_execve(const char __user * filename, const compat_uptr_t __user * argv, const compat_uptr_t __user * envp) { - ksu_handle_execve_sucompat(NULL, &filename, NULL, (void ***)&envp, NULL); - return armeabi_execve(filename, argv, envp); + ksu_handle_execve(&filename, (void ***)&argv, 
(void ***)&envp); + return compat_sys_execve(filename, argv, envp); } -static long (*armeabi_faccessat)(int dfd, const char __user * filename, int mode) __read_mostly = NULL; +static uintptr_t armeabi_faccessat __read_mostly = NULL; __attribute__((hot)) static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode) { ksu_handle_faccessat(&dfd, &filename, &mode, NULL); - return armeabi_faccessat(dfd, filename, mode); + return sys_faccessat(dfd, filename, mode); } -static long (*armeabi_fstatat64)(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) __read_mostly = NULL; +static uintptr_t armeabi_fstatat64 __read_mostly = NULL; __attribute__((hot)) static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) { ksu_handle_stat(&dfd, &filename, &flag); - return armeabi_fstatat64(dfd, filename, statbuf, flag); + return sys_fstatat64(dfd, filename, statbuf, flag); } -static long (*armeabi_fstat64)(unsigned long fd, struct stat64 __user * statbuf) __read_mostly = NULL; +static uintptr_t armeabi_fstat64 __read_mostly = NULL; __attribute__((cold)) static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf) { // we handle it like rp - long ret = armeabi_fstat64(fd, statbuf); + long ret = sys_fstat64(fd, statbuf); ksu_handle_fstat64_ret(&fd, &statbuf); return ret; } -static long (*armeabi_read)(unsigned int fd, char __user *buf, size_t count) __read_mostly = NULL; +static uintptr_t armeabi_read __read_mostly = NULL; __attribute__((cold)) static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count) { ksu_handle_sys_read_fd(fd); - return armeabi_read(fd, buf, count); + return sys_read(fd, buf, count); } #endif // CONFIG_COMPAT @@ -445,33 +449,6 @@ static int ksu_syscall_table_restore() return 0; } -static void ksu_syscall_table_hook_init() -{ - read_and_replace_syscall((void *)&aarch64_reboot, __AARCH64_reboot, (void 
*)hook_aarch64_reboot, (void *)sys_call_table); - read_and_replace_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, (void *)sys_call_table); - read_and_replace_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table); - read_and_replace_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table); - - // will be unregged - read_and_replace_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table); - read_and_replace_syscall((void *)&aarch64_read, __AARCH64_read, (void *)hook_aarch64_read, (void *)sys_call_table); - -#if defined(CONFIG_COMPAT) - read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)compat_sys_call_table); - read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)compat_sys_call_table); - read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)compat_sys_call_table); - read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table); - - // will be unregged - read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table); - read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table); - -#endif // COMPAT - - // start unreg kthread - kthread_run(ksu_syscall_table_restore, NULL, "unhook"); -} - static DEFINE_MUTEX(sucompat_toggle_mutex); static void syscall_table_sucompat_enable() @@ -508,4 +485,29 @@ static void syscall_table_sucompat_disable() mutex_unlock(&sucompat_toggle_mutex); } +static __init int ksu_syscall_table_hook_init() +{ + // enable on init! 
+ syscall_table_sucompat_enable(); + + read_and_replace_syscall((void *)&aarch64_reboot, __AARCH64_reboot, (void *)hook_aarch64_reboot, (void *)sys_call_table); + + // will be unregged + read_and_replace_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table); + read_and_replace_syscall((void *)&aarch64_read, __AARCH64_read, (void *)hook_aarch64_read, (void *)sys_call_table); + +#if defined(CONFIG_COMPAT) + read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)compat_sys_call_table); + + // will be unregged + read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table); + read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table); +#endif // COMPAT + + // start unreg kthread + kthread_run(ksu_syscall_table_restore, NULL, "unhook"); + return 0; +} +late_initcall(ksu_syscall_table_hook_init); + // EOF diff --git a/drivers/kernelsu/include/ksu.h b/drivers/kernelsu/include/ksu.h index 17a524097535..2f5841290b1d 100644 --- a/drivers/kernelsu/include/ksu.h +++ b/drivers/kernelsu/include/ksu.h @@ -1,7 +1,7 @@ #ifndef __KSU_H_KSU #define __KSU_H_KSU -#define KERNEL_SU_VERSION 32481 +#define KERNEL_SU_VERSION 32485 #define EVENT_POST_FS_DATA 1 #define EVENT_BOOT_COMPLETED 2 diff --git a/drivers/kernelsu/include/uapi/feature.h b/drivers/kernelsu/include/uapi/feature.h index aafd7720148b..b1b92f2fdc48 100644 --- a/drivers/kernelsu/include/uapi/feature.h +++ b/drivers/kernelsu/include/uapi/feature.h @@ -6,10 +6,7 @@ enum ksu_feature_id { KSU_FEATURE_KERNEL_UMOUNT = 1, KSU_FEATURE_SULOG = 2, KSU_FEATURE_ADB_ROOT = 3, - -#ifdef CONFIG_KSU_EXTRAS // custom extensions - KSU_FEATURE_AVC_SPOOF = 10003, -#endif + KSU_FEATURE_SELINUX_HIDE = 4, KSU_FEATURE_MAX }; diff --git a/drivers/kernelsu/ksu.c b/drivers/kernelsu/ksu.c index 
d7d979a0ee0b..79b98fd73e21 100644 --- a/drivers/kernelsu/ksu.c +++ b/drivers/kernelsu/ksu.c @@ -38,6 +38,7 @@ #include "infra/event_queue.h" #include "feature/adb_root.h" #include "feature/kernel_umount.h" +#include "feature/selinux_hide.h" #include "feature/sucompat.h" #include "feature/sulog.h" #include "runtime/ksud.h" @@ -67,6 +68,7 @@ #include "feature/adb_root.c" #include "feature/kernel_umount.c" +#include "feature/selinux_hide.c" #include "feature/sucompat.c" #include "feature/sulog.c" #include "runtime/ksud.c" @@ -84,20 +86,16 @@ #ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE #ifdef CONFIG_ARM64 -#include "hook/syscall_table_hook_arm64.c" -#elif CONFIG_ARM -#include "hook/syscall_table_hook_arm.c" -#endif + #include "hook/syscall_table_hook_arm64.c" +#elif defined(CONFIG_ARM) + #include "hook/syscall_table_hook_arm.c" #endif +#endif /* CONFIG_KSU_TAMPER_SYSCALL_TABLE */ #if defined(CONFIG_KSU_KPROBES_KSUD) && !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) #include "hook/kp_ksud.c" #endif -#ifdef CONFIG_KSU_EXTRAS -#include "extras.c" -#endif - // __weak fn's #include "kernel_compat.c" @@ -138,8 +136,16 @@ int __init kernelsu_init(void) ksu_adb_root_init(); // so the feature is registered #endif +#ifdef CONFIG_KSU_FEATURE_SELINUX_HIDE + ksu_selinux_hide_init(); +#endif + ksu_core_init(); +#if defined(CONFIG_KSU_KPROBES_KSUD) && !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) + kp_ksud_init(); +#endif + ksu_allowlist_init(); ksu_throne_tracker_init(); @@ -148,18 +154,6 @@ int __init kernelsu_init(void) ksu_file_wrapper_init(); -#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE - ksu_syscall_table_hook_init(); -#endif - -#if defined(CONFIG_KSU_KPROBES_KSUD) && !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) - kp_ksud_init(); -#endif - -#ifdef CONFIG_KSU_EXTRAS - ksu_avc_spoof_init(); // so the feature is registered -#endif - return 0; } diff --git a/drivers/kernelsu/manager/pkg_observer.c b/drivers/kernelsu/manager/pkg_observer.c index 3956b517e08f..3a913a6b5ed0 100644 --- 
a/drivers/kernelsu/manager/pkg_observer.c +++ b/drivers/kernelsu/manager/pkg_observer.c @@ -24,6 +24,26 @@ static noinline void ksu_grab_data_system_inode() path_put(&path); } +__attribute__((cold)) +static noinline void ksu_rename_observer_slow(struct dentry *old_dentry, struct dentry *new_dentry) +{ + system_dir_inode_ptr = NULL; // reset cached inode + + char path[128] = { 0 }; + char *buf = dentry_path_raw(new_dentry, path, sizeof(path) - 1); + if (IS_ERR(buf)) { + pr_err("dentry_path_raw failed.\n"); + return; + } + + if (!strstr(buf, "/system/packages.list")) + return; + + pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname, new_dentry->d_iname, buf); + track_throne(false); + return; +} + static inline void ksu_rename_observer(struct dentry *old_dentry, struct dentry *new_dentry) { // skip kernel threads @@ -71,19 +91,6 @@ static inline void ksu_rename_observer(struct dentry *old_dentry, struct dentry return; slow_path: - system_dir_inode_ptr = NULL; // reset cached inode - - char path[128] = { 0 }; - char *buf = dentry_path_raw(new_dentry, path, sizeof(path) - 1); - if (IS_ERR(buf)) { - pr_err("dentry_path_raw failed.\n"); - return; - } - - if (!strstr(buf, "/system/packages.list")) - return; - - pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname, new_dentry->d_iname, buf); - track_throne(false); + ksu_rename_observer_slow(old_dentry, new_dentry); return; } diff --git a/drivers/kernelsu/manager/throne_tracker.c b/drivers/kernelsu/manager/throne_tracker.c index e23d1cf852e2..f61bdf3a36b1 100644 --- a/drivers/kernelsu/manager/throne_tracker.c +++ b/drivers/kernelsu/manager/throne_tracker.c @@ -143,7 +143,7 @@ FILLDIR_RETURN_TYPE my_actor(MY_ACTOR_CTX_ARG, const char *name, #define ksu_get_magic(x) ((x)->f_path.dentry->d_inode->i_sb->s_magic) #endif -void search_manager(const char *path, int depth, struct list_head *uid_data) +static noinline void search_manager(const char *path, int depth, struct list_head *uid_data) { int i, stop 
= 0; struct list_head data_path_list; diff --git a/drivers/kernelsu/runtime/ksud.c b/drivers/kernelsu/runtime/ksud.c index c912dad30a42..44b3c25d2618 100644 --- a/drivers/kernelsu/runtime/ksud.c +++ b/drivers/kernelsu/runtime/ksud.c @@ -79,12 +79,6 @@ int nuke_ext4_sysfs(const char *mnt) return 0; } -#ifdef CONFIG_KSU_EXTRAS -extern void ksu_avc_spoof_late_init(); -#else -void ksu_avc_spoof_late_init() {} -#endif - void on_module_mounted(void) { pr_info("on_module_mounted!\n"); @@ -98,7 +92,6 @@ void on_boot_completed(void) ksu_boot_completed = true; pr_info("on_boot_completed!\n"); track_throne(true); - ksu_avc_spoof_late_init(); // slow_avc_init kp } static ssize_t (*orig_read)(struct file *, char __user *, size_t, loff_t *); @@ -327,8 +320,6 @@ static noinline void ksu_common_newfstat_ret(unsigned int fd_int, void **statbuf preempt_enable(); got_flipped = true; } - int old_nice = task_nice(current); - set_user_nice(current, -20); if (ksu_copy_from_user_retry(&size, st_size_ptr, len)) { pr_info("%s: read statbuf 0x%lx failed \n", syscall_name, (unsigned long)st_size_ptr); @@ -344,7 +335,6 @@ static noinline void ksu_common_newfstat_ret(unsigned int fd_int, void **statbuf pr_info("%s: add ksu_rc_len failed: statbuf 0x%lx \n", syscall_name, (unsigned long)st_size_ptr); out: - set_user_nice(current, old_nice); if (got_flipped) preempt_disable(); diff --git a/drivers/kernelsu/selinux/rules.c b/drivers/kernelsu/selinux/rules.c index 30e19b7236f7..c51990b6b060 100644 --- a/drivers/kernelsu/selinux/rules.c +++ b/drivers/kernelsu/selinux/rules.c @@ -39,22 +39,7 @@ static inline rwlock_t *ksu_get_policy_rwlock() { return &selinux_state.ss->poli #elif defined(KSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK) static inline rwlock_t *ksu_get_policy_rwlock() { extern rwlock_t policy_rwlock; return &policy_rwlock; } #elif defined(CONFIG_KALLSYMS) -static noinline rwlock_t *ksu_get_policy_rwlock() -{ - static bool already_ran = false; - - static rwlock_t *policy_rwlock_ksym = NULL; - - if 
(likely(already_ran)) - return policy_rwlock_ksym; - - policy_rwlock_ksym = (rwlock_t *)kallsyms_lookup_name("policy_rwlock"); - if (policy_rwlock_ksym) - pr_info("apply_kernelsu_rules: policy_rwlock: 0x%lx via ksym\n", (uintptr_t)policy_rwlock_ksym); - - already_ran = true; - return policy_rwlock_ksym; -} +static noinline rwlock_t *ksu_get_policy_rwlock() { return (rwlock_t *)kallsyms_lookup_name("policy_rwlock"); } #else static inline rwlock_t *ksu_get_policy_rwlock() { return NULL; } #endif @@ -156,8 +141,8 @@ void apply_kernelsu_rules() struct selinux_policy *pol, *old_pol = selinux_state.policy; mutex_lock(&selinux_state.policy_mutex); pol = ksu_dup_sepolicy(rcu_dereference_protected(old_pol, lockdep_is_held(&selinux_state.policy_mutex))); - if (!pol) { - pr_err("failed to dup selinux_policy\n"); + if (IS_ERR(pol)) { + pr_err("failed to dup selinux_policy: %ld\n", PTR_ERR(pol)); goto out_unlock; } db = &pol->policydb; @@ -511,10 +496,10 @@ int handle_sepolicy(void __user *user_data, u64 data_len) mutex_lock(&selinux_state.policy_mutex); old_pol = selinux_state.policy; - pol = ksu_dup_sepolicy(rcu_dereference_protected( - old_pol, lockdep_is_held(&selinux_state.policy_mutex))); - if (!pol) { - ret = -ENOMEM; + pol = ksu_dup_sepolicy(rcu_dereference_protected(old_pol, lockdep_is_held(&selinux_state.policy_mutex))); + if (IS_ERR(pol)) { + ret = PTR_ERR(pol); + pr_err("ksu_dup_sepolicy err: %d\n", ret); goto out_unlock; } db = &pol->policydb; @@ -557,6 +542,10 @@ int handle_sepolicy(void __user *user_data, u64 data_len) pr_err("sepol: cmd #%u failed, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd); } else { success_cmd_count++; + int argc = sepol_expected_argc(header.cmd); + int i; + for (i = 0; i < argc; i++) + ksu_add_shit_to_list(args[i]); } cmd_index++; } @@ -634,6 +623,11 @@ static int handle_sepolicy_fn(void *data) else { pr_info("sepol: cmd #%u success, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd); success_cmd_count++; + int argc 
= sepol_expected_argc(header.cmd); + int i; + for (i = 0; i < argc; i++) + ksu_add_shit_to_list(args[i]); + } cmd_index++; diff --git a/drivers/kernelsu/selinux/sepolicy.c b/drivers/kernelsu/selinux/sepolicy.c index 9593d8c7fa83..a97c7430efcf 100644 --- a/drivers/kernelsu/selinux/sepolicy.c +++ b/drivers/kernelsu/selinux/sepolicy.c @@ -1196,452 +1196,83 @@ bool ksu_genfscon(struct policydb *db, const char *fs_name, const char *path, #include "ss/policydb.h" #include "ss/services.h" - -// https://github.com/torvalds/linux/commit/581646c3fb98494009671f6d347ea125bc0e663a -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 10, 0) -#define CONST_IF_6_10 const -#else -#define CONST_IF_6_10 -#endif - -// ======== begin copy ======== - -static int copy_hashtab_node(struct hashtab_node *new_node, CONST_IF_6_10 struct hashtab_node *old_node, void *data) -{ - new_node->datum = old_node->datum; - new_node->key = old_node->key; - return 0; -} - -static int destroy_hashtab_node(void *key, void *datum, void *data) -{ - // just copied pointer, no need to free - return 0; -} - -static int shallow_copy_hashtab(struct hashtab *new_tab, struct hashtab *old_tab) -{ - return hashtab_duplicate(new_tab, old_tab, copy_hashtab_node, destroy_hashtab_node, NULL); -} - -// ======== class_datum ======== - -static int -copy_class_datum_partially_callback(struct hashtab_node *new_node, CONST_IF_6_10 struct hashtab_node *old_node, void *data) -{ - struct policydb *db = data; - struct class_datum *cls = old_node->datum, *new_cls; - struct constraint_node *oldn, *n, *nprev = NULL; - struct constraint_expr *olde, *e, *eprev; - new_node->key = old_node->key; - new_cls = kmemdup(cls, sizeof(struct class_datum), GFP_KERNEL); - if (!new_cls) - return -ENOMEM; - new_node->datum = new_cls; - new_cls->constraints = NULL; - for (oldn = cls->constraints; oldn; oldn = oldn->next) { - n = kmemdup(oldn, sizeof(struct constraint_node), GFP_KERNEL); - if (!n) - goto out_nomem; - if (nprev) { - nprev->next = n; - } else { 
- new_cls->constraints = n; - } - eprev = NULL; - n->expr = NULL; - for (olde = oldn->expr; olde; olde = olde->next) { - e = kmemdup(olde, sizeof(struct constraint_expr), GFP_KERNEL); - if (!e) { - goto out_nomem; - } - if (eprev) { - eprev->next = e; - } else { - n->expr = e; - } - if (olde->expr_type == CEXPR_NAMES) { - if (ebitmap_cpy(&e->names, &olde->names) < 0) { - goto out_nomem; - } - } - eprev = e; - } - nprev = n; - } - - db->class_val_to_struct[new_cls->value - 1] = new_cls; - - return 0; -out_nomem: - return -ENOMEM; -} - -static int destroy_class_datum_partially_callback(void *key, void *datum, void *data) -{ - struct class_datum *cls = datum; - struct constraint_node *n, *nprev; - struct constraint_expr *e, *eprev; - if (cls) { - for (n = cls->constraints; n;) { - for (e = n->expr; e;) { - if (e->expr_type == CEXPR_NAMES) { - ebitmap_destroy(&e->names); - } - eprev = e; - e = e->next; - kfree(eprev); - } - nprev = n; - n = n->next; - kfree(nprev); - } - } - kfree(cls); - - return 0; -} - -static void free_class_datum_partially(struct policydb *db) +void ksu_destroy_sepolicy(struct selinux_policy *pol) { - if (db->class_val_to_struct) { - kfree(db->class_val_to_struct); - } - - if (db->p_classes.table.htable) { - hashtab_map(&db->p_classes.table, destroy_class_datum_partially_callback, NULL); - hashtab_destroy(&db->p_classes.table); - } + policydb_destroy(&pol->policydb); + kfree(pol); } -static int copy_class_datum_partially(struct policydb *new_db, struct policydb *old_db) +struct selinux_policy *ksu_dup_sepolicy(struct selinux_policy *old_pol) { int ret; - u32 n = new_db->symtab[SYM_CLASSES].nprim; - struct class_datum **new_class_val_to_struct; - - new_db->class_val_to_struct = NULL; - memset(&new_db->p_classes.table, 0, sizeof(new_db->p_classes.table)); - - new_class_val_to_struct = - kcalloc(n, sizeof(struct class_datum *), GFP_KERNEL); - if (!new_class_val_to_struct) { + size_t len; + struct selinux_policy *new_pol; + void *data; + struct 
policy_file fp; + + len = old_pol->policydb.len; + data = vmalloc(len); + if (!data) { + pr_err("alloc policy len %ld\n", len); ret = -ENOMEM; - goto exit; + goto out_free_data; } - new_db->class_val_to_struct = new_class_val_to_struct; - ret = hashtab_duplicate(&new_db->p_classes.table, &old_db->p_classes.table, - copy_class_datum_partially_callback, - destroy_class_datum_partially_callback, new_db); + fp.data = data; + fp.len = len; + ret = policydb_write(&old_pol->policydb, &fp); if (ret) { - goto exit; - } - - return 0; - -exit: - free_class_datum_partially(new_db); - return ret; -} - -// ======== avtab ======== - -static int copy_avtab(struct avtab *new_avtab, struct avtab *old_avtab) -{ - int ret, i; - struct avtab_node *n, *p; - ret = avtab_alloc_dup(new_avtab, old_avtab); - if (ret < 0) - return ret; - // avtab_alloc_dup didn't zero it - new_avtab->nel = 0; - - for (i = 0; i < old_avtab->nslot; i++) { - n = old_avtab->htable[i]; - while (n) { - p = avtab_insert_nonunique(new_avtab, &n->key, &n->datum); - if (!p) { - ret = -ENOMEM; - goto out_free; - } - n = n->next; + pr_err("sepolicy: policydb_write: %d\n", ret); + goto out_free_data; + } + + // https://android-review.googlesource.com/c/kernel/common/+/3009995/11/security/selinux/ss/policydb.c + // fixup config + // 4*2+8+4 + static const size_t kConfigOff = 20; + if (len >= kConfigOff + sizeof(u32)) { + u32 *config_ptr = (u32 *)((unsigned long)data + kConfigOff); + pr_info("old config: %u\n", *config_ptr); + if (old_pol->policydb.android_netlink_route) { + pr_info("adding POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE\n"); + *config_ptr |= POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE; } - } - - return 0; - -out_free: - avtab_destroy(new_avtab); - return ret; -} - -// ======== role_datum ======== - -static int -copy_role_datum_partially_callback(struct hashtab_node *new_node, CONST_IF_6_10 struct hashtab_node *old_node, void *data) -{ - int ret = 0; - struct policydb *db = data; - struct role_datum *role = 
old_node->datum, *new_role; - new_role = kmemdup(role, sizeof(struct role_datum), GFP_KERNEL); - if (!new_role) { - ret = -ENOMEM; - goto out; - } - new_node->datum = new_role; - new_node->key = old_node->key; - - ret = ebitmap_cpy(&new_role->types, &role->types); - if (ret) { - goto out; - } - db->role_val_to_struct[role->value - 1] = new_role; - -out: - return ret; -} - -static int destroy_role_datum_partially_callback(void *key, void *datum, void *data) -{ - struct role_datum *role = datum; - if (role) { - ebitmap_destroy(&role->types); - kfree(role); - } - return 0; -} - -static void free_role_datum_partially(struct policydb *db) -{ - if (db->role_val_to_struct) { - kfree(db->role_val_to_struct); - } - if (db->p_roles.table.htable) { - hashtab_map(&db->p_roles.table, destroy_role_datum_partially_callback, NULL); - hashtab_destroy(&db->p_roles.table); - } -} - -static int copy_role_datum_partially(struct policydb *new_db, struct policydb *old_db) -{ - int ret; - struct role_datum **new_role_val_to_struct; - u32 n = old_db->p_roles.nprim; - - new_db->role_val_to_struct = NULL; - memset(&new_db->p_roles.table, 0, sizeof(new_db->p_roles.table)); - - new_role_val_to_struct = - kcalloc(n, sizeof(*new_db->role_val_to_struct), GFP_KERNEL); - if (!new_role_val_to_struct) { - ret = -ENOMEM; - goto out_free; - } - new_db->role_val_to_struct = new_role_val_to_struct; - - ret = hashtab_duplicate(&new_db->p_roles.table, &old_db->p_roles.table, - copy_role_datum_partially_callback, - destroy_role_datum_partially_callback, new_db); - if (ret) - goto out_free; - return 0; - -out_free: - free_role_datum_partially(new_db); - - return ret; -} - -// ======== type_datum ======== - -static void free_type_datum_partially(struct policydb *db) -{ - u32 sz = db->p_types.nprim, i; - if (db->type_attr_map_array) { - for (i = 0; i < sz; i++) { - ebitmap_destroy(&db->type_attr_map_array[i]); + if (old_pol->policydb.android_netlink_getneigh) { + pr_info("adding 
POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH\n"); + *config_ptr |= POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH; } - - kvfree(db->type_attr_map_array); - } - - if (db->type_val_to_struct) { - kvfree(db->type_val_to_struct); + pr_info("new config: %u\n", *config_ptr); } - if (db->sym_val_to_name[SYM_TYPES]) { - kvfree(db->sym_val_to_name[SYM_TYPES]); - } - - hashtab_destroy(&db->p_types.table); -} - -static int copy_type_datum_partially(struct policydb *new_db, struct policydb *old_db) -{ - int ret = -ENOMEM; - u32 sz = new_db->p_types.nprim, i; - struct ebitmap *new_type_attr_map_array; - struct type_datum **new_type_val_to_struct; - char **new_sym_val_to_name_types; - - new_db->type_attr_map_array = NULL; - new_db->type_val_to_struct = NULL; - new_db->sym_val_to_name[SYM_TYPES] = NULL; - memset(&new_db->p_types.table, 0, sizeof(new_db->p_types.table)); - - // ======== type_attr_map_array ======== - - new_type_attr_map_array = kvcalloc(sz, sizeof(struct ebitmap), GFP_KERNEL); - - if (!new_type_attr_map_array) { - goto out; - } - - new_db->type_attr_map_array = new_type_attr_map_array; - for (i = 0; i < sz; i++) { - ret = ebitmap_cpy(&new_db->type_attr_map_array[i], - &old_db->type_attr_map_array[i]); - if (ret < 0) - goto out; - } - - // ======== type_val_to_struct ======== - ret = -ENOMEM; - - new_type_val_to_struct = - kvcalloc(sz, sizeof(*new_db->type_val_to_struct), GFP_KERNEL); - if (!new_type_val_to_struct) { - goto out; - } - new_db->type_val_to_struct = new_type_val_to_struct; - memcpy(new_db->type_val_to_struct, old_db->type_val_to_struct, - sz * sizeof(*new_db->type_val_to_struct)); - - // ======== sym_val_to_name[SYM_TYPES] ======== - - new_sym_val_to_name_types = - kvcalloc(sz, sizeof(*new_db->sym_val_to_name[SYM_TYPES]), GFP_KERNEL); - if (!new_sym_val_to_name_types) - goto out; - new_db->sym_val_to_name[SYM_TYPES] = new_sym_val_to_name_types; - memcpy(new_db->sym_val_to_name[SYM_TYPES], - old_db->sym_val_to_name[SYM_TYPES], - sz * 
sizeof(*new_db->sym_val_to_name[SYM_TYPES])); - - // ======== p_types ======== - - ret = shallow_copy_hashtab(&new_db->p_types.table, &old_db->p_types.table); - if (ret < 0) - goto out; - - return 0; -out: - free_type_datum_partially(new_db); - return ret; -} - -// ======== permissive_map ======== - -static void free_permissive_map(struct policydb *db) -{ - ebitmap_destroy(&db->permissive_map); -} - -static int copy_permissive_map(struct policydb *new_db, struct policydb *old_db) -{ - // On failure, the old ebitmap is cleaned. - return ebitmap_cpy(&new_db->permissive_map, &old_db->permissive_map); -} - -// ======== filename_trans ======== - -static void free_filename_trans(struct policydb *db) -{ - hashtab_destroy(&db->filename_trans); -} - -static int copy_filename_trans(struct policydb *new_db, struct policydb *old_db) -{ - // On failure, the old hashtab is cleaned. - return shallow_copy_hashtab(&new_db->filename_trans, &old_db->filename_trans); -} - -// ======== sepolicy ======== - -void ksu_destroy_sepolicy(struct selinux_policy *pol) -{ - if (!pol) - return; - - struct policydb *db = &pol->policydb; - - free_class_datum_partially(db); - - avtab_destroy(&db->te_avtab); - - free_role_datum_partially(db); - - free_type_datum_partially(db); - - free_permissive_map(db); - - free_filename_trans(db); - - kfree(pol); -} - -struct selinux_policy *ksu_dup_sepolicy(struct selinux_policy *old_pol) -{ - int ret; - struct selinux_policy *new_pol = - kmemdup(old_pol, sizeof(*old_pol), GFP_KERNEL); + new_pol = kmemdup(old_pol, sizeof(*old_pol), GFP_KERNEL); if (!new_pol) { - return NULL; - } - struct policydb *new_db = &new_pol->policydb, *old_db = &old_pol->policydb; - - ret = copy_class_datum_partially(new_db, old_db); - if (ret < 0) { - pr_err("ksu_dup_sepolicy: copy_class_datum_partially\n"); - goto out; - } - - ret = copy_avtab(&new_db->te_avtab, &old_db->te_avtab); - if (ret < 0) { - pr_err("ksu_dup_sepolicy: copy_avtab\n"); - goto out; - } - - ret = 
copy_role_datum_partially(new_db, old_db); - if (ret < 0) { - pr_err("ksu_dup_sepolicy: copy_role_datum_partially\n"); - goto out; - } - - ret = copy_type_datum_partially(new_db, old_db); - if (ret < 0) { - pr_err("ksu_dup_sepolicy: copy_type_datum_partially\n"); - goto out; + ret = -ENOMEM; + pr_err("sepolicy: dup old pol\n"); + goto out_free_data; } + memset(&new_pol->policydb, 0, sizeof(new_pol->policydb)); - ret = copy_permissive_map(new_db, old_db); - if (ret < 0) { - pr_err("ksu_dup_sepolicy: copy_permissive_map\n"); - goto out; - } + // rewind fp + fp.data = data; + fp.len = len; - ret = copy_filename_trans(new_db, old_db); - if (ret < 0) { - pr_err("ksu_dup_sepolicy: copy_filename_trans\n"); - goto out; + ret = policydb_read(&new_pol->policydb, &fp); + if (ret) { + pr_err("sepolicy: policydb_read: %d\n", ret); + goto out_free_policydb; } + new_pol->policydb.len = old_pol->policydb.len; + kvfree(data); return new_pol; -out: +out_free_policydb: kfree(new_pol); - return NULL; + +out_free_data: + kvfree(data); + + return ERR_PTR(ret); } #endif From 82b068a22a76597c21f86cb5aefb0a71c040c67d Mon Sep 17 00:00:00 2001 From: awkoo <184658409+awkoo@users.noreply.github.com> Date: Tue, 12 May 2026 04:31:52 +0000 Subject: [PATCH 59/59] =?UTF-8?q?KernelSU:=20=E4=BD=BF=E7=94=A8=E6=89=8B?= =?UTF-8?q?=E5=8A=A8hook?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: awkoo <184658409+awkoo@users.noreply.github.com> --- .../configs/vendor/xiaomi/mi845_defconfig | 1 - fs/exec.c | 12 +++++++++ fs/open.c | 10 ++++++++ fs/stat.c | 25 +++++++++++++++++++ kernel/reboot.c | 7 ++++++ security/selinux/hooks.c | 8 ++++++ 6 files changed, 62 insertions(+), 1 deletion(-) diff --git a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig index 92a628b225af..1ff7a9286950 100644 --- a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig +++ b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig @@ 
-636,4 +636,3 @@ CONFIG_SND_SOC_WCD_SPI=y CONFIG_SOUNDWIRE=y CONFIG_WCD_SPI_AC=y CONFIG_KSU=y -CONFIG_KSU_TAMPER_SYSCALL_TABLE=y diff --git a/fs/exec.c b/fs/exec.c index 5addf3b00561..351ce34f1226 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1837,12 +1837,21 @@ static int do_execveat_common(int fd, struct filename *filename, return retval; } +#ifdef CONFIG_KSU +__attribute__((hot)) +extern int ksu_handle_execveat(int *fd, struct filename **filename_ptr, + void *argv, void *envp, int *flags); +#endif + int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; +#ifdef CONFIG_KSU + ksu_handle_execveat((int *)AT_FDCWD, &filename, &argv, &envp, 0); +#endif return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } @@ -1870,6 +1879,9 @@ static int compat_do_execve(struct filename *filename, .is_compat = true, .ptr.compat = __envp, }; +#ifdef CONFIG_KSU // 32-bit ksud and 32-on-64 support + ksu_handle_execveat((int *)AT_FDCWD, &filename, &argv, &envp, 0); +#endif return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } diff --git a/fs/open.c b/fs/open.c index f2b82c462fbb..7dc516777071 100644 --- a/fs/open.c +++ b/fs/open.c @@ -355,6 +355,12 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) return error; } +#ifdef CONFIG_KSU +__attribute__((hot)) +extern int ksu_handle_faccessat(int *dfd, const char __user **filename_user, + int *mode, int *flags); +#endif + /* * access() needs to use the real uid/gid, not the effective uid/gid. 
* We do this by temporarily clearing all FS-related capabilities and @@ -370,6 +376,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) int res; unsigned int lookup_flags = LOOKUP_FOLLOW; +#ifdef CONFIG_KSU + ksu_handle_faccessat(&dfd, &filename, &mode, NULL); +#endif + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; diff --git a/fs/stat.c b/fs/stat.c index 068fdbcc9e26..6c795dd237bc 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -287,6 +287,12 @@ SYSCALL_DEFINE2(newlstat, const char __user *, filename, return cp_new_stat(&stat, statbuf); } +#ifdef CONFIG_KSU +__attribute__((hot)) +extern int ksu_handle_stat(int *dfd, const char __user **filename_user, + int *flags); +#endif + #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, struct stat __user *, statbuf, int, flag) @@ -294,6 +300,9 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, struct kstat stat; int error; +#ifdef CONFIG_KSU + ksu_handle_stat(&dfd, &filename, &flag); +#endif error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; @@ -301,6 +310,13 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, } #endif +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) +extern void ksu_handle_newfstat_ret(unsigned int *fd, struct stat __user **statbuf_ptr); +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) +extern void ksu_handle_fstat64_ret(unsigned long *fd, struct stat64 __user **statbuf_ptr); // for 32-bit +#endif +#endif + SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) { struct kstat stat; @@ -309,6 +325,9 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) if (!error) error = cp_new_stat(&stat, statbuf); +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) + ksu_handle_newfstat_ret(&fd, &statbuf); +#endif return error; 
} @@ -427,6 +446,9 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf) if (!error) error = cp_new_stat64(&stat, statbuf); +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) // for 32-bit + ksu_handle_fstat64_ret(&fd, &statbuf); +#endif return error; } @@ -436,6 +458,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, struct kstat stat; int error; +#ifdef CONFIG_KSU // 32-bit su + ksu_handle_stat(&dfd, &filename, &flag); +#endif error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; diff --git a/kernel/reboot.c b/kernel/reboot.c index 2946ed1d99d4..a5ff5d0ef572 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -269,6 +269,10 @@ EXPORT_SYMBOL_GPL(kernel_power_off); static DEFINE_MUTEX(reboot_mutex); +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) +extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg); +#endif + /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers @@ -284,6 +288,9 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, char buffer[256]; int ret = 0; +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) + ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); +#endif /* We only trust the superuser with rebooting the system. 
*/ if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) return -EPERM; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4abba0e1674d..ac0c60389581 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5907,6 +5907,10 @@ static int selinux_getprocattr(struct task_struct *p, return -EINVAL; } +#ifdef CONFIG_KSU +extern int ksu_hide_setprocattr(const char *name, void *value, size_t size); +#endif + static int selinux_setprocattr(struct task_struct *p, char *name, void *value, size_t size) { @@ -5916,6 +5920,10 @@ static int selinux_setprocattr(struct task_struct *p, int error; char *str = value; +#ifdef CONFIG_KSU + ksu_hide_setprocattr(name, value, size); +#endif + if (current != p) { /* SELinux only allows a process to change its own security attributes. */