diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 948e5d193a8c..e35f5377f681 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -27,9 +27,9 @@ jobs:
           - ursa
     steps:
       - name: 安装软件包
+        if: env.PACKAGES != ''
         env:
           PACKAGES:
-            ccache
             binutils-aarch64-linux-gnu
             binutils-arm-linux-gnueabi
         run: |
@@ -39,7 +39,7 @@ jobs:
       - name: 安装make4.4.1-2
         run: |
           curl -LSs http://ftp.debian.org/debian/pool/main/m/make-dfsg/make_4.4.1-2_amd64.deb -o make.deb
-          sudo apt-get install -y ./make.deb
+          sudo apt-get install -y -q ./make.deb
           rm ./make.deb
 
       - name: 同步仓库
@@ -49,17 +49,25 @@ jobs:
 
       - name: 缓存Clang
         id: cache-clang
-        uses: actions/cache@main
+        uses: actions/cache/restore@main
         with:
           path: clang
           key: clang-${{ env.AOSP_TOOLCHAIN_BRANCH }}-${{ env.AOSP_CLANG_VERSION }}
 
       - name: 下载Clang
+        id: download_clang
         if: steps.cache-clang.outputs.cache-hit != 'true'
-        run:
-          mkdir -p clang &&
-          curl -LSs "https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86/+archive/refs/heads/${{ env.AOSP_TOOLCHAIN_BRANCH }}-release/clang-${{ env.AOSP_CLANG_VERSION }}.tar.gz" |
-          tar xz -C clang
+        run: |
+          mkdir -p clang
+          wget -c -t 10 "https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86/+archive/refs/heads/${{ env.AOSP_TOOLCHAIN_BRANCH }}-release/clang-${{ env.AOSP_CLANG_VERSION }}.tar.gz" -O clang.tgz
+          tar -zxvf clang.tgz -C clang/
+
+      - name: 保存Clang
+        if: always() && steps.cache-clang.outputs.cache-hit != 'true' && steps.download_clang.outcome == 'success'
+        uses: actions/cache/save@main
+        with:
+          path: clang
+          key: clang-${{ env.AOSP_TOOLCHAIN_BRANCH }}-${{ env.AOSP_CLANG_VERSION }}
 
       - name: 缓存ccache
         uses: hendrikmuhs/ccache-action@main
@@ -73,17 +81,18 @@ jobs:
         env:
           MAKE_ARGS:
             -j$(nproc --all)
+            O=out
+            LLVM=1
+            LLVM_IAS=1
             CC="ccache clang"
             LD=ld.lld
             ARCH=arm64
-            LLVM=1
-            LLVM_IAS=1
-            O=out
             CROSS_COMPILE=aarch64-linux-gnu-
             CROSS_COMPILE_ARM32=arm-linux-gnueabi-
           CONFIG_FILES:
             vendor/xiaomi/mi845_defconfig
             vendor/xiaomi/${{ matrix.device }}.config
+            lxc.config
         run: |
           export PATH=$GITHUB_WORKSPACE/clang/bin:$PATH
           export KBUILD_BUILD_USER=${{ github.repository_owner }}
@@ -112,31 +121,15 @@ jobs:
           EOF
           zip -qr9 Anykernel3-${{ matrix.device }}.zip * -x .git .github README.md *placeholder
 
-      - name: 打包(boot)
-        run: |
-          git clone https://android.googlesource.com/platform/system/tools/mkbootimg --depth=1 mkbootimg
-          cp kernel/out/arch/arm64/boot/Image.gz-dtb mkbootimg/
-          cd mkbootimg
-          boot_url=$(curl -LSs https://download.lineageos.org/api/v2/devices/${{ matrix.device }}/builds | jq -r '.[0].files[1].url')
-          curl -LSs $boot_url -o boot.img
-          mkbootimg_args=$(./unpack_bootimg.py --out out --boot_img boot.img --format mkbootimg)
-          mv Image.gz-dtb out/kernel
-          eval "./mkbootimg.py $mkbootimg_args -o boot-lineage-${{ matrix.device }}.img"
-
       - name: 上传文件
         uses: actions/upload-artifact@main
         with:
           name: kernel-${{ matrix.device }}-ak3
           path: ak3/Anykernel3-${{ matrix.device }}.zip
 
-      - name: 上传文件
-        uses: actions/upload-artifact@main
-        with:
-          name: kernel-${{ matrix.device }}-boot
-          path: mkbootimg/boot-lineage-${{ matrix.device }}.img
-
   release:
     name: 发布
+    if: github.event_name == 'push'
     permissions: { contents: write }
     runs-on: ubuntu-latest
     needs: build
@@ -151,54 +144,61 @@ jobs:
       - name: 获取当前时间
         id: time
         run: |
-          echo "time=$(TZ='Asia/Shanghai' date -u +'%Y%m%d%H%M')" >> $GITHUB_OUTPUT
-          echo "timestamp=$(date +%s)" >> $GITHUB_OUTPUT
-
-      - name: 下载ci管理器
-        continue-on-error: true
-        uses: dawidd6/action-download-artifact@master
-        with:
-          repo: rsuntk/KernelSU
-          workflow_conclusion: success
-          name: manager
-          workflow: build-manager.yml
-          path: manager
-          check_artifacts: true
-          search_artifacts: true
+          NOW=$(date +%s)
+          TIME_STR=$(TZ='Asia/Shanghai' date -d "@$NOW" +'%Y%m%d%H%M')
+          echo "timestamp=$NOW" >> $GITHUB_OUTPUT
+          echo "time=$TIME_STR" >> $GITHUB_OUTPUT
 
       - name: 发布
         uses: softprops/action-gh-release@master
+        id: release
         with:
           tag_name: rel-${{ steps.time.outputs.timestamp }}
           name: Kernel build ${{ steps.time.outputs.time }}
           prerelease: ${{ startsWith(github.ref_name, 'dev/') }}
           files: |
             kernel/*
-            manager/*
 
       - name: 发送Telegram通知
         continue-on-error: true
-        env:
-          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
-          COMMIT_URL: ${{ github.event.head_commit.url }}
-          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-          RELEASE_URL: ${{ github.server_url }}/${{ github.repository }}/releases/tag/rel-${{ steps.time.outputs.timestamp }}
         run: |
-          msg="*CI ${{ steps.time.outputs.time }}*
-          > Branch/分支: \`${{ github.ref_name }}\`
-          \`\`\`
-          $COMMIT_MESSAGE
-          \`\`\`
-          [Download/下载]($RELEASE_URL)
-          [Commit/提交]($COMMIT_URL)
-          [Run/工作流]($RUN_URL)
+          IDS=(${{ join(github.event.commits.*.id, ' ') }})
+          MAX=6
+          if [ "${#IDS[@]}" -gt "$MAX" ]; then
+            COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]:0:$MAX}"; echo "......")"
+          else
+            COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]}")"
+          fi
+          MSG="\
+          <b>CI ${{ steps.time.outputs.time }}</b>
+          <pre>\
+          项目: ${{ github.repository }}
+          分支: ${{ github.ref_name }}\
+          </pre>
+          <b>提交ID:</b>
+          <pre>$COMMIT_IDS_TEXT</pre>\
           "
-          curl -LSs https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \
-                      -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \
-                      -F 'message_thread_id=${{ secrets.TELEGRAM_MESSAGE_THREAD_ID }}' \
-                      -F 'parse_mode="markdownv2"' \
-                      -F "text=\"$msg\"" | tee Markdown.txt
-          ! ${{ startsWith(github.ref_name, 'stable/') }} || \
-          curl https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \
-                      -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \
-                      -F message_id=$(jq '.result.message_id' Markdown.txt)
+          # Telegram expects link_preview_options and reply_markup as JSON-serialized strings.
+          PREVIEW_OPTIONS="{ \
+            \"url\": \"${{ steps.release.outputs.url }}\", \
+            \"prefer_small_media\": true, \
+            \"show_above_text\": true \
+          }"
+          BUTTONS="{\"inline_keyboard\": [ [ \
+            { \"text\": \"下载链接\", \"url\": \"${{ steps.release.outputs.url }}\" }, \
+            { \"text\": \"对比差异\", \"url\": \"${{ github.event.compare }}\" } \
+          ] ] }"
+          # URL-encode the JSON and text fields: they hold spaces, quotes and non-ASCII characters.
+          curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \
+                    -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \
+                    -d "message_thread_id=${{ vars.TELEGRAM_MESSAGE_THREAD_ID }}" \
+                    -d "parse_mode=HTML" \
+                    --data-urlencode "text=$MSG" \
+                    --data-urlencode "link_preview_options=$PREVIEW_OPTIONS" \
+                    --data-urlencode "reply_markup=$BUTTONS" \
+                    -o response.txt && \
+          (! ${{ startsWith(github.ref_name, 'stable/') }} || \
+          curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \
+                    -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \
+                    -d "message_id=$(jq '.result.message_id' response.txt)")
+          if [ "${{ runner.debug }}" = "1" ]; then cat response.txt; fi  # show the API reply in debug runs
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index f8614b3d49f9..a542b9f2a30d 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,7 +1,5 @@
 00-INDEX
 	- This file
-bfq-iosched.txt
-	- BFQ IO scheduler and its tunables
 biodoc.txt
 	- Notes on the Generic Block Layer Rewrite in Linux 2.5
 biovecs.txt
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
deleted file mode 100644
index 0539e87962ed..000000000000
--- a/Documentation/block/bfq-iosched.txt
+++ /dev/null
@@ -1,545 +0,0 @@
-BFQ (Budget Fair Queueing)
-==========================
-
-BFQ is a proportional-share I/O scheduler, with some extra
-low-latency capabilities. In addition to cgroups support (blkio or io
-controllers), BFQ's main features are:
-- BFQ guarantees a high system and application responsiveness, and a
-  low latency for time-sensitive applications, such as audio or video
-  players;
-- BFQ distributes bandwidth, and not just time, among processes or
-  groups (switching back to time distribution when needed to keep
-  throughput high).
-
-In its default configuration, BFQ privileges latency over
-throughput. So, when needed for achieving a lower latency, BFQ builds
-schedules that may lead to a lower throughput. If your main or only
-goal, for a given device, is to achieve the maximum-possible
-throughput at all times, then do switch off all low-latency heuristics
-for that device, by setting low_latency to 0. Full details in Section 3.
-
-On average CPUs, the current version of BFQ can handle devices
-performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
-reference, 30-50 KIOPS correspond to very high bandwidths with
-sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
-to 120-200 MB/s with 4KB random I/O.
-
-The table of contents follow. Impatients can just jump to Section 3.
-
-CONTENTS
-
-1. When may BFQ be useful?
- 1-1 Personal systems
- 1-2 Server systems
-2. How does BFQ work?
-3. What are BFQ's tunable?
-4. BFQ group scheduling
- 4-1 Service guarantees provided
- 4-2 Interface
-
-1. When may BFQ be useful?
-==========================
-
-BFQ provides the following benefits on personal and server systems.
-
-1-1 Personal systems
---------------------
-
-Low latency for interactive applications
-
-Regardless of the actual background workload, BFQ guarantees that, for
-interactive tasks, the storage device is virtually as responsive as if
-it was idle. For example, even if one or more of the following
-background workloads are being executed:
-- one or more large files are being read, written or copied,
-- a tree of source files is being compiled,
-- one or more virtual machines are performing I/O,
-- a software update is in progress,
-- indexing daemons are scanning filesystems and updating their
-  databases,
-starting an application or loading a file from within an application
-takes about the same time as if the storage device was idle. As a
-comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
-applications experience high latencies, or even become unresponsive
-until the background workload terminates (also on SSDs).
-
-Low latency for soft real-time applications
-
-Also soft real-time applications, such as audio and video
-players/streamers, enjoy a low latency and a low drop rate, regardless
-of the background I/O workload. As a consequence, these applications
-do not suffer from almost any glitch due to the background workload.
-
-Higher speed for code-development tasks
-
-If some additional workload happens to be executed in parallel, then
-BFQ executes the I/O-related components of typical code-development
-tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
-NOOP or DEADLINE.
-
-High throughput
-
-On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
-up to 150% higher throughput than DEADLINE and NOOP, with all the
-sequential workloads considered in our tests. With random workloads,
-and with all the workloads on flash-based devices, BFQ achieves,
-instead, about the same throughput as the other schedulers.
-
-Strong fairness, bandwidth and delay guarantees
-
-BFQ distributes the device throughput, and not just the device time,
-among I/O-bound applications in proportion their weights, with any
-workload and regardless of the device parameters. From these bandwidth
-guarantees, it is possible to compute tight per-I/O-request delay
-guarantees by a simple formula. If not configured for strict service
-guarantees, BFQ switches to time-based resource sharing (only) for
-applications that would otherwise cause a throughput loss.
-
-1-2 Server systems
-------------------
-
-Most benefits for server systems follow from the same service
-properties as above. In particular, regardless of whether additional,
-possibly heavy workloads are being served, BFQ guarantees:
-
-. audio and video-streaming with zero or very low jitter and drop
-  rate;
-
-. fast retrieval of WEB pages and embedded objects;
-
-. real-time recording of data in live-dumping applications (e.g.,
-  packet logging);
-
-. responsiveness in local and remote access to a server.
-
-
-2. How does BFQ work?
-=====================
-
-BFQ is a proportional-share I/O scheduler, whose general structure,
-plus a lot of code, are borrowed from CFQ.
-
-- Each process doing I/O on a device is associated with a weight and a
-  (bfq_)queue.
-
-- BFQ grants exclusive access to the device, for a while, to one queue
-  (process) at a time, and implements this service model by
-  associating every queue with a budget, measured in number of
-  sectors.
-
-  - After a queue is granted access to the device, the budget of the
-    queue is decremented, on each request dispatch, by the size of the
-    request.
-
-  - The in-service queue is expired, i.e., its service is suspended,
-    only if one of the following events occurs: 1) the queue finishes
-    its budget, 2) the queue empties, 3) a "budget timeout" fires.
-
-    - The budget timeout prevents processes doing random I/O from
-      holding the device for too long and dramatically reducing
-      throughput.
-
-    - Actually, as in CFQ, a queue associated with a process issuing
-      sync requests may not be expired immediately when it empties. In
-      contrast, BFQ may idle the device for a short time interval,
-      giving the process the chance to go on being served if it issues
-      a new request in time. Device idling typically boosts the
-      throughput on rotational devices, if processes do synchronous
-      and sequential I/O. In addition, under BFQ, device idling is
-      also instrumental in guaranteeing the desired throughput
-      fraction to processes issuing sync requests (see the description
-      of the slice_idle tunable in this document, or [1, 2], for more
-      details).
-
-      - With respect to idling for service guarantees, if several
-	processes are competing for the device at the same time, but
-	all processes (and groups, after the following commit) have
-	the same weight, then BFQ guarantees the expected throughput
-	distribution without ever idling the device. Throughput is
-	thus as high as possible in this common scenario.
-
-  - If low-latency mode is enabled (default configuration), BFQ
-    executes some special heuristics to detect interactive and soft
-    real-time applications (e.g., video or audio players/streamers),
-    and to reduce their latency. The most important action taken to
-    achieve this goal is to give to the queues associated with these
-    applications more than their fair share of the device
-    throughput. For brevity, we call just "weight-raising" the whole
-    sets of actions taken by BFQ to privilege these queues. In
-    particular, BFQ provides a milder form of weight-raising for
-    interactive applications, and a stronger form for soft real-time
-    applications.
-
-  - BFQ automatically deactivates idling for queues born in a burst of
-    queue creations. In fact, these queues are usually associated with
-    the processes of applications and services that benefit mostly
-    from a high throughput. Examples are systemd during boot, or git
-    grep.
-
-  - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
-    performing random I/O that becomes mostly sequential if
-    merged. Differently from CFQ, BFQ achieves this goal with a more
-    reactive mechanism, called Early Queue Merge (EQM). EQM is so
-    responsive in detecting interleaved I/O (cooperating processes),
-    that it enables BFQ to achieve a high throughput, by queue
-    merging, even for queues for which CFQ needs a different
-    mechanism, preemption, to get a high throughput. As such EQM is a
-    unified mechanism to achieve a high throughput with interleaved
-    I/O.
-
-  - Queues are scheduled according to a variant of WF2Q+, named
-    B-WF2Q+, and implemented using an augmented rb-tree to preserve an
-    O(log N) overall complexity.  See [2] for more details. B-WF2Q+ is
-    also ready for hierarchical scheduling. However, for a cleaner
-    logical breakdown, the code that enables and completes
-    hierarchical support is provided in the next commit, which focuses
-    exactly on this feature.
-
-  - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
-    perfectly fair, and smooth service. In particular, B-WF2Q+
-    guarantees that each queue receives a fraction of the device
-    throughput proportional to its weight, even if the throughput
-    fluctuates, and regardless of: the device parameters, the current
-    workload and the budgets assigned to the queue.
-
-  - The last, budget-independence, property (although probably
-    counterintuitive in the first place) is definitely beneficial, for
-    the following reasons:
-
-    - First, with any proportional-share scheduler, the maximum
-      deviation with respect to an ideal service is proportional to
-      the maximum budget (slice) assigned to queues. As a consequence,
-      BFQ can keep this deviation tight not only because of the
-      accurate service of B-WF2Q+, but also because BFQ *does not*
-      need to assign a larger budget to a queue to let the queue
-      receive a higher fraction of the device throughput.
-
-    - Second, BFQ is free to choose, for every process (queue), the
-      budget that best fits the needs of the process, or best
-      leverages the I/O pattern of the process. In particular, BFQ
-      updates queue budgets with a simple feedback-loop algorithm that
-      allows a high throughput to be achieved, while still providing
-      tight latency guarantees to time-sensitive applications. When
-      the in-service queue expires, this algorithm computes the next
-      budget of the queue so as to:
-
-      - Let large budgets be eventually assigned to the queues
-	associated with I/O-bound applications performing sequential
-	I/O: in fact, the longer these applications are served once
-	got access to the device, the higher the throughput is.
-
-      - Let small budgets be eventually assigned to the queues
-	associated with time-sensitive applications (which typically
-	perform sporadic and short I/O), because, the smaller the
-	budget assigned to a queue waiting for service is, the sooner
-	B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
-
-- If several processes are competing for the device at the same time,
-  but all processes and groups have the same weight, then BFQ
-  guarantees the expected throughput distribution without ever idling
-  the device. It uses preemption instead. Throughput is then much
-  higher in this common scenario.
-
-- ioprio classes are served in strict priority order, i.e.,
-  lower-priority queues are not served as long as there are
-  higher-priority queues.  Among queues in the same class, the
-  bandwidth is distributed in proportion to the weight of each
-  queue. A very thin extra bandwidth is however guaranteed to
-  the Idle class, to prevent it from starving.
-
-
-3. What are BFQ's tunable?
-==========================
-
-The tunables back_seek-max, back_seek_penalty, fifo_expire_async and
-fifo_expire_sync below are the same as in CFQ. Their description is
-just copied from that for CFQ. Some considerations in the description
-of slice_idle are copied from CFQ too.
-
-per-process ioprio and weight
------------------------------
-
-Unless the cgroups interface is used (see "4. BFQ group scheduling"),
-weights can be assigned to processes only indirectly, through I/O
-priorities, and according to the relation:
-weight = (IOPRIO_BE_NR - ioprio) * 10.
-
-Beware that, if low-latency is set, then BFQ automatically raises the
-weight of the queues associated with interactive and soft real-time
-applications. Unset this tunable if you need/want to control weights.
-
-slice_idle
-----------
-
-This parameter specifies how long BFQ should idle for next I/O
-request, when certain sync BFQ queues become empty. By default
-slice_idle is a non-zero value. Idling has a double purpose: boosting
-throughput and making sure that the desired throughput distribution is
-respected (see the description of how BFQ works, and, if needed, the
-papers referred there).
-
-As for throughput, idling can be very helpful on highly seeky media
-like single spindle SATA/SAS disks where we can cut down on overall
-number of seeks and see improved throughput.
-
-Setting slice_idle to 0 will remove all the idling on queues and one
-should see an overall improved throughput on faster storage devices
-like multiple SATA/SAS disks in hardware RAID configuration.
-
-So depending on storage and workload, it might be useful to set
-slice_idle=0.  In general for SATA/SAS disks and software RAID of
-SATA/SAS disks keeping slice_idle enabled should be useful. For any
-configurations where there are multiple spindles behind single LUN
-(Host based hardware RAID controller or for storage arrays), setting
-slice_idle=0 might end up in better throughput and acceptable
-latencies.
-
-Idling is however necessary to have service guarantees enforced in
-case of differentiated weights or differentiated I/O-request lengths.
-To see why, suppose that a given BFQ queue A must get several I/O
-requests served for each request served for another queue B. Idling
-ensures that, if A makes a new I/O request slightly after becoming
-empty, then no request of B is dispatched in the middle, and thus A
-does not lose the possibility to get more than one request dispatched
-before the next request of B is dispatched. Note that idling
-guarantees the desired differentiated treatment of queues only in
-terms of I/O-request dispatches. To guarantee that the actual service
-order then corresponds to the dispatch order, the strict_guarantees
-tunable must be set too.
-
-There is an important flipside for idling: apart from the above cases
-where it is beneficial also for throughput, idling can severely impact
-throughput. One important case is random workload. Because of this
-issue, BFQ tends to avoid idling as much as possible, when it is not
-beneficial also for throughput. As a consequence of this behavior, and
-of further issues described for the strict_guarantees tunable,
-short-term service guarantees may be occasionally violated. And, in
-some cases, these guarantees may be more important than guaranteeing
-maximum throughput. For example, in video playing/streaming, a very
-low drop rate may be more important than maximum throughput. In these
-cases, consider setting the strict_guarantees parameter.
-
-strict_guarantees
------------------
-
-If this parameter is set (default: unset), then BFQ
-
-- always performs idling when the in-service queue becomes empty;
-
-- forces the device to serve one I/O request at a time, by dispatching a
-  new request only if there is no outstanding request.
-
-In the presence of differentiated weights or I/O-request sizes, both
-the above conditions are needed to guarantee that every BFQ queue
-receives its allotted share of the bandwidth. The first condition is
-needed for the reasons explained in the description of the slice_idle
-tunable.  The second condition is needed because all modern storage
-devices reorder internally-queued requests, which may trivially break
-the service guarantees enforced by the I/O scheduler.
-
-Setting strict_guarantees may evidently affect throughput.
-
-back_seek_max
--------------
-
-This specifies, given in Kbytes, the maximum "distance" for backward seeking.
-The distance is the amount of space from the current head location to the
-sectors that are backward in terms of distance.
-
-This parameter allows the scheduler to anticipate requests in the "backward"
-direction and consider them as being the "next" if they are within this
-distance from the current head location.
-
-back_seek_penalty
------------------
-
-This parameter is used to compute the cost of backward seeking. If the
-backward distance of request is just 1/back_seek_penalty from a "front"
-request, then the seeking cost of two requests is considered equivalent.
-
-So scheduler will not bias toward one or the other request (otherwise scheduler
-will bias toward front request). Default value of back_seek_penalty is 2.
-
-fifo_expire_async
------------------
-
-This parameter is used to set the timeout of asynchronous requests. Default
-value of this is 248ms.
-
-fifo_expire_sync
-----------------
-
-This parameter is used to set the timeout of synchronous requests. Default
-value of this is 124ms. In case to favor synchronous requests over asynchronous
-one, this value should be decreased relative to fifo_expire_async.
-
-low_latency
------------
-
-This parameter is used to enable/disable BFQ's low latency mode. By
-default, low latency mode is enabled. If enabled, interactive and soft
-real-time applications are privileged and experience a lower latency,
-as explained in more detail in the description of how BFQ works.
-
-DISABLE this mode if you need full control on bandwidth
-distribution. In fact, if it is enabled, then BFQ automatically
-increases the bandwidth share of privileged applications, as the main
-means to guarantee a lower latency to them.
-
-In addition, as already highlighted at the beginning of this document,
-DISABLE this mode if your only goal is to achieve a high throughput.
-In fact, privileging the I/O of some application over the rest may
-entail a lower throughput. To achieve the highest-possible throughput
-on a non-rotational device, setting slice_idle to 0 may be needed too
-(at the cost of giving up any strong guarantee on fairness and low
-latency).
-
-timeout_sync
-------------
-
-Maximum amount of device time that can be given to a task (queue) once
-it has been selected for service. On devices with costly seeks,
-increasing this time usually increases maximum throughput. On the
-opposite end, increasing this time coarsens the granularity of the
-short-term bandwidth and latency guarantees, especially if the
-following parameter is set to zero.
-
-max_budget
-----------
-
-Maximum amount of service, measured in sectors, that can be provided
-to a BFQ queue once it is set in service (of course within the limits
-of the above timeout). According to what said in the description of
-the algorithm, larger values increase the throughput in proportion to
-the percentage of sequential I/O requests issued. The price of larger
-values is that they coarsen the granularity of short-term bandwidth
-and latency guarantees.
-
-The default value is 0, which enables auto-tuning: BFQ sets max_budget
-to the maximum number of sectors that can be served during
-timeout_sync, according to the estimated peak rate.
-
-weights
--------
-
-Read-only parameter, used to show the weights of the currently active
-BFQ queues.
-
-
-wr_ tunables
-------------
-
-BFQ exports a few parameters to control/tune the behavior of
-low-latency heuristics.
-
-wr_coeff
-
-Factor by which the weight of a weight-raised queue is multiplied. If
-the queue is deemed soft real-time, then the weight is further
-multiplied by an additional, constant factor.
-
-wr_max_time
-
-Maximum duration of a weight-raising period for an interactive task
-(ms). If set to zero (default value), then this value is computed
-automatically, as a function of the peak rate of the device. In any
-case, when the value of this parameter is read, it always reports the
-current duration, regardless of whether it has been set manually or
-computed automatically.
-
-wr_max_softrt_rate
-
-Maximum service rate below which a queue is deemed to be associated
-with a soft real-time application, and is then weight-raised
-accordingly (sectors/sec).
-
-wr_min_idle_time
-
-Minimum idle period after which interactive weight-raising may be
-reactivated for a queue (in ms).
-
-wr_rt_max_time
-
-Maximum weight-raising duration for soft real-time queues (in ms). The
-start time from which this duration is considered is automatically
-moved forward if the queue is detected to be still soft real-time
-before the current soft real-time weight-raising period finishes.
-
-wr_min_inter_arr_async
-
-Minimum period between I/O request arrivals after which weight-raising
-may be reactivated for an already busy async queue (in ms).
-
-
-4. Group scheduling with BFQ
-============================
-
-BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
-blkio and io. In particular, BFQ supports weight-based proportional
-share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
-
-4-1 Service guarantees provided
--------------------------------
-
-With BFQ, proportional share means true proportional share of the
-device bandwidth, according to group weights. For example, a group
-with weight 200 gets twice the bandwidth, and not just twice the time,
-of a group with weight 100.
-
-BFQ supports hierarchies (group trees) of any depth. Bandwidth is
-distributed among groups and processes in the expected way: for each
-group, the children of the group share the whole bandwidth of the
-group in proportion to their weights. In particular, this implies
-that, for each leaf group, every process of the group receives the
-same share of the whole group bandwidth, unless the ioprio of the
-process is modified.
-
-The resource-sharing guarantee for a group may partially or totally
-switch from bandwidth to time, if providing bandwidth guarantees to
-the group lowers the throughput too much. This switch occurs on a
-per-process basis: if a process of a leaf group causes throughput loss
-if served in such a way to receive its share of the bandwidth, then
-BFQ switches back to just time-based proportional share for that
-process.
-
-4-2 Interface
--------------
-
-To get proportional sharing of bandwidth with BFQ for a given device,
-BFQ must of course be the active scheduler for that device.
-
-Within each group directory, the names of the files associated with
-BFQ-specific cgroup parameters and stats begin with the "bfq."
-prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
-BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
-parameter to set the weight of a group with BFQ is blkio.bfq.weight
-or io.bfq.weight.
-
-Parameters to set
------------------
-
-For each group, there is only the following parameter to set.
-
-weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
-group inside its parent. Available values: 1..10000 (default 100). The
-linear mapping between ioprio and weights, described at the beginning
-of the tunable section, is still valid, but all weights higher than
-IOPRIO_BE_NR*10 are mapped to ioprio 0.
-
-Recall that, if low-latency is set, then BFQ automatically raises the
-weight of the queues associated with interactive and soft real-time
-applications. Unset this tunable if you need/want to control weights.
-
-
-[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
-    Scheduler", Proceedings of the First Workshop on Mobile System
-    Technologies (MST-2015), May 2015.
-    http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
-
-[2] P. Valente and M. Andreolini, "Improving Application
-    Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
-    the 5th Annual International Systems and Storage Conference
-    (SYSTOR '12), June 2012.
-    Slightly extended version:
-    http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
-							results.pdf
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
deleted file mode 100644
index af618171e0eb..000000000000
--- a/Documentation/cgroup-v1/rdma.txt
+++ /dev/null
@@ -1,109 +0,0 @@
-				RDMA Controller
-				----------------
-
-Contents
---------
-
-1. Overview
-  1-1. What is RDMA controller?
-  1-2. Why RDMA controller needed?
-  1-3. How is RDMA controller implemented?
-2. Usage Examples
-
-1. Overview
-
-1-1. What is RDMA controller?
------------------------------
-
-RDMA controller allows user to limit RDMA/IB specific resources that a given
-set of processes can use. These processes are grouped using RDMA controller.
-
-RDMA controller defines two resources which can be limited for processes of a
-cgroup.
-
-1-2. Why RDMA controller needed?
---------------------------------
-
-Currently user space applications can easily take away all the rdma verb
-specific resources such as AH, CQ, QP, MR etc. Due to which other applications
-in other cgroup or kernel space ULPs may not even get chance to allocate any
-rdma resources. This can leads to service unavailability.
-
-Therefore RDMA controller is needed through which resource consumption
-of processes can be limited. Through this controller different rdma
-resources can be accounted.
-
-1-3. How is RDMA controller implemented?
-----------------------------------------
-
-RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
-resource accounting per cgroup, per device using resource pool structure.
-Each such resource pool is limited up to 64 resources in given resource pool
-by rdma cgroup, which can be extended later if required.
-
-This resource pool object is linked to the cgroup css. Typically there
-are 0 to 4 resource pool instances per cgroup, per device in most use cases.
-But nothing limits to have it more. At present hundreds of RDMA devices per
-single cgroup may not be handled optimally, however there is no
-known use case or requirement for such configuration either.
-
-Since RDMA resources can be allocated from any process and can be freed by any
-of the child processes which shares the address space, rdma resources are
-always owned by the creator cgroup css. This allows process migration from one
-to other cgroup without major complexity of transferring resource ownership;
-because such ownership is not really present due to shared nature of
-rdma resources. Linking resources around css also ensures that cgroups can be
-deleted after processes migrated. This allow progress migration as well with
-active resources, even though that is not a primary use case.
-
-Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
-the caller. Same rdma cgroup should be passed while uncharging the resource.
-This also allows process migrated with active RDMA resource to charge
-to new owner cgroup for new resource. It also allows to uncharge resource of
-a process from previously charged cgroup which is migrated to new cgroup,
-even though that is not a primary use case.
-
-Resource pool object is created in following situations.
-(a) User sets the limit and no previous resource pool exist for the device
-of interest for the cgroup.
-(b) No resource limits were configured, but IB/RDMA stack tries to
-charge the resource. So that it correctly uncharge them when applications are
-running without limits and later on when limits are enforced during uncharging,
-otherwise usage count will drop to negative.
-
-Resource pool is destroyed if all the resource limits are set to max and
-it is the last resource getting deallocated.
-
-User should set all the limit to max value if it intents to remove/unconfigure
-the resource pool for a particular device.
-
-IB stack honors limits enforced by the rdma controller. When application
-query about maximum resource limits of IB device, it returns minimum of
-what is configured by user for a given cgroup and what is supported by
-IB device.
-
-Following resources can be accounted by rdma controller.
-  hca_handle	Maximum number of HCA Handles
-  hca_object 	Maximum number of HCA Objects
-
-2. Usage Examples
------------------
-
-(a) Configure resource limit:
-echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
-echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
-
-(b) Query resource limit:
-cat /sys/fs/cgroup/rdma/2/rdma.max
-#Output:
-mlx4_0 hca_handle=2 hca_object=2000
-ocrdma1 hca_handle=3 hca_object=max
-
-(c) Query current usage:
-cat /sys/fs/cgroup/rdma/2/rdma.current
-#Output:
-mlx4_0 hca_handle=1 hca_object=20
-ocrdma1 hca_handle=1 hca_object=23
-
-(d) Delete resource limit:
-echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index e4b6bf4de837..73950fdea31a 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -16,9 +16,7 @@ CONTENTS
   1-2. What is cgroup?
 2. Basic Operations
   2-1. Mounting
-  2-2. Organizing Processes and Threads
-    2-2-1. Processes
-    2-2-2. Threads
+  2-2. Organizing Processes
   2-3. [Un]populated Notification
   2-4. Controlling Controllers
     2-4-1. Enabling and Disabling
@@ -49,12 +47,6 @@ CONTENTS
   5-3. IO
     5-3-1. IO Interface Files
     5-3-2. Writeback
-  5-4. PID
-    5-4-1. PID Interface Files
-  5-5. Misc
-    5-5-1. perf_event
-  5-6. RDMA
-    5-6-1. RDMA Interface Files
 6. Namespace
   6-1. Basics
   6-2. The Root and Views
@@ -151,20 +143,8 @@ during boot, before manual intervention is possible. To make testing
 and experimenting easier, the kernel parameter cgroup_no_v1= allows
 disabling controllers in v1 and make them always available in v2.
 
-cgroup v2 currently supports the following mount options.
 
-  nsdelegate
-
-	Consider cgroup namespaces as delegation boundaries.  This
-	option is system wide and can only be set on mount or modified
-	through remount from the init namespace.  The mount option is
-	ignored on non-init namespace mounts.  Please refer to the
-	Delegation section for details.
-
-
-2-2. Organizing Processes and Threads
-
-2-2-1. Processes
+2-2. Organizing Processes
 
 Initially, only the root cgroup exists to which all processes belong.
 A child cgroup can be created by creating a sub-directory.
@@ -215,104 +195,6 @@ is removed subsequently, " (deleted)" is appended to the path.
   0::/test-cgroup/test-cgroup-nested (deleted)
 
 
-2-2-2. Threads
-
-cgroup v2 supports thread granularity for a subset of controllers to
-support use cases requiring hierarchical resource distribution across
-the threads of a group of processes.  By default, all threads of a
-process belong to the same cgroup, which also serves as the resource
-domain to host resource consumptions which are not specific to a
-process or thread.  The thread mode allows threads to be spread across
-a subtree while still maintaining the common resource domain for them.
-
-Controllers which support thread mode are called threaded controllers.
-The ones which don't are called domain controllers.
-
-Marking a cgroup threaded makes it join the resource domain of its
-parent as a threaded cgroup.  The parent may be another threaded
-cgroup whose resource domain is further up in the hierarchy.  The root
-of a threaded subtree, that is, the nearest ancestor which is not
-threaded, is called threaded domain or thread root interchangeably and
-serves as the resource domain for the entire subtree.
-
-Inside a threaded subtree, threads of a process can be put in
-different cgroups and are not subject to the no internal process
-constraint - threaded controllers can be enabled on non-leaf cgroups
-whether they have threads in them or not.
-
-As the threaded domain cgroup hosts all the domain resource
-consumptions of the subtree, it is considered to have internal
-resource consumptions whether there are processes in it or not and
-can't have populated child cgroups which aren't threaded.  Because the
-root cgroup is not subject to no internal process constraint, it can
-serve both as a threaded domain and a parent to domain cgroups.
-
-The current operation mode or type of the cgroup is shown in the
-"cgroup.type" file which indicates whether the cgroup is a normal
-domain, a domain which is serving as the domain of a threaded subtree,
-or a threaded cgroup.
-
-On creation, a cgroup is always a domain cgroup and can be made
-threaded by writing "threaded" to the "cgroup.type" file.  The
-operation is single direction::
-
-  # echo threaded > cgroup.type
-
-Once threaded, the cgroup can't be made a domain again.  To enable the
-thread mode, the following conditions must be met.
-
-- As the cgroup will join the parent's resource domain.  The parent
-  must either be a valid (threaded) domain or a threaded cgroup.
-
-- When the parent is an unthreaded domain, it must not have any domain
-  controllers enabled or populated domain children.  The root is
-  exempt from this requirement.
-
-Topology-wise, a cgroup can be in an invalid state.  Please consider
-the following toplogy::
-
-  A (threaded domain) - B (threaded) - C (domain, just created)
-
-C is created as a domain but isn't connected to a parent which can
-host child domains.  C can't be used until it is turned into a
-threaded cgroup.  "cgroup.type" file will report "domain (invalid)" in
-these cases.  Operations which fail due to invalid topology use
-EOPNOTSUPP as the errno.
-
-A domain cgroup is turned into a threaded domain when one of its child
-cgroup becomes threaded or threaded controllers are enabled in the
-"cgroup.subtree_control" file while there are processes in the cgroup.
-A threaded domain reverts to a normal domain when the conditions
-clear.
-
-When read, "cgroup.threads" contains the list of the thread IDs of all
-threads in the cgroup.  Except that the operations are per-thread
-instead of per-process, "cgroup.threads" has the same format and
-behaves the same way as "cgroup.procs".  While "cgroup.threads" can be
-written to in any cgroup, as it can only move threads inside the same
-threaded domain, its operations are confined inside each threaded
-subtree.
-
-The threaded domain cgroup serves as the resource domain for the whole
-subtree, and, while the threads can be scattered across the subtree,
-all the processes are considered to be in the threaded domain cgroup.
-"cgroup.procs" in a threaded domain cgroup contains the PIDs of all
-processes in the subtree and is not readable in the subtree proper.
-However, "cgroup.procs" can be written to from anywhere in the subtree
-to migrate all threads of the matching process to the cgroup.
-
-Only threaded controllers can be enabled in a threaded subtree.  When
-a threaded controller is enabled inside a threaded subtree, it only
-accounts for and controls resource consumptions associated with the
-threads in the cgroup and its descendants.  All consumptions which
-aren't tied to a specific thread belong to the threaded domain cgroup.
-
-Because a threaded subtree is exempt from no internal process
-constraint, a threaded controller must be able to handle competition
-between threads in a non-leaf cgroup and its child cgroups.  Each
-threaded controller defines how such competitions are handled.
-
-
 2-3. [Un]populated Notification
 
 Each non-root cgroup has a "cgroup.events" file which contains
@@ -391,15 +273,15 @@ disabled if one or more children have it enabled.
 
 2-4-3. No Internal Process Constraint
 
-Non-root cgroups can distribute domain resources to their children
-only when they don't have any processes of their own.  In other words,
-only domain cgroups which don't contain any processes can have domain
-controllers enabled in their "cgroup.subtree_control" files.
+Non-root cgroups can only distribute resources to their children when
+they don't have any processes of their own.  In other words, only
+cgroups which don't contain any processes can have controllers enabled
+in their "cgroup.subtree_control" files.
 
-This guarantees that, when a domain controller is looking at the part
-of the hierarchy which has it enabled, processes are always only on
-the leaves.  This rules out situations where child cgroups compete
-against internal processes of the parent.
+This guarantees that, when a controller is looking at the part of the
+hierarchy which has it enabled, processes are always only on the
+leaves.  This rules out situations where child cgroups compete against
+internal processes of the parent.
 
 The root cgroup is exempt from this restriction.  Root contains
 processes and anonymous resource consumption which can't be associated
@@ -420,27 +302,18 @@ file.
 
 2-5-1. Model of Delegation
 
-A cgroup can be delegated in two ways.  First, to a less privileged
-user by granting write access of the directory and its "cgroup.procs",
-"cgroup.threads" and "cgroup.subtree_control" files to the user.
-Second, if the "nsdelegate" mount option is set, automatically to a
-cgroup namespace on namespace creation.
-
-Because the resource control interface files in a given directory
-control the distribution of the parent's resources, the delegatee
-shouldn't be allowed to write to them.  For the first method, this is
-achieved by not granting access to these files.  For the second, the
-kernel rejects writes to all files other than "cgroup.procs" and
-"cgroup.subtree_control" on a namespace root from inside the
-namespace.
-
-The end results are equivalent for both delegation types.  Once
-delegated, the user can build sub-hierarchy under the directory,
-organize processes inside it as it sees fit and further distribute the
-resources it received from the parent.  The limits and other settings
-of all resource controllers are hierarchical and regardless of what
-happens in the delegated sub-hierarchy, nothing can escape the
-resource restrictions imposed by the parent.
+A cgroup can be delegated to a less privileged user by granting write
+access of the directory and its "cgroup.procs" file to the user.  Note
+that resource control interface files in a given directory control the
+distribution of the parent's resources and thus must not be delegated
+along with the directory.
+
+Once delegated, the user can build sub-hierarchy under the directory,
+organize processes as it sees fit and further distribute the resources
+it received from the parent.  The limits and other settings of all
+resource controllers are hierarchical and regardless of what happens
+in the delegated sub-hierarchy, nothing can escape the resource
+restrictions imposed by the parent.
 
 Currently, cgroup doesn't impose any restrictions on the number of
 cgroups in or nesting depth of a delegated sub-hierarchy; however,
@@ -450,19 +323,19 @@ this may be limited explicitly in the future.
 2-5-2. Delegation Containment
 
 A delegated sub-hierarchy is contained in the sense that processes
-can't be moved into or out of the sub-hierarchy by the delegatee.
+can't be moved into or out of the sub-hierarchy by the delegatee.  For
+a process with a non-root euid to migrate a target process into a
+cgroup by writing its PID to the "cgroup.procs" file, the following
+conditions must be met.
 
-For delegations to a less privileged user, this is achieved by
-requiring the following conditions for a process with a non-root euid
-to migrate a target process into a cgroup by writing its PID to the
-"cgroup.procs" file.
+- The writer's euid must match either uid or suid of the target process.
 
 - The writer must have write access to the "cgroup.procs" file.
 
 - The writer must have write access to the "cgroup.procs" file of the
   common ancestor of the source and destination cgroups.
 
-The above two constraints ensure that while a delegatee may migrate
+The above three constraints ensure that while a delegatee may migrate
 processes around freely in the delegated sub-hierarchy it can't pull
 in from or push out to outside the sub-hierarchy.
 
@@ -477,15 +350,10 @@ all processes under C0 and C1 belong to U0.
 
 Let's also say U0 wants to write the PID of a process which is
 currently in C10 into "C00/cgroup.procs".  U0 has write access to the
-file; however, the common ancestor of the source cgroup C10 and the
-destination cgroup C00 is above the points of delegation and U0 would
-not have write access to its "cgroup.procs" files and thus the write
-will be denied with -EACCES.
-
-For delegations to namespaces, containment is achieved by requiring
-that both the source and destination cgroups are reachable from the
-namespace of the process which is attempting the migration.  If either
-is not reachable, the migration is rejected with -ENOENT.
+file and uid match on the process; however, the common ancestor of the
+source cgroup C10 and the destination cgroup C00 is above the points
+of delegation and U0 would not have write access to its "cgroup.procs"
+files and thus the write will be denied with -EACCES.
 
 
 2-6. Guidelines
@@ -718,29 +586,6 @@ may be specified in any order and not all pairs have to be specified.
 
 All cgroup core files are prefixed with "cgroup."
 
-  cgroup.type
-
-	A read-write single value file which exists on non-root
-	cgroups.
-
-	When read, it indicates the current type of the cgroup, which
-	can be one of the following values.
-
-	- "domain" : A normal valid domain cgroup.
-
-	- "domain threaded" : A threaded domain cgroup which is
-          serving as the root of a threaded subtree.
-
-	- "domain invalid" : A cgroup which is in an invalid state.
-	  It can't be populated or have controllers enabled.  It may
-	  be allowed to become a threaded cgroup.
-
-	- "threaded" : A threaded cgroup which is a member of a
-          threaded subtree.
-
-	A cgroup can be turned into a threaded cgroup by writing
-	"threaded" to this file.
-
   cgroup.procs
 
 	A read-write new-line separated values file which exists on
@@ -756,36 +601,10 @@ All cgroup core files are prefixed with "cgroup."
 	the PID to the cgroup.  The writer should match all of the
 	following conditions.
 
-	- It must have write access to the "cgroup.procs" file.
+	- Its euid must either be root or match the uid or suid of
+          the target process.
 
-	- It must have write access to the "cgroup.procs" file of the
-	  common ancestor of the source and destination cgroups.
-
-	When delegating a sub-hierarchy, write access to this file
-	should be granted along with the containing directory.
-
-	In a threaded cgroup, reading this file fails with EOPNOTSUPP
-	as all the processes belong to the thread root.  Writing is
-	supported and moves every thread of the process to the cgroup.
-
-  cgroup.threads
-	A read-write new-line separated values file which exists on
-	all cgroups.
-
-	When read, it lists the TIDs of all threads which belong to
-	the cgroup one-per-line.  The TIDs are not ordered and the
-	same TID may show up more than once if the thread got moved to
-	another cgroup and then back or the TID got recycled while
-	reading.
-
-	A TID can be written to migrate the thread associated with the
-	TID to the cgroup.  The writer should match all of the
-	following conditions.
-
-	- It must have write access to the "cgroup.threads" file.
-
-	- The cgroup that the thread is currently in must be in the
-          same resource domain as the destination cgroup.
+	- It must have write access to the "cgroup.procs" file.
 
 	- It must have write access to the "cgroup.procs" file of the
 	  common ancestor of the source and destination cgroups.
@@ -829,38 +648,6 @@ All cgroup core files are prefixed with "cgroup."
 		1 if the cgroup or its descendants contains any live
 		processes; otherwise, 0.
 
-  cgroup.max.descendants
-	A read-write single value files.  The default is "max".
-
-	Maximum allowed number of descent cgroups.
-	If the actual number of descendants is equal or larger,
-	an attempt to create a new cgroup in the hierarchy will fail.
-
-  cgroup.max.depth
-	A read-write single value files.  The default is "max".
-
-	Maximum allowed descent depth below the current cgroup.
-	If the actual descent depth is equal or larger,
-	an attempt to create a new child cgroup will fail.
-
-  cgroup.stat
-	A read-only flat-keyed file with the following entries:
-
-	  nr_descendants
-		Total number of visible descendant cgroups.
-
-	  nr_dying_descendants
-		Total number of dying descendant cgroups. A cgroup becomes
-		dying after being deleted by a user. The cgroup will remain
-		in dying state for some time undefined time (which can depend
-		on system load) before being completely destroyed.
-
-		A process can't enter a dying cgroup under any circumstances,
-		a dying cgroup can't revive.
-
-		A dying cgroup can consume system resources not exceeding
-		limits, which were active at the moment of cgroup deletion.
-
 
 5. Controllers
 
@@ -1350,92 +1137,6 @@ writeback as follows.
 	vm.dirty[_background]_ratio.
 
 
-5-4. PID
-
-The process number controller is used to allow a cgroup to stop any
-new tasks from being fork()'d or clone()'d after a specified limit is
-reached.
-
-The number of tasks in a cgroup can be exhausted in ways which other
-controllers cannot prevent, thus warranting its own controller.  For
-example, a fork bomb is likely to exhaust the number of tasks before
-hitting memory restrictions.
-
-Note that PIDs used in this controller refer to TIDs, process IDs as
-used by the kernel.
-
-
-5-4-1. PID Interface Files
-
-  pids.max
-
-	A read-write single value file which exists on non-root
-	cgroups.  The default is "max".
-
-	Hard limit of number of processes.
-
-  pids.current
-
-	A read-only single value file which exists on all cgroups.
-
-	The number of processes currently in the cgroup and its
-	descendants.
-
-Organisational operations are not blocked by cgroup policies, so it is
-possible to have pids.current > pids.max.  This can be done by either
-setting the limit to be smaller than pids.current, or attaching enough
-processes to the cgroup such that pids.current is larger than
-pids.max.  However, it is not possible to violate a cgroup PID policy
-through fork() or clone(). These will return -EAGAIN if the creation
-of a new process would cause a cgroup policy to be violated.
-
-
-5-5. Misc
-
-5-5-1. perf_event
-
-perf_event controller, if not mounted on a legacy hierarchy, is
-automatically enabled on the v2 hierarchy so that perf events can
-always be filtered by cgroup v2 path.  The controller can still be
-moved to a legacy hierarchy after v2 hierarchy is populated.
-
-
-5-6. RDMA
-
-The "rdma" controller regulates the distribution and accounting of
-of RDMA resources.
-
-5-6-1. RDMA Interface Files
-
-  rdma.max
-	A readwrite nested-keyed file that exists for all the cgroups
-	except root that describes current configured resource limit
-	for a RDMA/IB device.
-
-	Lines are keyed by device name and are not ordered.
-	Each line contains space separated resource name and its configured
-	limit that can be distributed.
-
-	The following nested keys are defined.
-
-	  hca_handle	Maximum number of HCA Handles
-	  hca_object 	Maximum number of HCA Objects
-
-	An example for mlx4 and ocrdma device follows.
-
-	  mlx4_0 hca_handle=2 hca_object=2000
-	  ocrdma1 hca_handle=3 hca_object=max
-
-  rdma.current
-	A read-only file that describes current resource usage.
-	It exists for all the cgroup except root.
-
-	An example for mlx4 and ocrdma device follows.
-
-	  mlx4_0 hca_handle=1 hca_object=20
-	  ocrdma1 hca_handle=1 hca_object=23
-
-
 6. Namespace
 
 6-1. Basics
@@ -1623,7 +1324,7 @@ D. Deprecated v1 Core Features
 
 - Multiple hierarchies including named ones are not supported.
 
-- All v1 mount options are not supported.
+- Mount options and remounting are not supported.
 
 - The "tasks" file is removed and "cgroup.procs" is not sorted.
 
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9afba613a5c3..a66de7db0118 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -692,14 +692,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Specifying "pressure" disables per-cgroup pressure
 			stall information accounting feature
 
-	cgroup_no_v1=	[KNL] Disable cgroup controllers and named hierarchies in v1
-			Format: { { controller | "all" | "named" }
-			          [,{ controller | "all" | "named" }...] }
+	cgroup_no_v1=	[KNL] Disable one, multiple, or all cgroup controllers in v1
+			Format: { controller[,controller...] | "all" }
 			Like cgroup_disable, but only applies to cgroup v1;
 			the blacklisted controllers remain available in cgroup2.
-			"all" blacklists all controllers and "named" disables
-			named mounts. Specifying both "all" and "named" disables
-			all v1 hierarchies.
 
 	cgroup.memory=	[KNL] Pass options to the cgroup memory controller.
 			Format: <string>
diff --git a/Makefile b/Makefile
index 64d8b31c7b5d..a2946c935d83 100644
--- a/Makefile
+++ b/Makefile
@@ -87,10 +87,16 @@ endif
 
 # If the user is running make -s (silent mode), suppress echoing of
 # commands
+# make-4.0 (and later) keep single letter options in the 1st word of MAKEFLAGS.
 
-ifneq ($(findstring s,$(filter-out --%,$(MAKEFLAGS))),)
-  quiet=silent_
-  tools_silent=s
+ifeq ($(filter 3.%,$(MAKE_VERSION)),)
+silence:=$(findstring s,$(firstword -$(MAKEFLAGS)))
+else
+silence:=$(findstring s,$(filter-out --%,$(MAKEFLAGS)))
+endif
+
+ifeq ($(silence),s)
+quiet=silent_
 endif
 
 export quiet Q KBUILD_VERBOSE
diff --git a/README.md b/README.md
deleted file mode 100644
index 4a7cd21638cd..000000000000
--- a/README.md
+++ /dev/null
@@ -1 +0,0 @@
-已停更，随缘更新
\ No newline at end of file
diff --git a/arch/arm64/configs/lxc.config b/arch/arm64/configs/lxc.config
new file mode 100644
index 000000000000..7c2967ce7218
--- /dev/null
+++ b/arch/arm64/configs/lxc.config
@@ -0,0 +1,44 @@
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+
+CONFIG_NAMESPACES=y
+CONFIG_UTS_NS=y
+CONFIG_IPC_NS=y
+CONFIG_PID_NS=y
+CONFIG_USER_NS=y
+CONFIG_NET_NS=y
+
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_MEMCG=y
+CONFIG_CPUSETS=y
+
+CONFIG_VETH=y
+CONFIG_MACVLAN=y
+CONFIG_VLAN_8021Q=y
+CONFIG_BRIDGE=y
+CONFIG_NETFILTER_ADVANCED=y
+CONFIG_NF_NAT_IPV4=y
+CONFIG_NF_NAT_IPV6=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP6_NF_TARGET_MASQUERADE=y
+CONFIG_NETFILTER_XT_TARGET_CHECKSUM=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_FUSE_FS=y
+
+CONFIG_CHECKPOINT_RESTORE=y
+CONFIG_FHANDLE=y
+CONFIG_EVENTFD=y
+CONFIG_EPOLL=y
+CONFIG_UNIX_DIAG=y
+CONFIG_INET_DIAG=y
+CONFIG_PACKET_DIAG=y
+CONFIG_NETLINK_DIAG=y
+
+CONFIG_BINFMT_MISC=y
+
+CONFIG_ANDROID_PARANOID_NETWORK=n
\ No newline at end of file
diff --git a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig
index 3768440716ca..1ff7a9286950 100644
--- a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig
+++ b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig
@@ -71,7 +71,7 @@ CONFIG_PCI_MSM=y
 CONFIG_SCHED_MC=y
 CONFIG_NR_CPUS=8
 CONFIG_PREEMPT=y
-CONFIG_HZ_100=y
+CONFIG_HZ_300=y
 CONFIG_ANON_MIN_KBYTES=196608
 CONFIG_CLEAN_LOW_KBYTES=393216
 CONFIG_CLEAN_MIN_KBYTES=196608
@@ -218,6 +218,7 @@ CONFIG_IP6_NF_IPTABLES_128=y
 CONFIG_IP6_NF_MATCH_RPFILTER=y
 CONFIG_IP6_NF_TARGET_HL=y
 CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_NAT=y
 CONFIG_IP6_NF_TARGET_REJECT=y
 CONFIG_IP6_NF_MANGLE=y
 CONFIG_IP6_NF_RAW=y
@@ -634,5 +635,4 @@ CONFIG_SND_SOC_WCD_MBHC_ADC=y
 CONFIG_SND_SOC_WCD_SPI=y
 CONFIG_SOUNDWIRE=y
 CONFIG_WCD_SPI_AC=y
-CONFIG_REKERNEL=y
-CONFIG_REKERNEL_NETWORK=y
+CONFIG_KSU=y
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 75ee7ba34ebb..421bef9c4c48 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -39,28 +39,9 @@ config CFQ_GROUP_IOSCHED
 	---help---
 	  Enable group IO scheduling in CFQ.
 
-config IOSCHED_BFQ
-	tristate "BFQ I/O scheduler"
-	default y
-	---help---
-	The BFQ I/O scheduler distributes bandwidth among all
-	processes according to their weights, regardless of the
-	device parameters and with any workload. It also guarantees
-	a low latency to interactive and soft real-time applications.
-	Details in Documentation/block/bfq-iosched.txt
-
-config BFQ_GROUP_IOSCHED
-	bool "BFQ hierarchical scheduling support"
-	depends on IOSCHED_BFQ && BLK_CGROUP
-	default n
-	---help---
-
-	Enable hierarchical scheduling in BFQ, using the blkio
-	(cgroups-v1) or io (cgroups-v2) controller.
-
 choice
 	prompt "Default I/O scheduler"
-	default DEFAULT_BFQ
+	default DEFAULT_CFQ
 	help
 	  Select the I/O scheduler which will be used by default for all
 	  block devices.
@@ -74,16 +55,6 @@ choice
 	config DEFAULT_NOOP
 		bool "No-op"
 
-	config DEFAULT_BFQ
-		bool "BFQ" if IOSCHED_BFQ=y
-		help
-		  Selects BFQ as the default I/O scheduler which will be
-		  used by default for all block devices.
-		  The BFQ I/O scheduler aims at distributing the bandwidth
-		  as desired, independently of the disk parameters and with
-		  any workload. It also tries to guarantee low latency to
-		  interactive and soft real-time applications.
-
 endchoice
 
 config DEFAULT_IOSCHED
@@ -91,7 +62,6 @@ config DEFAULT_IOSCHED
 	default "deadline" if DEFAULT_DEADLINE
 	default "cfq" if DEFAULT_CFQ
 	default "noop" if DEFAULT_NOOP
-	default "bfq" if DEFAULT_BFQ
 
 endmenu
 
diff --git a/block/Makefile b/block/Makefile
index 736e91a2ca1c..36acdd7545be 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
-obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
deleted file mode 100644
index 52484f10bb6f..000000000000
--- a/block/bfq-cgroup.c
+++ /dev/null
@@ -1,1191 +0,0 @@
-/*
- * BFQ: CGROUPS support.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
- *
- * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
- *		      Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
- *
- * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
- * file.
- */
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-
-/* bfqg stats flags */
-enum bfqg_stats_flags {
-	BFQG_stats_waiting = 0,
-	BFQG_stats_idling,
-	BFQG_stats_empty,
-};
-
-#define BFQG_FLAG_FNS(name)						\
-static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\
-{									\
-	stats->flags |= (1 << BFQG_stats_##name);			\
-}									\
-static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\
-{									\
-	stats->flags &= ~(1 << BFQG_stats_##name);			\
-}									\
-static int bfqg_stats_##name(struct bfqg_stats *stats)		\
-{									\
-	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
-}									\
-
-BFQG_FLAG_FNS(waiting)
-BFQG_FLAG_FNS(idling)
-BFQG_FLAG_FNS(empty)
-#undef BFQG_FLAG_FNS
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
-{
-	unsigned long long now;
-
-	if (!bfqg_stats_waiting(stats))
-		return;
-
-	now = sched_clock();
-	if (time_after64(now, stats->start_group_wait_time))
-		blkg_stat_add(&stats->group_wait_time,
-			      now - stats->start_group_wait_time);
-	bfqg_stats_clear_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
-						 struct bfq_group *curr_bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	if (bfqg_stats_waiting(stats))
-		return;
-	if (bfqg == curr_bfqg)
-		return;
-	stats->start_group_wait_time = sched_clock();
-	bfqg_stats_mark_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
-{
-	unsigned long long now;
-
-	if (!bfqg_stats_empty(stats))
-		return;
-
-	now = sched_clock();
-	if (time_after64(now, stats->start_empty_time))
-		blkg_stat_add(&stats->empty_time,
-			      now - stats->start_empty_time);
-	bfqg_stats_clear_empty(stats);
-}
-
-static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
-{
-	blkg_stat_add(&bfqg->stats.dequeue, 1);
-}
-
-static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	if (blkg_rwstat_total(&stats->queued))
-		return;
-
-	/*
-	 * group is already marked empty. This can happen if bfqq got new
-	 * request in parent group and moved to this group while being added
-	 * to service tree. Just ignore the event and move on.
-	 */
-	if (bfqg_stats_empty(stats))
-		return;
-
-	stats->start_empty_time = sched_clock();
-	bfqg_stats_mark_empty(stats);
-}
-
-static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	if (bfqg_stats_idling(stats)) {
-		unsigned long long now = sched_clock();
-
-		if (time_after64(now, stats->start_idle_time))
-			blkg_stat_add(&stats->idle_time,
-				      now - stats->start_idle_time);
-		bfqg_stats_clear_idling(stats);
-	}
-}
-
-static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	stats->start_idle_time = sched_clock();
-	bfqg_stats_mark_idling(stats);
-}
-
-static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	blkg_stat_add(&stats->avg_queue_size_sum,
-		      blkg_rwstat_total(&stats->queued));
-	blkg_stat_add(&stats->avg_queue_size_samples, 1);
-	bfqg_stats_update_group_wait_time(stats);
-}
-
-static struct blkcg_policy blkcg_policy_bfq;
-
-/*
- * blk-cgroup policy-related handlers
- * The following functions help in converting between blk-cgroup
- * internal structures and BFQ-specific structures.
- */
-
-static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
-{
-	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
-}
-
-static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
-{
-	return pd_to_blkg(&bfqg->pd);
-}
-
-static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
-{
-	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
-
-	return pd_to_bfqg(pd);
-}
-
-/*
- * bfq_group handlers
- * The following functions help in navigating the bfq_group hierarchy
- * by allowing to find the parent of a bfq_group or the bfq_group
- * associated to a bfq_queue.
- */
-
-static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
-{
-	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
-
-	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
-}
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
-{
-	struct bfq_entity *group_entity = bfqq->entity.parent;
-
-	return group_entity ? container_of(group_entity, struct bfq_group,
-					   entity) :
-			      bfqq->bfqd->root_group;
-}
-
-/*
- * The following two functions handle get and put of a bfq_group by
- * wrapping the related blk-cgroup hooks.
- */
-
-static void bfqg_get(struct bfq_group *bfqg)
-{
-	return blkg_get(bfqg_to_blkg(bfqg));
-}
-
-static void bfqg_put(struct bfq_group *bfqg)
-{
-	return blkg_put(bfqg_to_blkg(bfqg));
-}
-
-static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
-				     struct bfq_queue *bfqq,
-				     int op, int op_flags)
-{
-	blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, 1);
-	bfqg_stats_end_empty_time(&bfqg->stats);
-	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
-		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
-}
-
-static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op,
-					int op_flags)
-{
-	blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, -1);
-}
-
-static void bfqg_stats_update_io_merged(struct bfq_group *bfqg,  int op,
-					int op_flags)
-{
-	blkg_rwstat_add(&bfqg->stats.merged, op, op_flags, 1);
-}
-
-static void bfqg_stats_update_completion(struct bfq_group *bfqg,
-			uint64_t start_time, uint64_t io_start_time, int op,
-			int op_flags)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-	unsigned long long now = sched_clock();
-
-	if (time_after64(now, io_start_time))
-		blkg_rwstat_add(&stats->service_time, op, op_flags,
-				now - io_start_time);
-	if (time_after64(io_start_time, start_time))
-		blkg_rwstat_add(&stats->wait_time, op, op_flags,
-				io_start_time - start_time);
-}
-
-/* @stats = 0 */
-static void bfqg_stats_reset(struct bfqg_stats *stats)
-{
-	/* queued stats shouldn't be cleared */
-	blkg_rwstat_reset(&stats->merged);
-	blkg_rwstat_reset(&stats->service_time);
-	blkg_rwstat_reset(&stats->wait_time);
-	blkg_stat_reset(&stats->time);
-	blkg_stat_reset(&stats->avg_queue_size_sum);
-	blkg_stat_reset(&stats->avg_queue_size_samples);
-	blkg_stat_reset(&stats->dequeue);
-	blkg_stat_reset(&stats->group_wait_time);
-	blkg_stat_reset(&stats->idle_time);
-	blkg_stat_reset(&stats->empty_time);
-}
-
-/* @to += @from */
-static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
-{
-	if (!to || !from)
-		return;
-
-	/* queued stats shouldn't be cleared */
-	blkg_rwstat_add_aux(&to->merged, &from->merged);
-	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
-	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
-	blkg_stat_add_aux(&from->time, &from->time);
-	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
-	blkg_stat_add_aux(&to->avg_queue_size_samples,
-			  &from->avg_queue_size_samples);
-	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
-	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
-	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
-	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
-}
-
-/*
- * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
- * recursive stats can still account for the amount used by this bfqg after
- * it's gone.
- */
-static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
-{
-	struct bfq_group *parent;
-
-	if (!bfqg) /* root_group */
-		return;
-
-	parent = bfqg_parent(bfqg);
-
-	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
-
-	if (unlikely(!parent))
-		return;
-
-	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
-	bfqg_stats_reset(&bfqg->stats);
-}
-
-static void bfq_init_entity(struct bfq_entity *entity,
-			    struct bfq_group *bfqg)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	entity->weight = entity->new_weight;
-	entity->orig_weight = entity->new_weight;
-	if (bfqq) {
-		bfqq->ioprio = bfqq->new_ioprio;
-		bfqq->ioprio_class = bfqq->new_ioprio_class;
-		bfqg_get(bfqg);
-	}
-	entity->parent = bfqg->my_entity; /* NULL for root group */
-	entity->sched_data = &bfqg->sched_data;
-}
-
-static void bfqg_stats_exit(struct bfqg_stats *stats)
-{
-	blkg_rwstat_exit(&stats->merged);
-	blkg_rwstat_exit(&stats->service_time);
-	blkg_rwstat_exit(&stats->wait_time);
-	blkg_rwstat_exit(&stats->queued);
-	blkg_stat_exit(&stats->time);
-	blkg_stat_exit(&stats->avg_queue_size_sum);
-	blkg_stat_exit(&stats->avg_queue_size_samples);
-	blkg_stat_exit(&stats->dequeue);
-	blkg_stat_exit(&stats->group_wait_time);
-	blkg_stat_exit(&stats->idle_time);
-	blkg_stat_exit(&stats->empty_time);
-}
-
-static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
-{
-	if (blkg_rwstat_init(&stats->merged, gfp) ||
-	    blkg_rwstat_init(&stats->service_time, gfp) ||
-	    blkg_rwstat_init(&stats->wait_time, gfp) ||
-	    blkg_rwstat_init(&stats->queued, gfp) ||
-	    blkg_stat_init(&stats->time, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
-	    blkg_stat_init(&stats->dequeue, gfp) ||
-	    blkg_stat_init(&stats->group_wait_time, gfp) ||
-	    blkg_stat_init(&stats->idle_time, gfp) ||
-	    blkg_stat_init(&stats->empty_time, gfp)) {
-		bfqg_stats_exit(stats);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
-{
-	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
-}
-
-static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
-{
-	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
-}
-
-static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
-{
-	struct bfq_group_data *bgd;
-
-	bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
-	if (!bgd)
-		return NULL;
-	return &bgd->pd;
-}
-
-static void bfq_cpd_init(struct blkcg_policy_data *cpd)
-{
-	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
-
-	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
-		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
-}
-
-static void bfq_cpd_free(struct blkcg_policy_data *cpd)
-{
-	kfree(cpd_to_bfqgd(cpd));
-}
-
-static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
-{
-	struct bfq_group *bfqg;
-
-	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
-	if (!bfqg)
-		return NULL;
-
-	if (bfqg_stats_init(&bfqg->stats, gfp)) {
-		kfree(bfqg);
-		return NULL;
-	}
-
-	return &bfqg->pd;
-}
-
-static void bfq_pd_init(struct blkg_policy_data *pd)
-{
-	struct blkcg_gq *blkg;
-	struct bfq_group *bfqg;
-	struct bfq_data *bfqd;
-	struct bfq_entity *entity;
-	struct bfq_group_data *d;
-
-	blkg = pd_to_blkg(pd);
-	BUG_ON(!blkg);
-	bfqg = blkg_to_bfqg(blkg);
-	bfqd = blkg->q->elevator->elevator_data;
-	entity = &bfqg->entity;
-	d = blkcg_to_bfqgd(blkg->blkcg);
-
-	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
-	entity->my_sched_data = &bfqg->sched_data;
-	bfqg->my_entity = entity; /*
-				   * the root_group's will be set to NULL
-				   * in bfq_init_queue()
-				   */
-	bfqg->bfqd = bfqd;
-	bfqg->active_entities = 0;
-	bfqg->rq_pos_tree = RB_ROOT;
-}
-
-static void bfq_pd_free(struct blkg_policy_data *pd)
-{
-	struct bfq_group *bfqg = pd_to_bfqg(pd);
-
-	bfqg_stats_exit(&bfqg->stats);
-	return kfree(bfqg);
-}
-
-static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
-{
-	struct bfq_group *bfqg = pd_to_bfqg(pd);
-
-	bfqg_stats_reset(&bfqg->stats);
-}
-
-static void bfq_group_set_parent(struct bfq_group *bfqg,
-					struct bfq_group *parent)
-{
-	struct bfq_entity *entity;
-
-	BUG_ON(!parent);
-	BUG_ON(!bfqg);
-	BUG_ON(bfqg == parent);
-
-	entity = &bfqg->entity;
-	entity->parent = parent->my_entity;
-	entity->sched_data = &parent->sched_data;
-}
-
-static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
-					 struct blkcg *blkcg)
-{
-	struct blkcg_gq *blkg;
-
-	blkg = blkg_lookup(blkcg, bfqd->queue);
-	if (likely(blkg))
-		return blkg_to_bfqg(blkg);
-	return NULL;
-}
-
-static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
-					    struct blkcg *blkcg)
-{
-	struct bfq_group *bfqg, *parent;
-	struct bfq_entity *entity;
-
-	assert_spin_locked(bfqd->queue->queue_lock);
-
-	bfqg = bfq_lookup_bfqg(bfqd, blkcg);
-
-	if (unlikely(!bfqg))
-		return NULL;
-
-	/*
-	 * Update chain of bfq_groups as we might be handling a leaf group
-	 * which, along with some of its relatives, has not been hooked yet
-	 * to the private hierarchy of BFQ.
-	 */
-	entity = &bfqg->entity;
-	for_each_entity(entity) {
-		bfqg = container_of(entity, struct bfq_group, entity);
-		BUG_ON(!bfqg);
-		if (bfqg != bfqd->root_group) {
-			parent = bfqg_parent(bfqg);
-			if (!parent)
-				parent = bfqd->root_group;
-			BUG_ON(!parent);
-			bfq_group_set_parent(bfqg, parent);
-		}
-	}
-
-	return bfqg;
-}
-
-static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
-				  struct bfq_queue *bfqq);
-
-static void bfq_bfqq_expire(struct bfq_data *bfqd,
-			    struct bfq_queue *bfqq,
-			    bool compensate,
-			    enum bfqq_expiration reason);
-
-/**
- * bfq_bfqq_move - migrate @bfqq to @bfqg.
- * @bfqd: queue descriptor.
- * @bfqq: the queue to move.
- * @bfqg: the group to move to.
- *
- * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
- * it on the new one.  Avoid putting the entity on the old group idle tree.
- *
- * Must be called under the queue lock; the cgroup owning @bfqg must
- * not disappear (by now this just means that we are called under
- * rcu_read_lock()).
- */
-static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct bfq_group *bfqg)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list));
-	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st);
-	BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)
-	       && entity->on_st &&
-	       bfqq != bfqd->in_service_queue);
-	BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue);
-
-	/* If bfqq is empty, then bfq_bfqq_expire also invokes
-	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
-	 * from data structures related to current group. Otherwise we
-	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
-	 * we do below.
-	 */
-	if (bfqq == bfqd->in_service_queue)
-		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
-				false, BFQ_BFQQ_PREEMPTED);
-
-	BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
-	    && &bfq_entity_service_tree(entity)->idle !=
-	       entity->tree);
-
-	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
-
-	if (bfq_bfqq_busy(bfqq))
-		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
-	else if (entity->on_st) {
-		BUG_ON(&bfq_entity_service_tree(entity)->idle !=
-		       entity->tree);
-		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
-	}
-	bfqg_put(bfqq_group(bfqq));
-
-	/*
-	 * Here we use a reference to bfqg.  We don't need a refcounter
-	 * as the cgroup reference will not be dropped, so that its
-	 * destroy() callback will not be invoked.
-	 */
-	entity->parent = bfqg->my_entity;
-	entity->sched_data = &bfqg->sched_data;
-	bfqg_get(bfqg);
-
-	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
-	if (bfq_bfqq_busy(bfqq)) {
-		bfq_pos_tree_add_move(bfqd, bfqq);
-		bfq_activate_bfqq(bfqd, bfqq);
-	}
-
-	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
-		bfq_schedule_dispatch(bfqd);
-	BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
-	       && &bfq_entity_service_tree(entity)->idle !=
-	       entity->tree);
-}
-
-/**
- * __bfq_bic_change_cgroup - move @bic to @cgroup.
- * @bfqd: the queue descriptor.
- * @bic: the bic to move.
- * @blkcg: the blk-cgroup to move to.
- *
- * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
- * has to make sure that the reference to cgroup is valid across the call.
- *
- * NOTE: an alternative approach might have been to store the current
- * cgroup in bfqq and getting a reference to it, reducing the lookup
- * time here, at the price of slightly more complex code.
- */
-static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
-						struct bfq_io_cq *bic,
-						struct blkcg *blkcg)
-{
-	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
-	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
-	struct bfq_group *bfqg;
-	struct bfq_entity *entity;
-
-	lockdep_assert_held(bfqd->queue->queue_lock);
-
-	bfqg = bfq_find_set_group(bfqd, blkcg);
-
-	if (unlikely(!bfqg))
-		bfqg = bfqd->root_group;
-
-	if (async_bfqq) {
-		entity = &async_bfqq->entity;
-
-		if (entity->sched_data != &bfqg->sched_data) {
-			bic_set_bfqq(bic, NULL, 0);
-			bfq_log_bfqq(bfqd, async_bfqq,
-				     "bic_change_group: %p %d",
-				     async_bfqq,
-				     async_bfqq->ref);
-			bfq_put_queue(async_bfqq);
-		}
-	}
-
-	if (sync_bfqq) {
-		entity = &sync_bfqq->entity;
-		if (entity->sched_data != &bfqg->sched_data)
-			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
-	}
-
-	return bfqg;
-}
-
-static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
-{
-	struct bfq_data *bfqd = bic_to_bfqd(bic);
-	struct bfq_group *bfqg = NULL;
-	uint64_t serial_nr;
-
-	rcu_read_lock();
-	serial_nr = bio_blkcg(bio)->css.serial_nr;
-
-	/*
-	 * Check whether blkcg has changed.  The condition may trigger
-	 * spuriously on a newly created cic but there's no harm.
-	 */
-	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
-		goto out;
-
-	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
-	bic->blkcg_serial_nr = serial_nr;
-out:
-	rcu_read_unlock();
-}
-
-/**
- * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
- * @st: the service tree being flushed.
- */
-static void bfq_flush_idle_tree(struct bfq_service_tree *st)
-{
-	struct bfq_entity *entity = st->first_idle;
-
-	for (; entity ; entity = st->first_idle)
-		__bfq_deactivate_entity(entity, false);
-}
-
-/**
- * bfq_reparent_leaf_entity - move leaf entity to the root_group.
- * @bfqd: the device data structure with the root group.
- * @entity: the entity to move.
- */
-static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
-				     struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	BUG_ON(!bfqq);
-	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
-}
-
-/**
- * bfq_reparent_active_entities - move to the root group all active
- *                                entities.
- * @bfqd: the device data structure with the root group.
- * @bfqg: the group to move from.
- * @st: the service tree with the entities.
- *
- * Needs queue_lock to be taken and reference to be valid over the call.
- */
-static void bfq_reparent_active_entities(struct bfq_data *bfqd,
-					 struct bfq_group *bfqg,
-					 struct bfq_service_tree *st)
-{
-	struct rb_root *active = &st->active;
-	struct bfq_entity *entity = NULL;
-
-	if (!RB_EMPTY_ROOT(&st->active))
-		entity = bfq_entity_of(rb_first(active));
-
-	for (; entity ; entity = bfq_entity_of(rb_first(active)))
-		bfq_reparent_leaf_entity(bfqd, entity);
-
-	if (bfqg->sched_data.in_service_entity)
-		bfq_reparent_leaf_entity(bfqd,
-			bfqg->sched_data.in_service_entity);
-}
-
-/**
- * bfq_pd_offline - deactivate the entity associated with @pd,
- *		    and reparent its children entities.
- * @pd: descriptor of the policy going offline.
- *
- * blkio already grabs the queue_lock for us, so no need to use
- * RCU-based magic
- */
-static void bfq_pd_offline(struct blkg_policy_data *pd)
-{
-	struct bfq_service_tree *st;
-	struct bfq_group *bfqg;
-	struct bfq_data *bfqd;
-	struct bfq_entity *entity;
-	int i;
-
-	BUG_ON(!pd);
-	bfqg = pd_to_bfqg(pd);
-	BUG_ON(!bfqg);
-	bfqd = bfqg->bfqd;
-	BUG_ON(bfqd && !bfqd->root_group);
-
-	entity = bfqg->my_entity;
-
-	if (!entity) /* root group */
-		return;
-
-	/*
-	 * Empty all service_trees belonging to this group before
-	 * deactivating the group itself.
-	 */
-	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
-		st = bfqg->sched_data.service_tree + i;
-		/*
-		 * The idle tree may still contain bfq_queues belonging
-		 * to exited task because they never migrated to a different
-		 * cgroup from the one being destroyed now.  No one else
-		 * can access them so it's safe to act without any lock.
-		 */
-		bfq_flush_idle_tree(st);
-
-		/*
-		 * It may happen that some queues are still active
-		 * (busy) upon group destruction (if the corresponding
-		 * processes have been forced to terminate). We move
-		 * all the leaf entities corresponding to these queues
-		 * to the root_group.
-		 * Also, it may happen that the group has an entity
-		 * in service, which is disconnected from the active
-		 * tree: it must be moved, too.
-		 * There is no need to put the sync queues, as the
-		 * scheduler has taken no reference.
-		 */
-		bfq_reparent_active_entities(bfqd, bfqg, st);
-		BUG_ON(!RB_EMPTY_ROOT(&st->active));
-		BUG_ON(!RB_EMPTY_ROOT(&st->idle));
-	}
-	BUG_ON(bfqg->sched_data.next_in_service);
-	BUG_ON(bfqg->sched_data.in_service_entity);
-
-	__bfq_deactivate_entity(entity, false);
-	bfq_put_async_queues(bfqd, bfqg);
-
-	/*
-	 * @blkg is going offline and will be ignored by
-	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
-	 * that they don't get lost.  If IOs complete after this point, the
-	 * stats for them will be lost.  Oh well...
-	 */
-	bfqg_stats_xfer_dead(bfqg);
-}
-
-static void bfq_end_wr_async(struct bfq_data *bfqd)
-{
-	struct blkcg_gq *blkg;
-
-	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
-		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
-		BUG_ON(!bfqg);
-
-		bfq_end_wr_async_queues(bfqd, bfqg);
-	}
-	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
-}
-
-static int bfq_io_show_weight(struct seq_file *sf, void *v)
-{
-	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
-	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
-	unsigned int val = 0;
-
-	if (bfqgd)
-		val = bfqgd->weight;
-
-	seq_printf(sf, "%u\n", val);
-
-	return 0;
-}
-
-static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
-				    struct cftype *cftype,
-				    u64 val)
-{
-	struct blkcg *blkcg = css_to_blkcg(css);
-	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
-	struct blkcg_gq *blkg;
-	int ret = -ERANGE;
-
-	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
-		return ret;
-
-	ret = 0;
-	spin_lock_irq(&blkcg->lock);
-	bfqgd->weight = (unsigned short)val;
-	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
-		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
-
-		if (!bfqg)
-			continue;
-		/*
-		 * Setting the prio_changed flag of the entity
-		 * to 1 with new_weight == weight would re-set
-		 * the value of the weight to its ioprio mapping.
-		 * Set the flag only if necessary.
-		 */
-		if ((unsigned short)val != bfqg->entity.new_weight) {
-			bfqg->entity.new_weight = (unsigned short)val;
-			/*
-			 * Make sure that the above new value has been
-			 * stored in bfqg->entity.new_weight before
-			 * setting the prio_changed flag. In fact,
-			 * this flag may be read asynchronously (in
-			 * critical sections protected by a different
-			 * lock than that held here), and finding this
-			 * flag set may cause the execution of the code
-			 * for updating parameters whose value may
-			 * depend also on bfqg->entity.new_weight (in
-			 * __bfq_entity_update_weight_prio).
-			 * This barrier makes sure that the new value
-			 * of bfqg->entity.new_weight is correctly
-			 * seen in that code.
-			 */
-			smp_wmb();
-			bfqg->entity.prio_changed = 1;
-		}
-	}
-	spin_unlock_irq(&blkcg->lock);
-
-	return ret;
-}
-
-static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
-				 char *buf, size_t nbytes,
-				 loff_t off)
-{
-	u64 weight;
-	/* First unsigned long found in the file is used */
-	int ret = kstrtoull(strim(buf), 0, &weight);
-
-	if (ret)
-		return ret;
-
-	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
-}
-
-static int bfqg_print_stat(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
-			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
-	return 0;
-}
-
-static int bfqg_print_rwstat(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
-			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
-	return 0;
-}
-
-static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
-				      struct blkg_policy_data *pd, int off)
-{
-	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
-					  &blkcg_policy_bfq, off);
-	return __blkg_prfill_u64(sf, pd, sum);
-}
-
-static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
-					struct blkg_policy_data *pd, int off)
-{
-	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
-							   &blkcg_policy_bfq,
-							   off);
-	return __blkg_prfill_rwstat(sf, pd, &sum);
-}
-
-static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
-			  seq_cft(sf)->private, false);
-	return 0;
-}
-
-static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
-			  seq_cft(sf)->private, true);
-	return 0;
-}
-
-static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
-			       int off)
-{
-	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
-
-	return __blkg_prfill_u64(sf, pd, sum >> 9);
-}
-
-static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
-	return 0;
-}
-
-static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
-					 struct blkg_policy_data *pd, int off)
-{
-	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
-					offsetof(struct blkcg_gq, stat_bytes));
-	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
-		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
-
-	return __blkg_prfill_u64(sf, pd, sum >> 9);
-}
-
-static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
-			  false);
-	return 0;
-}
-
-
-static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
-				      struct blkg_policy_data *pd, int off)
-{
-	struct bfq_group *bfqg = pd_to_bfqg(pd);
-	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
-	u64 v = 0;
-
-	if (samples) {
-		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
-		v = div64_u64(v, samples);
-	}
-	__blkg_prfill_u64(sf, pd, v);
-	return 0;
-}
-
-/* print avg_queue_size */
-static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
-			  0, false);
-	return 0;
-}
-
-static struct bfq_group *
-bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
-{
-	int ret;
-
-	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
-	if (ret)
-		return NULL;
-
-	return blkg_to_bfqg(bfqd->queue->root_blkg);
-}
-
-static struct cftype bfq_blkcg_legacy_files[] = {
-	{
-		.name = "bfq.weight",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = bfq_io_show_weight,
-		.write_u64 = bfq_io_set_weight_legacy,
-	},
-
-	/* statistics, covers only the tasks in the bfqg */
-	{
-		.name = "bfq.time",
-		.private = offsetof(struct bfq_group, stats.time),
-		.seq_show = bfqg_print_stat,
-	},
-	{
-		.name = "bfq.sectors",
-		.seq_show = bfqg_print_stat_sectors,
-	},
-	{
-		.name = "bfq.io_service_bytes",
-		.private = (unsigned long)&blkcg_policy_bfq,
-		.seq_show = blkg_print_stat_bytes,
-	},
-	{
-		.name = "bfq.io_serviced",
-		.private = (unsigned long)&blkcg_policy_bfq,
-		.seq_show = blkg_print_stat_ios,
-	},
-	{
-		.name = "bfq.io_service_time",
-		.private = offsetof(struct bfq_group, stats.service_time),
-		.seq_show = bfqg_print_rwstat,
-	},
-	{
-		.name = "bfq.io_wait_time",
-		.private = offsetof(struct bfq_group, stats.wait_time),
-		.seq_show = bfqg_print_rwstat,
-	},
-	{
-		.name = "bfq.io_merged",
-		.private = offsetof(struct bfq_group, stats.merged),
-		.seq_show = bfqg_print_rwstat,
-	},
-	{
-		.name = "bfq.io_queued",
-		.private = offsetof(struct bfq_group, stats.queued),
-		.seq_show = bfqg_print_rwstat,
-	},
-
-	/* the same statictics which cover the bfqg and its descendants */
-	{
-		.name = "bfq.time_recursive",
-		.private = offsetof(struct bfq_group, stats.time),
-		.seq_show = bfqg_print_stat_recursive,
-	},
-	{
-		.name = "bfq.sectors_recursive",
-		.seq_show = bfqg_print_stat_sectors_recursive,
-	},
-	{
-		.name = "bfq.io_service_bytes_recursive",
-		.private = (unsigned long)&blkcg_policy_bfq,
-		.seq_show = blkg_print_stat_bytes_recursive,
-	},
-	{
-		.name = "bfq.io_serviced_recursive",
-		.private = (unsigned long)&blkcg_policy_bfq,
-		.seq_show = blkg_print_stat_ios_recursive,
-	},
-	{
-		.name = "bfq.io_service_time_recursive",
-		.private = offsetof(struct bfq_group, stats.service_time),
-		.seq_show = bfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "bfq.io_wait_time_recursive",
-		.private = offsetof(struct bfq_group, stats.wait_time),
-		.seq_show = bfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "bfq.io_merged_recursive",
-		.private = offsetof(struct bfq_group, stats.merged),
-		.seq_show = bfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "bfq.io_queued_recursive",
-		.private = offsetof(struct bfq_group, stats.queued),
-		.seq_show = bfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "bfq.avg_queue_size",
-		.seq_show = bfqg_print_avg_queue_size,
-	},
-	{
-		.name = "bfq.group_wait_time",
-		.private = offsetof(struct bfq_group, stats.group_wait_time),
-		.seq_show = bfqg_print_stat,
-	},
-	{
-		.name = "bfq.idle_time",
-		.private = offsetof(struct bfq_group, stats.idle_time),
-		.seq_show = bfqg_print_stat,
-	},
-	{
-		.name = "bfq.empty_time",
-		.private = offsetof(struct bfq_group, stats.empty_time),
-		.seq_show = bfqg_print_stat,
-	},
-	{
-		.name = "bfq.dequeue",
-		.private = offsetof(struct bfq_group, stats.dequeue),
-		.seq_show = bfqg_print_stat,
-	},
-	{ }	/* terminate */
-};
-
-static struct cftype bfq_blkg_files[] = {
-	{
-		.name = "bfq.weight",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = bfq_io_show_weight,
-		.write = bfq_io_set_weight,
-	},
-	{} /* terminate */
-};
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-
-static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
-			struct bfq_queue *bfqq, int op, int op_flags) { }
-static inline void
-bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, int op_flags) { }
-static inline void
-bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, int op_flags) { }
-static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
-			uint64_t start_time, uint64_t io_start_time, int op,
-			int op_flags) { }
-static inline void
-bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
-				     struct bfq_group *curr_bfqg) { }
-static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
-static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
-
-static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct bfq_group *bfqg) {}
-
-static void bfq_init_entity(struct bfq_entity *entity,
-			    struct bfq_group *bfqg)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	entity->weight = entity->new_weight;
-	entity->orig_weight = entity->new_weight;
-	if (bfqq) {
-		bfqq->ioprio = bfqq->new_ioprio;
-		bfqq->ioprio_class = bfqq->new_ioprio_class;
-	}
-	entity->sched_data = &bfqg->sched_data;
-}
-
-static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
-
-static void bfq_end_wr_async(struct bfq_data *bfqd)
-{
-	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
-}
-
-static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
-					    struct blkcg *blkcg)
-{
-	return bfqd->root_group;
-}
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
-{
-	return bfqq->bfqd->root_group;
-}
-
-static struct bfq_group *
-bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
-{
-	struct bfq_group *bfqg;
-	int i;
-
-	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
-	if (!bfqg)
-		return NULL;
-
-	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
-		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
-
-	return bfqg;
-}
-#endif
diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
deleted file mode 100644
index fb7bb8f08b75..000000000000
--- a/block/bfq-ioc.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * BFQ: I/O context handling.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
- *
- * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
- *		      Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
- */
-
-/**
- * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
- * @icq: the iocontext queue.
- */
-static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
-{
-	/* bic->icq is the first member, %NULL will convert to %NULL */
-	return container_of(icq, struct bfq_io_cq, icq);
-}
-
-/**
- * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
- * @bfqd: the lookup key.
- * @ioc: the io_context of the process doing I/O.
- *
- * Queue lock must be held.
- */
-static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
-					struct io_context *ioc)
-{
-	if (ioc)
-		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
-	return NULL;
-}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
deleted file mode 100644
index 6e6025dacfc6..000000000000
--- a/block/bfq-iosched.c
+++ /dev/null
@@ -1,5403 +0,0 @@
-/*
- * Budget Fair Queueing (BFQ) I/O scheduler.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
- *
- * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
- *		      Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
- *
- * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
- * file.
- *
- * BFQ is a proportional-share I/O scheduler, with some extra
- * low-latency capabilities. BFQ also supports full hierarchical
- * scheduling through cgroups. Next paragraphs provide an introduction
- * on BFQ inner workings. Details on BFQ benefits and usage can be
- * found in Documentation/block/bfq-iosched.txt.
- *
- * BFQ is a proportional-share storage-I/O scheduling algorithm based
- * on the slice-by-slice service scheme of CFQ. But BFQ assigns
- * budgets, measured in number of sectors, to processes instead of
- * time slices. The device is not granted to the in-service process
- * for a given time slice, but until it has exhausted its assigned
- * budget. This change from the time to the service domain enables BFQ
- * to distribute the device throughput among processes as desired,
- * without any distortion due to throughput fluctuations, or to device
- * internal queueing. BFQ uses an ad hoc internal scheduler, called
- * B-WF2Q+, to schedule processes according to their budgets. More
- * precisely, BFQ schedules queues associated with processes. Thanks to
- * the accurate policy of B-WF2Q+, BFQ can afford to assign high
- * budgets to I/O-bound processes issuing sequential requests (to
- * boost the throughput), and yet guarantee a low latency to
- * interactive and soft real-time applications.
- *
- * NOTE: if the main or only goal, with a given device, is to achieve
- * the maximum-possible throughput at all times, then do switch off
- * all low-latency heuristics for that device, by setting low_latency
- * to 0.
- *
- * BFQ is described in [1], where also a reference to the initial, more
- * theoretical paper on BFQ can be found. The interested reader can find
- * in the latter paper full details on the main algorithm, as well as
- * formulas of the guarantees and formal proofs of all the properties.
- * With respect to the version of BFQ presented in these papers, this
- * implementation adds a few more heuristics, such as the one that
- * guarantees a low latency to soft real-time applications, and a
- * hierarchical extension based on H-WF2Q+.
- *
- * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
- * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
- * complexity derives from the one introduced with EEVDF in [3].
- *
- * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
- *   Scheduler", Proceedings of the First Workshop on Mobile System
- *   Technologies (MST-2015), May 2015.
- *   http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
- *
- * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
- *
- * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
- *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
- *     Oct 1997.
- *
- * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
- *
- * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
- *     First: A Flexible and Accurate Mechanism for Proportional Share
- *     Resource Allocation,'' technical report.
- *
- * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
- */
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-#include <linux/cgroup.h>
-#include <linux/elevator.h>
-#include <linux/jiffies.h>
-#include <linux/rbtree.h>
-#include <linux/ioprio.h>
-#include "bfq.h"
-#include "blk.h"
-
-/* Expiration time of sync (0) and async (1) requests, in ns. */
-static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
-
-/* Maximum backwards seek, in KiB. */
-static const int bfq_back_max = (16 * 1024);
-
-/* Penalty of a backwards seek, in number of sectors. */
-static const int bfq_back_penalty = 2;
-
-/* Idling period duration, in ns. */
-static u32 bfq_slice_idle = (NSEC_PER_SEC / 125);
-
-/* Minimum number of assigned budgets for which stats are safe to compute. */
-static const int bfq_stats_min_budgets = 194;
-
-/* Default maximum budget values, in sectors and number of requests. */
-static const int bfq_default_max_budget = (16 * 1024);
-
-/*
- * Async to sync throughput distribution is controlled as follows:
- * when an async request is served, the entity is charged the number
- * of sectors of the request, multiplied by the factor below
- */
-static const int bfq_async_charge_factor = 10;
-
-/* Default timeout values, in jiffies, approximating CFQ defaults. */
-static const int bfq_timeout = (HZ / 8);
-
-static struct kmem_cache *bfq_pool;
-
-/* Below this threshold (in ns), we consider thinktime immediate. */
-#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
-
-/* hw_tag detection: parallel requests threshold and min samples needed. */
-#define BFQ_HW_QUEUE_THRESHOLD	4
-#define BFQ_HW_QUEUE_SAMPLES	32
-
-#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
-#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
-#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
-#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)
-
-/* Min number of samples required to perform peak-rate update */
-#define BFQ_RATE_MIN_SAMPLES	32
-/* Min observation time interval required to perform a peak-rate update (ns) */
-#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
-/* Target observation time interval for a peak-rate update (ns) */
-#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
-
-/* Shift used for peak rate fixed precision calculations. */
-#define BFQ_RATE_SHIFT		16
-
-/*
- * By default, BFQ computes the duration of the weight raising for
- * interactive applications automatically, using the following formula:
- * duration = (R / r) * T, where r is the peak rate of the device, and
- * R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see below),
- * and T is a reference time: given the systems that are likely to be
- * installed on the reference device according to its speed class, T is
- * about the maximum time needed, under BFQ and while reading two files in
- * parallel, to load typical large applications on these systems.
- * In practice, the slower/faster the device at hand is, the more/less it
- * takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to interactive
- * applications.
- *
- * BFQ uses four different reference pairs (R, T), depending on:
- * . whether the device is rotational or non-rotational;
- * . whether the device is slow, such as old or portable HDDs, as well as
- *   SD cards, or fast, such as newer HDDs and SSDs.
- *
- * The device's speed class is dynamically (re)detected in
- * bfq_update_peak_rate() every time the estimated peak rate is updated.
- *
- * In the following definitions, R_slow[0]/R_fast[0] and
- * T_slow[0]/T_fast[0] are the reference values for a slow/fast
- * rotational device, whereas R_slow[1]/R_fast[1] and
- * T_slow[1]/T_fast[1] are the reference values for a slow/fast
- * non-rotational device. Finally, device_speed_thresh are the
- * thresholds used to switch between speed classes. The reference
- * rates are not the actual peak rates of the devices used as a
- * reference, but slightly lower values. The reason for using these
- * slightly lower values is that the peak-rate estimator tends to
- * yield slightly lower values than the actual peak rate (it can yield
- * the actual peak rate only if there is only one process doing I/O,
- * and the process does sequential I/O).
- *
- * Both the reference peak rates and the thresholds are measured in
- * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
- */
-static int R_slow[2] = {1000, 10700};
-static int R_fast[2] = {14000, 33000};
-/*
- * To improve readability, a conversion function is used to initialize the
- * following arrays, which entails that they can be initialized only in a
- * function.
- */
-static int T_slow[2];
-static int T_fast[2];
-static int device_speed_thresh[2];
-
-#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
-				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
-
-#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
-#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
-
-static void bfq_schedule_dispatch(struct bfq_data *bfqd);
-
-#include "bfq-ioc.c"
-#include "bfq-sched.c"
-#include "bfq-cgroup.c"
-
-#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
-#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
-
-#define bfq_sample_valid(samples)	((samples) > 80)
-
-/*
- * We regard a request as SYNC, if either it's a read or has the SYNC bit
- * set (in which case it could also be a direct WRITE).
- */
-static int bfq_bio_sync(struct bio *bio)
-{
-	return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC);
-}
-
-/*
- * Scheduler run of queue, if there are requests pending and no one in the
- * driver that will restart queueing.
- */
-static void bfq_schedule_dispatch(struct bfq_data *bfqd)
-{
-	if (bfqd->queued != 0) {
-		bfq_log(bfqd, "schedule dispatch");
-		kblockd_schedule_work(&bfqd->unplug_work);
-	}
-}
-
-/*
- * Lifted from AS - choose which of rq1 and rq2 that is best served now.
- * We choose the request that is closesr to the head right now.  Distance
- * behind the head is penalized and only allowed to a certain extent.
- */
-static struct request *bfq_choose_req(struct bfq_data *bfqd,
-				      struct request *rq1,
-				      struct request *rq2,
-				      sector_t last)
-{
-	sector_t s1, s2, d1 = 0, d2 = 0;
-	unsigned long back_max;
-#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
-#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
-	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
-
-	if (!rq1 || rq1 == rq2)
-		return rq2;
-	if (!rq2)
-		return rq1;
-
-	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
-		return rq1;
-	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
-		return rq2;
-	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
-		return rq1;
-	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
-		return rq2;
-
-	s1 = blk_rq_pos(rq1);
-	s2 = blk_rq_pos(rq2);
-
-	/*
-	 * By definition, 1KiB is 2 sectors.
-	 */
-	back_max = bfqd->bfq_back_max * 2;
-
-	/*
-	 * Strict one way elevator _except_ in the case where we allow
-	 * short backward seeks which are biased as twice the cost of a
-	 * similar forward seek.
-	 */
-	if (s1 >= last)
-		d1 = s1 - last;
-	else if (s1 + back_max >= last)
-		d1 = (last - s1) * bfqd->bfq_back_penalty;
-	else
-		wrap |= BFQ_RQ1_WRAP;
-
-	if (s2 >= last)
-		d2 = s2 - last;
-	else if (s2 + back_max >= last)
-		d2 = (last - s2) * bfqd->bfq_back_penalty;
-	else
-		wrap |= BFQ_RQ2_WRAP;
-
-	/* Found required data */
-
-	/*
-	 * By doing switch() on the bit mask "wrap" we avoid having to
-	 * check two variables for all permutations: --> faster!
-	 */
-	switch (wrap) {
-	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
-		if (d1 < d2)
-			return rq1;
-		else if (d2 < d1)
-			return rq2;
-
-		if (s1 >= s2)
-			return rq1;
-		else
-			return rq2;
-
-	case BFQ_RQ2_WRAP:
-		return rq1;
-	case BFQ_RQ1_WRAP:
-		return rq2;
-	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
-	default:
-		/*
-		 * Since both rqs are wrapped,
-		 * start with the one that's further behind head
-		 * (--> only *one* back seek required),
-		 * since back seek takes more time than forward.
-		 */
-		if (s1 <= s2)
-			return rq1;
-		else
-			return rq2;
-	}
-}
-
-static struct bfq_queue *
-bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
-		     sector_t sector, struct rb_node **ret_parent,
-		     struct rb_node ***rb_link)
-{
-	struct rb_node **p, *parent;
-	struct bfq_queue *bfqq = NULL;
-
-	parent = NULL;
-	p = &root->rb_node;
-	while (*p) {
-		struct rb_node **n;
-
-		parent = *p;
-		bfqq = rb_entry(parent, struct bfq_queue, pos_node);
-
-		/*
-		 * Sort strictly based on sector. Smallest to the left,
-		 * largest to the right.
-		 */
-		if (sector > blk_rq_pos(bfqq->next_rq))
-			n = &(*p)->rb_right;
-		else if (sector < blk_rq_pos(bfqq->next_rq))
-			n = &(*p)->rb_left;
-		else
-			break;
-		p = n;
-		bfqq = NULL;
-	}
-
-	*ret_parent = parent;
-	if (rb_link)
-		*rb_link = p;
-
-	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
-		(unsigned long long) sector,
-		bfqq ? bfqq->pid : 0);
-
-	return bfqq;
-}
-
-static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	struct rb_node **p, *parent;
-	struct bfq_queue *__bfqq;
-
-	if (bfqq->pos_root) {
-		rb_erase(&bfqq->pos_node, bfqq->pos_root);
-		bfqq->pos_root = NULL;
-	}
-
-	if (bfq_class_idle(bfqq))
-		return;
-	if (!bfqq->next_rq)
-		return;
-
-	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
-	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
-			blk_rq_pos(bfqq->next_rq), &parent, &p);
-	if (!__bfqq) {
-		rb_link_node(&bfqq->pos_node, parent, p);
-		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
-	} else
-		bfqq->pos_root = NULL;
-}
-
-/*
- * Tell whether there are active queues or groups with differentiated weights.
- */
-static bool bfq_differentiated_weights(struct bfq_data *bfqd)
-{
-	/*
-	 * For weights to differ, at least one of the trees must contain
-	 * at least two nodes.
-	 */
-	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
-		(bfqd->queue_weights_tree.rb_node->rb_left ||
-		 bfqd->queue_weights_tree.rb_node->rb_right)
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	       ) ||
-	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
-		(bfqd->group_weights_tree.rb_node->rb_left ||
-		 bfqd->group_weights_tree.rb_node->rb_right)
-#endif
-	       );
-}
-
-/*
- * The following function returns true if every queue must receive the
- * same share of the throughput (this condition is used when deciding
- * whether idling may be disabled, see the comments in the function
- * bfq_bfqq_may_idle()).
- *
- * Such a scenario occurs when:
- * 1) all active queues have the same weight,
- * 2) all active groups at the same level in the groups tree have the same
- *    weight,
- * 3) all active groups at the same level in the groups tree have the same
- *    number of children.
- *
- * Unfortunately, keeping the necessary state for evaluating exactly the
- * above symmetry conditions would be quite complex and time-consuming.
- * Therefore this function evaluates, instead, the following stronger
- * sub-conditions, for which it is much easier to maintain the needed
- * state:
- * 1) all active queues have the same weight,
- * 2) all active groups have the same weight,
- * 3) all active groups have at most one active child each.
- * In particular, the last two conditions are always true if hierarchical
- * support and the cgroups interface are not enabled, thus no state needs
- * to be maintained in this case.
- */
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
-{
-	return !bfq_differentiated_weights(bfqd);
-}
-
-/*
- * If the weight-counter tree passed as input contains no counter for
- * the weight of the input entity, then add that counter; otherwise just
- * increment the existing counter.
- *
- * Note that weight-counter trees contain few nodes in mostly symmetric
- * scenarios. For example, if all queues have the same weight, then the
- * weight-counter tree for the queues may contain at most one node.
- * This holds even if low_latency is on, because weight-raised queues
- * are not inserted in the tree.
- * In most scenarios, the rate at which nodes are created/destroyed
- * should be low too.
- */
-static void bfq_weights_tree_add(struct bfq_data *bfqd,
-				 struct bfq_entity *entity,
-				 struct rb_root *root)
-{
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
-
-	/*
-	 * Do not insert if the entity is already associated with a
-	 * counter, which happens if:
-	 *   1) the entity is associated with a queue,
-	 *   2) a request arrival has caused the queue to become both
-	 *      non-weight-raised, and hence change its weight, and
-	 *      backlogged; in this respect, each of the two events
-	 *      causes an invocation of this function,
-	 *   3) this is the invocation of this function caused by the
-	 *      second event. This second invocation is actually useless,
-	 *      and we handle this fact by exiting immediately. More
-	 *      efficient or clearer solutions might possibly be adopted.
-	 */
-	if (entity->weight_counter)
-		return;
-
-	while (*new) {
-		struct bfq_weight_counter *__counter = container_of(*new,
-						struct bfq_weight_counter,
-						weights_node);
-		parent = *new;
-
-		if (entity->weight == __counter->weight) {
-			entity->weight_counter = __counter;
-			goto inc_counter;
-		}
-		if (entity->weight < __counter->weight)
-			new = &((*new)->rb_left);
-		else
-			new = &((*new)->rb_right);
-	}
-
-	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
-					 GFP_ATOMIC);
-
-	/*
-	 * In the unlucky event of an allocation failure, we just
-	 * exit. This will cause the weight of entity to not be
-	 * considered in bfq_differentiated_weights, which, in its
-	 * turn, causes the scenario to be deemed wrongly symmetric in
-	 * case entity's weight would have been the only weight making
-	 * the scenario asymmetric. On the bright side, no unbalance
-	 * will however occur when entity becomes inactive again (the
-	 * invocation of this function is triggered by an activation
-	 * of entity). In fact, bfq_weights_tree_remove does nothing
-	 * if !entity->weight_counter.
-	 */
-	if (unlikely(!entity->weight_counter))
-		return;
-
-	entity->weight_counter->weight = entity->weight;
-	rb_link_node(&entity->weight_counter->weights_node, parent, new);
-	rb_insert_color(&entity->weight_counter->weights_node, root);
-
-inc_counter:
-	entity->weight_counter->num_active++;
-}
-
-/*
- * Decrement the weight counter associated with the entity, and, if the
- * counter reaches 0, remove the counter from the tree.
- * See the comments to the function bfq_weights_tree_add() for considerations
- * about overhead.
- */
-static void bfq_weights_tree_remove(struct bfq_data *bfqd,
-				    struct bfq_entity *entity,
-				    struct rb_root *root)
-{
-	if (!entity->weight_counter)
-		return;
-
-	BUG_ON(RB_EMPTY_ROOT(root));
-	BUG_ON(entity->weight_counter->weight != entity->weight);
-
-	BUG_ON(!entity->weight_counter->num_active);
-	entity->weight_counter->num_active--;
-	if (entity->weight_counter->num_active > 0)
-		goto reset_entity_pointer;
-
-	rb_erase(&entity->weight_counter->weights_node, root);
-	kfree(entity->weight_counter);
-
-reset_entity_pointer:
-	entity->weight_counter = NULL;
-}
-
-/*
- * Return expired entry, or NULL to just start from scratch in rbtree.
- */
-static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
-				      struct request *last)
-{
-	struct request *rq;
-
-	if (bfq_bfqq_fifo_expire(bfqq))
-		return NULL;
-
-	bfq_mark_bfqq_fifo_expire(bfqq);
-
-	rq = rq_entry_fifo(bfqq->fifo.next);
-
-	if (rq == last || ktime_get_ns() < rq->fifo_time)
-		return NULL;
-
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
-	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
-	return rq;
-}
-
-static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
-					struct bfq_queue *bfqq,
-					struct request *last)
-{
-	struct rb_node *rbnext = rb_next(&last->rb_node);
-	struct rb_node *rbprev = rb_prev(&last->rb_node);
-	struct request *next, *prev = NULL;
-
-	BUG_ON(list_empty(&bfqq->fifo));
-
-	/* Follow expired path, else get first next available. */
-	next = bfq_check_fifo(bfqq, last);
-	if (next) {
-		BUG_ON(next == last);
-		return next;
-	}
-
-	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
-
-	if (rbprev)
-		prev = rb_entry_rq(rbprev);
-
-	if (rbnext)
-		next = rb_entry_rq(rbnext);
-	else {
-		rbnext = rb_first(&bfqq->sort_list);
-		if (rbnext && rbnext != &last->rb_node)
-			next = rb_entry_rq(rbnext);
-	}
-
-	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
-}
-
-/* see the definition of bfq_async_charge_factor for details */
-static unsigned long bfq_serv_to_charge(struct request *rq,
-					struct bfq_queue *bfqq)
-{
-	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
-		return blk_rq_sectors(rq);
-
-	/*
-	 * If there are no weight-raised queues, then amplify service
-	 * by just the async charge factor; otherwise amplify service
-	 * by twice the async charge factor, to further reduce latency
-	 * for weight-raised queues.
-	 */
-	if (bfqq->bfqd->wr_busy_queues == 0)
-		return blk_rq_sectors(rq) * bfq_async_charge_factor;
-
-	return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
-}
-
-/**
- * bfq_updated_next_req - update the queue after a new next_rq selection.
- * @bfqd: the device data the queue belongs to.
- * @bfqq: the queue to update.
- *
- * If the first request of a queue changes we make sure that the queue
- * has enough budget to serve at least its first request (if the
- * request has grown).  We do this because if the queue has not enough
- * budget for its first request, it has to go through two dispatch
- * rounds to actually get it dispatched.
- */
-static void bfq_updated_next_req(struct bfq_data *bfqd,
-				 struct bfq_queue *bfqq)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-	struct request *next_rq = bfqq->next_rq;
-	unsigned long new_budget;
-
-	if (!next_rq)
-		return;
-
-	if (bfqq == bfqd->in_service_queue)
-		/*
-		 * In order not to break guarantees, budgets cannot be
-		 * changed after an entity has been selected.
-		 */
-		return;
-
-	BUG_ON(entity->tree != &st->active);
-	BUG_ON(entity == entity->sched_data->in_service_entity);
-
-	new_budget = max_t(unsigned long, bfqq->max_budget,
-			   bfq_serv_to_charge(next_rq, bfqq));
-	if (entity->budget != new_budget) {
-		entity->budget = new_budget;
-		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
-					 new_budget);
-		bfq_requeue_bfqq(bfqd, bfqq);
-	}
-}
-
-static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
-{
-	u64 dur;
-
-	if (bfqd->bfq_wr_max_time > 0)
-		return bfqd->bfq_wr_max_time;
-
-	dur = bfqd->RT_prod;
-	do_div(dur, bfqd->peak_rate);
-
-	/*
-	 * Limit duration between 3 and 13 seconds. Tests show that
-	 * higher values than 13 seconds often yield the opposite of
-	 * the desired result, i.e., worsen responsiveness by letting
-	 * non-interactive and non-soft-real-time applications
-	 * preserve weight raising for a too long time interval.
-	 *
-	 * On the other end, lower values than 3 seconds make it
-	 * difficult for most interactive tasks to complete their jobs
-	 * before weight-raising finishes.
-	 */
-	if (dur > msecs_to_jiffies(13000))
-		dur = msecs_to_jiffies(13000);
-	else if (dur < msecs_to_jiffies(3000))
-		dur = msecs_to_jiffies(3000);
-
-	return dur;
-}
-
-static void
-bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
-		      struct bfq_io_cq *bic, bool bfq_already_existing)
-{
-	unsigned int old_wr_coeff;
-	bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
-
-	if (bic->saved_has_short_ttime)
-		bfq_mark_bfqq_has_short_ttime(bfqq);
-	else
-		bfq_clear_bfqq_has_short_ttime(bfqq);
-
-	if (bic->saved_IO_bound)
-		bfq_mark_bfqq_IO_bound(bfqq);
-	else
-		bfq_clear_bfqq_IO_bound(bfqq);
-
-	if (unlikely(busy))
-		old_wr_coeff = bfqq->wr_coeff;
-
-	bfqq->wr_coeff = bic->saved_wr_coeff;
-	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
-	BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
-	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
-	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
-	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
-
-	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
-				   time_is_before_jiffies(bfqq->last_wr_start_finish +
-							  bfqq->wr_cur_max_time))) {
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			     "resume state: switching off wr (%lu + %lu < %lu)",
-			     bfqq->last_wr_start_finish, bfqq->wr_cur_max_time,
-			     jiffies);
-
-		bfqq->wr_coeff = 1;
-	}
-
-	/* make sure weight will be updated, however we got here */
-	bfqq->entity.prio_changed = 1;
-
-	if (likely(!busy))
-		return;
-
-	if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) {
-		bfqd->wr_busy_queues++;
-		BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues);
-	} else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) {
-		bfqd->wr_busy_queues--;
-		BUG_ON(bfqd->wr_busy_queues < 0);
-	}
-}
-
-static int bfqq_process_refs(struct bfq_queue *bfqq)
-{
-	int process_refs, io_refs;
-
-	lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
-
-	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
-	process_refs = bfqq->ref - io_refs - bfqq->entity.on_st;
-	BUG_ON(process_refs < 0);
-	return process_refs;
-}
-
-/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
-static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	struct bfq_queue *item;
-	struct hlist_node *n;
-
-	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
-		hlist_del_init(&item->burst_list_node);
-	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
-	bfqd->burst_size = 1;
-	bfqd->burst_parent_entity = bfqq->entity.parent;
-}
-
-/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
-static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	/* Increment burst size to take into account also bfqq */
-	bfqd->burst_size++;
-
-	bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size);
-
-	BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh);
-
-	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
-		struct bfq_queue *pos, *bfqq_item;
-		struct hlist_node *n;
-
-		/*
-		 * Enough queues have been activated shortly after each
-		 * other to consider this burst as large.
-		 */
-		bfqd->large_burst = true;
-		bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started");
-
-		/*
-		 * We can now mark all queues in the burst list as
-		 * belonging to a large burst.
-		 */
-		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
-				     burst_list_node) {
-			bfq_mark_bfqq_in_large_burst(bfqq_item);
-			bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst");
-		}
-		bfq_mark_bfqq_in_large_burst(bfqq);
-		bfq_log_bfqq(bfqd, bfqq, "marked in large burst");
-
-		/*
-		 * From now on, and until the current burst finishes, any
-		 * new queue being activated shortly after the last queue
-		 * was inserted in the burst can be immediately marked as
-		 * belonging to a large burst. So the burst list is not
-		 * needed any more. Remove it.
-		 */
-		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
-					  burst_list_node)
-			hlist_del_init(&pos->burst_list_node);
-	} else /*
-		* Burst not yet large: add bfqq to the burst list. Do
-		* not increment the ref counter for bfqq, because bfqq
-		* is removed from the burst list before freeing bfqq
-		* in put_queue.
-		*/
-		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
-}
-
-/*
- * If many queues belonging to the same group happen to be created
- * shortly after each other, then the processes associated with these
- * queues have typically a common goal. In particular, bursts of queue
- * creations are usually caused by services or applications that spawn
- * many parallel threads/processes. Examples are systemd during boot,
- * or git grep. To help these processes get their job done as soon as
- * possible, it is usually better to not grant either weight-raising
- * or device idling to their queues.
- *
- * In this comment we describe, firstly, the reasons why this fact
- * holds, and, secondly, the next function, which implements the main
- * steps needed to properly mark these queues so that they can then be
- * treated in a different way.
- *
- * The above services or applications benefit mostly from a high
- * throughput: the quicker the requests of the activated queues are
- * cumulatively served, the sooner the target job of these queues gets
- * completed. As a consequence, weight-raising any of these queues,
- * which also implies idling the device for it, is almost always
- * counterproductive. In most cases it just lowers throughput.
- *
- * On the other hand, a burst of queue creations may be caused also by
- * the start of an application that does not consist of a lot of
- * parallel I/O-bound threads. In fact, with a complex application,
- * several short processes may need to be executed to start-up the
- * application. In this respect, to start an application as quickly as
- * possible, the best thing to do is in any case to privilege the I/O
- * related to the application with respect to all other
- * I/O. Therefore, the best strategy to start as quickly as possible
- * an application that causes a burst of queue creations is to
- * weight-raise all the queues created during the burst. This is the
- * exact opposite of the best strategy for the other type of bursts.
- *
- * In the end, to take the best action for each of the two cases, the
- * two types of bursts need to be distinguished. Fortunately, this
- * seems relatively easy, by looking at the sizes of the bursts. In
- * particular, we found a threshold such that only bursts with a
- * larger size than that threshold are apparently caused by
- * services or commands such as systemd or git grep. For brevity,
- * hereafter we call just 'large' these bursts. BFQ *does not*
- * weight-raise queues whose creation occurs in a large burst. In
- * addition, for each of these queues BFQ performs or does not perform
- * idling depending on which choice boosts the throughput more. The
- * exact choice depends on the device and request pattern at
- * hand.
- *
- * Unfortunately, false positives may occur while an interactive task
- * is starting (e.g., an application is being started). The
- * consequence is that the queues associated with the task do not
- * enjoy weight raising as expected. Fortunately these false positives
- * are very rare. They typically occur if some service happens to
- * start doing I/O exactly when the interactive task starts.
- *
- * Turning back to the next function, it implements all the steps
- * needed to detect the occurrence of a large burst and to properly
- * mark all the queues belonging to it (so that they can then be
- * treated in a different way). This goal is achieved by maintaining a
- * "burst list" that holds, temporarily, the queues that belong to the
- * burst in progress. The list is then used to mark these queues as
- * belonging to a large burst if the burst does become large. The main
- * steps are the following.
- *
- * . when the very first queue is created, the queue is inserted into the
- *   list (as it could be the first queue in a possible burst)
- *
- * . if the current burst has not yet become large, and a queue Q that does
- *   not yet belong to the burst is activated shortly after the last time
- *   at which a new queue entered the burst list, then the function appends
- *   Q to the burst list
- *
- * . if, as a consequence of the previous step, the burst size reaches
- *   the large-burst threshold, then
- *
- *     . all the queues in the burst list are marked as belonging to a
- *       large burst
- *
- *     . the burst list is deleted; in fact, the burst list already served
- *       its purpose (keeping temporarily track of the queues in a burst,
- *       so as to be able to mark them as belonging to a large burst in the
- *       previous sub-step), and now is not needed any more
- *
- *     . the device enters a large-burst mode
- *
- * . if a queue Q that does not belong to the burst is created while
- *   the device is in large-burst mode and shortly after the last time
- *   at which a queue either entered the burst list or was marked as
- *   belonging to the current large burst, then Q is immediately marked
- *   as belonging to a large burst.
- *
- * . if a queue Q that does not belong to the burst is created a while
- *   later, i.e., not shortly after, than the last time at which a queue
- *   either entered the burst list or was marked as belonging to the
- *   current large burst, then the current burst is deemed as finished and:
- *
- *        . the large-burst mode is reset if set
- *
- *        . the burst list is emptied
- *
- *        . Q is inserted in the burst list, as Q may be the first queue
- *          in a possible new burst (then the burst list contains just Q
- *          after this step).
- */
-static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	/*
-	 * If bfqq is already in the burst list or is part of a large
-	 * burst, or finally has just been split, then there is
-	 * nothing else to do.
-	 */
-	if (!hlist_unhashed(&bfqq->burst_list_node) ||
-	    bfq_bfqq_in_large_burst(bfqq) ||
-	    time_is_after_eq_jiffies(bfqq->split_time +
-				     msecs_to_jiffies(10)))
-		return;
-
-	/*
-	 * If bfqq's creation happens late enough, or bfqq belongs to
-	 * a different group than the burst group, then the current
-	 * burst is finished, and related data structures must be
-	 * reset.
-	 *
-	 * In this respect, consider the special case where bfqq is
-	 * the very first queue created after BFQ is selected for this
-	 * device. In this case, last_ins_in_burst and
-	 * burst_parent_entity are not yet significant when we get
-	 * here. But it is easy to verify that, whether or not the
-	 * following condition is true, bfqq will end up being
-	 * inserted into the burst list. In particular the list will
-	 * happen to contain only bfqq. And this is exactly what has
-	 * to happen, as bfqq may be the first queue of the first
-	 * burst.
-	 */
-	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
-	    bfqd->bfq_burst_interval) ||
-	    bfqq->entity.parent != bfqd->burst_parent_entity) {
-		bfqd->large_burst = false;
-		bfq_reset_burst_list(bfqd, bfqq);
-		bfq_log_bfqq(bfqd, bfqq,
-			"handle_burst: late activation or different group");
-		goto end;
-	}
-
-	/*
-	 * If we get here, then bfqq is being activated shortly after the
-	 * last queue. So, if the current burst is also large, we can mark
-	 * bfqq as belonging to this large burst immediately.
-	 */
-	if (bfqd->large_burst) {
-		bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst");
-		bfq_mark_bfqq_in_large_burst(bfqq);
-		goto end;
-	}
-
-	/*
-	 * If we get here, then a large-burst state has not yet been
-	 * reached, but bfqq is being activated shortly after the last
-	 * queue. Then we add bfqq to the burst.
-	 */
-	bfq_add_to_burst(bfqd, bfqq);
-end:
-	/*
-	 * At this point, bfqq either has been added to the current
-	 * burst or has caused the current burst to terminate and a
-	 * possible new burst to start. In particular, in the second
-	 * case, bfqq has become the first queue in the possible new
-	 * burst.  In both cases last_ins_in_burst needs to be moved
-	 * forward.
-	 */
-	bfqd->last_ins_in_burst = jiffies;
-
-}
-
-static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	return entity->budget - entity->service;
-}
-
-/*
- * If enough samples have been computed, return the current max budget
- * stored in bfqd, which is dynamically updated according to the
- * estimated disk peak rate; otherwise return the default max budget
- */
-static int bfq_max_budget(struct bfq_data *bfqd)
-{
-	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
-		return bfq_default_max_budget;
-	else
-		return bfqd->bfq_max_budget;
-}
-
-/*
- * Return min budget, which is a fraction of the current or default
- * max budget (trying with 1/32)
- */
-static int bfq_min_budget(struct bfq_data *bfqd)
-{
-	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
-		return bfq_default_max_budget / 32;
-	else
-		return bfqd->bfq_max_budget / 32;
-}
-
-static void bfq_bfqq_expire(struct bfq_data *bfqd,
-			    struct bfq_queue *bfqq,
-			    bool compensate,
-			    enum bfqq_expiration reason);
-
-/*
- * The next function, invoked after the input queue bfqq switches from
- * idle to busy, updates the budget of bfqq. The function also tells
- * whether the in-service queue should be expired, by returning
- * true. The purpose of expiring the in-service queue is to give bfqq
- * the chance to possibly preempt the in-service queue, and the reason
- * for preempting the in-service queue is to achieve one of the two
- * goals below.
- *
- * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
- * expired because it has remained idle. In particular, bfqq may have
- * expired for one of the following two reasons:
- *
- * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and
- *   did not make it to issue a new request before its last request
- *   was served;
- *
- * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue
- *   a new request before the expiration of the idling-time.
- *
- * Even if bfqq has expired for one of the above reasons, the process
- * associated with the queue may be however issuing requests greedily,
- * and thus be sensitive to the bandwidth it receives (bfqq may have
- * remained idle for other reasons: CPU high load, bfqq not enjoying
- * idling, I/O throttling somewhere in the path from the process to
- * the I/O scheduler, ...). But if, after every expiration for one of
- * the above two reasons, bfqq has to wait for the service of at least
- * one full budget of another queue before being served again, then
- * bfqq is likely to get a much lower bandwidth or resource time than
- * its reserved ones. To address this issue, two countermeasures need
- * to be taken.
- *
- * First, the budget and the timestamps of bfqq need to be updated in
- * a special way on bfqq reactivation: they need to be updated as if
- * bfqq did not remain idle and did not expire. In fact, if they are
- * computed as if bfqq expired and remained idle until reactivation,
- * then the process associated with bfqq is treated as if, instead of
- * being greedy, it stopped issuing requests when bfqq remained idle,
- * and restarts issuing requests only on this reactivation. In other
- * words, the scheduler does not help the process recover the "service
- * hole" between bfqq expiration and reactivation. As a consequence,
- * the process receives a lower bandwidth than its reserved one. In
- * contrast, to recover this hole, the budget must be updated as if
- * bfqq was not expired at all before this reactivation, i.e., it must
- * be set to the value of the remaining budget when bfqq was
- * expired. Along the same line, timestamps need to be assigned the
- * value they had the last time bfqq was selected for service, i.e.,
- * before last expiration. Thus timestamps need to be back-shifted
- * with respect to their normal computation (see [1] for more details
- * on this tricky aspect).
- *
- * Secondly, to allow the process to recover the hole, the in-service
- * queue must be expired too, to give bfqq the chance to preempt it
- * immediately. In fact, if bfqq has to wait for a full budget of the
- * in-service queue to be completed, then it may become impossible to
- * let the process recover the hole, even if the back-shifted
- * timestamps of bfqq are lower than those of the in-service queue. If
- * this happens for most or all of the holes, then the process may not
- * receive its reserved bandwidth. In this respect, it is worth noting
- * that, being the service of outstanding requests unpreemptible, a
- * little fraction of the holes may however be unrecoverable, thereby
- * causing a little loss of bandwidth.
- *
- * The last important point is detecting whether bfqq does need this
- * bandwidth recovery. In this respect, the next function deems the
- * process associated with bfqq greedy, and thus allows it to recover
- * the hole, if: 1) the process is waiting for the arrival of a new
- * request (which implies that bfqq expired for one of the above two
- * reasons), and 2) such a request has arrived soon. The first
- * condition is controlled through the flag non_blocking_wait_rq,
- * while the second through the flag arrived_in_time. If both
- * conditions hold, then the function computes the budget in the
- * above-described special way, and signals that the in-service queue
- * should be expired. Timestamp back-shifting is done later in
- * __bfq_activate_entity.
- *
- * 2. Reduce latency. Even if timestamps are not backshifted to let
- * the process associated with bfqq recover a service hole, bfqq may
- * however happen to have, after being (re)activated, a lower finish
- * timestamp than the in-service queue.  That is, the next budget of
- * bfqq may have to be completed before the one of the in-service
- * queue. If this is the case, then preempting the in-service queue
- * allows this goal to be achieved, apart from the unpreemptible,
- * outstanding requests mentioned above.
- *
- * Unfortunately, regardless of which of the above two goals one wants
- * to achieve, service trees need first to be updated to know whether
- * the in-service queue must be preempted. To have service trees
- * correctly updated, the in-service queue must be expired and
- * rescheduled, and bfqq must be scheduled too. This is one of the
- * most costly operations (in future versions, the scheduling
- * mechanism may be re-designed in such a way to make it possible to
- * know whether preemption is needed without needing to update service
- * trees). In addition, queue preemptions almost always cause random
- * I/O, and thus loss of throughput. Because of these facts, the next
- * function adopts the following simple scheme to avoid both costly
- * operations and too frequent preemptions: it requests the expiration
- * of the in-service queue (unconditionally) only for queues that need
- * to recover a hole, or that either are weight-raised or deserve to
- * be weight-raised.
- */
-static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
-						struct bfq_queue *bfqq,
-						bool arrived_in_time,
-						bool wr_or_deserves_wr)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
-		/*
-		 * We do not clear the flag non_blocking_wait_rq here, as
-		 * the latter is used in bfq_activate_bfqq to signal
-		 * that timestamps need to be back-shifted (and is
-		 * cleared right after).
-		 */
-
-		/*
-		 * In next assignment we rely on that either
-		 * entity->service or entity->budget are not updated
-		 * on expiration if bfqq is empty (see
-		 * __bfq_bfqq_recalc_budget). Thus both quantities
-		 * remain unchanged after such an expiration, and the
-		 * following statement therefore assigns to
-		 * entity->budget the remaining budget on such an
-		 * expiration. For clarity, entity->service is not
-		 * updated on expiration in any case, and, in normal
-		 * operation, is reset only when bfqq is selected for
-		 * service (see bfq_get_next_queue).
-		 */
-		BUG_ON(bfqq->max_budget < 0);
-		entity->budget = min_t(unsigned long,
-				       bfq_bfqq_budget_left(bfqq),
-				       bfqq->max_budget);
-
-		BUG_ON(entity->budget < 0);
-		return true;
-	}
-
-	BUG_ON(bfqq->max_budget < 0);
-	entity->budget = max_t(unsigned long, bfqq->max_budget,
-			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
-	BUG_ON(entity->budget < 0);
-
-	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
-	return wr_or_deserves_wr;
-}
-
-static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
-					     struct bfq_queue *bfqq,
-					     unsigned int old_wr_coeff,
-					     bool wr_or_deserves_wr,
-					     bool interactive,
-					     bool in_burst,
-					     bool soft_rt)
-{
-	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
-		/* start a weight-raising period */
-		if (interactive) {
-			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
-			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-		} else {
-			bfqq->wr_start_at_switch_to_srt = jiffies;
-			bfqq->wr_coeff = bfqd->bfq_wr_coeff *
-				BFQ_SOFTRT_WEIGHT_FACTOR;
-			bfqq->wr_cur_max_time =
-				bfqd->bfq_wr_rt_max_time;
-		}
-		/*
-		 * If needed, further reduce budget to make sure it is
-		 * close to bfqq's backlog, so as to reduce the
-		 * scheduling-error component due to a too large
-		 * budget. Do not care about throughput consequences,
-		 * but only about latency. Finally, do not assign a
-		 * too small budget either, to avoid increasing
-		 * latency by causing too frequent expirations.
-		 */
-		bfqq->entity.budget = min_t(unsigned long,
-					    bfqq->entity.budget,
-					    2 * bfq_min_budget(bfqd));
-
-		bfq_log_bfqq(bfqd, bfqq,
-			     "wrais starting at %lu, rais_max_time %u",
-			     jiffies,
-			     jiffies_to_msecs(bfqq->wr_cur_max_time));
-	} else if (old_wr_coeff > 1) {
-		if (interactive) { /* update wr coeff and duration */
-			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
-			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-		} else if (in_burst) {
-			bfqq->wr_coeff = 1;
-			bfq_log_bfqq(bfqd, bfqq,
-				     "wrais ending at %lu, rais_max_time %u",
-				     jiffies,
-				     jiffies_to_msecs(bfqq->
-						      wr_cur_max_time));
-		} else if (soft_rt) {
-			/*
-			 * The application is now or still meeting the
-			 * requirements for being deemed soft rt.  We
-			 * can then correctly and safely (re)charge
-			 * the weight-raising duration for the
-			 * application with the weight-raising
-			 * duration for soft rt applications.
-			 *
-			 * In particular, doing this recharge now, i.e.,
-			 * before the weight-raising period for the
-			 * application finishes, reduces the probability
-			 * of the following negative scenario:
-			 * 1) the weight of a soft rt application is
-			 *    raised at startup (as for any newly
-			 *    created application),
-			 * 2) since the application is not interactive,
-			 *    at a certain time weight-raising is
-			 *    stopped for the application,
-			 * 3) at that time the application happens to
-			 *    still have pending requests, and hence
-			 *    is destined to not have a chance to be
-			 *    deemed soft rt before these requests are
-			 *    completed (see the comments to the
-			 *    function bfq_bfqq_softrt_next_start()
-			 *    for details on soft rt detection),
-			 * 4) these pending requests experience a high
-			 *    latency because the application is not
-			 *    weight-raised while they are pending.
-			 */
-			if (bfqq->wr_cur_max_time !=
-				bfqd->bfq_wr_rt_max_time) {
-				bfqq->wr_start_at_switch_to_srt =
-					bfqq->last_wr_start_finish;
-                BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
-
-				bfqq->wr_cur_max_time =
-					bfqd->bfq_wr_rt_max_time;
-				bfqq->wr_coeff = bfqd->bfq_wr_coeff *
-					BFQ_SOFTRT_WEIGHT_FACTOR;
-				bfq_log_bfqq(bfqd, bfqq,
-					     "switching to soft_rt wr");
-			} else
-				bfq_log_bfqq(bfqd, bfqq,
-					"moving forward soft_rt wr duration");
-			bfqq->last_wr_start_finish = jiffies;
-		}
-	}
-}
-
-static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
-					struct bfq_queue *bfqq)
-{
-	return bfqq->dispatched == 0 &&
-		time_is_before_jiffies(
-			bfqq->budget_timeout +
-			bfqd->bfq_wr_min_idle_time);
-}
-
-static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
-					     struct bfq_queue *bfqq,
-					     int old_wr_coeff,
-					     struct request *rq,
-					     bool *interactive)
-{
-	bool soft_rt, in_burst,	wr_or_deserves_wr,
-		bfqq_wants_to_preempt,
-		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
-		/*
-		 * See the comments on
-		 * bfq_bfqq_update_budg_for_activation for
-		 * details on the usage of the next variable.
-		 */
-		arrived_in_time =  ktime_get_ns() <=
-			RQ_BIC(rq)->ttime.last_end_request +
-			bfqd->bfq_slice_idle * 3;
-
-	bfq_log_bfqq(bfqd, bfqq,
-		     "bfq_add_request non-busy: "
-		     "jiffies %lu, in_time %d, idle_long %d busyw %d "
-		     "wr_coeff %u",
-		     jiffies, arrived_in_time,
-		     idle_for_long_time,
-		     bfq_bfqq_non_blocking_wait_rq(bfqq),
-		     old_wr_coeff);
-
-	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
-	BUG_ON(bfqq == bfqd->in_service_queue);
-	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
-				 req_op(rq), rq->cmd_flags);
-
-	/*
-	 * bfqq deserves to be weight-raised if:
-	 * - it is sync,
-	 * - it does not belong to a large burst,
-	 * - it has been idle for enough time or is soft real-time,
-	 * - is linked to a bfq_io_cq (it is not shared in any sense)
-	 */
-	in_burst = bfq_bfqq_in_large_burst(bfqq);
-	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
-		!in_burst &&
-		time_is_before_jiffies(bfqq->soft_rt_next_start);
-	*interactive =
-		!in_burst &&
-		idle_for_long_time;
-	wr_or_deserves_wr = bfqd->low_latency &&
-		(bfqq->wr_coeff > 1 ||
-		 (bfq_bfqq_sync(bfqq) &&
-		  bfqq->bic && (*interactive || soft_rt)));
-
-	bfq_log_bfqq(bfqd, bfqq,
-		     "bfq_add_request: "
-		     "in_burst %d, "
-		     "soft_rt %d (next %lu), inter %d, bic %p",
-		     bfq_bfqq_in_large_burst(bfqq), soft_rt,
-		     bfqq->soft_rt_next_start,
-		     *interactive,
-		     bfqq->bic);
-
-	/*
-	 * Using the last flag, update budget and check whether bfqq
-	 * may want to preempt the in-service queue.
-	 */
-	bfqq_wants_to_preempt =
-		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
-						    arrived_in_time,
-						    wr_or_deserves_wr);
-
-	/*
-	 * If bfqq happened to be activated in a burst, but has been
-	 * idle for much more than an interactive queue, then we
-	 * assume that, in the overall I/O initiated in the burst, the
-	 * I/O associated with bfqq is finished. So bfqq does not need
-	 * to be treated as a queue belonging to a burst
-	 * anymore. Accordingly, we reset bfqq's in_large_burst flag
-	 * if set, and remove bfqq from the burst list if it's
-	 * there. We do not decrement burst_size, because the fact
-	 * that bfqq does not need to belong to the burst list any
-	 * more does not invalidate the fact that bfqq was created in
-	 * a burst.
-	 */
-	if (likely(!bfq_bfqq_just_created(bfqq)) &&
-	    idle_for_long_time &&
-	    time_is_before_jiffies(
-		    bfqq->budget_timeout +
-		    msecs_to_jiffies(10000))) {
-		hlist_del_init(&bfqq->burst_list_node);
-		bfq_clear_bfqq_in_large_burst(bfqq);
-	}
-
-	bfq_clear_bfqq_just_created(bfqq);
-
-	if (!bfq_bfqq_IO_bound(bfqq)) {
-		if (arrived_in_time) {
-			bfqq->requests_within_timer++;
-			if (bfqq->requests_within_timer >=
-			    bfqd->bfq_requests_within_timer)
-				bfq_mark_bfqq_IO_bound(bfqq);
-		} else
-			bfqq->requests_within_timer = 0;
-		bfq_log_bfqq(bfqd, bfqq, "requests in time %d",
-			     bfqq->requests_within_timer);
-	}
-
-	if (bfqd->low_latency) {
-		if (unlikely(time_is_after_jiffies(bfqq->split_time)))
-			/* wraparound */
-			bfqq->split_time =
-				jiffies - bfqd->bfq_wr_min_idle_time - 1;
-
-		if (time_is_before_jiffies(bfqq->split_time +
-					   bfqd->bfq_wr_min_idle_time)) {
-			bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
-							 old_wr_coeff,
-							 wr_or_deserves_wr,
-							 *interactive,
-							 in_burst,
-							 soft_rt);
-
-			if (old_wr_coeff != bfqq->wr_coeff)
-				bfqq->entity.prio_changed = 1;
-		}
-	}
-
-	bfqq->last_idle_bklogged = jiffies;
-	bfqq->service_from_backlogged = 0;
-	bfq_clear_bfqq_softrt_update(bfqq);
-
-	bfq_add_bfqq_busy(bfqd, bfqq);
-
-	/*
-	 * Expire in-service queue only if preemption may be needed
-	 * for guarantees. In this respect, the function
-	 * next_queue_may_preempt just checks a simple, necessary
-	 * condition, and not a sufficient condition based on
-	 * timestamps. In fact, for the latter condition to be
-	 * evaluated, timestamps would need first to be updated, and
-	 * this operation is quite costly (see the comments on the
-	 * function bfq_bfqq_update_budg_for_activation).
-	 */
-	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
-	    bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
-	    next_queue_may_preempt(bfqd)) {
-		struct bfq_queue *in_serv =
-			bfqd->in_service_queue;
-		BUG_ON(in_serv == bfqq);
-
-		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
-				false, BFQ_BFQQ_PREEMPTED);
-	}
-}
-
-static void bfq_add_request(struct request *rq)
-{
-	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-	struct bfq_data *bfqd = bfqq->bfqd;
-	struct request *next_rq, *prev;
-	unsigned int old_wr_coeff = bfqq->wr_coeff;
-	bool interactive = false;
-
-	bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s",
-		     blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A");
-
-	if (bfqq->wr_coeff > 1) /* queue is being weight-raised */
-		bfq_log_bfqq(bfqd, bfqq,
-			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
-			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
-			jiffies_to_msecs(bfqq->wr_cur_max_time),
-			bfqq->wr_coeff,
-			bfqq->entity.weight, bfqq->entity.orig_weight);
-
-	bfqq->queued[rq_is_sync(rq)]++;
-	bfqd->queued++;
-
-	elv_rb_add(&bfqq->sort_list, rq);
-
-	/*
-	 * Check if this request is a better next-to-serve candidate.
-	 */
-	prev = bfqq->next_rq;
-	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
-	BUG_ON(!next_rq);
-	bfqq->next_rq = next_rq;
-
-	/*
-	 * Adjust priority tree position, if next_rq changes.
-	 */
-	if (prev != bfqq->next_rq)
-		bfq_pos_tree_add_move(bfqd, bfqq);
-
-	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
-		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
-						 rq, &interactive);
-	else {
-		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
-		    time_is_before_jiffies(
-				bfqq->last_wr_start_finish +
-				bfqd->bfq_wr_min_inter_arr_async)) {
-			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
-			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-
-			bfqd->wr_busy_queues++;
-			BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues);
-			bfqq->entity.prio_changed = 1;
-			bfq_log_bfqq(bfqd, bfqq,
-				     "non-idle wrais starting, "
-				     "wr_max_time %u wr_busy %d",
-				     jiffies_to_msecs(bfqq->wr_cur_max_time),
-				     bfqd->wr_busy_queues);
-		}
-		if (prev != bfqq->next_rq)
-			bfq_updated_next_req(bfqd, bfqq);
-	}
-
-	/*
-	 * Assign jiffies to last_wr_start_finish in the following
-	 * cases:
-	 *
-	 * . if bfqq is not going to be weight-raised, because, for
-	 *   non weight-raised queues, last_wr_start_finish stores the
-	 *   arrival time of the last request; as of now, this piece
-	 *   of information is used only for deciding whether to
-	 *   weight-raise async queues
-	 *
-	 * . if bfqq is not weight-raised, because, if bfqq is now
-	 *   switching to weight-raised, then last_wr_start_finish
-	 *   stores the time when weight-raising starts
-	 *
-	 * . if bfqq is interactive, because, regardless of whether
-	 *   bfqq is currently weight-raised, the weight-raising
-	 *   period must start or restart (this case is considered
-	 *   separately because it is not detected by the above
-	 *   conditions, if bfqq is already weight-raised)
-	 *
-	 * last_wr_start_finish has to be updated also if bfqq is soft
-	 * real-time, because the weight-raising period is constantly
-	 * restarted on idle-to-busy transitions for these queues, but
-	 * this is already done in bfq_bfqq_handle_idle_busy_switch if
-	 * needed.
-	 */
-	if (bfqd->low_latency &&
-		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
-		bfqq->last_wr_start_finish = jiffies;
-}
-
-static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
-					  struct bio *bio)
-{
-	struct task_struct *tsk = current;
-	struct bfq_io_cq *bic;
-	struct bfq_queue *bfqq;
-
-	bic = bfq_bic_lookup(bfqd, tsk->io_context);
-	if (!bic)
-		return NULL;
-
-	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
-	if (bfqq)
-		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
-
-	return NULL;
-}
-
-static sector_t get_sdist(sector_t last_pos, struct request *rq)
-{
-	sector_t sdist = 0;
-
-	if (last_pos) {
-		if (last_pos < blk_rq_pos(rq))
-			sdist = blk_rq_pos(rq) - last_pos;
-		else
-			sdist = last_pos - blk_rq_pos(rq);
-	}
-
-	return sdist;
-}
-
-static void bfq_activate_request(struct request_queue *q, struct request *rq)
-{
-	struct bfq_data *bfqd = q->elevator->elevator_data;
-	bfqd->rq_in_driver++;
-}
-
-static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
-{
-	struct bfq_data *bfqd = q->elevator->elevator_data;
-
-	BUG_ON(bfqd->rq_in_driver == 0);
-	bfqd->rq_in_driver--;
-}
-
-static void bfq_remove_request(struct request *rq)
-{
-	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-	struct bfq_data *bfqd = bfqq->bfqd;
-	const int sync = rq_is_sync(rq);
-
-	BUG_ON(bfqq->entity.service > bfqq->entity.budget &&
-	       bfqq == bfqd->in_service_queue);
-
-	if (bfqq->next_rq == rq) {
-		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
-		bfq_updated_next_req(bfqd, bfqq);
-	}
-
-	if (rq->queuelist.prev != &rq->queuelist)
-		list_del_init(&rq->queuelist);
-	BUG_ON(bfqq->queued[sync] == 0);
-	bfqq->queued[sync]--;
-	bfqd->queued--;
-	elv_rb_del(&bfqq->sort_list, rq);
-
-	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
-		bfqq->next_rq = NULL;
-
-		BUG_ON(bfqq->entity.budget < 0);
-
-		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
-			BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */
-			bfq_del_bfqq_busy(bfqd, bfqq, false);
-			/*
-			 * bfqq emptied. In normal operation, when
-			 * bfqq is empty, bfqq->entity.service and
-			 * bfqq->entity.budget must contain,
-			 * respectively, the service received and the
-			 * budget used last time bfqq emptied. These
-			 * facts do not hold in this case, as at least
-			 * this last removal occurred while bfqq is
-			 * not in service. To avoid inconsistencies,
-			 * reset both bfqq->entity.service and
-			 * bfqq->entity.budget, if bfqq has still a
-			 * process that may issue I/O requests to it.
-			 */
-			bfqq->entity.budget = bfqq->entity.service = 0;
-		}
-
-		/*
-		 * Remove queue from request-position tree as it is empty.
-		 */
-		if (bfqq->pos_root) {
-			rb_erase(&bfqq->pos_node, bfqq->pos_root);
-			bfqq->pos_root = NULL;
-		}
-	}
-
-	if (rq->cmd_flags & REQ_META) {
-		BUG_ON(bfqq->meta_pending == 0);
-		bfqq->meta_pending--;
-	}
-	bfqg_stats_update_io_remove(bfqq_group(bfqq), req_op(rq),
-				    rq->cmd_flags);
-}
-
-static int bfq_merge(struct request_queue *q, struct request **req,
-		     struct bio *bio)
-{
-	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct request *__rq;
-
-	__rq = bfq_find_rq_fmerge(bfqd, bio);
-	if (__rq && elv_bio_merge_ok(__rq, bio)) {
-		*req = __rq;
-		return ELEVATOR_FRONT_MERGE;
-	}
-
-	return ELEVATOR_NO_MERGE;
-}
-
-static void bfq_merged_request(struct request_queue *q, struct request *req,
-			       int type)
-{
-	if (type == ELEVATOR_FRONT_MERGE &&
-	    rb_prev(&req->rb_node) &&
-	    blk_rq_pos(req) <
-	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
-				    struct request, rb_node))) {
-		struct bfq_queue *bfqq = RQ_BFQQ(req);
-		struct bfq_data *bfqd = bfqq->bfqd;
-		struct request *prev, *next_rq;
-
-		/* Reposition request in its sort_list */
-		elv_rb_del(&bfqq->sort_list, req);
-		elv_rb_add(&bfqq->sort_list, req);
-		/* Choose next request to be served for bfqq */
-		prev = bfqq->next_rq;
-		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
-					 bfqd->last_position);
-		BUG_ON(!next_rq);
-		bfqq->next_rq = next_rq;
-		/*
-		 * If next_rq changes, update both the queue's budget to
-		 * fit the new request and the queue's position in its
-		 * rq_pos_tree.
-		 */
-		if (prev != bfqq->next_rq) {
-			bfq_updated_next_req(bfqd, bfqq);
-			bfq_pos_tree_add_move(bfqd, bfqq);
-		}
-	}
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static void bfq_bio_merged(struct request_queue *q, struct request *req,
-			   struct bio *bio)
-{
-	bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio_op(bio),
-				    bio->bi_opf);
-}
-#endif
-
-static void bfq_merged_requests(struct request_queue *q, struct request *rq,
-				struct request *next)
-{
-	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
-
-	/*
-	 * If next and rq belong to the same bfq_queue and next is older
-	 * than rq, then reposition rq in the fifo (by substituting next
-	 * with rq). Otherwise, if next and rq belong to different
-	 * bfq_queues, never reposition rq: in fact, we would have to
-	 * reposition it with respect to next's position in its own fifo,
-	 * which would most certainly be too expensive with respect to
-	 * the benefits.
-	 */
-	if (bfqq == next_bfqq &&
-	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
-	    next->fifo_time < rq->fifo_time) {
-		list_del_init(&rq->queuelist);
-		list_replace_init(&next->queuelist, &rq->queuelist);
-		rq->fifo_time = next->fifo_time;
-	}
-
-	if (bfqq->next_rq == next)
-		bfqq->next_rq = rq;
-
-	bfq_remove_request(next);
-	bfqg_stats_update_io_merged(bfqq_group(bfqq), req_op(next),
-				    next->cmd_flags);
-}
-
-/* Must be called with bfqq != NULL */
-static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
-{
-	BUG_ON(!bfqq);
-
-	if (bfq_bfqq_busy(bfqq)) {
-		bfqq->bfqd->wr_busy_queues--;
-		BUG_ON(bfqq->bfqd->wr_busy_queues < 0);
-	}
-	bfqq->wr_coeff = 1;
-	bfqq->wr_cur_max_time = 0;
-	bfqq->last_wr_start_finish = jiffies;
-	/*
-	 * Trigger a weight change on the next invocation of
-	 * __bfq_entity_update_weight_prio.
-	 */
-	bfqq->entity.prio_changed = 1;
-	bfq_log_bfqq(bfqq->bfqd, bfqq,
-		     "end_wr: wrais ending at %lu, rais_max_time %u",
-		     bfqq->last_wr_start_finish,
-		     jiffies_to_msecs(bfqq->wr_cur_max_time));
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d",
-		     bfqq->bfqd->wr_busy_queues);
-}
-
-static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
-				    struct bfq_group *bfqg)
-{
-	int i, j;
-
-	for (i = 0; i < 2; i++)
-		for (j = 0; j < IOPRIO_BE_NR; j++)
-			if (bfqg->async_bfqq[i][j])
-				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
-	if (bfqg->async_idle_bfqq)
-		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
-}
-
-static void bfq_end_wr(struct bfq_data *bfqd)
-{
-	struct bfq_queue *bfqq;
-
-	spin_lock_irq(bfqd->queue->queue_lock);
-
-	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
-		bfq_bfqq_end_wr(bfqq);
-	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
-		bfq_bfqq_end_wr(bfqq);
-	bfq_end_wr_async(bfqd);
-
-	spin_unlock_irq(bfqd->queue->queue_lock);
-}
-
-static sector_t bfq_io_struct_pos(void *io_struct, bool request)
-{
-	if (request)
-		return blk_rq_pos(io_struct);
-	else
-		return ((struct bio *)io_struct)->bi_iter.bi_sector;
-}
-
-/*
- * Return 1 if the position of @io_struct (see bfq_io_struct_pos())
- * is at most BFQQ_CLOSE_THR away from @sector, 0 otherwise.
- */
-static int bfq_rq_close_to_sector(void *io_struct, bool request,
-				  sector_t sector)
-{
-	sector_t pos = bfq_io_struct_pos(io_struct, request);
-
-	return abs(pos - sector) <= BFQQ_CLOSE_THR;
-}
-
-/*
- * Search the rq_pos_tree of the group @bfqq belongs to for a queue
- * whose next request is located at, or close to, @sector.
- *
- * Returns the queue found, or NULL if the tree is empty or neither
- * the closest node nor its neighbour on the side of @sector passes
- * bfq_rq_close_to_sector().
- */
-static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
-					 struct bfq_queue *bfqq,
-					 sector_t sector)
-{
-	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
-	struct rb_node *parent, *node;
-	struct bfq_queue *__bfqq;
-
-	if (RB_EMPTY_ROOT(root))
-		return NULL;
-
-	/*
-	 * First, if we find a request starting at the end of the last
-	 * request, choose it.
-	 */
-	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
-	if (__bfqq)
-		return __bfqq;
-
-	/*
-	 * If the exact sector wasn't found, the parent of the NULL leaf
-	 * will contain the closest sector (rq_pos_tree sorted by
-	 * next_request position).
-	 */
-	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);
-	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
-		return __bfqq;
-
-	/*
-	 * The closest node is too far: try its neighbour lying on the
-	 * side of @sector (successor if the node is below @sector,
-	 * predecessor otherwise).
-	 */
-	if (blk_rq_pos(__bfqq->next_rq) < sector)
-		node = rb_next(&__bfqq->pos_node);
-	else
-		node = rb_prev(&__bfqq->pos_node);
-	if (!node)
-		return NULL;
-
-	__bfqq = rb_entry(node, struct bfq_queue, pos_node);
-	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
-		return __bfqq;
-
-	return NULL;
-}
-
-/*
- * Return a queue, other than @cur_bfqq, found by bfqq_find_close()
- * for @sector, or NULL if there is none.
- */
-static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
-						   struct bfq_queue *cur_bfqq,
-						   sector_t sector)
-{
-	struct bfq_queue *candidate;
-
-	/*
-	 * Queues working closely on the same area of the device are
-	 * cooperating. Grouping them together lets us 1) avoid wasting
-	 * time idling, and 2) serve the union of their requests in the
-	 * best possible order for throughput.
-	 */
-	candidate = bfqq_find_close(bfqd, cur_bfqq, sector);
-
-	/* A queue is never reported as its own cooperator. */
-	return candidate == cur_bfqq ? NULL : candidate;
-}
-
-/*
- * Schedule the merge of @bfqq into @new_bfqq or, if @new_bfqq is
- * itself scheduled to be merged, into the queue at the end of its
- * ->new_bfqq chain.
- *
- * Returns the queue @bfqq has been redirected to, or NULL if no merge
- * was scheduled: either queue has no process references, or the chain
- * leads back to @bfqq.
- */
-static struct bfq_queue *
-bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
-{
-	int process_refs, new_process_refs;
-	struct bfq_queue *__bfqq;
-
-	/*
-	 * If there are no process references on the new_bfqq, then it is
-	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
-	 * may have dropped their last reference (not just their last process
-	 * reference).
-	 */
-	if (!bfqq_process_refs(new_bfqq))
-		return NULL;
-
-	/* Avoid a circular list and skip interim queue merges. */
-	while ((__bfqq = new_bfqq->new_bfqq)) {
-		if (__bfqq == bfqq)
-			return NULL;
-		new_bfqq = __bfqq;
-	}
-
-	process_refs = bfqq_process_refs(bfqq);
-	new_process_refs = bfqq_process_refs(new_bfqq);
-	/*
-	 * If the process for the bfqq has gone away, there is no
-	 * sense in merging the queues.
-	 */
-	if (process_refs == 0 || new_process_refs == 0)
-		return NULL;
-
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
-		new_bfqq->pid);
-
-	/*
-	 * Merging is just a redirection: the requests of the process
-	 * owning one of the two queues are redirected to the other queue.
-	 * The latter queue, in its turn, is set as shared if this is the
-	 * first time that the requests of some process are redirected to
-	 * it.
-	 *
-	 * We redirect bfqq to new_bfqq and not the opposite, because we
-	 * are in the context of the process owning bfqq, hence we have
-	 * the io_cq of this process. So we can immediately configure this
-	 * io_cq to redirect the requests of the process to new_bfqq.
-	 *
-	 * NOTE, even if new_bfqq coincides with the in-service queue, the
-	 * io_cq of new_bfqq is not available, because, if the in-service
-	 * queue is shared, bfqd->in_service_bic may not point to the
-	 * io_cq of the in-service queue.
-	 * Redirecting the requests of the process owning bfqq to the
-	 * currently in-service queue is in any case the best option, as
-	 * we feed the in-service queue with new requests close to the
-	 * last request served and, by doing so, hopefully increase the
-	 * throughput.
-	 */
-	bfqq->new_bfqq = new_bfqq;
-	/* The target gains one reference per process reference of bfqq. */
-	new_bfqq->ref += process_refs;
-	return new_bfqq;
-}
-
-/*
- * Return true if @bfqq and @new_bfqq are eligible to be merged as
- * close cooperators: neither is in the idle class, both have the same
- * ioprio class, neither is seeky, and both are sync queues.
- */
-static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
-					struct bfq_queue *new_bfqq)
-{
-	/* Idle-class queues never take part in a merge. */
-	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq))
-		return false;
-
-	/* The two queues must belong to the same ioprio class. */
-	if (bfqq->ioprio_class != new_bfqq->ioprio_class)
-		return false;
-
-	/*
-	 * A queue already detected as seeky is unlikely to produce
-	 * sequential I/O once merged with the other one.
-	 */
-	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
-		return false;
-
-	/*
-	 * Interleaved I/O is known to be done by (some) applications
-	 * only for reads, so merging async queues makes no sense: both
-	 * queues must be sync.
-	 */
-	return bfq_bfqq_sync(bfqq) && bfq_bfqq_sync(new_bfqq);
-}
-
-/*
- * Tell whether bfqq must be kept out of merging. True cooperation
- * shows up very early after processes start doing I/O, whereas late
- * cooperations are usually accidental false positives; for a
- * weight-raised bfqq such false positives would degrade its latency
- * guarantees. Hence return true when bfqq is weight-raised and more
- * than 100 ms have elapsed since last_wr_start_finish.
- */
-static bool wr_from_too_long(struct bfq_queue *bfqq)
-{
-	unsigned long merge_deadline;
-
-	if (bfqq->wr_coeff <= 1)
-		return false;
-
-	merge_deadline = bfqq->last_wr_start_finish + msecs_to_jiffies(100);
-
-	return time_is_before_jiffies(merge_deadline);
-}
-
-/*
- * Attempt to schedule a merge of bfqq with the currently in-service
- * queue or with a close queue among the scheduled queues.  Return
- * NULL if no merge was scheduled, a pointer to the shared bfq_queue
- * structure otherwise.
- *
- * @io_struct is the request (if @request is true) or the bio (if
- * @request is false) whose position is used to look for a cooperator;
- * if it is NULL, no merge is attempted.
- *
- * The OOM queue is not allowed to participate to cooperation: in fact, since
- * the requests temporarily redirected to the OOM queue could be redirected
- * again to dedicated queues at any time, the state needed to correctly
- * handle merging with the OOM queue would be quite complex and expensive
- * to maintain. Besides, in such a critical condition as an out of memory,
- * the benefits of queue merging may be little relevant, or even negligible.
- *
- * Weight-raised queues can be merged only if their weight-raising
- * period has just started. In fact cooperating processes are usually
- * started together. Thus, with this filter we avoid false positives
- * that would jeopardize low-latency guarantees.
- *
- * WARNING: queue merging may impair fairness among non-weight raised
- * queues, for at least two reasons: 1) the original weight of a
- * merged queue may change during the merged state, 2) even being the
- * weight the same, a merged queue may be bloated with many more
- * requests than the ones produced by its originally-associated
- * process.
- */
-static struct bfq_queue *
-bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-		     void *io_struct, bool request)
-{
-	struct bfq_queue *in_service_bfqq, *new_bfqq;
-
-	/* A merge has already been scheduled for bfqq: stick to it. */
-	if (bfqq->new_bfqq)
-		return bfqq->new_bfqq;
-
-	/* Log only: bfqq is skipped solely because of its weight raising. */
-	if (io_struct && wr_from_too_long(bfqq) &&
-	    likely(bfqq != &bfqd->oom_bfqq))
-		bfq_log_bfqq(bfqd, bfqq,
-			     "would have looked for coop, but bfq%d wr",
-			bfqq->pid);
-
-	if (!io_struct ||
-	    wr_from_too_long(bfqq) ||
-	    unlikely(bfqq == &bfqd->oom_bfqq))
-		return NULL;
-
-	/* If there is only one backlogged queue, don't search. */
-	if (bfqd->busy_queues == 1)
-		return NULL;
-
-	in_service_bfqq = bfqd->in_service_queue;
-
-	/*
-	 * Log only: the in-service queue is a valid, non-OOM candidate
-	 * that is skipped below solely because it has been weight-raised
-	 * for too long. The OOM queue must be excluded here, as in the
-	 * other "would have" messages, because it is never a candidate.
-	 */
-	if (in_service_bfqq && in_service_bfqq != bfqq &&
-	    bfqd->in_service_bic && wr_from_too_long(in_service_bfqq)
-	    && likely(in_service_bfqq != &bfqd->oom_bfqq))
-		bfq_log_bfqq(bfqd, bfqq,
-		"would have tried merge with in-service-queue, but wr");
-
-	if (!in_service_bfqq || in_service_bfqq == bfqq ||
-	    !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
-	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
-		goto check_scheduled;
-
-	/*
-	 * Try the in-service queue first: the I/O must be close to the
-	 * last position served and both queues must have the same parent
-	 * entity.
-	 */
-	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
-	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
-	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
-		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
-		if (new_bfqq)
-			return new_bfqq;
-	}
-	/*
-	 * Check whether there is a cooperator among currently scheduled
-	 * queues. The only thing we need is that the bio/request is not
-	 * NULL, as we need it to establish whether a cooperator exists.
-	 */
-check_scheduled:
-	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
-			bfq_io_struct_pos(io_struct, request));
-
-	BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);
-
-	/* Log only: the candidate is rejected because of weight raising. */
-	if (new_bfqq && wr_from_too_long(new_bfqq) &&
-	    likely(new_bfqq != &bfqd->oom_bfqq) &&
-	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
-		bfq_log_bfqq(bfqd, bfqq,
-			     "would have merged with bfq%d, but wr",
-			     new_bfqq->pid);
-
-	if (new_bfqq && !wr_from_too_long(new_bfqq) &&
-	    likely(new_bfqq != &bfqd->oom_bfqq) &&
-	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
-		return bfq_setup_merge(bfqq, new_bfqq);
-
-	return NULL;
-}
-
-/*
- * Save, into the bic of @bfqq, the state of @bfqq that must survive a
- * merge: the has_short_ttime, IO_bound and in_large_burst flags,
- * whether the queue is on a burst list, and its weight-raising
- * parameters. Does nothing if @bfqq has no bic.
- */
-static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
-{
-	struct bfq_io_cq *bic = bfqq->bic;
-
-	/*
-	 * If !bfqq->bic, the queue is already shared or its requests
-	 * have already been redirected to a shared queue; both idle window
-	 * and weight raising state have already been saved. Do nothing.
-	 */
-	if (!bic)
-		return;
-
-	bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
-	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
-	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
-	/* The queue is in a burst list iff its list node is hashed. */
-	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
-	bic->saved_wr_coeff = bfqq->wr_coeff;
-	bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
-	bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
-	bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
-	/* The saved weight-raising timestamp must not lie in the future. */
-	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
-}
-
-/*
- * Take a reference on the io_context behind the bic of @bfqq, if
- * @bfqq has a bic; otherwise do nothing.
- */
-static void bfq_get_bic_reference(struct bfq_queue *bfqq)
-{
-	struct bfq_io_cq *bic = bfqq->bic;
-
-	/*
-	 * A non-NULL bfqq->bic means that the bic it points to is about
-	 * to begin using a shared bfq_queue; with no bic there is nothing
-	 * to pin.
-	 */
-	if (!bic)
-		return;
-
-	atomic_long_inc(&bic->icq.ioc->refcount);
-}
-
-/*
- * Merge @bfqq into @new_bfqq: save the state of both queues, let
- * @new_bfqq inherit the IO_bound flag and (if it is not weight-raised
- * itself) the weight raising of @bfqq, redirect @bic to @new_bfqq,
- * mark @new_bfqq as a coop queue, and drop the process reference to
- * @bfqq.
- */
-static void
-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
-		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
-{
-	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
-		     (unsigned long) new_bfqq->pid);
-	/* Save weight raising and idle window of the merged queues */
-	bfq_bfqq_save_state(bfqq);
-	bfq_bfqq_save_state(new_bfqq);
-	/* IO_bound moves from bfqq to new_bfqq. */
-	if (bfq_bfqq_IO_bound(bfqq))
-		bfq_mark_bfqq_IO_bound(new_bfqq);
-	bfq_clear_bfqq_IO_bound(bfqq);
-
-	/*
-	 * If bfqq is weight-raised, then let new_bfqq inherit
-	 * weight-raising. To reduce false positives, neglect the case
-	 * where bfqq has just been created, but has not yet made it
-	 * to be weight-raised (which may happen because EQM may merge
-	 * bfqq even before bfq_add_request is executed for the first
-	 * time for bfqq). Handling this case would however be very
-	 * easy, thanks to the flag just_created.
-	 */
-	if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
-		new_bfqq->wr_coeff = bfqq->wr_coeff;
-		new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
-		new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
-		new_bfqq->wr_start_at_switch_to_srt =
-			bfqq->wr_start_at_switch_to_srt;
-		/* A busy queue that becomes weight-raised is counted. */
-		if (bfq_bfqq_busy(new_bfqq)) {
-			bfqd->wr_busy_queues++;
-			BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues);
-		}
-
-		new_bfqq->entity.prio_changed = 1;
-		bfq_log_bfqq(bfqd, new_bfqq,
-			     "wr start after merge with %d, rais_max_time %u",
-			     bfqq->pid,
-			     jiffies_to_msecs(bfqq->wr_cur_max_time));
-	}
-
-	if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
-		bfqq->wr_coeff = 1;
-		bfqq->entity.prio_changed = 1;
-		if (bfq_bfqq_busy(bfqq)) {
-			bfqd->wr_busy_queues--;
-			BUG_ON(bfqd->wr_busy_queues < 0);
-		}
-
-	}
-
-	bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
-		     bfqd->wr_busy_queues);
-
-	/*
-	 * Grab a reference to the bic, to prevent it from being destroyed
-	 * before being possibly touched by a bfq_split_bfqq().
-	 */
-	bfq_get_bic_reference(bfqq);
-	bfq_get_bic_reference(new_bfqq);
-	/*
-	 * Merge queues (that is, let bic redirect its requests to new_bfqq)
-	 */
-	bic_set_bfqq(bic, new_bfqq, 1);
-	bfq_mark_bfqq_coop(new_bfqq);
-	/*
-	 * new_bfqq now belongs to at least two bics (it is a shared queue):
-	 * set new_bfqq->bic to NULL. bfqq either:
-	 * - does not belong to any bic any more, and hence bfqq->bic must
-	 *   be set to NULL, or
-	 * - is a queue whose owning bics have already been redirected to a
-	 *   different queue, hence the queue is destined to not belong to
-	 *   any bic soon and bfqq->bic is already NULL (therefore the next
-	 *   assignment causes no harm).
-	 */
-	new_bfqq->bic = NULL;
-	bfqq->bic = NULL;
-	/* release process reference to bfqq */
-	bfq_put_queue(bfqq);
-}
-
-/*
- * Decide whether @bio may be merged into @rq.
- *
- * Returns false if @bio is sync while @rq is not, or if the current
- * task has no bic for this device. Otherwise returns whether @rq is
- * queued in the bfq_queue @bio would be queued in. As a side effect,
- * that queue may be merged with a cooperator found by
- * bfq_setup_cooperator().
- */
-static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
-			       struct bio *bio)
-{
-	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct bfq_io_cq *bic;
-	struct bfq_queue *bfqq, *new_bfqq;
-
-	/*
-	 * Disallow merge of a sync bio into an async request.
-	 */
-	if (bfq_bio_sync(bio) && !rq_is_sync(rq))
-		return false;
-
-	/*
-	 * Lookup the bfqq that this bio will be queued with. Allow
-	 * merge only if rq is queued there.
-	 * Queue lock is held here.
-	 */
-	bic = bfq_bic_lookup(bfqd, current->io_context);
-	if (!bic)
-		return false;
-
-	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
-	/*
-	 * We take advantage of this function to perform an early merge
-	 * of the queues of possible cooperating processes.
-	 */
-	if (bfqq) {
-		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
-		if (new_bfqq) {
-			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
-			/*
-			 * If we get here, the bio will be queued in the
-			 * shared queue, i.e., new_bfqq, so use new_bfqq
-			 * to decide whether bio and rq can be merged.
-			 */
-			bfqq = new_bfqq;
-		}
-	}
-
-	/* With no bfqq (NULL) this is false whenever rq has a queue. */
-	return bfqq == RQ_BFQQ(rq);
-}
-
-/*
- * Allow merging @next into @rq only if both requests are associated
- * with the same bfq_queue.
- */
-static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq,
-			      struct request *next)
-{
-	struct bfq_queue *rq_queue = RQ_BFQQ(rq);
-	struct bfq_queue *next_queue = RQ_BFQQ(next);
-
-	return rq_queue == next_queue;
-}
-
-/*
- * Record the start of a new budget for @bfqq and compute the time by
- * which the in-service queue must have consumed it. Bounding this
- * time keeps seeky processes from lowering the throughput: in
- * practice, seeky processes are served with a time-slice scheme.
- */
-static void bfq_set_budget_timeout(struct bfq_data *bfqd,
-				   struct bfq_queue *bfqq)
-{
-	/*
-	 * The base timeout is used as is when the weight-raising
-	 * duration of the queue equals bfq_wr_rt_max_time; otherwise it
-	 * is scaled by the ratio between current and original weight.
-	 */
-	unsigned int timeout_coeff = 1;
-
-	if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time)
-		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
-
-	bfqd->last_budget_start = ktime_get();
-	bfqq->budget_timeout = jiffies + bfqd->bfq_timeout * timeout_coeff;
-
-	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
-		     jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
-}
-
-/*
- * Make @bfqq the in-service queue of @bfqd; @bfqq may be NULL, in
- * which case the in-service queue is just cleared.
- *
- * For a non-NULL @bfqq this also updates group statistics and queue
- * flags, updates bfqd->budgets_assigned, possibly moves forward the
- * start of the weight-raising period of a soft real-time queue, and
- * sets the budget timeout.
- */
-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
-				       struct bfq_queue *bfqq)
-{
-	if (bfqq) {
-		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
-		bfq_mark_bfqq_must_alloc(bfqq);
-		bfq_clear_bfqq_fifo_expire(bfqq);
-
-		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
-
-		/* bfqq must not already be in service and must have requests. */
-		BUG_ON(bfqq == bfqd->in_service_queue);
-		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
-
-		if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
-		    bfqq->wr_coeff > 1 &&
-		    bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
-		    time_is_before_jiffies(bfqq->budget_timeout)) {
-			/*
-			 * For soft real-time queues, move the start
-			 * of the weight-raising period forward by the
-			 * time the queue has not received any
-			 * service. Otherwise, a relatively long
-			 * service delay is likely to cause the
-			 * weight-raising period of the queue to end,
-			 * because of the short duration of the
-			 * weight-raising period of a soft real-time
-			 * queue.  It is worth noting that this move
-			 * is not so dangerous for the other queues,
-			 * because soft real-time queues are not
-			 * greedy.
-			 *
-			 * To not add a further variable, we use the
-			 * overloaded field budget_timeout to
-			 * determine for how long the queue has not
-			 * received service, i.e., how much time has
-			 * elapsed since the queue expired. However,
-			 * this is a little imprecise, because
-			 * budget_timeout is set to jiffies if bfqq
-			 * not only expires, but also remains with no
-			 * request.
-			 */
-			if (time_after(bfqq->budget_timeout,
-				       bfqq->last_wr_start_finish))
-				bfqq->last_wr_start_finish +=
-					jiffies - bfqq->budget_timeout;
-			else
-				bfqq->last_wr_start_finish = jiffies;
-
-			/*
-			 * Safety net: if the adjusted start ended up in
-			 * the future, report it and clamp it to now.
-			 */
-			if (time_is_after_jiffies(bfqq->last_wr_start_finish)) {
-			       pr_crit(
-			       "BFQ WARNING:last %lu budget %lu jiffies %lu",
-			       bfqq->last_wr_start_finish,
-			       bfqq->budget_timeout,
-			       jiffies);
-			       pr_crit("diff %lu", jiffies -
-				       max_t(unsigned long,
-					     bfqq->last_wr_start_finish,
-					     bfqq->budget_timeout));
-			       bfqq->last_wr_start_finish = jiffies;
-			}
-		}
-
-		bfq_set_budget_timeout(bfqd, bfqq);
-		bfq_log_bfqq(bfqd, bfqq,
-			     "set_in_service_queue, cur-budget = %d",
-			     bfqq->entity.budget);
-	} else
-		bfq_log(bfqd, "set_in_service_queue: NULL");
-
-	bfqd->in_service_queue = bfqq;
-}
-
-/*
- * Pick the next queue to serve through bfq_get_next_queue(), install
- * it as the in-service queue and return it. The result is whatever
- * bfq_get_next_queue() returned.
- */
-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
-{
-	struct bfq_queue *next_queue;
-
-	next_queue = bfq_get_next_queue(bfqd);
-	__bfq_set_in_service_queue(bfqd, next_queue);
-
-	return next_queue;
-}
-
-/*
- * Start idling on the in-service queue of @bfqd: mark the queue as
- * waiting for a request and arm the idle slice timer. The queue must
- * have no queued requests. Nothing is done if there is no in-service
- * bic or its io_context has no active references.
- */
-static void bfq_arm_slice_timer(struct bfq_data *bfqd)
-{
-	struct bfq_queue *bfqq = bfqd->in_service_queue;
-	struct bfq_io_cq *bic;
-	u32 sl;
-
-	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
-
-	/* Processes have exited, don't wait. */
-	bic = bfqd->in_service_bic;
-	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
-		return;
-
-	bfq_mark_bfqq_wait_request(bfqq);
-
-	/*
-	 * We don't want to idle for seeks, but we do want to allow
-	 * fair distribution of slice time for a process doing back-to-back
-	 * seeks. So allow a little bit of time for it to submit a new rq.
-	 *
-	 * To prevent processes with (partly) seeky workloads from
-	 * being too ill-treated, grant them a small fraction of the
-	 * assigned budget before reducing the waiting time to
-	 * BFQ_MIN_TT. This happened to help reduce latency.
-	 */
-	sl = bfqd->bfq_slice_idle;
-	/*
-	 * Unless the queue is being weight-raised or the scenario is
-	 * asymmetric, grant only minimum idle time if the queue
-	 * is seeky. A long idling is preserved for a weight-raised
-	 * queue, or, more in general, in an asymmetric scenario,
-	 * because a long idling is needed for guaranteeing to a queue
-	 * its reserved share of the throughput (in particular, it is
-	 * needed if the queue has a higher weight than some other
-	 * queue).
-	 */
-	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
-	    bfq_symmetric_scenario(bfqd))
-		sl = min_t(u32, sl, BFQ_MIN_TT);
-
-	/* sl is a duration in nanoseconds, used as a relative timeout. */
-	bfqd->last_idling_start = ktime_get();
-	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
-		      HRTIMER_MODE_REL);
-	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
-	bfq_log(bfqd, "arm idle: %ld/%ld ms",
-		sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
-}
-
-/*
- * Compute max_budget for autotuning mode: the amount of sectors
- * transferred in one timeout at the estimated peak rate. With such a
- * budget BFQ can use a full timeslice even when the in-service queue
- * is served at peak rate, which maximises throughput with sequential
- * workloads.
- */
-static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
-{
-	/* peak_rate is a fixed-point value scaled by BFQ_RATE_SHIFT bits. */
-	u64 sectors = (u64)bfqd->peak_rate * USEC_PER_MSEC;
-
-	sectors *= jiffies_to_msecs(bfqd->bfq_timeout);
-
-	return sectors >> BFQ_RATE_SHIFT;
-}
-
-/*
- * Update parameters related to throughput and responsiveness, as a
- * function of the estimated peak rate. See comments on
- * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
- *
- * Two things are updated: bfq_max_budget, unless the user has set a
- * max budget, and the device speed class together with RT_prod,
- * when the peak rate crosses the threshold for the device type.
- */
-static void update_thr_responsiveness_params(struct bfq_data *bfqd)
-{
-	/* Index into the per-device-type tables: 0 = ROT, 1 = NONROT. */
-	int dev_type = blk_queue_nonrot(bfqd->queue);
-
-	/* bfq_user_max_budget == 0 means autotuning of max_budget. */
-	if (bfqd->bfq_user_max_budget == 0) {
-		bfqd->bfq_max_budget =
-			bfq_calc_max_budget(bfqd);
-		BUG_ON(bfqd->bfq_max_budget < 0);
-		bfq_log(bfqd, "new max_budget = %d",
-			bfqd->bfq_max_budget);
-	}
-
-	/* Switch speed class only when the threshold is crossed. */
-	if (bfqd->device_speed == BFQ_BFQD_FAST &&
-	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
-		bfqd->device_speed = BFQ_BFQD_SLOW;
-		bfqd->RT_prod = R_slow[dev_type] *
-			T_slow[dev_type];
-	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
-		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
-		bfqd->device_speed = BFQ_BFQD_FAST;
-		bfqd->RT_prod = R_fast[dev_type] *
-			T_fast[dev_type];
-	}
-
-	bfq_log(bfqd,
-"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
-		dev_type == 0 ? "ROT" : "NONROT",
-		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
-		bfqd->device_speed == BFQ_BFQD_FAST ?
-		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
-		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
-		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
-		BFQ_RATE_SHIFT);
-}
-
-/*
- * Restart the observation interval used for peak-rate estimation.
- *
- * With a non-NULL @rq, the interval starts now and @rq counts as its
- * first sample. With a NULL @rq, only the sample counter is cleared,
- * so that bfq_update_peak_rate() re-initializes everything on the
- * next dispatch.
- */
-static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
-{
-	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
-		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ;
-		bfqd->peak_rate_samples = 1;
-		bfqd->sequential_samples = 0;
-		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
-			blk_rq_sectors(rq);
-	} else /* no new rq dispatched, just reset the number of samples */
-		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
-
-	bfq_log(bfqd,
-		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
-		bfqd->peak_rate_samples, bfqd->sequential_samples,
-		bfqd->tot_sectors_dispatched);
-}
-
-/*
- * Close the current observation interval: if enough samples and time
- * have been collected, and the measured rate is acceptable, fold the
- * measured rate into bfqd->peak_rate with a low-pass filter and
- * refresh the parameters that depend on it. In every case, finish by
- * restarting the rate computation through
- * bfq_reset_rate_computation(), to which @rq is passed unchanged.
- */
-static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
-{
-	u32 rate, weight, divisor;
-
-	/*
-	 * For the convergence property to hold (see comments on
-	 * bfq_update_peak_rate()) and for the assessment to be
-	 * reliable, a minimum number of samples must be present, and
-	 * a minimum amount of time must have elapsed. If not so, do
-	 * not compute new rate. Just reset parameters, to get ready
-	 * for a new evaluation attempt.
-	 */
-	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
-	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
-		bfq_log(bfqd,
-	"update_rate_reset: only resetting, delta_first %lluus samples %d",
-			bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
-		goto reset_computation;
-	}
-
-	/*
-	 * If a new request completion has occurred after last
-	 * dispatch, then, to approximate the rate at which requests
-	 * have been served by the device, it is more precise to
-	 * extend the observation interval to the last completion.
-	 */
-	bfqd->delta_from_first =
-		max_t(u64, bfqd->delta_from_first,
-		      bfqd->last_completion - bfqd->first_dispatch);
-
-	BUG_ON(bfqd->delta_from_first == 0);
-	/*
-	 * Rate computed in sects/usec, and not sects/nsec, for
-	 * precision issues.
-	 */
-	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
-			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
-
-	bfq_log(bfqd,
-"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
-		bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
-		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
-		rate > 20<<BFQ_RATE_SHIFT);
-
-	/*
-	 * Peak rate not updated if:
-	 * - the percentage of sequential dispatches is below 3/4 of the
-	 *   total, and rate is below the current estimated peak rate
-	 * - rate is unreasonably high (> 20M sectors/sec)
-	 */
-	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
-	     rate <= bfqd->peak_rate) ||
-		rate > 20<<BFQ_RATE_SHIFT) {
-		bfq_log(bfqd,
-		"update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
-		bfqd->peak_rate_samples, bfqd->sequential_samples,
-		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
-		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
-		goto reset_computation;
-	} else {
-		bfq_log(bfqd,
-		"update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
-		bfqd->peak_rate_samples, bfqd->sequential_samples,
-		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
-		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
-	}
-
-	/*
-	 * We have to update the peak rate, at last! To this purpose,
-	 * we use a low-pass filter. We compute the smoothing constant
-	 * of the filter as a function of the 'weight' of the new
-	 * measured rate.
-	 *
-	 * As can be seen in next formulas, we define this weight as a
-	 * quantity proportional to how sequential the workload is,
-	 * and to how long the observation time interval is.
-	 *
-	 * The weight runs from 0 to 8. The maximum value of the
-	 * weight, 8, yields the minimum value for the smoothing
-	 * constant. At this minimum value for the smoothing constant,
-	 * the measured rate contributes for half of the next value of
-	 * the estimated peak rate.
-	 *
-	 * So, the first step is to compute the weight as a function
-	 * of how sequential the workload is. Note that the weight
-	 * cannot reach 9, because bfqd->sequential_samples cannot
-	 * become equal to bfqd->peak_rate_samples, which, in its
-	 * turn, holds true because bfqd->sequential_samples is not
-	 * incremented for the first sample.
-	 */
-	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
-
-	/*
-	 * Second step: further refine the weight as a function of the
-	 * duration of the observation interval.
-	 */
-	weight = min_t(u32, 8,
-		       div_u64(weight * bfqd->delta_from_first,
-			       BFQ_RATE_REF_INTERVAL));
-
-	/*
-	 * Divisor ranging from 10, for minimum weight, to 2, for
-	 * maximum weight.
-	 */
-	divisor = 10 - weight;
-	BUG_ON(divisor == 0);
-
-	/*
-	 * Finally, update peak rate:
-	 *
-	 * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
-	 */
-	bfqd->peak_rate *= divisor-1;
-	bfqd->peak_rate /= divisor;
-	rate /= divisor; /* smoothing constant alpha = 1/divisor */
-
-	bfq_log(bfqd,
-		"update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
-		divisor,
-		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
-		(u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
-
-	BUG_ON(bfqd->peak_rate == 0);
-	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
-
-	/* Add the new-sample term, then refresh the derived parameters. */
-	bfqd->peak_rate += rate;
-	update_thr_responsiveness_params(bfqd);
-	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
-
-reset_computation:
-	bfq_reset_rate_computation(bfqd, rq);
-}
-
-/*
- * Update the read/write peak rate (the main quantity used for
- * auto-tuning, see update_thr_responsiveness_params()).
- *
- * It is not trivial to estimate the peak rate (correctly): because of
- * the presence of sw and hw queues between the scheduler and the
- * device components that finally serve I/O requests, it is hard to
- * say exactly when a given dispatched request is served inside the
- * device, and for how long. As a consequence, it is hard to know
- * precisely at what rate a given set of requests is actually served
- * by the device.
- *
- * On the opposite end, the dispatch time of any request is trivially
- * available, and, from this piece of information, the "dispatch rate"
- * of requests can be immediately computed. So, the idea in the next
- * function is to use what is known, namely request dispatch times
- * (plus, when useful, request completion times), to estimate what is
- * unknown, namely in-device request service rate.
- *
- * The main issue is that, because of the above facts, the rate at
- * which a certain set of requests is dispatched over a certain time
- * interval can vary greatly with respect to the rate at which the
- * same requests are then served. But, since the size of any
- * intermediate queue is limited, and the service scheme is lossless
- * (no request is silently dropped), the following obvious convergence
- * property holds: the number of requests dispatched MUST become
- * closer and closer to the number of requests completed as the
- * observation interval grows. This is the key property used in
- * the next function to estimate the peak service rate as a function
- * of the observed dispatch rate. The function assumes to be invoked
- * on every request dispatch.
- *
- * @rq is the request being dispatched. On every path, last_position
- * and last_dispatch are updated from @rq and the current time before
- * returning.
- */
-static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
-{
-	u64 now_ns = ktime_get_ns();
-
-	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
-		bfq_log(bfqd,
-		"update_peak_rate: goto reset, samples %d",
-				bfqd->peak_rate_samples) ;
-		bfq_reset_rate_computation(bfqd, rq);
-		goto update_last_values; /* will add one sample */
-	}
-
-	/*
-	 * Device idle for very long: the observation interval lasting
-	 * up to this dispatch cannot be a valid observation interval
-	 * for computing a new peak rate (similarly to the late-
-	 * completion event in bfq_completed_request()). Go to
-	 * update_rate_and_reset to have the following three steps
-	 * taken:
-	 * - close the observation interval at the last (previous)
-	 *   request dispatch or completion
-	 * - compute rate, if possible, for that observation interval
-	 * - start a new observation interval with this dispatch
-	 */
-	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
-	    bfqd->rq_in_driver == 0) {
-		bfq_log(bfqd,
-"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
-			(now_ns - bfqd->last_dispatch)>>10,
-			bfqd->peak_rate_samples) ;
-		goto update_rate_and_reset;
-	}
-
-	/* Update sampling information */
-	bfqd->peak_rate_samples++;
-
-	/*
-	 * Count the sample as sequential only if the device is busy (or
-	 * completed a request less than BFQ_MIN_TT ago) and rq is within
-	 * BFQQ_SEEK_THR of the last dispatched position.
-	 */
-	if ((bfqd->rq_in_driver > 0 ||
-		now_ns - bfqd->last_completion < BFQ_MIN_TT)
-	     && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
-		bfqd->sequential_samples++;
-
-	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
-
-	/* Reset max observed rq size every 32 dispatches */
-	if (likely(bfqd->peak_rate_samples % 32))
-		bfqd->last_rq_max_size =
-			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
-	else
-		bfqd->last_rq_max_size = blk_rq_sectors(rq);
-
-	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
-
-	bfq_log(bfqd,
-	"update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
-		bfqd->peak_rate_samples, bfqd->sequential_samples,
-		bfqd->tot_sectors_dispatched,
-		bfqd->delta_from_first>>10);
-
-	/* Target observation interval not yet reached, go on sampling */
-	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
-		goto update_last_values;
-
-update_rate_and_reset:
-	bfq_update_rate_reset(bfqd, rq);
-update_last_values:
-	/* The position recorded is the sector right after the end of rq. */
-	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
-	bfqd->last_dispatch = now_ns;
-
-	bfq_log(bfqd,
-	"update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
-		(now_ns - bfqd->first_dispatch)>>10,
-		(unsigned long long) bfqd->last_position,
-		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
-	bfq_log(bfqd,
-	"update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
-}
-
-/*
- * Dispatch @rq: account for it in its bfq_queue, feed it to the peak
- * rate estimator, take it off the scheduler's internal lists and hand
- * it over to the dispatch list of the request queue @q.
- */
-static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
-{
-	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-
-	/*
-	 * Strictly speaking, the dispatched counter should be bumped
-	 * only after the request has been removed from the queue and
-	 * dispatched. It is bumped before bfq_remove_request() instead,
-	 * accepting a temporary inconsistency, for efficiency: in a
-	 * forced_dispatch this avoids that two counters related to
-	 * bfqq->dispatched get uselessly decremented when bfqq is not
-	 * in service, and then incremented again once bfqq->dispatched
-	 * has been incremented.
-	 */
-	bfqq->dispatched++;
-	bfq_update_peak_rate(bfqd, rq);
-
-	bfq_remove_request(rq);
-	elv_dispatch_sort(q, rq);
-}
-
-/*
- * Expire @bfqq, which must be the in-service queue of @bfqd. A queue
- * with no queued requests is removed from the busy queues; otherwise
- * it is requeued and repositioned in the tree of potential close
- * cooperators. Finally, the in-service state of @bfqd is reset.
- */
-static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	BUG_ON(bfqq != bfqd->in_service_queue);
-
-	/*
-	 * If this bfqq is shared between multiple processes, check
-	 * to make sure that those processes are still issuing I/Os
-	 * within the mean seek distance. If not, it may be time to
-	 * break the queues apart again.
-	 */
-	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
-		bfq_mark_bfqq_split_coop(bfqq);
-
-	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
-		if (bfqq->dispatched == 0)
-			/*
-			 * Overloading budget_timeout field to store
-			 * the time at which the queue remains with no
-			 * backlog and no outstanding request; used by
-			 * the weight-raising mechanism.
-			 */
-			bfqq->budget_timeout = jiffies;
-
-		bfq_del_bfqq_busy(bfqd, bfqq, true);
-	} else {
-		bfq_requeue_bfqq(bfqd, bfqq);
-		/*
-		 * Resort priority tree of potential close cooperators.
-		 */
-		bfq_pos_tree_add_move(bfqd, bfqq);
-	}
-
-	/*
-	 * All in-service entities must have been properly deactivated
-	 * or requeued before executing the next function, which
-	 * resets all in-service entities as no more in service.
-	 */
-	__bfq_bfqd_reset_in_service(bfqd);
-}
-
-/**
- * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
- * @bfqd: device data.
- * @bfqq: queue to update.
- * @reason: reason for expiration.
- *
- * Handle the feedback on @bfqq budget at queue expiration.
- * See the body for detailed comments.
- */
-static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
-				     struct bfq_queue *bfqq,
-				     enum bfqq_expiration reason)
-{
-	struct request *next_rq;
-	int budget, min_budget;
-
-	BUG_ON(bfqq != bfqd->in_service_queue);
-
-	min_budget = bfq_min_budget(bfqd);
-
-	if (bfqq->wr_coeff == 1)
-		budget = bfqq->max_budget;
-	else /*
-	      * Use a constant, low budget for weight-raised queues,
-	      * to help achieve a low latency. Keep it slightly higher
-	      * than the minimum possible budget, to cause a little
-	      * bit fewer expirations.
-	      */
-		budget = 2 * min_budget;
-
-	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
-		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
-	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
-		budget, bfq_min_budget(bfqd));
-	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
-		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
-
-	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
-		switch (reason) {
-		/*
-		 * Caveat: in all the following cases we trade latency
-		 * for throughput.
-		 */
-		case BFQ_BFQQ_TOO_IDLE:
-			/*
-			 * This is the only case where we may reduce
-			 * the budget: if there is no request of the
-			 * process still waiting for completion, then
-			 * we assume (tentatively) that the timer has
-			 * expired because the batch of requests of
-			 * the process could have been served with a
-			 * smaller budget.  Hence, betting that
-			 * process will behave in the same way when it
-			 * becomes backlogged again, we reduce its
-			 * next budget.  As long as we guess right,
-			 * this budget cut reduces the latency
-			 * experienced by the process.
-			 *
-			 * However, if there are still outstanding
-			 * requests, then the process may have not yet
-			 * issued its next request just because it is
-			 * still waiting for the completion of some of
-			 * the still outstanding ones.  So in this
-			 * subcase we do not reduce its budget, on the
-			 * contrary we increase it to possibly boost
-			 * the throughput, as discussed in the
-			 * comments to the BUDGET_TIMEOUT case.
-			 */
-			if (bfqq->dispatched > 0) /* still outstanding reqs */
-				budget = min(budget * 2, bfqd->bfq_max_budget);
-			else {
-				if (budget > 5 * min_budget)
-					budget -= 4 * min_budget;
-				else
-					budget = min_budget;
-			}
-			break;
-		case BFQ_BFQQ_BUDGET_TIMEOUT:
-			/*
-			 * We double the budget here because it gives
-			 * the chance to boost the throughput if this
-			 * is not a seeky process (and has bumped into
-			 * this timeout because of, e.g., ZBR).
-			 */
-			budget = min(budget * 2, bfqd->bfq_max_budget);
-			break;
-		case BFQ_BFQQ_BUDGET_EXHAUSTED:
-			/*
-			 * The process still has backlog, and did not
-			 * let either the budget timeout or the disk
-			 * idling timeout expire. Hence it is not
-			 * seeky, has a short thinktime and may be
-			 * happy with a higher budget too. So
-			 * definitely increase the budget of this good
-			 * candidate to boost the disk throughput.
-			 */
-			budget = min(budget * 4, bfqd->bfq_max_budget);
-			break;
-		case BFQ_BFQQ_NO_MORE_REQUESTS:
-			/*
-			 * For queues that expire for this reason, it
-			 * is particularly important to keep the
-			 * budget close to the actual service they
-			 * need. Doing so reduces the timestamp
-			 * misalignment problem described in the
-			 * comments in the body of
-			 * __bfq_activate_entity. In fact, suppose
-			 * that a queue systematically expires for
-			 * BFQ_BFQQ_NO_MORE_REQUESTS and presents a
-			 * new request in time to enjoy timestamp
-			 * back-shifting. The larger the budget of the
-			 * queue is with respect to the service the
-			 * queue actually requests in each service
-			 * slot, the more times the queue can be
-			 * reactivated with the same virtual finish
-			 * time. It follows that, even if this finish
-			 * time is pushed to the system virtual time
-			 * to reduce the consequent timestamp
-			 * misalignment, the queue unjustly enjoys for
-			 * many re-activations a lower finish time
-			 * than all newly activated queues.
-			 *
-			 * The service needed by bfqq is measured
-			 * quite precisely by bfqq->entity.service.
-			 * Since bfqq does not enjoy device idling,
-			 * bfqq->entity.service is equal to the number
-			 * of sectors that the process associated with
-			 * bfqq requested to read/write before waiting
-			 * for request completions, or blocking for
-			 * other reasons.
-			 */
-			budget = max_t(int, bfqq->entity.service, min_budget);
-			break;
-		default:
-			return;
-		}
-	} else if (!bfq_bfqq_sync(bfqq))
-		/*
-		 * Async queues get always the maximum possible
-		 * budget, as for them we do not care about latency
-		 * (in addition, their ability to dispatch is limited
-		 * by the charging factor).
-		 */
-		budget = bfqd->bfq_max_budget;
-
-	bfqq->max_budget = budget;
-
-	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
-	    !bfqd->bfq_user_max_budget)
-		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
-
-	/*
-	 * If there is still backlog, then assign a new budget, making
-	 * sure that it is large enough for the next request.  Since
-	 * the finish time of bfqq must be kept in sync with the
-	 * budget, be sure to call __bfq_bfqq_expire() *after* this
-	 * update.
-	 *
-	 * If there is no backlog, then no need to update the budget;
-	 * it will be updated on the arrival of a new request.
-	 */
-	next_rq = bfqq->next_rq;
-	if (next_rq) {
-		BUG_ON(reason == BFQ_BFQQ_TOO_IDLE ||
-		       reason == BFQ_BFQQ_NO_MORE_REQUESTS);
-		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
-					    bfq_serv_to_charge(next_rq, bfqq));
-		BUG_ON(!bfq_bfqq_busy(bfqq));
-		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
-	}
-
-	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
-			next_rq ? blk_rq_sectors(next_rq) : 0,
-			bfqq->entity.budget);
-}
-
-/*
- * Return true if the process associated with bfqq is "slow". The slow
- * flag is used, in addition to the budget timeout, to reduce the
- * amount of service provided to seeky processes, and thus reduce
- * their chances to lower the throughput. More details in the comments
- * on the function bfq_bfqq_expire().
- *
- * An important observation is in order: as discussed in the comments
- * on the function bfq_update_peak_rate(), with devices with internal
- * queues, it is hard if ever possible to know when and for how long
- * an I/O request is processed by the device (apart from the trivial
- * I/O pattern where a new request is dispatched only after the
- * previous one has been completed). This makes it hard to evaluate
- * the real rate at which the I/O requests of each bfq_queue are
- * served.  In fact, for an I/O scheduler like BFQ, serving a
- * bfq_queue means just dispatching its requests during its service
- * slot (i.e., until the budget of the queue is exhausted, or the
- * queue remains idle, or, finally, a timeout fires). But, during the
- * service slot of a bfq_queue, around 100 ms at most, the device may
- * be even still processing requests of bfq_queues served in previous
- * service slots. On the opposite end, the requests of the in-service
- * bfq_queue may be completed after the service slot of the queue
- * finishes.
- *
- * Anyway, unless more sophisticated solutions are used
- * (where possible), the sum of the sizes of the requests dispatched
- * during the service slot of a bfq_queue is probably the only
- * approximation available for the service received by the bfq_queue
- * during its service slot. And this sum is the quantity used in this
- * function to evaluate the I/O speed of a process.
- */
-static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-				 bool compensate, enum bfqq_expiration reason,
-				 unsigned long *delta_ms)
-{
-	ktime_t delta_ktime;
-	u32 delta_usecs;
-	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
-
-	if (!bfq_bfqq_sync(bfqq))
-		return false;
-
-	if (compensate)
-		delta_ktime = bfqd->last_idling_start;
-	else
-		delta_ktime = ktime_get();
-	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
-	delta_usecs = ktime_to_us(delta_ktime);
-
-	/* don't use too short time intervals */
-	if (delta_usecs < 1000) {
-		if (blk_queue_nonrot(bfqd->queue))
-			 /*
-			  * give same worst-case guarantees as idling
-			  * for seeky
-			  */
-			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
-		else /* charge at least one seek */
-			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
-
-		bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs);
-
-		return slow;
-	}
-
-	*delta_ms = delta_usecs / USEC_PER_MSEC;
-
-	/*
-	 * Use only long (> 20ms) intervals to filter out excessive
-	 * spikes in service rate estimation.
-	 */
-	if (delta_usecs > 20000) {
-		/*
-		 * Caveat for rotational devices: processes doing I/O
-		 * in the slower disk zones tend to be slow(er) even
-		 * if not seeky. In this respect, the estimated peak
-		 * rate is likely to be an average over the disk
-		 * surface. Accordingly, to not be too harsh with
-		 * unlucky processes, a process is deemed slow only if
-		 * its rate has been lower than half of the estimated
-		 * peak rate.
-		 */
-		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
-		bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d",
-			bfqq->entity.service, bfqd->bfq_max_budget);
-	}
-
-	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
-
-	return slow;
-}
-
-/*
- * To be deemed as soft real-time, an application must meet two
- * requirements. First, the application must not require an average
- * bandwidth higher than the approximate bandwidth required to playback or
- * record a compressed high-definition video.
- * The next function is invoked on the completion of the last request of a
- * batch, to compute the next-start time instant, soft_rt_next_start, such
- * that, if the next request of the application does not arrive before
- * soft_rt_next_start, then the above requirement on the bandwidth is met.
- *
- * The second requirement is that the request pattern of the application is
- * isochronous, i.e., that, after issuing a request or a batch of requests,
- * the application stops issuing new requests until all its pending requests
- * have been completed. After that, the application may issue a new batch,
- * and so on.
- * For this reason the next function is invoked to compute
- * soft_rt_next_start only for applications that meet this requirement,
- * whereas soft_rt_next_start is set to infinity for applications that do
- * not.
- *
- * Unfortunately, even a greedy application may happen to behave in an
- * isochronous way if the CPU load is high. In fact, the application may
- * stop issuing requests while the CPUs are busy serving other processes,
- * then restart, then stop again for a while, and so on. In addition, if
- * the disk achieves a low enough throughput with the request pattern
- * issued by the application (e.g., because the request pattern is random
- * and/or the device is slow), then the application may meet the above
- * bandwidth requirement too. To prevent such a greedy application to be
- * deemed as soft real-time, a further rule is used in the computation of
- * soft_rt_next_start: soft_rt_next_start must be higher than the current
- * time plus the maximum time for which the arrival of a request is waited
- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
- * This filters out greedy applications, as the latter issue instead their
- * next request as soon as possible after the last one has been completed
- * (in contrast, when a batch of requests is completed, a soft real-time
- * application spends some time processing data).
- *
- * Unfortunately, the last filter may easily generate false positives if
- * only bfqd->bfq_slice_idle is used as a reference time interval and one
- * or both the following cases occur:
- * 1) HZ is so low that the duration of a jiffy is comparable to or higher
- *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
- *    HZ=100.
- * 2) jiffies, instead of increasing at a constant rate, may stop increasing
- *    for a while, then suddenly 'jump' by several units to recover the lost
- *    increments. This seems to happen, e.g., inside virtual machines.
- * To address this issue, we do not use as a reference time interval just
- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
- * particular we add the minimum number of jiffies for which the filter
- * seems to be quite precise also in embedded systems and KVM/QEMU virtual
- * machines.
- */
-static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
-						struct bfq_queue *bfqq)
-{
-	bfq_log_bfqq(bfqd, bfqq,
-"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u",
-		     bfqq->service_from_backlogged,
-		     bfqd->bfq_wr_max_softrt_rate,
-		     jiffies_to_msecs(HZ * bfqq->service_from_backlogged /
-				      bfqd->bfq_wr_max_softrt_rate));
-
-	return max(bfqq->last_idle_bklogged +
-		   HZ * bfqq->service_from_backlogged /
-		   bfqd->bfq_wr_max_softrt_rate,
-		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
-}
-
-/*
- * Return the farthest future time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_greatest_from_now(void)
-{
-	return jiffies + MAX_JIFFY_OFFSET;
-}
-
-/*
- * Return the farthest past time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_smallest_from_now(void)
-{
-	return jiffies - MAX_JIFFY_OFFSET;
-}
-
-/**
- * bfq_bfqq_expire - expire a queue.
- * @bfqd: device owning the queue.
- * @bfqq: the queue to expire.
- * @compensate: if true, compensate for the time spent idling.
- * @reason: the reason causing the expiration.
- *
- * If the process associated with bfqq does slow I/O (e.g., because it
- * issues random requests), we charge bfqq with the time it has been
- * in service instead of the service it has received (see
- * bfq_bfqq_charge_time for details on how this goal is achieved). As
- * a consequence, bfqq will typically get higher timestamps upon
- * reactivation, and hence it will be rescheduled as if it had
- * received more service than what it has actually received. In the
- * end, bfqq receives less service in proportion to how slowly its
- * associated process consumes its budgets (and hence how seriously it
- * tends to lower the throughput). In addition, this time-charging
- * strategy guarantees time fairness among slow processes. In
- * contrast, if the process associated with bfqq is not slow, we
- * charge bfqq exactly with the service it has received.
- *
- * Charging time to the first type of queues and the exact service to
- * the other has the effect of using the WF2Q+ policy to schedule the
- * former on a timeslice basis, without violating service domain
- * guarantees among the latter.
- */
-static void bfq_bfqq_expire(struct bfq_data *bfqd,
-			    struct bfq_queue *bfqq,
-			    bool compensate,
-			    enum bfqq_expiration reason)
-{
-	bool slow;
-	unsigned long delta = 0;
-	struct bfq_entity *entity = &bfqq->entity;
-	int ref;
-
-	BUG_ON(bfqq != bfqd->in_service_queue);
-
-	/*
-	 * Check whether the process is slow (see bfq_bfqq_is_slow).
-	 */
-	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
-
-	/*
-	 * Increase service_from_backlogged before next statement,
-	 * because the possible next invocation of
-	 * bfq_bfqq_charge_time would likely inflate
-	 * entity->service. In contrast, service_from_backlogged must
-	 * contain real service, to enable the soft real-time
-	 * heuristic to correctly compute the bandwidth consumed by
-	 * bfqq.
-	 */
-	bfqq->service_from_backlogged += entity->service;
-
-	/*
-	 * As above explained, charge slow (typically seeky) and
-	 * timed-out queues with the time and not the service
-	 * received, to favor sequential workloads.
-	 *
-	 * Processes doing I/O in the slower disk zones will tend to
-	 * be slow(er) even if not seeky. Therefore, since the
-	 * estimated peak rate is actually an average over the disk
-	 * surface, these processes may timeout just for bad luck. To
-	 * avoid punishing them, do not charge time to processes that
-	 * succeeded in consuming at least 2/3 of their budget. This
-	 * allows BFQ to preserve enough elasticity to still perform
-	 * bandwidth, and not time, distribution with little unlucky
-	 * or quasi-sequential processes.
-	 */
-	if (bfqq->wr_coeff == 1 &&
-	    (slow ||
-	     (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
-	      bfq_bfqq_budget_left(bfqq) >=  entity->budget / 3)))
-		bfq_bfqq_charge_time(bfqd, bfqq, delta);
-
-	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
-	if (reason == BFQ_BFQQ_TOO_IDLE &&
-	    entity->service <= 2 * entity->budget / 10)
-		bfq_clear_bfqq_IO_bound(bfqq);
-
-	if (bfqd->low_latency && bfqq->wr_coeff == 1)
-		bfqq->last_wr_start_finish = jiffies;
-
-	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
-	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
-		/*
-		 * If we get here, and there are no outstanding
-		 * requests, then the request pattern is isochronous
-		 * (see the comments on the function
-		 * bfq_bfqq_softrt_next_start()). Thus we can compute
-		 * soft_rt_next_start. If, instead, the queue still
-		 * has outstanding requests, then we have to wait for
-		 * the completion of all the outstanding requests to
-		 * discover whether the request pattern is actually
-		 * isochronous.
-		 */
-		BUG_ON(bfqd->busy_queues < 1);
-		if (bfqq->dispatched == 0) {
-			bfqq->soft_rt_next_start =
-				bfq_bfqq_softrt_next_start(bfqd, bfqq);
-			bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu",
-				     bfqq->soft_rt_next_start);
-		} else {
-			/*
-			 * The application is still waiting for the
-			 * completion of one or more requests:
-			 * prevent it from possibly being incorrectly
-			 * deemed as soft real-time by setting its
-			 * soft_rt_next_start to infinity. In fact,
-			 * without this assignment, the application
-			 * would be incorrectly deemed as soft
-			 * real-time if:
-			 * 1) it issued a new request before the
-			 *    completion of all its in-flight
-			 *    requests, and
-			 * 2) at that time, its soft_rt_next_start
-			 *    happened to be in the past.
-			 */
-			bfqq->soft_rt_next_start =
-				bfq_greatest_from_now();
-			/*
-			 * Schedule an update of soft_rt_next_start to when
-			 * the task may be discovered to be isochronous.
-			 */
-			bfq_mark_bfqq_softrt_update(bfqq);
-		}
-	}
-
-	bfq_log_bfqq(bfqd, bfqq,
-		"expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)",
-		     reason, slow, bfqq->dispatched,
-		     bfq_bfqq_has_short_ttime(bfqq), entity->weight);
-
-	/*
-	 * Increase, decrease or leave budget unchanged according to
-	 * reason.
-	 */
-	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
-	BUG_ON(bfqq->next_rq == NULL &&
-	       bfqq->entity.budget < bfqq->entity.service);
-	ref = bfqq->ref;
-	__bfq_bfqq_expire(bfqd, bfqq);
-
-	BUG_ON(ref > 1 &&
-	       !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&
-		!bfq_class_idle(bfqq));
-
-	/* mark bfqq as waiting a request only if a bic still points to it */
-	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
-	    reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
-	    reason != BFQ_BFQQ_BUDGET_EXHAUSTED)
-		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
-}
-
-/*
- * Budget timeout is not implemented through a dedicated timer, but
- * just checked on request arrivals and completions, as well as on
- * idle timer expirations.
- */
-static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
-{
-	return time_is_before_eq_jiffies(bfqq->budget_timeout);
-}
-
-/*
- * If we expire a queue that is actively waiting (i.e., with the
- * device idled) for the arrival of a new request, then we may incur
- * the timestamp misalignment problem described in the body of the
- * function __bfq_activate_entity. Hence we return true only if this
- * condition does not hold, or if the queue is slow enough to deserve
- * only to be kicked off for preserving a high throughput.
- */
-static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
-{
-	bfq_log_bfqq(bfqq->bfqd, bfqq,
-		"may_budget_timeout: wait_request %d left %d timeout %d",
-		bfq_bfqq_wait_request(bfqq),
-			bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3,
-		bfq_bfqq_budget_timeout(bfqq));
-
-	return (!bfq_bfqq_wait_request(bfqq) ||
-		bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)
-		&&
-		bfq_bfqq_budget_timeout(bfqq);
-}
-
-/*
- * For a queue that becomes empty, device idling is allowed only if
- * this function returns true for that queue. As a consequence, since
- * device idling plays a critical role for both throughput boosting
- * and service guarantees, the return value of this function plays a
- * critical role as well.
- *
- * In a nutshell, this function returns true only if idling is
- * beneficial for throughput or, even if detrimental for throughput,
- * idling is however necessary to preserve service guarantees (low
- * latency, desired throughput distribution, ...). In particular, on
- * NCQ-capable devices, this function tries to return false, so as to
- * help keep the drives' internal queues full, whenever this helps the
- * device boost the throughput without causing any service-guarantee
- * issue.
- *
- * In more detail, the return value of this function is obtained by,
- * first, computing a number of boolean variables that take into
- * account throughput and service-guarantee issues, and, then,
- * combining these variables in a logical expression. Most of the
- * issues taken into account are not trivial. We discuss these issues
- * while introducing the variables.
- */
-static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
-{
-	struct bfq_data *bfqd = bfqq->bfqd;
-	bool rot_without_queueing =
-		!blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
-		bfqq_sequential_and_IO_bound,
-		idling_boosts_thr, idling_boosts_thr_without_issues,
-		idling_needed_for_service_guarantees,
-		asymmetric_scenario;
-
-	if (bfqd->strict_guarantees)
-		return true;
-
-	/*
-	 * Idling is performed only if slice_idle > 0. In addition, we
-	 * do not idle if
-	 * (a) bfqq is async
-	 * (b) bfqq is in the idle io prio class: in this case we do
-	 * not idle because we want to minimize the bandwidth that
-	 * queues in this class can steal to higher-priority queues
-	 */
-	if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
-	   bfq_class_idle(bfqq))
-		return false;
-
-	bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
-		bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
-	/*
-	 * The next variable takes into account the cases where idling
-	 * boosts the throughput.
-	 *
-	 * The value of the variable is computed considering, first, that
-	 * idling is virtually always beneficial for the throughput if:
-	 * (a) the device is not NCQ-capable and rotational, or
-	 * (b) regardless of the presence of NCQ, the device is rotational and
-	 *     the request pattern for bfqq is I/O-bound and sequential, or
-	 * (c) regardless of whether it is rotational, the device is
-	 *     not NCQ-capable and the request pattern for bfqq is
-	 *     I/O-bound and sequential.
-	 *
-	 * Secondly, and in contrast to the above item (b), idling an
-	 * NCQ-capable flash-based device would not boost the
-	 * throughput even with sequential I/O; rather it would lower
-	 * the throughput in proportion to how fast the device
-	 * is. Accordingly, the next variable is true if any of the
-	 * above conditions (a), (b) or (c) is true, and, in
-	 * particular, happens to be false if bfqd is an NCQ-capable
-	 * flash-based device.
-	 */
-	idling_boosts_thr = rot_without_queueing ||
-		((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) &&
-		 bfqq_sequential_and_IO_bound);
-
-	/*
-	 * The value of the next variable,
-	 * idling_boosts_thr_without_issues, is equal to that of
-	 * idling_boosts_thr, unless a special case holds. In this
-	 * special case, described below, idling may cause problems to
-	 * weight-raised queues.
-	 *
-	 * When the request pool is saturated (e.g., in the presence
-	 * of write hogs), if the processes associated with
-	 * non-weight-raised queues ask for requests at a lower rate,
-	 * then processes associated with weight-raised queues have a
-	 * higher probability to get a request from the pool
-	 * immediately (or at least soon) when they need one. Thus
-	 * they have a higher probability to actually get a fraction
-	 * of the device throughput proportional to their high
-	 * weight. This is especially true with NCQ-capable drives,
-	 * which enqueue several requests in advance, and further
-	 * reorder internally-queued requests.
-	 *
-	 * For this reason, we force to false the value of
-	 * idling_boosts_thr_without_issues if there are weight-raised
-	 * busy queues. In this case, and if bfqq is not weight-raised,
-	 * this guarantees that the device is not idled for bfqq (if,
-	 * instead, bfqq is weight-raised, then idling will be
-	 * guaranteed by another variable, see below). Combined with
-	 * the timestamping rules of BFQ (see [1] for details), this
-	 * behavior causes bfqq, and hence any sync non-weight-raised
-	 * queue, to get a lower number of requests served, and thus
-	 * to ask for a lower number of requests from the request
-	 * pool, before the busy weight-raised queues get served
-	 * again. This often mitigates starvation problems in the
-	 * presence of heavy write workloads and NCQ, thereby
-	 * guaranteeing a higher application and system responsiveness
-	 * in these hostile scenarios.
-	 */
-	idling_boosts_thr_without_issues = idling_boosts_thr &&
-		bfqd->wr_busy_queues == 0;
-
-	/*
-	 * There is then a case where idling must be performed not
-	 * for throughput concerns, but to preserve service
-	 * guarantees.
-	 *
-	 * To introduce this case, we can note that allowing the drive
-	 * to enqueue more than one request at a time, and hence
-	 * delegating de facto final scheduling decisions to the
-	 * drive's internal scheduler, entails loss of control on the
-	 * actual request service order. In particular, the critical
-	 * situation is when requests from different processes happen
-	 * to be present, at the same time, in the internal queue(s)
-	 * of the drive. In such a situation, the drive, by deciding
-	 * the service order of the internally-queued requests, does
-	 * determine also the actual throughput distribution among
-	 * these processes. But the drive typically has no notion or
-	 * concern about per-process throughput distribution, and
-	 * makes its decisions only on a per-request basis. Therefore,
-	 * the service distribution enforced by the drive's internal
-	 * scheduler is likely to coincide with the desired
-	 * device-throughput distribution only in a completely
-	 * symmetric scenario where:
-	 * (i)  each of these processes must get the same throughput as
-	 *      the others;
-	 * (ii) all these processes have the same I/O pattern
-	 *      (either sequential or random).
-	 * In fact, in such a scenario, the drive will tend to treat
-	 * the requests of each of these processes in about the same
-	 * way as the requests of the others, and thus to provide
-	 * each of these processes with about the same throughput
-	 * (which is exactly the desired throughput distribution). In
-	 * contrast, in any asymmetric scenario, device idling is
-	 * certainly needed to guarantee that bfqq receives its
-	 * assigned fraction of the device throughput (see [1] for
-	 * details).
-	 *
-	 * We address this issue by controlling, actually, only the
-	 * symmetry sub-condition (i), i.e., provided that
-	 * sub-condition (i) holds, idling is not performed,
-	 * regardless of whether sub-condition (ii) holds. In other
-	 * words, only if sub-condition (i) holds, then idling is
-	 * allowed, and the device tends to be prevented from queueing
-	 * many requests, possibly of several processes. The reason
-	 * for not controlling also sub-condition (ii) is that we
-	 * exploit preemption to preserve guarantees in case of
-	 * symmetric scenarios, even if (ii) does not hold, as
-	 * explained in the next two paragraphs.
-	 *
-	 * Even if a queue, say Q, is expired when it remains idle, Q
-	 * can still preempt the new in-service queue if the next
-	 * request of Q arrives soon (see the comments on
-	 * bfq_bfqq_update_budg_for_activation). If all queues and
-	 * groups have the same weight, this form of preemption,
-	 * combined with the hole-recovery heuristic described in the
-	 * comments on function bfq_bfqq_update_budg_for_activation,
-	 * are enough to preserve a correct bandwidth distribution in
-	 * the mid term, even without idling. In fact, even if not
-	 * idling allows the internal queues of the device to contain
-	 * many requests, and thus to reorder requests, we can rather
-	 * safely assume that the internal scheduler still preserves a
-	 * minimum of mid-term fairness. The motivation for using
-	 * preemption instead of idling is that, by not idling,
-	 * service guarantees are preserved without minimally
-	 * sacrificing throughput. In other words, both a high
-	 * throughput and its desired distribution are obtained.
-	 *
-	 * More precisely, this preemption-based, idleless approach
-	 * provides fairness in terms of IOPS, and not sectors per
-	 * second. This can be seen with a simple example. Suppose
-	 * that there are two queues with the same weight, but that
-	 * the first queue receives requests of 8 sectors, while the
-	 * second queue receives requests of 1024 sectors. In
-	 * addition, suppose that each of the two queues contains at
-	 * most one request at a time, which implies that each queue
-	 * always remains idle after it is served. Finally, after
-	 * remaining idle, each queue receives very quickly a new
-	 * request. It follows that the two queues are served
-	 * alternatively, preempting each other if needed. This
-	 * implies that, although both queues have the same weight,
-	 * the queue with large requests receives a service that is
-	 * 1024/8 times as high as the service received by the other
-	 * queue.
-	 *
-	 * On the other hand, device idling is performed, and thus
-	 * pure sector-domain guarantees are provided, for the
-	 * following queues, which are likely to need stronger
-	 * throughput guarantees: weight-raised queues, and queues
-	 * with a higher weight than other queues. When such queues
-	 * are active, sub-condition (i) is false, which triggers
-	 * device idling.
-	 *
-	 * According to the above considerations, the next variable is
-	 * true (only) if sub-condition (i) holds. To compute the
-	 * value of this variable, we not only use the return value of
-	 * the function bfq_symmetric_scenario(), but also check
-	 * whether bfqq is being weight-raised, because
-	 * bfq_symmetric_scenario() does not take into account also
-	 * weight-raised queues (see comments on
-	 * bfq_weights_tree_add()).
-	 *
-	 * As a side note, it is worth considering that the above
-	 * device-idling countermeasures may however fail in the
-	 * following unlucky scenario: if idling is (correctly)
-	 * disabled in a time period during which all symmetry
-	 * sub-conditions hold, and hence the device is allowed to
-	 * enqueue many requests, but at some later point in time some
-	 * sub-condition stops to hold, then it may become impossible
-	 * to let requests be served in the desired order until all
-	 * the requests already queued in the device have been served.
-	 */
-	asymmetric_scenario = bfqq->wr_coeff > 1 ||
-		!bfq_symmetric_scenario(bfqd);
-
-	/*
-	 * Finally, there is a case where maximizing throughput is the
-	 * best choice even if it may cause unfairness toward
-	 * bfqq. Such a case is when bfqq became active in a burst of
-	 * queue activations. Queues that became active during a large
-	 * burst benefit only from throughput, as discussed in the
-	 * comments on bfq_handle_burst. Thus, if bfqq became active
-	 * in a burst and not idling the device maximizes throughput,
-	 * then the device must no be idled, because not idling the
-	 * device provides bfqq and all other queues in the burst with
-	 * maximum benefit. Combining this and the above case, we can
-	 * now establish when idling is actually needed to preserve
-	 * service guarantees.
-	 */
-	idling_needed_for_service_guarantees =
-		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
-
-	/*
-	 * We have now all the components we need to compute the
-	 * return value of the function, which is true only if idling
-	 * either boosts the throughput (without issues), or is
-	 * necessary to preserve service guarantees.
-	 */
-	bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d",
-		     bfq_bfqq_sync(bfqq), idling_boosts_thr);
-
-	bfq_log_bfqq(bfqd, bfqq,
-		     "may_idle: wr_busy %d boosts %d IO-bound %d guar %d",
-		     bfqd->wr_busy_queues,
-		     idling_boosts_thr_without_issues,
-		     bfq_bfqq_IO_bound(bfqq),
-		     idling_needed_for_service_guarantees);
-
-	return idling_boosts_thr_without_issues ||
-		idling_needed_for_service_guarantees;
-}
-
-/*
- * If the in-service queue is empty but the function bfq_bfqq_may_idle
- * returns true, then:
- * 1) the queue must remain in service and cannot be expired, and
- * 2) the device must be idled to wait for the possible arrival of a new
- *    request for the queue.
- * See the comments on the function bfq_bfqq_may_idle for the reasons
- * why performing device idling is the best choice to boost the throughput
- * and preserve service guarantees when bfq_bfqq_may_idle itself
- * returns true.
- */
-static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
-{
-	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq);
-}
-
-/*
- * Select a queue for service.  If we have a current queue in service,
- * check whether to continue servicing it, or retrieve and set a new one.
- */
-static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
-{
-	struct bfq_queue *bfqq;
-	struct request *next_rq;
-	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
-
-	bfqq = bfqd->in_service_queue;
-	if (!bfqq)
-		goto new_queue;
-
-	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
-
-	if (bfq_may_expire_for_budg_timeout(bfqq) &&
-	    !hrtimer_active(&bfqd->idle_slice_timer) &&
-	    !bfq_bfqq_must_idle(bfqq))
-		goto expire;
-
-check_queue:
-	/*
-	 * This loop is rarely executed more than once. Even when it
-	 * happens, it is much more convenient to re-execute this loop
-	 * than to return NULL and trigger a new dispatch to get a
-	 * request served.
-	 */
-	next_rq = bfqq->next_rq;
-	/*
-	 * If bfqq has requests queued and it has enough budget left to
-	 * serve them, keep the queue, otherwise expire it.
-	 */
-	if (next_rq) {
-		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
-
-		if (bfq_serv_to_charge(next_rq, bfqq) >
-			bfq_bfqq_budget_left(bfqq)) {
-			/*
-			 * Expire the queue for budget exhaustion,
-			 * which makes sure that the next budget is
-			 * enough to serve the next request, even if
-			 * it comes from the fifo expired path.
-			 */
-			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
-			goto expire;
-		} else {
-			/*
-			 * The idle timer may be pending because we may
-			 * not disable disk idling even when a new request
-			 * arrives.
-			 */
-			if (bfq_bfqq_wait_request(bfqq)) {
-				BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer));
-				/*
-				 * If we get here: 1) at least a new request
-				 * has arrived but we have not disabled the
-				 * timer because the request was too small,
-				 * 2) then the block layer has unplugged
-				 * the device, causing the dispatch to be
-				 * invoked.
-				 *
-				 * Since the device is unplugged, now the
-				 * requests are probably large enough to
-				 * provide a reasonable throughput.
-				 * So we disable idling.
-				 */
-				bfq_clear_bfqq_wait_request(bfqq);
-				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
-				bfqg_stats_update_idle_time(bfqq_group(bfqq));
-			}
-			goto keep_queue;
-		}
-	}
-
-	/*
-	 * No requests pending. However, if the in-service queue is idling
-	 * for a new request, or has requests waiting for a completion and
-	 * may idle after their completion, then keep it anyway.
-	 */
-	if (hrtimer_active(&bfqd->idle_slice_timer) ||
-	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
-		bfqq = NULL;
-		goto keep_queue;
-	}
-
-	reason = BFQ_BFQQ_NO_MORE_REQUESTS;
-expire:
-	bfq_bfqq_expire(bfqd, bfqq, false, reason);
-new_queue:
-	bfqq = bfq_set_in_service_queue(bfqd);
-	if (bfqq) {
-		bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
-		goto check_queue;
-	}
-keep_queue:
-	if (bfqq)
-		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
-	else
-		bfq_log(bfqd, "select_queue: no queue returned");
-
-	return bfqq;
-}
-
-static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
-		BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
-		       time_is_after_jiffies(bfqq->last_wr_start_finish));
-
-		bfq_log_bfqq(bfqd, bfqq,
-			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
-			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
-			jiffies_to_msecs(bfqq->wr_cur_max_time),
-			bfqq->wr_coeff,
-			bfqq->entity.weight, bfqq->entity.orig_weight);
-
-		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
-		       entity->orig_weight * bfqq->wr_coeff);
-		if (entity->prio_changed)
-			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
-
-		/*
-		 * If the queue was activated in a burst, or too much
-		 * time has elapsed from the beginning of this
-		 * weight-raising period, then end weight raising.
-		 */
-		if (bfq_bfqq_in_large_burst(bfqq))
-			bfq_bfqq_end_wr(bfqq);
-		else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
-					   bfqq->wr_cur_max_time)) {
-			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
-			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
-					bfq_wr_duration(bfqd)))
-				bfq_bfqq_end_wr(bfqq);
-			else {
-				/* switch back to interactive wr */
-				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
-				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-				bfqq->last_wr_start_finish =
-					bfqq->wr_start_at_switch_to_srt;
-				BUG_ON(time_is_after_jiffies(
-					       bfqq->last_wr_start_finish));
-				bfqq->entity.prio_changed = 1;
-				bfq_log_bfqq(bfqd, bfqq,
-					"back to interactive wr");
-			}
-		}
-	}
-	/*
-	 * To improve latency (for this or other queues), immediately
-	 * update weight both if it must be raised and if it must be
-	 * lowered. Since, entity may be on some active tree here, and
-	 * might have a pending change of its ioprio class, invoke
-	 * next function with the last parameter unset (see the
-	 * comments on the function).
-	 */
-	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
-		__bfq_entity_update_weight_prio(bfq_entity_service_tree(entity),
-						entity, false);
-}
-
-/*
- * Dispatch one request from bfqq, moving it to the request queue
- * dispatch list.
- */
-static int bfq_dispatch_request(struct bfq_data *bfqd,
-				struct bfq_queue *bfqq)
-{
-	int dispatched = 0;
-	struct request *rq = bfqq->next_rq;
-	unsigned long service_to_charge;
-
-	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
-	BUG_ON(!rq);
-	service_to_charge = bfq_serv_to_charge(rq, bfqq);
-
-	BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq));
-
-	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
-	bfq_bfqq_served(bfqq, service_to_charge);
-
-	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
-	bfq_dispatch_insert(bfqd->queue, rq);
-
-	/*
-	 * If weight raising has to terminate for bfqq, then next
-	 * function causes an immediate update of bfqq's weight,
-	 * without waiting for next activation. As a consequence, on
-	 * expiration, bfqq will be timestamped as if has never been
-	 * weight-raised during this service slot, even if it has
-	 * received part or even most of the service as a
-	 * weight-raised queue. This inflates bfqq's timestamps, which
-	 * is beneficial, as bfqq is then more willing to leave the
-	 * device immediately to possible other weight-raised queues.
-	 */
-	bfq_update_wr_data(bfqd, bfqq);
-
-	bfq_log_bfqq(bfqd, bfqq,
-			"dispatched %u sec req (%llu), budg left %d",
-			blk_rq_sectors(rq),
-			(unsigned long long) blk_rq_pos(rq),
-			bfq_bfqq_budget_left(bfqq));
-
-	dispatched++;
-
-	if (!bfqd->in_service_bic) {
-		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
-		bfqd->in_service_bic = RQ_BIC(rq);
-	}
-
-	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
-		goto expire;
-
-	return dispatched;
-
-expire:
-	bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
-	return dispatched;
-}
-
-static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
-{
-	int dispatched = 0;
-
-	while (bfqq->next_rq) {
-		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
-		dispatched++;
-	}
-
-	BUG_ON(!list_empty(&bfqq->fifo));
-	return dispatched;
-}
-
-/*
- * Drain our current requests.
- * Used for barriers and when switching io schedulers on-the-fly.
- */
-static int bfq_forced_dispatch(struct bfq_data *bfqd)
-{
-	struct bfq_queue *bfqq, *n;
-	struct bfq_service_tree *st;
-	int dispatched = 0;
-
-	bfqq = bfqd->in_service_queue;
-	if (bfqq)
-		__bfq_bfqq_expire(bfqd, bfqq);
-
-	/*
-	 * Loop through classes, and be careful to leave the scheduler
-	 * in a consistent state, as feedback mechanisms and vtime
-	 * updates cannot be disabled during the process.
-	 */
-	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
-		st = bfq_entity_service_tree(&bfqq->entity);
-
-		dispatched += __bfq_forced_dispatch_bfqq(bfqq);
-
-		bfqq->max_budget = bfq_max_budget(bfqd);
-		bfq_forget_idle(st);
-	}
-
-	BUG_ON(bfqd->busy_queues != 0);
-
-	return dispatched;
-}
-
-static int bfq_dispatch_requests(struct request_queue *q, int force)
-{
-	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct bfq_queue *bfqq;
-
-	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
-
-	if (bfqd->busy_queues == 0)
-		return 0;
-
-	if (unlikely(force))
-		return bfq_forced_dispatch(bfqd);
-
-	/*
-	 * Force device to serve one request at a time if
-	 * strict_guarantees is true. Forcing this service scheme is
-	 * currently the ONLY way to guarantee that the request
-	 * service order enforced by the scheduler is respected by a
-	 * queueing device. Otherwise the device is free even to make
-	 * some unlucky request wait for as long as the device
-	 * wishes.
-	 *
-	 * Of course, serving one request at at time may cause loss of
-	 * throughput.
-	 */
-	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
-		return 0;
-
-	bfqq = bfq_select_queue(bfqd);
-	if (!bfqq)
-		return 0;
-
-	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
-	BUG_ON(bfq_bfqq_wait_request(bfqq));
-
-	if (!bfq_dispatch_request(bfqd, bfqq))
-		return 0;
-
-	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
-			bfq_bfqq_sync(bfqq) ? "sync" : "async");
-
-	BUG_ON(bfqq->next_rq == NULL &&
-	       bfqq->entity.budget < bfqq->entity.service);
-	return 1;
-}
-
-/*
- * Task holds one reference to the queue, dropped when task exits.  Each rq
- * in-flight on this queue also holds a reference, dropped when rq is freed.
- *
- * Queue lock must be held here. Recall not to use bfqq after calling
- * this function on it.
- */
-static void bfq_put_queue(struct bfq_queue *bfqq)
-{
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	struct bfq_group *bfqg = bfqq_group(bfqq);
-#endif
-
-	BUG_ON(bfqq->ref <= 0);
-
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
-	bfqq->ref--;
-	if (bfqq->ref)
-		return;
-
-	BUG_ON(rb_first(&bfqq->sort_list));
-	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
-	BUG_ON(bfqq->entity.tree);
-	BUG_ON(bfq_bfqq_busy(bfqq));
-
-	if (bfq_bfqq_sync(bfqq))
-		/*
-		 * The fact that this queue is being destroyed does not
-		 * invalidate the fact that this queue may have been
-		 * activated during the current burst. As a consequence,
-		 * although the queue does not exist anymore, and hence
-		 * needs to be removed from the burst list if there,
-		 * the burst size has not to be decremented.
-		 */
-		hlist_del_init(&bfqq->burst_list_node);
-
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
-
-	kmem_cache_free(bfq_pool, bfqq);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	bfqg_put(bfqg);
-#endif
-}
-
-static void bfq_put_cooperator(struct bfq_queue *bfqq)
-{
-	struct bfq_queue *__bfqq, *next;
-
-	/*
-	 * If this queue was scheduled to merge with another queue, be
-	 * sure to drop the reference taken on that queue (and others in
-	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
-	 */
-	__bfqq = bfqq->new_bfqq;
-	while (__bfqq) {
-		if (__bfqq == bfqq)
-			break;
-		next = __bfqq->new_bfqq;
-		bfq_put_queue(__bfqq);
-		__bfqq = next;
-	}
-}
-
-static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	if (bfqq == bfqd->in_service_queue) {
-		__bfq_bfqq_expire(bfqd, bfqq);
-		bfq_schedule_dispatch(bfqd);
-	}
-
-	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
-
-	bfq_put_cooperator(bfqq);
-
-	bfq_put_queue(bfqq); /* release process reference */
-}
-
-static void bfq_init_icq(struct io_cq *icq)
-{
-	icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
-}
-
-static void bfq_exit_icq(struct io_cq *icq)
-{
-	struct bfq_io_cq *bic = icq_to_bic(icq);
-	struct bfq_data *bfqd = bic_to_bfqd(bic);
-
-	if (bic_to_bfqq(bic, false)) {
-		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
-		bic_set_bfqq(bic, NULL, false);
-	}
-
-	if (bic_to_bfqq(bic, true)) {
-		/*
-		 * If the bic is using a shared queue, put the reference
-		 * taken on the io_context when the bic started using a
-		 * shared bfq_queue.
-		 */
-		if (bfq_bfqq_coop(bic_to_bfqq(bic, true)))
-			put_io_context(icq->ioc);
-		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true));
-		bic_set_bfqq(bic, NULL, true);
-	}
-}
-
-/*
- * Update the entity prio values; note that the new values will not
- * be used until the next (re)activation.
- */
-static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq,
-				     struct bfq_io_cq *bic)
-{
-	struct task_struct *tsk = current;
-	int ioprio_class;
-
-	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
-	switch (ioprio_class) {
-	default:
-		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
-			"bfq: bad prio class %d\n", ioprio_class);
-	case IOPRIO_CLASS_NONE:
-		/*
-		 * No prio set, inherit CPU scheduling settings.
-		 */
-		bfqq->new_ioprio = task_nice_ioprio(tsk);
-		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
-		break;
-	case IOPRIO_CLASS_RT:
-		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
-		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
-		break;
-	case IOPRIO_CLASS_BE:
-		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
-		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
-		break;
-	case IOPRIO_CLASS_IDLE:
-		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
-		bfqq->new_ioprio = 7;
-		break;
-	}
-
-	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
-		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
-			bfqq->new_ioprio);
-		BUG();
-	}
-
-	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
-	bfqq->entity.prio_changed = 1;
-	bfq_log_bfqq(bfqq->bfqd, bfqq,
-		     "set_next_ioprio_data: bic_class %d prio %d class %d",
-		     ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class);
-}
-
-static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
-{
-	struct bfq_data *bfqd = bic_to_bfqd(bic);
-	struct bfq_queue *bfqq;
-	unsigned long uninitialized_var(flags);
-	int ioprio = bic->icq.ioc->ioprio;
-
-	/*
-	 * This condition may trigger on a newly created bic, be sure to
-	 * drop the lock before returning.
-	 */
-	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
-		return;
-
-	bic->ioprio = ioprio;
-
-	bfqq = bic_to_bfqq(bic, false);
-	if (bfqq) {
-		/* release process reference on this queue */
-		bfq_put_queue(bfqq);
-		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
-		bic_set_bfqq(bic, bfqq, false);
-		bfq_log_bfqq(bfqd, bfqq,
-			     "check_ioprio_change: bfqq %p %d",
-			     bfqq, bfqq->ref);
-	}
-
-	bfqq = bic_to_bfqq(bic, true);
-	if (bfqq)
-		bfq_set_next_ioprio_data(bfqq, bic);
-}
-
-static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
-{
-	RB_CLEAR_NODE(&bfqq->entity.rb_node);
-	INIT_LIST_HEAD(&bfqq->fifo);
-	INIT_HLIST_NODE(&bfqq->burst_list_node);
-	BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
-
-	bfqq->ref = 0;
-	bfqq->bfqd = bfqd;
-
-	if (bic)
-		bfq_set_next_ioprio_data(bfqq, bic);
-
-	if (is_sync) {
-		/*
-		 * No need to mark as has_short_ttime if in
-		 * idle_class, because no device idling is performed
-		 * for queues in idle class
-		 */
-		if (!bfq_class_idle(bfqq))
-			/* tentatively mark as has_short_ttime */
-			bfq_mark_bfqq_has_short_ttime(bfqq);
-		bfq_mark_bfqq_sync(bfqq);
-		bfq_mark_bfqq_just_created(bfqq);
-	} else
-		bfq_clear_bfqq_sync(bfqq);
-	bfq_mark_bfqq_IO_bound(bfqq);
-
-	/* Tentative initial value to trade off between thr and lat */
-	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
-	bfqq->pid = pid;
-
-	bfqq->wr_coeff = 1;
-	bfqq->last_wr_start_finish = jiffies;
-	bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
-	bfqq->budget_timeout = bfq_smallest_from_now();
-	bfqq->split_time = bfq_smallest_from_now();
-
-	/*
-	 * Set to the value for which bfqq will not be deemed as
-	 * soft rt when it becomes backlogged.
-	 */
-	bfqq->soft_rt_next_start = bfq_greatest_from_now();
-
-	/* first request is almost certainly seeky */
-	bfqq->seek_history = 1;
-}
-
-static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
-					       struct bfq_group *bfqg,
-					       int ioprio_class, int ioprio)
-{
-	switch (ioprio_class) {
-	case IOPRIO_CLASS_RT:
-		return &bfqg->async_bfqq[0][ioprio];
-	case IOPRIO_CLASS_NONE:
-		ioprio = IOPRIO_NORM;
-		/* fall through */
-	case IOPRIO_CLASS_BE:
-		return &bfqg->async_bfqq[1][ioprio];
-	case IOPRIO_CLASS_IDLE:
-		return &bfqg->async_idle_bfqq;
-	default:
-		BUG();
-	}
-}
-
-static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
-				       struct bio *bio, bool is_sync,
-				       struct bfq_io_cq *bic)
-{
-	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
-	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
-	struct bfq_queue **async_bfqq = NULL;
-	struct bfq_queue *bfqq;
-	struct bfq_group *bfqg;
-
-	rcu_read_lock();
-
-	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
-	if (!bfqg) {
-		bfqq = &bfqd->oom_bfqq;
-		goto out;
-	}
-
-	if (!is_sync) {
-		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
-						  ioprio);
-		bfqq = *async_bfqq;
-		if (bfqq)
-			goto out;
-	}
-
-	bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,
-				     bfqd->queue->node);
-
-	if (bfqq) {
-		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
-			      is_sync);
-		bfq_init_entity(&bfqq->entity, bfqg);
-		bfq_log_bfqq(bfqd, bfqq, "allocated");
-	} else {
-		bfqq = &bfqd->oom_bfqq;
-		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
-		goto out;
-	}
-
-	/*
-	 * Pin the queue now that it's allocated, scheduler exit will
-	 * prune it.
-	 */
-	if (async_bfqq) {
-		bfqq->ref++; /*
-			      * Extra group reference, w.r.t. sync
-			      * queue. This extra reference is removed
-			      * only if bfqq->bfqg disappears, to
-			      * guarantee that this queue is not freed
-			      * until its group goes away.
-			      */
-		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
-			     bfqq, bfqq->ref);
-		*async_bfqq = bfqq;
-	}
-
-out:
-	bfqq->ref++; /* get a process reference to this queue */
-	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
-	rcu_read_unlock();
-	return bfqq;
-}
-
-static void bfq_update_io_thinktime(struct bfq_data *bfqd,
-				    struct bfq_io_cq *bic)
-{
-	struct bfq_ttime *ttime = &bic->ttime;
-	u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request;
-
-	elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle);
-
-	ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
-	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
-	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
-				     ttime->ttime_samples);
-}
-
-static void
-bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-		       struct request *rq)
-{
-	bfqq->seek_history <<= 1;
-	bfqq->seek_history |=
-		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
-		(!blk_queue_nonrot(bfqd->queue) ||
-		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
-}
-
-static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
-				       struct bfq_queue *bfqq,
-				       struct bfq_io_cq *bic)
-{
-	bool has_short_ttime = true;
-
-	/*
-	 * No need to update has_short_ttime if bfqq is async or in
-	 * idle io prio class, or if bfq_slice_idle is zero, because
-	 * no device idling is performed for bfqq in this case.
-	 */
-	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) ||
-	    bfqd->bfq_slice_idle == 0)
-		return;
-
-	/* Idle window just restored, statistics are meaningless. */
-	if (time_is_after_eq_jiffies(bfqq->split_time +
-				     bfqd->bfq_wr_min_idle_time))
-		return;
-
-	/* Think time is infinite if no process is linked to
-	 * bfqq. Otherwise check average think time to
-	 * decide whether to mark as has_short_ttime
-	 */
-	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
-	    (bfq_sample_valid(bic->ttime.ttime_samples) &&
-	     bic->ttime.ttime_mean > bfqd->bfq_slice_idle))
-		has_short_ttime = false;
-
-	bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
-		has_short_ttime);
-
-	if (has_short_ttime)
-		bfq_mark_bfqq_has_short_ttime(bfqq);
-	else
-		bfq_clear_bfqq_has_short_ttime(bfqq);
-}
-
-/*
- * Called when a new fs request (rq) is added to bfqq.  Check if there's
- * something we should do about it.
- */
-static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			    struct request *rq)
-{
-	struct bfq_io_cq *bic = RQ_BIC(rq);
-
-	if (rq->cmd_flags & REQ_META)
-		bfqq->meta_pending++;
-
-	bfq_update_io_thinktime(bfqd, bic);
-	bfq_update_has_short_ttime(bfqd, bfqq, bic);
-	bfq_update_io_seektime(bfqd, bfqq, rq);
-
-	bfq_log_bfqq(bfqd, bfqq,
-		     "rq_enqueued: has_short_ttime=%d (seeky %d)",
-		     bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
-
-	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
-
-	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
-		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
-				 blk_rq_sectors(rq) < 32;
-		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
-
-		/*
-		 * There is just this request queued: if the request
-		 * is small and the queue is not to be expired, then
-		 * just exit.
-		 *
-		 * In this way, if the device is being idled to wait
-		 * for a new request from the in-service queue, we
-		 * avoid unplugging the device and committing the
-		 * device to serve just a small request. On the
-		 * contrary, we wait for the block layer to decide
-		 * when to unplug the device: hopefully, new requests
-		 * will be merged to this one quickly, then the device
-		 * will be unplugged and larger requests will be
-		 * dispatched.
-		 */
-		if (small_req && !budget_timeout)
-			return;
-
-		/*
-		 * A large enough request arrived, or the queue is to
-		 * be expired: in both cases disk idling is to be
-		 * stopped, so clear wait_request flag and reset
-		 * timer.
-		 */
-		bfq_clear_bfqq_wait_request(bfqq);
-		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
-		bfqg_stats_update_idle_time(bfqq_group(bfqq));
-
-		/*
-		 * The queue is not empty, because a new request just
-		 * arrived. Hence we can safely expire the queue, in
-		 * case of budget timeout, without risking that the
-		 * timestamps of the queue are not updated correctly.
-		 * See [1] for more details.
-		 */
-		if (budget_timeout)
-			bfq_bfqq_expire(bfqd, bfqq, false,
-					BFQ_BFQQ_BUDGET_TIMEOUT);
-
-		/*
-		 * Let the request rip immediately, or let a new queue be
-		 * selected if bfqq has just been expired.
-		 */
-		__blk_run_queue(bfqd->queue);
-	}
-}
-
-static void bfq_insert_request(struct request_queue *q, struct request *rq)
-{
-	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
-
-	assert_spin_locked(bfqd->queue->queue_lock);
-
-	/*
-	 * An unplug may trigger a requeue of a request from the device
-	 * driver: make sure we are in process context while trying to
-	 * merge two bfq_queues.
-	 */
-	if (!in_interrupt()) {
-		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
-		if (new_bfqq) {
-			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
-				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
-			/*
-			 * Release the request's reference to the old bfqq
-			 * and make sure one is taken to the shared queue.
-			 */
-			new_bfqq->allocated[rq_data_dir(rq)]++;
-			bfqq->allocated[rq_data_dir(rq)]--;
-			new_bfqq->ref++;
-			bfq_clear_bfqq_just_created(bfqq);
-			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
-				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
-						bfqq, new_bfqq);
-			/*
-			 * rq is about to be enqueued into new_bfqq,
-			 * release rq reference on bfqq
-			 */
-			bfq_put_queue(bfqq);
-			rq->elv.priv[1] = new_bfqq;
-			bfqq = new_bfqq;
-		}
-	}
-
-	bfq_add_request(rq);
-
-	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
-	list_add_tail(&rq->queuelist, &bfqq->fifo);
-
-	bfq_rq_enqueued(bfqd, bfqq, rq);
-}
-
-static void bfq_update_hw_tag(struct bfq_data *bfqd)
-{
-	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
-				       bfqd->rq_in_driver);
-
-	if (bfqd->hw_tag == 1)
-		return;
-
-	/*
-	 * This sample is valid if the number of outstanding requests
-	 * is large enough to allow a queueing behavior.  Note that the
-	 * sum is not exact, as it's not taking into account deactivated
-	 * requests.
-	 */
-	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
-		return;
-
-	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
-		return;
-
-	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
-	bfqd->max_rq_in_driver = 0;
-	bfqd->hw_tag_samples = 0;
-}
-
-static void bfq_completed_request(struct request_queue *q, struct request *rq)
-{
-	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-	struct bfq_data *bfqd = bfqq->bfqd;
-	u64 now_ns;
-	u32 delta_us;
-
-	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
-		     blk_rq_sectors(rq));
-
-	assert_spin_locked(bfqd->queue->queue_lock);
-	bfq_update_hw_tag(bfqd);
-
-	BUG_ON(!bfqd->rq_in_driver);
-	BUG_ON(!bfqq->dispatched);
-	bfqd->rq_in_driver--;
-	bfqq->dispatched--;
-	bfqg_stats_update_completion(bfqq_group(bfqq),
-				     rq_start_time_ns(rq),
-				     rq_io_start_time_ns(rq), req_op(rq),
-				     rq->cmd_flags);
-
-	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
-		BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
-		/*
-		 * Set budget_timeout (which we overload to store the
-		 * time at which the queue remains with no backlog and
-		 * no outstanding request; used by the weight-raising
-		 * mechanism).
-		 */
-		bfqq->budget_timeout = jiffies;
-
-		bfq_weights_tree_remove(bfqd, &bfqq->entity,
-					&bfqd->queue_weights_tree);
-	}
-
-	now_ns = ktime_get_ns();
-
-	RQ_BIC(rq)->ttime.last_end_request = now_ns;
-
-	/*
-	 * Using us instead of ns, to get a reasonable precision in
-	 * computing rate in next check.
-	 */
-	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
-
-	bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
-		delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
-		(USEC_PER_SEC*
-		(u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
-			>>BFQ_RATE_SHIFT,
-		(USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
-
-	/*
-	 * If the request took rather long to complete, and, according
-	 * to the maximum request size recorded, this completion latency
-	 * implies that the request was certainly served at a very low
-	 * rate (less than 1M sectors/sec), then the whole observation
-	 * interval that lasts up to this time instant cannot be a
-	 * valid time interval for computing a new peak rate.  Invoke
-	 * bfq_update_rate_reset to have the following three steps
-	 * taken:
-	 * - close the observation interval at the last (previous)
-	 *   request dispatch or completion
-	 * - compute rate, if possible, for that observation interval
-	 * - reset to zero samples, which will trigger a proper
-	 *   re-initialization of the observation interval on next
-	 *   dispatch
-	 */
-	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
-	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
-			1UL<<(BFQ_RATE_SHIFT - 10))
-		bfq_update_rate_reset(bfqd, NULL);
-	bfqd->last_completion = now_ns;
-
-	/*
-	 * If we are waiting to discover whether the request pattern
-	 * of the task associated with the queue is actually
-	 * isochronous, and both requisites for this condition to hold
-	 * are now satisfied, then compute soft_rt_next_start (see the
-	 * comments on the function bfq_bfqq_softrt_next_start()). We
-	 * schedule this delayed check when bfqq expires, if it still
-	 * has in-flight requests.
-	 */
-	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
-	    RB_EMPTY_ROOT(&bfqq->sort_list))
-		bfqq->soft_rt_next_start =
-			bfq_bfqq_softrt_next_start(bfqd, bfqq);
-
-	/*
-	 * If this is the in-service queue, check if it needs to be expired,
-	 * or if we want to idle in case it has no pending requests.
-	 */
-	if (bfqd->in_service_queue == bfqq) {
-		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
-			bfq_arm_slice_timer(bfqd);
-			goto out;
-		} else if (bfq_may_expire_for_budg_timeout(bfqq))
-			bfq_bfqq_expire(bfqd, bfqq, false,
-					BFQ_BFQQ_BUDGET_TIMEOUT);
-		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
-			 (bfqq->dispatched == 0 ||
-			  !bfq_bfqq_may_idle(bfqq)))
-			bfq_bfqq_expire(bfqd, bfqq, false,
-					BFQ_BFQQ_NO_MORE_REQUESTS);
-	}
-
-	if (!bfqd->rq_in_driver)
-		bfq_schedule_dispatch(bfqd);
-
-out:
-	return;
-}
-
-static int __bfq_may_queue(struct bfq_queue *bfqq)
-{
-	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
-		bfq_clear_bfqq_must_alloc(bfqq);
-		return ELV_MQUEUE_MUST;
-	}
-
-	return ELV_MQUEUE_MAY;
-}
-
-static int bfq_may_queue(struct request_queue *q, int op, int op_flags)
-{
-	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct task_struct *tsk = current;
-	struct bfq_io_cq *bic;
-	struct bfq_queue *bfqq;
-
-	/*
-	 * Don't force setup of a queue from here, as a call to may_queue
-	 * does not necessarily imply that a request actually will be
-	 * queued. So just lookup a possibly existing queue, or return
-	 * 'may queue' if that fails.
-	 */
-	bic = bfq_bic_lookup(bfqd, tsk->io_context);
-	if (!bic)
-		return ELV_MQUEUE_MAY;
-
-	bfqq = bic_to_bfqq(bic, rw_is_sync(op, op_flags));
-	if (bfqq)
-		return __bfq_may_queue(bfqq);
-
-	return ELV_MQUEUE_MAY;
-}
-
-/*
- * Queue lock held here.
- */
-static void bfq_put_request(struct request *rq)
-{
-	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-
-	if (bfqq) {
-		const int rw = rq_data_dir(rq);
-
-		BUG_ON(!bfqq->allocated[rw]);
-		bfqq->allocated[rw]--;
-
-		rq->elv.priv[0] = NULL;
-		rq->elv.priv[1] = NULL;
-
-		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
-			     bfqq, bfqq->ref);
-		bfq_put_queue(bfqq);
-	}
-}
-
-/*
- * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
- * was the last process referring to that bfqq.
- */
-static struct bfq_queue *
-bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
-{
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
-
-	put_io_context(bic->icq.ioc);
-
-	if (bfqq_process_refs(bfqq) == 1) {
-		bfqq->pid = current->pid;
-		bfq_clear_bfqq_coop(bfqq);
-		bfq_clear_bfqq_split_coop(bfqq);
-		return bfqq;
-	}
-
-	bic_set_bfqq(bic, NULL, 1);
-
-	bfq_put_cooperator(bfqq);
-
-	bfq_put_queue(bfqq);
-	return NULL;
-}
-
-/*
- * Allocate bfq data structures associated with this request.
- */
-static int bfq_set_request(struct request_queue *q, struct request *rq,
-			   struct bio *bio, gfp_t gfp_mask)
-{
-	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
-	const int rw = rq_data_dir(rq);
-	const int is_sync = rq_is_sync(rq);
-	struct bfq_queue *bfqq;
-	unsigned long flags;
-	bool bfqq_already_existing = false, split = false;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-
-	if (!bic)
-		goto queue_fail;
-
-	bfq_check_ioprio_change(bic, bio);
-
-	bfq_bic_update_cgroup(bic, bio);
-
-new_queue:
-	bfqq = bic_to_bfqq(bic, is_sync);
-	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
-		if (bfqq)
-			bfq_put_queue(bfqq);
-		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
-		BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
-
-		bic_set_bfqq(bic, bfqq, is_sync);
-		if (split && is_sync) {
-			bfq_log_bfqq(bfqd, bfqq,
-				     "set_request: was_in_list %d "
-				     "was_in_large_burst %d "
-				     "large burst in progress %d",
-				     bic->was_in_burst_list,
-				     bic->saved_in_large_burst,
-				     bfqd->large_burst);
-
-			if ((bic->was_in_burst_list && bfqd->large_burst) ||
-			    bic->saved_in_large_burst) {
-				bfq_log_bfqq(bfqd, bfqq,
-					     "set_request: marking in "
-					     "large burst");
-				bfq_mark_bfqq_in_large_burst(bfqq);
-			} else {
-				bfq_log_bfqq(bfqd, bfqq,
-					     "set_request: clearing in "
-					     "large burst");
-				bfq_clear_bfqq_in_large_burst(bfqq);
-				if (bic->was_in_burst_list)
-					hlist_add_head(&bfqq->burst_list_node,
-						       &bfqd->burst_list);
-			}
-			bfqq->split_time = jiffies;
-		}
-	} else {
-		/* If the queue was seeky for too long, break it apart. */
-		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
-			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
-
-			/* Update bic before losing reference to bfqq */
-			if (bfq_bfqq_in_large_burst(bfqq))
-				bic->saved_in_large_burst = true;
-
-			bfqq = bfq_split_bfqq(bic, bfqq);
-			split = true;
-			if (!bfqq)
-				goto new_queue;
-			else
-				bfqq_already_existing = true;
-		}
-	}
-
-	bfqq->allocated[rw]++;
-	bfqq->ref++;
-	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
-
-	rq->elv.priv[0] = bic;
-	rq->elv.priv[1] = bfqq;
-
-	/*
-	 * If a bfq_queue has only one process reference, it is owned
-	 * by only one bfq_io_cq: we can set the bic field of the
-	 * bfq_queue to the address of that structure. Also, if the
-	 * queue has just been split, mark a flag so that the
-	 * information is available to the other scheduler hooks.
-	 */
-	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
-		bfqq->bic = bic;
-		if (split) {
-			/*
-			 * If the queue has just been split from a shared
-			 * queue, restore the idle window and the possible
-			 * weight raising period.
-			 */
-			bfq_bfqq_resume_state(bfqq, bfqd, bic,
-					      bfqq_already_existing);
-		}
-	}
-
-	if (unlikely(bfq_bfqq_just_created(bfqq)))
-		bfq_handle_burst(bfqd, bfqq);
-
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	return 0;
-
-queue_fail:
-	bfq_schedule_dispatch(bfqd);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	return 1;
-}
-
-static void bfq_kick_queue(struct work_struct *work)
-{
-	struct bfq_data *bfqd =
-		container_of(work, struct bfq_data, unplug_work);
-	struct request_queue *q = bfqd->queue;
-
-	spin_lock_irq(q->queue_lock);
-	__blk_run_queue(q);
-	spin_unlock_irq(q->queue_lock);
-}
-
-/*
- * Handler of the expiration of the timer running if the in-service queue
- * is idling inside its time slice.
- */
-static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
-{
-	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
-					     idle_slice_timer);
-	struct bfq_queue *bfqq;
-	unsigned long flags;
-	enum bfqq_expiration reason;
-
-	spin_lock_irqsave(bfqd->queue->queue_lock, flags);
-
-	bfqq = bfqd->in_service_queue;
-	/*
-	 * Theoretical race here: the in-service queue can be NULL or
-	 * different from the queue that was idling if the timer handler
-	 * spins on the queue_lock and a new request arrives for the
-	 * current queue and there is a full dispatch cycle that changes
-	 * the in-service queue.  This can hardly happen, but in the worst
-	 * case we just expire a queue too early.
-	 */
-	if (bfqq) {
-		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
-		bfq_clear_bfqq_wait_request(bfqq);
-
-		if (bfq_bfqq_budget_timeout(bfqq))
-			/*
-			 * Also here the queue can be safely expired
-			 * for budget timeout without wasting
-			 * guarantees
-			 */
-			reason = BFQ_BFQQ_BUDGET_TIMEOUT;
-		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
-			/*
-			 * The queue may not be empty upon timer expiration,
-			 * because we may not disable the timer when the
-			 * first request of the in-service queue arrives
-			 * during disk idling.
-			 */
-			reason = BFQ_BFQQ_TOO_IDLE;
-		else
-			goto schedule_dispatch;
-
-		bfq_bfqq_expire(bfqd, bfqq, true, reason);
-	}
-
-schedule_dispatch:
-	bfq_schedule_dispatch(bfqd);
-
-	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
-	return HRTIMER_NORESTART;
-}
-
-static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
-{
-	hrtimer_cancel(&bfqd->idle_slice_timer);
-	cancel_work_sync(&bfqd->unplug_work);
-}
-
-static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
-				 struct bfq_queue **bfqq_ptr)
-{
-	struct bfq_group *root_group = bfqd->root_group;
-	struct bfq_queue *bfqq = *bfqq_ptr;
-
-	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
-	if (bfqq) {
-		bfq_bfqq_move(bfqd, bfqq, root_group);
-		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
-			     bfqq, bfqq->ref);
-		bfq_put_queue(bfqq);
-		*bfqq_ptr = NULL;
-	}
-}
-
-/*
- * Release all the bfqg references to its async queues.  If we are
- * deallocating the group these queues may still contain requests, so
- * we reparent them to the root cgroup (i.e., the only one that will
- * exist for sure until all the requests on a device are gone).
- */
-static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
-{
-	int i, j;
-
-	for (i = 0; i < 2; i++)
-		for (j = 0; j < IOPRIO_BE_NR; j++)
-			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
-
-	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
-}
-
-static void bfq_exit_queue(struct elevator_queue *e)
-{
-	struct bfq_data *bfqd = e->elevator_data;
-	struct request_queue *q = bfqd->queue;
-	struct bfq_queue *bfqq, *n;
-
-	bfq_shutdown_timer_wq(bfqd);
-
-	spin_lock_irq(q->queue_lock);
-
-	BUG_ON(bfqd->in_service_queue);
-	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
-		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
-
-	spin_unlock_irq(q->queue_lock);
-
-	bfq_shutdown_timer_wq(bfqd);
-
-	BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	blkcg_deactivate_policy(q, &blkcg_policy_bfq);
-#else
-	bfq_put_async_queues(bfqd, bfqd->root_group);
-	kfree(bfqd->root_group);
-#endif
-
-	kfree(bfqd);
-}
-
-static void bfq_init_root_group(struct bfq_group *root_group,
-				struct bfq_data *bfqd)
-{
-	int i;
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	root_group->entity.parent = NULL;
-	root_group->my_entity = NULL;
-	root_group->bfqd = bfqd;
-#endif
-	root_group->rq_pos_tree = RB_ROOT;
-	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
-		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
-	root_group->sched_data.bfq_class_idle_last_service = jiffies;
-}
-
-static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
-{
-	struct bfq_data *bfqd;
-	struct elevator_queue *eq;
-
-	eq = elevator_alloc(q, e);
-	if (!eq)
-		return -ENOMEM;
-
-	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
-	if (!bfqd) {
-		kobject_put(&eq->kobj);
-		return -ENOMEM;
-	}
-	eq->elevator_data = bfqd;
-
-	/*
-	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
-	 * Grab a permanent reference to it, so that the normal code flow
-	 * will not attempt to free it.
-	 */
-	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
-	bfqd->oom_bfqq.ref++;
-	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
-	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
-	bfqd->oom_bfqq.entity.new_weight =
-		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
-
-	/* oom_bfqq does not participate to bursts */
-	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
-	/*
-	 * Trigger weight initialization, according to ioprio, at the
-	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
-	 * class won't be changed any more.
-	 */
-	bfqd->oom_bfqq.entity.prio_changed = 1;
-
-	bfqd->queue = q;
-
-	spin_lock_irq(q->queue_lock);
-	q->elevator = eq;
-	spin_unlock_irq(q->queue_lock);
-
-	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
-	if (!bfqd->root_group)
-		goto out_free;
-	bfq_init_root_group(bfqd->root_group, bfqd);
-	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
-
-	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_REL);
-	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
-
-	bfqd->queue_weights_tree = RB_ROOT;
-	bfqd->group_weights_tree = RB_ROOT;
-
-	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
-
-	INIT_LIST_HEAD(&bfqd->active_list);
-	INIT_LIST_HEAD(&bfqd->idle_list);
-	INIT_HLIST_HEAD(&bfqd->burst_list);
-
-	bfqd->hw_tag = -1;
-
-	bfqd->bfq_max_budget = bfq_default_max_budget;
-
-	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
-	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
-	bfqd->bfq_back_max = bfq_back_max;
-	bfqd->bfq_back_penalty = bfq_back_penalty;
-	bfqd->bfq_slice_idle = bfq_slice_idle;
-	bfqd->bfq_timeout = bfq_timeout;
-
-	bfqd->bfq_requests_within_timer = 120;
-
-	bfqd->bfq_large_burst_thresh = 8;
-	bfqd->bfq_burst_interval = msecs_to_jiffies(180);
-
-	bfqd->low_latency = true;
-
-	/*
-	 * Trade-off between responsiveness and fairness.
-	 */
-	bfqd->bfq_wr_coeff = 30;
-	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
-	bfqd->bfq_wr_max_time = 0;
-	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
-	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
-	bfqd->bfq_wr_max_softrt_rate = 7000; /*
-					      * Approximate rate required
-					      * to playback or record a
-					      * high-definition compressed
-					      * video.
-					      */
-	bfqd->wr_busy_queues = 0;
-
-	/*
-	 * Begin by assuming, optimistically, that the device is a
-	 * high-speed one, and that its peak rate is equal to 2/3 of
-	 * the highest reference rate.
-	 */
-	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
-			T_fast[blk_queue_nonrot(bfqd->queue)];
-	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
-	bfqd->device_speed = BFQ_BFQD_FAST;
-
-	return 0;
-
-out_free:
-	kfree(bfqd);
-	kobject_put(&eq->kobj);
-	return -ENOMEM;
-}
-
-static void bfq_slab_kill(void)
-{
-	kmem_cache_destroy(bfq_pool);
-}
-
-static int __init bfq_slab_setup(void)
-{
-	bfq_pool = KMEM_CACHE(bfq_queue, 0);
-	if (!bfq_pool)
-		return -ENOMEM;
-	return 0;
-}
-
-static ssize_t bfq_var_show(unsigned int var, char *page)
-{
-	return sprintf(page, "%u\n", var);
-}
-
-static ssize_t bfq_var_store(unsigned long *var, const char *page,
-			     size_t count)
-{
-	unsigned long new_val;
-	int ret = kstrtoul(page, 10, &new_val);
-
-	if (ret == 0)
-		*var = new_val;
-
-	return count;
-}
-
-static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
-{
-	struct bfq_data *bfqd = e->elevator_data;
-
-	return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
-		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :
-		       jiffies_to_msecs(bfq_wr_duration(bfqd)));
-}
-
-static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
-{
-	struct bfq_queue *bfqq;
-	struct bfq_data *bfqd = e->elevator_data;
-	ssize_t num_char = 0;
-
-	num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
-			    bfqd->queued);
-
-	spin_lock_irq(bfqd->queue->queue_lock);
-
-	num_char += sprintf(page + num_char, "Active:\n");
-	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
-		num_char += sprintf(page + num_char,
-				    "pid%d: weight %hu, nr_queued %d %d, ",
-				    bfqq->pid,
-				    bfqq->entity.weight,
-				    bfqq->queued[0],
-				    bfqq->queued[1]);
-		num_char += sprintf(page + num_char,
-				    "dur %d/%u\n",
-				    jiffies_to_msecs(
-					    jiffies -
-					    bfqq->last_wr_start_finish),
-				    jiffies_to_msecs(bfqq->wr_cur_max_time));
-	}
-
-	num_char += sprintf(page + num_char, "Idle:\n");
-	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
-		num_char += sprintf(page + num_char,
-				    "pid%d: weight %hu, dur %d/%u\n",
-				    bfqq->pid,
-				    bfqq->entity.weight,
-				    jiffies_to_msecs(jiffies -
-						     bfqq->last_wr_start_finish),
-				    jiffies_to_msecs(bfqq->wr_cur_max_time));
-	}
-
-	spin_unlock_irq(bfqd->queue->queue_lock);
-
-	return num_char;
-}
-
-#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
-{									\
-	struct bfq_data *bfqd = e->elevator_data;			\
-	u64 __data = __VAR;						\
-	if (__CONV == 1)						\
-		__data = jiffies_to_msecs(__data);			\
-	else if (__CONV == 2)						\
-		__data = div_u64(__data, NSEC_PER_MSEC);		\
-	return bfq_var_show(__data, (page));				\
-}
-SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
-SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
-SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
-SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
-SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
-SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
-SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
-SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
-SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
-SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
-SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
-SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
-SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
-	1);
-SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
-#undef SHOW_FUNCTION
-
-#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
-static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
-{									\
-	struct bfq_data *bfqd = e->elevator_data;			\
-	u64 __data = __VAR;						\
-	__data = div_u64(__data, NSEC_PER_USEC);			\
-	return bfq_var_show(__data, (page));				\
-}
-USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
-#undef USEC_SHOW_FUNCTION
-
-#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t								\
-__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
-{									\
-	struct bfq_data *bfqd = e->elevator_data;			\
-	unsigned long uninitialized_var(__data);			\
-	int ret = bfq_var_store(&__data, (page), count);		\
-	if (__data < (MIN))						\
-		__data = (MIN);						\
-	else if (__data > (MAX))					\
-		__data = (MAX);						\
-	if (__CONV == 1)						\
-		*(__PTR) = msecs_to_jiffies(__data);			\
-	else if (__CONV == 2)						\
-		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
-	else								\
-		*(__PTR) = __data;					\
-	return ret;							\
-}
-STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
-		INT_MAX, 2);
-STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
-		INT_MAX, 2);
-STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
-STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
-		INT_MAX, 0);
-STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
-STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
-STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
-STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
-		1);
-STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
-		INT_MAX, 1);
-STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
-		&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
-STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
-		INT_MAX, 0);
-#undef STORE_FUNCTION
-
-#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
-static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
-{									\
-	struct bfq_data *bfqd = e->elevator_data;			\
-	unsigned long uninitialized_var(__data);			\
-	int ret = bfq_var_store(&__data, (page), count);		\
-	if (__data < (MIN))						\
-		__data = (MIN);						\
-	else if (__data > (MAX))					\
-		__data = (MAX);						\
-	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
-	return ret;							\
-}
-USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
-		    UINT_MAX);
-#undef USEC_STORE_FUNCTION
-
-/* do nothing for the moment */
-static ssize_t bfq_weights_store(struct elevator_queue *e,
-				    const char *page, size_t count)
-{
-	return count;
-}
-
-static ssize_t bfq_max_budget_store(struct elevator_queue *e,
-				    const char *page, size_t count)
-{
-	struct bfq_data *bfqd = e->elevator_data;
-	unsigned long uninitialized_var(__data);
-	int ret = bfq_var_store(&__data, (page), count);
-
-	if (__data == 0)
-		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
-	else {
-		if (__data > INT_MAX)
-			__data = INT_MAX;
-		bfqd->bfq_max_budget = __data;
-	}
-
-	bfqd->bfq_user_max_budget = __data;
-
-	return ret;
-}
-
-/*
- * Leaving this name to preserve name compatibility with cfq
- * parameters, but this timeout is used for both sync and async.
- */
-static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
-				      const char *page, size_t count)
-{
-	struct bfq_data *bfqd = e->elevator_data;
-	unsigned long uninitialized_var(__data);
-	int ret = bfq_var_store(&__data, (page), count);
-
-	if (__data < 1)
-		__data = 1;
-	else if (__data > INT_MAX)
-		__data = INT_MAX;
-
-	bfqd->bfq_timeout = msecs_to_jiffies(__data);
-	if (bfqd->bfq_user_max_budget == 0)
-		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
-
-	return ret;
-}
-
-static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
-				     const char *page, size_t count)
-{
-	struct bfq_data *bfqd = e->elevator_data;
-	unsigned long uninitialized_var(__data);
-	int ret = bfq_var_store(&__data, (page), count);
-
-	if (__data > 1)
-		__data = 1;
-	if (!bfqd->strict_guarantees && __data == 1
-	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
-		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
-
-	bfqd->strict_guarantees = __data;
-
-	return ret;
-}
-
-static ssize_t bfq_low_latency_store(struct elevator_queue *e,
-				     const char *page, size_t count)
-{
-	struct bfq_data *bfqd = e->elevator_data;
-	unsigned long uninitialized_var(__data);
-	int ret = bfq_var_store(&__data, (page), count);
-
-	if (__data > 1)
-		__data = 1;
-	if (__data == 0 && bfqd->low_latency != 0)
-		bfq_end_wr(bfqd);
-	bfqd->low_latency = __data;
-
-	return ret;
-}
-
-#define BFQ_ATTR(name) \
-	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
-
-static struct elv_fs_entry bfq_attrs[] = {
-	BFQ_ATTR(fifo_expire_sync),
-	BFQ_ATTR(fifo_expire_async),
-	BFQ_ATTR(back_seek_max),
-	BFQ_ATTR(back_seek_penalty),
-	BFQ_ATTR(slice_idle),
-	BFQ_ATTR(slice_idle_us),
-	BFQ_ATTR(max_budget),
-	BFQ_ATTR(timeout_sync),
-	BFQ_ATTR(strict_guarantees),
-	BFQ_ATTR(low_latency),
-	BFQ_ATTR(wr_coeff),
-	BFQ_ATTR(wr_max_time),
-	BFQ_ATTR(wr_rt_max_time),
-	BFQ_ATTR(wr_min_idle_time),
-	BFQ_ATTR(wr_min_inter_arr_async),
-	BFQ_ATTR(wr_max_softrt_rate),
-	BFQ_ATTR(weights),
-	__ATTR_NULL
-};
-
-static struct elevator_type iosched_bfq = {
-	.ops = {
-		.elevator_merge_fn =		bfq_merge,
-		.elevator_merged_fn =		bfq_merged_request,
-		.elevator_merge_req_fn =	bfq_merged_requests,
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		.elevator_bio_merged_fn =	bfq_bio_merged,
-#endif
-		.elevator_allow_bio_merge_fn =	bfq_allow_bio_merge,
-		.elevator_allow_rq_merge_fn =	bfq_allow_rq_merge,
-		.elevator_dispatch_fn =		bfq_dispatch_requests,
-		.elevator_add_req_fn =		bfq_insert_request,
-		.elevator_activate_req_fn =	bfq_activate_request,
-		.elevator_deactivate_req_fn =	bfq_deactivate_request,
-		.elevator_completed_req_fn =	bfq_completed_request,
-		.elevator_former_req_fn =	elv_rb_former_request,
-		.elevator_latter_req_fn =	elv_rb_latter_request,
-		.elevator_init_icq_fn =		bfq_init_icq,
-		.elevator_exit_icq_fn =		bfq_exit_icq,
-		.elevator_set_req_fn =		bfq_set_request,
-		.elevator_put_req_fn =		bfq_put_request,
-		.elevator_may_queue_fn =	bfq_may_queue,
-		.elevator_init_fn =		bfq_init_queue,
-		.elevator_exit_fn =		bfq_exit_queue,
-	},
-	.icq_size =		sizeof(struct bfq_io_cq),
-	.icq_align =		__alignof__(struct bfq_io_cq),
-	.elevator_attrs =	bfq_attrs,
-	.elevator_name =	"bfq",
-	.elevator_owner =	THIS_MODULE,
-};
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static struct blkcg_policy blkcg_policy_bfq = {
-	.dfl_cftypes		= bfq_blkg_files,
-	.legacy_cftypes		= bfq_blkcg_legacy_files,
-
-	.cpd_alloc_fn		= bfq_cpd_alloc,
-	.cpd_init_fn		= bfq_cpd_init,
-	.cpd_bind_fn	        = bfq_cpd_init,
-	.cpd_free_fn		= bfq_cpd_free,
-
-	.pd_alloc_fn		= bfq_pd_alloc,
-	.pd_init_fn		= bfq_pd_init,
-	.pd_offline_fn		= bfq_pd_offline,
-	.pd_free_fn		= bfq_pd_free,
-	.pd_reset_stats_fn	= bfq_pd_reset_stats,
-};
-#endif
-
-static int __init bfq_init(void)
-{
-	int ret;
-	char msg[60] = "BFQ I/O-scheduler: v8r12";
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	ret = blkcg_policy_register(&blkcg_policy_bfq);
-	if (ret)
-		return ret;
-#endif
-
-	ret = -ENOMEM;
-	if (bfq_slab_setup())
-		goto err_pol_unreg;
-
-	/*
-	 * Times to load large popular applications for the typical
-	 * systems installed on the reference devices (see the
-	 * comments before the definitions of the next two
-	 * arrays). Actually, we use slightly slower values, as the
-	 * estimated peak rate tends to be smaller than the actual
-	 * peak rate.  The reason for this last fact is that estimates
-	 * are computed over much shorter time intervals than the long
-	 * intervals typically used for benchmarking. Why? First, to
-	 * adapt more quickly to variations. Second, because an I/O
-	 * scheduler cannot rely on a peak-rate-evaluation workload to
-	 * be run for a long time.
-	 */
-	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
-	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
-	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
-	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
-
-	/*
-	 * Thresholds that determine the switch between speed classes
-	 * (see the comments before the definition of the array
-	 * device_speed_thresh). These thresholds are biased towards
-	 * transitions to the fast class. This is safer than the
-	 * opposite bias. In fact, a wrong transition to the slow
-	 * class results in short weight-raising periods, because the
-	 * speed of the device then tends to be higher that the
-	 * reference peak rate. On the opposite end, a wrong
-	 * transition to the fast class tends to increase
-	 * weight-raising periods, because of the opposite reason.
-	 */
-	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
-	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
-
-	ret = elv_register(&iosched_bfq);
-	if (ret)
-		goto err_pol_unreg;
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	strcat(msg, " (with cgroups support)");
-#endif
-	pr_info("%s", msg);
-
-	return 0;
-
-err_pol_unreg:
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	blkcg_policy_unregister(&blkcg_policy_bfq);
-#endif
-	return ret;
-}
-
-static void __exit bfq_exit(void)
-{
-	elv_unregister(&iosched_bfq);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	blkcg_policy_unregister(&blkcg_policy_bfq);
-#endif
-	bfq_slab_kill();
-}
-
-module_init(bfq_init);
-module_exit(bfq_exit);
-
-MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente");
-MODULE_LICENSE("GPL");
diff --git a/block/bfq-sched.c b/block/bfq-sched.c
deleted file mode 100644
index be985d9d5f17..000000000000
--- a/block/bfq-sched.c
+++ /dev/null
@@ -1,2025 +0,0 @@
-/*
- * BFQ: Hierarchical B-WF2Q+ scheduler.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
- *
- * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
- *		      Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
- */
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-
-/**
- * bfq_gt - compare two timestamps.
- * @a: first ts.
- * @b: second ts.
- *
- * Return @a > @b, dealing with wrapping correctly.
- */
-static int bfq_gt(u64 a, u64 b)
-{
-	return (s64)(a - b) > 0;
-}
-
-static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
-{
-	struct rb_node *node = tree->rb_node;
-
-	return rb_entry(node, struct bfq_entity, rb_node);
-}
-
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
-
-static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
-
-/**
- * bfq_update_next_in_service - update sd->next_in_service
- * @sd: sched_data for which to perform the update.
- * @new_entity: if not NULL, pointer to the entity whose activation,
- *		requeueing or repositionig triggered the invocation of
- *		this function.
- *
- * This function is called to update sd->next_in_service, which, in
- * its turn, may change as a consequence of the insertion or
- * extraction of an entity into/from one of the active trees of
- * sd. These insertions/extractions occur as a consequence of
- * activations/deactivations of entities, with some activations being
- * 'true' activations, and other activations being requeueings (i.e.,
- * implementing the second, requeueing phase of the mechanism used to
- * reposition an entity in its active tree; see comments on
- * __bfq_activate_entity and __bfq_requeue_entity for details). In
- * both the last two activation sub-cases, new_entity points to the
- * just activated or requeued entity.
- *
- * Returns true if sd->next_in_service changes in such a way that
- * entity->parent may become the next_in_service for its parent
- * entity.
- */
-static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
-				       struct bfq_entity *new_entity)
-{
-	struct bfq_entity *next_in_service = sd->next_in_service;
-	struct bfq_queue *bfqq;
-	bool parent_sched_may_change = false;
-
-	/*
-	 * If this update is triggered by the activation, requeueing
-	 * or repositiong of an entity that does not coincide with
-	 * sd->next_in_service, then a full lookup in the active tree
-	 * can be avoided. In fact, it is enough to check whether the
-	 * just-modified entity has a higher priority than
-	 * sd->next_in_service, or, even if it has the same priority
-	 * as sd->next_in_service, is eligible and has a lower virtual
-	 * finish time than sd->next_in_service. If this compound
-	 * condition holds, then the new entity becomes the new
-	 * next_in_service. Otherwise no change is needed.
-	 */
-	if (new_entity && new_entity != sd->next_in_service) {
-		/*
-		 * Flag used to decide whether to replace
-		 * sd->next_in_service with new_entity. Tentatively
-		 * set to true, and left as true if
-		 * sd->next_in_service is NULL.
-		 */
-		bool replace_next = true;
-
-		/*
-		 * If there is already a next_in_service candidate
-		 * entity, then compare class priorities or timestamps
-		 * to decide whether to replace sd->service_tree with
-		 * new_entity.
-		 */
-		if (next_in_service) {
-			unsigned int new_entity_class_idx =
-				bfq_class_idx(new_entity);
-			struct bfq_service_tree *st =
-				sd->service_tree + new_entity_class_idx;
-
-			/*
-			 * For efficiency, evaluate the most likely
-			 * sub-condition first.
-			 */
-			replace_next =
-				(new_entity_class_idx ==
-				 bfq_class_idx(next_in_service)
-				 &&
-				 !bfq_gt(new_entity->start, st->vtime)
-				 &&
-				 bfq_gt(next_in_service->finish,
-					new_entity->finish))
-				||
-				new_entity_class_idx <
-				bfq_class_idx(next_in_service);
-		}
-
-		if (replace_next)
-			next_in_service = new_entity;
-	} else /* invoked because of a deactivation: lookup needed */
-		next_in_service = bfq_lookup_next_entity(sd);
-
-	if (next_in_service) {
-		parent_sched_may_change = !sd->next_in_service ||
-			bfq_update_parent_budget(next_in_service);
-	}
-
-	sd->next_in_service = next_in_service;
-
-	if (!next_in_service)
-		return parent_sched_may_change;
-
-	bfqq = bfq_entity_to_bfqq(next_in_service);
-	if (bfqq)
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			     "update_next_in_service: chosen this queue");
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else {
-		struct bfq_group *bfqg =
-			container_of(next_in_service,
-				     struct bfq_group, entity);
-
-		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-			     "update_next_in_service: chosen this entity");
-	}
-#endif
-	return parent_sched_may_change;
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-/* both next loops stop at one of the child entities of the root group */
-#define for_each_entity(entity)				\
-	for (; entity ; entity = entity->parent)
-
-/*
- * For each iteration, compute parent in advance, so as to be safe if
- * entity is deallocated during the iteration. Such a deallocation may
- * happen as a consequence of a bfq_put_queue that frees the bfq_queue
- * containing entity.
- */
-#define for_each_entity_safe(entity, parent)				\
-	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
-
-/*
- * Returns true if this budget changes may let next_in_service->parent
- * become the next_in_service entity for its parent entity.
- */
-static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
-{
-	struct bfq_entity *bfqg_entity;
-	struct bfq_group *bfqg;
-	struct bfq_sched_data *group_sd;
-	bool ret = false;
-
-	BUG_ON(!next_in_service);
-
-	group_sd = next_in_service->sched_data;
-
-	bfqg = container_of(group_sd, struct bfq_group, sched_data);
-	/*
-	 * bfq_group's my_entity field is not NULL only if the group
-	 * is not the root group. We must not touch the root entity
-	 * as it must never become an in-service entity.
-	 */
-	bfqg_entity = bfqg->my_entity;
-	if (bfqg_entity) {
-		if (bfqg_entity->budget > next_in_service->budget)
-			ret = true;
-		bfqg_entity->budget = next_in_service->budget;
-	}
-
-	return ret;
-}
-
-/*
- * This function tells whether entity stops being a candidate for next
- * service, according to the restrictive definition of the field
- * next_in_service. In particular, this function is invoked for an
- * entity that is about to be set in service.
- *
- * If entity is a queue, then the entity is no longer a candidate for
- * next service according to the that definition, because entity is
- * about to become the in-service queue. This function then returns
- * true if entity is a queue.
- *
- * In contrast, entity could still be a candidate for next service if
- * it is not a queue, and has more than one active child. In fact,
- * even if one of its children is about to be set in service, other
- * active children may still be the next to serve, for the parent
- * entity, even according to the above definition. As a consequence, a
- * non-queue entity is not a candidate for next-service only if it has
- * only one active child. And only if this condition holds, then this
- * function returns true for a non-queue entity.
- */
-static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
-{
-	struct bfq_group *bfqg;
-
-	if (bfq_entity_to_bfqq(entity))
-		return true;
-
-	bfqg = container_of(entity, struct bfq_group, entity);
-
-	BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group);
-	BUG_ON(bfqg->active_entities == 0);
-	/*
-	 * The field active_entities does not always contain the
-	 * actual number of active children entities: it happens to
-	 * not account for the in-service entity in case the latter is
-	 * removed from its active tree (which may get done after
-	 * invoking the function bfq_no_longer_next_in_service in
-	 * bfq_get_next_queue). Fortunately, here, i.e., while
-	 * bfq_no_longer_next_in_service is not yet completed in
-	 * bfq_get_next_queue, bfq_active_extract has not yet been
-	 * invoked, and thus active_entities still coincides with the
-	 * actual number of active entities.
-	 */
-	if (bfqg->active_entities == 1)
-		return true;
-
-	return false;
-}
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-#define for_each_entity(entity)	\
-	for (; entity ; entity = NULL)
-
-#define for_each_entity_safe(entity, parent) \
-	for (parent = NULL; entity ; entity = parent)
-
-static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
-{
-	return false;
-}
-
-static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
-{
-	return true;
-}
-
-#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-
-/*
- * Shift for timestamp calculations.  This actually limits the maximum
- * service allowed in one timestamp delta (small shift values increase it),
- * the maximum total weight that can be used for the queues in the system
- * (big shift values increase it), and the period of virtual time
- * wraparounds.
- */
-#define WFQ_SERVICE_SHIFT	22
-
-static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = NULL;
-
-	BUG_ON(!entity);
-
-	if (!entity->my_sched_data)
-		bfqq = container_of(entity, struct bfq_queue, entity);
-
-	return bfqq;
-}
-
-
-/**
- * bfq_delta - map service into the virtual time domain.
- * @service: amount of service.
- * @weight: scale factor (weight of an entity or weight sum).
- */
-static u64 bfq_delta(unsigned long service, unsigned long weight)
-{
-	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
-
-	do_div(d, weight);
-	return d;
-}
-
-/**
- * bfq_calc_finish - assign the finish time to an entity.
- * @entity: the entity to act upon.
- * @service: the service to be charged to the entity.
- */
-static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	unsigned long long start, finish, delta;
-
-	BUG_ON(entity->weight == 0);
-
-	entity->finish = entity->start +
-		bfq_delta(service, entity->weight);
-
-	start = ((entity->start>>10)*1000)>>12;
-	finish = ((entity->finish>>10)*1000)>>12;
-	delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12;
-
-	if (bfqq) {
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			"calc_finish: serv %lu, w %d",
-			service, entity->weight);
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			"calc_finish: start %llu, finish %llu, delta %llu",
-			start, finish, delta);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	} else {
-		struct bfq_group *bfqg =
-			container_of(entity, struct bfq_group, entity);
-
-		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-			"calc_finish group: serv %lu, w %d",
-			     service, entity->weight);
-		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-			"calc_finish group: start %llu, finish %llu, delta %llu",
-			start, finish, delta);
-#endif
-	}
-}
-
-/**
- * bfq_entity_of - get an entity from a node.
- * @node: the node field of the entity.
- *
- * Convert a node pointer to the relative entity.  This is used only
- * to simplify the logic of some functions and not as the generic
- * conversion mechanism because, e.g., in the tree walking functions,
- * the check for a %NULL value would be redundant.
- */
-static struct bfq_entity *bfq_entity_of(struct rb_node *node)
-{
-	struct bfq_entity *entity = NULL;
-
-	if (node)
-		entity = rb_entry(node, struct bfq_entity, rb_node);
-
-	return entity;
-}
-
-/**
- * bfq_extract - remove an entity from a tree.
- * @root: the tree root.
- * @entity: the entity to remove.
- */
-static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
-{
-	BUG_ON(entity->tree != root);
-
-	entity->tree = NULL;
-	rb_erase(&entity->rb_node, root);
-}
-
-/**
- * bfq_idle_extract - extract an entity from the idle tree.
- * @st: the service tree of the owning @entity.
- * @entity: the entity being removed.
- */
-static void bfq_idle_extract(struct bfq_service_tree *st,
-			     struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	struct rb_node *next;
-
-	BUG_ON(entity->tree != &st->idle);
-
-	if (entity == st->first_idle) {
-		next = rb_next(&entity->rb_node);
-		st->first_idle = bfq_entity_of(next);
-	}
-
-	if (entity == st->last_idle) {
-		next = rb_prev(&entity->rb_node);
-		st->last_idle = bfq_entity_of(next);
-	}
-
-	bfq_extract(&st->idle, entity);
-
-	if (bfqq)
-		list_del(&bfqq->bfqq_list);
-}
-
-/**
- * bfq_insert - generic tree insertion.
- * @root: tree root.
- * @entity: entity to insert.
- *
- * This is used for the idle and the active tree, since they are both
- * ordered by finish time.
- */
-static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
-{
-	struct bfq_entity *entry;
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-
-	BUG_ON(entity->tree);
-
-	while (*node) {
-		parent = *node;
-		entry = rb_entry(parent, struct bfq_entity, rb_node);
-
-		if (bfq_gt(entry->finish, entity->finish))
-			node = &parent->rb_left;
-		else
-			node = &parent->rb_right;
-	}
-
-	rb_link_node(&entity->rb_node, parent, node);
-	rb_insert_color(&entity->rb_node, root);
-
-	entity->tree = root;
-}
-
-/**
- * bfq_update_min - update the min_start field of a entity.
- * @entity: the entity to update.
- * @node: one of its children.
- *
- * This function is called when @entity may store an invalid value for
- * min_start due to updates to the active tree.  The function  assumes
- * that the subtree rooted at @node (which may be its left or its right
- * child) has a valid min_start value.
- */
-static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
-{
-	struct bfq_entity *child;
-
-	if (node) {
-		child = rb_entry(node, struct bfq_entity, rb_node);
-		if (bfq_gt(entity->min_start, child->min_start))
-			entity->min_start = child->min_start;
-	}
-}
-
-/**
- * bfq_update_active_node - recalculate min_start.
- * @node: the node to update.
- *
- * @node may have changed position or one of its children may have moved,
- * this function updates its min_start value.  The left and right subtrees
- * are assumed to hold a correct min_start value.
- */
-static void bfq_update_active_node(struct rb_node *node)
-{
-	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	entity->min_start = entity->start;
-	bfq_update_min(entity, node->rb_right);
-	bfq_update_min(entity, node->rb_left);
-
-	if (bfqq) {
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			     "update_active_node: new min_start %llu",
-			     ((entity->min_start>>10)*1000)>>12);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	} else {
-		struct bfq_group *bfqg =
-			container_of(entity, struct bfq_group, entity);
-
-		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-			     "update_active_node: new min_start %llu",
-			     ((entity->min_start>>10)*1000)>>12);
-#endif
-	}
-}
-
-/**
- * bfq_update_active_tree - update min_start for the whole active tree.
- * @node: the starting node.
- *
- * @node must be the deepest modified node after an update.  This function
- * updates its min_start using the values held by its children, assuming
- * that they did not change, and then updates all the nodes that may have
- * changed in the path to the root.  The only nodes that may have changed
- * are the ones in the path or their siblings.
- */
-static void bfq_update_active_tree(struct rb_node *node)
-{
-	struct rb_node *parent;
-
-up:
-	bfq_update_active_node(node);
-
-	parent = rb_parent(node);
-	if (!parent)
-		return;
-
-	if (node == parent->rb_left && parent->rb_right)
-		bfq_update_active_node(parent->rb_right);
-	else if (parent->rb_left)
-		bfq_update_active_node(parent->rb_left);
-
-	node = parent;
-	goto up;
-}
-
-static void bfq_weights_tree_add(struct bfq_data *bfqd,
-				 struct bfq_entity *entity,
-				 struct rb_root *root);
-
-static void bfq_weights_tree_remove(struct bfq_data *bfqd,
-				    struct bfq_entity *entity,
-				    struct rb_root *root);
-
-
-/**
- * bfq_active_insert - insert an entity in the active tree of its
- *                     group/device.
- * @st: the service tree of the entity.
- * @entity: the entity being inserted.
- *
- * The active tree is ordered by finish time, but an extra key is kept
- * per each node, containing the minimum value for the start times of
- * its children (and the node itself), so it's possible to search for
- * the eligible node with the lowest finish time in logarithmic time.
- */
-static void bfq_active_insert(struct bfq_service_tree *st,
-			      struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	struct rb_node *node = &entity->rb_node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	struct bfq_sched_data *sd = NULL;
-	struct bfq_group *bfqg = NULL;
-	struct bfq_data *bfqd = NULL;
-#endif
-
-	bfq_insert(&st->active, entity);
-
-	if (node->rb_left)
-		node = node->rb_left;
-	else if (node->rb_right)
-		node = node->rb_right;
-
-	bfq_update_active_tree(node);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	sd = entity->sched_data;
-	bfqg = container_of(sd, struct bfq_group, sched_data);
-	BUG_ON(!bfqg);
-	bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
-	if (bfqq)
-		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else { /* bfq_group */
-		BUG_ON(!bfqd);
-		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
-	}
-	if (bfqg != bfqd->root_group) {
-		BUG_ON(!bfqg);
-		BUG_ON(!bfqd);
-		bfqg->active_entities++;
-	}
-#endif
-}
-
-/**
- * bfq_ioprio_to_weight - calc a weight from an ioprio.
- * @ioprio: the ioprio value to convert.
- */
-static unsigned short bfq_ioprio_to_weight(int ioprio)
-{
-	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
-	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
-}
-
-/**
- * bfq_weight_to_ioprio - calc an ioprio from a weight.
- * @weight: the weight value to convert.
- *
- * To preserve as much as possible the old only-ioprio user interface,
- * 0 is used as an escape ioprio value for weights (numerically) equal or
- * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
- */
-static unsigned short bfq_weight_to_ioprio(int weight)
-{
-	BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
-	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?
-		0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;
-}
-
-static void bfq_get_entity(struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	if (bfqq) {
-		bfqq->ref++;
-		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
-			     bfqq, bfqq->ref);
-	}
-}
-
-/**
- * bfq_find_deepest - find the deepest node that an extraction can modify.
- * @node: the node being removed.
- *
- * Do the first step of an extraction in an rb tree, looking for the
- * node that will replace @node, and returning the deepest node that
- * the following modifications to the tree can touch.  If @node is the
- * last node in the tree return %NULL.
- */
-static struct rb_node *bfq_find_deepest(struct rb_node *node)
-{
-	struct rb_node *deepest;
-
-	if (!node->rb_right && !node->rb_left)
-		deepest = rb_parent(node);
-	else if (!node->rb_right)
-		deepest = node->rb_left;
-	else if (!node->rb_left)
-		deepest = node->rb_right;
-	else {
-		deepest = rb_next(node);
-		if (deepest->rb_right)
-			deepest = deepest->rb_right;
-		else if (rb_parent(deepest) != node)
-			deepest = rb_parent(deepest);
-	}
-
-	return deepest;
-}
-
-/**
- * bfq_active_extract - remove an entity from the active tree.
- * @st: the service_tree containing the tree.
- * @entity: the entity being removed.
- */
-static void bfq_active_extract(struct bfq_service_tree *st,
-			       struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	struct rb_node *node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	struct bfq_sched_data *sd = NULL;
-	struct bfq_group *bfqg = NULL;
-	struct bfq_data *bfqd = NULL;
-#endif
-
-	node = bfq_find_deepest(&entity->rb_node);
-	bfq_extract(&st->active, entity);
-
-	if (node)
-		bfq_update_active_tree(node);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	sd = entity->sched_data;
-	bfqg = container_of(sd, struct bfq_group, sched_data);
-	BUG_ON(!bfqg);
-	bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
-	if (bfqq)
-		list_del(&bfqq->bfqq_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else { /* bfq_group */
-		BUG_ON(!bfqd);
-		bfq_weights_tree_remove(bfqd, entity,
-					&bfqd->group_weights_tree);
-	}
-	if (bfqg != bfqd->root_group) {
-		BUG_ON(!bfqg);
-		BUG_ON(!bfqd);
-		BUG_ON(!bfqg->active_entities);
-		bfqg->active_entities--;
-	}
-#endif
-}
-
-/**
- * bfq_idle_insert - insert an entity into the idle tree.
- * @st: the service tree containing the tree.
- * @entity: the entity to insert.
- */
-static void bfq_idle_insert(struct bfq_service_tree *st,
-			    struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	struct bfq_entity *first_idle = st->first_idle;
-	struct bfq_entity *last_idle = st->last_idle;
-
-	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
-		st->first_idle = entity;
-	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
-		st->last_idle = entity;
-
-	bfq_insert(&st->idle, entity);
-
-	if (bfqq)
-		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
-}
-
-/**
- * bfq_forget_entity - do not consider entity any longer for scheduling
- * @st: the service tree.
- * @entity: the entity being removed.
- * @is_in_service: true if entity is currently the in-service entity.
- *
- * Forget everything about @entity. In addition, if entity represents
- * a queue, and the latter is not in service, then release the service
- * reference to the queue (the one taken through bfq_get_entity). In
- * fact, in this case, there is really no more service reference to
- * the queue, as the latter is also outside any service tree. If,
- * instead, the queue is in service, then __bfq_bfqd_reset_in_service
- * will take care of putting the reference when the queue finally
- * stops being served.
- */
-static void bfq_forget_entity(struct bfq_service_tree *st,
-			      struct bfq_entity *entity,
-			      bool is_in_service)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	BUG_ON(!entity->on_st);
-
-	entity->on_st = false;
-	st->wsum -= entity->weight;
-	if (bfqq && !is_in_service) {
-		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d",
-			     bfqq, bfqq->ref);
-		bfq_put_queue(bfqq);
-	}
-}
-
-/**
- * bfq_put_idle_entity - release the idle tree ref of an entity.
- * @st: service tree for the entity.
- * @entity: the entity being released.
- */
-static void bfq_put_idle_entity(struct bfq_service_tree *st,
-				struct bfq_entity *entity)
-{
-	bfq_idle_extract(st, entity);
-	bfq_forget_entity(st, entity,
-			  entity == entity->sched_data->in_service_entity);
-}
-
-/**
- * bfq_forget_idle - update the idle tree if necessary.
- * @st: the service tree to act upon.
- *
- * To preserve the global O(log N) complexity we only remove one entry here;
- * as the idle tree will not grow indefinitely this can be done safely.
- */
-static void bfq_forget_idle(struct bfq_service_tree *st)
-{
-	struct bfq_entity *first_idle = st->first_idle;
-	struct bfq_entity *last_idle = st->last_idle;
-
-	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
-	    !bfq_gt(last_idle->finish, st->vtime)) {
-		/*
-		 * Forget the whole idle tree, increasing the vtime past
-		 * the last finish time of idle entities.
-		 */
-		st->vtime = last_idle->finish;
-	}
-
-	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
-		bfq_put_idle_entity(st, first_idle);
-}
-
-/*
- * Update weight and priority of entity. If update_class_too is true,
- * then update the ioprio_class of entity too.
- *
- * The reason why the update of ioprio_class is controlled through the
- * last parameter is as follows. Changing the ioprio class of an
- * entity implies changing the destination service trees for that
- * entity. If such a change occurred when the entity is already on one
- * of the service trees for its previous class, then the state of the
- * entity would become more complex: none of the new possible service
- * trees for the entity, according to bfq_entity_service_tree(), would
- * match any of the possible service trees on which the entity
- * is. Complex operations involving these trees, such as entity
- * activations and deactivations, should take into account this
- * additional complexity.  To avoid this issue, this function is
- * invoked with update_class_too unset in the points in the code where
- * entity may happen to be on some tree.
- */
-static struct bfq_service_tree *
-__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
-				struct bfq_entity *entity,
-				bool update_class_too)
-{
-	struct bfq_service_tree *new_st = old_st;
-
-	if (entity->prio_changed) {
-		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-		unsigned int prev_weight, new_weight;
-		struct bfq_data *bfqd = NULL;
-		struct rb_root *root;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		struct bfq_sched_data *sd;
-		struct bfq_group *bfqg;
-#endif
-
-		if (bfqq)
-			bfqd = bfqq->bfqd;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		else {
-			sd = entity->my_sched_data;
-			bfqg = container_of(sd, struct bfq_group, sched_data);
-			BUG_ON(!bfqg);
-			bfqd = (struct bfq_data *)bfqg->bfqd;
-			BUG_ON(!bfqd);
-		}
-#endif
-
-		BUG_ON(entity->tree && update_class_too);
-		BUG_ON(old_st->wsum < entity->weight);
-		old_st->wsum -= entity->weight;
-
-		if (entity->new_weight != entity->orig_weight) {
-			if (entity->new_weight < BFQ_MIN_WEIGHT ||
-			    entity->new_weight > BFQ_MAX_WEIGHT) {
-				pr_crit("update_weight_prio: new_weight %d\n",
-					entity->new_weight);
-				if (entity->new_weight < BFQ_MIN_WEIGHT)
-					entity->new_weight = BFQ_MIN_WEIGHT;
-				else
-					entity->new_weight = BFQ_MAX_WEIGHT;
-			}
-			entity->orig_weight = entity->new_weight;
-			if (bfqq)
-				bfqq->ioprio =
-				  bfq_weight_to_ioprio(entity->orig_weight);
-		}
-
-		if (bfqq && update_class_too)
-			bfqq->ioprio_class = bfqq->new_ioprio_class;
-
-		/*
-		 * Reset prio_changed only if the ioprio_class change
-		 * is not pending any longer.
-		 */
-		if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class)
-			entity->prio_changed = 0;
-
-		/*
-		 * NOTE: here we may be changing the weight too early,
-		 * this will cause unfairness.  The correct approach
-		 * would have required additional complexity to defer
-		 * weight changes to the proper time instants (i.e.,
-		 * when entity->finish <= old_st->vtime).
-		 */
-		new_st = bfq_entity_service_tree(entity);
-
-		prev_weight = entity->weight;
-		new_weight = entity->orig_weight *
-			     (bfqq ? bfqq->wr_coeff : 1);
-		/*
-		 * If the weight of the entity changes, remove the entity
-		 * from its old weight counter (if there is a counter
-		 * associated with the entity), and add it to the counter
-		 * associated with its new weight.
-		 */
-		if (prev_weight != new_weight) {
-			if (bfqq)
-				bfq_log_bfqq(bfqq->bfqd, bfqq,
-					     "weight changed %d %d(%d %d)",
-					     prev_weight, new_weight,
-					     entity->orig_weight,
-					     bfqq->wr_coeff);
-
-			root = bfqq ? &bfqd->queue_weights_tree :
-				      &bfqd->group_weights_tree;
-			bfq_weights_tree_remove(bfqd, entity, root);
-		}
-		entity->weight = new_weight;
-		/*
-		 * Add the entity to its weights tree only if it is
-		 * not associated with a weight-raised queue.
-		 */
-		if (prev_weight != new_weight &&
-		    (bfqq ? bfqq->wr_coeff == 1 : 1))
-			/* If we get here, root has been initialized. */
-			bfq_weights_tree_add(bfqd, entity, root);
-
-		new_st->wsum += entity->weight;
-
-		if (new_st != old_st) {
-			BUG_ON(!update_class_too);
-			entity->start = new_st->vtime;
-		}
-	}
-
-	return new_st;
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
-#endif
-
-/**
- * bfq_bfqq_served - update the scheduler status after selection for
- *                   service.
- * @bfqq: the queue being served.
- * @served: bytes to transfer.
- *
- * NOTE: this can be optimized, as the timestamps of upper level entities
- * are synchronized every time a new bfqq is selected for service.  By now,
- * we keep it to better check consistency.
- */
-static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-	struct bfq_service_tree *st;
-
-	for_each_entity(entity) {
-		st = bfq_entity_service_tree(entity);
-
-		entity->service += served;
-
-		BUG_ON(st->wsum == 0);
-
-		st->vtime += bfq_delta(served, st->wsum);
-		bfq_forget_idle(st);
-	}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
-#endif
-	st = bfq_entity_service_tree(&bfqq->entity);
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p",
-		     served,  ((st->vtime>>10)*1000)>>12, st);
-}
-
-/**
- * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
- *			  of the time interval during which bfqq has been in
- *			  service.
- * @bfqd: the device
- * @bfqq: the queue that needs a service update.
- * @time_ms: the amount of time during which the queue has received service
- *
- * If a queue does not consume its budget fast enough, then providing
- * the queue with service fairness may impair throughput, more or less
- * severely. For this reason, queues that consume their budget slowly
- * are provided with time fairness instead of service fairness. This
- * goal is achieved through the BFQ scheduling engine, even if such an
- * engine works in the service, and not in the time domain. The trick
- * is charging these queues with an inflated amount of service, equal
- * to the amount of service that they would have received during their
- * service slot if they had been fast, i.e., if their requests had
- * been dispatched at a rate equal to the estimated peak rate.
- *
- * It is worth noting that time fairness can cause important
- * distortions in terms of bandwidth distribution, on devices with
- * internal queueing. The reason is that I/O requests dispatched
- * during the service slot of a queue may be served after that service
- * slot is finished, and may have a total processing time loosely
- * correlated with the duration of the service slot. This is
- * especially true for short service slots.
- */
-static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-				 unsigned long time_ms)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-	int tot_serv_to_charge = entity->service;
-	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
-
-	if (time_ms > 0 && time_ms < timeout_ms)
-		tot_serv_to_charge =
-			(bfqd->bfq_max_budget * time_ms) / timeout_ms;
-
-	if (tot_serv_to_charge < entity->service)
-		tot_serv_to_charge = entity->service;
-
-	bfq_log_bfqq(bfqq->bfqd, bfqq,
-		     "charge_time: %lu/%u ms, %d/%d/%d sectors",
-		     time_ms, timeout_ms, entity->service,
-		     tot_serv_to_charge, entity->budget);
-
-	/* Increase budget to avoid inconsistencies */
-	if (tot_serv_to_charge > entity->budget)
-		entity->budget = tot_serv_to_charge;
-
-	bfq_bfqq_served(bfqq,
-			max_t(int, 0, tot_serv_to_charge - entity->service));
-}
-
-static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
-					struct bfq_service_tree *st,
-					bool backshifted)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	struct bfq_sched_data *sd = entity->sched_data;
-
-	/*
-	 * When this function is invoked, entity is not in any service
-	 * tree, then it is safe to invoke next function with the last
-	 * parameter set (see the comments on the function).
-	 */
-	BUG_ON(entity->tree);
-	st = __bfq_entity_update_weight_prio(st, entity, true);
-	bfq_calc_finish(entity, entity->budget);
-
-	/*
-	 * If some queues enjoy backshifting for a while, then their
-	 * (virtual) finish timestamps may happen to become lower and
-	 * lower than the system virtual time.  In particular, if
-	 * these queues often happen to be idle for short time
-	 * periods, and during such time periods other queues with
-	 * higher timestamps happen to be busy, then the backshifted
-	 * timestamps of the former queues can become much lower than
-	 * the system virtual time. In fact, to serve the queues with
-	 * higher timestamps while the ones with lower timestamps are
-	 * idle, the system virtual time may be pushed-up to much
-	 * higher values than the finish timestamps of the idle
-	 * queues. As a consequence, the finish timestamps of all new
-	 * or newly activated queues may end up being much larger than
-	 * those of lucky queues with backshifted timestamps. The
-	 * latter queues may then monopolize the device for a lot of
-	 * time. This would simply break service guarantees.
-	 *
-	 * To reduce this problem, push up a little bit the
-	 * backshifted timestamps of the queue associated with this
-	 * entity (only a queue can happen to have the backshifted
-	 * flag set): just enough to let the finish timestamp of the
-	 * queue be equal to the current value of the system virtual
-	 * time. This may introduce a little unfairness among queues
-	 * with backshifted timestamps, but it does not break
-	 * worst-case fairness guarantees.
-	 *
-	 * As a special case, if bfqq is weight-raised, push up
-	 * timestamps much less, to keep very low the probability that
-	 * this push up causes the backshifted finish timestamps of
-	 * weight-raised queues to become higher than the backshifted
-	 * finish timestamps of non weight-raised queues.
-	 */
-	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
-		unsigned long delta = st->vtime - entity->finish;
-
-		if (bfqq)
-			delta /= bfqq->wr_coeff;
-
-		entity->start += delta;
-		entity->finish += delta;
-
-		if (bfqq) {
-			bfq_log_bfqq(bfqq->bfqd, bfqq,
-				     "__activate_entity: new queue finish %llu",
-				     ((entity->finish>>10)*1000)>>12);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		} else {
-			struct bfq_group *bfqg =
-				container_of(entity, struct bfq_group, entity);
-
-			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-				     "__activate_entity: new group finish %llu",
-				     ((entity->finish>>10)*1000)>>12);
-#endif
-		}
-	}
-
-	bfq_active_insert(st, entity);
-
-	if (bfqq) {
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			"__activate_entity: queue %seligible in st %p",
-			     entity->start <= st->vtime ? "" : "non ", st);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	} else {
-		struct bfq_group *bfqg =
-			container_of(entity, struct bfq_group, entity);
-
-		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-			"__activate_entity: group %seligible in st %p",
-			     entity->start <= st->vtime ? "" : "non ", st);
-#endif
-	}
-	BUG_ON(RB_EMPTY_ROOT(&st->active));
-	BUG_ON(&st->active != &sd->service_tree->active &&
-	       &st->active != &(sd->service_tree+1)->active &&
-	       &st->active != &(sd->service_tree+2)->active);
-}
-
-/**
- * __bfq_activate_entity - handle activation of entity.
- * @entity: the entity being activated.
- * @non_blocking_wait_rq: true if entity was waiting for a request
- *
- * Called for a 'true' activation, i.e., if entity is not active and
- * one of its children receives a new request.
- *
- * Basically, this function updates the timestamps of entity and
- * inserts entity into its active tree, ater possibly extracting it
- * from its idle tree.
- */
-static void __bfq_activate_entity(struct bfq_entity *entity,
-				  bool non_blocking_wait_rq)
-{
-	struct bfq_sched_data *sd = entity->sched_data;
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	bool backshifted = false;
-	unsigned long long min_vstart;
-
-	BUG_ON(!sd);
-	BUG_ON(!st);
-
-	/* See comments on bfq_fqq_update_budg_for_activation */
-	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
-		backshifted = true;
-		min_vstart = entity->finish;
-	} else
-		min_vstart = st->vtime;
-
-	if (entity->tree == &st->idle) {
-		/*
-		 * Must be on the idle tree, bfq_idle_extract() will
-		 * check for that.
-		 */
-		bfq_idle_extract(st, entity);
-		BUG_ON(entity->tree);
-		entity->start = bfq_gt(min_vstart, entity->finish) ?
-			min_vstart : entity->finish;
-	} else {
-		BUG_ON(entity->tree);
-		/*
-		 * The finish time of the entity may be invalid, and
-		 * it is in the past for sure, otherwise the queue
-		 * would have been on the idle tree.
-		 */
-		entity->start = min_vstart;
-		st->wsum += entity->weight;
-		/*
-		 * entity is about to be inserted into a service tree,
-		 * and then set in service: get a reference to make
-		 * sure entity does not disappear until it is no
-		 * longer in service or scheduled for service.
-		 */
-		bfq_get_entity(entity);
-
-		BUG_ON(entity->on_st && bfqq);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		if (entity->on_st && !bfqq) {
-			struct bfq_group *bfqg =
-				container_of(entity, struct bfq_group,
-					     entity);
-
-			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd,
-				     bfqg,
-				     "activate bug, class %d in_service %p",
-				     bfq_class_idx(entity), sd->in_service_entity);
-		}
-#endif
-		BUG_ON(entity->on_st && !bfqq);
-		entity->on_st = true;
-	}
-
-	bfq_update_fin_time_enqueue(entity, st, backshifted);
-}
-
-/**
- * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
- * @entity: the entity being requeued or repositioned.
- *
- * Requeueing is needed if this entity stops being served, which
- * happens if a leaf descendant entity has expired. On the other hand,
- * repositioning is needed if the next_inservice_entity for the child
- * entity has changed. See the comments inside the function for
- * details.
- *
- * Basically, this function: 1) removes entity from its active tree if
- * present there, 2) updates the timestamps of entity and 3) inserts
- * entity back into its active tree (in the new, right position for
- * the new values of the timestamps).
- */
-static void __bfq_requeue_entity(struct bfq_entity *entity)
-{
-	struct bfq_sched_data *sd = entity->sched_data;
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-
-	BUG_ON(!sd);
-	BUG_ON(!st);
-
-	BUG_ON(entity != sd->in_service_entity &&
-	       entity->tree != &st->active);
-
-	if (entity == sd->in_service_entity) {
-		/*
-		 * We are requeueing the current in-service entity,
-		 * which may have to be done for one of the following
-		 * reasons:
-		 * - entity represents the in-service queue, and the
-		 *   in-service queue is being requeued after an
-		 *   expiration;
-		 * - entity represents a group, and its budget has
-		 *   changed because one of its child entities has
-		 *   just been either activated or requeued for some
-		 *   reason; the timestamps of the entity need then to
-		 *   be updated, and the entity needs to be enqueued
-		 *   or repositioned accordingly.
-		 *
-		 * In particular, before requeueing, the start time of
-		 * the entity must be moved forward to account for the
-		 * service that the entity has received while in
-		 * service. This is done by the next instructions. The
-		 * finish time will then be updated according to this
-		 * new value of the start time, and to the budget of
-		 * the entity.
-		 */
-		bfq_calc_finish(entity, entity->service);
-		entity->start = entity->finish;
-		BUG_ON(entity->tree && entity->tree == &st->idle);
-		BUG_ON(entity->tree && entity->tree != &st->active);
-		/*
-		 * In addition, if the entity had more than one child
-		 * when set in service, then it was not extracted from
-		 * the active tree. This implies that the position of
-		 * the entity in the active tree may need to be
-		 * changed now, because we have just updated the start
-		 * time of the entity, and we will update its finish
-		 * time in a moment (the requeueing is then, more
-		 * precisely, a repositioning in this case). To
-		 * implement this repositioning, we: 1) dequeue the
-		 * entity here, 2) update the finish time and requeue
-		 * the entity according to the new timestamps below.
-		 */
-		if (entity->tree)
-			bfq_active_extract(st, entity);
-	} else { /* The entity is already active, and not in service */
-		/*
-		 * In this case, this function gets called only if the
-		 * next_in_service entity below this entity has
-		 * changed, and this change has caused the budget of
-		 * this entity to change, which, finally implies that
-		 * the finish time of this entity must be
-		 * updated. Such an update may cause the scheduling,
-		 * i.e., the position in the active tree, of this
-		 * entity to change. We handle this change by: 1)
-		 * dequeueing the entity here, 2) updating the finish
-		 * time and requeueing the entity according to the new
-		 * timestamps below. This is the same approach as the
-		 * non-extracted-entity sub-case above.
-		 */
-		bfq_active_extract(st, entity);
-	}
-
-	bfq_update_fin_time_enqueue(entity, st, false);
-}
-
-static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
-					  struct bfq_sched_data *sd,
-					  bool non_blocking_wait_rq)
-{
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-
-	if (sd->in_service_entity == entity || entity->tree == &st->active)
-		 /*
-		  * in service or already queued on the active tree,
-		  * requeue or reposition
-		  */
-		__bfq_requeue_entity(entity);
-	else
-		/*
-		 * Not in service and not queued on its active tree:
-		 * the activity is idle and this is a true activation.
-		 */
-		__bfq_activate_entity(entity, non_blocking_wait_rq);
-}
-
-
-/**
- * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue,
- *			 	 and activate, requeue or reposition all ancestors
- *			 	 for which such an update becomes necessary.
- * @entity: the entity to activate.
- * @non_blocking_wait_rq: true if this entity was waiting for a request
- * @requeue: true if this is a requeue, which implies that bfqq is
- *	     being expired; thus ALL its ancestors stop being served and must
- *	     therefore be requeued
- */
-static void bfq_activate_requeue_entity(struct bfq_entity *entity,
-					bool non_blocking_wait_rq,
-					bool requeue)
-{
-	struct bfq_sched_data *sd;
-
-	for_each_entity(entity) {
-		BUG_ON(!entity);
-		sd = entity->sched_data;
-		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
-
-		BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) &&
-		       RB_EMPTY_ROOT(&(sd->service_tree+1)->active) &&
-		       RB_EMPTY_ROOT(&(sd->service_tree+2)->active));
-
-		if (!bfq_update_next_in_service(sd, entity) && !requeue) {
-			BUG_ON(!sd->next_in_service);
-			break;
-		}
-		BUG_ON(!sd->next_in_service);
-	}
-}
-
-/**
- * __bfq_deactivate_entity - deactivate an entity from its service tree.
- * @entity: the entity to deactivate.
- * @ins_into_idle_tree: if false, the entity will not be put into the
- *			idle tree.
- *
- * Deactivates an entity, independently of its previous state.  Must
- * be invoked only if entity is on a service tree. Extracts the entity
- * from that tree, and if necessary and allowed, puts it into the idle
- * tree.
- */
-static bool __bfq_deactivate_entity(struct bfq_entity *entity,
-				    bool ins_into_idle_tree)
-{
-	struct bfq_sched_data *sd = entity->sched_data;
-	struct bfq_service_tree *st;
-	bool is_in_service;
-
-	if (!entity->on_st) { /* entity never activated, or already inactive */
-		BUG_ON(sd && entity == sd->in_service_entity);
-		return false;
-	}
-
-	/*
-	 * If we get here, then entity is active, which implies that
-	 * bfq_group_set_parent has already been invoked for the group
-	 * represented by entity. Therefore, the field
-	 * entity->sched_data has been set, and we can safely use it.
-	 */
-	st = bfq_entity_service_tree(entity);
-	is_in_service = entity == sd->in_service_entity;
-
-	BUG_ON(is_in_service && entity->tree && entity->tree != &st->active);
-
-	if (is_in_service) {
-		bfq_calc_finish(entity, entity->service);
-		sd->in_service_entity = NULL;
-	}
-
-	if (entity->tree == &st->active)
-		bfq_active_extract(st, entity);
-	else if (!is_in_service && entity->tree == &st->idle)
-		bfq_idle_extract(st, entity);
-	else if (entity->tree)
-		BUG();
-
-	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
-		bfq_forget_entity(st, entity, is_in_service);
-	else
-		bfq_idle_insert(st, entity);
-
-	return true;
-}
-
-/**
- * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
- * @entity: the entity to deactivate.
- * @ins_into_idle_tree: true if the entity can be put into the idle tree
- */
-static void bfq_deactivate_entity(struct bfq_entity *entity,
-				  bool ins_into_idle_tree,
-				  bool expiration)
-{
-	struct bfq_sched_data *sd;
-	struct bfq_entity *parent = NULL;
-
-	for_each_entity_safe(entity, parent) {
-		sd = entity->sched_data;
-
-		BUG_ON(sd == NULL); /*
-				     * It would mean that this is the
-				     * root group.
-				     */
-
-		BUG_ON(expiration && entity != sd->in_service_entity);
-
-		BUG_ON(entity != sd->in_service_entity &&
-		       entity->tree ==
-		       &bfq_entity_service_tree(entity)->active &&
-		       !sd->next_in_service);
-
-		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
-			/*
-			 * entity is not in any tree any more, so
-			 * this deactivation is a no-op, and there is
-			 * nothing to change for upper-level entities
-			 * (in case of expiration, this can never
-			 * happen).
-			 */
-			BUG_ON(expiration); /*
-					     * entity cannot be already out of
-					     * any tree
-					     */
-			return;
-		}
-
-		if (sd->next_in_service == entity)
-			/*
-			 * entity was the next_in_service entity,
-			 * then, since entity has just been
-			 * deactivated, a new one must be found.
-			 */
-			bfq_update_next_in_service(sd, NULL);
-
-		if (sd->next_in_service || sd->in_service_entity) {
-			/*
-			 * The parent entity is still active, because
-			 * either next_in_service or in_service_entity
-			 * is not NULL. So, no further upwards
-			 * deactivation must be performed.  Yet,
-			 * next_in_service has changed.  Then the
-			 * schedule does need to be updated upwards.
-			 *
-			 * NOTE If in_service_entity is not NULL, then
-			 * next_in_service may happen to be NULL,
-			 * although the parent entity is evidently
-			 * active. This happens if 1) the entity
-			 * pointed by in_service_entity is the only
-			 * active entity in the parent entity, and 2)
-			 * according to the definition of
-			 * next_in_service, the in_service_entity
-			 * cannot be considered as
-			 * next_in_service. See the comments on the
-			 * definition of next_in_service for details.
-			 */
-			BUG_ON(sd->next_in_service == entity);
-			BUG_ON(sd->in_service_entity == entity);
-			break;
-		}
-
-		/*
-		 * If we get here, then the parent is no more
-		 * backlogged and we need to propagate the
-		 * deactivation upwards. Thus let the loop go on.
-		 */
-
-		/*
-		 * Also let parent be queued into the idle tree on
-		 * deactivation, to preserve service guarantees, and
-		 * assuming that who invoked this function does not
-		 * need parent entities too to be removed completely.
-		 */
-		ins_into_idle_tree = true;
-	}
-
-	/*
-	 * If the deactivation loop is fully executed, then there are
-	 * no more entities to touch and next loop is not executed at
-	 * all. Otherwise, requeue remaining entities if they are
-	 * about to stop receiving service, or reposition them if this
-	 * is not the case.
-	 */
-	entity = parent;
-	for_each_entity(entity) {
-		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-		/*
-		 * Invoke __bfq_requeue_entity on entity, even if
-		 * already active, to requeue/reposition it in the
-		 * active tree (because sd->next_in_service has
-		 * changed)
-		 */
-		__bfq_requeue_entity(entity);
-
-		sd = entity->sched_data;
-		BUG_ON(expiration && sd->in_service_entity != entity);
-
-		if (bfqq)
-			bfq_log_bfqq(bfqq->bfqd, bfqq,
-				     "invoking udpdate_next for this queue");
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		else {
-			struct bfq_group *bfqg =
-				container_of(entity,
-					     struct bfq_group, entity);
-
-			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-				     "invoking udpdate_next for this entity");
-		}
-#endif
-		if (!bfq_update_next_in_service(sd, entity) &&
-		    !expiration)
-			/*
-			 * next_in_service unchanged or not causing
-			 * any change in entity->parent->sd, and no
-			 * requeueing needed for expiration: stop
-			 * here.
-			 */
-			break;
-	}
-}
-
-/**
- * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
- *                       if needed, to have at least one entity eligible.
- * @st: the service tree to act upon.
- *
- * Assumes that st is not empty.
- */
-static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
-{
-	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
-
-	if (bfq_gt(root_entity->min_start, st->vtime)) {
-		struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity);
-
-		if (bfqq)
-			bfq_log_bfqq(bfqq->bfqd, bfqq,
-				     "calc_vtime_jump: new value %llu",
-				     root_entity->min_start);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		else {
-			struct bfq_group *bfqg =
-				container_of(root_entity, struct bfq_group,
-					     entity);
-
-			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-				     "calc_vtime_jump: new value %llu",
-				     root_entity->min_start);
-		}
-#endif
-		return root_entity->min_start;
-	}
-	return st->vtime;
-}
-
-static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
-{
-	if (new_value > st->vtime) {
-		st->vtime = new_value;
-		bfq_forget_idle(st);
-	}
-}
-
-/**
- * bfq_first_active_entity - find the eligible entity with
- *                           the smallest finish time
- * @st: the service tree to select from.
- * @vtime: the system virtual to use as a reference for eligibility
- *
- * This function searches the first schedulable entity, starting from the
- * root of the tree and going on the left every time on this side there is
- * a subtree with at least one eligible (start >= vtime) entity. The path on
- * the right is followed only if a) the left subtree contains no eligible
- * entities and b) no eligible entity has been found yet.
- */
-static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
-						  u64 vtime)
-{
-	struct bfq_entity *entry, *first = NULL;
-	struct rb_node *node = st->active.rb_node;
-
-	while (node) {
-		entry = rb_entry(node, struct bfq_entity, rb_node);
-left:
-		if (!bfq_gt(entry->start, vtime))
-			first = entry;
-
-		BUG_ON(bfq_gt(entry->min_start, vtime));
-
-		if (node->rb_left) {
-			entry = rb_entry(node->rb_left,
-					 struct bfq_entity, rb_node);
-			if (!bfq_gt(entry->min_start, vtime)) {
-				node = node->rb_left;
-				goto left;
-			}
-		}
-		if (first)
-			break;
-		node = node->rb_right;
-	}
-
-	BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
-	return first;
-}
-
-/**
- * __bfq_lookup_next_entity - return the first eligible entity in @st.
- * @st: the service tree.
- *
- * If there is no in-service entity for the sched_data st belongs to,
- * then return the entity that will be set in service if:
- * 1) the parent entity this st belongs to is set in service;
- * 2) no entity belonging to such parent entity undergoes a state change
- * that would influence the timestamps of the entity (e.g., becomes idle,
- * becomes backlogged, changes its budget, ...).
- *
- * In this first case, update the virtual time in @st too (see the
- * comments on this update inside the function).
- *
- * In constrast, if there is an in-service entity, then return the
- * entity that would be set in service if not only the above
- * conditions, but also the next one held true: the currently
- * in-service entity, on expiration,
- * 1) gets a finish time equal to the current one, or
- * 2) is not eligible any more, or
- * 3) is idle.
- */
-static struct bfq_entity *
-__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service
-#if 0
-			 , bool force
-#endif
-	)
-{
-	struct bfq_entity *entity
-#if 0
-		, *new_next_in_service = NULL
-#endif
-		;
-	u64 new_vtime;
-	struct bfq_queue *bfqq;
-
-	if (RB_EMPTY_ROOT(&st->active))
-		return NULL;
-
-	/*
-	 * Get the value of the system virtual time for which at
-	 * least one entity is eligible.
-	 */
-	new_vtime = bfq_calc_vtime_jump(st);
-
-	/*
-	 * If there is no in-service entity for the sched_data this
-	 * active tree belongs to, then push the system virtual time
-	 * up to the value that guarantees that at least one entity is
-	 * eligible. If, instead, there is an in-service entity, then
-	 * do not make any such update, because there is already an
-	 * eligible entity, namely the in-service one (even if the
-	 * entity is not on st, because it was extracted when set in
-	 * service).
-	 */
-	if (!in_service)
-		bfq_update_vtime(st, new_vtime);
-
-	entity = bfq_first_active_entity(st, new_vtime);
-	BUG_ON(bfq_gt(entity->start, new_vtime));
-
-	/* Log some information */
-	bfqq = bfq_entity_to_bfqq(entity);
-	if (bfqq)
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			     "__lookup_next: start %llu vtime %llu st %p",
-			     ((entity->start>>10)*1000)>>12,
-			     ((new_vtime>>10)*1000)>>12, st);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else {
-		struct bfq_group *bfqg =
-			container_of(entity, struct bfq_group, entity);
-
-		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-			     "__lookup_next: start %llu vtime %llu st %p",
-			     ((entity->start>>10)*1000)>>12,
-			     ((new_vtime>>10)*1000)>>12, st);
-	}
-#endif
-
-	BUG_ON(!entity);
-
-	return entity;
-}
-
-/**
- * bfq_lookup_next_entity - return the first eligible entity in @sd.
- * @sd: the sched_data.
- *
- * This function is invoked when there has been a change in the trees
- * for sd, and we need know what is the new next entity after this
- * change.
- */
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
-{
-	struct bfq_service_tree *st = sd->service_tree;
-	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
-	struct bfq_entity *entity = NULL;
-	struct bfq_queue *bfqq;
-	int class_idx = 0;
-
-	BUG_ON(!sd);
-	BUG_ON(!st);
-	/*
-	 * Choose from idle class, if needed to guarantee a minimum
-	 * bandwidth to this class (and if there is some active entity
-	 * in idle class). This should also mitigate
-	 * priority-inversion problems in case a low priority task is
-	 * holding file system resources.
-	 */
-	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
-				   BFQ_CL_IDLE_TIMEOUT)) {
-		if (!RB_EMPTY_ROOT(&idle_class_st->active))
-			class_idx = BFQ_IOPRIO_CLASSES - 1;
-		/* About to be served if backlogged, or not yet backlogged */
-		sd->bfq_class_idle_last_service = jiffies;
-	}
-
-	/*
-	 * Find the next entity to serve for the highest-priority
-	 * class, unless the idle class needs to be served.
-	 */
-	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
-		entity = __bfq_lookup_next_entity(st + class_idx,
-						  sd->in_service_entity);
-
-		if (entity)
-			break;
-	}
-
-	BUG_ON(!entity &&
-	       (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) ||
-		!RB_EMPTY_ROOT(&(st+2)->active)));
-
-	if (!entity)
-		return NULL;
-
-	/* Log some information */
-	bfqq = bfq_entity_to_bfqq(entity);
-	if (bfqq)
-		bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d",
-			     st + class_idx, class_idx);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else {
-		struct bfq_group *bfqg =
-			container_of(entity, struct bfq_group, entity);
-
-		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-			     "chosen from st %p %d",
-			     st + class_idx, class_idx);
-	}
-#endif
-
-	return entity;
-}
-
-static bool next_queue_may_preempt(struct bfq_data *bfqd)
-{
-	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
-
-	return sd->next_in_service != sd->in_service_entity;
-}
-
-/*
- * Get next queue for service.
- */
-static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
-{
-	struct bfq_entity *entity = NULL;
-	struct bfq_sched_data *sd;
-	struct bfq_queue *bfqq;
-
-	BUG_ON(bfqd->in_service_queue);
-
-	if (bfqd->busy_queues == 0)
-		return NULL;
-
-	/*
-	 * Traverse the path from the root to the leaf entity to
-	 * serve. Set in service all the entities visited along the
-	 * way.
-	 */
-	sd = &bfqd->root_group->sched_data;
-	for (; sd ; sd = entity->my_sched_data) {
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		if (entity) {
-			struct bfq_group *bfqg =
-				container_of(entity, struct bfq_group, entity);
-
-			bfq_log_bfqg(bfqd, bfqg,
-				     "get_next_queue: lookup in this group");
-			if (!sd->next_in_service)
-				pr_crit("get_next_queue: lookup in this group");
-		} else {
-			bfq_log_bfqg(bfqd, bfqd->root_group,
-				     "get_next_queue: lookup in root group");
-			if (!sd->next_in_service)
-				pr_crit("get_next_queue: lookup in root group");
-		}
-#endif
-
-		BUG_ON(!sd->next_in_service);
-
-		/*
-		 * WARNING. We are about to set the in-service entity
-		 * to sd->next_in_service, i.e., to the (cached) value
-		 * returned by bfq_lookup_next_entity(sd) the last
-		 * time it was invoked, i.e., the last time when the
-		 * service order in sd changed as a consequence of the
-		 * activation or deactivation of an entity. In this
-		 * respect, if we execute bfq_lookup_next_entity(sd)
-		 * in this very moment, it may, although with low
-		 * probability, yield a different entity than that
-		 * pointed to by sd->next_in_service. This rare event
-		 * happens in case there was no CLASS_IDLE entity to
-		 * serve for sd when bfq_lookup_next_entity(sd) was
-		 * invoked for the last time, while there is now one
-		 * such entity.
-		 *
-		 * If the above event happens, then the scheduling of
-		 * such entity in CLASS_IDLE is postponed until the
-		 * service of the sd->next_in_service entity
-		 * finishes. In fact, when the latter is expired,
-		 * bfq_lookup_next_entity(sd) gets called again,
-		 * exactly to update sd->next_in_service.
-		 */
-
-		/* Make next_in_service entity become in_service_entity */
-		entity = sd->next_in_service;
-		sd->in_service_entity = entity;
-
-		/*
-		 * Reset the accumulator of the amount of service that
-		 * the entity is about to receive.
-		 */
-		entity->service = 0;
-
-		/*
-		 * If entity is no longer a candidate for next
-		 * service, then it must be extracted from its active
-		 * tree, so as to make sure that it won't be
-		 * considered when computing next_in_service. See the
-		 * comments on the function
-		 * bfq_no_longer_next_in_service() for details.
-		 */
-		if (bfq_no_longer_next_in_service(entity))
-			bfq_active_extract(bfq_entity_service_tree(entity),
-					   entity);
-
-		/*
-		 * Even if entity is not to be extracted according to
-		 * the above check, a descendant entity may get
-		 * extracted in one of the next iterations of this
-		 * loop. Such an event could cause a change in
-		 * next_in_service for the level of the descendant
-		 * entity, and thus possibly back to this level.
-		 *
-		 * However, we cannot perform the resulting needed
-		 * update of next_in_service for this level before the
-		 * end of the whole loop, because, to know which is
-		 * the correct next-to-serve candidate entity for each
-		 * level, we need first to find the leaf entity to set
-		 * in service. In fact, only after we know which is
-		 * the next-to-serve leaf entity, we can discover
-		 * whether the parent entity of the leaf entity
-		 * becomes the next-to-serve, and so on.
-		 */
-
-		/* Log some information */
-		bfqq = bfq_entity_to_bfqq(entity);
-		if (bfqq)
-			bfq_log_bfqq(bfqd, bfqq,
-			     "get_next_queue: this queue, finish %llu",
-				(((entity->finish>>10)*1000)>>10)>>2);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		else {
-			struct bfq_group *bfqg =
-				container_of(entity, struct bfq_group, entity);
-
-			bfq_log_bfqg(bfqd, bfqg,
-			     "get_next_queue: this entity, finish %llu",
-				(((entity->finish>>10)*1000)>>10)>>2);
-		}
-#endif
-
-	}
-
-	BUG_ON(!entity);
-	bfqq = bfq_entity_to_bfqq(entity);
-	BUG_ON(!bfqq);
-
-	/*
-	 * We can finally update all next-to-serve entities along the
-	 * path from the leaf entity just set in service to the root.
-	 */
-	for_each_entity(entity) {
-		struct bfq_sched_data *sd = entity->sched_data;
-
-		if(!bfq_update_next_in_service(sd, NULL))
-			break;
-	}
-
-	return bfqq;
-}
-
-static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
-{
-	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
-	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
-	struct bfq_entity *entity = in_serv_entity;
-
-	if (bfqd->in_service_bic) {
-		put_io_context(bfqd->in_service_bic->icq.ioc);
-		bfqd->in_service_bic = NULL;
-	}
-
-	bfq_clear_bfqq_wait_request(in_serv_bfqq);
-	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
-	bfqd->in_service_queue = NULL;
-
-	/*
-	 * When this function is called, all in-service entities have
-	 * been properly deactivated or requeued, so we can safely
-	 * execute the final step: reset in_service_entity along the
-	 * path from entity to the root.
-	 */
-	for_each_entity(entity)
-		entity->sched_data->in_service_entity = NULL;
-
-	/*
-	 * in_serv_entity is no longer in service, so, if it is in no
-	 * service tree either, then release the service reference to
-	 * the queue it represents (taken with bfq_get_entity).
-	 */
-	if (!in_serv_entity->on_st)
-		bfq_put_queue(in_serv_bfqq);
-}
-
-static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-				bool ins_into_idle_tree, bool expiration)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
-}
-
-static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-
-	BUG_ON(bfqq == bfqd->in_service_queue);
-	BUG_ON(entity->tree != &st->active && entity->tree != &st->idle &&
-	       entity->on_st);
-
-	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
-				    false);
-	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
-}
-
-static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	bfq_activate_requeue_entity(entity, false,
-				    bfqq == bfqd->in_service_queue);
-}
-
-static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
-
-/*
- * Called when the bfqq no longer has requests pending, remove it from
- * the service tree. As a special case, it can be invoked during an
- * expiration.
- */
-static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			      bool expiration)
-{
-	BUG_ON(!bfq_bfqq_busy(bfqq));
-	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
-
-	bfq_log_bfqq(bfqd, bfqq, "del from busy");
-
-	bfq_clear_bfqq_busy(bfqq);
-
-	BUG_ON(bfqd->busy_queues == 0);
-	bfqd->busy_queues--;
-
-	if (!bfqq->dispatched)
-		bfq_weights_tree_remove(bfqd, &bfqq->entity,
-					&bfqd->queue_weights_tree);
-
-	if (bfqq->wr_coeff > 1) {
-		bfqd->wr_busy_queues--;
-		BUG_ON(bfqd->wr_busy_queues < 0);
-	}
-
-	bfqg_stats_update_dequeue(bfqq_group(bfqq));
-
-	BUG_ON(bfqq->entity.budget < 0);
-
-	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
-}
-
-/*
- * Called when an inactive queue receives a new request.
- */
-static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	BUG_ON(bfq_bfqq_busy(bfqq));
-	BUG_ON(bfqq == bfqd->in_service_queue);
-
-	bfq_log_bfqq(bfqd, bfqq, "add to busy");
-
-	bfq_activate_bfqq(bfqd, bfqq);
-
-	bfq_mark_bfqq_busy(bfqq);
-	bfqd->busy_queues++;
-
-	if (!bfqq->dispatched)
-		if (bfqq->wr_coeff == 1)
-			bfq_weights_tree_add(bfqd, &bfqq->entity,
-					     &bfqd->queue_weights_tree);
-
-	if (bfqq->wr_coeff > 1) {
-		bfqd->wr_busy_queues++;
-		BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues);
-	}
-
-}
diff --git a/block/bfq.h b/block/bfq.h
deleted file mode 100644
index e35bf89b09f3..000000000000
--- a/block/bfq.h
+++ /dev/null
@@ -1,946 +0,0 @@
-/*
- * BFQ v8r12 for 4.9.0: data structures and common functions prototypes.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
- *
- * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
- *		      Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
- */
-
-#ifndef _BFQ_H
-#define _BFQ_H
-
-#include <linux/blktrace_api.h>
-#include <linux/hrtimer.h>
-#include <linux/ioprio.h>
-#include <linux/rbtree.h>
-#include <linux/blk-cgroup.h>
-
-#define BFQ_IOPRIO_CLASSES	3
-#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)
-
-#define BFQ_MIN_WEIGHT			1
-#define BFQ_MAX_WEIGHT			1000
-#define BFQ_WEIGHT_CONVERSION_COEFF	10
-
-#define BFQ_DEFAULT_QUEUE_IOPRIO	4
-
-#define BFQ_WEIGHT_LEGACY_DFL	100
-#define BFQ_DEFAULT_GRP_IOPRIO	0
-#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
-
-/*
- * Soft real-time applications are extremely more latency sensitive
- * than interactive ones. Over-raise the weight of the former to
- * privilege them against the latter.
- */
-#define BFQ_SOFTRT_WEIGHT_FACTOR	100
-
-struct bfq_entity;
-
-/**
- * struct bfq_service_tree - per ioprio_class service tree.
- *
- * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
- * ioprio_class has its own independent scheduler, and so its own
- * bfq_service_tree.  All the fields are protected by the queue lock
- * of the containing bfqd.
- */
-struct bfq_service_tree {
-	/* tree for active entities (i.e., those backlogged) */
-	struct rb_root active;
-	/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
-	struct rb_root idle;
-
-	struct bfq_entity *first_idle;	/* idle entity with minimum F_i */
-	struct bfq_entity *last_idle;	/* idle entity with maximum F_i */
-
-	u64 vtime; /* scheduler virtual time */
-	/* scheduler weight sum; active and idle entities contribute to it */
-	unsigned long wsum;
-};
-
-/**
- * struct bfq_sched_data - multi-class scheduler.
- *
- * bfq_sched_data is the basic scheduler queue.  It supports three
- * ioprio_classes, and can be used either as a toplevel queue or as an
- * intermediate queue in a hierarchical setup.
- *
- * The supported ioprio_classes are the same as in CFQ, in descending
- * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
- * Requests from higher priority queues are served before all the
- * requests from lower priority queues; among requests of the same
- * queue requests are served according to B-WF2Q+.
- *
- * The schedule is implemented by the service trees, plus the field
- * @next_in_service, which points to the entity on the active trees
- * that will be served next, if 1) no changes in the schedule occurs
- * before the current in-service entity is expired, 2) the in-service
- * queue becomes idle when it expires, and 3) if the entity pointed by
- * in_service_entity is not a queue, then the in-service child entity
- * of the entity pointed by in_service_entity becomes idle on
- * expiration. This peculiar definition allows for the following
- * optimization, not yet exploited: while a given entity is still in
- * service, we already know which is the best candidate for next
- * service among the other active entitities in the same parent
- * entity. We can then quickly compare the timestamps of the
- * in-service entity with those of such best candidate.
- *
- * All the fields are protected by the queue lock of the containing
- * bfqd.
- */
-struct bfq_sched_data {
-	struct bfq_entity *in_service_entity;  /* entity in service */
-	/* head-of-the-line entity in the scheduler (see comments above) */
-	struct bfq_entity *next_in_service;
-	/* array of service trees, one per ioprio_class */
-	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
-	/* last time CLASS_IDLE was served */
-	unsigned long bfq_class_idle_last_service;
-
-};
-
-/**
- * struct bfq_weight_counter - counter of the number of all active entities
- *                             with a given weight.
- */
-struct bfq_weight_counter {
-	unsigned int weight; /* weight of the entities this counter refers to */
-	unsigned int num_active; /* nr of active entities with this weight */
-	/*
-	 * Weights tree member (see bfq_data's @queue_weights_tree and
-	 * @group_weights_tree)
-	 */
-	struct rb_node weights_node;
-};
-
-/**
- * struct bfq_entity - schedulable entity.
- *
- * A bfq_entity is used to represent either a bfq_queue (leaf node in the
- * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
- * entity belongs to the sched_data of the parent group in the cgroup
- * hierarchy.  Non-leaf entities have also their own sched_data, stored
- * in @my_sched_data.
- *
- * Each entity stores independently its priority values; this would
- * allow different weights on different devices, but this
- * functionality is not exported to userspace by now.  Priorities and
- * weights are updated lazily, first storing the new values into the
- * new_* fields, then setting the @prio_changed flag.  As soon as
- * there is a transition in the entity state that allows the priority
- * update to take place the effective and the requested priority
- * values are synchronized.
- *
- * Unless cgroups are used, the weight value is calculated from the
- * ioprio to export the same interface as CFQ.  When dealing with
- * ``well-behaved'' queues (i.e., queues that do not spend too much
- * time to consume their budget and have true sequential behavior, and
- * when there are no external factors breaking anticipation) the
- * relative weights at each level of the cgroups hierarchy should be
- * guaranteed.  All the fields are protected by the queue lock of the
- * containing bfqd.
- */
-struct bfq_entity {
-	struct rb_node rb_node; /* service_tree member */
-	/* pointer to the weight counter associated with this entity */
-	struct bfq_weight_counter *weight_counter;
-
-	/*
-	 * Flag, true if the entity is on a tree (either the active or
-	 * the idle one of its service_tree) or is in service.
-	 */
-	bool on_st;
-
-	u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */
-	u64 start;  /* B-WF2Q+ start timestamp (aka S_i) */
-
-	/* tree the entity is enqueued into; %NULL if not on a tree */
-	struct rb_root *tree;
-
-	/*
-	 * minimum start time of the (active) subtree rooted at this
-	 * entity; used for O(log N) lookups into active trees
-	 */
-	u64 min_start;
-
-	/* amount of service received during the last service slot */
-	int service;
-
-	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
-	int budget;
-
-	unsigned int weight;	 /* weight of the queue */
-	unsigned int new_weight; /* next weight if a change is in progress */
-
-	/* original weight, used to implement weight boosting */
-	unsigned int orig_weight;
-
-	/* parent entity, for hierarchical scheduling */
-	struct bfq_entity *parent;
-
-	/*
-	 * For non-leaf nodes in the hierarchy, the associated
-	 * scheduler queue, %NULL on leaf nodes.
-	 */
-	struct bfq_sched_data *my_sched_data;
-	/* the scheduler queue this entity belongs to */
-	struct bfq_sched_data *sched_data;
-
-	/* flag, set to request a weight, ioprio or ioprio_class change  */
-	int prio_changed;
-};
-
-struct bfq_group;
-
-/**
- * struct bfq_queue - leaf schedulable entity.
- *
- * A bfq_queue is a leaf request queue; it can be associated with an
- * io_context or more, if it  is  async or shared  between  cooperating
- * processes. @cgroup holds a reference to the cgroup, to be sure that it
- * does not disappear while a bfqq still references it (mostly to avoid
- * races between request issuing and task migration followed by cgroup
- * destruction).
- * All the fields are protected by the queue lock of the containing bfqd.
- */
-struct bfq_queue {
-	/* reference counter */
-	int ref;
-	/* parent bfq_data */
-	struct bfq_data *bfqd;
-
-	/* current ioprio and ioprio class */
-	unsigned short ioprio, ioprio_class;
-	/* next ioprio and ioprio class if a change is in progress */
-	unsigned short new_ioprio, new_ioprio_class;
-
-	/*
-	 * Shared bfq_queue if queue is cooperating with one or more
-	 * other queues.
-	 */
-	struct bfq_queue *new_bfqq;
-	/* request-position tree member (see bfq_group's @rq_pos_tree) */
-	struct rb_node pos_node;
-	/* request-position tree root (see bfq_group's @rq_pos_tree) */
-	struct rb_root *pos_root;
-
-	/* sorted list of pending requests */
-	struct rb_root sort_list;
-	/* if fifo isn't expired, next request to serve */
-	struct request *next_rq;
-	/* number of sync and async requests queued */
-	int queued[2];
-	/* number of sync and async requests currently allocated */
-	int allocated[2];
-	/* number of pending metadata requests */
-	int meta_pending;
-	/* fifo list of requests in sort_list */
-	struct list_head fifo;
-
-	/* entity representing this queue in the scheduler */
-	struct bfq_entity entity;
-
-	/* maximum budget allowed from the feedback mechanism */
-	int max_budget;
-	/* budget expiration (in jiffies) */
-	unsigned long budget_timeout;
-
-	/* number of requests on the dispatch list or inside driver */
-	int dispatched;
-
-	unsigned int flags; /* status flags.*/
-
-	/* node for active/idle bfqq list inside parent bfqd */
-	struct list_head bfqq_list;
-
-	/* bit vector: a 1 for each seeky requests in history */
-	u32 seek_history;
-
-	/* node for the device's burst list */
-	struct hlist_node burst_list_node;
-
-	/* position of the last request enqueued */
-	sector_t last_request_pos;
-
-	/* Number of consecutive pairs of request completion and
-	 * arrival, such that the queue becomes idle after the
-	 * completion, but the next request arrives within an idle
-	 * time slice; used only if the queue's IO_bound flag has been
-	 * cleared.
-	 */
-	unsigned int requests_within_timer;
-
-	/* pid of the process owning the queue, used for logging purposes */
-	pid_t pid;
-
-	/*
-	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
-	 * if the queue is shared.
-	 */
-	struct bfq_io_cq *bic;
-
-	/* current maximum weight-raising time for this queue */
-	unsigned long wr_cur_max_time;
-	/*
-	 * Minimum time instant such that, only if a new request is
-	 * enqueued after this time instant in an idle @bfq_queue with
-	 * no outstanding requests, then the task associated with the
-	 * queue it is deemed as soft real-time (see the comments on
-	 * the function bfq_bfqq_softrt_next_start())
-	 */
-	unsigned long soft_rt_next_start;
-	/*
-	 * Start time of the current weight-raising period if
-	 * the @bfq-queue is being weight-raised, otherwise
-	 * finish time of the last weight-raising period.
-	 */
-	unsigned long last_wr_start_finish;
-	/* factor by which the weight of this queue is multiplied */
-	unsigned int wr_coeff;
-	/*
-	 * Time of the last transition of the @bfq_queue from idle to
-	 * backlogged.
-	 */
-	unsigned long last_idle_bklogged;
-	/*
-	 * Cumulative service received from the @bfq_queue since the
-	 * last transition from idle to backlogged.
-	 */
-	unsigned long service_from_backlogged;
-	/*
-	 * Value of wr start time when switching to soft rt
-	 */
-	unsigned long wr_start_at_switch_to_srt;
-
-	unsigned long split_time; /* time of last split */
-};
-
-/**
- * struct bfq_ttime - per process thinktime stats.
- */
-struct bfq_ttime {
-	u64 last_end_request; /* completion time of last request */
-
-	u64 ttime_total; /* total process thinktime */
-	unsigned long ttime_samples; /* number of thinktime samples */
-	u64 ttime_mean; /* average process thinktime */
-
-};
-
-/**
- * struct bfq_io_cq - per (request_queue, io_context) structure.
- */
-struct bfq_io_cq {
-	/* associated io_cq structure */
-	struct io_cq icq; /* must be the first member */
-	/* array of two process queues, the sync and the async */
-	struct bfq_queue *bfqq[2];
-	/* associated @bfq_ttime struct */
-	struct bfq_ttime ttime;
-	/* per (request_queue, blkcg) ioprio */
-	int ioprio;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	uint64_t blkcg_serial_nr; /* the current blkcg serial */
-#endif
-
-	/*
-	 * Snapshot of the has_short_time flag before merging; taken
-	 * to remember its value while the queue is merged, so as to
-	 * be able to restore it in case of split.
-	 */
-	bool saved_has_short_ttime;
-	/*
-	 * Same purpose as the previous two fields for the I/O bound
-	 * classification of a queue.
-	 */
-	bool saved_IO_bound;
-
-	/*
-	 * Same purpose as the previous fields for the value of the
-	 * field keeping the queue's belonging to a large burst
-	 */
-	bool saved_in_large_burst;
-	/*
-	 * True if the queue belonged to a burst list before its merge
-	 * with another cooperating queue.
-	 */
-	bool was_in_burst_list;
-
-	/*
-	 * Similar to previous fields: save wr information.
-	 */
-	unsigned long saved_wr_coeff;
-	unsigned long saved_last_wr_start_finish;
-	unsigned long saved_wr_start_at_switch_to_srt;
-	unsigned int saved_wr_cur_max_time;
-};
-
-enum bfq_device_speed {
-	BFQ_BFQD_FAST,
-	BFQ_BFQD_SLOW,
-};
-
-/**
- * struct bfq_data - per-device data structure.
- *
- * All the fields are protected by the @queue lock.
- */
-struct bfq_data {
-	/* request queue for the device */
-	struct request_queue *queue;
-
-	/* root bfq_group for the device */
-	struct bfq_group *root_group;
-
-	/*
-	 * rbtree of weight counters of @bfq_queues, sorted by
-	 * weight. Used to keep track of whether all @bfq_queues have
-	 * the same weight. The tree contains one counter for each
-	 * distinct weight associated to some active and not
-	 * weight-raised @bfq_queue (see the comments to the functions
-	 * bfq_weights_tree_[add|remove] for further details).
-	 */
-	struct rb_root queue_weights_tree;
-	/*
-	 * rbtree of non-queue @bfq_entity weight counters, sorted by
-	 * weight. Used to keep track of whether all @bfq_groups have
-	 * the same weight. The tree contains one counter for each
-	 * distinct weight associated to some active @bfq_group (see
-	 * the comments to the functions bfq_weights_tree_[add|remove]
-	 * for further details).
-	 */
-	struct rb_root group_weights_tree;
-
-	/*
-	 * Number of bfq_queues containing requests (including the
-	 * queue in service, even if it is idling).
-	 */
-	int busy_queues;
-	/* number of weight-raised busy @bfq_queues */
-	int wr_busy_queues;
-	/* number of queued requests */
-	int queued;
-	/* number of requests dispatched and waiting for completion */
-	int rq_in_driver;
-
-	/*
-	 * Maximum number of requests in driver in the last
-	 * @hw_tag_samples completed requests.
-	 */
-	int max_rq_in_driver;
-	/* number of samples used to calculate hw_tag */
-	int hw_tag_samples;
-	/* flag set to one if the driver is showing a queueing behavior */
-	int hw_tag;
-
-	/* number of budgets assigned */
-	int budgets_assigned;
-
-	/*
-	 * Timer set when idling (waiting) for the next request from
-	 * the queue in service.
-	 */
-	struct hrtimer idle_slice_timer;
-	/* delayed work to restart dispatching on the request queue */
-	struct work_struct unplug_work;
-
-	/* bfq_queue in service */
-	struct bfq_queue *in_service_queue;
-	/* bfq_io_cq (bic) associated with the @in_service_queue */
-	struct bfq_io_cq *in_service_bic;
-
-	/* on-disk position of the last served request */
-	sector_t last_position;
-
-	/* time of last request completion (ns) */
-	u64 last_completion;
-
-	/* time of first rq dispatch in current observation interval (ns) */
-	u64 first_dispatch;
-	/* time of last rq dispatch in current observation interval (ns) */
-	u64 last_dispatch;
-
-	/* beginning of the last budget */
-	ktime_t last_budget_start;
-	/* beginning of the last idle slice */
-	ktime_t last_idling_start;
-
-	/* number of samples in current observation interval */
-	int peak_rate_samples;
-	/* num of samples of seq dispatches in current observation interval */
-	u32 sequential_samples;
-	/* total num of sectors transferred in current observation interval */
-	u64 tot_sectors_dispatched;
-	/* max rq size seen during current observation interval (sectors) */
-	u32 last_rq_max_size;
-	/* time elapsed from first dispatch in current observ. interval (us) */
-	u64 delta_from_first;
-	/* current estimate of device peak rate */
-	u32 peak_rate;
-
-	/* maximum budget allotted to a bfq_queue before rescheduling */
-	int bfq_max_budget;
-
-	/* list of all the bfq_queues active on the device */
-	struct list_head active_list;
-	/* list of all the bfq_queues idle on the device */
-	struct list_head idle_list;
-
-	/*
-	 * Timeout for async/sync requests; when it fires, requests
-	 * are served in fifo order.
-	 */
-	u64 bfq_fifo_expire[2];
-	/* weight of backward seeks wrt forward ones */
-	unsigned int bfq_back_penalty;
-	/* maximum allowed backward seek */
-	unsigned int bfq_back_max;
-	/* maximum idling time */
-	u32 bfq_slice_idle;
-
-	/* user-configured max budget value (0 for auto-tuning) */
-	int bfq_user_max_budget;
-	/*
-	 * Timeout for bfq_queues to consume their budget; used to
-	 * prevent seeky queues from imposing long latencies to
-	 * sequential or quasi-sequential ones (this also implies that
-	 * seeky queues cannot receive guarantees in the service
-	 * domain; after a timeout they are charged for the time they
-	 * have been in service, to preserve fairness among them, but
-	 * without service-domain guarantees).
-	 */
-	unsigned int bfq_timeout;
-
-	/*
-	 * Number of consecutive requests that must be issued within
-	 * the idle time slice to set again idling to a queue which
-	 * was marked as non-I/O-bound (see the definition of the
-	 * IO_bound flag for further details).
-	 */
-	unsigned int bfq_requests_within_timer;
-
-	/*
-	 * Force device idling whenever needed to provide accurate
-	 * service guarantees, without caring about throughput
-	 * issues. CAVEAT: this may even increase latencies, in case
-	 * of useless idling for processes that did stop doing I/O.
-	 */
-	bool strict_guarantees;
-
-	/*
-	 * Last time at which a queue entered the current burst of
-	 * queues being activated shortly after each other; for more
-	 * details about this and the following parameters related to
-	 * a burst of activations, see the comments on the function
-	 * bfq_handle_burst.
-	 */
-	unsigned long last_ins_in_burst;
-	/*
-	 * Reference time interval used to decide whether a queue has
-	 * been activated shortly after @last_ins_in_burst.
-	 */
-	unsigned long bfq_burst_interval;
-	/* number of queues in the current burst of queue activations */
-	int burst_size;
-
-	/* common parent entity for the queues in the burst */
-	struct bfq_entity *burst_parent_entity;
-	/* Maximum burst size above which the current queue-activation
-	 * burst is deemed as 'large'.
-	 */
-	unsigned long bfq_large_burst_thresh;
-	/* true if a large queue-activation burst is in progress */
-	bool large_burst;
-	/*
-	 * Head of the burst list (as for the above fields, more
-	 * details in the comments on the function bfq_handle_burst).
-	 */
-	struct hlist_head burst_list;
-
-	/* if set to true, low-latency heuristics are enabled */
-	bool low_latency;
-	/*
-	 * Maximum factor by which the weight of a weight-raised queue
-	 * is multiplied.
-	 */
-	unsigned int bfq_wr_coeff;
-	/* maximum duration of a weight-raising period (jiffies) */
-	unsigned int bfq_wr_max_time;
-
-	/* Maximum weight-raising duration for soft real-time processes */
-	unsigned int bfq_wr_rt_max_time;
-	/*
-	 * Minimum idle period after which weight-raising may be
-	 * reactivated for a queue (in jiffies).
-	 */
-	unsigned int bfq_wr_min_idle_time;
-	/*
-	 * Minimum period between request arrivals after which
-	 * weight-raising may be reactivated for an already busy async
-	 * queue (in jiffies).
-	 */
-	unsigned long bfq_wr_min_inter_arr_async;
-
-	/* Max service-rate for a soft real-time queue, in sectors/sec */
-	unsigned int bfq_wr_max_softrt_rate;
-	/*
-	 * Cached value of the product R*T, used for computing the
-	 * maximum duration of weight raising automatically.
-	 */
-	u64 RT_prod;
-	/* device-speed class for the low-latency heuristic */
-	enum bfq_device_speed device_speed;
-
-	/* fallback dummy bfqq for extreme OOM conditions */
-	struct bfq_queue oom_bfqq;
-};
-
-enum bfqq_state_flags {
-	BFQ_BFQQ_FLAG_just_created = 0,	/* queue just allocated */
-	BFQ_BFQQ_FLAG_busy,		/* has requests or is in service */
-	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */
-	BFQ_BFQQ_FLAG_non_blocking_wait_rq, /*
-					     * waiting for a request
-					     * without idling the device
-					     */
-	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
-	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
-	BFQ_BFQQ_FLAG_has_short_ttime,	/* queue has a short think time */
-	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */
-	BFQ_BFQQ_FLAG_IO_bound,		/*
-					 * bfqq has timed-out at least once
-					 * having consumed at most 2/10 of
-					 * its budget
-					 */
-	BFQ_BFQQ_FLAG_in_large_burst,	/*
-					 * bfqq activated in a large burst,
-					 * see comments to bfq_handle_burst.
-					 */
-	BFQ_BFQQ_FLAG_softrt_update,	/*
-					 * may need softrt-next-start
-					 * update
-					 */
-	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */
-	BFQ_BFQQ_FLAG_split_coop	/* shared bfqq will be split */
-};
-
-#define BFQ_BFQQ_FNS(name)						\
-static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
-{									\
-	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\
-}									\
-static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
-{									\
-	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\
-}									\
-static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
-{									\
-	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\
-}
-
-BFQ_BFQQ_FNS(just_created);
-BFQ_BFQQ_FNS(busy);
-BFQ_BFQQ_FNS(wait_request);
-BFQ_BFQQ_FNS(non_blocking_wait_rq);
-BFQ_BFQQ_FNS(must_alloc);
-BFQ_BFQQ_FNS(fifo_expire);
-BFQ_BFQQ_FNS(has_short_ttime);
-BFQ_BFQQ_FNS(sync);
-BFQ_BFQQ_FNS(IO_bound);
-BFQ_BFQQ_FNS(in_large_burst);
-BFQ_BFQQ_FNS(coop);
-BFQ_BFQQ_FNS(split_coop);
-BFQ_BFQQ_FNS(softrt_update);
-#undef BFQ_BFQQ_FNS
-
-/* Logging facilities. */
-#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	assert_spin_locked((bfqd)->queue->queue_lock);			\
-	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
-	pr_crit("bfq%d%c %s " fmt "\n", 			\
-		(bfqq)->pid,						\
-		bfq_bfqq_sync((bfqq)) ? 'S' : 'A',			\
-		__pbuf, ##args);					\
-} while (0)
-
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
-	pr_crit("%s " fmt "\n", __pbuf, ##args);	\
-} while (0)
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)		\
-	pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid,		\
-		bfq_bfqq_sync((bfqq)) ? 'S' : 'A',	\
-		##args)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)
-
-#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log(bfqd, fmt, args...) \
-	pr_crit("bfq " fmt "\n", ##args)
-
-#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	assert_spin_locked((bfqd)->queue->queue_lock);			\
-	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
-	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \
-			  (bfqq)->pid,			  \
-			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',	\
-			  __pbuf, ##args);				\
-} while (0)
-
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
-	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
-} while (0)
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
-	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
-			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
-				##args)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)
-
-#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log(bfqd, fmt, args...) \
-	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
-#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
-
-/* Expiration reasons. */
-enum bfqq_expiration {
-	BFQ_BFQQ_TOO_IDLE = 0,		/*
-					 * queue has been idling for
-					 * too long
-					 */
-	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */
-	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */
-	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */
-	BFQ_BFQQ_PREEMPTED		/* preemption in progress */
-};
-
-
-struct bfqg_stats {
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	/* number of ios merged */
-	struct blkg_rwstat		merged;
-	/* total time spent on device in ns, may not be accurate w/ queueing */
-	struct blkg_rwstat		service_time;
-	/* total time spent waiting in scheduler queue in ns */
-	struct blkg_rwstat		wait_time;
-	/* number of IOs queued up */
-	struct blkg_rwstat		queued;
-	/* total disk time and nr sectors dispatched by this group */
-	struct blkg_stat		time;
-	/* sum of number of ios queued across all samples */
-	struct blkg_stat		avg_queue_size_sum;
-	/* count of samples taken for average */
-	struct blkg_stat		avg_queue_size_samples;
-	/* how many times this group has been removed from service tree */
-	struct blkg_stat		dequeue;
-	/* total time spent waiting for it to be assigned a timeslice. */
-	struct blkg_stat		group_wait_time;
-	/* time spent idling for this blkcg_gq */
-	struct blkg_stat		idle_time;
-	/* total time with empty current active q with other requests queued */
-	struct blkg_stat		empty_time;
-	/* fields after this shouldn't be cleared on stat reset */
-	uint64_t			start_group_wait_time;
-	uint64_t			start_idle_time;
-	uint64_t			start_empty_time;
-	uint16_t			flags;
-#endif
-};
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-/*
- * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
- *
- * @ps: @blkcg_policy_storage that this structure inherits
- * @weight: weight of the bfq_group
- */
-struct bfq_group_data {
-	/* must be the first member */
-	struct blkcg_policy_data pd;
-
-	unsigned int weight;
-};
-
-/**
- * struct bfq_group - per (device, cgroup) data structure.
- * @entity: schedulable entity to insert into the parent group sched_data.
- * @sched_data: own sched_data, to contain child entities (they may be
- *              both bfq_queues and bfq_groups).
- * @bfqd: the bfq_data for the device this group acts upon.
- * @async_bfqq: array of async queues for all the tasks belonging to
- *              the group, one queue per ioprio value per ioprio_class,
- *              except for the idle class that has only one queue.
- * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
- * @my_entity: pointer to @entity, %NULL for the toplevel group; used
- *             to avoid too many special cases during group creation/
- *             migration.
- * @active_entities: number of active entities belonging to the group;
- *                   unused for the root group. Used to know whether there
- *                   are groups with more than one active @bfq_entity
- *                   (see the comments to the function
- *                   bfq_bfqq_may_idle()).
- * @rq_pos_tree: rbtree sorted by next_request position, used when
- *               determining if two or more queues have interleaving
- *               requests (see bfq_find_close_cooperator()).
- *
- * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
- * there is a set of bfq_groups, each one collecting the lower-level
- * entities belonging to the group that are acting on the same device.
- *
- * Locking works as follows:
- *    o @bfqd is protected by the queue lock, RCU is used to access it
- *      from the readers.
- *    o All the other fields are protected by the @bfqd queue lock.
- */
-struct bfq_group {
-	/* must be the first member */
-	struct blkg_policy_data pd;
-
-	struct bfq_entity entity;
-	struct bfq_sched_data sched_data;
-
-	void *bfqd;
-
-	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
-	struct bfq_queue *async_idle_bfqq;
-
-	struct bfq_entity *my_entity;
-
-	int active_entities;
-
-	struct rb_root rq_pos_tree;
-
-	struct bfqg_stats stats;
-};
-
-#else
-struct bfq_group {
-	struct bfq_sched_data sched_data;
-
-	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
-	struct bfq_queue *async_idle_bfqq;
-
-	struct rb_root rq_pos_tree;
-};
-#endif
-
-static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
-
-static unsigned int bfq_class_idx(struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	return bfqq ? bfqq->ioprio_class - 1 :
-		BFQ_DEFAULT_GRP_CLASS - 1;
-}
-
-static struct bfq_service_tree *
-bfq_entity_service_tree(struct bfq_entity *entity)
-{
-	struct bfq_sched_data *sched_data = entity->sched_data;
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	unsigned int idx = bfq_class_idx(entity);
-
-	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
-	BUG_ON(sched_data == NULL);
-
-	if (bfqq)
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			     "entity_service_tree %p %d",
-			     sched_data->service_tree + idx, idx);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else {
-		struct bfq_group *bfqg =
-			container_of(entity, struct bfq_group, entity);
-
-		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
-			     "entity_service_tree %p %d",
-			     sched_data->service_tree + idx, idx);
-	}
-#endif
-	return sched_data->service_tree + idx;
-}
-
-static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
-{
-	return bic->bfqq[is_sync];
-}
-
-static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
-			 bool is_sync)
-{
-	bic->bfqq[is_sync] = bfqq;
-}
-
-static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
-{
-	return bic->icq.q->elevator->elevator_data;
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-
-static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
-	struct bfq_entity *group_entity = bfqq->entity.parent;
-
-	if (!group_entity)
-		group_entity = &bfqq->bfqd->root_group->entity;
-
-	return container_of(group_entity, struct bfq_group, entity);
-}
-
-#else
-
-static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
-	return bfqq->bfqd->root_group;
-}
-
-#endif
-
-static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
-static void bfq_put_queue(struct bfq_queue *bfqq);
-static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
-static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
-				       struct bio *bio, bool is_sync,
-				       struct bfq_io_cq *bic);
-static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
-				    struct bfq_group *bfqg);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
-#endif
-static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-
-#endif /* _BFQ_H */
diff --git a/drivers/Kconfig b/drivers/Kconfig
index c89e0f383be6..981778f02e56 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -216,8 +216,6 @@ source "drivers/gps/Kconfig"
 
 source "drivers/halls/Kconfig"
 
-source "drivers/rekernel/Kconfig"
-
 source "drivers/kernelsu/Kconfig"
 
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index f691364e80c8..8d445b4401be 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -183,5 +183,4 @@ obj-$(CONFIG_SENSORS_SSC)		+= sensors/
 obj-$(CONFIG_TEE)		+= tee/
 obj-$(CONFIG_BCM_GPS_SPI_DRIVER) += gps/
 obj-$(CONFIG_HALLS)		+= halls/
-obj-$(CONFIG_REKERNEL) += rekernel/
 obj-$(CONFIG_KSU)		+= kernelsu/
diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index 491751ab0dbf..bb2a5b581622 100644
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -1,9 +1,8 @@
-# SPDX-License-Identifier: GPL-2.0
 menu "Android"
 
 config ANDROID
 	bool "Android Drivers"
-	help
+	---help---
 	  Enable support for various drivers needed on the Android platform
 
 if ANDROID
@@ -12,7 +11,7 @@ config ANDROID_BINDER_IPC
 	bool "Android Binder IPC Driver"
 	depends on MMU
 	default n
-	help
+	---help---
 	  Binder is used in Android for both communication between processes,
 	  and remote method invocation.
 
@@ -20,23 +19,11 @@ config ANDROID_BINDER_IPC
 	  Android process, using Binder to identify, invoke and pass arguments
 	  between said processes.
 
-config ANDROID_BINDERFS
-	bool "Android Binderfs filesystem"
-	depends on ANDROID_BINDER_IPC
-	default n
-	help
-	  Binderfs is a pseudo-filesystem for the Android Binder IPC driver
-	  which can be mounted per-ipc namespace allowing to run multiple
-	  instances of Android.
-	  Each binderfs mount initially only contains a binder-control device.
-	  It can be used to dynamically allocate new binder IPC devices via
-	  ioctls.
-
 config ANDROID_BINDER_DEVICES
 	string "Android Binder devices"
 	depends on ANDROID_BINDER_IPC
 	default "binder,hwbinder,vndbinder"
-	help
+	---help---
 	  Default value for the binder.devices parameter.
 
 	  The binder.devices parameter is a comma-separated list of strings
@@ -44,71 +31,29 @@ config ANDROID_BINDER_DEVICES
 	  created. Each binder device has its own context manager, and is
 	  therefore logically separated from the other devices.
 
+config ANDROID_BINDER_IPC_32BIT
+	bool "Android Binder IPC 32BIT Driver"
+	depends on !64BIT && ANDROID_BINDER_IPC
+	default n
+	---help---
+	  The Binder API has been changed to support both 32 and 64bit
+	  applications in a mixed environment.
+
+	  Enable this to support an old 32-bit Android user-space (v4.4 and
+	  earlier).
+
+	  Note that enabling this will break newer Android user-space.
+
 config ANDROID_BINDER_IPC_SELFTEST
 	bool "Android Binder IPC Driver Selftest"
 	depends on ANDROID_BINDER_IPC
-	help
+	---help---
 	  This feature allows binder selftest to run.
 
 	  Binder selftest checks the allocation and free of binder buffers
 	  exhaustively with combinations of various buffer sizes and
 	  alignments.
 
-config ANDROID_DEBUG_SYMBOLS
-	bool "Android Debug Symbols"
-	help
-	  Enables export of debug symbols that are useful for offline debugging
-	  of a kernel. These symbols would be used in vendor modules to find
-	  addresses of the core kernel symbols for vendor extensions.
-
-	  This driver is statically compiled into kernel and maintains all the
-	  required symbol addresses for vendor modules and provides necessary
-	  interface vendor modules.
-
-config ANDROID_VENDOR_HOOKS
-	bool "Android Vendor Hooks"
-	depends on TRACEPOINTS
-	help
-	  Enable vendor hooks implemented as tracepoints
-
-	  Allow vendor modules to attach to tracepoint "hooks" defined via
-	  DECLARE_HOOK or DECLARE_RESTRICTED_HOOK.
-
-config ANDROID_KABI_RESERVE
-	bool "Android KABI reserve padding"
-	default y
-	help
-	  This option enables the padding that the Android GKI kernel adds
-	  to many different kernel structures to support an in-kernel stable ABI
-	  over the lifespan of support for the kernel.
-
-	  Only disable this option if you have a system that needs the Android
-	  kernel drivers, but is NOT an Android GKI kernel image. If disabled
-	  it has the possibility to make the kernel static and runtime image
-	  slightly smaller but will NOT be supported by the Google Android
-	  kernel team.
-
-	  If even slightly unsure, say Y.
-
-config ANDROID_VENDOR_OEM_DATA
-	bool "Android vendor and OEM data padding"
-	default y
-	help
-	  This option enables the padding that the Android GKI kernel adds
-	  to many different kernel structures to support an in-kernel stable ABI
-	  over the lifespan of support for the kernel as well as OEM additional
-	  fields that are needed by some of the Android kernel tracepoints. The
-	  macros enabled by this option are used to enable padding in vendor modules
-	  used for the above specified purposes.
-
-	  Only disable this option if you have a system that needs the Android
-	  kernel drivers, but is NOT an Android GKI kernel image and you do NOT
-	  use the Android kernel tracepoints. If disabled it has the possibility
-	  to make the kernel static and runtime image slightly smaller but will
-	  NOT be supported by the Google Android kernel team.
-
-	  If even slightly unsure, say Y.
-
 endif # if ANDROID
 
 endmenu
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index f1ac44102987..a01254c43ee3 100644
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -1,8 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0-only
 ccflags-y += -I$(src)			# needed for trace events
 
-obj-$(CONFIG_ANDROID_BINDERFS)		+= binderfs.o
 obj-$(CONFIG_ANDROID_BINDER_IPC)	+= binder.o binder_alloc.o
 obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
-obj-$(CONFIG_ANDROID_DEBUG_SYMBOLS)	+= android_debug_symbols.o
-obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o
diff --git a/drivers/android/android_debug_symbols.c b/drivers/android/android_debug_symbols.c
deleted file mode 100644
index dd75ddac2085..000000000000
--- a/drivers/android/android_debug_symbols.c
+++ /dev/null
@@ -1,149 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-/*
- * Copyright (c) 2021, The Linux Foundation. All rights reserved.
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/android_debug_symbols.h>
-#include <asm/stacktrace.h>
-#include <asm/sections.h>
-
-#include <linux/cma.h>
-#include "../../mm/slab.h"
-#include <linux/memblock.h>
-#include <linux/page_owner.h>
-#include <linux/swap.h>
-#include <linux/mm.h>
-#include <linux/security.h>
-
-struct ads_entry {
-	char *name;
-	void *addr;
-};
-
-bool ads_page_owner;
-bool ads_slub_debug;
-unsigned long ads_vmalloc_nr_pages;
-unsigned long ads_pcpu_nr_pages;
-
-#define _ADS_ENTRY(index, symbol)			\
-	[index] = { .name = #symbol, .addr = (void *)symbol }
-#define ADS_ENTRY(index, symbol) _ADS_ENTRY(index, symbol)
-
-#define _ADS_PER_CPU_ENTRY(index, symbol)			\
-	[index] = { .name = #symbol, .addr = (void *)&symbol }
-#define ADS_PER_CPU_ENTRY(index, symbol) _ADS_PER_CPU_ENTRY(index, symbol)
-
-/*
- * This module maintains static array of symbol and address information.
- * Add all required core kernel symbols and their addresses into ads_entries[] array,
- * so that vendor modules can query and to find address of non-exported symbol.
- */
-static const struct ads_entry ads_entries[ADS_END] = {
-	ADS_ENTRY(ADS_SDATA, _sdata),
-	ADS_ENTRY(ADS_BSS_END, __bss_stop),
-	ADS_ENTRY(ADS_PER_CPU_START, __per_cpu_start),
-	ADS_ENTRY(ADS_PER_CPU_END, __per_cpu_end),
-	ADS_ENTRY(ADS_START_RO_AFTER_INIT, __start_ro_after_init),
-	ADS_ENTRY(ADS_END_RO_AFTER_INIT, __end_ro_after_init),
-	ADS_ENTRY(ADS_LINUX_BANNER, linux_banner),
-#ifdef CONFIG_CMA
-	ADS_ENTRY(ADS_TOTAL_CMA, &totalcma_pages),
-#endif
-	ADS_ENTRY(ADS_SLAB_CACHES, &slab_caches),
-	ADS_ENTRY(ADS_SLAB_MUTEX, &slab_mutex),
-	ADS_ENTRY(ADS_MIN_LOW_PFN, &min_low_pfn),
-	ADS_ENTRY(ADS_MAX_PFN, &max_pfn),
-	ADS_ENTRY(ADS_VMALLOC_NR_PAGES, &ads_vmalloc_nr_pages),
-	ADS_ENTRY(ADS_PCPU_NR_PAGES, &ads_pcpu_nr_pages),
-#ifdef CONFIG_PAGE_OWNER
-	ADS_ENTRY(ADS_PAGE_OWNER_ENABLED, &ads_page_owner),
-#endif
-#ifdef CONFIG_SLUB_DEBUG
-	ADS_ENTRY(ADS_SLUB_DEBUG, &ads_slub_debug),
-#endif
-#ifdef CONFIG_SWAP
-	ADS_ENTRY(ADS_NR_SWAP_PAGES, &nr_swap_pages),
-#endif
-#ifdef CONFIG_MMU
-	ADS_ENTRY(ADS_MMAP_MIN_ADDR, &mmap_min_addr),
-#endif
-	ADS_ENTRY(ADS_STACK_GUARD_GAP, &stack_guard_gap),
-#ifdef CONFIG_SYSCTL
-	ADS_ENTRY(ADS_SYSCTL_LEGACY_VA_LAYOUT, &sysctl_legacy_va_layout),
-#endif
-	ADS_ENTRY(ADS_SHOW_MEM, show_mem),
-#ifdef CONFIG_ARM64
-	ADS_ENTRY(ADS_PUT_TASK_STACK, put_task_stack),
-#endif
-};
-
-/*
- * ads_per_cpu_entries array contains all the per_cpu variable address information.
- */
-static const struct ads_entry ads_per_cpu_entries[ADS_DEBUG_PER_CPU_END] = {
-#ifdef CONFIG_ARM64
-	ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, irq_stack_ptr),
-#endif
-#ifdef CONFIG_X86
-	ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, hardirq_stack_ptr),
-#endif
-};
-
-/*
- * android_debug_symbol - Provide address inforamtion of debug symbol.
- * @symbol: Index of debug symbol array.
- *
- * Return address of core kernel symbol on success and a negative errno will be
- * returned in error cases.
- *
- */
-void *android_debug_symbol(enum android_debug_symbol symbol)
-{
-	if (symbol >= ADS_END)
-		return ERR_PTR(-EINVAL);
-
-	return ads_entries[symbol].addr;
-}
-EXPORT_SYMBOL_NS_GPL(android_debug_symbol, MINIDUMP);
-
-/*
- * android_debug_per_cpu_symbol - Provide address inforamtion of per cpu debug symbol.
- * @symbol: Index of per cpu debug symbol array.
- *
- * Return address of core kernel symbol on success and a negative errno will be
- * returned in error cases.
- *
- */
-void *android_debug_per_cpu_symbol(enum android_debug_per_cpu_symbol symbol)
-{
-	if (symbol >= ADS_DEBUG_PER_CPU_END)
-		return ERR_PTR(-EINVAL);
-
-	return ads_per_cpu_entries[symbol].addr;
-}
-EXPORT_SYMBOL_NS_GPL(android_debug_per_cpu_symbol, MINIDUMP);
-
-static int __init debug_symbol_init(void)
-{
-#ifdef CONFIG_PAGE_OWNER
-	ads_page_owner  = page_owner_ops.need();
-#endif
-#ifdef CONFIG_SLUB_DEBUG
-	ads_slub_debug = __slub_debug_enabled();
-#endif
-	ads_vmalloc_nr_pages = vmalloc_nr_pages();
-	ads_pcpu_nr_pages = pcpu_nr_pages();
-	return 0;
-}
-module_init(debug_symbol_init);
-
-static void __exit debug_symbol_exit(void)
-{ }
-module_exit(debug_symbol_exit);
-
-MODULE_DESCRIPTION("Debug Symbol Driver");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 977cb783ea0b..20356105e4ba 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -1,9 +1,18 @@
-// SPDX-License-Identifier: GPL-2.0-only
 /* binder.c
  *
  * Android IPC Subsystem
  *
  * Copyright (C) 2007-2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
  */
 
 /*
@@ -42,6 +51,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <asm/cacheflush.h>
 #include <linux/fdtable.h>
 #include <linux/file.h>
 #include <linux/freezer.h>
@@ -54,28 +64,13 @@
 #include <linux/poll.h>
 #include <linux/debugfs.h>
 #include <linux/rbtree.h>
-#include <linux/signal.h>
-#include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/seq_file.h>
-#include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/pid_namespace.h>
 #include <linux/security.h>
 #include <linux/spinlock.h>
-#include <linux/ratelimit.h>
-#include <linux/syscalls.h>
-#include <linux/task_work.h>
-#include <linux/sizes.h>
-
-#include <uapi/linux/types.h>
-#ifdef CONFIG_REKERNEL
-#include <../rekernel/rekernel.h>
-#endif /* CONFIG_REKERNEL */
-#include <uapi/linux/android/binder.h>
-
-#include <asm/cacheflush.h>
-
-#include "binder_internal.h"
+#include "binder_alloc.h"
 #include "binder_trace.h"
 
 static HLIST_HEAD(binder_deferred_list);
@@ -92,11 +87,36 @@ static struct dentry *binder_debugfs_dir_entry_root;
 static struct dentry *binder_debugfs_dir_entry_proc;
 static atomic_t binder_last_id;
 
-static int proc_show(struct seq_file *m, void *unused);
-DEFINE_SHOW_ATTRIBUTE(proc);
+#define BINDER_DEBUG_ENTRY(name) \
+static int binder_##name##_open(struct inode *inode, struct file *file) \
+{ \
+	return single_open(file, binder_##name##_show, inode->i_private); \
+} \
+\
+static const struct file_operations binder_##name##_fops = { \
+	.owner = THIS_MODULE, \
+	.open = binder_##name##_open, \
+	.read = seq_read, \
+	.llseek = seq_lseek, \
+	.release = single_release, \
+}
+
+static int binder_proc_show(struct seq_file *m, void *unused);
+BINDER_DEBUG_ENTRY(proc);
+
+/* This is only defined in include/asm-arm/sizes.h */
+#ifndef SZ_1K
+#define SZ_1K                               0x400
+#endif
+
+#ifndef SZ_4M
+#define SZ_4M                               0x400000
+#endif
 
 #define FORBIDDEN_MMAP_FLAGS                (VM_WRITE)
 
+#define BINDER_SMALL_BUF_SIZE (PAGE_SIZE * 64)
+
 enum {
 	BINDER_DEBUG_USER_ERROR             = 1U << 0,
 	BINDER_DEBUG_FAILED_TRANSACTION     = 1U << 1,
@@ -118,8 +138,8 @@ static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR |
 	BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION;
 module_param_named(debug_mask, binder_debug_mask, uint, 0644);
 
-char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
-module_param_named(devices, binder_devices_param, charp, 0444);
+static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
+module_param_named(devices, binder_devices_param, charp, S_IRUGO);
 
 static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait);
 static int binder_stop_on_user_error;
@@ -140,13 +160,13 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error,
 #define binder_debug(mask, x...) \
 	do { \
 		if (binder_debug_mask & mask) \
-			pr_info_ratelimited(x); \
+			pr_info(x); \
 	} while (0)
 
 #define binder_user_error(x...) \
 	do { \
 		if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) \
-			pr_info_ratelimited(x); \
+			pr_info(x); \
 		if (binder_stop_on_user_error) \
 			binder_stop_on_user_error = 2; \
 	} while (0)
@@ -162,6 +182,24 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error,
 #define to_binder_fd_array_object(hdr) \
 	container_of(hdr, struct binder_fd_array_object, hdr)
 
+enum binder_stat_types {
+	BINDER_STAT_PROC,
+	BINDER_STAT_THREAD,
+	BINDER_STAT_NODE,
+	BINDER_STAT_REF,
+	BINDER_STAT_DEATH,
+	BINDER_STAT_TRANSACTION,
+	BINDER_STAT_TRANSACTION_COMPLETE,
+	BINDER_STAT_COUNT
+};
+
+struct binder_stats {
+	atomic_t br[_IOC_NR(BR_FAILED_REPLY) + 1];
+	atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1];
+	atomic_t obj_created[BINDER_STAT_COUNT];
+	atomic_t obj_deleted[BINDER_STAT_COUNT];
+};
+
 static struct binder_stats binder_stats;
 
 static inline void binder_stats_deleted(enum binder_stat_types type)
@@ -189,26 +227,16 @@ struct binder_transaction_log_entry {
 	int return_error_line;
 	uint32_t return_error;
 	uint32_t return_error_param;
-	char context_name[BINDERFS_MAX_NAME + 1];
+	const char *context_name;
 };
-
 struct binder_transaction_log {
 	atomic_t cur;
 	bool full;
 	struct binder_transaction_log_entry entry[32];
 };
-
 static struct binder_transaction_log binder_transaction_log;
 static struct binder_transaction_log binder_transaction_log_failed;
 
-static struct kmem_cache *binder_node_pool;
-static struct kmem_cache *binder_proc_pool;
-static struct kmem_cache *binder_ref_death_pool;
-static struct kmem_cache *binder_ref_pool;
-static struct kmem_cache *binder_thread_pool;
-static struct kmem_cache *binder_transaction_pool;
-static struct kmem_cache *binder_work_pool;
-
 static struct binder_transaction_log_entry *binder_transaction_log_add(
 	struct binder_transaction_log *log)
 {
@@ -229,9 +257,320 @@ static struct binder_transaction_log_entry *binder_transaction_log_add(
 	return e;
 }
 
+struct binder_context {
+	struct binder_node *binder_context_mgr_node;
+	struct mutex context_mgr_node_lock;
+
+	kuid_t binder_context_mgr_uid;
+	const char *name;
+};
+
+struct binder_device {
+	struct hlist_node hlist;
+	struct miscdevice miscdev;
+	struct binder_context context;
+};
+
+/**
+ * struct binder_work - work enqueued on a worklist
+ * @entry:             node enqueued on list
+ * @type:              type of work to be performed
+ *
+ * There are separate work lists for proc, thread, and node (async).
+ */
+struct binder_work {
+	struct list_head entry;
+
+	enum binder_work_type {
+		BINDER_WORK_TRANSACTION = 1,
+		BINDER_WORK_TRANSACTION_COMPLETE,
+		BINDER_WORK_RETURN_ERROR,
+		BINDER_WORK_NODE,
+		BINDER_WORK_DEAD_BINDER,
+		BINDER_WORK_DEAD_BINDER_AND_CLEAR,
+		BINDER_WORK_CLEAR_DEATH_NOTIFICATION,
+	} type;
+};
+
+struct binder_error {
+	struct binder_work work;
+	uint32_t cmd;
+};
+
+/**
+ * struct binder_node - binder node bookkeeping
+ * @debug_id:             unique ID for debugging
+ *                        (invariant after initialized)
+ * @lock:                 lock for node fields
+ * @work:                 worklist element for node work
+ *                        (protected by @proc->inner_lock)
+ * @rb_node:              element for proc->nodes tree
+ *                        (protected by @proc->inner_lock)
+ * @dead_node:            element for binder_dead_nodes list
+ *                        (protected by binder_dead_nodes_lock)
+ * @proc:                 binder_proc that owns this node
+ *                        (invariant after initialized)
+ * @refs:                 list of references on this node
+ *                        (protected by @lock)
+ * @internal_strong_refs: used to take strong references when
+ *                        initiating a transaction
+ *                        (protected by @proc->inner_lock if @proc
+ *                        and by @lock)
+ * @local_weak_refs:      weak user refs from local process
+ *                        (protected by @proc->inner_lock if @proc
+ *                        and by @lock)
+ * @local_strong_refs:    strong user refs from local process
+ *                        (protected by @proc->inner_lock if @proc
+ *                        and by @lock)
+ * @tmp_refs:             temporary kernel refs
+ *                        (protected by @proc->inner_lock while @proc
+ *                        is valid, and by binder_dead_nodes_lock
+ *                        if @proc is NULL. During inc/dec and node release
+ *                        it is also protected by @lock to provide safety
+ *                        as the node dies and @proc becomes NULL)
+ * @ptr:                  userspace pointer for node
+ *                        (invariant, no lock needed)
+ * @cookie:               userspace cookie for node
+ *                        (invariant, no lock needed)
+ * @has_strong_ref:       userspace notified of strong ref
+ *                        (protected by @proc->inner_lock if @proc
+ *                        and by @lock)
+ * @pending_strong_ref:   userspace has acked notification of strong ref
+ *                        (protected by @proc->inner_lock if @proc
+ *                        and by @lock)
+ * @has_weak_ref:         userspace notified of weak ref
+ *                        (protected by @proc->inner_lock if @proc
+ *                        and by @lock)
+ * @pending_weak_ref:     userspace has acked notification of weak ref
+ *                        (protected by @proc->inner_lock if @proc
+ *                        and by @lock)
+ * @has_async_transaction: async transaction to node in progress
+ *                        (protected by @lock)
+ * @sched_policy:         minimum scheduling policy for node
+ *                        (invariant after initialized)
+ * @accept_fds:           file descriptor operations supported for node
+ *                        (invariant after initialized)
+ * @min_priority:         minimum scheduling priority
+ *                        (invariant after initialized)
+ * @inherit_rt:           inherit RT scheduling policy from caller
+ * @txn_security_ctx:     require sender's security context
+ *                        (invariant after initialized)
+ * @async_todo:           list of async work items
+ *                        (protected by @proc->inner_lock)
+ *
+ * Bookkeeping structure for binder nodes.
+ */
+struct binder_node {
+	int debug_id;
+	spinlock_t lock;
+	struct binder_work work;
+	union {
+		struct rb_node rb_node;
+		struct hlist_node dead_node;
+	};
+	struct binder_proc *proc;
+	struct hlist_head refs;
+	int internal_strong_refs;
+	int local_weak_refs;
+	int local_strong_refs;
+	int tmp_refs;
+	binder_uintptr_t ptr;
+	binder_uintptr_t cookie;
+	struct {
+		/*
+		 * bitfield elements protected by
+		 * proc inner_lock
+		 */
+		u8 has_strong_ref:1;
+		u8 pending_strong_ref:1;
+		u8 has_weak_ref:1;
+		u8 pending_weak_ref:1;
+	};
+	struct {
+		/*
+		 * invariant after initialization
+		 */
+		u8 sched_policy:2;
+		u8 inherit_rt:1;
+		u8 accept_fds:1;
+		u8 txn_security_ctx:1;
+		u8 min_priority;
+	};
+	bool has_async_transaction;
+	struct list_head async_todo;
+};
+
+struct binder_ref_death {
+	/**
+	 * @work: worklist element for death notifications
+	 *        (protected by inner_lock of the proc that
+	 *        this ref belongs to)
+	 */
+	struct binder_work work;
+	binder_uintptr_t cookie;
+};
+
+/**
+ * struct binder_ref_data - binder_ref counts and id
+ * @debug_id:        unique ID for the ref
+ * @desc:            unique userspace handle for ref
+ * @strong:          strong ref count (debugging only if not locked)
+ * @weak:            weak ref count (debugging only if not locked)
+ *
+ * Structure to hold ref count and ref id information. Since
+ * the actual ref can only be accessed with a lock, this structure
+ * is used to return information about the ref to callers of
+ * ref inc/dec functions.
+ */
+struct binder_ref_data {
+	int debug_id;
+	uint32_t desc;
+	int strong;
+	int weak;
+};
+
+/**
+ * struct binder_ref - struct to track references on nodes
+ * @data:        binder_ref_data containing id, handle, and current refcounts
+ * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree
+ * @rb_node_node: node for lookup by @node in proc's rb_tree
+ * @node_entry:  list entry for node->refs list in target node
+ *               (protected by @node->lock)
+ * @proc:        binder_proc containing ref
+ * @node:        binder_node of target node. When cleaning up a
+ *               ref for deletion in binder_cleanup_ref, a non-NULL
+ *               @node indicates the node must be freed
+ * @death:       pointer to death notification (ref_death) if requested
+ *               (protected by @node->lock)
+ *
+ * Structure to track references from procA to target node (on procB). This
+ * structure is unsafe to access without holding @proc->outer_lock.
+ */
+struct binder_ref {
+	/* Lookups needed: */
+	/*   node + proc => ref (transaction) */
+	/*   desc + proc => ref (transaction, inc/dec ref) */
+	/*   node => refs + procs (proc exit) */
+	struct binder_ref_data data;
+	struct rb_node rb_node_desc;
+	struct rb_node rb_node_node;
+	struct hlist_node node_entry;
+	struct binder_proc *proc;
+	struct binder_node *node;
+	struct binder_ref_death *death;
+};
+
 enum binder_deferred_state {
-	BINDER_DEFERRED_FLUSH        = 0x01,
-	BINDER_DEFERRED_RELEASE      = 0x02,
+	BINDER_DEFERRED_PUT_FILES    = 0x01,
+	BINDER_DEFERRED_FLUSH        = 0x02,
+	BINDER_DEFERRED_RELEASE      = 0x04,
+};
+
+/**
+ * struct binder_priority - scheduler policy and priority
+ * @sched_policy            scheduler policy
+ * @prio                    [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT
+ *
+ * The binder driver supports inheriting the following scheduler policies:
+ * SCHED_NORMAL
+ * SCHED_BATCH
+ * SCHED_FIFO
+ * SCHED_RR
+ */
+struct binder_priority {
+	unsigned int sched_policy;
+	int prio;
+};
+
+/**
+ * struct binder_proc - binder process bookkeeping
+ * @proc_node:            element for binder_procs list
+ * @threads:              rbtree of binder_threads in this proc
+ *                        (protected by @inner_lock)
+ * @nodes:                rbtree of binder nodes associated with
+ *                        this proc ordered by node->ptr
+ *                        (protected by @inner_lock)
+ * @refs_by_desc:         rbtree of refs ordered by ref->desc
+ *                        (protected by @outer_lock)
+ * @refs_by_node:         rbtree of refs ordered by ref->node
+ *                        (protected by @outer_lock)
+ * @waiting_threads:      threads currently waiting for proc work
+ *                        (protected by @inner_lock)
+ * @pid:                  PID of group_leader of process
+ *                        (invariant after initialized)
+ * @tsk:                  task_struct for group_leader of process
+ *                        (invariant after initialized)
+ * @files:                files_struct for process
+ *                        (protected by @files_lock)
+ * @files_lock:           mutex to protect @files
+ * @cred:                 struct cred associated with the `struct file`
+ *                        in binder_open()
+ *                        (invariant after initialized)
+ * @deferred_work_node:   element for binder_deferred_list
+ *                        (protected by binder_deferred_lock)
+ * @deferred_work:        bitmap of deferred work to perform
+ *                        (protected by binder_deferred_lock)
+ * @is_dead:              process is dead and awaiting free
+ *                        when outstanding transactions are cleaned up
+ *                        (protected by @inner_lock)
+ * @todo:                 list of work for this process
+ *                        (protected by @inner_lock)
+ * @stats:                per-process binder statistics
+ *                        (atomics, no lock needed)
+ * @delivered_death:      list of delivered death notification
+ *                        (protected by @inner_lock)
+ * @max_threads:          cap on number of binder threads
+ *                        (protected by @inner_lock)
+ * @requested_threads:    number of binder threads requested but not
+ *                        yet started. In current implementation, can
+ *                        only be 0 or 1.
+ *                        (protected by @inner_lock)
+ * @requested_threads_started: number binder threads started
+ *                        (protected by @inner_lock)
+ * @tmp_ref:              temporary reference to indicate proc is in use
+ *                        (atomic since @proc->inner_lock cannot
+ *                        always be acquired)
+ * @default_priority:     default scheduler priority
+ *                        (invariant after initialized)
+ * @debugfs_entry:        debugfs node
+ * @alloc:                binder allocator bookkeeping
+ * @context:              binder_context for this proc
+ *                        (invariant after initialized)
+ * @inner_lock:           can nest under outer_lock and/or node lock
+ * @outer_lock:           no nesting under inner or node lock
+ *                        Lock order: 1) outer, 2) node, 3) inner
+ *
+ * Bookkeeping structure for binder processes
+ */
+struct binder_proc {
+	struct hlist_node proc_node;
+	struct rb_root threads;
+	struct rb_root nodes;
+	struct rb_root refs_by_desc;
+	struct rb_root refs_by_node;
+	struct list_head waiting_threads;
+	int pid;
+	struct task_struct *tsk;
+	struct files_struct *files;
+	struct mutex files_lock;
+	const struct cred *cred;
+	struct hlist_node deferred_work_node;
+	int deferred_work;
+	bool is_dead;
+
+	struct list_head todo;
+	struct binder_stats stats;
+	struct list_head delivered_death;
+	int max_threads;
+	int requested_threads;
+	int requested_threads_started;
+	atomic_t tmp_ref;
+	struct binder_priority default_priority;
+	struct dentry *debugfs_entry;
+	struct binder_alloc alloc;
+	struct binder_context *context;
+	spinlock_t inner_lock;
+	spinlock_t outer_lock;
 };
 
 enum {
@@ -243,6 +582,110 @@ enum {
 	BINDER_LOOPER_STATE_POLL        = 0x20,
 };
 
+/**
+ * struct binder_thread - binder thread bookkeeping
+ * @proc:                 binder process for this thread
+ *                        (invariant after initialization)
+ * @rb_node:              element for proc->threads rbtree
+ *                        (protected by @proc->inner_lock)
+ * @waiting_thread_node:  element for @proc->waiting_threads list
+ *                        (protected by @proc->inner_lock)
+ * @pid:                  PID for this thread
+ *                        (invariant after initialization)
+ * @looper:               bitmap of looping state
+ *                        (only accessed by this thread)
+ * @looper_need_return:   looping thread needs to exit driver
+ *                        (no lock needed)
+ * @transaction_stack:    stack of in-progress transactions for this thread
+ *                        (protected by @proc->inner_lock)
+ * @todo:                 list of work to do for this thread
+ *                        (protected by @proc->inner_lock)
+ * @process_todo:         whether work in @todo should be processed
+ *                        (protected by @proc->inner_lock)
+ * @return_error:         transaction errors reported by this thread
+ *                        (only accessed by this thread)
+ * @reply_error:          transaction errors reported by target thread
+ *                        (protected by @proc->inner_lock)
+ * @wait:                 wait queue for thread work
+ * @stats:                per-thread statistics
+ *                        (atomics, no lock needed)
+ * @tmp_ref:              temporary reference to indicate thread is in use
+ *                        (atomic since @proc->inner_lock cannot
+ *                        always be acquired)
+ * @is_dead:              thread is dead and awaiting free
+ *                        when outstanding transactions are cleaned up
+ *                        (protected by @proc->inner_lock)
+ * @task:                 struct task_struct for this thread
+ *
+ * Bookkeeping structure for binder threads.
+ */
+struct binder_thread {
+	struct binder_proc *proc;
+	struct rb_node rb_node;
+	struct list_head waiting_thread_node;
+	int pid;
+	int looper;              /* only modified by this thread */
+	bool looper_need_return; /* can be written by other thread */
+	struct binder_transaction *transaction_stack;
+	struct list_head todo;
+	bool process_todo;
+	struct binder_error return_error;
+	struct binder_error reply_error;
+	wait_queue_head_t wait;
+	struct binder_stats stats;
+	atomic_t tmp_ref;
+	bool is_dead;
+	struct task_struct *task;
+};
+
+struct binder_transaction {
+	int debug_id;
+	struct binder_work work;
+	struct binder_thread *from;
+	struct binder_transaction *from_parent;
+	struct binder_proc *to_proc;
+	struct binder_thread *to_thread;
+	struct binder_transaction *to_parent;
+	unsigned need_reply:1;
+	/* unsigned is_dead:1; */	/* not used at the moment */
+
+	struct binder_buffer *buffer;
+	unsigned int	code;
+	unsigned int	flags;
+	struct binder_priority	priority;
+	struct binder_priority	saved_priority;
+	bool    set_priority_called;
+	kuid_t	sender_euid;
+	binder_uintptr_t security_ctx;
+	/**
+	 * @lock:  protects @from, @to_proc, and @to_thread
+	 *
+	 * @from, @to_proc, and @to_thread can be set to NULL
+	 * during thread teardown
+	 */
+	spinlock_t lock;
+};
+
+/**
+ * struct binder_object - union of flat binder object types
+ * @hdr:   generic object header
+ * @fbo:   binder object (nodes and refs)
+ * @fdo:   file descriptor object
+ * @bbo:   binder buffer pointer
+ * @fdao:  file descriptor array
+ *
+ * Used for type-independent object copies
+ */
+struct binder_object {
+	union {
+		struct binder_object_header hdr;
+		struct flat_binder_object fbo;
+		struct binder_fd_object fdo;
+		struct binder_buffer_object bbo;
+		struct binder_fd_array_object fdao;
+	};
+};
+
 /**
  * binder_proc_lock() - Acquire outer lock for given binder_proc
  * @proc:         struct binder_proc to acquire
@@ -253,7 +696,6 @@ enum {
 #define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__)
 static void
 _binder_proc_lock(struct binder_proc *proc, int line)
-	__acquires(&proc->outer_lock)
 {
 	binder_debug(BINDER_DEBUG_SPINLOCKS,
 		     "%s: line=%d\n", __func__, line);
@@ -269,7 +711,6 @@ _binder_proc_lock(struct binder_proc *proc, int line)
 #define binder_proc_unlock(_proc) _binder_proc_unlock(_proc, __LINE__)
 static void
 _binder_proc_unlock(struct binder_proc *proc, int line)
-	__releases(&proc->outer_lock)
 {
 	binder_debug(BINDER_DEBUG_SPINLOCKS,
 		     "%s: line=%d\n", __func__, line);
@@ -285,7 +726,6 @@ _binder_proc_unlock(struct binder_proc *proc, int line)
 #define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__)
 static void
 _binder_inner_proc_lock(struct binder_proc *proc, int line)
-	__acquires(&proc->inner_lock)
 {
 	binder_debug(BINDER_DEBUG_SPINLOCKS,
 		     "%s: line=%d\n", __func__, line);
@@ -301,7 +741,6 @@ _binder_inner_proc_lock(struct binder_proc *proc, int line)
 #define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__)
 static void
 _binder_inner_proc_unlock(struct binder_proc *proc, int line)
-	__releases(&proc->inner_lock)
 {
 	binder_debug(BINDER_DEBUG_SPINLOCKS,
 		     "%s: line=%d\n", __func__, line);
@@ -317,7 +756,6 @@ _binder_inner_proc_unlock(struct binder_proc *proc, int line)
 #define binder_node_lock(node) _binder_node_lock(node, __LINE__)
 static void
 _binder_node_lock(struct binder_node *node, int line)
-	__acquires(&node->lock)
 {
 	binder_debug(BINDER_DEBUG_SPINLOCKS,
 		     "%s: line=%d\n", __func__, line);
@@ -333,7 +771,6 @@ _binder_node_lock(struct binder_node *node, int line)
 #define binder_node_unlock(node) _binder_node_unlock(node, __LINE__)
 static void
 _binder_node_unlock(struct binder_node *node, int line)
-	__releases(&node->lock)
 {
 	binder_debug(BINDER_DEBUG_SPINLOCKS,
 		     "%s: line=%d\n", __func__, line);
@@ -350,16 +787,12 @@ _binder_node_unlock(struct binder_node *node, int line)
 #define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__)
 static void
 _binder_node_inner_lock(struct binder_node *node, int line)
-	__acquires(&node->lock) __acquires(&node->proc->inner_lock)
 {
 	binder_debug(BINDER_DEBUG_SPINLOCKS,
 		     "%s: line=%d\n", __func__, line);
 	spin_lock(&node->lock);
 	if (node->proc)
 		binder_inner_proc_lock(node->proc);
-	else
-		/* annotation for sparse */
-		__acquire(&node->proc->inner_lock);
 }
 
 /**
@@ -371,7 +804,6 @@ _binder_node_inner_lock(struct binder_node *node, int line)
 #define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__)
 static void
 _binder_node_inner_unlock(struct binder_node *node, int line)
-	__releases(&node->lock) __releases(&node->proc->inner_lock)
 {
 	struct binder_proc *proc = node->proc;
 
@@ -379,9 +811,6 @@ _binder_node_inner_unlock(struct binder_node *node, int line)
 		     "%s: line=%d\n", __func__, line);
 	if (proc)
 		binder_inner_proc_unlock(proc);
-	else
-		/* annotation for sparse */
-		__release(&node->proc->inner_lock);
 	spin_unlock(&node->lock);
 }
 
@@ -442,7 +871,6 @@ static void
 binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread,
 					    struct binder_work *work)
 {
-	WARN_ON(!list_empty(&thread->waiting_thread_node));
 	binder_enqueue_work_ilocked(work, &thread->todo);
 }
 
@@ -460,7 +888,6 @@ static void
 binder_enqueue_thread_work_ilocked(struct binder_thread *thread,
 				   struct binder_work *work)
 {
-	WARN_ON(!list_empty(&thread->waiting_thread_node));
 	binder_enqueue_work_ilocked(work, &thread->todo);
 	thread->process_todo = true;
 }
@@ -521,13 +948,69 @@ static void binder_free_thread(struct binder_thread *thread);
 static void binder_free_proc(struct binder_proc *proc);
 static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
 
+static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
+{
+	unsigned long rlim_cur;
+	unsigned long irqs;
+	int ret;
+
+	mutex_lock(&proc->files_lock);
+	if (proc->files == NULL) {
+		ret = -ESRCH;
+		goto err;
+	}
+	if (!lock_task_sighand(proc->tsk, &irqs)) {
+		ret = -EMFILE;
+		goto err;
+	}
+	rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
+	unlock_task_sighand(proc->tsk, &irqs);
+
+	ret = __alloc_fd(proc->files, 0, rlim_cur, flags);
+err:
+	mutex_unlock(&proc->files_lock);
+	return ret;
+}
+
+/*
+ * copied from fd_install; does nothing if proc->files is NULL
+ */
+static void task_fd_install(
+	struct binder_proc *proc, unsigned int fd, struct file *file)
+{
+	mutex_lock(&proc->files_lock);
+	if (proc->files)
+		__fd_install(proc->files, fd, file);
+	mutex_unlock(&proc->files_lock);
+}
+
+/*
+ * copied from sys_close; returns -ESRCH if proc->files is NULL
+ */
+static long task_close_fd(struct binder_proc *proc, unsigned int fd)
+{
+	int retval;
+
+	mutex_lock(&proc->files_lock);
+	if (proc->files == NULL) {
+		retval = -ESRCH;
+		goto err;
+	}
+	retval = __close_fd(proc->files, fd);
+	/* can't restart close syscall because file table entry was cleared */
+	if (unlikely(retval == -ERESTARTSYS ||
+		     retval == -ERESTARTNOINTR ||
+		     retval == -ERESTARTNOHAND ||
+		     retval == -ERESTART_RESTARTBLOCK))
+		retval = -EINTR;
+err:
+	mutex_unlock(&proc->files_lock);
+	return retval;
+}
+
 static bool binder_has_work_ilocked(struct binder_thread *thread,
 				    bool do_proc_work)
 {
-	int ret = 0;
-
-	if (ret)
-		return true;
 	return thread->process_todo ||
 		thread->looper_need_return ||
 		(do_proc_work &&
@@ -672,7 +1155,7 @@ static int to_userspace_prio(int policy, int kernel_priority)
 	if (is_fair_policy(policy))
 		return PRIO_TO_NICE(kernel_priority);
 	else
-		return MAX_RT_PRIO - 1 - kernel_priority;
+		return MAX_USER_RT_PRIO - 1 - kernel_priority;
 }
 
 static int to_kernel_prio(int policy, int user_priority)
@@ -680,29 +1163,23 @@ static int to_kernel_prio(int policy, int user_priority)
 	if (is_fair_policy(policy))
 		return NICE_TO_PRIO(user_priority);
 	else
-		return MAX_RT_PRIO - 1 - user_priority;
+		return MAX_USER_RT_PRIO - 1 - user_priority;
 }
 
-static void binder_do_set_priority(struct binder_thread *thread,
-				   const struct binder_priority *desired,
+static void binder_do_set_priority(struct task_struct *task,
+				   struct binder_priority desired,
 				   bool verify)
 {
-	struct task_struct *task = thread->task;
 	int priority; /* user-space prio value */
 	bool has_cap_nice;
-	unsigned int policy = desired->sched_policy;
+	unsigned int policy = desired.sched_policy;
 
-	if (task->policy == policy && task->normal_prio == desired->prio) {
-		spin_lock(&thread->prio_lock);
-		if (thread->prio_state == BINDER_PRIO_PENDING)
-			thread->prio_state = BINDER_PRIO_SET;
-		spin_unlock(&thread->prio_lock);
+	if (task->policy == policy && task->normal_prio == desired.prio)
 		return;
-	}
 
 	has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE);
 
-	priority = to_userspace_prio(policy, desired->prio);
+	priority = to_userspace_prio(policy, desired.prio);
 
 	if (verify && is_rt_policy(policy) && !has_cap_nice) {
 		long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO);
@@ -727,30 +1204,16 @@ static void binder_do_set_priority(struct binder_thread *thread,
 		}
 	}
 
-	if (policy != desired->sched_policy ||
-	    to_kernel_prio(policy, priority) != desired->prio)
+	if (policy != desired.sched_policy ||
+	    to_kernel_prio(policy, priority) != desired.prio)
 		binder_debug(BINDER_DEBUG_PRIORITY_CAP,
 			     "%d: priority %d not allowed, using %d instead\n",
-			      task->pid, desired->prio,
+			      task->pid, desired.prio,
 			      to_kernel_prio(policy, priority));
 
 	trace_binder_set_priority(task->tgid, task->pid, task->normal_prio,
 				  to_kernel_prio(policy, priority),
-				  desired->prio);
-
-	spin_lock(&thread->prio_lock);
-	if (!verify && thread->prio_state == BINDER_PRIO_ABORT) {
-		/*
-		 * A new priority has been set by an incoming nested
-		 * transaction. Abort this priority restore and allow
-		 * the transaction to run at the new desired priority.
-		 */
-		spin_unlock(&thread->prio_lock);
-		binder_debug(BINDER_DEBUG_PRIORITY_CAP,
-			"%d: %s: aborting priority restore\n",
-			thread->pid, __func__);
-		return;
-	}
+				  desired.prio);
 
 	/* Set the actual priority */
 	if (task->policy != policy || is_rt_policy(policy)) {
@@ -764,46 +1227,37 @@ static void binder_do_set_priority(struct binder_thread *thread,
 	}
 	if (is_fair_policy(policy))
 		set_user_nice(task, priority);
-
-	thread->prio_state = BINDER_PRIO_SET;
-	spin_unlock(&thread->prio_lock);
 }
 
-static void binder_set_priority(struct binder_thread *thread,
-				const struct binder_priority *desired)
+static void binder_set_priority(struct task_struct *task,
+				struct binder_priority desired)
 {
-	binder_do_set_priority(thread, desired, /* verify = */ true);
+	binder_do_set_priority(task, desired, /* verify = */ true);
 }
 
-static void binder_restore_priority(struct binder_thread *thread,
-				    const struct binder_priority *desired)
+static void binder_restore_priority(struct task_struct *task,
+				    struct binder_priority desired)
 {
-	binder_do_set_priority(thread, desired, /* verify = */ false);
+	binder_do_set_priority(task, desired, /* verify = */ false);
 }
 
-static void binder_transaction_priority(struct binder_thread *thread,
+static void binder_transaction_priority(struct task_struct *task,
 					struct binder_transaction *t,
-					struct binder_node *node)
+					struct binder_priority node_prio,
+					bool inherit_rt)
 {
-	struct task_struct *task = thread->task;
-	struct binder_priority desired = t->priority;
-	const struct binder_priority node_prio = {
-		.sched_policy = node->sched_policy,
-		.prio = node->min_priority,
-	};
-	bool skip = false;
+	struct binder_priority desired_prio = t->priority;
 
 	if (t->set_priority_called)
 		return;
 
 	t->set_priority_called = true;
+	t->saved_priority.sched_policy = task->policy;
+	t->saved_priority.prio = task->normal_prio;
 
-	if (skip)
-		return;
-
-	if (!node->inherit_rt && is_rt_policy(desired.sched_policy)) {
-		desired.prio = NICE_TO_PRIO(0);
-		desired.sched_policy = SCHED_NORMAL;
+	if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) {
+		desired_prio.prio = NICE_TO_PRIO(0);
+		desired_prio.sched_policy = SCHED_NORMAL;
 	}
 
 	if (node_prio.prio < t->priority.prio ||
@@ -816,29 +1270,10 @@ static void binder_transaction_priority(struct binder_thread *thread,
 		 * SCHED_FIFO, prefer SCHED_FIFO, since it can
 		 * run unbounded, unlike SCHED_RR.
 		 */
-		desired = node_prio;
-	}
-
-	spin_lock(&thread->prio_lock);
-	if (thread->prio_state == BINDER_PRIO_PENDING) {
-		/*
-		 * Task is in the process of changing priorities
-		 * saving its current values would be incorrect.
-		 * Instead, save the pending priority and signal
-		 * the task to abort the priority restore.
-		 */
-		t->saved_priority = thread->prio_next;
-		thread->prio_state = BINDER_PRIO_ABORT;
-		binder_debug(BINDER_DEBUG_PRIORITY_CAP,
-			"%d: saved pending priority %d\n",
-			current->pid, thread->prio_next.prio);
-	} else {
-		t->saved_priority.sched_policy = task->policy;
-		t->saved_priority.prio = task->normal_prio;
+		desired_prio = node_prio;
 	}
-	spin_unlock(&thread->prio_lock);
 
-	binder_set_priority(thread, &desired);
+	binder_set_priority(task, desired_prio);
 }
 
 static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc,
@@ -945,9 +1380,9 @@ static struct binder_node *binder_init_node_ilocked(
 static struct binder_node *binder_new_node(struct binder_proc *proc,
 					   struct flat_binder_object *fp)
 {
-	struct binder_node *node, *new_node;
+	struct binder_node *node;
+	struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL);
 
-	new_node = kmem_cache_zalloc(binder_node_pool, GFP_KERNEL);
 	if (!new_node)
 		return NULL;
 	binder_inner_proc_lock(proc);
@@ -957,14 +1392,14 @@ static struct binder_node *binder_new_node(struct binder_proc *proc,
 		/*
 		 * The node was already added by another thread
 		 */
-		kmem_cache_free(binder_node_pool, new_node);
+		kfree(new_node);
 
 	return node;
 }
 
 static void binder_free_node(struct binder_node *node)
 {
-	kmem_cache_free(binder_node_pool, node);
+	kfree(node);
 	binder_stats_deleted(BINDER_STAT_NODE);
 }
 
@@ -982,7 +1417,8 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong,
 			if (target_list == NULL &&
 			    node->internal_strong_refs == 0 &&
 			    !(node->proc &&
-			      node == node->proc->context->binder_context_mgr_node &&
+			      node == node->proc->context->
+				      binder_context_mgr_node &&
 			      node->has_strong_ref)) {
 				pr_err("invalid inc strong node for %d\n",
 					node->debug_id);
@@ -992,12 +1428,19 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong,
 		} else
 			node->local_strong_refs++;
 		if (!node->has_strong_ref && target_list) {
-			struct binder_thread *thread = container_of(target_list,
-						    struct binder_thread, todo);
 			binder_dequeue_work_ilocked(&node->work);
-			BUG_ON(&thread->todo != target_list);
-			binder_enqueue_deferred_thread_work_ilocked(thread,
-								   &node->work);
+			/*
+			 * Note: this function is the only place where we queue
+			 * directly to a thread->todo without using the
+			 * corresponding binder_enqueue_thread_work() helper
+			 * functions; in this case it's ok to not set the
+			 * process_todo flag, since we know this node work will
+			 * always be followed by other work that starts queue
+			 * processing: in case of synchronous transactions, a
+			 * BR_REPLY or BR_ERROR; in case of oneway
+			 * transactions, a BR_TRANSACTION_COMPLETE.
+			 */
+			binder_enqueue_work_ilocked(&node->work, target_list);
 		}
 	} else {
 		if (!internal)
@@ -1151,14 +1594,10 @@ static void binder_dec_node_tmpref(struct binder_node *node)
 	binder_node_inner_lock(node);
 	if (!node->proc)
 		spin_lock(&binder_dead_nodes_lock);
-	else
-		__acquire(&binder_dead_nodes_lock);
 	node->tmp_refs--;
 	BUG_ON(node->tmp_refs < 0);
 	if (!node->proc)
 		spin_unlock(&binder_dead_nodes_lock);
-	else
-		__release(&binder_dead_nodes_lock);
 	/*
 	 * Call binder_dec_node() to check if all refcounts are 0
 	 * and cleanup is needed. Calling with strong=0 and internal=1
@@ -1448,9 +1887,8 @@ static void binder_free_ref(struct binder_ref *ref)
 {
 	if (ref->node)
 		binder_free_node(ref->node);
-	if (ref->death)
-		kmem_cache_free(binder_ref_death_pool, ref->death);
-	kmem_cache_free(binder_ref_pool, ref);
+	kfree(ref->death);
+	kfree(ref);
 }
 
 /**
@@ -1543,7 +1981,7 @@ static int binder_inc_ref_for_node(struct binder_proc *proc,
 	ref = binder_get_ref_for_node_olocked(proc, node, NULL);
 	if (!ref) {
 		binder_proc_unlock(proc);
-		new_ref = kmem_cache_zalloc(binder_ref_pool, GFP_KERNEL);
+		new_ref = kzalloc(sizeof(*ref), GFP_KERNEL);
 		if (!new_ref)
 			return -ENOMEM;
 		binder_proc_lock(proc);
@@ -1569,7 +2007,7 @@ static int binder_inc_ref_for_node(struct binder_proc *proc,
 		 * Another thread created the ref first so
 		 * free the one we allocated
 		 */
-		kmem_cache_free(binder_ref_pool, new_ref);
+		kfree(new_ref);
 	return ret;
 }
 
@@ -1628,9 +2066,9 @@ static void binder_thread_dec_tmpref(struct binder_thread *thread)
 static void binder_proc_dec_tmpref(struct binder_proc *proc)
 {
 	binder_inner_proc_lock(proc);
-	proc->tmp_ref--;
+	atomic_dec(&proc->tmp_ref);
 	if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) &&
-			!proc->tmp_ref) {
+			!atomic_read(&proc->tmp_ref)) {
 		binder_inner_proc_unlock(proc);
 		binder_free_proc(proc);
 		return;
@@ -1674,89 +2112,45 @@ static struct binder_thread *binder_get_txn_from(
  */
 static struct binder_thread *binder_get_txn_from_and_acq_inner(
 		struct binder_transaction *t)
-	__acquires(&t->from->proc->inner_lock)
 {
 	struct binder_thread *from;
 
 	from = binder_get_txn_from(t);
-	if (!from) {
-		__acquire(&from->proc->inner_lock);
+	if (!from)
 		return NULL;
-	}
 	binder_inner_proc_lock(from->proc);
 	if (t->from) {
 		BUG_ON(from != t->from);
 		return from;
 	}
 	binder_inner_proc_unlock(from->proc);
-	__acquire(&from->proc->inner_lock);
 	binder_thread_dec_tmpref(from);
 	return NULL;
 }
 
-/**
- * binder_free_txn_fixups() - free unprocessed fd fixups
- * @t:	binder transaction for t->from
- *
- * If the transaction is being torn down prior to being
- * processed by the target process, free all of the
- * fd fixups and fput the file structs. It is safe to
- * call this function after the fixups have been
- * processed -- in that case, the list will be empty.
- */
-static void binder_free_txn_fixups(struct binder_transaction *t)
-{
-	struct binder_txn_fd_fixup *fixup, *tmp;
-
-	list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) {
-		fput(fixup->file);
-		list_del(&fixup->fixup_entry);
-		kfree(fixup);
-	}
-}
-
-static void binder_txn_latency_free(struct binder_transaction *t)
-{
-	int from_proc, from_thread, to_proc, to_thread;
-
-	spin_lock(&t->lock);
-	from_proc = t->from ? t->from->proc->pid : 0;
-	from_thread = t->from ? t->from->pid : 0;
-	to_proc = t->to_proc ? t->to_proc->pid : 0;
-	to_thread = t->to_thread ? t->to_thread->pid : 0;
-	spin_unlock(&t->lock);
-
-	trace_binder_txn_latency_free(t, from_proc, from_thread, to_proc, to_thread);
-}
-
 static void binder_free_transaction(struct binder_transaction *t)
 {
-	struct binder_proc *target_proc = t->to_proc;
+	struct binder_proc *target_proc;
 
+	spin_lock(&t->lock);
+	target_proc = t->to_proc;
 	if (target_proc) {
+		atomic_inc(&target_proc->tmp_ref);
+		spin_unlock(&t->lock);
+
 		binder_inner_proc_lock(target_proc);
-		target_proc->outstanding_txns--;
-		if (target_proc->outstanding_txns < 0)
-			pr_warn("%s: Unexpected outstanding_txns %d\n",
-				__func__, target_proc->outstanding_txns);
-		if (!target_proc->outstanding_txns && target_proc->is_frozen)
-			wake_up_interruptible_all(&target_proc->freeze_wait);
 		if (t->buffer)
 			t->buffer->transaction = NULL;
 		binder_inner_proc_unlock(target_proc);
+		binder_proc_dec_tmpref(target_proc);
+	} else {
+		/*
+		 * If the transaction has no target_proc, then
+		 * t->buffer->transaction has already been cleared.
+		 */
+		spin_unlock(&t->lock);
 	}
-	if (trace_binder_txn_latency_free_enabled())
-		binder_txn_latency_free(t);
-	/*
-	 * If the transaction has no target_proc, then
-	 * t->buffer->transaction has already been cleared.
-	 */
-	binder_free_txn_fixups(t);
-	/*
-	 * If the transaction has no target_proc, then
-	 * t->buffer->transaction has already been cleared.
-	 */
-	kmem_cache_free(binder_transaction_pool, t);
+	kfree(t);
 	binder_stats_deleted(BINDER_STAT_TRANSACTION);
 }
 
@@ -1798,7 +2192,6 @@ static void binder_send_failed_reply(struct binder_transaction *t,
 			binder_free_transaction(t);
 			return;
 		}
-		__release(&target_thread->proc->inner_lock);
 		next = t->from_parent;
 
 		binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
@@ -1841,21 +2234,15 @@ static void binder_cleanup_transaction(struct binder_transaction *t,
 /**
  * binder_get_object() - gets object and checks for valid metadata
  * @proc:	binder_proc owning the buffer
- * @u:		sender's user pointer to base of buffer
  * @buffer:	binder_buffer that we're parsing.
  * @offset:	offset in the @buffer at which to validate an object.
  * @object:	struct binder_object to read into
  *
- * Copy the binder object at the given offset into @object. If @u is
- * provided then the copy is from the sender's buffer. If not, then
- * it is copied from the target's @buffer.
- *
- * Return:	If there's a valid metadata object at @offset, the
+ * Return:	If there's a valid metadata object at @offset in @buffer, the
  *		size of that object. Otherwise, it returns zero. The object
  *		is read into the struct binder_object pointed to by @object.
  */
 static size_t binder_get_object(struct binder_proc *proc,
-				const void __user *u,
 				struct binder_buffer *buffer,
 				unsigned long offset,
 				struct binder_object *object)
@@ -1865,16 +2252,11 @@ static size_t binder_get_object(struct binder_proc *proc,
 	size_t object_size = 0;
 
 	read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset);
-	if (offset > buffer->data_size || read_size < sizeof(*hdr))
+	if (offset > buffer->data_size || read_size < sizeof(*hdr) ||
+	    !IS_ALIGNED(offset, sizeof(u32)))
 		return 0;
-	if (u) {
-		if (copy_from_user(object, u + offset, read_size))
-			return 0;
-	} else {
-		if (binder_alloc_copy_from_buffer(&proc->alloc, object, buffer,
-						  offset, read_size))
-			return 0;
-	}
+	binder_alloc_copy_from_buffer(&proc->alloc, object, buffer,
+				      offset, read_size);
 
 	/* Ok, now see if we read a complete object. */
 	hdr = &object->hdr;
@@ -1943,11 +2325,9 @@ static struct binder_buffer_object *binder_validate_ptr(
 		return NULL;
 
 	buffer_offset = start_offset + sizeof(binder_size_t) * index;
-	if (binder_alloc_copy_from_buffer(&proc->alloc, &object_offset,
-					  b, buffer_offset,
-					  sizeof(object_offset)))
-		return NULL;
-	object_size = binder_get_object(proc, NULL, b, object_offset, object);
+	binder_alloc_copy_from_buffer(&proc->alloc, &object_offset,
+				      b, buffer_offset, sizeof(object_offset));
+	object_size = binder_get_object(proc, b, object_offset, object);
 	if (!object_size || object->hdr.type != BINDER_TYPE_PTR)
 		return NULL;
 	if (object_offsetp)
@@ -2012,8 +2392,7 @@ static bool binder_validate_fixup(struct binder_proc *proc,
 		unsigned long buffer_offset;
 		struct binder_object last_object;
 		struct binder_buffer_object *last_bbo;
-		size_t object_size = binder_get_object(proc, NULL, b,
-						       last_obj_offset,
+		size_t object_size = binder_get_object(proc, b, last_obj_offset,
 						       &last_object);
 		if (object_size != sizeof(*last_bbo))
 			return false;
@@ -2027,78 +2406,15 @@ static bool binder_validate_fixup(struct binder_proc *proc,
 			return false;
 		last_min_offset = last_bbo->parent_offset + sizeof(uintptr_t);
 		buffer_offset = objects_start_offset +
-			sizeof(binder_size_t) * last_bbo->parent;
-		if (binder_alloc_copy_from_buffer(&proc->alloc,
-						  &last_obj_offset,
-						  b, buffer_offset,
-						  sizeof(last_obj_offset)))
-			return false;
+			sizeof(binder_size_t) * last_bbo->parent;
+		binder_alloc_copy_from_buffer(&proc->alloc, &last_obj_offset,
+					      b, buffer_offset,
+					      sizeof(last_obj_offset));
 	}
 	return (fixup_offset >= last_min_offset);
 }
 
-/**
- * struct binder_task_work_cb - for deferred close
- *
- * @twork:                callback_head for task work
- * @fd:                   fd to close
- *
- * Structure to pass task work to be handled after
- * returning from binder_ioctl() via task_work_add().
- */
-struct binder_task_work_cb {
-	struct callback_head twork;
-	struct file *file;
-};
-
-/**
- * binder_do_fd_close() - close list of file descriptors
- * @twork:	callback head for task work
- *
- * It is not safe to call ksys_close() during the binder_ioctl()
- * function if there is a chance that binder's own file descriptor
- * might be closed. This is to meet the requirements for using
- * fdget() (see comments for __fget_light()). Therefore use
- * task_work_add() to schedule the close operation once we have
- * returned from binder_ioctl(). This function is a callback
- * for that mechanism and does the actual ksys_close() on the
- * given file descriptor.
- */
-static void binder_do_fd_close(struct callback_head *twork)
-{
-	struct binder_task_work_cb *twcb = container_of(twork,
-			struct binder_task_work_cb, twork);
-
-	fput(twcb->file);
-	kfree(twcb);
-}
-
-/**
- * binder_deferred_fd_close() - schedule a close for the given file-descriptor
- * @fd:		file-descriptor to close
- *
- * See comments in binder_do_fd_close(). This function is used to schedule
- * a file-descriptor to be closed after returning from binder_ioctl().
- */
-static void binder_deferred_fd_close(int fd)
-{
-	struct binder_task_work_cb *twcb;
-
-	twcb = kzalloc(sizeof(*twcb), GFP_KERNEL);
-	if (!twcb)
-		return;
-	init_task_work(&twcb->twork, binder_do_fd_close);
-	close_fd_get_file(fd, &twcb->file);
-	if (twcb->file) {
-		filp_close(twcb->file, current->files);
-		task_work_add(current, &twcb->twork, true);
-	} else {
-		kfree(twcb);
-	}
-}
-
 static void binder_transaction_buffer_release(struct binder_proc *proc,
-					      struct binder_thread *thread,
 					      struct binder_buffer *buffer,
 					      binder_size_t failed_at,
 					      bool is_failure)
@@ -2116,20 +2432,20 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
 		binder_dec_node(buffer->target_node, 1, 0);
 
 	off_start_offset = ALIGN(buffer->data_size, sizeof(void *));
-	off_end_offset = is_failure && failed_at ? failed_at :
+	off_end_offset = is_failure ? failed_at :
 				off_start_offset + buffer->offsets_size;
 	for (buffer_offset = off_start_offset; buffer_offset < off_end_offset;
 	     buffer_offset += sizeof(binder_size_t)) {
 		struct binder_object_header *hdr;
-		size_t object_size = 0;
+		size_t object_size;
 		struct binder_object object;
 		binder_size_t object_offset;
 
-		if (!binder_alloc_copy_from_buffer(&proc->alloc, &object_offset,
-						   buffer, buffer_offset,
-						   sizeof(object_offset)))
-			object_size = binder_get_object(proc, NULL, buffer,
-							object_offset, &object);
+		binder_alloc_copy_from_buffer(&proc->alloc, &object_offset,
+					      buffer, buffer_offset,
+					      sizeof(object_offset));
+		object_size = binder_get_object(proc, buffer,
+						object_offset, &object);
 		if (object_size == 0) {
 			pr_err("transaction release %d bad object at offset %lld, size %zd\n",
 			       debug_id, (u64)object_offset, buffer->data_size);
@@ -2177,15 +2493,12 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
 		} break;
 
 		case BINDER_TYPE_FD: {
-			/*
-			 * No need to close the file here since user-space
-			 * closes it for for successfully delivered
-			 * transactions. For transactions that weren't
-			 * delivered, the new fd was never allocated so
-			 * there is no need to close and the fput on the
-			 * file is done when the transaction is torn
-			 * down.
-			 */
+			struct binder_fd_object *fp = to_binder_fd_object(hdr);
+
+			binder_debug(BINDER_DEBUG_TRANSACTION,
+				     "        fd %d\n", fp->fd);
+			if (failed_at)
+				task_close_fd(proc, fp->fd);
 		} break;
 		case BINDER_TYPE_PTR:
 			/*
@@ -2202,14 +2515,6 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
 			binder_size_t fd_buf_size;
 			binder_size_t num_valid;
 
-			if (is_failure) {
-				/*
-				 * The fd fixups have not been applied so no
-				 * fds need to be closed.
-				 */
-				continue;
-			}
-
 			num_valid = (buffer_offset - off_start_offset) /
 						sizeof(binder_size_t);
 			fda = to_binder_fd_array_object(hdr);
@@ -2219,7 +2524,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
 						     NULL,
 						     num_valid);
 			if (!parent) {
-				pr_err("transaction release %d bad parent offset\n",
+				pr_err("transaction release %d bad parent offset\n",
 				       debug_id);
 				continue;
 			}
@@ -2249,24 +2554,15 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
 			for (fd_index = 0; fd_index < fda->num_fds;
 			     fd_index++) {
 				u32 fd;
-				int err;
 				binder_size_t offset = fda_offset +
 					fd_index * sizeof(fd);
 
-				err = binder_alloc_copy_from_buffer(
-						&proc->alloc, &fd, buffer,
-						offset, sizeof(fd));
-				WARN_ON(err);
-				if (!err) {
-					binder_deferred_fd_close(fd);
-					/*
-					 * Need to make sure the thread goes
-					 * back to userspace to complete the
-					 * deferred close
-					 */
-					if (thread)
-						thread->looper_need_return = true;
-				}
+				binder_alloc_copy_from_buffer(&proc->alloc,
+							      &fd,
+							      buffer,
+							      offset,
+							      sizeof(fd));
+				task_close_fd(proc, fd);
 			}
 		} break;
 		default:
@@ -2362,15 +2658,11 @@ static int binder_translate_handle(struct flat_binder_object *fp,
 		fp->cookie = node->cookie;
 		if (node->proc)
 			binder_inner_proc_lock(node->proc);
-		else
-			__acquire(&node->proc->inner_lock);
 		binder_inc_node_nilocked(node,
 					 fp->hdr.type == BINDER_TYPE_BINDER,
 					 0, NULL);
 		if (node->proc)
 			binder_inner_proc_unlock(node->proc);
-		else
-			__release(&node->proc->inner_lock);
 		trace_binder_transaction_ref_to_node(t, node, &src_rdata);
 		binder_debug(BINDER_DEBUG_TRANSACTION,
 			     "        ref %d desc %d -> node %d u%016llx\n",
@@ -2403,16 +2695,16 @@ static int binder_translate_handle(struct flat_binder_object *fp,
 	return ret;
 }
 
-static int binder_translate_fd(u32 fd, binder_size_t fd_offset,
+static int binder_translate_fd(int fd,
 			       struct binder_transaction *t,
 			       struct binder_thread *thread,
 			       struct binder_transaction *in_reply_to)
 {
 	struct binder_proc *proc = thread->proc;
 	struct binder_proc *target_proc = t->to_proc;
-	struct binder_txn_fd_fixup *fixup;
+	int target_fd;
 	struct file *file;
-	int ret = 0;
+	int ret;
 	bool target_allows_fd;
 
 	if (in_reply_to)
@@ -2441,24 +2733,19 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset,
 		goto err_security;
 	}
 
-	/*
-	 * Add fixup record for this transaction. The allocation
-	 * of the fd in the target needs to be done from a
-	 * target thread.
-	 */
-	fixup = kzalloc(sizeof(*fixup), GFP_KERNEL);
-	if (!fixup) {
+	target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC);
+	if (target_fd < 0) {
 		ret = -ENOMEM;
-		goto err_alloc;
+		goto err_get_unused_fd;
 	}
-	fixup->file = file;
-	fixup->offset = fd_offset;
-	trace_binder_transaction_fd_send(t, fd, fixup->offset);
-	list_add_tail(&fixup->fixup_entry, &t->fd_fixups);
+	task_fd_install(target_proc, target_fd, file);
+	trace_binder_transaction_fd(t, fd, target_fd);
+	binder_debug(BINDER_DEBUG_TRANSACTION, "        fd %d -> %d\n",
+		     fd, target_fd);
 
-	return ret;
+	return target_fd;
 
-err_alloc:
+err_get_unused_fd:
 err_security:
 	fput(file);
 err_fget:
@@ -2466,266 +2753,17 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset,
 	return ret;
 }
 
-/**
- * struct binder_ptr_fixup - data to be fixed-up in target buffer
- * @offset	offset in target buffer to fixup
- * @skip_size	bytes to skip in copy (fixup will be written later)
- * @fixup_data	data to write at fixup offset
- * @node	list node
- *
- * This is used for the pointer fixup list (pf) which is created and consumed
- * during binder_transaction() and is only accessed locally. No
- * locking is necessary.
- *
- * The list is ordered by @offset.
- */
-struct binder_ptr_fixup {
-	binder_size_t offset;
-	size_t skip_size;
-	binder_uintptr_t fixup_data;
-	struct list_head node;
-};
-
-/**
- * struct binder_sg_copy - scatter-gather data to be copied
- * @offset		offset in target buffer
- * @sender_uaddr	user address in source buffer
- * @length		bytes to copy
- * @node		list node
- *
- * This is used for the sg copy list (sgc) which is created and consumed
- * during binder_transaction() and is only accessed locally. No
- * locking is necessary.
- *
- * The list is ordered by @offset.
- */
-struct binder_sg_copy {
-	binder_size_t offset;
-	const void __user *sender_uaddr;
-	size_t length;
-	struct list_head node;
-};
-
-/**
- * binder_do_deferred_txn_copies() - copy and fixup scatter-gather data
- * @alloc:	binder_alloc associated with @buffer
- * @buffer:	binder buffer in target process
- * @sgc_head:	list_head of scatter-gather copy list
- * @pf_head:	list_head of pointer fixup list
- *
- * Processes all elements of @sgc_head, applying fixups from @pf_head
- * and copying the scatter-gather data from the source process' user
- * buffer to the target's buffer. It is expected that the list creation
- * and processing all occurs during binder_transaction() so these lists
- * are only accessed in local context.
- *
- * Return: 0=success, else -errno
- */
-static int binder_do_deferred_txn_copies(struct binder_alloc *alloc,
-					 struct binder_buffer *buffer,
-					 struct list_head *sgc_head,
-					 struct list_head *pf_head)
-{
-	int ret = 0;
-	struct binder_sg_copy *sgc, *tmpsgc;
-	struct binder_ptr_fixup *tmppf;
-	struct binder_ptr_fixup *pf =
-		list_first_entry_or_null(pf_head, struct binder_ptr_fixup,
-					 node);
-
-	list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) {
-		size_t bytes_copied = 0;
-
-		while (bytes_copied < sgc->length) {
-			size_t copy_size;
-			size_t bytes_left = sgc->length - bytes_copied;
-			size_t offset = sgc->offset + bytes_copied;
-
-			/*
-			 * We copy up to the fixup (pointed to by pf)
-			 */
-			copy_size = pf ? min(bytes_left, (size_t)pf->offset - offset)
-				       : bytes_left;
-			if (!ret && copy_size)
-				ret = binder_alloc_copy_user_to_buffer(
-						alloc, buffer,
-						offset,
-						sgc->sender_uaddr + bytes_copied,
-						copy_size);
-			bytes_copied += copy_size;
-			if (copy_size != bytes_left) {
-				BUG_ON(!pf);
-				/* we stopped at a fixup offset */
-				if (pf->skip_size) {
-					/*
-					 * we are just skipping. This is for
-					 * BINDER_TYPE_FDA where the translated
-					 * fds will be fixed up when we get
-					 * to target context.
-					 */
-					bytes_copied += pf->skip_size;
-				} else {
-					/* apply the fixup indicated by pf */
-					if (!ret)
-						ret = binder_alloc_copy_to_buffer(
-							alloc, buffer,
-							pf->offset,
-							&pf->fixup_data,
-							sizeof(pf->fixup_data));
-					bytes_copied += sizeof(pf->fixup_data);
-				}
-				list_del(&pf->node);
-				kfree(pf);
-				pf = list_first_entry_or_null(pf_head,
-						struct binder_ptr_fixup, node);
-			}
-		}
-		list_del(&sgc->node);
-		kfree(sgc);
-	}
-	list_for_each_entry_safe(pf, tmppf, pf_head, node) {
-		BUG_ON(pf->skip_size == 0);
-		list_del(&pf->node);
-		kfree(pf);
-	}
-	BUG_ON(!list_empty(sgc_head));
-
-	return ret > 0 ? -EINVAL : ret;
-}
-
-/**
- * binder_cleanup_deferred_txn_lists() - free specified lists
- * @sgc_head:	list_head of scatter-gather copy list
- * @pf_head:	list_head of pointer fixup list
- *
- * Called to clean up @sgc_head and @pf_head if there is an
- * error.
- */
-static void binder_cleanup_deferred_txn_lists(struct list_head *sgc_head,
-					      struct list_head *pf_head)
-{
-	struct binder_sg_copy *sgc, *tmpsgc;
-	struct binder_ptr_fixup *pf, *tmppf;
-
-	list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) {
-		list_del(&sgc->node);
-		kfree(sgc);
-	}
-	list_for_each_entry_safe(pf, tmppf, pf_head, node) {
-		list_del(&pf->node);
-		kfree(pf);
-	}
-}
-
-/**
- * binder_defer_copy() - queue a scatter-gather buffer for copy
- * @sgc_head:		list_head of scatter-gather copy list
- * @offset:		binder buffer offset in target process
- * @sender_uaddr:	user address in source process
- * @length:		bytes to copy
- *
- * Specify a scatter-gather block to be copied. The actual copy must
- * be deferred until all the needed fixups are identified and queued.
- * Then the copy and fixups are done together so un-translated values
- * from the source are never visible in the target buffer.
- *
- * We are guaranteed that repeated calls to this function will have
- * monotonically increasing @offset values so the list will naturally
- * be ordered.
- *
- * Return: 0=success, else -errno
- */
-static int binder_defer_copy(struct list_head *sgc_head, binder_size_t offset,
-			     const void __user *sender_uaddr, size_t length)
-{
-	struct binder_sg_copy *bc = kzalloc(sizeof(*bc), GFP_KERNEL);
-
-	if (!bc)
-		return -ENOMEM;
-
-	bc->offset = offset;
-	bc->sender_uaddr = sender_uaddr;
-	bc->length = length;
-	INIT_LIST_HEAD(&bc->node);
-
-	/*
-	 * We are guaranteed that the deferred copies are in-order
-	 * so just add to the tail.
-	 */
-	list_add_tail(&bc->node, sgc_head);
-
-	return 0;
-}
-
-/**
- * binder_add_fixup() - queue a fixup to be applied to sg copy
- * @pf_head:	list_head of binder ptr fixup list
- * @offset:	binder buffer offset in target process
- * @fixup:	bytes to be copied for fixup
- * @skip_size:	bytes to skip when copying (fixup will be applied later)
- *
- * Add the specified fixup to a list ordered by @offset. When copying
- * the scatter-gather buffers, the fixup will be copied instead of
- * data from the source buffer. For BINDER_TYPE_FDA fixups, the fixup
- * will be applied later (in target process context), so we just skip
- * the bytes specified by @skip_size. If @skip_size is 0, we copy the
- * value in @fixup.
- *
- * This function is called *mostly* in @offset order, but there are
- * exceptions. Since out-of-order inserts are relatively uncommon,
- * we insert the new element by searching backward from the tail of
- * the list.
- *
- * Return: 0=success, else -errno
- */
-static int binder_add_fixup(struct list_head *pf_head, binder_size_t offset,
-			    binder_uintptr_t fixup, size_t skip_size)
-{
-	struct binder_ptr_fixup *pf = kzalloc(sizeof(*pf), GFP_KERNEL);
-	struct binder_ptr_fixup *tmppf;
-
-	if (!pf)
-		return -ENOMEM;
-
-	pf->offset = offset;
-	pf->fixup_data = fixup;
-	pf->skip_size = skip_size;
-	INIT_LIST_HEAD(&pf->node);
-
-	/* Fixups are *mostly* added in-order, but there are some
-	 * exceptions. Look backwards through list for insertion point.
-	 */
-	list_for_each_entry_reverse(tmppf, pf_head, node) {
-		if (tmppf->offset < pf->offset) {
-			list_add(&pf->node, &tmppf->node);
-			return 0;
-		}
-	}
-	/*
-	 * if we get here, then the new offset is the lowest so
-	 * insert at the head
-	 */
-	list_add(&pf->node, pf_head);
-	return 0;
-}
-
-static int binder_translate_fd_array(struct list_head *pf_head,
-				     struct binder_fd_array_object *fda,
-				     const void __user *sender_ubuffer,
+static int binder_translate_fd_array(struct binder_fd_array_object *fda,
 				     struct binder_buffer_object *parent,
-				     struct binder_buffer_object *sender_uparent,
 				     struct binder_transaction *t,
 				     struct binder_thread *thread,
 				     struct binder_transaction *in_reply_to)
 {
-	binder_size_t fdi, fd_buf_size;
+	binder_size_t fdi, fd_buf_size, num_installed_fds;
 	binder_size_t fda_offset;
-	const void __user *sender_ufda_base;
+	int target_fd;
 	struct binder_proc *proc = thread->proc;
-	int ret;
-
-	if (fda->num_fds == 0)
-		return 0;
+	struct binder_proc *target_proc = t->to_proc;
 
 	fd_buf_size = sizeof(u32) * fda->num_fds;
 	if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
@@ -2749,36 +2787,46 @@ static int binder_translate_fd_array(struct list_head *pf_head,
 	 */
 	fda_offset = (parent->buffer - (uintptr_t)t->buffer->user_data) +
 		fda->parent_offset;
-	sender_ufda_base = (void __user *)(uintptr_t)sender_uparent->buffer +
-				fda->parent_offset;
-
-	if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32)) ||
-	    !IS_ALIGNED((unsigned long)sender_ufda_base, sizeof(u32))) {
+	if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32))) {
 		binder_user_error("%d:%d parent offset not aligned correctly.\n",
 				  proc->pid, thread->pid);
 		return -EINVAL;
 	}
-	ret = binder_add_fixup(pf_head, fda_offset, 0, fda->num_fds * sizeof(u32));
-	if (ret)
-		return ret;
-
 	for (fdi = 0; fdi < fda->num_fds; fdi++) {
 		u32 fd;
+
 		binder_size_t offset = fda_offset + fdi * sizeof(fd);
-		binder_size_t sender_uoffset = fdi * sizeof(fd);
 
-		ret = copy_from_user(&fd, sender_ufda_base + sender_uoffset, sizeof(fd));
-		if (!ret)
-			ret = binder_translate_fd(fd, offset, t, thread,
-						  in_reply_to);
-		if (ret)
-			return ret > 0 ? -EINVAL : ret;
+		binder_alloc_copy_from_buffer(&target_proc->alloc,
+					      &fd, t->buffer,
+					      offset, sizeof(fd));
+		target_fd = binder_translate_fd(fd, t, thread, in_reply_to);
+		if (target_fd < 0)
+			goto err_translate_fd_failed;
+		binder_alloc_copy_to_buffer(&target_proc->alloc,
+					    t->buffer, offset,
+					    &target_fd, sizeof(fd));
 	}
 	return 0;
+
+err_translate_fd_failed:
+	/*
+	 * Failed to allocate fd or security error, free fds
+	 * installed so far.
+	 */
+	num_installed_fds = fdi;
+	for (fdi = 0; fdi < num_installed_fds; fdi++) {
+		u32 fd;
+		binder_size_t offset = fda_offset + fdi * sizeof(fd);
+		binder_alloc_copy_from_buffer(&target_proc->alloc,
+					      &fd, t->buffer,
+					      offset, sizeof(fd));
+		task_close_fd(target_proc, fd);
+	}
+	return target_fd;
 }
 
-static int binder_fixup_parent(struct list_head *pf_head,
-			       struct binder_transaction *t,
+static int binder_fixup_parent(struct binder_transaction *t,
 			       struct binder_thread *thread,
 			       struct binder_buffer_object *bp,
 			       binder_size_t off_start_offset,
@@ -2824,88 +2872,11 @@ static int binder_fixup_parent(struct list_head *pf_head,
 	}
 	buffer_offset = bp->parent_offset +
 			(uintptr_t)parent->buffer - (uintptr_t)b->user_data;
-	return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0);
-}
-
-/**
- * binder_can_update_transaction() - Can a txn be superseded by an updated one?
- * @t1: the pending async txn in the frozen process
- * @t2: the new async txn to supersede the outdated pending one
- *
- * Return:  true if t2 can supersede t1
- *          false if t2 can not supersede t1
- */
-static bool binder_can_update_transaction(struct binder_transaction *t1,
-					  struct binder_transaction *t2)
-{
-#ifdef CONFIG_REKERNEL
-	if ((t1->flags & t2->flags & TF_ONE_WAY) != TF_ONE_WAY || !t1->to_proc || !t2->to_proc)
-#else
-	if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) !=
-	    (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc)
-#endif /* CONFIG_REKERNEL */
-		return false;
-	if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code &&
-	    t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid &&
-	    t1->buffer->target_node->ptr == t2->buffer->target_node->ptr &&
-	    t1->buffer->target_node->cookie == t2->buffer->target_node->cookie)
-		return true;
-	return false;
-}
-
-/**
- * binder_find_outdated_transaction_ilocked() - Find the outdated transaction
- * @t:		 new async transaction
- * @target_list: list to find outdated transaction
- *
- * Return: the outdated transaction if found
- *         NULL if no outdated transacton can be found
- *
- * Requires the proc->inner_lock to be held.
- */
-static struct binder_transaction *
-binder_find_outdated_transaction_ilocked(struct binder_transaction *t,
-					 struct list_head *target_list)
-{
-	struct binder_work *w;
+	binder_alloc_copy_to_buffer(&target_proc->alloc, b, buffer_offset,
+				    &bp->buffer, sizeof(bp->buffer));
 
-	list_for_each_entry(w, target_list, entry) {
-		struct binder_transaction *t_queued;
-
-		if (w->type != BINDER_WORK_TRANSACTION)
-			continue;
-		t_queued = container_of(w, struct binder_transaction, work);
-		if (binder_can_update_transaction(t_queued, t))
-			return t_queued;
-	}
-	return NULL;
-}
-
-#ifdef CONFIG_REKERNEL
-void rekernel_binder_transaction(bool reply, struct binder_transaction *t,
-			struct binder_node *target_node, struct binder_transaction_data *tr) {
-	struct binder_proc *to_proc;
-	struct binder_alloc *target_alloc;
-	if (!t->to_proc)
-		return;
-	to_proc = t->to_proc;
-
-	if (reply) {
-		binder_reply_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, false, tr);
-	} else if (t->from) {
-		if (t->from->proc) {
-			binder_trans_handler(t->from->proc->pid, t->from->proc->tsk, to_proc->pid, to_proc->tsk, false, tr);
-		}
-	} else { // oneway=1
-		binder_trans_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, true, tr);
-
-		target_alloc = &to_proc->alloc;
-		if (target_alloc->free_async_space < (target_alloc->buffer_size / 10 + 0x300)) {
-			binder_overflow_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, true, tr);
-		}
-	}
+	return 0;
 }
-#endif /* CONFIG_REKERNEL */
 
 /**
  * binder_proc_transaction() - sends a transaction to a process and wakes it up
@@ -2921,95 +2892,60 @@ void rekernel_binder_transaction(bool reply, struct binder_transaction *t,
  * If the @thread parameter is not NULL, the transaction is always queued
  * to the waitlist of that specific thread.
  *
- * Return:	0 if the transaction was successfully queued
- *		BR_DEAD_REPLY if the target process or thread is dead
- *		BR_FROZEN_REPLY if the target process or thread is frozen
+ * Return:	true if the transaction was successfully queued
+ *		false if the target process or thread is dead
  */
-static int binder_proc_transaction(struct binder_transaction *t,
+static bool binder_proc_transaction(struct binder_transaction *t,
 				    struct binder_proc *proc,
 				    struct binder_thread *thread)
 {
 	struct binder_node *node = t->buffer->target_node;
+	struct binder_priority node_prio;
 	bool oneway = !!(t->flags & TF_ONE_WAY);
 	bool pending_async = false;
-	bool skip = false;
-	struct binder_transaction *t_outdated = NULL;
 
 	BUG_ON(!node);
 	binder_node_lock(node);
+	node_prio.prio = node->min_priority;
+	node_prio.sched_policy = node->sched_policy;
 
 	if (oneway) {
 		BUG_ON(thread);
-		if (node->has_async_transaction)
+		if (node->has_async_transaction) {
 			pending_async = true;
-		else
+		} else {
 			node->has_async_transaction = true;
+		}
 	}
 
 	binder_inner_proc_lock(proc);
-	if (proc->is_frozen) {
-		proc->sync_recv |= !oneway;
-		proc->async_recv |= oneway;
-	}
 
-	if ((proc->is_frozen && !oneway) || proc->is_dead ||
-			(thread && thread->is_dead)) {
+	if (proc->is_dead || (thread && thread->is_dead)) {
 		binder_inner_proc_unlock(proc);
 		binder_node_unlock(node);
-		return proc->is_frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY;
+		return false;
 	}
 
-	if (!thread && !pending_async && !skip)
+	if (!thread && !pending_async)
 		thread = binder_select_thread_ilocked(proc);
 
 	if (thread) {
-		binder_transaction_priority(thread, t, node);
+		binder_transaction_priority(thread->task, t, node_prio,
+					    node->inherit_rt);
 		binder_enqueue_thread_work_ilocked(thread, &t->work);
 	} else if (!pending_async) {
 		binder_enqueue_work_ilocked(&t->work, &proc->todo);
 	} else {
-#ifdef CONFIG_REKERNEL
-		if (frozen_task_group(proc->tsk)) {
-#else
-		if ((t->flags & TF_UPDATE_TXN) && proc->is_frozen) {
-#endif /* CONFIG_REKERNEL */
-			t_outdated = binder_find_outdated_transaction_ilocked(t,
-									      &node->async_todo);
-			if (t_outdated) {
-				binder_debug(BINDER_DEBUG_TRANSACTION,
-					     "txn %d supersedes %d\n",
-					     t->debug_id, t_outdated->debug_id);
-				list_del_init(&t_outdated->work.entry);
-				proc->outstanding_txns--;
-			}
-		}
 		binder_enqueue_work_ilocked(&t->work, &node->async_todo);
 	}
 
 	if (!pending_async)
 		binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);
 
-	proc->outstanding_txns++;
 	binder_inner_proc_unlock(proc);
 	binder_node_unlock(node);
 
-	/*
-	 * To reduce potential contention, free the outdated transaction and
-	 * buffer after releasing the locks.
-	 */
-	if (t_outdated) {
-		struct binder_buffer *buffer = t_outdated->buffer;
-
-		t_outdated->buffer = NULL;
-		buffer->transaction = NULL;
-		trace_binder_transaction_update_buffer_release(buffer);
-		binder_transaction_buffer_release(proc, NULL, buffer, 0, 0);
-		binder_alloc_free_buf(&proc->alloc, buffer);
-		kfree(t_outdated);
-		binder_stats_deleted(BINDER_STAT_TRANSACTION);
-	}
-
-	return 0;
+	return true;
 }
 
 /**
@@ -3045,7 +2981,7 @@ static struct binder_node *binder_get_node_refs_for_txn(
 		target_node = node;
 		binder_inc_node_nilocked(node, 1, 0, NULL);
 		binder_inc_node_tmpref_ilocked(node);
-		node->proc->tmp_ref++;
+		atomic_inc(&node->proc->tmp_ref);
 		*procp = node->proc;
 	} else
 		*error = BR_DEAD_REPLY;
@@ -3061,13 +2997,11 @@ static void binder_transaction(struct binder_proc *proc,
 {
 	int ret;
 	struct binder_transaction *t;
-	struct binder_work *w;
 	struct binder_work *tcomplete;
 	binder_size_t buffer_offset = 0;
 	binder_size_t off_start_offset, off_end_offset;
 	binder_size_t off_min;
 	binder_size_t sg_buf_offset, sg_buf_end_offset;
-	binder_size_t user_offset = 0;
 	struct binder_proc *target_proc = NULL;
 	struct binder_thread *target_thread = NULL;
 	struct binder_node *target_node = NULL;
@@ -3082,13 +3016,6 @@ static void binder_transaction(struct binder_proc *proc,
 	int t_debug_id = atomic_inc_return(&binder_last_id);
 	char *secctx = NULL;
 	u32 secctx_sz = 0;
-	bool is_nested = false;
-	struct list_head sgc_head;
-	struct list_head pf_head;
-	const void __user *user_buffer = (const void __user *)
-				(uintptr_t)tr->data.ptr.buffer;
-	INIT_LIST_HEAD(&sgc_head);
-	INIT_LIST_HEAD(&pf_head);
 
 	e = binder_transaction_log_add(&binder_transaction_log);
 	e->debug_id = t_debug_id;
@@ -3098,7 +3025,7 @@ static void binder_transaction(struct binder_proc *proc,
 	e->target_handle = tr->target.handle;
 	e->data_size = tr->data_size;
 	e->offsets_size = tr->offsets_size;
-	strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME);
+	e->context_name = proc->context->name;
 
 	if (reply) {
 		binder_inner_proc_lock(proc);
@@ -3132,8 +3059,6 @@ static void binder_transaction(struct binder_proc *proc,
 		binder_inner_proc_unlock(proc);
 		target_thread = binder_get_txn_from_and_acq_inner(in_reply_to);
 		if (target_thread == NULL) {
-			/* annotation for sparse */
-			__release(&target_thread->proc->inner_lock);
 			return_error = BR_DEAD_REPLY;
 			return_error_line = __LINE__;
 			goto err_dead_binder;
@@ -3153,7 +3078,7 @@ static void binder_transaction(struct binder_proc *proc,
 			goto err_dead_binder;
 		}
 		target_proc = target_thread->proc;
-		target_proc->tmp_ref++;
+		atomic_inc(&target_proc->tmp_ref);
 		binder_inner_proc_unlock(target_thread->proc);
 	} else {
 		if (tr->target.handle) {
@@ -3174,8 +3099,8 @@ static void binder_transaction(struct binder_proc *proc,
 						ref->node, &target_proc,
 						&return_error);
 			} else {
-				binder_user_error("%d:%d got transaction to invalid handle, %u\n",
-						  proc->pid, thread->pid, tr->target.handle);
+				binder_user_error("%d:%d got transaction to invalid handle\n",
+						  proc->pid, thread->pid);
 				return_error = BR_FAILED_REPLY;
 			}
 			binder_proc_unlock(proc);
@@ -3189,7 +3114,7 @@ static void binder_transaction(struct binder_proc *proc,
 			else
 				return_error = BR_DEAD_REPLY;
 			mutex_unlock(&context->context_mgr_node_lock);
-			if (target_node && target_proc->pid == proc->pid) {
+			if (target_node && target_proc == proc) {
 				binder_user_error("%d:%d got transaction to context manager from process owning it\n",
 						  proc->pid, thread->pid);
 				return_error = BR_FAILED_REPLY;
@@ -3221,29 +3146,6 @@ static void binder_transaction(struct binder_proc *proc,
 			goto err_invalid_target_handle;
 		}
 		binder_inner_proc_lock(proc);
-
-		w = list_first_entry_or_null(&thread->todo,
-					     struct binder_work, entry);
-		if (!(tr->flags & TF_ONE_WAY) && w &&
-		    w->type == BINDER_WORK_TRANSACTION) {
-			/*
-			 * Do not allow new outgoing transaction from a
-			 * thread that has a transaction at the head of
-			 * its todo list. Only need to check the head
-			 * because binder_select_thread_ilocked picks a
-			 * thread from proc->waiting_threads to enqueue
-			 * the transaction, and nothing is queued to the
-			 * todo list while the thread is on waiting_threads.
-			 */
-			binder_user_error("%d:%d new transaction not allowed when there is a transaction on thread todo\n",
-					  proc->pid, thread->pid);
-			binder_inner_proc_unlock(proc);
-			return_error = BR_FAILED_REPLY;
-			return_error_param = -EPROTO;
-			return_error_line = __LINE__;
-			goto err_bad_todo_list;
-		}
-
 		if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) {
 			struct binder_transaction *tmp;
 
@@ -3271,7 +3173,6 @@ static void binder_transaction(struct binder_proc *proc,
 					atomic_inc(&from->tmp_ref);
 					target_thread = from;
 					spin_unlock(&tmp->lock);
-					is_nested = true;
 					break;
 				}
 				spin_unlock(&tmp->lock);
@@ -3285,18 +3186,17 @@ static void binder_transaction(struct binder_proc *proc,
 	e->to_proc = target_proc->pid;
 
 	/* TODO: reuse incoming transaction for reply */
-	t = kmem_cache_zalloc(binder_transaction_pool, GFP_KERNEL);
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
 	if (t == NULL) {
 		return_error = BR_FAILED_REPLY;
 		return_error_param = -ENOMEM;
 		return_error_line = __LINE__;
 		goto err_alloc_t_failed;
 	}
-	INIT_LIST_HEAD(&t->fd_fixups);
 	binder_stats_created(BINDER_STAT_TRANSACTION);
 	spin_lock_init(&t->lock);
 
-	tcomplete = kmem_cache_zalloc(binder_work_pool, GFP_KERNEL);
+	tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL);
 	if (tcomplete == NULL) {
 		return_error = BR_FAILED_REPLY;
 		return_error_param = -ENOMEM;
@@ -3335,7 +3235,6 @@ static void binder_transaction(struct binder_proc *proc,
 	t->to_thread = target_thread;
 	t->code = tr->code;
 	t->flags = tr->flags;
-	t->is_nested = is_nested;
 	if (!(t->flags & TF_ONE_WAY) &&
 	    binder_supported_policy(current->policy)) {
 		/* Inherit supported policies for synchronous transactions */
@@ -3363,15 +3262,12 @@ static void binder_transaction(struct binder_proc *proc,
 		if (extra_buffers_size < added_size) {
 			/* integer overflow of extra_buffers_size */
 			return_error = BR_FAILED_REPLY;
 			return_error_param = -EINVAL;
 			return_error_line = __LINE__;
 			goto err_bad_extra_size;
 		}
 	}
 
-#ifdef CONFIG_REKERNEL
-	rekernel_binder_transaction(reply, t, target_node, tr);
-#endif /* CONFIG_REKERNEL */
 	trace_binder_transaction(reply, t, target_node);
 
 	t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size,
@@ -3389,20 +3285,15 @@ static void binder_transaction(struct binder_proc *proc,
 		goto err_binder_alloc_buf_failed;
 	}
 	if (secctx) {
-		int err;
 		size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) +
 				    ALIGN(tr->offsets_size, sizeof(void *)) +
 				    ALIGN(extra_buffers_size, sizeof(void *)) -
 				    ALIGN(secctx_sz, sizeof(u64));
 
 		t->security_ctx = (uintptr_t)t->buffer->user_data + buf_offset;
-		err = binder_alloc_copy_to_buffer(&target_proc->alloc,
-						  t->buffer, buf_offset,
-						  secctx, secctx_sz);
-		if (err) {
-			t->security_ctx = 0;
-			WARN_ON(1);
-		}
+		binder_alloc_copy_to_buffer(&target_proc->alloc,
+					    t->buffer, buf_offset,
+					    secctx, secctx_sz);
 		security_release_secctx(secctx, secctx_sz);
 		secctx = NULL;
 	}
@@ -3412,6 +3303,19 @@ static void binder_transaction(struct binder_proc *proc,
 	t->buffer->clear_on_free = !!(t->flags & TF_CLEAR_BUF);
 	trace_binder_transaction_alloc_buf(t->buffer);
 
+	if (binder_alloc_copy_user_to_buffer(
+				&target_proc->alloc,
+				t->buffer, 0,
+				(const void __user *)
+					(uintptr_t)tr->data.ptr.buffer,
+				tr->data_size)) {
+		binder_user_error("%d:%d got transaction with invalid data ptr\n",
+				proc->pid, thread->pid);
+		return_error = BR_FAILED_REPLY;
+		return_error_param = -EFAULT;
+		return_error_line = __LINE__;
+		goto err_copy_data_failed;
+	}
 	if (binder_alloc_copy_user_to_buffer(
 				&target_proc->alloc,
 				t->buffer,
@@ -3456,39 +3360,14 @@ static void binder_transaction(struct binder_proc *proc,
 		size_t object_size;
 		struct binder_object object;
 		binder_size_t object_offset;
-		binder_size_t copy_size;
 
-		if (binder_alloc_copy_from_buffer(&target_proc->alloc,
-						  &object_offset,
-						  t->buffer,
-						  buffer_offset,
-						  sizeof(object_offset))) {
-			return_error = BR_FAILED_REPLY;
-			return_error_param = -EINVAL;
-			return_error_line = __LINE__;
-			goto err_bad_offset;
-		}
-
-		/*
-		 * Copy the source user buffer up to the next object
-		 * that will be processed.
-		 */
-		copy_size = object_offset - user_offset;
-		if (copy_size && (user_offset > object_offset ||
-				binder_alloc_copy_user_to_buffer(
-					&target_proc->alloc,
-					t->buffer, user_offset,
-					user_buffer + user_offset,
-					copy_size))) {
-			binder_user_error("%d:%d got transaction with invalid data ptr\n",
-					proc->pid, thread->pid);
-			return_error = BR_FAILED_REPLY;
-			return_error_param = -EFAULT;
-			return_error_line = __LINE__;
-			goto err_copy_data_failed;
-		}
-		object_size = binder_get_object(target_proc, user_buffer,
-				t->buffer, object_offset, &object);
+		binder_alloc_copy_from_buffer(&target_proc->alloc,
+					      &object_offset,
+					      t->buffer,
+					      buffer_offset,
+					      sizeof(object_offset));
+		object_size = binder_get_object(target_proc, t->buffer,
+						object_offset, &object);
 		if (object_size == 0 || object_offset < off_min) {
 			binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n",
 					  proc->pid, thread->pid,
@@ -3500,11 +3379,6 @@ static void binder_transaction(struct binder_proc *proc,
 			return_error_line = __LINE__;
 			goto err_bad_offset;
 		}
-		/*
-		 * Set offset to the next buffer fragment to be
-		 * copied
-		 */
-		user_offset = object_offset + object_size;
 
 		hdr = &object.hdr;
 		off_min = object_offset + object_size;
@@ -3515,17 +3389,15 @@ static void binder_transaction(struct binder_proc *proc,
 
 			fp = to_flat_binder_object(hdr);
 			ret = binder_translate_binder(fp, t, thread);
-
-			if (ret < 0 ||
-			    binder_alloc_copy_to_buffer(&target_proc->alloc,
-							t->buffer,
-							object_offset,
-							fp, sizeof(*fp))) {
+			if (ret < 0) {
 				return_error = BR_FAILED_REPLY;
 				return_error_param = ret;
 				return_error_line = __LINE__;
 				goto err_translate_failed;
 			}
+			binder_alloc_copy_to_buffer(&target_proc->alloc,
+						    t->buffer, object_offset,
+						    fp, sizeof(*fp));
 		} break;
 		case BINDER_TYPE_HANDLE:
 		case BINDER_TYPE_WEAK_HANDLE: {
@@ -3533,42 +3405,37 @@ static void binder_transaction(struct binder_proc *proc,
 
 			fp = to_flat_binder_object(hdr);
 			ret = binder_translate_handle(fp, t, thread);
-			if (ret < 0 ||
-			    binder_alloc_copy_to_buffer(&target_proc->alloc,
-							t->buffer,
-							object_offset,
-							fp, sizeof(*fp))) {
+			if (ret < 0) {
 				return_error = BR_FAILED_REPLY;
 				return_error_param = ret;
 				return_error_line = __LINE__;
 				goto err_translate_failed;
 			}
+			binder_alloc_copy_to_buffer(&target_proc->alloc,
+						    t->buffer, object_offset,
+						    fp, sizeof(*fp));
 		} break;
 
 		case BINDER_TYPE_FD: {
 			struct binder_fd_object *fp = to_binder_fd_object(hdr);
-			binder_size_t fd_offset = object_offset +
-				(uintptr_t)&fp->fd - (uintptr_t)fp;
-			int ret = binder_translate_fd(fp->fd, fd_offset, t,
-						      thread, in_reply_to);
+			int target_fd = binder_translate_fd(fp->fd, t, thread,
+							    in_reply_to);
 
-			fp->pad_binder = 0;
-			if (ret < 0 ||
-			    binder_alloc_copy_to_buffer(&target_proc->alloc,
-							t->buffer,
-							object_offset,
-							fp, sizeof(*fp))) {
+			if (target_fd < 0) {
 				return_error = BR_FAILED_REPLY;
-				return_error_param = ret;
+				return_error_param = target_fd;
 				return_error_line = __LINE__;
 				goto err_translate_failed;
 			}
+			fp->pad_binder = 0;
+			fp->fd = target_fd;
+			binder_alloc_copy_to_buffer(&target_proc->alloc,
+						    t->buffer, object_offset,
+						    fp, sizeof(*fp));
 		} break;
 		case BINDER_TYPE_FDA: {
 			struct binder_object ptr_object;
 			binder_size_t parent_offset;
-			struct binder_object user_object;
-			size_t user_parent_size;
 			struct binder_fd_array_object *fda =
 				to_binder_fd_array_object(hdr);
 			size_t num_valid = (buffer_offset - off_start_offset) /
@@ -3600,35 +3467,11 @@ static void binder_transaction(struct binder_proc *proc,
 				return_error_line = __LINE__;
 				goto err_bad_parent;
 			}
-			/*
-			 * We need to read the user version of the parent
-			 * object to get the original user offset
-			 */
-			user_parent_size =
-				binder_get_object(proc, user_buffer, t->buffer,
-						  parent_offset, &user_object);
-			if (user_parent_size != sizeof(user_object.bbo)) {
-				binder_user_error("%d:%d invalid ptr object size: %zd vs %zd\n",
-						  proc->pid, thread->pid,
-						  user_parent_size,
-						  sizeof(user_object.bbo));
-				return_error = BR_FAILED_REPLY;
-				return_error_param = -EINVAL;
-				return_error_line = __LINE__;
-				goto err_bad_parent;
-			}
-			ret = binder_translate_fd_array(&pf_head, fda,
-							user_buffer, parent,
-							&user_object.bbo, t,
-							thread, in_reply_to);
-			if (!ret)
-				ret = binder_alloc_copy_to_buffer(&target_proc->alloc,
-								  t->buffer,
-								  object_offset,
-								  fda, sizeof(*fda));
-			if (ret) {
+			ret = binder_translate_fd_array(fda, parent, t, thread,
+							in_reply_to);
+			if (ret < 0) {
 				return_error = BR_FAILED_REPLY;
-				return_error_param = ret > 0 ? -EINVAL : ret;
+				return_error_param = ret;
 				return_error_line = __LINE__;
 				goto err_translate_failed;
 			}
@@ -3650,14 +3493,19 @@ static void binder_transaction(struct binder_proc *proc,
 				return_error_line = __LINE__;
 				goto err_bad_offset;
 			}
-			ret = binder_defer_copy(&sgc_head, sg_buf_offset,
-				(const void __user *)(uintptr_t)bp->buffer,
-				bp->length);
-			if (ret) {
+			if (binder_alloc_copy_user_to_buffer(
+						&target_proc->alloc,
+						t->buffer,
+						sg_buf_offset,
+						(const void __user *)
+							(uintptr_t)bp->buffer,
+						bp->length)) {
+				binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
+						  proc->pid, thread->pid);
+				return_error_param = -EFAULT;
 				return_error = BR_FAILED_REPLY;
-				return_error_param = ret;
 				return_error_line = __LINE__;
-				goto err_translate_failed;
+				goto err_copy_data_failed;
 			}
 			/* Fixup buffer pointer to target proc address space */
 			bp->buffer = (uintptr_t)
@@ -3666,22 +3514,20 @@ static void binder_transaction(struct binder_proc *proc,
 
 			num_valid = (buffer_offset - off_start_offset) /
 					sizeof(binder_size_t);
-			ret = binder_fixup_parent(&pf_head, t,
-						  thread, bp,
+			ret = binder_fixup_parent(t, thread, bp,
 						  off_start_offset,
 						  num_valid,
 						  last_fixup_obj_off,
 						  last_fixup_min_off);
-			if (ret < 0 ||
-			    binder_alloc_copy_to_buffer(&target_proc->alloc,
-							t->buffer,
-							object_offset,
-							bp, sizeof(*bp))) {
+			if (ret < 0) {
 				return_error = BR_FAILED_REPLY;
 				return_error_param = ret;
 				return_error_line = __LINE__;
 				goto err_translate_failed;
 			}
+			binder_alloc_copy_to_buffer(&target_proc->alloc,
+						    t->buffer, object_offset,
+						    bp, sizeof(*bp));
 			last_fixup_obj_off = object_offset;
 			last_fixup_min_off = 0;
 		} break;
@@ -3694,57 +3540,22 @@ static void binder_transaction(struct binder_proc *proc,
 			goto err_bad_object_type;
 		}
 	}
-	/* Done processing objects, copy the rest of the buffer */
-	if (binder_alloc_copy_user_to_buffer(
-				&target_proc->alloc,
-				t->buffer, user_offset,
-				user_buffer + user_offset,
-				tr->data_size - user_offset)) {
-		binder_user_error("%d:%d got transaction with invalid data ptr\n",
-				proc->pid, thread->pid);
-		return_error = BR_FAILED_REPLY;
-		return_error_param = -EFAULT;
-		return_error_line = __LINE__;
-		goto err_copy_data_failed;
-	}
-
-	ret = binder_do_deferred_txn_copies(&target_proc->alloc, t->buffer,
-					    &sgc_head, &pf_head);
-	if (ret) {
-		binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
-				  proc->pid, thread->pid);
-		return_error = BR_FAILED_REPLY;
-		return_error_param = ret;
-		return_error_line = __LINE__;
-		goto err_copy_data_failed;
-	}
-	if (t->buffer->oneway_spam_suspect)
-		tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT;
-	else
-		tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
+	tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
 	t->work.type = BINDER_WORK_TRANSACTION;
 
 	if (reply) {
 		binder_enqueue_thread_work(thread, tcomplete);
 		binder_inner_proc_lock(target_proc);
 		if (target_thread->is_dead) {
-			return_error = BR_DEAD_REPLY;
 			binder_inner_proc_unlock(target_proc);
 			goto err_dead_proc_or_thread;
 		}
 		BUG_ON(t->buffer->async_transaction != 0);
 		binder_pop_transaction_ilocked(target_thread, in_reply_to);
 		binder_enqueue_thread_work_ilocked(target_thread, &t->work);
-		target_proc->outstanding_txns++;
 		binder_inner_proc_unlock(target_proc);
-		if (in_reply_to->is_nested) {
-			spin_lock(&thread->prio_lock);
-			thread->prio_state = BINDER_PRIO_PENDING;
-			thread->prio_next = in_reply_to->saved_priority;
-			spin_unlock(&thread->prio_lock);
-		}
 		wake_up_interruptible_sync(&target_thread->wait);
-		binder_restore_priority(thread, &in_reply_to->saved_priority);
+		binder_restore_priority(current, in_reply_to->saved_priority);
 		binder_free_transaction(in_reply_to);
 	} else if (!(t->flags & TF_ONE_WAY)) {
 		BUG_ON(t->buffer->async_transaction != 0);
@@ -3761,9 +3572,7 @@ static void binder_transaction(struct binder_proc *proc,
 		t->from_parent = thread->transaction_stack;
 		thread->transaction_stack = t;
 		binder_inner_proc_unlock(proc);
-		return_error = binder_proc_transaction(t,
-				target_proc, target_thread);
-		if (return_error) {
+		if (!binder_proc_transaction(t, target_proc, target_thread)) {
 			binder_inner_proc_lock(proc);
 			binder_pop_transaction_ilocked(thread, t);
 			binder_inner_proc_unlock(proc);
@@ -3773,8 +3582,7 @@ static void binder_transaction(struct binder_proc *proc,
 		BUG_ON(target_node == NULL);
 		BUG_ON(t->buffer->async_transaction != 1);
 		binder_enqueue_thread_work(thread, tcomplete);
-		return_error = binder_proc_transaction(t, target_proc, NULL);
-		if (return_error)
+		if (!binder_proc_transaction(t, target_proc, NULL))
 			goto err_dead_proc_or_thread;
 	}
 	if (target_thread)
@@ -3791,6 +3599,7 @@ static void binder_transaction(struct binder_proc *proc,
 	return;
 
 err_dead_proc_or_thread:
+	return_error = BR_DEAD_REPLY;
 	return_error_line = __LINE__;
 	binder_dequeue_work(proc, tcomplete);
 err_translate_failed:
@@ -3798,10 +3607,8 @@ static void binder_transaction(struct binder_proc *proc,
 err_bad_offset:
 err_bad_parent:
 err_copy_data_failed:
-	binder_cleanup_deferred_txn_lists(&sgc_head, &pf_head);
-	binder_free_txn_fixups(t);
 	trace_binder_transaction_failed_buffer_release(t->buffer);
-	binder_transaction_buffer_release(target_proc, NULL, t->buffer,
+	binder_transaction_buffer_release(target_proc, t->buffer,
 					  buffer_offset, true);
 	if (target_node)
 		binder_dec_node_tmpref(target_node);
@@ -3813,15 +3620,12 @@ static void binder_transaction(struct binder_proc *proc,
 	if (secctx)
 		security_release_secctx(secctx, secctx_sz);
 err_get_secctx_failed:
-	kmem_cache_free(binder_work_pool, tcomplete);
+	kfree(tcomplete);
 	binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
 err_alloc_tcomplete_failed:
-	if (trace_binder_txn_latency_free_enabled())
-		binder_txn_latency_free(t);
-	kmem_cache_free(binder_transaction_pool, t);
+	kfree(t);
 	binder_stats_deleted(BINDER_STAT_TRANSACTION);
 err_alloc_t_failed:
-err_bad_todo_list:
 err_bad_call_stack:
 err_empty_call_stack:
 err_dead_binder:
@@ -3855,65 +3659,19 @@ static void binder_transaction(struct binder_proc *proc,
 		 */
 		smp_wmb();
 		WRITE_ONCE(e->debug_id_done, t_debug_id);
-		WRITE_ONCE(fe->debug_id_done, t_debug_id);
-	}
-
-	BUG_ON(thread->return_error.cmd != BR_OK);
-	if (in_reply_to) {
-		binder_restore_priority(thread, &in_reply_to->saved_priority);
-		thread->return_error.cmd = BR_TRANSACTION_COMPLETE;
-		binder_enqueue_thread_work(thread, &thread->return_error.work);
-		binder_send_failed_reply(in_reply_to, return_error);
-	} else {
-		thread->return_error.cmd = return_error;
-		binder_enqueue_thread_work(thread, &thread->return_error.work);
-	}
-}
-
-/**
- * binder_free_buf() - free the specified buffer
- * @proc:	binder proc that owns buffer
- * @buffer:	buffer to be freed
- * @is_failure:	failed to send transaction
- *
- * If buffer for an async transaction, enqueue the next async
- * transaction from the node.
- *
- * Cleanup buffer and free it.
- */
-static void
-binder_free_buf(struct binder_proc *proc,
-		struct binder_thread *thread,
-		struct binder_buffer *buffer, bool is_failure)
-{
-	binder_inner_proc_lock(proc);
-	if (buffer->transaction) {
-		buffer->transaction->buffer = NULL;
-		buffer->transaction = NULL;
-	}
-	binder_inner_proc_unlock(proc);
-	if (buffer->async_transaction && buffer->target_node) {
-		struct binder_node *buf_node;
-		struct binder_work *w;
-
-		buf_node = buffer->target_node;
-		binder_node_inner_lock(buf_node);
-		BUG_ON(!buf_node->has_async_transaction);
-		BUG_ON(buf_node->proc != proc);
-		w = binder_dequeue_work_head_ilocked(
-				&buf_node->async_todo);
-		if (!w) {
-			buf_node->has_async_transaction = false;
-		} else {
-			binder_enqueue_work_ilocked(
-					w, &proc->todo);
-			binder_wakeup_proc_ilocked(proc);
-		}
-		binder_node_inner_unlock(buf_node);
+		WRITE_ONCE(fe->debug_id_done, t_debug_id);
+	}
+
+	BUG_ON(thread->return_error.cmd != BR_OK);
+	if (in_reply_to) {
+		binder_restore_priority(current, in_reply_to->saved_priority);
+		thread->return_error.cmd = BR_TRANSACTION_COMPLETE;
+		binder_enqueue_thread_work(thread, &thread->return_error.work);
+		binder_send_failed_reply(in_reply_to, return_error);
+	} else {
+		thread->return_error.cmd = return_error;
+		binder_enqueue_thread_work(thread, &thread->return_error.work);
 	}
-	trace_binder_transaction_buffer_release(buffer);
-	binder_transaction_buffer_release(proc, thread, buffer, 0, is_failure);
-	binder_alloc_free_buf(&proc->alloc, buffer);
 }
 
 static int binder_thread_write(struct binder_proc *proc,
@@ -3957,7 +3715,6 @@ static int binder_thread_write(struct binder_proc *proc,
 			ret = -1;
 			if (increment && !target) {
 				struct binder_node *ctx_mgr_node;
-
 				mutex_lock(&context->context_mgr_node_lock);
 				ctx_mgr_node = context->binder_context_mgr_node;
 				if (ctx_mgr_node) {
@@ -4114,7 +3871,35 @@ static int binder_thread_write(struct binder_proc *proc,
 				     proc->pid, thread->pid, (u64)data_ptr,
 				     buffer->debug_id,
 				     buffer->transaction ? "active" : "finished");
-			binder_free_buf(proc, thread, buffer, false);
+
+			binder_inner_proc_lock(proc);
+			if (buffer->transaction) {
+				buffer->transaction->buffer = NULL;
+				buffer->transaction = NULL;
+			}
+			binder_inner_proc_unlock(proc);
+			if (buffer->async_transaction && buffer->target_node) {
+				struct binder_node *buf_node;
+				struct binder_work *w;
+
+				buf_node = buffer->target_node;
+				binder_node_inner_lock(buf_node);
+				BUG_ON(!buf_node->has_async_transaction);
+				BUG_ON(buf_node->proc != proc);
+				w = binder_dequeue_work_head_ilocked(
+						&buf_node->async_todo);
+				if (!w) {
+					buf_node->has_async_transaction = false;
+				} else {
+					binder_enqueue_work_ilocked(
+							w, &proc->todo);
+					binder_wakeup_proc_ilocked(proc);
+				}
+				binder_node_inner_unlock(buf_node);
+			}
+			trace_binder_transaction_buffer_release(buffer);
+			binder_transaction_buffer_release(proc, buffer, 0, false);
+			binder_alloc_free_buf(&proc->alloc, buffer);
 			break;
 		}
 
@@ -4197,7 +3982,7 @@ static int binder_thread_write(struct binder_proc *proc,
 				 * Allocate memory for death notification
 				 * before taking lock
 				 */
-				death = kmem_cache_zalloc(binder_ref_death_pool, GFP_KERNEL);
+				death = kzalloc(sizeof(*death), GFP_KERNEL);
 				if (death == NULL) {
 					WARN_ON(thread->return_error.cmd !=
 						BR_OK);
@@ -4222,8 +4007,7 @@ static int binder_thread_write(struct binder_proc *proc,
 					"BC_CLEAR_DEATH_NOTIFICATION",
 					target);
 				binder_proc_unlock(proc);
-				if (death)
-					kmem_cache_free(binder_ref_death_pool, death);
+				kfree(death);
 				break;
 			}
 
@@ -4244,7 +4028,7 @@ static int binder_thread_write(struct binder_proc *proc,
 						proc->pid, thread->pid);
 					binder_node_unlock(ref->node);
 					binder_proc_unlock(proc);
-					kmem_cache_free(binder_ref_death_pool, death);
+					kfree(death);
 					break;
 				}
 				binder_stats_created(BINDER_STAT_DEATH);
@@ -4427,7 +4211,7 @@ static int binder_wait_for_work(struct binder_thread *thread,
 		binder_inner_proc_lock(proc);
 		list_del_init(&thread->waiting_thread_node);
 		if (signal_pending(current)) {
-			ret = -EINTR;
+			ret = -ERESTARTSYS;
 			break;
 		}
 	}
@@ -4438,71 +4222,6 @@ static int binder_wait_for_work(struct binder_thread *thread,
 	return ret;
 }
 
-/**
- * binder_apply_fd_fixups() - finish fd translation
- * @proc:         binder_proc associated @t->buffer
- * @t:	binder transaction with list of fd fixups
- *
- * Now that we are in the context of the transaction target
- * process, we can allocate and install fds. Process the
- * list of fds to translate and fixup the buffer with the
- * new fds.
- *
- * If we fail to allocate an fd, then free the resources by
- * fput'ing files that have not been processed and ksys_close'ing
- * any fds that have already been allocated.
- */
-static int binder_apply_fd_fixups(struct binder_proc *proc,
-				  struct binder_transaction *t)
-{
-	struct binder_txn_fd_fixup *fixup, *tmp;
-	int ret = 0;
-
-	list_for_each_entry(fixup, &t->fd_fixups, fixup_entry) {
-		int fd = get_unused_fd_flags(O_CLOEXEC);
-
-		if (fd < 0) {
-			binder_debug(BINDER_DEBUG_TRANSACTION,
-				     "failed fd fixup txn %d fd %d\n",
-				     t->debug_id, fd);
-			ret = -ENOMEM;
-			break;
-		}
-		binder_debug(BINDER_DEBUG_TRANSACTION,
-			     "fd fixup txn %d fd %d\n",
-			     t->debug_id, fd);
-		trace_binder_transaction_fd_recv(t, fd, fixup->offset);
-		fd_install(fd, fixup->file);
-		fixup->file = NULL;
-		if (binder_alloc_copy_to_buffer(&proc->alloc, t->buffer,
-						fixup->offset, &fd,
-						sizeof(u32))) {
-			ret = -EINVAL;
-			break;
-		}
-	}
-	list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) {
-		if (fixup->file) {
-			fput(fixup->file);
-		} else if (ret) {
-			u32 fd;
-			int err;
-
-			err = binder_alloc_copy_from_buffer(&proc->alloc, &fd,
-							    t->buffer,
-							    fixup->offset,
-							    sizeof(fd));
-			WARN_ON(err);
-			if (!err)
-				binder_deferred_fd_close(fd);
-		}
-		list_del(&fixup->fixup_entry);
-		kfree(fixup);
-	}
-
-	return ret;
-}
-
 static int binder_thread_read(struct binder_proc *proc,
 			      struct binder_thread *thread,
 			      binder_uintptr_t binder_buffer, size_t size,
@@ -4539,7 +4258,7 @@ static int binder_thread_read(struct binder_proc *proc,
 			wait_event_interruptible(binder_user_error_wait,
 						 binder_stop_on_user_error < 2);
 		}
-		binder_restore_priority(thread, &proc->default_priority);
+		binder_restore_priority(current, proc->default_priority);
 	}
 
 	if (non_block) {
@@ -4565,8 +4284,6 @@ static int binder_thread_read(struct binder_proc *proc,
 		size_t trsize = sizeof(*trd);
 
 		binder_inner_proc_lock(proc);
-		if (list)
-			goto skip;
 		if (!binder_worklist_empty_ilocked(&thread->todo))
 			list = &thread->todo;
 		else if (!binder_worklist_empty_ilocked(&proc->todo) &&
@@ -4580,7 +4297,7 @@ static int binder_thread_read(struct binder_proc *proc,
 				goto retry;
 			break;
 		}
-skip:
+
 		if (end - ptr < sizeof(tr) + 4) {
 			binder_inner_proc_unlock(proc);
 			break;
@@ -4606,18 +4323,11 @@ static int binder_thread_read(struct binder_proc *proc,
 			e->cmd = BR_OK;
 			ptr += sizeof(uint32_t);
 
-			binder_stat_br(proc, thread, cmd);
+			binder_stat_br(proc, thread, e->cmd);
 		} break;
-		case BINDER_WORK_TRANSACTION_COMPLETE:
-		case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: {
-			if (proc->oneway_spam_detection_enabled &&
-				   w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT)
-				cmd = BR_ONEWAY_SPAM_SUSPECT;
-			else
-				cmd = BR_TRANSACTION_COMPLETE;
+		case BINDER_WORK_TRANSACTION_COMPLETE: {
 			binder_inner_proc_unlock(proc);
-			kmem_cache_free(binder_work_pool, w);
-			binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
+			cmd = BR_TRANSACTION_COMPLETE;
 			if (put_user(cmd, (uint32_t __user *)ptr))
 				return -EFAULT;
 			ptr += sizeof(uint32_t);
@@ -4626,6 +4336,8 @@ static int binder_thread_read(struct binder_proc *proc,
 			binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE,
 				     "%d:%d BR_TRANSACTION_COMPLETE\n",
 				     proc->pid, thread->pid);
+			kfree(w);
+			binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
 		} break;
 		case BINDER_WORK_NODE: {
 			struct binder_node *node = container_of(w, struct binder_node, work);
@@ -4737,7 +4449,7 @@ static int binder_thread_read(struct binder_proc *proc,
 				      (u64)cookie);
 			if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) {
 				binder_inner_proc_unlock(proc);
-				kmem_cache_free(binder_ref_death_pool, death);
+				kfree(death);
 				binder_stats_deleted(BINDER_STAT_DEATH);
 			} else {
 				binder_enqueue_work_ilocked(
@@ -4755,11 +4467,6 @@ static int binder_thread_read(struct binder_proc *proc,
 			if (cmd == BR_DEAD_BINDER)
 				goto done; /* DEAD_BINDER notifications can cause transactions */
 		} break;
-		default:
-			binder_inner_proc_unlock(proc);
-			pr_err("%d:%d: bad work type %d\n",
-			       proc->pid, thread->pid, w->type);
-			break;
 		}
 
 		if (!t)
@@ -4768,10 +4475,14 @@ static int binder_thread_read(struct binder_proc *proc,
 		BUG_ON(t->buffer == NULL);
 		if (t->buffer->target_node) {
 			struct binder_node *target_node = t->buffer->target_node;
+			struct binder_priority node_prio;
 
 			trd->target.ptr = target_node->ptr;
 			trd->cookie =  target_node->cookie;
-			binder_transaction_priority(thread, t, target_node);
+			node_prio.sched_policy = target_node->sched_policy;
+			node_prio.prio = target_node->min_priority;
+			binder_transaction_priority(current, t, node_prio,
+						    target_node->inherit_rt);
 			cmd = BR_TRANSACTION;
 		} else {
 			trd->target.ptr = 0;
@@ -4793,34 +4504,6 @@ static int binder_thread_read(struct binder_proc *proc,
 			trd->sender_pid = 0;
 		}
 
-		ret = binder_apply_fd_fixups(proc, t);
-		if (ret) {
-			struct binder_buffer *buffer = t->buffer;
-			bool oneway = !!(t->flags & TF_ONE_WAY);
-			int tid = t->debug_id;
-
-			if (t_from)
-				binder_thread_dec_tmpref(t_from);
-			buffer->transaction = NULL;
-			binder_cleanup_transaction(t, "fd fixups failed",
-						   BR_FAILED_REPLY);
-			binder_free_buf(proc, thread, buffer, true);
-			binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
-				     "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n",
-				     proc->pid, thread->pid,
-				     oneway ? "async " :
-					(cmd == BR_REPLY ? "reply " : ""),
-				     tid, BR_FAILED_REPLY, ret, __LINE__);
-			if (cmd == BR_REPLY) {
-				cmd = BR_FAILED_REPLY;
-				if (put_user(cmd, (uint32_t __user *)ptr))
-					return -EFAULT;
-				ptr += sizeof(uint32_t);
-				binder_stat_br(proc, thread, cmd);
-				break;
-			}
-			continue;
-		}
 		trd->data_size = t->buffer->data_size;
 		trd->offsets_size = t->buffer->offsets_size;
 		trd->data.ptr.buffer = (uintptr_t)t->buffer->user_data;
@@ -4940,7 +4623,7 @@ static void binder_release_work(struct binder_proc *proc,
 		case BINDER_WORK_TRANSACTION_COMPLETE: {
 			binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
 				"undelivered TRANSACTION_COMPLETE\n");
-			kmem_cache_free(binder_work_pool, w);
+			kfree(w);
 			binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
 		} break;
 		case BINDER_WORK_DEAD_BINDER_AND_CLEAR:
@@ -4951,7 +4634,7 @@ static void binder_release_work(struct binder_proc *proc,
 			binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
 				"undelivered death notification, %016llx\n",
 				(u64)death->cookie);
-			kmem_cache_free(binder_ref_death_pool, death);
+			kfree(death);
 			binder_stats_deleted(BINDER_STAT_DEATH);
 		} break;
 		case BINDER_WORK_NODE:
@@ -5001,8 +4684,6 @@ static struct binder_thread *binder_get_thread_ilocked(
 	thread->return_error.cmd = BR_OK;
 	thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR;
 	thread->reply_error.cmd = BR_OK;
-	spin_lock_init(&thread->prio_lock);
-	thread->prio_state = BINDER_PRIO_SET;
 	INIT_LIST_HEAD(&new_thread->waiting_thread_node);
 	return thread;
 }
@@ -5016,37 +4697,27 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc)
 	thread = binder_get_thread_ilocked(proc, NULL);
 	binder_inner_proc_unlock(proc);
 	if (!thread) {
-		new_thread = kmem_cache_zalloc(binder_thread_pool, GFP_KERNEL);
+		new_thread = kzalloc(sizeof(*thread), GFP_KERNEL);
 		if (new_thread == NULL)
 			return NULL;
 		binder_inner_proc_lock(proc);
 		thread = binder_get_thread_ilocked(proc, new_thread);
 		binder_inner_proc_unlock(proc);
 		if (thread != new_thread)
-			kmem_cache_free(binder_thread_pool, new_thread);
+			kfree(new_thread);
 	}
 	return thread;
 }
 
 static void binder_free_proc(struct binder_proc *proc)
 {
-	struct binder_device *device;
-
 	BUG_ON(!list_empty(&proc->todo));
 	BUG_ON(!list_empty(&proc->delivered_death));
-	if (proc->outstanding_txns)
-		pr_warn("%s: Unexpected outstanding_txns %d\n",
-			__func__, proc->outstanding_txns);
-	device = container_of(proc->context, struct binder_device, context);
-	if (refcount_dec_and_test(&device->ref)) {
-		kfree(proc->context->name);
-		kfree(device);
-	}
 	binder_alloc_deferred_release(&proc->alloc);
 	put_task_struct(proc->tsk);
 	put_cred(proc->cred);
 	binder_stats_deleted(BINDER_STAT_PROC);
-	kmem_cache_free(binder_proc_pool, proc);
+	kfree(proc);
 }
 
 static void binder_free_thread(struct binder_thread *thread)
@@ -5055,7 +4726,7 @@ static void binder_free_thread(struct binder_thread *thread)
 	binder_stats_deleted(BINDER_STAT_THREAD);
 	binder_proc_dec_tmpref(thread->proc);
 	put_task_struct(thread->task);
-	kmem_cache_free(binder_thread_pool, thread);
+	kfree(thread);
 }
 
 static int binder_thread_release(struct binder_proc *proc,
@@ -5073,7 +4744,7 @@ static int binder_thread_release(struct binder_proc *proc,
 	 * The corresponding dec is when we actually
 	 * free the thread in binder_free_thread()
 	 */
-	proc->tmp_ref++;
+	atomic_inc(&proc->tmp_ref);
 	/*
 	 * take a ref on this thread to ensure it
 	 * survives while we are releasing it
@@ -5085,8 +4756,6 @@ static int binder_thread_release(struct binder_proc *proc,
 		spin_lock(&t->lock);
 		if (t->to_thread == thread)
 			send_reply = t;
-	} else {
-		__acquire(&t->lock);
 	}
 	thread->is_dead = true;
 
@@ -5100,7 +4769,6 @@ static int binder_thread_release(struct binder_proc *proc,
 			     (t->to_thread == thread) ? "in" : "out");
 
 		if (t->to_thread == thread) {
-			thread->proc->outstanding_txns--;
 			t->to_proc = NULL;
 			t->to_thread = NULL;
 			if (t->buffer) {
@@ -5116,11 +4784,7 @@ static int binder_thread_release(struct binder_proc *proc,
 		spin_unlock(&last_t->lock);
 		if (t)
 			spin_lock(&t->lock);
-		else
-			__acquire(&t->lock);
 	}
-	/* annotation for sparse, lock not acquired in last iteration above */
-	__release(&t->lock);
 
 	/*
 	 * If this thread used poll, make sure we remove the waitqueue from any
@@ -5148,7 +4812,7 @@ static int binder_thread_release(struct binder_proc *proc,
 	return active_transactions;
 }
 
-static __poll_t binder_poll(struct file *filp,
+static unsigned int binder_poll(struct file *filp,
 				struct poll_table_struct *wait)
 {
 	struct binder_proc *proc = filp->private_data;
@@ -5168,7 +4832,7 @@ static __poll_t binder_poll(struct file *filp,
 	poll_wait(filp, &thread->wait, wait);
 
 	if (binder_has_work(thread, wait_for_proc_work))
-		return EPOLLIN;
+		return POLLIN;
 
 	return 0;
 }
@@ -5324,8 +4988,7 @@ static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc,
 }
 
 static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
-				struct binder_node_debug_info *info)
-{
+				struct binder_node_debug_info *info) {
 	struct rb_node *n;
 	binder_uintptr_t ptr = info->ptr;
 
@@ -5348,100 +5011,6 @@ static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
 	return 0;
 }
 
-static bool binder_txns_pending_ilocked(struct binder_proc *proc)
-{
-	struct rb_node *n;
-	struct binder_thread *thread;
-
-	if (proc->outstanding_txns > 0)
-		return true;
-
-	for (n = rb_first(&proc->threads); n; n = rb_next(n)) {
-		thread = rb_entry(n, struct binder_thread, rb_node);
-		if (thread->transaction_stack)
-			return true;
-	}
-	return false;
-}
-
-static int binder_ioctl_freeze(struct binder_freeze_info *info,
-			       struct binder_proc *target_proc)
-{
-	int ret = 0;
-
-	if (!info->enable) {
-		binder_inner_proc_lock(target_proc);
-		target_proc->sync_recv = false;
-		target_proc->async_recv = false;
-		target_proc->is_frozen = false;
-		binder_inner_proc_unlock(target_proc);
-		return 0;
-	}
-
-	/*
-	 * Freezing the target. Prevent new transactions by
-	 * setting frozen state. If timeout specified, wait
-	 * for transactions to drain.
-	 */
-	binder_inner_proc_lock(target_proc);
-	target_proc->sync_recv = false;
-	target_proc->async_recv = false;
-	target_proc->is_frozen = true;
-	binder_inner_proc_unlock(target_proc);
-
-	if (info->timeout_ms > 0)
-		ret = wait_event_interruptible_timeout(
-			target_proc->freeze_wait,
-			(!target_proc->outstanding_txns),
-			msecs_to_jiffies(info->timeout_ms));
-
-	/* Check pending transactions that wait for reply */
-	if (ret >= 0) {
-		binder_inner_proc_lock(target_proc);
-		if (binder_txns_pending_ilocked(target_proc))
-			ret = -EAGAIN;
-		binder_inner_proc_unlock(target_proc);
-	}
-
-	if (ret < 0) {
-		binder_inner_proc_lock(target_proc);
-		target_proc->is_frozen = false;
-		binder_inner_proc_unlock(target_proc);
-	}
-
-	return ret;
-}
-
-static int binder_ioctl_get_freezer_info(
-				struct binder_frozen_status_info *info)
-{
-	struct binder_proc *target_proc;
-	bool found = false;
-	__u32 txns_pending;
-
-	info->sync_recv = 0;
-	info->async_recv = 0;
-
-	mutex_lock(&binder_procs_lock);
-	hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
-		if (target_proc->pid == info->pid) {
-			found = true;
-			binder_inner_proc_lock(target_proc);
-			txns_pending = binder_txns_pending_ilocked(target_proc);
-			info->sync_recv |= target_proc->sync_recv |
-					(txns_pending << 1);
-			info->async_recv |= target_proc->async_recv;
-			binder_inner_proc_unlock(target_proc);
-		}
-	}
-	mutex_unlock(&binder_procs_lock);
-
-	if (!found)
-		return -EINVAL;
-
-	return 0;
-}
-
 static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	int ret;
@@ -5560,96 +5129,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		}
 		break;
 	}
-	case BINDER_FREEZE: {
-		struct binder_freeze_info info;
-		struct binder_proc **target_procs = NULL, *target_proc;
-		int target_procs_count = 0, i = 0;
-
-		ret = 0;
-
-		if (copy_from_user(&info, ubuf, sizeof(info))) {
-			ret = -EFAULT;
-			goto err;
-		}
-
-		mutex_lock(&binder_procs_lock);
-		hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
-			if (target_proc->pid == info.pid)
-				target_procs_count++;
-		}
-
-		if (target_procs_count == 0) {
-			mutex_unlock(&binder_procs_lock);
-			ret = -EINVAL;
-			goto err;
-		}
-
-		target_procs = kcalloc(target_procs_count,
-				       sizeof(struct binder_proc *),
-				       GFP_KERNEL);
-
-		if (!target_procs) {
-			mutex_unlock(&binder_procs_lock);
-			ret = -ENOMEM;
-			goto err;
-		}
-
-		hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
-			if (target_proc->pid != info.pid)
-				continue;
-
-			binder_inner_proc_lock(target_proc);
-			target_proc->tmp_ref++;
-			binder_inner_proc_unlock(target_proc);
-
-			target_procs[i++] = target_proc;
-		}
-		mutex_unlock(&binder_procs_lock);
-
-		for (i = 0; i < target_procs_count; i++) {
-			if (ret >= 0)
-				ret = binder_ioctl_freeze(&info,
-							  target_procs[i]);
-
-			binder_proc_dec_tmpref(target_procs[i]);
-		}
-
-		kfree(target_procs);
-
-		if (ret < 0)
-			goto err;
-		break;
-	}
-	case BINDER_GET_FROZEN_INFO: {
-		struct binder_frozen_status_info info;
-
-		if (copy_from_user(&info, ubuf, sizeof(info))) {
-			ret = -EFAULT;
-			goto err;
-		}
-
-		ret = binder_ioctl_get_freezer_info(&info);
-		if (ret < 0)
-			goto err;
-
-		if (copy_to_user(ubuf, &info, sizeof(info))) {
-			ret = -EFAULT;
-			goto err;
-		}
-		break;
-	}
-	case BINDER_ENABLE_ONEWAY_SPAM_DETECTION: {
-		uint32_t enable;
-
-		if (copy_from_user(&enable, ubuf, sizeof(enable))) {
-			ret = -EFAULT;
-			goto err;
-		}
-		binder_inner_proc_lock(proc);
-		proc->oneway_spam_detection_enabled = (bool)enable;
-		binder_inner_proc_unlock(proc);
-		break;
-	}
 	default:
 		ret = -EINVAL;
 		goto err;
@@ -5659,7 +5138,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	if (thread)
 		thread->looper_need_return = false;
 	wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
-	if (ret && ret != -EINTR)
+	if (ret && ret != -ERESTARTSYS)
 		pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret);
 err_unlocked:
 	trace_binder_ioctl_done(ret);
@@ -5687,6 +5166,7 @@ static void binder_vma_close(struct vm_area_struct *vma)
 		     (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
 		     (unsigned long)pgprot_val(vma->vm_page_prot));
 	binder_alloc_vma_close(&proc->alloc);
+	binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES);
 }
 
 static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -5702,11 +5182,16 @@ static const struct vm_operations_struct binder_vm_ops = {
 
 static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
 {
+	int ret;
 	struct binder_proc *proc = filp->private_data;
+	const char *failure_string;
 
 	if (proc->tsk != current->group_leader)
 		return -EINVAL;
 
+	if ((vma->vm_end - vma->vm_start) > SZ_4M)
+		vma->vm_end = vma->vm_start + SZ_4M;
+
 	binder_debug(BINDER_DEBUG_OPEN_CLOSE,
 		     "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n",
 		     __func__, proc->pid, vma->vm_start, vma->vm_end,
@@ -5714,9 +5199,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
 		     (unsigned long)pgprot_val(vma->vm_page_prot));
 
 	if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) {
-		pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
-		       proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM);
-		return -EPERM;
+		ret = -EPERM;
+		failure_string = "bad vm_flags";
+		goto err_bad_arg;
 	}
 	vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP;
 	vma->vm_flags &= ~VM_MAYWRITE;
@@ -5724,30 +5209,39 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
 	vma->vm_ops = &binder_vm_ops;
 	vma->vm_private_data = proc;
 
-	return binder_alloc_mmap_handler(&proc->alloc, vma);
+	ret = binder_alloc_mmap_handler(&proc->alloc, vma);
+	if (ret)
+		return ret;
+	mutex_lock(&proc->files_lock);
+	proc->files = get_files_struct(current);
+	mutex_unlock(&proc->files_lock);
+	return 0;
+
+err_bad_arg:
+	pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
+	       proc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
+	return ret;
 }
 
 static int binder_open(struct inode *nodp, struct file *filp)
 {
-	struct binder_proc *proc, *itr;
+	struct binder_proc *proc;
 	struct binder_device *binder_dev;
-	struct binderfs_info *info;
-	struct dentry *binder_binderfs_dir_entry_proc = NULL;
-	bool existing_pid = false;
 
 	binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__,
 		     current->group_leader->pid, current->pid);
 
-	proc = kmem_cache_zalloc(binder_proc_pool, GFP_KERNEL);
+	proc = kzalloc(sizeof(*proc), GFP_KERNEL);
 	if (proc == NULL)
 		return -ENOMEM;
 	spin_lock_init(&proc->inner_lock);
 	spin_lock_init(&proc->outer_lock);
+	atomic_set(&proc->tmp_ref, 0);
 	get_task_struct(current->group_leader);
 	proc->tsk = current->group_leader;
+	mutex_init(&proc->files_lock);
 	proc->cred = get_cred(filp->f_cred);
 	INIT_LIST_HEAD(&proc->todo);
-	init_waitqueue_head(&proc->freeze_wait);
 	if (binder_supported_policy(current->policy)) {
 		proc->default_priority.sched_policy = current->policy;
 		proc->default_priority.prio = current->normal_prio;
@@ -5756,16 +5250,8 @@ static int binder_open(struct inode *nodp, struct file *filp)
 		proc->default_priority.prio = NICE_TO_PRIO(0);
 	}
 
-	/* binderfs stashes devices in i_private */
-	if (is_binderfs_device(nodp)) {
-		binder_dev = nodp->i_private;
-		info = nodp->i_sb->s_fs_info;
-		binder_binderfs_dir_entry_proc = info->proc_log_dir;
-	} else {
-		binder_dev = container_of(filp->private_data,
-					  struct binder_device, miscdev);
-	}
-	refcount_inc(&binder_dev->ref);
+	binder_dev = container_of(filp->private_data, struct binder_device,
+				  miscdev);
 	proc->context = &binder_dev->context;
 	binder_alloc_init(&proc->alloc);
 
@@ -5776,52 +5262,24 @@ static int binder_open(struct inode *nodp, struct file *filp)
 	filp->private_data = proc;
 
 	mutex_lock(&binder_procs_lock);
-	hlist_for_each_entry(itr, &binder_procs, proc_node) {
-		if (itr->pid == proc->pid) {
-			existing_pid = true;
-			break;
-		}
-	}
 	hlist_add_head(&proc->proc_node, &binder_procs);
 	mutex_unlock(&binder_procs_lock);
-	if (binder_debugfs_dir_entry_proc && !existing_pid) {
+
+	if (binder_debugfs_dir_entry_proc) {
 		char strbuf[11];
 
 		snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
 		/*
-		 * proc debug entries are shared between contexts.
-		 * Only create for the first PID to avoid debugfs log spamming
-		 * The printing code will anyway print all contexts for a given
-		 * PID so this is not a problem.
+		 * proc debug entries are shared between contexts, so
+		 * this will fail if the process tries to open the driver
+		 * again with a different context. The printing code will
+		 * anyway print all contexts that a given PID has, so this
+		 * is not a problem.
 		 */
 		proc->debugfs_entry = debugfs_create_file(strbuf, 0444,
 			binder_debugfs_dir_entry_proc,
 			(void *)(unsigned long)proc->pid,
-			&proc_fops);
-	}
-
-	if (binder_binderfs_dir_entry_proc && !existing_pid) {
-		char strbuf[11];
-		struct dentry *binderfs_entry;
-
-		snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
-		/*
-		 * Similar to debugfs, the process specific log file is shared
-		 * between contexts. Only create for the first PID.
-		 * This is ok since same as debugfs, the log file will contain
-		 * information on all contexts of a given PID.
-		 */
-		binderfs_entry = binderfs_create_file(binder_binderfs_dir_entry_proc,
-			strbuf, &proc_fops, (void *)(unsigned long)proc->pid);
-		if (!IS_ERR(binderfs_entry)) {
-			proc->binderfs_entry = binderfs_entry;
-		} else {
-			int error;
-
-			error = PTR_ERR(binderfs_entry);
-			pr_warn("Unable to create file %s in binderfs (error %d)\n",
-				strbuf, error);
-		}
+			&binder_proc_fops);
 	}
 
 	return 0;
@@ -5863,12 +5321,6 @@ static int binder_release(struct inode *nodp, struct file *filp)
 	struct binder_proc *proc = filp->private_data;
 
 	debugfs_remove(proc->debugfs_entry);
-
-	if (proc->binderfs_entry) {
-		binderfs_remove_file(proc->binderfs_entry);
-		proc->binderfs_entry = NULL;
-	}
-
 	binder_defer_work(proc, BINDER_DEFERRED_RELEASE);
 
 	return 0;
@@ -5945,6 +5397,8 @@ static void binder_deferred_release(struct binder_proc *proc)
 	struct rb_node *n;
 	int threads, nodes, incoming_refs, outgoing_refs, active_transactions;
 
+	BUG_ON(proc->files);
+
 	mutex_lock(&binder_procs_lock);
 	hlist_del(&proc->proc_node);
 	mutex_unlock(&binder_procs_lock);
@@ -5963,12 +5417,9 @@ static void binder_deferred_release(struct binder_proc *proc)
 	 * Make sure proc stays alive after we
 	 * remove all the threads
 	 */
-	proc->tmp_ref++;
+	atomic_inc(&proc->tmp_ref);
 
 	proc->is_dead = true;
-	proc->is_frozen = false;
-	proc->sync_recv = false;
-	proc->async_recv = false;
 	threads = 0;
 	active_transactions = 0;
 	while ((n = rb_first(&proc->threads))) {
@@ -6029,6 +5480,7 @@ static void binder_deferred_release(struct binder_proc *proc)
 static void binder_deferred_func(struct work_struct *work)
 {
 	struct binder_proc *proc;
+	struct files_struct *files;
 
 	int defer;
 
@@ -6046,11 +5498,23 @@ static void binder_deferred_func(struct work_struct *work)
 		}
 		mutex_unlock(&binder_deferred_lock);
 
+		files = NULL;
+		if (defer & BINDER_DEFERRED_PUT_FILES) {
+			mutex_lock(&proc->files_lock);
+			files = proc->files;
+			if (files)
+				proc->files = NULL;
+			mutex_unlock(&proc->files_lock);
+		}
+
 		if (defer & BINDER_DEFERRED_FLUSH)
 			binder_deferred_flush(proc);
 
 		if (defer & BINDER_DEFERRED_RELEASE)
 			binder_deferred_release(proc); /* frees proc */
+
+		if (files)
+			put_files_struct(files);
 	} while (proc);
 }
 static DECLARE_WORK(binder_deferred_work, binder_deferred_func);
@@ -6321,9 +5785,7 @@ static const char * const binder_return_strings[] = {
 	"BR_FINISHED",
 	"BR_DEAD_BINDER",
 	"BR_CLEAR_DEATH_NOTIFICATION_DONE",
-	"BR_FAILED_REPLY",
-	"BR_FROZEN_REPLY",
-	"BR_ONEWAY_SPAM_SUSPECT",
+	"BR_FAILED_REPLY"
 };
 
 static const char * const binder_command_strings[] = {
@@ -6464,7 +5926,8 @@ static void print_binder_proc_stats(struct seq_file *m,
 	print_binder_stats(m, "  ", &proc->stats);
 }
 
-static int state_show(struct seq_file *m, void *unused)
+
+static int binder_state_show(struct seq_file *m, void *unused)
 {
 	struct binder_proc *proc;
 	struct binder_node *node;
@@ -6503,7 +5966,7 @@ static int state_show(struct seq_file *m, void *unused)
 	return 0;
 }
 
-static int stats_show(struct seq_file *m, void *unused)
+static int binder_stats_show(struct seq_file *m, void *unused)
 {
 	struct binder_proc *proc;
 
@@ -6519,7 +5982,7 @@ static int stats_show(struct seq_file *m, void *unused)
 	return 0;
 }
 
-static int transactions_show(struct seq_file *m, void *unused)
+static int binder_transactions_show(struct seq_file *m, void *unused)
 {
 	struct binder_proc *proc;
 
@@ -6532,7 +5995,7 @@ static int transactions_show(struct seq_file *m, void *unused)
 	return 0;
 }
 
-static int proc_show(struct seq_file *m, void *unused)
+static int binder_proc_show(struct seq_file *m, void *unused)
 {
 	struct binder_proc *itr;
 	int pid = (unsigned long)m->private;
@@ -6575,7 +6038,7 @@ static void print_binder_transaction_log_entry(struct seq_file *m,
 			"\n" : " (incomplete)\n");
 }
 
-static int transaction_log_show(struct seq_file *m, void *unused)
+static int binder_transaction_log_show(struct seq_file *m, void *unused)
 {
 	struct binder_transaction_log *log = m->private;
 	unsigned int log_cur = atomic_read(&log->cur);
@@ -6596,7 +6059,7 @@ static int transaction_log_show(struct seq_file *m, void *unused)
 	return 0;
 }
 
-const struct file_operations binder_fops = {
+static const struct file_operations binder_fops = {
 	.owner = THIS_MODULE,
 	.poll = binder_poll,
 	.unlocked_ioctl = binder_ioctl,
@@ -6607,44 +6070,10 @@ const struct file_operations binder_fops = {
 	.release = binder_release,
 };
 
-DEFINE_SHOW_ATTRIBUTE(state);
-DEFINE_SHOW_ATTRIBUTE(stats);
-DEFINE_SHOW_ATTRIBUTE(transactions);
-DEFINE_SHOW_ATTRIBUTE(transaction_log);
-
-const struct binder_debugfs_entry binder_debugfs_entries[] = {
-	{
-		.name = "state",
-		.mode = 0444,
-		.fops = &state_fops,
-		.data = NULL,
-	},
-	{
-		.name = "stats",
-		.mode = 0444,
-		.fops = &stats_fops,
-		.data = NULL,
-	},
-	{
-		.name = "transactions",
-		.mode = 0444,
-		.fops = &transactions_fops,
-		.data = NULL,
-	},
-	{
-		.name = "transaction_log",
-		.mode = 0444,
-		.fops = &transaction_log_fops,
-		.data = &binder_transaction_log,
-	},
-	{
-		.name = "failed_transaction_log",
-		.mode = 0444,
-		.fops = &transaction_log_fops,
-		.data = &binder_transaction_log_failed,
-	},
-	{} /* terminator */
-};
+BINDER_DEBUG_ENTRY(state);
+BINDER_DEBUG_ENTRY(stats);
+BINDER_DEBUG_ENTRY(transactions);
+BINDER_DEBUG_ENTRY(transaction_log);
 
 static int __init init_binder_device(const char *name)
 {
@@ -6659,7 +6088,6 @@ static int __init init_binder_device(const char *name)
 	binder_device->miscdev.minor = MISC_DYNAMIC_MINOR;
 	binder_device->miscdev.name = name;
 
-	refcount_set(&binder_device->ref, 1);
 	binder_device->context.binder_context_mgr_uid = INVALID_UID;
 	binder_device->context.name = name;
 	mutex_init(&binder_device->context.context_mgr_node_lock);
@@ -6675,130 +6103,70 @@ static int __init init_binder_device(const char *name)
 	return ret;
 }
 
-static int __init binder_create_pools(void)
-{
-	int ret;
-
-	ret = binder_buffer_pool_create();
-	if (ret)
-		return ret;
-
-	binder_node_pool = KMEM_CACHE(binder_node, SLAB_HWCACHE_ALIGN);
-	if (!binder_node_pool)
-		goto err_node_pool;
-
-	binder_proc_pool = KMEM_CACHE(binder_proc, SLAB_HWCACHE_ALIGN);
-	if (!binder_proc_pool)
-		goto err_proc_pool;
-
-	binder_ref_death_pool = KMEM_CACHE(binder_ref_death, SLAB_HWCACHE_ALIGN);
-	if (!binder_ref_death_pool)
-		goto err_ref_death_pool;
-
-	binder_ref_pool = KMEM_CACHE(binder_ref, SLAB_HWCACHE_ALIGN);
-	if (!binder_ref_pool)
-		goto err_ref_pool;
-
-	binder_thread_pool = KMEM_CACHE(binder_thread, SLAB_HWCACHE_ALIGN);
-	if (!binder_thread_pool)
-		goto err_thread_pool;
-
-	binder_transaction_pool = KMEM_CACHE(binder_transaction, SLAB_HWCACHE_ALIGN);
-	if (!binder_transaction_pool)
-		goto err_transaction_pool;
-
-	binder_work_pool = KMEM_CACHE(binder_work, SLAB_HWCACHE_ALIGN);
-	if (!binder_work_pool)
-		goto err_work_pool;
-
-	return 0;
-
-err_work_pool:
-	kmem_cache_destroy(binder_transaction_pool);
-err_transaction_pool:
-	kmem_cache_destroy(binder_thread_pool);
-err_thread_pool:
-	kmem_cache_destroy(binder_ref_pool);
-err_ref_pool:
-	kmem_cache_destroy(binder_ref_death_pool);
-err_ref_death_pool:
-	kmem_cache_destroy(binder_proc_pool);
-err_proc_pool:
-	kmem_cache_destroy(binder_node_pool);
-err_node_pool:
-	binder_buffer_pool_destroy();
-	return -ENOMEM;
-}
-
-static void __init binder_destroy_pools(void)
-{
-	binder_buffer_pool_destroy();
-	kmem_cache_destroy(binder_node_pool);
-	kmem_cache_destroy(binder_proc_pool);
-	kmem_cache_destroy(binder_ref_death_pool);
-	kmem_cache_destroy(binder_ref_pool);
-	kmem_cache_destroy(binder_thread_pool);
-	kmem_cache_destroy(binder_transaction_pool);
-	kmem_cache_destroy(binder_work_pool);
-}
-
 static int __init binder_init(void)
 {
 	int ret;
-	char *device_name, *device_tmp;
+	char *device_name, *device_names, *device_tmp;
 	struct binder_device *device;
 	struct hlist_node *tmp;
-	char *device_names = NULL;
-
-	ret = binder_create_pools();
-	if (ret)
-		return ret;
 
 	ret = binder_alloc_shrinker_init();
 	if (ret)
-		goto err_alloc_shrinker_failed;
+		return ret;
 
 	atomic_set(&binder_transaction_log.cur, ~0U);
 	atomic_set(&binder_transaction_log_failed.cur, ~0U);
 
 	binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL);
-	if (binder_debugfs_dir_entry_root) {
-		const struct binder_debugfs_entry *db_entry;
-
-		binder_for_each_debugfs_entry(db_entry)
-			debugfs_create_file(db_entry->name,
-					    db_entry->mode,
-					    binder_debugfs_dir_entry_root,
-					    db_entry->data,
-					    db_entry->fops);
-
+	if (binder_debugfs_dir_entry_root)
 		binder_debugfs_dir_entry_proc = debugfs_create_dir("proc",
 						 binder_debugfs_dir_entry_root);
-	}
 
-	if (!IS_ENABLED(CONFIG_ANDROID_BINDERFS) &&
-	    strcmp(binder_devices_param, "") != 0) {
-		/*
-		* Copy the module_parameter string, because we don't want to
-		* tokenize it in-place.
-		 */
-		device_names = kstrdup(binder_devices_param, GFP_KERNEL);
-		if (!device_names) {
-			ret = -ENOMEM;
-			goto err_alloc_device_names_failed;
-		}
+	if (binder_debugfs_dir_entry_root) {
+		debugfs_create_file("state",
+				    0444,
+				    binder_debugfs_dir_entry_root,
+				    NULL,
+				    &binder_state_fops);
+		debugfs_create_file("stats",
+				    0444,
+				    binder_debugfs_dir_entry_root,
+				    NULL,
+				    &binder_stats_fops);
+		debugfs_create_file("transactions",
+				    0444,
+				    binder_debugfs_dir_entry_root,
+				    NULL,
+				    &binder_transactions_fops);
+		debugfs_create_file("transaction_log",
+				    0444,
+				    binder_debugfs_dir_entry_root,
+				    &binder_transaction_log,
+				    &binder_transaction_log_fops);
+		debugfs_create_file("failed_transaction_log",
+				    0444,
+				    binder_debugfs_dir_entry_root,
+				    &binder_transaction_log_failed,
+				    &binder_transaction_log_fops);
+	}
 
-		device_tmp = device_names;
-		while ((device_name = strsep(&device_tmp, ","))) {
-			ret = init_binder_device(device_name);
-			if (ret)
-				goto err_init_binder_device_failed;
-		}
+	/*
+	 * Copy the module_parameter string, because we don't want to
+	 * tokenize it in-place.
+	 */
+	device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL);
+	if (!device_names) {
+		ret = -ENOMEM;
+		goto err_alloc_device_names_failed;
 	}
+	strcpy(device_names, binder_devices_param);
 
-	ret = init_binderfs();
-	if (ret)
-		goto err_init_binder_device_failed;
+	device_tmp = device_names;
+	while ((device_name = strsep(&device_tmp, ","))) {
+		ret = init_binder_device(device_name);
+		if (ret)
+			goto err_init_binder_device_failed;
+	}
 
 	return ret;
 
@@ -6814,9 +6182,6 @@ static int __init binder_init(void)
 err_alloc_device_names_failed:
 	debugfs_remove_recursive(binder_debugfs_dir_entry_root);
 
-err_alloc_shrinker_failed:
-	binder_destroy_pools();
-
 	return ret;
 }
 
@@ -6824,7 +6189,5 @@ device_initcall(binder_init);
 
 #define CREATE_TRACE_POINTS
 #include "binder_trace.h"
-EXPORT_TRACEPOINT_SYMBOL_GPL(binder_transaction_received);
-EXPORT_TRACEPOINT_SYMBOL_GPL(binder_txn_latency_free);
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 9eb15d712567..5addcd56afb4 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1,13 +1,23 @@
-// SPDX-License-Identifier: GPL-2.0-only
 /* binder_alloc.c
  *
  * Android IPC Subsystem
  *
  * Copyright (C) 2007-2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <asm/cacheflush.h>
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -18,11 +28,8 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/list_lru.h>
-#include <linux/ratelimit.h>
-#include <asm/cacheflush.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <linux/sizes.h>
 #include "binder_alloc.h"
 #include "binder_trace.h"
 
@@ -36,7 +43,7 @@ enum {
 	BINDER_DEBUG_BUFFER_ALLOC           = 1U << 2,
 	BINDER_DEBUG_BUFFER_ALLOC_ASYNC     = 1U << 3,
 };
-static uint32_t binder_alloc_debug_mask = BINDER_DEBUG_USER_ERROR;
+static uint32_t binder_alloc_debug_mask;
 
 module_param_named(debug_mask, binder_alloc_debug_mask,
 		   uint, 0644);
@@ -44,25 +51,9 @@ module_param_named(debug_mask, binder_alloc_debug_mask,
 #define binder_alloc_debug(mask, x...) \
 	do { \
 		if (binder_alloc_debug_mask & mask) \
-			pr_info_ratelimited(x); \
+			pr_info(x); \
 	} while (0)
 
-static struct kmem_cache *binder_buffer_pool;
-
-int binder_buffer_pool_create(void)
-{
-	binder_buffer_pool = KMEM_CACHE(binder_buffer, SLAB_HWCACHE_ALIGN);
-	if (!binder_buffer_pool)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void binder_buffer_pool_destroy(void)
-{
-	kmem_cache_destroy(binder_buffer_pool);
-}
-
 static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer)
 {
 	return list_entry(buffer->entry.next, struct binder_buffer, entry);
@@ -173,7 +164,7 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked(
 }
 
 /**
- * binder_alloc_prepare_to_free() - get buffer given user ptr
+ * binder_alloc_buffer_lookup() - get buffer given user ptr
  * @alloc:	binder_alloc for this proc
  * @user_ptr:	User pointer to buffer data
  *
@@ -228,14 +219,18 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
 		mm = alloc->vma_vm_mm;
 
 	if (mm) {
-		down_read;
+		down_read(&mm->mmap_sem);
+		if (!mmget_still_valid(mm)) {
+			if (allocate == 0)
+				goto free_range;
+			goto err_no_vma;
+		}
 		vma = alloc->vma;
 	}
 
 	if (!vma && need_mm) {
-		binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
-				   "%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
-				   alloc->pid);
+		pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
+			alloc->pid);
 		goto err_no_vma;
 	}
 
@@ -284,15 +279,17 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
 			alloc->pages_high = index + 1;
 
 		trace_binder_alloc_page_end(alloc, index);
+		/* vm_insert_page does not seem to increment the refcount */
 	}
 	if (mm) {
-		up_read;
+		up_read(&mm->mmap_sem);
 		mmput(mm);
 	}
 	return 0;
 
 free_range:
-	for (page_addr = end - PAGE_SIZE; 1; page_addr -= PAGE_SIZE) {
+	for (page_addr = end - PAGE_SIZE; page_addr >= start;
+	     page_addr -= PAGE_SIZE) {
 		bool ret;
 		size_t index;
 
@@ -305,8 +302,6 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
 		WARN_ON(!ret);
 
 		trace_binder_free_lru_end(alloc, index);
-		if (page_addr == start)
-			break;
 		continue;
 
 err_vm_insert_page_failed:
@@ -314,47 +309,17 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
 		page->page_ptr = NULL;
 err_alloc_page_failed:
 err_page_ptr_cleared:
-		if (page_addr == start)
-			break;
+		;
 	}
 err_no_vma:
 	if (mm) {
-		up_read;
+		up_read(&mm->mmap_sem);
 		mmput(mm);
 	}
 	return vma ? -ENOMEM : -ESRCH;
 }
 
-
-static inline void binder_alloc_set_vma(struct binder_alloc *alloc,
-		struct vm_area_struct *vma)
-{
-	if (vma)
-		alloc->vma_vm_mm = vma->vm_mm;
-	/*
-	 * If we see alloc->vma is not NULL, buffer data structures set up
-	 * completely. Look at smp_rmb side binder_alloc_get_vma.
-	 * We also want to guarantee new alloc->vma_vm_mm is always visible
-	 * if alloc->vma is set.
-	 */
-	smp_wmb();
-	alloc->vma = vma;
-}
-
-static inline struct vm_area_struct *binder_alloc_get_vma(
-		struct binder_alloc *alloc)
-{
-	struct vm_area_struct *vma = NULL;
-
-	if (alloc->vma) {
-		/* Look at description in binder_alloc_set_vma */
-		smp_rmb();
-		vma = alloc->vma;
-	}
-	return vma;
-}
-
-static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
+static void debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
 {
 	/*
 	 * Find the amount and size of buffers allocated by the current caller;
@@ -363,7 +328,7 @@ static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
 	 * and at some point we'll catch them in the act. This is more efficient
 	 * than keeping a map per pid.
 	 */
-	struct rb_node *n;
+	struct rb_node *n = alloc->free_buffers.rb_node;
 	struct binder_buffer *buffer;
 	size_t total_alloc_size = 0;
 	size_t num_buffers = 0;
@@ -382,19 +347,13 @@ static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
 
 	/*
 	 * Warn if this pid has more than 50 transactions, or more than 50% of
-	 * async space (which is 25% of total buffer size). Oneway spam is only
-	 * detected when the threshold is exceeded.
+	 * async space (which is 25% of total buffer size).
 	 */
 	if (num_buffers > 50 || total_alloc_size > alloc->buffer_size / 4) {
 		binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
 			     "%d: pid %d spamming oneway? %zd buffers allocated for a total size of %zd\n",
 			      alloc->pid, pid, num_buffers, total_alloc_size);
-		if (!alloc->oneway_spam_detected) {
-			alloc->oneway_spam_detected = true;
-			return true;
-		}
 	}
-	return false;
 }
 
 static struct binder_buffer *binder_alloc_new_buf_locked(
@@ -414,15 +373,11 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
 	size_t size, data_offsets_size;
 	int ret;
 
-	down_read;
-	if (!binder_alloc_get_vma(alloc)) {
-		up_read;
-		binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
-				   "%d: binder_alloc_buf, no vma\n",
-				   alloc->pid);
+	if (alloc->vma == NULL) {
+		pr_err("%d: binder_alloc_buf, no vma\n",
+		       alloc->pid);
 		return ERR_PTR(-ESRCH);
 	}
-	up_read;
 
 	data_offsets_size = ALIGN(data_size, sizeof(void *)) +
 		ALIGN(offsets_size, sizeof(void *));
@@ -492,14 +447,11 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
 			if (buffer_size > largest_free_size)
 				largest_free_size = buffer_size;
 		}
-		binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
-				   "%d: binder_alloc_buf size %zd failed, no address space\n",
-				   alloc->pid, size);
-		binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
-				   "allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n",
-				   total_alloc_size, allocated_buffers,
-				   largest_alloc_size, total_free_size,
-				   free_buffers, largest_free_size);
+		pr_err("%d: binder_alloc_buf size %zd failed, no address space\n",
+			alloc->pid, size);
+		pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n",
+		       total_alloc_size, allocated_buffers, largest_alloc_size,
+		       total_free_size, free_buffers, largest_free_size);
 		return ERR_PTR(-ENOSPC);
 	}
 	if (n == NULL) {
@@ -526,7 +478,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
 	if (buffer_size != size) {
 		struct binder_buffer *new_buffer;
 
-		new_buffer = kmem_cache_zalloc(binder_buffer_pool, GFP_KERNEL);
+		new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
 		if (!new_buffer) {
 			pr_err("%s: %d failed to alloc new buffer struct\n",
 			       __func__, alloc->pid);
@@ -550,7 +502,6 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
 	buffer->async_transaction = is_async;
 	buffer->extra_buffers_size = extra_buffers_size;
 	buffer->pid = pid;
-	buffer->oneway_spam_suspect = false;
 	if (is_async) {
 		alloc->free_async_space -= size + sizeof(struct binder_buffer);
 		binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
@@ -562,9 +513,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
 			 * of async space left (which is less than 10% of total
 			 * buffer size).
 			 */
-			buffer->oneway_spam_suspect = debug_low_async_space_locked(alloc, pid);
-		} else {
-			alloc->oneway_spam_detected = false;
+			debug_low_async_space_locked(alloc, pid);
 		}
 	}
 	return buffer;
@@ -624,7 +573,6 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc,
 {
 	struct binder_buffer *prev, *next = NULL;
 	bool to_free = true;
-
 	BUG_ON(alloc->buffers.next == &buffer->entry);
 	prev = binder_buffer_prev(buffer);
 	BUG_ON(!prev->free);
@@ -665,7 +613,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc,
 					 buffer_start_page(buffer) + PAGE_SIZE);
 	}
 	list_del(&buffer->entry);
-	kmem_cache_free(binder_buffer_pool, buffer);
+	kfree(buffer);
 }
 
 static void binder_free_buf_locked(struct binder_alloc *alloc,
@@ -690,7 +638,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc,
 	BUG_ON(buffer->user_data > alloc->buffer + alloc->buffer_size);
 
 	if (buffer->async_transaction) {
-		alloc->free_async_space += buffer_size + sizeof(struct binder_buffer);
+		alloc->free_async_space += size + sizeof(struct binder_buffer);
 
 		binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
 			     "%d: binder_free_buf size %zd async free %zd\n",
@@ -731,7 +679,7 @@ static void binder_alloc_clear_buf(struct binder_alloc *alloc,
  * @alloc:	binder_alloc for this proc
  * @buffer:	kernel pointer to buffer
  *
- * Free the buffer allocated via binder_alloc_new_buf()
+ * Free the buffer allocated via binder_alloc_new_buffer()
  */
 void binder_alloc_free_buf(struct binder_alloc *alloc,
 			    struct binder_buffer *buffer)
@@ -773,34 +721,27 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
 	const char *failure_string;
 	struct binder_buffer *buffer;
 
-	if (unlikely(vma->vm_mm != alloc->vma_vm_mm)) {
-		ret = -EINVAL;
-		failure_string = "invalid vma->vm_mm";
-		goto err_invalid_mm;
-	}
-
 	mutex_lock(&binder_alloc_mmap_lock);
-	if (alloc->buffer_size) {
+	if (alloc->buffer) {
 		ret = -EBUSY;
 		failure_string = "already mapped";
 		goto err_already_mapped;
 	}
-	alloc->buffer_size = min_t(unsigned long, vma->vm_end - vma->vm_start,
-				   SZ_4M);
-	mutex_unlock(&binder_alloc_mmap_lock);
 
 	alloc->buffer = (void __user *)vma->vm_start;
+	mutex_unlock(&binder_alloc_mmap_lock);
 
-	alloc->pages = kcalloc(alloc->buffer_size / PAGE_SIZE,
-			       sizeof(alloc->pages[0]),
+	alloc->pages = kzalloc(sizeof(alloc->pages[0]) *
+				   ((vma->vm_end - vma->vm_start) / PAGE_SIZE),
 			       GFP_KERNEL);
 	if (alloc->pages == NULL) {
 		ret = -ENOMEM;
 		failure_string = "alloc page array";
 		goto err_alloc_pages_failed;
 	}
+	alloc->buffer_size = vma->vm_end - vma->vm_start;
 
-	buffer = kmem_cache_zalloc(binder_buffer_pool, GFP_KERNEL);
+	buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
 	if (!buffer) {
 		ret = -ENOMEM;
 		failure_string = "alloc buffer struct";
@@ -812,7 +753,11 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
 	buffer->free = 1;
 	binder_insert_free_buffer(alloc, buffer);
 	alloc->free_async_space = alloc->buffer_size / 2;
-	binder_alloc_set_vma(alloc, vma);
+	barrier();
+	alloc->vma = vma;
+	alloc->vma_vm_mm = vma->vm_mm;
+	/* Same as mmgrab() in later kernel versions */
+	atomic_inc(&alloc->vma_vm_mm->mm_count);
 
 	return 0;
 
@@ -820,16 +765,12 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
 	kfree(alloc->pages);
 	alloc->pages = NULL;
 err_alloc_pages_failed:
-	alloc->buffer = NULL;
 	mutex_lock(&binder_alloc_mmap_lock);
-	alloc->buffer_size = 0;
+	alloc->buffer = NULL;
 err_already_mapped:
 	mutex_unlock(&binder_alloc_mmap_lock);
-err_invalid_mm:
-	binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
-			   "%s: %d %lx-%lx %s failed %d\n", __func__,
-			   alloc->pid, vma->vm_start, vma->vm_end,
-			   failure_string, ret);
+	pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
+	       alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
 	return ret;
 }
 
@@ -840,10 +781,10 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc)
 	int buffers, page_count;
 	struct binder_buffer *buffer;
 
-	buffers = 0;
-	mutex_lock(&alloc->mutex);
 	BUG_ON(alloc->vma);
 
+	buffers = 0;
+	mutex_lock(&alloc->mutex);
 	while ((n = rb_first(&alloc->allocated_buffers))) {
 		buffer = rb_entry(n, struct binder_buffer, rb_node);
 
@@ -865,7 +806,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc)
 
 		list_del(&buffer->entry);
 		WARN_ON_ONCE(!list_empty(&alloc->buffers));
-		kmem_cache_free(binder_buffer_pool, buffer);
+		kfree(buffer);
 	}
 
 	page_count = 0;
@@ -945,18 +886,6 @@ void binder_alloc_print_pages(struct seq_file *m,
 	int free = 0;
 
 	mutex_lock(&alloc->mutex);
-	/*
-	 * Make sure the binder_alloc is fully initialized, otherwise we might
-	 * read inconsistent state.
-	 */
-
-	down_read;
-	if (binder_alloc_get_vma(alloc) == NULL) {
-		up_read;
-		goto uninitialized;
-	}
-
-	up_read;
 	for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) {
 		page = &alloc->pages[i];
 		if (!page->page_ptr)
@@ -966,8 +895,6 @@ void binder_alloc_print_pages(struct seq_file *m,
 		else
 			lru++;
 	}
-
-uninitialized:
 	mutex_unlock(&alloc->mutex);
 	seq_printf(m, "  pages: %d:%d:%d\n", active, lru, free);
 	seq_printf(m, "  pages high watermark: %zu\n", alloc->pages_high);
@@ -1002,7 +929,7 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc)
  */
 void binder_alloc_vma_close(struct binder_alloc *alloc)
 {
-	binder_alloc_set_vma(alloc, NULL);
+	WRITE_ONCE(alloc->vma, NULL);
 }
 
 /**
@@ -1018,7 +945,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 				       struct list_lru_one *lru,
 				       spinlock_t *lock,
 				       void *cb_arg)
-	__must_hold(lock)
 {
 	struct mm_struct *mm = NULL;
 	struct binder_lru_page *page = container_of(item,
@@ -1042,9 +968,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 	mm = alloc->vma_vm_mm;
 	if (!mmget_not_zero(mm))
 		goto err_mmget;
-	if (!*down_read_trylock)
-		goto err_down_read_mmap_sem_failed;
-	vma = binder_alloc_get_vma(alloc);
+	if (!down_write_trylock(&mm->mmap_sem))
+		goto err_down_write_mmap_sem_failed;
+	vma = alloc->vma;
 
 	list_lru_isolate(lru, item);
 	spin_unlock(lock);
@@ -1056,8 +982,8 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 
 		trace_binder_unmap_user_end(alloc, index);
 	}
-	up_read;
-	mmput_async(mm);
+	up_write(&mm->mmap_sem);
+	mmput(mm);
 
 	trace_binder_unmap_kernel_start(alloc, index);
 
@@ -1070,7 +996,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 	mutex_unlock(&alloc->mutex);
 	return LRU_REMOVED_RETRY;
 
-err_down_read_mmap_sem_failed:
+err_down_write_mmap_sem_failed:
 	mmput_async(mm);
 err_mmget:
 err_page_already_freed:
@@ -1112,8 +1038,6 @@ static struct shrinker binder_shrinker = {
 void binder_alloc_init(struct binder_alloc *alloc)
 {
 	alloc->pid = current->group_leader->pid;
-	alloc->vma_vm_mm = current->mm;
-	mmgrab(alloc->vma_vm_mm);
 	mutex_init(&alloc->mutex);
 	INIT_LIST_HEAD(&alloc->buffers);
 }
@@ -1271,16 +1195,15 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc,
 	return 0;
 }
 
-static int binder_alloc_do_buffer_copy(struct binder_alloc *alloc,
-				       bool to_buffer,
-				       struct binder_buffer *buffer,
-				       binder_size_t buffer_offset,
-				       void *ptr,
-				       size_t bytes)
+static void binder_alloc_do_buffer_copy(struct binder_alloc *alloc,
+					bool to_buffer,
+					struct binder_buffer *buffer,
+					binder_size_t buffer_offset,
+					void *ptr,
+					size_t bytes)
 {
 	/* All copies must be 32-bit aligned and 32-bit size */
-	if (!check_buffer(alloc, buffer, buffer_offset, bytes))
-		return -EINVAL;
+	BUG_ON(!check_buffer(alloc, buffer, buffer_offset, bytes));
 
 	while (bytes) {
 		unsigned long size;
@@ -1308,25 +1231,25 @@ static int binder_alloc_do_buffer_copy(struct binder_alloc *alloc,
 		ptr = ptr + size;
 		buffer_offset += size;
 	}
-	return 0;
 }
 
-int binder_alloc_copy_to_buffer(struct binder_alloc *alloc,
-				struct binder_buffer *buffer,
-				binder_size_t buffer_offset,
-				void *src,
-				size_t bytes)
+void binder_alloc_copy_to_buffer(struct binder_alloc *alloc,
+				 struct binder_buffer *buffer,
+				 binder_size_t buffer_offset,
+				 void *src,
+				 size_t bytes)
 {
-	return binder_alloc_do_buffer_copy(alloc, true, buffer, buffer_offset,
-					   src, bytes);
+	binder_alloc_do_buffer_copy(alloc, true, buffer, buffer_offset,
+				    src, bytes);
 }
 
-int binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
-				  void *dest,
-				  struct binder_buffer *buffer,
-				  binder_size_t buffer_offset,
-				  size_t bytes)
+void binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
+				   void *dest,
+				   struct binder_buffer *buffer,
+				   binder_size_t buffer_offset,
+				   size_t bytes)
 {
-	return binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset,
-					   dest, bytes);
+	binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset,
+				    dest, bytes);
 }
+
diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h
index a30eb98d99f4..da025cc94cd9 100644
--- a/drivers/android/binder_alloc.h
+++ b/drivers/android/binder_alloc.h
@@ -1,6 +1,15 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
  */
 
 #ifndef _LINUX_BINDER_ALLOC_H
@@ -13,6 +22,11 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/list_lru.h>
+
+#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT
+#define BINDER_IPC_32BIT 1
+#endif
+
 #include <uapi/linux/android/binder.h>
 
 extern struct list_lru binder_alloc_lru;
@@ -26,8 +40,6 @@ struct binder_transaction;
  * @clear_on_free:      %true if buffer must be zeroed after use
  * @allow_user_free:    %true if user is allowed to free buffer
  * @async_transaction:  %true if buffer is in use for an async txn
- * @oneway_spam_suspect: %true if total async allocate size just exceed
- * spamming detect threshold
  * @debug_id:           unique ID for debugging
  * @transaction:        pointer to associated struct binder_transaction
  * @target_node:        struct binder_node associated with this buffer
@@ -47,8 +59,7 @@ struct binder_buffer {
 	unsigned clear_on_free:1;
 	unsigned allow_user_free:1;
 	unsigned async_transaction:1;
-	unsigned oneway_spam_suspect:1;
-	unsigned debug_id:27;
+	unsigned debug_id:28;
 
 	struct binder_transaction *transaction;
 
@@ -90,8 +101,6 @@ struct binder_lru_page {
  * @buffer_size:        size of address space specified via mmap
  * @pid:                pid for associated binder_proc (invariant after init)
  * @pages_high:         high watermark of offset in @pages
- * @oneway_spam_detected: %true if oneway spam detection fired, clear that
- * flag once the async buffer has returned to a healthy state
  *
  * Bookkeeping structure for per-proc address space management for binder
  * buffers. It is normally initialized during binder_init() and binder_mmap()
@@ -112,7 +121,6 @@ struct binder_alloc {
 	uint32_t buffer_free;
 	int pid;
 	size_t pages_high;
-	bool oneway_spam_detected;
 };
 
 #ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
@@ -145,8 +153,6 @@ extern void binder_alloc_print_allocated(struct seq_file *m,
 					 struct binder_alloc *alloc);
 void binder_alloc_print_pages(struct seq_file *m,
 			      struct binder_alloc *alloc);
-extern int binder_buffer_pool_create(void);
-extern void binder_buffer_pool_destroy(void);
 
 /**
  * binder_alloc_get_free_async_space() - get free space available for async
@@ -172,16 +178,17 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc,
 				 const void __user *from,
 				 size_t bytes);
 
-int binder_alloc_copy_to_buffer(struct binder_alloc *alloc,
-				struct binder_buffer *buffer,
-				binder_size_t buffer_offset,
-				void *src,
-				size_t bytes);
+void binder_alloc_copy_to_buffer(struct binder_alloc *alloc,
+				 struct binder_buffer *buffer,
+				 binder_size_t buffer_offset,
+				 void *src,
+				 size_t bytes);
 
-int binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
-				  void *dest,
-				  struct binder_buffer *buffer,
-				  binder_size_t buffer_offset,
-				  size_t bytes);
+void binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
+				   void *dest,
+				   struct binder_buffer *buffer,
+				   binder_size_t buffer_offset,
+				   size_t bytes);
 
 #endif /* _LINUX_BINDER_ALLOC_H */
+
diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c
index c2b323bc3b3a..c839c490fde3 100644
--- a/drivers/android/binder_alloc_selftest.c
+++ b/drivers/android/binder_alloc_selftest.c
@@ -1,9 +1,18 @@
-// SPDX-License-Identifier: GPL-2.0-only
 /* binder_alloc_selftest.c
  *
  * Android IPC Subsystem
  *
  * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
deleted file mode 100644
index 3b6918d8a977..000000000000
--- a/drivers/android/binder_internal.h
+++ /dev/null
@@ -1,603 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _LINUX_BINDER_INTERNAL_H
-#define _LINUX_BINDER_INTERNAL_H
-
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/list.h>
-#include <linux/miscdevice.h>
-#include <linux/mutex.h>
-#include <linux/refcount.h>
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/uidgid.h>
-#include <uapi/linux/android/binderfs.h>
-#include <uapi/linux/eventpoll.h>
-#include "binder_alloc.h"
-
-#define ida_alloc_max(a, b, c) ida_simple_get(a, 0, b + 1, c)
-#define ida_free ida_remove
-
-typedef unsigned int __poll_t;
-typedef __bitwise int vm_fault_t;
-
-struct binder_context {
-	struct binder_node *binder_context_mgr_node;
-	struct mutex context_mgr_node_lock;
-	kuid_t binder_context_mgr_uid;
-	const char *name;
-};
-
-/**
- * struct binder_device - information about a binder device node
- * @hlist:          list of binder devices (only used for devices requested via
- *                  CONFIG_ANDROID_BINDER_DEVICES)
- * @miscdev:        information about a binder character device node
- * @context:        binder context information
- * @binderfs_inode: This is the inode of the root dentry of the super block
- *                  belonging to a binderfs mount.
- */
-struct binder_device {
-	struct hlist_node hlist;
-	struct miscdevice miscdev;
-	struct binder_context context;
-	struct inode *binderfs_inode;
-	refcount_t ref;
-};
-
-/**
- * binderfs_mount_opts - mount options for binderfs
- * @max: maximum number of allocatable binderfs binder devices
- * @stats_mode: enable binder stats in binderfs.
- */
-struct binderfs_mount_opts {
-	int max;
-	int stats_mode;
-};
-
-/**
- * binderfs_info - information about a binderfs mount
- * @ipc_ns:         The ipc namespace the binderfs mount belongs to.
- * @control_dentry: This records the dentry of this binderfs mount
- *                  binder-control device.
- * @root_uid:       uid that needs to be used when a new binder device is
- *                  created.
- * @root_gid:       gid that needs to be used when a new binder device is
- *                  created.
- * @mount_opts:     The mount options in use.
- * @device_count:   The current number of allocated binder devices.
- * @proc_log_dir:   Pointer to the directory dentry containing process-specific
- *                  logs.
- */
-struct binderfs_info {
-	struct ipc_namespace *ipc_ns;
-	struct dentry *control_dentry;
-	kuid_t root_uid;
-	kgid_t root_gid;
-	struct binderfs_mount_opts mount_opts;
-	int device_count;
-	struct dentry *proc_log_dir;
-};
-
-extern const struct file_operations binder_fops;
-
-extern char *binder_devices_param;
-
-#ifdef CONFIG_ANDROID_BINDERFS
-extern bool is_binderfs_device(const struct inode *inode);
-extern struct dentry *binderfs_create_file(struct dentry *dir, const char *name,
-					   const struct file_operations *fops,
-					   void *data);
-extern void binderfs_remove_file(struct dentry *dentry);
-#else
-static inline bool is_binderfs_device(const struct inode *inode)
-{
-	return false;
-}
-static inline struct dentry *binderfs_create_file(struct dentry *dir,
-					   const char *name,
-					   const struct file_operations *fops,
-					   void *data)
-{
-	return NULL;
-}
-static inline void binderfs_remove_file(struct dentry *dentry) {}
-#endif
-
-#ifdef CONFIG_ANDROID_BINDERFS
-extern int __init init_binderfs(void);
-#else
-static inline int __init init_binderfs(void)
-{
-	return 0;
-}
-#endif
-
-struct binder_debugfs_entry {
-	const char *name;
-	umode_t mode;
-	const struct file_operations *fops;
-	void *data;
-};
-
-extern const struct binder_debugfs_entry binder_debugfs_entries[];
-
-#define binder_for_each_debugfs_entry(entry)	\
-	for ((entry) = binder_debugfs_entries;	\
-	     (entry)->name;			\
-	     (entry)++)
-
-enum binder_stat_types {
-	BINDER_STAT_PROC,
-	BINDER_STAT_THREAD,
-	BINDER_STAT_NODE,
-	BINDER_STAT_REF,
-	BINDER_STAT_DEATH,
-	BINDER_STAT_TRANSACTION,
-	BINDER_STAT_TRANSACTION_COMPLETE,
-	BINDER_STAT_COUNT
-};
-
-struct binder_stats {
-	atomic_t br[_IOC_NR(BR_ONEWAY_SPAM_SUSPECT) + 1];
-	atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1];
-	atomic_t obj_created[BINDER_STAT_COUNT];
-	atomic_t obj_deleted[BINDER_STAT_COUNT];
-};
-
-/**
- * struct binder_work - work enqueued on a worklist
- * @entry:             node enqueued on list
- * @type:              type of work to be performed
- *
- * There are separate work lists for proc, thread, and node (async).
- */
-struct binder_work {
-	struct list_head entry;
-
-	enum binder_work_type {
-		BINDER_WORK_TRANSACTION = 1,
-		BINDER_WORK_TRANSACTION_COMPLETE,
-		BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT,
-		BINDER_WORK_RETURN_ERROR,
-		BINDER_WORK_NODE,
-		BINDER_WORK_DEAD_BINDER,
-		BINDER_WORK_DEAD_BINDER_AND_CLEAR,
-		BINDER_WORK_CLEAR_DEATH_NOTIFICATION,
-	} type;
-};
-
-struct binder_error {
-	struct binder_work work;
-	uint32_t cmd;
-};
-
-/**
- * struct binder_node - binder node bookkeeping
- * @debug_id:             unique ID for debugging
- *                        (invariant after initialized)
- * @lock:                 lock for node fields
- * @work:                 worklist element for node work
- *                        (protected by @proc->inner_lock)
- * @rb_node:              element for proc->nodes tree
- *                        (protected by @proc->inner_lock)
- * @dead_node:            element for binder_dead_nodes list
- *                        (protected by binder_dead_nodes_lock)
- * @proc:                 binder_proc that owns this node
- *                        (invariant after initialized)
- * @refs:                 list of references on this node
- *                        (protected by @lock)
- * @internal_strong_refs: used to take strong references when
- *                        initiating a transaction
- *                        (protected by @proc->inner_lock if @proc
- *                        and by @lock)
- * @local_weak_refs:      weak user refs from local process
- *                        (protected by @proc->inner_lock if @proc
- *                        and by @lock)
- * @local_strong_refs:    strong user refs from local process
- *                        (protected by @proc->inner_lock if @proc
- *                        and by @lock)
- * @tmp_refs:             temporary kernel refs
- *                        (protected by @proc->inner_lock while @proc
- *                        is valid, and by binder_dead_nodes_lock
- *                        if @proc is NULL. During inc/dec and node release
- *                        it is also protected by @lock to provide safety
- *                        as the node dies and @proc becomes NULL)
- * @ptr:                  userspace pointer for node
- *                        (invariant, no lock needed)
- * @cookie:               userspace cookie for node
- *                        (invariant, no lock needed)
- * @has_strong_ref:       userspace notified of strong ref
- *                        (protected by @proc->inner_lock if @proc
- *                        and by @lock)
- * @pending_strong_ref:   userspace has acked notification of strong ref
- *                        (protected by @proc->inner_lock if @proc
- *                        and by @lock)
- * @has_weak_ref:         userspace notified of weak ref
- *                        (protected by @proc->inner_lock if @proc
- *                        and by @lock)
- * @pending_weak_ref:     userspace has acked notification of weak ref
- *                        (protected by @proc->inner_lock if @proc
- *                        and by @lock)
- * @has_async_transaction: async transaction to node in progress
- *                        (protected by @lock)
- * @sched_policy:         minimum scheduling policy for node
- *                        (invariant after initialized)
- * @accept_fds:           file descriptor operations supported for node
- *                        (invariant after initialized)
- * @min_priority:         minimum scheduling priority
- *                        (invariant after initialized)
- * @inherit_rt:           inherit RT scheduling policy from caller
- * @txn_security_ctx:     require sender's security context
- *                        (invariant after initialized)
- * @async_todo:           list of async work items
- *                        (protected by @proc->inner_lock)
- *
- * Bookkeeping structure for binder nodes.
- */
-struct binder_node {
-	int debug_id;
-	spinlock_t lock;
-	struct binder_work work;
-	union {
-		struct rb_node rb_node;
-		struct hlist_node dead_node;
-	};
-	struct binder_proc *proc;
-	struct hlist_head refs;
-	int internal_strong_refs;
-	int local_weak_refs;
-	int local_strong_refs;
-	int tmp_refs;
-	binder_uintptr_t ptr;
-	binder_uintptr_t cookie;
-	struct {
-		/*
-		 * bitfield elements protected by
-		 * proc inner_lock
-		 */
-		u8 has_strong_ref:1;
-		u8 pending_strong_ref:1;
-		u8 has_weak_ref:1;
-		u8 pending_weak_ref:1;
-	};
-	struct {
-		/*
-		 * invariant after initialization
-		 */
-		u8 sched_policy:2;
-		u8 inherit_rt:1;
-		u8 accept_fds:1;
-		u8 txn_security_ctx:1;
-		u8 min_priority;
-	};
-	bool has_async_transaction;
-	struct list_head async_todo;
-};
-
-struct binder_ref_death {
-	/**
-	 * @work: worklist element for death notifications
-	 *        (protected by inner_lock of the proc that
-	 *        this ref belongs to)
-	 */
-	struct binder_work work;
-	binder_uintptr_t cookie;
-};
-
-/**
- * struct binder_ref_data - binder_ref counts and id
- * @debug_id:        unique ID for the ref
- * @desc:            unique userspace handle for ref
- * @strong:          strong ref count (debugging only if not locked)
- * @weak:            weak ref count (debugging only if not locked)
- *
- * Structure to hold ref count and ref id information. Since
- * the actual ref can only be accessed with a lock, this structure
- * is used to return information about the ref to callers of
- * ref inc/dec functions.
- */
-struct binder_ref_data {
-	int debug_id;
-	uint32_t desc;
-	int strong;
-	int weak;
-};
-
-/**
- * struct binder_ref - struct to track references on nodes
- * @data:        binder_ref_data containing id, handle, and current refcounts
- * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree
- * @rb_node_node: node for lookup by @node in proc's rb_tree
- * @node_entry:  list entry for node->refs list in target node
- *               (protected by @node->lock)
- * @proc:        binder_proc containing ref
- * @node:        binder_node of target node. When cleaning up a
- *               ref for deletion in binder_cleanup_ref, a non-NULL
- *               @node indicates the node must be freed
- * @death:       pointer to death notification (ref_death) if requested
- *               (protected by @node->lock)
- *
- * Structure to track references from procA to target node (on procB). This
- * structure is unsafe to access without holding @proc->outer_lock.
- */
-struct binder_ref {
-	/* Lookups needed: */
-	/*   node + proc => ref (transaction) */
-	/*   desc + proc => ref (transaction, inc/dec ref) */
-	/*   node => refs + procs (proc exit) */
-	struct binder_ref_data data;
-	struct rb_node rb_node_desc;
-	struct rb_node rb_node_node;
-	struct hlist_node node_entry;
-	struct binder_proc *proc;
-	struct binder_node *node;
-	struct binder_ref_death *death;
-};
-
-/**
- * struct binder_priority - scheduler policy and priority
- * @sched_policy            scheduler policy
- * @prio                    [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT
- *
- * The binder driver supports inheriting the following scheduler policies:
- * SCHED_NORMAL
- * SCHED_BATCH
- * SCHED_FIFO
- * SCHED_RR
- */
-struct binder_priority {
-	unsigned int sched_policy;
-	int prio;
-};
-
-enum binder_prio_state {
-	BINDER_PRIO_SET,	/* desired priority set */
-	BINDER_PRIO_PENDING,	/* initiated a saved priority restore */
-	BINDER_PRIO_ABORT,	/* abort the pending priority restore */
-};
-
-/**
- * struct binder_proc - binder process bookkeeping
- * @proc_node:            element for binder_procs list
- * @threads:              rbtree of binder_threads in this proc
- *                        (protected by @inner_lock)
- * @nodes:                rbtree of binder nodes associated with
- *                        this proc ordered by node->ptr
- *                        (protected by @inner_lock)
- * @refs_by_desc:         rbtree of refs ordered by ref->desc
- *                        (protected by @outer_lock)
- * @refs_by_node:         rbtree of refs ordered by ref->node
- *                        (protected by @outer_lock)
- * @waiting_threads:      threads currently waiting for proc work
- *                        (protected by @inner_lock)
- * @pid                   PID of group_leader of process
- *                        (invariant after initialized)
- * @tsk                   task_struct for group_leader of process
- *                        (invariant after initialized)
- * @cred                  struct cred associated with the `struct file`
- *                        in binder_open()
- *                        (invariant after initialized)
- * @deferred_work_node:   element for binder_deferred_list
- *                        (protected by binder_deferred_lock)
- * @deferred_work:        bitmap of deferred work to perform
- *                        (protected by binder_deferred_lock)
- * @outstanding_txns:     number of transactions to be transmitted before
- *                        processes in freeze_wait are woken up
- *                        (protected by @inner_lock)
- * @is_dead:              process is dead and awaiting free
- *                        when outstanding transactions are cleaned up
- *                        (protected by @inner_lock)
- * @is_frozen:            process is frozen and unable to service
- *                        binder transactions
- *                        (protected by @inner_lock)
- * @sync_recv:            process received sync transactions since last frozen
- *                        bit 0: received sync transaction after being frozen
- *                        bit 1: new pending sync transaction during freezing
- *                        (protected by @inner_lock)
- * @async_recv:           process received async transactions since last frozen
- *                        (protected by @inner_lock)
- * @freeze_wait:          waitqueue of processes waiting for all outstanding
- *                        transactions to be processed
- *                        (protected by @inner_lock)
- * @todo:                 list of work for this process
- *                        (protected by @inner_lock)
- * @stats:                per-process binder statistics
- *                        (atomics, no lock needed)
- * @delivered_death:      list of delivered death notification
- *                        (protected by @inner_lock)
- * @max_threads:          cap on number of binder threads
- *                        (protected by @inner_lock)
- * @requested_threads:    number of binder threads requested but not
- *                        yet started. In current implementation, can
- *                        only be 0 or 1.
- *                        (protected by @inner_lock)
- * @requested_threads_started: number binder threads started
- *                        (protected by @inner_lock)
- * @tmp_ref:              temporary reference to indicate proc is in use
- *                        (protected by @inner_lock)
- * @default_priority:     default scheduler priority
- *                        (invariant after initialized)
- * @debugfs_entry:        debugfs node
- * @alloc:                binder allocator bookkeeping
- * @context:              binder_context for this proc
- *                        (invariant after initialized)
- * @inner_lock:           can nest under outer_lock and/or node lock
- * @outer_lock:           no nesting under innor or node lock
- *                        Lock order: 1) outer, 2) node, 3) inner
- * @binderfs_entry:       process-specific binderfs log file
- * @oneway_spam_detection_enabled: process enabled oneway spam detection
- *                        or not
- *
- * Bookkeeping structure for binder processes
- */
-struct binder_proc {
-	struct hlist_node proc_node;
-	struct rb_root threads;
-	struct rb_root nodes;
-	struct rb_root refs_by_desc;
-	struct rb_root refs_by_node;
-	struct list_head waiting_threads;
-	int pid;
-	struct task_struct *tsk;
-	const struct cred *cred;
-	struct hlist_node deferred_work_node;
-	int deferred_work;
-	int outstanding_txns;
-	bool is_dead;
-	bool is_frozen;
-	bool sync_recv;
-	bool async_recv;
-	wait_queue_head_t freeze_wait;
-
-	struct list_head todo;
-	struct binder_stats stats;
-	struct list_head delivered_death;
-	int max_threads;
-	int requested_threads;
-	int requested_threads_started;
-	int tmp_ref;
-	struct binder_priority default_priority;
-	struct dentry *debugfs_entry;
-	struct binder_alloc alloc;
-	struct binder_context *context;
-	spinlock_t inner_lock;
-	spinlock_t outer_lock;
-	struct dentry *binderfs_entry;
-	bool oneway_spam_detection_enabled;
-};
-
-/**
- * struct binder_thread - binder thread bookkeeping
- * @proc:                 binder process for this thread
- *                        (invariant after initialization)
- * @rb_node:              element for proc->threads rbtree
- *                        (protected by @proc->inner_lock)
- * @waiting_thread_node:  element for @proc->waiting_threads list
- *                        (protected by @proc->inner_lock)
- * @pid:                  PID for this thread
- *                        (invariant after initialization)
- * @looper:               bitmap of looping state
- *                        (only accessed by this thread)
- * @looper_needs_return:  looping thread needs to exit driver
- *                        (no lock needed)
- * @transaction_stack:    stack of in-progress transactions for this thread
- *                        (protected by @proc->inner_lock)
- * @todo:                 list of work to do for this thread
- *                        (protected by @proc->inner_lock)
- * @process_todo:         whether work in @todo should be processed
- *                        (protected by @proc->inner_lock)
- * @return_error:         transaction errors reported by this thread
- *                        (only accessed by this thread)
- * @reply_error:          transaction errors reported by target thread
- *                        (protected by @proc->inner_lock)
- * @wait:                 wait queue for thread work
- * @stats:                per-thread statistics
- *                        (atomics, no lock needed)
- * @tmp_ref:              temporary reference to indicate thread is in use
- *                        (atomic since @proc->inner_lock cannot
- *                        always be acquired)
- * @is_dead:              thread is dead and awaiting free
- *                        when outstanding transactions are cleaned up
- *                        (protected by @proc->inner_lock)
- * @task:                 struct task_struct for this thread
- * @prio_lock:            protects thread priority fields
- * @prio_next:            saved priority to be restored next
- *                        (protected by @prio_lock)
- * @prio_state:           state of the priority restore process as
- *                        defined by enum binder_prio_state
- *                        (protected by @prio_lock)
- *
- * Bookkeeping structure for binder threads.
- */
-struct binder_thread {
-	struct binder_proc *proc;
-	struct rb_node rb_node;
-	struct list_head waiting_thread_node;
-	int pid;
-	int looper;              /* only modified by this thread */
-	bool looper_need_return; /* can be written by other thread */
-	struct binder_transaction *transaction_stack;
-	struct list_head todo;
-	bool process_todo;
-	struct binder_error return_error;
-	struct binder_error reply_error;
-	wait_queue_head_t wait;
-	struct binder_stats stats;
-	atomic_t tmp_ref;
-	bool is_dead;
-	struct task_struct *task;
-	spinlock_t prio_lock;
-	struct binder_priority prio_next;
-	enum binder_prio_state prio_state;
-};
-
-/**
- * struct binder_txn_fd_fixup - transaction fd fixup list element
- * @fixup_entry:          list entry
- * @file:                 struct file to be associated with new fd
- * @offset:               offset in buffer data to this fixup
- *
- * List element for fd fixups in a transaction. Since file
- * descriptors need to be allocated in the context of the
- * target process, we pass each fd to be processed in this
- * struct.
- */
-struct binder_txn_fd_fixup {
-	struct list_head fixup_entry;
-	struct file *file;
-	size_t offset;
-};
-
-struct binder_transaction {
-	int debug_id;
-	struct binder_work work;
-	struct binder_thread *from;
-	struct binder_transaction *from_parent;
-	struct binder_proc *to_proc;
-	struct binder_thread *to_thread;
-	struct binder_transaction *to_parent;
-	unsigned need_reply:1;
-	/* unsigned is_dead:1; */       /* not used at the moment */
-
-	struct binder_buffer *buffer;
-	unsigned int    code;
-	unsigned int    flags;
-	struct binder_priority priority;
-	struct binder_priority saved_priority;
-	bool set_priority_called;
-	bool is_nested;
-	kuid_t  sender_euid;
-	struct list_head fd_fixups;
-	binder_uintptr_t security_ctx;
-	/**
-	 * @lock:  protects @from, @to_proc, and @to_thread
-	 *
-	 * @from, @to_proc, and @to_thread can be set to NULL
-	 * during thread teardown
-	 */
-	spinlock_t lock;
-};
-
-/**
- * struct binder_object - union of flat binder object types
- * @hdr:   generic object header
- * @fbo:   binder object (nodes and refs)
- * @fdo:   file descriptor object
- * @bbo:   binder buffer pointer
- * @fdao:  file descriptor array
- *
- * Used for type-independent object copies
- */
-struct binder_object {
-	union {
-		struct binder_object_header hdr;
-		struct flat_binder_object fbo;
-		struct binder_fd_object fdo;
-		struct binder_buffer_object bbo;
-		struct binder_fd_array_object fdao;
-	};
-};
-
-#endif /* _LINUX_BINDER_INTERNAL_H */
diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h
index 5d82cf8af88b..7674231af8cb 100644
--- a/drivers/android/binder_trace.h
+++ b/drivers/android/binder_trace.h
@@ -1,6 +1,15 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
  */
 
 #undef TRACE_SYSTEM
@@ -119,35 +128,6 @@ TRACE_EVENT(binder_wait_for_work,
 		  __entry->thread_todo)
 );
 
-TRACE_EVENT(binder_txn_latency_free,
-	TP_PROTO(struct binder_transaction *t,
-		 int from_proc, int from_thread,
-		 int to_proc, int to_thread),
-	TP_ARGS(t, from_proc, from_thread, to_proc, to_thread),
-	TP_STRUCT__entry(
-		__field(int, debug_id)
-		__field(int, from_proc)
-		__field(int, from_thread)
-		__field(int, to_proc)
-		__field(int, to_thread)
-		__field(unsigned int, code)
-		__field(unsigned int, flags)
-	),
-	TP_fast_assign(
-		__entry->debug_id = t->debug_id;
-		__entry->from_proc = from_proc;
-		__entry->from_thread = from_thread;
-		__entry->to_proc = to_proc;
-		__entry->to_thread = to_thread;
-		__entry->code = t->code;
-		__entry->flags = t->flags;
-	),
-	TP_printk("transaction=%d from %d:%d to %d:%d flags=0x%x code=0x%x",
-		  __entry->debug_id, __entry->from_proc, __entry->from_thread,
-		  __entry->to_proc, __entry->to_thread, __entry->code,
-		  __entry->flags)
-);
-
 TRACE_EVENT(binder_transaction,
 	TP_PROTO(bool reply, struct binder_transaction *t,
 		 struct binder_node *target_node),
@@ -267,40 +247,22 @@ TRACE_EVENT(binder_transaction_ref_to_ref,
 		  __entry->dest_ref_debug_id, __entry->dest_ref_desc)
 );
 
-TRACE_EVENT(binder_transaction_fd_send,
-	TP_PROTO(struct binder_transaction *t, int fd, size_t offset),
-	TP_ARGS(t, fd, offset),
+TRACE_EVENT(binder_transaction_fd,
+	TP_PROTO(struct binder_transaction *t, int src_fd, int dest_fd),
+	TP_ARGS(t, src_fd, dest_fd),
 
 	TP_STRUCT__entry(
 		__field(int, debug_id)
-		__field(int, fd)
-		__field(size_t, offset)
+		__field(int, src_fd)
+		__field(int, dest_fd)
 	),
 	TP_fast_assign(
 		__entry->debug_id = t->debug_id;
-		__entry->fd = fd;
-		__entry->offset = offset;
+		__entry->src_fd = src_fd;
+		__entry->dest_fd = dest_fd;
 	),
-	TP_printk("transaction=%d src_fd=%d offset=%zu",
-		  __entry->debug_id, __entry->fd, __entry->offset)
-);
-
-TRACE_EVENT(binder_transaction_fd_recv,
-	TP_PROTO(struct binder_transaction *t, int fd, size_t offset),
-	TP_ARGS(t, fd, offset),
-
-	TP_STRUCT__entry(
-		__field(int, debug_id)
-		__field(int, fd)
-		__field(size_t, offset)
-	),
-	TP_fast_assign(
-		__entry->debug_id = t->debug_id;
-		__entry->fd = fd;
-		__entry->offset = offset;
-	),
-	TP_printk("transaction=%d dest_fd=%d offset=%zu",
-		  __entry->debug_id, __entry->fd, __entry->offset)
+	TP_printk("transaction=%d src_fd=%d ==> dest_fd=%d",
+		  __entry->debug_id, __entry->src_fd, __entry->dest_fd)
 );
 
 DECLARE_EVENT_CLASS(binder_buffer_class,
@@ -310,17 +272,14 @@ DECLARE_EVENT_CLASS(binder_buffer_class,
 		__field(int, debug_id)
 		__field(size_t, data_size)
 		__field(size_t, offsets_size)
-		__field(size_t, extra_buffers_size)
 	),
 	TP_fast_assign(
 		__entry->debug_id = buf->debug_id;
 		__entry->data_size = buf->data_size;
 		__entry->offsets_size = buf->offsets_size;
-		__entry->extra_buffers_size = buf->extra_buffers_size;
 	),
-	TP_printk("transaction=%d data_size=%zd offsets_size=%zd extra_buffers_size=%zd",
-		  __entry->debug_id, __entry->data_size, __entry->offsets_size,
-		  __entry->extra_buffers_size)
+	TP_printk("transaction=%d data_size=%zd offsets_size=%zd",
+		  __entry->debug_id, __entry->data_size, __entry->offsets_size)
 );
 
 DEFINE_EVENT(binder_buffer_class, binder_transaction_alloc_buf,
@@ -335,10 +294,6 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release,
 	TP_PROTO(struct binder_buffer *buffer),
 	TP_ARGS(buffer));
 
-DEFINE_EVENT(binder_buffer_class, binder_transaction_update_buffer_release,
-	     TP_PROTO(struct binder_buffer *buffer),
-	     TP_ARGS(buffer));
-
 TRACE_EVENT(binder_update_page_range,
 	TP_PROTO(struct binder_alloc *alloc, bool allocate,
 		 void __user *start, void __user *end),
diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
deleted file mode 100644
index f80d1fb9d9b2..000000000000
--- a/drivers/android/binderfs.c
+++ /dev/null
@@ -1,819 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/fsnotify.h>
-#include <linux/gfp.h>
-#include <linux/idr.h>
-#include <linux/init.h>
-#include <linux/ipc_namespace.h>
-#include <linux/kdev_t.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/namei.h>
-#include <linux/magic.h>
-#include <linux/major.h>
-#include <linux/miscdevice.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/mount.h>
-#include <linux/fs_parser.h>
-#include <linux/radix-tree.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/spinlock_types.h>
-#include <linux/stddef.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/uaccess.h>
-#include <linux/user_namespace.h>
-#include <uapi/asm-generic/errno-base.h>
-#include <uapi/linux/android/binder.h>
-#include <uapi/linux/android/binderfs.h>
-
-#include "binder_internal.h"
-
-#define ida_alloc_max(a, b, c) ida_simple_get(a, 0, b + 1, c)
-#define ida_free ida_remove
-
-#define FIRST_INODE 1
-#define SECOND_INODE 2
-#define INODE_OFFSET 3
-#define INTSTRLEN 21
-#define BINDERFS_MAX_MINOR (1U << MINORBITS)
-/* Ensure that the initial ipc namespace always has devices available. */
-#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4)
-
-static dev_t binderfs_dev;
-static DEFINE_MUTEX(binderfs_minors_mutex);
-static DEFINE_IDA(binderfs_minors);
-
-enum binderfs_param {
-	Opt_max,
-	Opt_stats_mode,
-};
-
-enum binderfs_stats_mode {
-	binderfs_stats_mode_unset,
-	binderfs_stats_mode_global,
-};
-
-struct binder_features {
-	bool oneway_spam_detection;
-};
-
-static const struct constant_table binderfs_param_stats[] = {
-	{ "global", binderfs_stats_mode_global },
-	{}
-};
-
-static const struct fs_parameter_spec binderfs_fs_parameters[] = {
-	fsparam_u32("max",	Opt_max),
-	fsparam_enum("stats",	Opt_stats_mode, binderfs_param_stats),
-	{}
-};
-
-static struct binder_features binder_features = {
-	.oneway_spam_detection = true,
-};
-
-static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-bool is_binderfs_device(const struct inode *inode)
-{
-	if (inode->i_sb->s_magic == BINDERFS_SUPER_MAGIC)
-		return true;
-
-	return false;
-}
-
-/**
- * binderfs_binder_device_create - allocate inode from super block of a
- *                                 binderfs mount
- * @ref_inode: inode from wich the super block will be taken
- * @userp:     buffer to copy information about new device for userspace to
- * @req:       struct binderfs_device as copied from userspace
- *
- * This function allocates a new binder_device and reserves a new minor
- * number for it.
- * Minor numbers are limited and tracked globally in binderfs_minors. The
- * function will stash a struct binder_device for the specific binder
- * device in i_private of the inode.
- * It will go on to allocate a new inode from the super block of the
- * filesystem mount, stash a struct binder_device in its i_private field
- * and attach a dentry to that inode.
- *
- * Return: 0 on success, negative errno on failure
- */
-static int binderfs_binder_device_create(struct inode *ref_inode,
-					 struct binderfs_device __user *userp,
-					 struct binderfs_device *req)
-{
-	int minor, ret;
-	struct dentry *dentry, *root;
-	struct binder_device *device;
-	char *name = NULL;
-	size_t name_len;
-	struct inode *inode = NULL;
-	struct super_block *sb = ref_inode->i_sb;
-	struct binderfs_info *info = sb->s_fs_info;
-#if defined(CONFIG_IPC_NS)
-	bool use_reserve = (info->ipc_ns == &init_ipc_ns);
-#else
-	bool use_reserve = true;
-#endif
-
-	/* Reserve new minor number for the new device. */
-	mutex_lock(&binderfs_minors_mutex);
-	if (++info->device_count <= info->mount_opts.max)
-		minor = ida_alloc_max(&binderfs_minors,
-				      use_reserve ? BINDERFS_MAX_MINOR :
-						    BINDERFS_MAX_MINOR_CAPPED,
-				      GFP_KERNEL);
-	else
-		minor = -ENOSPC;
-	if (minor < 0) {
-		--info->device_count;
-		mutex_unlock(&binderfs_minors_mutex);
-		return minor;
-	}
-	mutex_unlock(&binderfs_minors_mutex);
-
-	ret = -ENOMEM;
-	device = kzalloc(sizeof(*device), GFP_KERNEL);
-	if (!device)
-		goto err;
-
-	inode = new_inode(sb);
-	if (!inode)
-		goto err;
-
-	inode->i_ino = minor + INODE_OFFSET;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
-	init_special_inode(inode, S_IFCHR | 0600,
-			   MKDEV(MAJOR(binderfs_dev), minor));
-	inode->i_fop = &binder_fops;
-	inode->i_uid = info->root_uid;
-	inode->i_gid = info->root_gid;
-
-	req->name[BINDERFS_MAX_NAME] = '\0'; /* NUL-terminate */
-	name_len = strlen(req->name);
-	/* Make sure to include terminating NUL byte */
-	name = kmemdup(req->name, name_len + 1, GFP_KERNEL);
-	if (!name)
-		goto err;
-
-	refcount_set(&device->ref, 1);
-	device->binderfs_inode = inode;
-	device->context.binder_context_mgr_uid = INVALID_UID;
-	device->context.name = name;
-	device->miscdev.name = name;
-	device->miscdev.minor = minor;
-	mutex_init(&device->context.context_mgr_node_lock);
-
-	req->major = MAJOR(binderfs_dev);
-	req->minor = minor;
-
-	if (userp && copy_to_user(userp, req, sizeof(*req))) {
-		ret = -EFAULT;
-		goto err;
-	}
-
-	root = sb->s_root;
-	inode_lock(d_inode(root));
-
-	/* look it up */
-	dentry = lookup_one_len(name, root, name_len);
-	if (IS_ERR(dentry)) {
-		inode_unlock(d_inode(root));
-		ret = PTR_ERR(dentry);
-		goto err;
-	}
-
-	if (d_really_is_positive(dentry)) {
-		/* already exists */
-		dput(dentry);
-		inode_unlock(d_inode(root));
-		ret = -EEXIST;
-		goto err;
-	}
-
-	inode->i_private = device;
-	d_instantiate(dentry, inode);
-	fsnotify_create(root->d_inode, dentry);
-	inode_unlock(d_inode(root));
-
-	return 0;
-
-err:
-	kfree(name);
-	kfree(device);
-	mutex_lock(&binderfs_minors_mutex);
-	--info->device_count;
-	ida_free(&binderfs_minors, minor);
-	mutex_unlock(&binderfs_minors_mutex);
-	iput(inode);
-
-	return ret;
-}
-
-/**
- * binderfs_ctl_ioctl - handle binder device node allocation requests
- *
- * The request handler for the binder-control device. All requests operate on
- * the binderfs mount the binder-control device resides in:
- * - BINDER_CTL_ADD
- *   Allocate a new binder device.
- *
- * Return: 0 on success, negative errno on failure
- */
-static long binder_ctl_ioctl(struct file *file, unsigned int cmd,
-			     unsigned long arg)
-{
-	int ret = -EINVAL;
-	struct inode *inode = file_inode(file);
-	struct binderfs_device __user *device = (struct binderfs_device __user *)arg;
-	struct binderfs_device device_req;
-
-	switch (cmd) {
-	case BINDER_CTL_ADD:
-		ret = copy_from_user(&device_req, device, sizeof(device_req));
-		if (ret) {
-			ret = -EFAULT;
-			break;
-		}
-
-		ret = binderfs_binder_device_create(inode, device, &device_req);
-		break;
-	default:
-		break;
-	}
-
-	return ret;
-}
-
-static void binderfs_evict_inode(struct inode *inode)
-{
-	struct binder_device *device = inode->i_private;
-	struct binderfs_info *info = BINDERFS_SB(inode->i_sb);
-
-	clear_inode(inode);
-
-	if (!S_ISCHR(inode->i_mode) || !device)
-		return;
-
-	mutex_lock(&binderfs_minors_mutex);
-	--info->device_count;
-	ida_free(&binderfs_minors, device->miscdev.minor);
-	mutex_unlock(&binderfs_minors_mutex);
-
-	if (refcount_dec_and_test(&device->ref)) {
-		kfree(device->context.name);
-		kfree(device);
-	}
-}
-
-static int binderfs_fs_context_parse_param(struct fs_context *fc,
-					   struct fs_parameter *param)
-{
-	int opt;
-	struct binderfs_mount_opts *ctx = fc->fs_private;
-	struct fs_parse_result result;
-
-	opt = fs_parse(fc, binderfs_fs_parameters, param, &result);
-	if (opt < 0)
-		return opt;
-
-	switch (opt) {
-	case Opt_max:
-		if (result.uint_32 > BINDERFS_MAX_MINOR)
-			return invalfc(fc, "Bad value for '%s'", param->key);
-
-		ctx->max = result.uint_32;
-		break;
-	case Opt_stats_mode:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		ctx->stats_mode = result.uint_32;
-		break;
-	default:
-		return invalfc(fc, "Unsupported parameter '%s'", param->key);
-	}
-
-	return 0;
-}
-
-static int binderfs_fs_context_reconfigure(struct fs_context *fc)
-{
-	struct binderfs_mount_opts *ctx = fc->fs_private;
-	struct binderfs_info *info = BINDERFS_SB(fc->root->d_sb);
-
-	if (info->mount_opts.stats_mode != ctx->stats_mode)
-		return invalfc(fc, "Binderfs stats mode cannot be changed during a remount");
-
-	info->mount_opts.stats_mode = ctx->stats_mode;
-	info->mount_opts.max = ctx->max;
-	return 0;
-}
-
-static int binderfs_show_options(struct seq_file *seq, struct dentry *root)
-{
-	struct binderfs_info *info = BINDERFS_SB(root->d_sb);
-
-	if (info->mount_opts.max <= BINDERFS_MAX_MINOR)
-		seq_printf(seq, ",max=%d", info->mount_opts.max);
-
-	switch (info->mount_opts.stats_mode) {
-	case binderfs_stats_mode_unset:
-		break;
-	case binderfs_stats_mode_global:
-		seq_printf(seq, ",stats=global");
-		break;
-	}
-
-	return 0;
-}
-
-static void binderfs_put_super(struct super_block *sb)
-{
-	struct binderfs_info *info = sb->s_fs_info;
-
-	if (info && info->ipc_ns)
-		put_ipc_ns(info->ipc_ns);
-
-	kfree(info);
-	sb->s_fs_info = NULL;
-}
-
-static const struct super_operations binderfs_super_ops = {
-	.evict_inode    = binderfs_evict_inode,
-	.show_options	= binderfs_show_options,
-	.statfs         = simple_statfs,
-	.put_super	= binderfs_put_super,
-};
-
-static inline bool is_binderfs_control_device(const struct dentry *dentry)
-{
-	struct binderfs_info *info = dentry->d_sb->s_fs_info;
-
-	return info->control_dentry == dentry;
-}
-
-static int binderfs_rename(struct user_namespace *mnt_userns,
-			   struct inode *old_dir, struct dentry *old_dentry,
-			   struct inode *new_dir, struct dentry *new_dentry,
-			   unsigned int flags)
-{
-	if (is_binderfs_control_device(old_dentry) ||
-	    is_binderfs_control_device(new_dentry))
-		return -EPERM;
-
-	return simple_rename(&init_user_ns, old_dir, old_dentry, new_dir,
-			     new_dentry, flags);
-}
-
-static int binderfs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	if (is_binderfs_control_device(dentry))
-		return -EPERM;
-
-	return simple_unlink(dir, dentry);
-}
-
-static const struct file_operations binder_ctl_fops = {
-	.owner		= THIS_MODULE,
-	.open		= nonseekable_open,
-	.unlocked_ioctl	= binder_ctl_ioctl,
-	.compat_ioctl	= binder_ctl_ioctl,
-	.llseek		= noop_llseek,
-};
-
-/**
- * binderfs_binder_ctl_create - create a new binder-control device
- * @sb: super block of the binderfs mount
- *
- * This function creates a new binder-control device node in the binderfs mount
- * referred to by @sb.
- *
- * Return: 0 on success, negative errno on failure
- */
-static int binderfs_binder_ctl_create(struct super_block *sb)
-{
-	int minor, ret;
-	struct dentry *dentry;
-	struct binder_device *device;
-	struct inode *inode = NULL;
-	struct dentry *root = sb->s_root;
-	struct binderfs_info *info = sb->s_fs_info;
-#if defined(CONFIG_IPC_NS)
-	bool use_reserve = (info->ipc_ns == &init_ipc_ns);
-#else
-	bool use_reserve = true;
-#endif
-
-	device = kzalloc(sizeof(*device), GFP_KERNEL);
-	if (!device)
-		return -ENOMEM;
-
-	/* If we have already created a binder-control node, return. */
-	if (info->control_dentry) {
-		ret = 0;
-		goto out;
-	}
-
-	ret = -ENOMEM;
-	inode = new_inode(sb);
-	if (!inode)
-		goto out;
-
-	/* Reserve a new minor number for the new device. */
-	mutex_lock(&binderfs_minors_mutex);
-	minor = ida_alloc_max(&binderfs_minors,
-			      use_reserve ? BINDERFS_MAX_MINOR :
-					    BINDERFS_MAX_MINOR_CAPPED,
-			      GFP_KERNEL);
-	mutex_unlock(&binderfs_minors_mutex);
-	if (minor < 0) {
-		ret = minor;
-		goto out;
-	}
-
-	inode->i_ino = SECOND_INODE;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
-	init_special_inode(inode, S_IFCHR | 0600,
-			   MKDEV(MAJOR(binderfs_dev), minor));
-	inode->i_fop = &binder_ctl_fops;
-	inode->i_uid = info->root_uid;
-	inode->i_gid = info->root_gid;
-
-	refcount_set(&device->ref, 1);
-	device->binderfs_inode = inode;
-	device->miscdev.minor = minor;
-
-	dentry = d_alloc_name(root, "binder-control");
-	if (!dentry)
-		goto out;
-
-	inode->i_private = device;
-	info->control_dentry = dentry;
-	d_add(dentry, inode);
-
-	return 0;
-
-out:
-	kfree(device);
-	iput(inode);
-
-	return ret;
-}
-
-static const struct inode_operations binderfs_dir_inode_operations = {
-	.lookup = simple_lookup,
-	.rename = binderfs_rename,
-	.unlink = binderfs_unlink,
-};
-
-static struct inode *binderfs_make_inode(struct super_block *sb, int mode)
-{
-	struct inode *ret;
-
-	ret = new_inode(sb);
-	if (ret) {
-		ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET);
-		ret->i_mode = mode;
-		ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret);
-	}
-	return ret;
-}
-
-static struct dentry *binderfs_create_dentry(struct dentry *parent,
-					     const char *name)
-{
-	struct dentry *dentry;
-
-	dentry = lookup_one_len(name, parent, strlen(name));
-	if (IS_ERR(dentry))
-		return dentry;
-
-	/* Return error if the file/dir already exists. */
-	if (d_really_is_positive(dentry)) {
-		dput(dentry);
-		return ERR_PTR(-EEXIST);
-	}
-
-	return dentry;
-}
-
-void binderfs_remove_file(struct dentry *dentry)
-{
-	struct inode *parent_inode;
-
-	parent_inode = d_inode(dentry->d_parent);
-	inode_lock(parent_inode);
-	if (simple_positive(dentry)) {
-		dget(dentry);
-		simple_unlink(parent_inode, dentry);
-		d_delete(dentry);
-		dput(dentry);
-	}
-	inode_unlock(parent_inode);
-}
-
-struct dentry *binderfs_create_file(struct dentry *parent, const char *name,
-				    const struct file_operations *fops,
-				    void *data)
-{
-	struct dentry *dentry;
-	struct inode *new_inode, *parent_inode;
-	struct super_block *sb;
-
-	parent_inode = d_inode(parent);
-	inode_lock(parent_inode);
-
-	dentry = binderfs_create_dentry(parent, name);
-	if (IS_ERR(dentry))
-		goto out;
-
-	sb = parent_inode->i_sb;
-	new_inode = binderfs_make_inode(sb, S_IFREG | 0444);
-	if (!new_inode) {
-		dput(dentry);
-		dentry = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	new_inode->i_fop = fops;
-	new_inode->i_private = data;
-	d_instantiate(dentry, new_inode);
-	fsnotify_create(parent_inode, dentry);
-
-out:
-	inode_unlock(parent_inode);
-	return dentry;
-}
-
-static struct dentry *binderfs_create_dir(struct dentry *parent,
-					  const char *name)
-{
-	struct dentry *dentry;
-	struct inode *new_inode, *parent_inode;
-	struct super_block *sb;
-
-	parent_inode = d_inode(parent);
-	inode_lock(parent_inode);
-
-	dentry = binderfs_create_dentry(parent, name);
-	if (IS_ERR(dentry))
-		goto out;
-
-	sb = parent_inode->i_sb;
-	new_inode = binderfs_make_inode(sb, S_IFDIR | 0755);
-	if (!new_inode) {
-		dput(dentry);
-		dentry = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	new_inode->i_fop = &simple_dir_operations;
-	new_inode->i_op = &simple_dir_inode_operations;
-
-	set_nlink(new_inode, 2);
-	d_instantiate(dentry, new_inode);
-	inc_nlink(parent_inode);
-	fsnotify_mkdir(parent_inode, dentry);
-
-out:
-	inode_unlock(parent_inode);
-	return dentry;
-}
-
-static int binder_features_show(struct seq_file *m, void *unused)
-{
-	bool *feature = m->private;
-
-	seq_printf(m, "%d\n", *feature);
-
-	return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(binder_features);
-
-static int init_binder_features(struct super_block *sb)
-{
-	struct dentry *dentry, *dir;
-
-	dir = binderfs_create_dir(sb->s_root, "features");
-	if (IS_ERR(dir))
-		return PTR_ERR(dir);
-
-	dentry = binderfs_create_file(dir, "oneway_spam_detection",
-				      &binder_features_fops,
-				      &binder_features.oneway_spam_detection);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-
-	return 0;
-}
-
-static int init_binder_logs(struct super_block *sb)
-{
-	struct dentry *binder_logs_root_dir, *dentry, *proc_log_dir;
-	const struct binder_debugfs_entry *db_entry;
-	struct binderfs_info *info;
-	int ret = 0;
-
-	binder_logs_root_dir = binderfs_create_dir(sb->s_root,
-						   "binder_logs");
-	if (IS_ERR(binder_logs_root_dir)) {
-		ret = PTR_ERR(binder_logs_root_dir);
-		goto out;
-	}
-
-	binder_for_each_debugfs_entry(db_entry) {
-		dentry = binderfs_create_file(binder_logs_root_dir,
-					      db_entry->name,
-					      db_entry->fops,
-					      db_entry->data);
-		if (IS_ERR(dentry)) {
-			ret = PTR_ERR(dentry);
-			goto out;
-		}
-	}
-
-	proc_log_dir = binderfs_create_dir(binder_logs_root_dir, "proc");
-	if (IS_ERR(proc_log_dir)) {
-		ret = PTR_ERR(proc_log_dir);
-		goto out;
-	}
-	info = sb->s_fs_info;
-	info->proc_log_dir = proc_log_dir;
-
-out:
-	return ret;
-}
-
-static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc)
-{
-	int ret;
-	struct binderfs_info *info;
-	struct binderfs_mount_opts *ctx = fc->fs_private;
-	struct inode *inode = NULL;
-	struct binderfs_device device_info = {};
-	const char *name;
-	size_t len;
-
-	sb->s_blocksize = PAGE_SIZE;
-	sb->s_blocksize_bits = PAGE_SHIFT;
-
-	/*
-	 * The binderfs filesystem can be mounted by userns root in a
-	 * non-initial userns. By default such mounts have the MS_NODEV flag
-	 * set in s_iflags to prevent security issues where userns root can
-	 * just create random device nodes via mknod() since it owns the
-	 * filesystem mount. But binderfs does not allow to create any files
-	 * including devices nodes. The only way to create binder devices nodes
-	 * is through the binder-control device which userns root is explicitly
-	 * allowed to do. So removing the MS_NODEV flag from s_iflags is both
-	 * necessary and safe.
-	 */
-	sb->s_iflags &= ~MS_NODEV;
-	sb->s_iflags |= SB_I_NOEXEC;
-	sb->s_magic = BINDERFS_SUPER_MAGIC;
-	sb->s_op = &binderfs_super_ops;
-	sb->s_time_gran = 1;
-
-	sb->s_fs_info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL);
-	if (!sb->s_fs_info)
-		return -ENOMEM;
-	info = sb->s_fs_info;
-
-	info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);
-
-	info->root_gid = make_kgid(sb->s_user_ns, 0);
-	if (!gid_valid(info->root_gid))
-		info->root_gid = GLOBAL_ROOT_GID;
-	info->root_uid = make_kuid(sb->s_user_ns, 0);
-	if (!uid_valid(info->root_uid))
-		info->root_uid = GLOBAL_ROOT_UID;
-	info->mount_opts.max = ctx->max;
-	info->mount_opts.stats_mode = ctx->stats_mode;
-
-	inode = new_inode(sb);
-	if (!inode)
-		return -ENOMEM;
-
-	inode->i_ino = FIRST_INODE;
-	inode->i_fop = &simple_dir_operations;
-	inode->i_mode = S_IFDIR | 0755;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
-	inode->i_op = &binderfs_dir_inode_operations;
-	set_nlink(inode, 2);
-
-	sb->s_root = d_make_root(inode);
-	if (!sb->s_root)
-		return -ENOMEM;
-
-	ret = binderfs_binder_ctl_create(sb);
-	if (ret)
-		return ret;
-
-	name = binder_devices_param;
-	for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) {
-		strscpy(device_info.name, name, len + 1);
-		ret = binderfs_binder_device_create(inode, NULL, &device_info);
-		if (ret)
-			return ret;
-		name += len;
-		if (*name == ',')
-			name++;
-	}
-
-	ret = init_binder_features(sb);
-	if (ret)
-		return ret;
-
-	if (info->mount_opts.stats_mode == binderfs_stats_mode_global)
-		return init_binder_logs(sb);
-
-	return 0;
-}
-
-static int binderfs_fs_context_get_tree(struct fs_context *fc)
-{
-	return get_tree_nodev(fc, binderfs_fill_super);
-}
-
-static void binderfs_fs_context_free(struct fs_context *fc)
-{
-	struct binderfs_mount_opts *ctx = fc->fs_private;
-
-	kfree(ctx);
-}
-
-static const struct fs_context_operations binderfs_fs_context_ops = {
-	.free		= binderfs_fs_context_free,
-	.get_tree	= binderfs_fs_context_get_tree,
-	.parse_param	= binderfs_fs_context_parse_param,
-	.reconfigure	= binderfs_fs_context_reconfigure,
-};
-
-static int binderfs_init_fs_context(struct fs_context *fc)
-{
-	struct binderfs_mount_opts *ctx;
-
-	ctx = kzalloc(sizeof(struct binderfs_mount_opts), GFP_KERNEL);
-	if (!ctx)
-		return -ENOMEM;
-
-	ctx->max = BINDERFS_MAX_MINOR;
-	ctx->stats_mode = binderfs_stats_mode_unset;
-
-	fc->fs_private = ctx;
-	fc->ops = &binderfs_fs_context_ops;
-
-	return 0;
-}
-
-static struct file_system_type binder_fs_type = {
-	.name			= "binder",
-	.init_fs_context	= binderfs_init_fs_context,
-	.parameters		= binderfs_fs_parameters,
-	.kill_sb		= kill_litter_super,
-	.fs_flags		= FS_USERNS_MOUNT,
-};
-
-int __init init_binderfs(void)
-{
-	int ret;
-	const char *name;
-	size_t len;
-
-	/* Verify that the default binderfs device names are valid. */
-	name = binder_devices_param;
-	for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) {
-		if (len > BINDERFS_MAX_NAME)
-			return -E2BIG;
-		name += len;
-		if (*name == ',')
-			name++;
-	}
-
-	/* Allocate new major number for binderfs. */
-	ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR,
-				  "binder");
-	if (ret)
-		return ret;
-
-	ret = register_filesystem(&binder_fs_type);
-	if (ret) {
-		unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR);
-		return ret;
-	}
-
-	return ret;
-}
diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c
deleted file mode 100644
index fd718ab02392..000000000000
--- a/drivers/android/vendor_hooks.c
+++ /dev/null
@@ -1,433 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* vendor_hook.c
- *
- * Android Vendor Hook Support
- *
- * Copyright 2020 Google LLC
- */
-
-#ifndef __GENKSYMS__
-#include <uapi/linux/android/binder.h>
-#include <uapi/linux/hdreg.h>
-#include <linux/bpf.h>
-#include <linux/bpf_verifier.h>
-#include <linux/coredump.h>
-#include <linux/fileattr.h>
-#include <linux/fsverity.h>
-#include <linux/igmp.h>
-#include <linux/ipc_namespace.h>
-#include <linux/mtd/mtd.h>
-#include <linux/pagemap.h>
-#include <linux/pr.h>
-#include <linux/skmsg.h>
-#include <linux/statfs.h>
-#include <linux/time_namespace.h>
-#include <linux/timekeeper_internal.h>
-#include <linux/zlib.h>
-#include <net/ip6_fib.h>
-#include <net/ip_tunnels.h>
-#include <net/macsec.h>
-#include <net/ioam6.h>
-#include <net/netfilter/nf_flow_table.h>
-#include <net/smc.h>
-#include <net/tc_act/tc_gate.h>
-#include <../fs/mount.h>
-#include <../kernel/audit.h>
-#include <../kernel/locking/mutex.h>
-#include <../net/can/af_can.h>
-#include <../net/tipc/bearer.h>
-#include <../kernel/printk/printk_ringbuffer.h>
-#endif
-
-#define CREATE_TRACE_POINTS
-#include <trace/hooks/vendor_hooks.h>
-#include <linux/tracepoint.h>
-
-#include <trace/hooks/user.h>
-#include <trace/hooks/fpsimd.h>
-#include <trace/hooks/binder.h>
-#include <trace/hooks/futex.h>
-#include <trace/hooks/dtask.h>
-#include <trace/hooks/cpuidle.h>
-#include <trace/hooks/topology.h>
-#include <trace/hooks/mpam.h>
-#include <trace/hooks/gic.h>
-#include <trace/hooks/wqlockup.h>
-#include <trace/hooks/debug.h>
-#include <trace/hooks/sysrqcrash.h>
-#include <trace/hooks/printk.h>
-#include <trace/hooks/gic_v3.h>
-#include <trace/hooks/epoch.h>
-#include <trace/hooks/cpufreq.h>
-#include <trace/hooks/mm.h>
-#include <trace/hooks/preemptirq.h>
-#include <trace/hooks/ftrace_dump.h>
-#include <trace/hooks/ufshcd.h>
-#ifdef __GENKSYMS__
-#include <trace/hooks/block.h>
-#endif
-#include <trace/hooks/cgroup.h>
-#include <trace/hooks/sys.h>
-#include <trace/hooks/iommu.h>
-#include <trace/hooks/net.h>
-#include <trace/hooks/timer.h>
-#include <trace/hooks/pm_domain.h>
-#include <trace/hooks/cpuidle_psci.h>
-#include <trace/hooks/vmscan.h>
-#include <trace/hooks/avc.h>
-#include <trace/hooks/creds.h>
-#include <trace/hooks/memory.h>
-#include <trace/hooks/module.h>
-#ifdef __GENKSYMS__
-#include <trace/hooks/selinux.h>
-#endif
-#include <trace/hooks/syscall_check.h>
-#include <trace/hooks/logbuf.h>
-#include <trace/hooks/remoteproc.h>
-#include <trace/hooks/hung_task.h>
-#include <trace/hooks/bug.h>
-#include <trace/hooks/softlockup.h>
-#include <trace/hooks/power.h>
-#include <trace/hooks/fault.h>
-#include <trace/hooks/traps.h>
-#include <trace/hooks/fips140.h>
-#include <trace/hooks/thermal.h>
-#include <trace/hooks/rwsem.h>
-#include <trace/hooks/timekeeping.h>
-#include <trace/hooks/audio_usboffload.h>
-#include <trace/hooks/drm_framebuffer.h>
-#include <trace/hooks/drm_atomic.h>
-#include <trace/hooks/psci.h>
-#include <trace/hooks/usb.h>
-#include <trace/hooks/regmap.h>
-#include <trace/hooks/dmabuf.h>
-#include <trace/hooks/mmc.h>
-#include <trace/hooks/evdev.h>
-#include <trace/hooks/signal.h>
-#include <trace/hooks/cfg80211.h>
-#include <trace/hooks/bl_hib.h>
-#include <trace/hooks/dm.h>
-#include <trace/hooks/direct_io.h>
-#include <trace/hooks/loop.h>
-#include <trace/hooks/psi.h>
-/*
- * Export tracepoints that act as a bare tracehook (ie: have no trace event
- * associated with them) to allow external modules to probe them.
- */
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_refrigerator);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_alloc);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_free);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_alloc);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_free);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_arch_set_freq_scale);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_is_fpsimd_save);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_transaction_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_priority_skip);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_set_priority);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_restore_priority);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wakeup_ilocked);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_send_sig_info);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_futex_plist_add);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_sleep_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_futex);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wait_end);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_traverse_plist);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_this);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_up_q_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_process_killed);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_killed_process);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_can_spin_on_owner);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_opt_spin_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_opt_spin_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_can_spin_on_owner);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sched_show_task);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_enter);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_exit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mpam_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_resume);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_suspend);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_wq_lockup_pool);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ipi_stop);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sysrq_crash);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_hotplug);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller_id);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_ext_header);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_gic_v3_set_affinity);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_set_affinity);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_v3_affinity_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_suspend_epoch_val);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_resume_epoch_val);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_table_limits);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_resolve_freq);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_fast_switch);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_target);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_offline);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_skip_swapcache_flags);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_gfp_zone_flags);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_readahead_gfp_mask);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_readahead_gfp_mask);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_rmqueue_bulk);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_disable);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_enable);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_disable);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_enable);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_attach);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_can_attach);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_online);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_enter);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_exit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_size_check);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_format_check);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_dump_buffer);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_fill_prdt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_complete_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_reprogram_all_keys);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_prepare_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sysfs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_compl_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_set_task);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_syscall_prctl_finished);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_uic_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_tm_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_check_int_errors);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sdev);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_clock_scaling);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_use_mcq_hooks);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_max_tag);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_map_tag);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_set_sqid);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_handler);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_make_hba_operational);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_hba_capabilities);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_print_trs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_send_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_config);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_has_oustanding_reqs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_get_outstanding_reqs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_abort);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_clear_cmd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_clear_pending);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_retry_complete);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_attach);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_setup_dma_ops);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_alloc_iova);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_free_iova);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ptype_head);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kfree_skb);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_timer_calc_index);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_allow_domain_state);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_enter);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_exit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cgroup_force_kthread_migration);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wait_for_work);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction_entry);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_select_worklist_ilocked);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sync_txn_recvd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_topology_flags_workfn);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpufreq_transition);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_add_request);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_update_request);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_remove_request);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_balance_anon_file_reclaim);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_show_max_freq);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_handle_failed_page_trylock);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_clear);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_get_result);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_page_trylock);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_referenced_check_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drain_all_pages_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_drain_all_pages_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pcplist_add_cma_pages_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_slab_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_insert);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_delete);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_replace);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_lookup);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_commit_creds);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_exit_creds);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_override_creds);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_revert_creds);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_mutex_lock_starttime);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_rtmutex_lock_starttime);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_rwsem_lock_starttime);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_pcpu_rwsem_starttime);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_nx);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_rw);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_before_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_after_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_oom_check_panic);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_mmap_file);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_file_open);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_bpf_syscall);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf_pr_cont);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks_dn);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_meminfo_proc_show);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_exit_mm);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_slowpath);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_mem);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_print_slabinfo_header);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_shrink_slab);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cache_show);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_report_bug);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_watchdog_timer_softlockup);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo_logging);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo_unfrozen);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_die_kernel_fault);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sea);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_mem_abort);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sp_pc_abort);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_undefinstr);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_ptrauth_fault);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_panic_unhandled);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_arm64_serror_panic);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_serror);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_vmpressure);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sha256);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_expandkey);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_encrypt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_decrypt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_request_freq);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_target_freq);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_register);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_unregister);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_disable_thermal_cooling_stats);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_enable_thermal_power_throttle);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_throttle_update);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_finished);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_rwsem_list_add);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_power_cap);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_tk_based_time_sync);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kswapd_per_node);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_vendor_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_ep_action);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_synctype);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_suspend);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_connect);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_audio_usb_offload_disconnect);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_atomic_remove_fb);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drm_atomic_check_modeset);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_get_thermal_zone_device);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_psci_tos_resident_on);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_psci_cpu_suspend);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_usb_new_device_added);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_regmap_update);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_mutex_list_add);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_unlock_slowpath);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dma_buf_release);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dmabuf_heap_flags_validation);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pass_input_event);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_check_status);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmap_region);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_unmap_one);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_node_memcgs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sdio_pm_flag_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_scan_type);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_swappiness);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_partial_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_cache_card_properties);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_print_transaction_info);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_handle_tlb_conf);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_calc_decayed_watermark);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_watermark);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_reset);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_mq_rw_recovery);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sd_update_bus_speed_mode);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_attach_sd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sdhci_get_cd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_gpio_cd_irqt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_update_partition_status);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sd_update_cmdline_timing);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sd_update_dataline_timing);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_partition_status);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_sd_cmdline_timing);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_sd_dataline_timing);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cfg80211_set_context);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cfg80211_get_context);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_track_hash);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_id_remove);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_offline);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_online);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_free);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_alloc);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kmalloc_slab);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpuset_fork);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_looper_state_registered);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_thread_read);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_free_proc);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_thread_release);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_has_work_ilocked);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_read_done);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_v2_resume);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_exit_signal);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_alloc_new_buf_locked);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_reply);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_trans);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_preset);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_memcg_scan_type);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_thermal_stats);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_new_ref);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_del_ref);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_mapcount_pages);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_traversal_lruvec);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_page_mapcount);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_page_to_lrulist);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_del_page_from_lrulist);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_should_be_protected);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mark_page_accessed);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_ffu_update_cid);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_uid);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_free_user);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_cpu_get_power);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_cache_forced_ra);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reclaim_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_failure_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_acct_update_power);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rmqueue);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_inactive_ratio);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_hibernation_swap);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_cpu_resume);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_hib_resume_bdev);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_dma_buf_stats_teardown);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_madvise_cold_or_pageout);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_retry);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_encrypt_page);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_init_aes_encrypt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_skip_swap_map_write);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_post_image_save);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dm_update_clone_bio);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ctl_dirty_rate);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_direct_io_update_bio);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_loop_prepare_cmd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psi_event);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psi_group);
-/*
- * For type visibility
- */
-const struct readahead_control *GKI_struct_readahead_control;
-EXPORT_SYMBOL_GPL(GKI_struct_readahead_control);
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 6cbdf2737004..378717d1b3b4 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -377,19 +377,11 @@ static int input_get_disposition(struct input_dev *dev,
 	return disposition;
 }
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
-extern bool ksu_input_hook __read_mostly;
-extern int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code, int *value);
-#endif
 static void input_handle_event(struct input_dev *dev,
 			       unsigned int type, unsigned int code, int value)
 {
 	int disposition = input_get_disposition(dev, type, code, &value);
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	if (unlikely(ksu_input_hook))
-		ksu_handle_input_handle_event(&type, &code, &value);
-#endif
 	if (disposition != INPUT_IGNORE_EVENT && type != EV_SYN)
 		add_input_randomness(type, code, value);
 
diff --git a/drivers/input/touchscreen/fts_521/fts.c b/drivers/input/touchscreen/fts_521/fts.c
index 4722476a4c2d..e0750f28a609 100644
--- a/drivers/input/touchscreen/fts_521/fts.c
+++ b/drivers/input/touchscreen/fts_521/fts.c
@@ -2723,15 +2723,15 @@ static void fts_enter_pointer_event_handler(struct fts_ts_info *info,
 		input_report_key(info->input_dev, BTN_TOOL_FINGER, 1);
 
 	/*input_report_abs(info->input_dev, ABS_MT_TRACKING_ID, touchId); */
-		input_report_abs(info->input_dev, ABS_MT_POSITION_X, x);
-		input_report_abs(info->input_dev, ABS_MT_POSITION_Y, y);
-		input_report_abs(info->input_dev, ABS_MT_TOUCH_MAJOR, z);
-		input_report_abs(info->input_dev, ABS_MT_TOUCH_MINOR, z);
-		input_report_abs(info->input_dev, ABS_MT_DISTANCE, distance);
+	input_report_abs(info->input_dev, ABS_MT_POSITION_X, x);
+	input_report_abs(info->input_dev, ABS_MT_POSITION_Y, y);
+	input_report_abs(info->input_dev, ABS_MT_TOUCH_MAJOR, z);
+	input_report_abs(info->input_dev, ABS_MT_TOUCH_MINOR, z);
+	input_report_abs(info->input_dev, ABS_MT_DISTANCE, distance);
 #ifdef CONFIG_INPUT_PRESS_NDT
-		input_report_abs(info->input_dev, ABS_MT_PRESSURE, z);
+	input_report_abs(info->input_dev, ABS_MT_PRESSURE, z);
 #endif
-		input_sync(info->input_dev);
+	input_sync(info->input_dev);
 	/* pr_info("%s: Event 0x%02x - ID[%d], (x, y, z) = (%3d, %3d, %3d) type = %d\n",
 		 __func__, *event, touchId, x, y, z, touchType); */
 
diff --git a/drivers/kernelsu/Kbuild b/drivers/kernelsu/Kbuild
deleted file mode 100644
index 800da52d0892..000000000000
--- a/drivers/kernelsu/Kbuild
+++ /dev/null
@@ -1,26 +0,0 @@
-obj-y += ksuinit.o
-obj-y += allowlist.o
-obj-y += app_profile.o
-obj-y += apk_sign.o
-obj-y += sucompat.o
-obj-y += throne_tracker.o
-obj-y += setuid_hook.o
-obj-y += kernel_compat.o
-obj-y += kernel_umount.o
-obj-y += supercalls.o
-obj-y += feature.o
-obj-y += ksud.o
-obj-y += file_wrapper.o
-obj-y += su_mount_ns.o
-obj-y += shim.o
-obj-y += selinux/selinux.o
-obj-y += selinux/sepolicy.o
-obj-y += selinux/rules.o
-
-ccflags-y += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include
-ccflags-y += -I$(objtree)/security/selinux -include $(srctree)/include/uapi/asm-generic/errno.h
-
-ccflags-y += -Wno-strict-prototypes -Wno-int-conversion -Wno-gcc-compat
-ccflags-y += -Wno-declaration-after-statement -Wno-unused-function -Wno-missing-prototypes
-
-# Keep a new line here !! Because someone may append config
diff --git a/drivers/kernelsu/Kconfig b/drivers/kernelsu/Kconfig
index 8464a6c4ca4b..10608831444f 100644
--- a/drivers/kernelsu/Kconfig
+++ b/drivers/kernelsu/Kconfig
@@ -1,48 +1,77 @@
 menu "KernelSU"
 
 config KSU
-	tristate "KernelSU function support"
-	default y
+	bool "KernelSU function support"
+	depends on !CPU_BIG_ENDIAN
+	depends on SECURITY_SELINUX
+	select SECCOMP
+	default n
 	help
 	  Enable kernel-level root privileges on Android System.
-	  To compile as a module, choose M here: the
-	  module will be called kernelsu.
 
-config KSU_DEBUG
-	bool "KernelSU debug mode"
-	depends on KSU
+config KSU_KPROBES_KSUD
+	bool "Enable dynamic kprobes for early boot hooks"
+	depends on KSU && KPROBES && KRETPROBES
+	default y
+	help
+	  Use dynamic hooks via kprobes for functions only
+	  on early boot. Hooks are unregistered at boot complete
+	  to reduce overhead.
+
+config KSU_TAMPER_SYSCALL_TABLE
+	bool "EXPERIMENTAL: tamper sys_call_table for sucompat + sys_reboot"
+	depends on KSU && (ARM || ARM64) && !CFI_CLANG && !CFI
 	default n
 	help
-	  Enable KernelSU debug mode.
+	  EXPERIMENTAL: use syscall table hijacking method demonstrated on zx2c4's
+	  kernel-assisted-superuser. Replaces sys_reboot, sys_execve, sys_newfstatat,
+	  sys_faccessat, sys_newfstat_ret manual hooks.
+	  Personally tested on Linux 3.10 ~ 4.14, aarch64.
 
-config KSU_ALLOWLIST_WORKAROUND
-	bool "KernelSU allowlist workaround"
+config KSU_FEATURE_SULOG
+	bool "KernelSU SU Logging feature"
 	depends on KSU
-	default n
+	default y
 	help
-	  Enable workaround for broken allowlist save
+	  Build KernelSU's SU Log.
 
-choice
-	prompt "KernelSU hooks"
-	default KSU_MANUAL_HOOK if !KPROBES
-	default KSU_SYSCALL_HOOK if KPROBES && KRETPROBES && HAVE_SYSCALL_TRACEPOINTS
+config KSU_FEATURE_ADBROOT
+	bool "KernelSU ADB Root feature"
+	depends on KSU
+	default y
 	help
-	  KernelSU core hooks.
+	  Build KernelSU's adb root feature.
 
-config KSU_MANUAL_HOOK
-	bool "KernelSU manual hook mode."
-	depends on KSU && KSU != m
+config KSU_FEATURE_SELINUX_HIDE
+	bool "KernelSU SELinux hide feature"
+	depends on KSU
+	default y
 	help
-	  Enable manual hook support.
+	  Build KernelSU's SELinux hide feature.
+	  This is a dumber implementation, but it should be fine for most cases.
 
-config KSU_SYSCALL_HOOK
-	bool "KernelSU syscall hook mode."
+config KSU_DEBUG
+	bool "KernelSU debug mode"
 	depends on KSU
-	depends on KPROBES && KRETPROBES && HAVE_SYSCALL_TRACEPOINTS
+	default n
 	help
-	  Enable KPROBES, KRETPROBES and TRACEPOINT hook for KernelSU core.
-	  This should not be used on kernel below 5.10.
+	  Enable KernelSU debug mode.
 
-endchoice
+config KSU_THRONE_TRACKER_ALWAYS_THREADED
+	bool "Always run throne tracker in a kthread"
+	depends on KSU
+	help
+	  Enable this option to run throne tracker in a kthread for the first
+	  run, which happens at boot time / decryption stage. This can decrease
+	  boot time, but can cause crowning failure on some FDE/FBEv1 setups.
+	  If unsure, say n.
+
+config KSU_LSM_SECURITY_HOOKS
+	bool "Use LSM security hooks"
+	depends on KSU
+	default y
+	help
+	  Disabling this is mostly useful for kernel > 6.8.
+	  Make sure to implement manual hooks on security/security.c.
 
 endmenu
diff --git a/drivers/kernelsu/Makefile b/drivers/kernelsu/Makefile
new file mode 100644
index 000000000000..7c2fcedc7eac
--- /dev/null
+++ b/drivers/kernelsu/Makefile
@@ -0,0 +1,78 @@
+# NOTE: unity build. single unit.
+
+obj-$(CONFIG_KSU) := ksu.o
+
+CFLAGS_ksu.o += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include
+CFLAGS_ksu.o += -I$(objtree)/security/selinux
+
+# uncommon, but won't hurt: check for the 3-arg security_add_hooks
+ifeq ($(shell grep -A1 "void security_add_hooks" $(srctree)/include/linux/lsm_hooks.h 2>/dev/null | grep -q lsm 2>/dev/null; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_SECURITY_ADD_HOOKS_V2
+endif
+
+ifeq ($(shell grep -q " current_sid(void)" $(srctree)/security/selinux/include/objsec.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_HAS_CURRENT_SID
+endif
+
+ifeq ($(shell grep -q "struct selinux_state " $(srctree)/security/selinux/include/security.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_HAS_SELINUX_STATE
+endif
+
+ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT
+endif
+
+# half-assed-backport from 5.1
+ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct_array;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT_ARRAY
+endif
+
+ifeq ($(shell grep -q "^DEFINE_RWLOCK(policy_rwlock);" $(srctree)/security/selinux/ss/services.c; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK
+endif
+
+ifeq ($(shell grep -q "cpus_ptr;" $(srctree)/include/linux/sched.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_HAS_BACKPORTED_CPUS_PTR
+endif
+
+ifeq ($(shell grep -q "^struct security_operations selinux_ops" $(srctree)/security/selinux/hooks.c; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_HAS_EXPORTED_SELINUX_OPS
+endif
+
+# UL, look for read_iter on f_op struct
+ifeq ($(shell grep -q "read_iter" $(srctree)/include/linux/fs.h 2>/dev/null; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_HAS_FOP_READ_ITER
+endif
+
+# UL, look for iterate_dir on fs/readdir.c
+ifeq ($(shell grep -q "^int iterate_dir" $(srctree)/fs/readdir.c 2>/dev/null; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_HAS_ITERATE_DIR
+endif
+
+CFLAGS_ksu.o += -Wno-implicit-function-declaration -Wno-strict-prototypes -Wno-declaration-after-statement
+CFLAGS_ksu.o += -Wno-int-conversion -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast
+CFLAGS_ksu.o += -Wno-unused-variable -Wno-unused-function -Wno-format
+CFLAGS_ksu.o += -Wno-macro-redefined
+
+# don't be too strict
+CFLAGS_REMOVE_ksu.o += -Werror
+
+# so we can at least see stack usage, as we disable all stack safety here
+CFLAGS_ksu.o += $(call cc-option, -Wframe-larger-than=1024)
+
+# to make sure we can use builtins
+CFLAGS_REMOVE_ksu.o += -fno-builtin
+
+ifneq ($(CONFIG_KSU_DEBUG),y)
+# strip, remove tracing / profiling; use += so earlier CFLAGS_REMOVE entries are kept
+# comment out if proper backtrace is needed
+CFLAGS_ksu.o += -g0 -fno-unwind-tables -fno-asynchronous-unwind-tables -fomit-frame-pointer
+CFLAGS_REMOVE_ksu.o += $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_ksu.o += -pg
+
+# if cflags can be macro'd, this will be called 'TRUST_ME'
+CFLAGS_ksu.o += -fno-stack-protector -fno-stack-check
+CFLAGS_REMOVE_ksu.o += -fsanitize=shadow-call-stack
+endif # CONFIG_KSU_DEBUG
+
+# Keep a new line here!! Because someone may append config
diff --git a/drivers/kernelsu/allowlist.c b/drivers/kernelsu/allowlist.c
deleted file mode 100644
index 9152b7174b6c..000000000000
--- a/drivers/kernelsu/allowlist.c
+++ /dev/null
@@ -1,576 +0,0 @@
-#include <linux/mutex.h>
-#include <linux/task_work.h>
-#include <linux/capability.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/printk.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/version.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-#include <linux/sched/task.h>
-#else
-#include <linux/sched.h>
-#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
-#include <linux/compiler_types.h>
-#endif
-
-#include "klog.h" // IWYU pragma: keep
-#include "ksud.h"
-#include "selinux/selinux.h"
-#include "allowlist.h"
-#include "manager.h"
-#include "kernel_compat.h"
-#include "su_mount_ns.h"
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#include "syscall_handler.h"
-#endif
-
-#define FILE_MAGIC 0x7f4b5355 // ' KSU', u32
-#define FILE_FORMAT_VERSION 3 // u32
-
-#define KSU_APP_PROFILE_PRESERVE_UID 9999 // NOBODY_UID
-#define KSU_DEFAULT_SELINUX_DOMAIN "u:r:" KERNEL_SU_DOMAIN ":s0"
-
-static DEFINE_MUTEX(allowlist_mutex);
-
-// default profiles, these may be used frequently, so we cache it
-static struct root_profile default_root_profile;
-static struct non_root_profile default_non_root_profile;
-
-static int allow_list_arr[PAGE_SIZE / sizeof(int)] __read_mostly
-	__aligned(PAGE_SIZE);
-static int allow_list_pointer __read_mostly = 0;
-
-static void remove_uid_from_arr(uid_t uid)
-{
-	int *temp_arr;
-	int i, j;
-
-	if (allow_list_pointer == 0)
-		return;
-
-	temp_arr = kzalloc(sizeof(allow_list_arr), GFP_KERNEL);
-	if (temp_arr == NULL) {
-		pr_err("%s: unable to allocate memory\n", __func__);
-		return;
-	}
-
-	for (i = j = 0; i < allow_list_pointer; i++) {
-		if (allow_list_arr[i] == uid)
-			continue;
-		temp_arr[j++] = allow_list_arr[i];
-	}
-
-	allow_list_pointer = j;
-
-	for (; j < ARRAY_SIZE(allow_list_arr); j++)
-		temp_arr[j] = -1;
-
-	memcpy(&allow_list_arr, temp_arr, PAGE_SIZE);
-	kfree(temp_arr);
-}
-
-static void init_default_profiles(void)
-{
-	kernel_cap_t full_cap = CAP_FULL_SET;
-
-	default_root_profile.uid = 0;
-	default_root_profile.gid = 0;
-	default_root_profile.groups_count = 1;
-	default_root_profile.groups[0] = 0;
-	memcpy(&default_root_profile.capabilities.effective, &full_cap,
-	       sizeof(default_root_profile.capabilities.effective));
-	default_root_profile.namespaces = KSU_NS_INHERITED;
-	strcpy(default_root_profile.selinux_domain, KSU_DEFAULT_SELINUX_DOMAIN);
-
-	// This means that we will umount modules by default!
-	default_non_root_profile.umount_modules = true;
-}
-
-struct perm_data {
-	struct list_head list;
-	struct app_profile profile;
-};
-
-static struct list_head allow_list;
-
-static uint8_t allow_list_bitmap[PAGE_SIZE] __read_mostly __aligned(PAGE_SIZE);
-#define BITMAP_UID_MAX ((sizeof(allow_list_bitmap) * BITS_PER_BYTE) - 1)
-
-#define KERNEL_SU_ALLOWLIST "/data/adb/ksu/.allowlist"
-
-void persistent_allow_list(void);
-
-void ksu_show_allow_list(void)
-{
-	struct perm_data *p = NULL;
-	struct list_head *pos = NULL;
-	pr_info("ksu_show_allow_list\n");
-	list_for_each (pos, &allow_list) {
-		p = list_entry(pos, struct perm_data, list);
-		pr_info("uid :%d, allow: %d\n", p->profile.current_uid,
-			p->profile.allow_su);
-	}
-}
-
-#ifdef CONFIG_KSU_DEBUG
-static void ksu_grant_root_to_shell(void)
-{
-	struct app_profile profile = {
-		.version = KSU_APP_PROFILE_VER,
-		.allow_su = true,
-		.current_uid = 2000,
-	};
-	strcpy(profile.key, "com.android.shell");
-	strcpy(profile.rp_config.profile.selinux_domain,
-	       KSU_DEFAULT_SELINUX_DOMAIN);
-	ksu_set_app_profile(&profile, false);
-}
-#endif
-
-bool ksu_get_app_profile(struct app_profile *profile)
-{
-	struct perm_data *p = NULL;
-	struct list_head *pos = NULL;
-	bool found = false;
-
-	list_for_each (pos, &allow_list) {
-		p = list_entry(pos, struct perm_data, list);
-		bool uid_match = profile->current_uid == p->profile.current_uid;
-		if (uid_match) {
-			// found it, override it with ours
-			memcpy(profile, &p->profile, sizeof(*profile));
-			found = true;
-			goto exit;
-		}
-	}
-
-exit:
-	return found;
-}
-
-static inline bool forbid_system_uid(uid_t uid)
-{
-#define SHELL_UID 2000
-#define SYSTEM_UID 1000
-	return uid < SHELL_UID && uid != SYSTEM_UID;
-}
-
-static bool profile_valid(struct app_profile *profile)
-{
-	if (!profile) {
-		return false;
-	}
-
-	if (profile->version < KSU_APP_PROFILE_VER) {
-		pr_info("Unsupported profile version: %d\n", profile->version);
-		return false;
-	}
-
-	if (profile->allow_su) {
-		if (profile->rp_config.profile.groups_count > KSU_MAX_GROUPS) {
-			return false;
-		}
-
-		if (strlen(profile->rp_config.profile.selinux_domain) == 0) {
-			return false;
-		}
-	}
-
-	return true;
-}
-
-bool ksu_set_app_profile(struct app_profile *profile, bool persist)
-{
-	struct perm_data *p = NULL;
-	struct list_head *pos = NULL;
-	bool result = false;
-
-	if (!profile_valid(profile)) {
-		pr_err("Failed to set app profile: invalid profile!\n");
-		return false;
-	}
-
-	list_for_each (pos, &allow_list) {
-		p = list_entry(pos, struct perm_data, list);
-		// both uid and package must match, otherwise it will break multiple package with different user id
-		if (profile->current_uid == p->profile.current_uid &&
-		    !strcmp(profile->key, p->profile.key)) {
-			// found it, just override it all!
-			memcpy(&p->profile, profile, sizeof(*profile));
-			result = true;
-			goto out;
-		}
-	}
-
-	// not found, alloc a new node!
-	p = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL);
-	if (!p) {
-		pr_err("ksu_set_app_profile alloc failed\n");
-		return false;
-	}
-
-	memcpy(&p->profile, profile, sizeof(*profile));
-	if (profile->allow_su) {
-		pr_info("set root profile, key: %s, uid: %d, gid: %d, context: %s\n",
-			profile->key, profile->current_uid,
-			profile->rp_config.profile.gid,
-			profile->rp_config.profile.selinux_domain);
-	} else {
-		pr_info("set app profile, key: %s, uid: %d, umount modules: %d\n",
-			profile->key, profile->current_uid,
-			profile->nrp_config.profile.umount_modules);
-	}
-	list_add_tail(&p->list, &allow_list);
-
-out:
-	if (profile->current_uid <= BITMAP_UID_MAX) {
-		if (profile->allow_su)
-			allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] |=
-				1 << (profile->current_uid % BITS_PER_BYTE);
-		else
-			allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] &=
-				~(1 << (profile->current_uid % BITS_PER_BYTE));
-	} else {
-		if (profile->allow_su) {
-			/*
-             * 1024 apps with uid higher than BITMAP_UID_MAX
-             * registered to request superuser?
-             */
-			if (allow_list_pointer >= ARRAY_SIZE(allow_list_arr)) {
-				pr_err("too many apps registered\n");
-				WARN_ON(1);
-				return false;
-			}
-			allow_list_arr[allow_list_pointer++] =
-				profile->current_uid;
-		} else {
-			remove_uid_from_arr(profile->current_uid);
-		}
-	}
-	result = true;
-
-	// check if the default profiles is changed, cache it to a single struct to accelerate access.
-	if (unlikely(!strcmp(profile->key, "$"))) {
-		// set default non root profile
-		memcpy(&default_non_root_profile, &profile->nrp_config.profile,
-		       sizeof(default_non_root_profile));
-	}
-
-	if (unlikely(!strcmp(profile->key, "#"))) {
-		// set default root profile
-		memcpy(&default_root_profile, &profile->rp_config.profile,
-		       sizeof(default_root_profile));
-	}
-
-	if (persist) {
-		persistent_allow_list();
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-		// FIXME: use a new flag
-		ksu_mark_running_process();
-#endif
-	}
-
-	return result;
-}
-
-bool __ksu_is_allow_uid(uid_t uid)
-{
-	int i;
-
-	if (forbid_system_uid(uid)) {
-		// do not bother going through the list if it's system
-		return false;
-	}
-
-	if (likely(ksu_is_manager_appid_valid()) &&
-	    unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) {
-		// manager is always allowed!
-		return true;
-	}
-
-	if (likely(uid <= BITMAP_UID_MAX)) {
-		return !!(allow_list_bitmap[uid / BITS_PER_BYTE] &
-			  (1 << (uid % BITS_PER_BYTE)));
-	} else {
-		for (i = 0; i < allow_list_pointer; i++) {
-			if (allow_list_arr[i] == uid)
-				return true;
-		}
-	}
-
-	return false;
-}
-
-bool __ksu_is_allow_uid_for_current(uid_t uid)
-{
-	if (unlikely(uid == 0)) {
-		// already root, but only allow our domain.
-		return is_ksu_domain();
-	}
-	return __ksu_is_allow_uid(uid);
-}
-
-bool ksu_uid_should_umount(uid_t uid)
-{
-	struct app_profile profile = { .current_uid = uid };
-
-	if (likely(ksu_is_manager_appid_valid()) &&
-	    unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) {
-		// we should not umount on manager!
-		return false;
-	}
-
-	bool found = ksu_get_app_profile(&profile);
-	if (!found) {
-		// no app profile found, it must be non root app
-		return default_non_root_profile.umount_modules;
-	}
-	if (profile.allow_su) {
-		// if found and it is granted to su, we shouldn't umount for it
-		return false;
-	} else {
-		// found an app profile
-		if (profile.nrp_config.use_default) {
-			return default_non_root_profile.umount_modules;
-		} else {
-			return profile.nrp_config.profile.umount_modules;
-		}
-	}
-}
-
-struct root_profile *ksu_get_root_profile(uid_t uid)
-{
-	struct perm_data *p = NULL;
-	struct list_head *pos = NULL;
-
-	list_for_each (pos, &allow_list) {
-		p = list_entry(pos, struct perm_data, list);
-		if (uid == p->profile.current_uid && p->profile.allow_su) {
-			if (!p->profile.rp_config.use_default) {
-				return &p->profile.rp_config.profile;
-			}
-		}
-	}
-
-	// use default profile
-	return &default_root_profile;
-}
-
-bool ksu_get_allow_list(int *array, int *length, bool allow)
-{
-	struct perm_data *p = NULL;
-	struct list_head *pos = NULL;
-	int i = 0;
-	list_for_each (pos, &allow_list) {
-		p = list_entry(pos, struct perm_data, list);
-		// pr_info("get_allow_list uid: %d allow: %d\n", p->uid, p->allow);
-		if (p->profile.allow_su == allow) {
-			array[i++] = p->profile.current_uid;
-		}
-	}
-	*length = i;
-
-	return true;
-}
-
-static void do_persistent_allow_list(struct callback_head *_cb)
-{
-	u32 magic = FILE_MAGIC;
-	u32 version = FILE_FORMAT_VERSION;
-	struct perm_data *p = NULL;
-	struct list_head *pos = NULL;
-	loff_t off = 0;
-
-	mutex_lock(&allowlist_mutex);
-	struct file *fp = ksu_filp_open_compat(
-		KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-	if (IS_ERR(fp)) {
-		pr_err("save_allow_list create file failed: %ld\n",
-		       PTR_ERR(fp));
-		goto unlock;
-	}
-
-	// store magic and version
-	if (ksu_kernel_write_compat(fp, &magic, sizeof(magic), &off) !=
-	    sizeof(magic)) {
-		pr_err("save_allow_list write magic failed.\n");
-		goto close_file;
-	}
-
-	if (ksu_kernel_write_compat(fp, &version, sizeof(version), &off) !=
-	    sizeof(version)) {
-		pr_err("save_allow_list write version failed.\n");
-		goto close_file;
-	}
-
-	list_for_each (pos, &allow_list) {
-		p = list_entry(pos, struct perm_data, list);
-		pr_info("save allow list, name: %s uid :%d, allow: %d\n",
-			p->profile.key, p->profile.current_uid,
-			p->profile.allow_su);
-
-		ksu_kernel_write_compat(fp, &p->profile, sizeof(p->profile),
-					&off);
-	}
-
-close_file:
-	filp_close(fp, 0);
-unlock:
-	mutex_unlock(&allowlist_mutex);
-	kfree(_cb);
-}
-
-void persistent_allow_list(void)
-{
-	struct task_struct *tsk;
-
-	tsk = get_pid_task(find_vpid(1), PIDTYPE_PID);
-	if (!tsk) {
-		pr_err("save_allow_list find init task err\n");
-		return;
-	}
-
-	struct callback_head *cb =
-		kzalloc(sizeof(struct callback_head), GFP_KERNEL);
-	if (!cb) {
-		pr_err("save_allow_list alloc cb err\b");
-		goto put_task;
-	}
-	cb->func = do_persistent_allow_list;
-	if (task_work_add(tsk, cb, TWA_RESUME)) {
-		kfree(cb);
-		pr_warn("save_allow_list add task_work failed\n");
-	}
-
-put_task:
-	put_task_struct(tsk);
-}
-
-void ksu_load_allow_list(void)
-{
-	loff_t off = 0;
-	ssize_t ret = 0;
-	struct file *fp = NULL;
-	u32 magic;
-	u32 version;
-
-#ifdef CONFIG_KSU_DEBUG
-	// always allow adb shell by default
-	ksu_grant_root_to_shell();
-#endif
-
-	// load allowlist now!
-	fp = ksu_filp_open_compat(KERNEL_SU_ALLOWLIST, O_RDONLY, 0);
-	if (IS_ERR(fp)) {
-		pr_err("load_allow_list open file failed: %ld\n", PTR_ERR(fp));
-		return;
-	}
-
-	// verify magic
-	if (ksu_kernel_read_compat(fp, &magic, sizeof(magic), &off) !=
-		    sizeof(magic) ||
-	    magic != FILE_MAGIC) {
-		pr_err("allowlist file invalid: %d!\n", magic);
-		goto exit;
-	}
-
-	if (ksu_kernel_read_compat(fp, &version, sizeof(version), &off) !=
-	    sizeof(version)) {
-		pr_err("allowlist read version: %d failed\n", version);
-		goto exit;
-	}
-
-	pr_info("allowlist version: %d\n", version);
-
-	while (true) {
-		struct app_profile profile;
-
-		ret = ksu_kernel_read_compat(fp, &profile, sizeof(profile),
-					     &off);
-
-		if (ret <= 0) {
-			pr_info("load_allow_list read err: %zd\n", ret);
-			break;
-		}
-
-		pr_info("load_allow_uid, name: %s, uid: %d, allow: %d\n",
-			profile.key, profile.current_uid, profile.allow_su);
-		ksu_set_app_profile(&profile, false);
-	}
-
-exit:
-	ksu_show_allow_list();
-	filp_close(fp, 0);
-}
-
-void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *),
-			 void *data)
-{
-	struct perm_data *np, *n = NULL;
-
-	if (!ksu_boot_completed) {
-		pr_info("boot not completed, skip prune\n");
-		return;
-	}
-
-	bool modified = false;
-	// TODO: use RCU!
-	mutex_lock(&allowlist_mutex);
-	list_for_each_entry_safe (np, n, &allow_list, list) {
-		uid_t uid = np->profile.current_uid;
-		char *package = np->profile.key;
-		// we use this uid for special cases, don't prune it!
-		bool is_preserved_uid = uid == KSU_APP_PROFILE_PRESERVE_UID;
-		if (!is_preserved_uid && !is_uid_valid(uid, package, data)) {
-			modified = true;
-			pr_info("prune uid: %d, package: %s\n", uid, package);
-			list_del(&np->list);
-			if (likely(uid <= BITMAP_UID_MAX)) {
-				allow_list_bitmap[uid / BITS_PER_BYTE] &=
-					~(1 << (uid % BITS_PER_BYTE));
-			}
-			remove_uid_from_arr(uid);
-			smp_mb();
-			kfree(np);
-		}
-	}
-	mutex_unlock(&allowlist_mutex);
-
-	if (modified) {
-		persistent_allow_list();
-	}
-}
-
-void ksu_allowlist_init(void)
-{
-	int i;
-
-	BUILD_BUG_ON(sizeof(allow_list_bitmap) != PAGE_SIZE);
-	BUILD_BUG_ON(sizeof(allow_list_arr) != PAGE_SIZE);
-
-	for (i = 0; i < ARRAY_SIZE(allow_list_arr); i++)
-		allow_list_arr[i] = -1;
-
-	INIT_LIST_HEAD(&allow_list);
-
-	init_default_profiles();
-}
-
-void ksu_allowlist_exit(void)
-{
-	struct perm_data *np, *n = NULL;
-
-	// free allowlist
-	mutex_lock(&allowlist_mutex);
-	list_for_each_entry_safe (np, n, &allow_list, list) {
-		list_del(&np->list);
-		kfree(np);
-	}
-	mutex_unlock(&allowlist_mutex);
-}
diff --git a/drivers/kernelsu/app_profile.c b/drivers/kernelsu/app_profile.c
deleted file mode 100644
index 4d2f333ebffd..000000000000
--- a/drivers/kernelsu/app_profile.c
+++ /dev/null
@@ -1,206 +0,0 @@
-#include <linux/version.h>
-#include <linux/capability.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <linux/fdtable.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
-#include <linux/sched/signal.h> // signal_struct
-#include <linux/sched/task.h>
-#endif
-#include <linux/sched.h>
-#include <linux/seccomp.h>
-#include <linux/slab.h>
-#include <linux/thread_info.h>
-#include <linux/uidgid.h>
-
-#include "allowlist.h"
-#include "app_profile.h"
-#include "arch.h"
-#include "kernel_compat.h"
-#include "klog.h" // IWYU pragma: keep
-#include "selinux/selinux.h"
-#include "su_mount_ns.h"
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#include "syscall_handler.h"
-#endif
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 7, 0)
-static struct group_info root_groups = {
-	.usage = REFCOUNT_INIT(2),
-};
-#else
-static struct group_info root_groups = { .usage = ATOMIC_INIT(2) };
-#endif
-
-void setup_groups(struct root_profile *profile, struct cred *cred)
-{
-	if (profile->groups_count > KSU_MAX_GROUPS) {
-		pr_warn("Failed to setgroups, too large group: %d!\n",
-			profile->uid);
-		return;
-	}
-
-	if (profile->groups_count == 1 && profile->groups[0] == 0) {
-		// setgroup to root and return early.
-		if (cred->group_info)
-			put_group_info(cred->group_info);
-		cred->group_info = get_group_info(&root_groups);
-		return;
-	}
-
-	u32 ngroups = profile->groups_count;
-	struct group_info *group_info = groups_alloc(ngroups);
-	if (!group_info) {
-		pr_warn("Failed to setgroups, ENOMEM for: %d\n", profile->uid);
-		return;
-	}
-
-	int i;
-	for (i = 0; i < ngroups; i++) {
-		gid_t gid = profile->groups[i];
-		kgid_t kgid = make_kgid(current_user_ns(), gid);
-		if (!gid_valid(kgid)) {
-			pr_warn("Failed to setgroups, invalid gid: %d\n", gid);
-			put_group_info(group_info);
-			return;
-		}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
-		group_info->gid[i] = kgid;
-#else
-		GROUP_AT(group_info, i) = kgid;
-#endif
-	}
-
-	groups_sort(group_info);
-	set_groups(cred, group_info);
-	put_group_info(group_info);
-}
-
-static void do_disable_seccomp(void)
-{
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
-	struct task_struct *fake;
-	fake = kmalloc(sizeof(*fake), GFP_ATOMIC);
-	if (!fake) {
-		pr_err("%s: cannot allocate fake struct!\n", __func__);
-		return;
-	}
-#endif
-
-	// Refer to kernel/seccomp.c: seccomp_set_mode_strict
-	// When disabling Seccomp, ensure that current->sighand->siglock is held during the operation.
-	spin_lock_irq(&current->sighand->siglock);
-	// disable seccomp
-#if defined(CONFIG_GENERIC_ENTRY) &&                                           \
-	LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
-	clear_syscall_work(SECCOMP);
-#else
-	clear_thread_flag(TIF_SECCOMP);
-#endif
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
-	memcpy(fake, current, sizeof(*fake));
-#endif
-	current->seccomp.mode = 0;
-#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0)
-	// put_seccomp_filter is allowed while we holding sighand
-	put_seccomp_filter(current);
-#endif
-	current->seccomp.filter = NULL;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 7, 0)
-	atomic_set(&current->seccomp.filter_count, 0);
-#endif
-	spin_unlock_irq(&current->sighand->siglock);
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0)
-	// https://github.com/torvalds/linux/commit/bfafe5efa9754ebc991750da0bcca2a6694f3ed3#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R576-R577
-	fake->flags |= PF_EXITING;
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
-	// https://github.com/torvalds/linux/commit/0d8315dddd2899f519fe1ca3d4d5cdaf44ea421e#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R556-R558
-	fake->sighand = NULL;
-#endif
-	seccomp_filter_release(fake);
-	kfree(fake);
-#endif
-}
-
-void disable_seccomp(void)
-{
-	// https://github.com/backslashxx/KernelSU/tree/e28930645e764b9f0e5d0d1b0d5e236464939075/kernel/app_profile.c
-	if (!!!current->seccomp.mode) {
-		return;
-	}
-
-	do_disable_seccomp();
-}
-
-void escape_with_root_profile(void)
-{
-	struct cred *cred;
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	struct task_struct *t;
-#endif
-
-	if (current_euid().val == 0) {
-		pr_warn("Already root, don't escape!\n");
-		return;
-	}
-
-	cred = prepare_creds();
-	if (!cred) {
-		pr_warn("prepare_creds failed!\n");
-		return;
-	}
-
-	struct root_profile *profile = ksu_get_root_profile(cred->uid.val);
-
-	cred->uid.val = profile->uid;
-	cred->suid.val = profile->uid;
-	cred->euid.val = profile->uid;
-	cred->fsuid.val = profile->uid;
-
-	cred->gid.val = profile->gid;
-	cred->fsgid.val = profile->gid;
-	cred->sgid.val = profile->gid;
-	cred->egid.val = profile->gid;
-	cred->securebits = 0;
-
-	BUILD_BUG_ON(sizeof(profile->capabilities.effective) !=
-		     sizeof(kernel_cap_t));
-
-	// setup capabilities
-	// we need CAP_DAC_READ_SEARCH becuase `/data/adb/ksud` is not accessible for non root process
-	// we add it here but don't add it to cap_inhertiable, it would be dropped automaticly after exec!
-	u64 cap_for_ksud =
-		profile->capabilities.effective | CAP_DAC_READ_SEARCH;
-	memcpy(&cred->cap_effective, &cap_for_ksud,
-	       sizeof(cred->cap_effective));
-	memcpy(&cred->cap_permitted, &profile->capabilities.effective,
-	       sizeof(cred->cap_permitted));
-	memcpy(&cred->cap_bset, &profile->capabilities.effective,
-	       sizeof(cred->cap_bset));
-
-	setup_groups(profile, cred);
-
-	commit_creds(cred);
-
-	disable_seccomp();
-
-	setup_selinux(profile->selinux_domain);
-
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	for_each_thread (current, t) {
-		ksu_set_task_tracepoint_flag(t);
-	}
-#endif
-
-	setup_mount_ns(profile->namespaces);
-}
-
-void escape_to_root_for_init(void)
-{
-	setup_selinux(KERNEL_SU_CONTEXT);
-}
diff --git a/drivers/kernelsu/app_profile.h b/drivers/kernelsu/app_profile.h
deleted file mode 100644
index 1263509c2f5e..000000000000
--- a/drivers/kernelsu/app_profile.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef __KSU_H_APP_PROFILE
-#define __KSU_H_APP_PROFILE
-
-#include <linux/types.h>
-
-// Forward declarations
-struct cred;
-
-#define KSU_APP_PROFILE_VER 2
-#define KSU_MAX_PACKAGE_NAME 256
-// NGROUPS_MAX for Linux is 65535 generally, but we only supports 32 groups.
-#define KSU_MAX_GROUPS 32
-#define KSU_SELINUX_DOMAIN 64
-
-struct root_profile {
-	int32_t uid;
-	int32_t gid;
-
-	int32_t groups_count;
-	int32_t groups[KSU_MAX_GROUPS];
-
-	// kernel_cap_t is u32[2] for capabilities v3
-	struct {
-		u64 effective;
-		u64 permitted;
-		u64 inheritable;
-	} capabilities;
-
-	char selinux_domain[KSU_SELINUX_DOMAIN];
-
-	int32_t namespaces;
-};
-
-struct non_root_profile {
-	bool umount_modules;
-};
-
-struct app_profile {
-	// It may be utilized for backward compatibility, although we have never explicitly made any promises regarding this.
-	u32 version;
-
-	// this is usually the package of the app, but can be other value for special apps
-	char key[KSU_MAX_PACKAGE_NAME];
-	int32_t current_uid;
-	bool allow_su;
-
-	union {
-		struct {
-			bool use_default;
-			char template_name[KSU_MAX_PACKAGE_NAME];
-
-			struct root_profile profile;
-		} rp_config;
-
-		struct {
-			bool use_default;
-
-			struct non_root_profile profile;
-		} nrp_config;
-	};
-};
-
-// Escalate current process to root with the appropriate profile
-void escape_with_root_profile(void);
-
-void escape_to_root_for_init(void);
-
-#endif
diff --git a/drivers/kernelsu/arch.h b/drivers/kernelsu/arch.h
deleted file mode 100644
index b1c79a8c9985..000000000000
--- a/drivers/kernelsu/arch.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef __KSU_H_ARCH
-#define __KSU_H_ARCH
-
-#include <linux/version.h>
-
-#if defined(__aarch64__)
-
-#define __PT_PARM1_REG regs[0]
-#define __PT_PARM2_REG regs[1]
-#define __PT_PARM3_REG regs[2]
-#define __PT_SYSCALL_PARM4_REG regs[3]
-#define __PT_CCALL_PARM4_REG regs[3]
-#define __PT_PARM5_REG regs[4]
-#define __PT_PARM6_REG regs[5]
-#define __PT_RET_REG regs[30]
-#define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */
-#define __PT_RC_REG regs[0]
-#define __PT_SP_REG sp
-#define __PT_IP_REG pc
-
-#define REBOOT_SYMBOL "__arm64_sys_reboot"
-#define SYS_READ_SYMBOL "__arm64_sys_read"
-#define SYS_EXECVE_SYMBOL "__arm64_sys_execve"
-
-#elif defined(__x86_64__)
-
-#define __PT_PARM1_REG di
-#define __PT_PARM2_REG si
-#define __PT_PARM3_REG dx
-/* syscall uses r10 for PARM4 */
-#define __PT_SYSCALL_PARM4_REG r10
-#define __PT_CCALL_PARM4_REG cx
-#define __PT_PARM5_REG r8
-#define __PT_PARM6_REG r9
-#define __PT_RET_REG sp
-#define __PT_FP_REG bp
-#define __PT_RC_REG ax
-#define __PT_SP_REG sp
-#define __PT_IP_REG ip
-
-#define REBOOT_SYMBOL "__x64_sys_reboot"
-#define SYS_READ_SYMBOL "__x64_sys_read"
-#define SYS_EXECVE_SYMBOL "__x64_sys_execve"
-
-#else
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#error "Unsupported arch"
-#endif
-#endif
-
-/* allow some architecutres to override `struct pt_regs` */
-#ifndef __PT_REGS_CAST
-#define __PT_REGS_CAST(x) (x)
-#endif
-
-#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG)
-#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG)
-#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG)
-#define PT_REGS_SYSCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_SYSCALL_PARM4_REG)
-#define PT_REGS_CCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_CCALL_PARM4_REG)
-#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG)
-#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG)
-#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG)
-#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG)
-#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG)
-#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG)
-#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG)
-
-#define PT_REAL_REGS(regs) ((struct pt_regs *)PT_REGS_PARM1(regs))
-
-#endif
diff --git a/drivers/kernelsu/feature/adb_root.c b/drivers/kernelsu/feature/adb_root.c
new file mode 100644
index 000000000000..125d0470e75b
--- /dev/null
+++ b/drivers/kernelsu/feature/adb_root.c
@@ -0,0 +1,289 @@
+#ifdef CONFIG_KSU_FEATURE_ADBROOT
+
+static bool ksu_adb_root __read_mostly = false;
+
+static long is_exec_adbd(const char __user **filename_user)
+{
+	// should be bigger than `/apex/com.android.adbd/bin/adbd`
+	char buf[40] = { 0 };
+	size_t copysize = sizeof("/apex/com.android.adbd/bin/adbd");
+
+	if (!!copy_from_user(buf, *filename_user, copysize))
+		return 0;
+
+	if (!!endswith(buf, "/adbd"))
+		return 0;
+
+	pr_info("%s: adbd: %s \n", __func__, buf);
+
+	return 1;
+}
+
+// Returns 1 when the preload library resolves and 0 on any failure. Callers
+// only test truthiness, so a negative errno must never escape from here.
+static long is_libadbroot_ok(void)
+{
+	static const char kLibAdbRoot[] = "/data/adb/ksu/lib/libadbroot.so";
+	struct path path;
+	long ret = kern_path(kLibAdbRoot, 0, &path);
+
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			pr_err("libadbroot.so not exists, skip adb root. Please run `ksud install`\n");
+		else
+			pr_err("access libadbroot.so failed: %ld, skip adb root\n", ret);
+		return 0;
+	}
+	// the lookup succeeded, so release the path reference it handed back
+	path_put(&path);
+	return 1;
+}
+
+// NOTE: envp_arg points at the slot holding the user-space envp pointer (const char __user *const __user *); that slot is rewritten on success
+static long setup_ld_preload(void ***envp_arg)
+{
+	static const char kLdPreload[] = "LD_PRELOAD=/data/adb/ksu/lib/libadbroot.so";
+	static const char kLdLibraryPath[] = "LD_LIBRARY_PATH=/data/adb/ksu/lib";
+	static const size_t kReadEnvBatch = 16;
+	static const size_t kPtrSize = sizeof(unsigned long);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+	unsigned long stackp = current_user_stack_pointer();
+#else
+	volatile unsigned long stackp = current->mm->start_stack; // its just a stack smash in the end, it'll work.
+#endif
+	unsigned long envp, ld_preload_p, ld_library_path_p;
+	unsigned long *envp_p = (uintptr_t)envp_arg;
+	unsigned long *tmp_env_p = NULL, *tmp_env_p2 = NULL;
+	size_t env_count = 0, total_size;
+	long ret;
+
+	envp = (char __user **)untagged_addr((unsigned long)*envp_p);
+
+	ld_preload_p = stackp = ALIGN_DOWN(stackp - sizeof(kLdPreload), 8); // 2 words on 32-bit, 32-on-64 its gonna be fine dw.
+	ret = copy_to_user(ld_preload_p, kLdPreload, sizeof(kLdPreload));
+	if (ret != 0) {
+		pr_warn("write ld_preload when adb_root_handle_execve failed: %ld\n", ret);
+		return -EFAULT;
+	}
+
+	ld_library_path_p = stackp = ALIGN_DOWN(stackp - sizeof(kLdLibraryPath), 8);
+	ret = copy_to_user(ld_library_path_p, kLdLibraryPath, sizeof(kLdLibraryPath));
+	if (ret != 0) {
+		pr_warn("write ld_library_path when adb_root_handle_execve failed: %ld\n", ret);
+		return -EFAULT;
+	}
+
+	for (;;) {
+		tmp_env_p2 = krealloc(tmp_env_p, (env_count + kReadEnvBatch + 2) * kPtrSize, GFP_KERNEL);
+		if (tmp_env_p2 == NULL) {
+			pr_err("alloc tmp env failed\n");
+			ret = -ENOMEM;
+			goto out_release_env_p;
+		}
+		tmp_env_p = tmp_env_p2;
+		ret = copy_from_user(&tmp_env_p[env_count], envp + env_count * kPtrSize, kReadEnvBatch * kPtrSize);
+		if (ret < 0) {
+			pr_warn("Access envp when adb_root_handle_execve failed: %ld\n", ret);
+			ret = -EFAULT;
+			goto out_release_env_p;
+		}
+		size_t read_count = kReadEnvBatch * kPtrSize - ret;
+		size_t max_new_env_count = read_count / kPtrSize, new_env_count = 0;
+		bool meet_zero = false;
+		for (; new_env_count < max_new_env_count; new_env_count++) {
+			if (!tmp_env_p[new_env_count + env_count]) {
+				meet_zero = true;
+				break;
+			}
+		}
+		if (!meet_zero) {
+			if (read_count % kPtrSize != 0) {
+				pr_err("unaligned envp array!\n");
+				ret = -EFAULT;
+				goto out_release_env_p;
+			} else if (ret != 0) {
+				pr_err("truncated envp array!\n");
+				ret = -EFAULT;
+				goto out_release_env_p;
+			}
+		}
+		env_count += new_env_count;
+		if (meet_zero)
+			break;
+	}
+
+	// We should have allocated enough memory
+	// TODO: handle existing LD_PRELOAD
+	tmp_env_p[env_count++] = ld_preload_p;
+	tmp_env_p[env_count++] = ld_library_path_p;
+	tmp_env_p[env_count++] = 0;
+	total_size = env_count * kPtrSize;
+
+	stackp -= total_size;
+	ret = copy_to_user(stackp, tmp_env_p, total_size);
+	if (ret != 0) {
+		pr_err("copy new env failed: %ld\n", ret);
+		ret = -EFAULT;
+		goto out_release_env_p;
+	}
+
+	*envp_p = stackp;
+	ret = 0;
+
+out_release_env_p:
+	if (tmp_env_p) {
+		kfree(tmp_env_p);
+	}
+
+	return ret;
+}
+
+static noinline void do_ksu_adb_root_handle_execve(void *filename, void *envp_in)
+{
+	if (likely(test_thread_flag(TIF_SECCOMP)))
+		return;
+
+	uid_t uid = current_euid().val;
+	if (uid != 0 && uid != 2000)
+        	return;
+
+	// filename is void * char __user *
+	const char __user **filename_user = (const char __user **)filename;
+
+	if (likely(!is_exec_adbd(filename_user)))
+		return;
+
+	if (unlikely(!is_libadbroot_ok()))
+		return;
+
+	if (setup_ld_preload((void ***)envp_in))
+		return;
+
+	pr_info("escape to root for adb\n");
+	escape_to_root_for_adb_root();
+	escape_with_root_profile(); // why is this needed for 3.x?
+	return;
+}
+
+static noinline void do_ksu_adb_root_handle_execveat(void *filename, void *envp_in)
+{
+	if (likely(test_thread_flag(TIF_SECCOMP)))
+		return;
+
+	uid_t uid = current_euid().val;
+	if (uid != 0 && uid != 2000)
+        	return;
+
+	if (!filename)
+		return;
+
+	// filename is char **
+	if (!*(void **)filename)
+		return;
+
+	if (!!endswith(*(char **)filename, "/adbd"))
+		return;
+
+	if (unlikely(!is_libadbroot_ok()))
+		return;
+
+	if (!envp_in)
+		return;
+
+	struct user_arg_ptr *envp = (struct user_arg_ptr *)envp_in;
+
+	void ***envp_addr = (void ***)&envp->ptr.native;
+#ifdef CONFIG_COMPAT
+	if (unlikely(envp->is_compat))
+		envp_addr = (void ***)&envp->ptr.compat;
+#endif
+
+	pr_info("%s: envp 0x%lx \n", __func__, (uintptr_t)*envp_addr );
+
+	if (setup_ld_preload(envp_addr))
+		return; 
+
+	pr_info("escape to root for adb\n");
+	escape_to_root_for_adb_root();
+	escape_with_root_profile(); // why is this needed?
+	return;
+}
+
+#ifdef KSU_CAN_USE_JUMP_LABEL // see kernel_compat.h
+// jump-label variant: both hooks are gated by a static key that starts out false
+DEFINE_STATIC_KEY_FALSE(ksu_adb_root_key);
+
+static inline void ksu_adb_root_handle_execve(void *filename, void *envp_in)
+{
+	if (static_branch_unlikely(&ksu_adb_root_key))
+		do_ksu_adb_root_handle_execve(filename, envp_in);
+}
+static inline void ksu_adb_root_handle_execveat(void *filename, void *envp_in)
+{
+	if (static_branch_unlikely(&ksu_adb_root_key))
+		do_ksu_adb_root_handle_execveat(filename, envp_in);
+}
+// toggle the static key, then smp_mb()
+static inline void ksu_static_branch_enable(void) { static_branch_enable(&ksu_adb_root_key); smp_mb(); }
+static inline void ksu_static_branch_disable(void) { static_branch_disable(&ksu_adb_root_key); smp_mb(); }
+#else /* ! KSU_CAN_USE_JUMP_LABEL: gate the hooks on the plain ksu_adb_root flag */
+static inline void ksu_adb_root_handle_execve(void *filename, void *envp_in)
+{
+	if (unlikely(ksu_adb_root))
+		do_ksu_adb_root_handle_execve(filename, envp_in);
+}
+static inline void ksu_adb_root_handle_execveat(void *filename, void *envp_in)
+{
+	if (unlikely(ksu_adb_root))
+		do_ksu_adb_root_handle_execveat(filename, envp_in);
+}
+static inline void ksu_static_branch_enable(void) { } // no-op
+static inline void ksu_static_branch_disable(void) { } // no-op
+#endif // KSU_CAN_USE_JUMP_LABEL
+
+static int kernel_adb_root_feature_get(u64 *value)
+{
+	*value = ksu_adb_root ? 1 : 0;
+	return 0;
+}
+
+static int kernel_adb_root_feature_set(u64 value)
+{
+	bool enable = value != 0;
+
+	// prevent double enable / double disable
+	// as old api does ref inc / dec, its a 'lil risky
+	if (enable == ksu_adb_root)
+		return 0;
+
+	if (enable) {
+		ksu_adb_root = true;
+		ksu_static_branch_enable();
+	} else {
+		ksu_adb_root = false;
+		ksu_static_branch_disable();
+	}
+	pr_info("adb_root: set to %d\n", enable);
+	return 0;
+}
+
+static const struct ksu_feature_handler ksu_adb_root_handler = {
+	.feature_id = KSU_FEATURE_ADB_ROOT,
+	.name = "adb_root",
+	.get_handler = kernel_adb_root_feature_get,
+	.set_handler = kernel_adb_root_feature_set,
+};
+
+void __init ksu_adb_root_init(void)
+{
+	if (ksu_register_feature_handler(&ksu_adb_root_handler)) {
+		pr_err("Failed to register adb_root feature handler\n");
+	}
+}
+
+void __exit ksu_adb_root_exit(void)
+{
+	ksu_unregister_feature_handler(KSU_FEATURE_ADB_ROOT);
+}
+
+#endif // CONFIG_KSU_FEATURE_ADBROOT
diff --git a/drivers/kernelsu/feature/adb_root.h b/drivers/kernelsu/feature/adb_root.h
new file mode 100644
index 000000000000..331148751ca5
--- /dev/null
+++ b/drivers/kernelsu/feature/adb_root.h
@@ -0,0 +1,9 @@
+#ifndef __KSU_H_ADB_ROOT
+#define __KSU_H_ADB_ROOT
+
+#ifdef CONFIG_KSU_FEATURE_ADBROOT
+void ksu_adb_root_init(void);
+void ksu_adb_root_exit(void);
+#endif
+
+#endif
diff --git a/drivers/kernelsu/feature/kernel_umount.c b/drivers/kernelsu/feature/kernel_umount.c
new file mode 100644
index 000000000000..f5d399657852
--- /dev/null
+++ b/drivers/kernelsu/feature/kernel_umount.c
@@ -0,0 +1,115 @@
+static bool ksu_kernel_umount_enabled __read_mostly = true;
+
+static int kernel_umount_feature_get(u64 *value)
+{
+	*value = ksu_kernel_umount_enabled ? 1 : 0;
+	return 0;
+}
+
+static int kernel_umount_feature_set(u64 value)
+{
+	bool enable = value != 0;
+	ksu_kernel_umount_enabled = enable;
+	pr_info("kernel_umount: set to %d\n", enable);
+	return 0;
+}
+
+static const struct ksu_feature_handler kernel_umount_handler = {
+	.feature_id = KSU_FEATURE_KERNEL_UMOUNT,
+	.name = "kernel_umount",
+	.get_handler = kernel_umount_feature_get,
+	.set_handler = kernel_umount_feature_set,
+};
+
+extern int path_umount(struct path *path, int flags);
+
+static inline void ksu_umount_mnt(const char *mnt, struct path *path, int flags)
+{
+	int err = path_umount(path, flags);
+	if (err)
+		pr_info("umount %s failed: %d\n", mnt, err);
+}
+
+static void try_umount(const char *mnt, int flags)
+{
+	struct path path;
+	int err = kern_path(mnt, 0, &path);
+	if (err) {
+		return;
+	}
+
+	if (path.dentry != path.mnt->mnt_root) {
+		// it is not root mountpoint, maybe umounted by others already.
+		path_put(&path);
+		return;
+	}
+
+	ksu_umount_mnt(mnt, &path, flags);
+}
+
+// Unmount every mount_list entry for a zygote child that switches to an app or isolated uid; always returns 0.
+static inline int ksu_handle_umount(struct cred *new, const struct cred *old)
+{
+	uid_t new_uid = ksu_get_uid_t(new->uid);
+
+	if (!ksu_kernel_umount_enabled)
+		return 0;
+
+	// if there isn't any module mounted, just ignore it!
+	if (!ksu_module_mounted)
+		return 0;
+
+	if (!ksu_cred)
+		return 0;
+
+	// There are 6 scenarios:
+	// 1. Normal app: zygote -> appuid
+	// 2. Isolated process forked from zygote: zygote -> isolated_process
+	// 3. App zygote forked from zygote: zygote -> appuid
+	// 4. Webview zygote forked from zygote: zygote -> WEBVIEW_ZYGOTE_UID (no need to handle, app cannot run custom code)
+	// 5. Isolated process forked from app zygote: appuid -> isolated_process (already handled by 3)
+	// 6. Isolated process forked from webview zygote (no need to handle, app cannot run custom code)
+	if (!is_appuid(new_uid) && !is_isolated_process(new_uid))
+		return 0;
+
+	if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid))
+		return 0;
+
+	// check old process's selinux context, if it is not zygote, ignore it!
+	// because some su apps may setuid to untrusted_app but they are in global mount namespace
+	// when we umount for such process, that is a disaster!
+	// also handle case 4 and 5
+	bool is_zygote_child = is_zygote(old);
+	if (!is_zygote_child) {
+		pr_info("handle umount ignore non zygote child: %d\n", current->pid);
+		return 0;
+	}
+	// umount the target mnt
+	pr_info("handle umount for uid: %d, pid: %d\n", new_uid, current->pid);
+
+	const struct cred *saved = override_creds(ksu_cred);
+
+	struct mount_entry *entry;
+	down_read(&mount_list_lock);
+	list_for_each_entry (entry, &mount_list, list) {
+		pr_info("%s: unmounting: %s flags: 0x%x\n", __func__, entry->umountable, entry->flags);
+		try_umount(entry->umountable, entry->flags);
+	}
+	up_read(&mount_list_lock);
+
+	revert_creds(saved);
+
+	return 0;
+}
+
+void __init ksu_kernel_umount_init(void)
+{
+	if (ksu_register_feature_handler(&kernel_umount_handler)) {
+		pr_err("Failed to register kernel_umount feature handler\n");
+	}
+}
+
+void __exit ksu_kernel_umount_exit(void)
+{
+	ksu_unregister_feature_handler(KSU_FEATURE_KERNEL_UMOUNT);
+}
diff --git a/drivers/kernelsu/feature/kernel_umount.h b/drivers/kernelsu/feature/kernel_umount.h
new file mode 100644
index 000000000000..51af740d619c
--- /dev/null
+++ b/drivers/kernelsu/feature/kernel_umount.h
@@ -0,0 +1,13 @@
+#ifndef __KSU_H_KERNEL_UMOUNT
+#define __KSU_H_KERNEL_UMOUNT
+
+// for the umount list
+struct mount_entry {
+    char *umountable;
+    unsigned int flags;
+    struct list_head list;
+};
+extern struct list_head mount_list;
+extern struct rw_semaphore mount_list_lock;
+
+#endif
diff --git a/drivers/kernelsu/feature/selinux_hide.c b/drivers/kernelsu/feature/selinux_hide.c
new file mode 100644
index 000000000000..962fadf7fa8c
--- /dev/null
+++ b/drivers/kernelsu/feature/selinux_hide.c
@@ -0,0 +1,404 @@
+/**
+ *  NOTE: this isn't the full-blown approach like upstream's, where SELinux is
+ *  backported outright. That is questionable to do when we want to support a
+ *  plethora of non-standard kernels.
+ *
+ *  While what we are doing here is somewhat improper, for most cases
+ *  this should be more than enough.
+ *
+ *  This includes write_op / selinux_transaction_write spoofing and then avc spoofing.
+ *  Our goal for this one is to be as self-contained as possible,
+ *  with only one call from ksu's initcall.
+ *
+ */
+
+// enabled by default
+static bool ksu_selinux_hide_enabled __read_mostly = true;
+
+// sids for avc spoofing
+static u32 su_sid __read_mostly = 0;
+static u32 ksu_sid __read_mostly = 0;
+static u32 priv_app_sid __read_mostly = 0;
+
+static inline int ksu_selinux_get_sids(void)
+{
+	// look up the SID of each context; returns 0 only if all three are non-zero
+	int err = security_secctx_to_secid("u:r:su:s0", strlen("u:r:su:s0"), &su_sid);
+	if (!err)
+		pr_info("selinux_hide: su_sid: %u\n", su_sid);
+
+	err = security_secctx_to_secid("u:r:ksu:s0", strlen("u:r:ksu:s0"), &ksu_sid);
+	if (!err)
+		pr_info("selinux_hide: ksu_sid: %u\n", ksu_sid);
+
+	err = security_secctx_to_secid("u:r:priv_app:s0:c512,c768", strlen("u:r:priv_app:s0:c512,c768"), &priv_app_sid);
+	if (!err)
+		pr_info("selinux_hide: priv_app_sid: %u\n", priv_app_sid);
+	// a SID that is still zero is treated as "could not be resolved"
+	if (!su_sid || !ksu_sid || !priv_app_sid)
+		return -1;
+
+	return 0;
+}
+
+// deprecate in a month
+int ksu_handle_slow_avc_audit_new(u32 tsid, u16 *tclass)
+{
+	if (!ksu_selinux_hide_enabled)
+		return 0;
+
+	if (tsid != su_sid && tsid != ksu_sid)
+		return 0;
+
+	pr_info("selinux_hide: prevent log for sid: %u\n", tsid);
+	*tclass = 0;
+
+	return 0;
+}
+
+void ksu_slow_avc_audit(u32 *tsid)
+{
+	if (!ksu_selinux_hide_enabled)
+		return;
+
+	// if tsid is su, we just replace it
+	// unsure if its enough, but this is how it is aye?
+	if (*tsid == su_sid || *tsid == ksu_sid) {
+		pr_info("selinux_hide: slow_avc_audit: replace tsid: %u with priv_app_sid: %u\n", *tsid, priv_app_sid);
+		*tsid = priv_app_sid;
+	}
+
+	return;
+}
+
+static inline bool ksu_should_destroy_context(char *str)
+{
+	// true when str contains the name of any node in ksu_sepolicy_rule_list
+	struct ksu_hidden_node *entry;
+	bool matched = false;
+
+	if (!str)
+		return matched;
+
+	read_lock(&ksu_sepolicy_shitlist_lock);
+	list_for_each_entry(entry, &ksu_sepolicy_rule_list, list) {
+		matched = !!strstr(str, entry->name);
+		if (matched)
+			break;
+	}
+	read_unlock(&ksu_sepolicy_shitlist_lock);
+	return matched;
+}
+
+/**
+ *  security_setprocattr is a weird LSM on 5.4 and up, and this is normally backported
+ *  down to 4.14 and 4.19. somehow this LSM is a one-shot. only the first to register
+ *  is called.
+ *
+ *  however this is not an issue for us on 3.x as we are hijacking selinux_ops on it
+ *
+ */
+int ksu_hide_setprocattr(const char *name, void *value, size_t size)
+{
+	if (!ksu_selinux_hide_enabled)
+		return 0;
+
+	// only hook when seccomp is enabled
+	if (!test_thread_flag(TIF_SECCOMP))
+		return 0;
+
+	// only appuid
+	if (current_uid().val < 10000)
+		return 0;
+
+	if (!size)
+		return 0;
+
+	if (!name)
+		return 0;
+
+	if (!!strcmp(name, "current"))
+		return 0;
+
+	char *str = (char *)value;
+
+	if (!str)
+		return 0;
+
+	// to make sure its terminated
+	char buf[64] = { 0 };
+	size_t len = (size < 63) ? size : 63;
+
+	memcpy(buf, str, len);
+
+	if (!ksu_should_destroy_context(buf))
+		return 0;
+	
+	pr_info("block setprocattr for context: %s\n", buf);
+	str[1] = '1';
+
+	return 0;
+}
+
+// for manual hook
+void ksu_sel_write_context(struct file **file, char **buf, size_t *size)
+{
+	if (!ksu_selinux_hide_enabled)
+		return;
+
+	// only hook when seccomp is enabled
+	if (!test_thread_flag(TIF_SECCOMP))
+		return;
+
+	// only appuid
+	if (current_uid().val < 10000)
+		return;
+
+	// upstream doesnt do this, so we should also not.
+	//if (!ksu_uid_should_umount(current_uid().val))
+	//	return;
+
+	char *mbuf = *buf;
+
+	if (!mbuf)
+		return;
+
+	if (!ksu_should_destroy_context(mbuf))
+		return;
+
+	pr_info("selinux_hide: destroy: %s \n", mbuf);
+	mbuf[1] = '1';
+	return;
+
+}
+
+#if defined(CONFIG_KPROBES)
+
+#include <linux/kprobes.h>
+static struct kprobe *slow_avc_audit_kp;
+static struct kprobe *sel_write_context_kp;
+static struct kprobe *sel_write_access_kp;
+
+static int slow_avc_audit_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+
+#if defined(KSU_COMPAT_HAS_SELINUX_STATE)
+	u32 *tsid = (u32 *)&PT_REGS_PARM3(regs);
+#else
+	u32 *tsid = (u32 *)&PT_REGS_PARM2(regs);
+#endif
+
+	ksu_slow_avc_audit(tsid);
+
+	return 0;
+}
+
+static int sel_write_context_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	char **buf = (char **)&PT_REGS_PARM2(regs);
+
+	ksu_sel_write_context(NULL, buf, NULL);
+	return 0;
+}
+
+// this deals with __user, this is here in case its really needed.
+#if 0
+static int selinux_transaction_write_pre_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	
+	bool *should_destroy = (bool *)ri->data;
+	*should_destroy = false;
+
+	if (!test_thread_flag(TIF_SECCOMP))
+		return 0;
+
+	if (current_uid().val < 10000)
+		return 0;
+
+	if (!ksu_uid_should_umount(current_uid().val))
+		return 0;
+
+	const char __user **buf = (const char __user **)&PT_REGS_PARM2(regs);
+	char __user *uptr = *(char **)buf;
+
+	char kbuf[128] = { 0 };
+
+	if (ksu_copy_from_user_retry(kbuf, uptr, 127))
+		return 0;
+
+	// move ptr to the next one after space
+	char *target = strchr(kbuf, ' ');
+	if (likely(target))
+		target++;
+	else
+		target = kbuf;
+
+	if (!ksu_should_destroy_context(target))
+		return 0;
+
+	pr_info("selinux_transaction_write: destroy: %s \n", kbuf);
+	*should_destroy = true;
+
+	return 0;
+}
+
+static int selinux_transaction_write_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	// if bool is true, mod PT_REGS_RC to ret EINVAL
+	bool *should_destroy = (bool *)ri->data;
+	
+	if (*should_destroy)
+		PT_REGS_RC(regs) = -EINVAL;
+
+	return 0;
+}
+
+static struct kretprobe selinux_transaction_write_rp = {
+	.kp.symbol_name = "selinux_transaction_write",
+	.handler = selinux_transaction_write_ret_handler,
+	.entry_handler = selinux_transaction_write_pre_handler,
+	.data_size = sizeof(bool),
+	.maxactive = 20,
+};
+#endif
+
+// copied from upstream
+static struct kprobe *init_kprobe(const char *name, kprobe_pre_handler_t handler)
+{
+	struct kprobe *kp = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+	if (!kp)
+		return NULL;
+	kp->symbol_name = name;
+	kp->pre_handler = handler;
+
+	int ret = register_kprobe(kp);
+	pr_info("%s: register %s kprobe: %d\n", __func__, name, ret);
+	if (ret) {
+		kfree(kp);
+		return NULL;
+	}
+
+	return kp;
+}
+static void destroy_kprobe(struct kprobe **kp_ptr)
+{
+	struct kprobe *kp = *kp_ptr;
+	if (!kp)
+		return;
+	unregister_kprobe(kp);
+	synchronize_rcu();
+	kfree(kp);
+	*kp_ptr = NULL;
+}
+#endif // CONFIG_KPROBES
+
+
+static void ksu_selinux_hide_enable() 
+{
+	int ret = ksu_selinux_get_sids();
+	if (ret)
+		pr_info("selinux_hide: sid grab fail!\n");
+
+#if defined(CONFIG_KPROBES)
+	slow_avc_audit_kp = init_kprobe("slow_avc_audit", slow_avc_audit_pre_handler);
+
+	sel_write_context_kp = init_kprobe("sel_write_context", sel_write_context_pre_handler);
+	sel_write_access_kp = init_kprobe("sel_write_access", sel_write_context_pre_handler);
+#endif
+
+	pr_info("selinux_hide: started! make sure manual hooks are in-place!\n");
+
+	ksu_selinux_hide_enabled = true;
+}
+
+static void ksu_selinux_hide_disable()
+{
+#if defined(CONFIG_KPROBES)
+	pr_info("selinux_hide: unregister slow_avc_audit kprobe!\n");
+	destroy_kprobe(&slow_avc_audit_kp);
+
+	pr_info("selinux_hide: unregister sel_write_context kprobe!\n");
+	destroy_kprobe(&sel_write_context_kp);
+
+	pr_info("selinux_hide: unregister sel_write_access kprobe!\n");
+	destroy_kprobe(&sel_write_access_kp);
+#endif
+
+	pr_info("selinux_hide: closing down hooks!\n");
+
+	ksu_selinux_hide_enabled = false;
+}
+
+// init kthread
+static int ksu_hide_init_thread(void *data)
+{
+	unsigned int i = 0;
+
+	set_user_nice(current, 19); // low prio
+
+start:
+	if (!!*(volatile bool *)&ksu_boot_completed)
+		goto bail;
+
+	msleep(5000);
+
+	i++;
+
+	if (i < 12)
+		goto start;
+
+bail:
+
+	ksu_add_shit_to_list(KERNEL_SU_DOMAIN);
+	ksu_add_shit_to_list(KERNEL_SU_FILE);
+
+	ksu_selinux_hide_enable();
+	return 0;
+}
+
+static int selinux_hide_feature_get(u64 *value)
+{
+	*value = ksu_selinux_hide_enabled ? 1 : 0;
+	return 0;
+}
+
+static int selinux_hide_feature_set(u64 value)
+{
+	bool enable = value != 0;
+	int ret = 0;
+
+	if (enable == ksu_selinux_hide_enabled)
+		return 0;
+
+	pr_info("selinux_hide: set to %d\n", enable);
+
+	if (enable)
+		ksu_selinux_hide_enable();
+	else
+		ksu_selinux_hide_disable();
+
+	return ret;
+}
+
+static const struct ksu_feature_handler selinux_hide_handler = {
+	.feature_id = KSU_FEATURE_SELINUX_HIDE,
+	.name = "selinux_hide",
+	.get_handler = selinux_hide_feature_get,
+	.set_handler = selinux_hide_feature_set,
+};
+
+void __init ksu_selinux_hide_init()
+{
+	// we init this on a kthread
+	kthread_run(ksu_hide_init_thread, NULL, "kthread");
+
+	if (ksu_register_feature_handler(&selinux_hide_handler)) {
+		pr_err("Failed to register selinux_hide feature handler\n");
+	}
+}
+
+void __exit ksu_selinux_hide_exit()
+{
+	ksu_unregister_feature_handler(KSU_FEATURE_SELINUX_HIDE);
+}
+
diff --git a/drivers/kernelsu/feature/selinux_hide.h b/drivers/kernelsu/feature/selinux_hide.h
new file mode 100644
index 000000000000..39c60206b9c6
--- /dev/null
+++ b/drivers/kernelsu/feature/selinux_hide.h
@@ -0,0 +1,65 @@
+#ifndef __KSU_H_SELINUX_HIDE
+#define __KSU_H_SELINUX_HIDE
+
+void ksu_selinux_hide_init();
+void ksu_selinux_hide_exit();
+
+// /selinux/rules.c, linked list
+LIST_HEAD(ksu_sepolicy_rule_list);
+DEFINE_RWLOCK(ksu_sepolicy_shitlist_lock);
+
+struct ksu_hidden_node {
+	struct list_head list;
+	char *name;
+};
+
+// Add a type name to ksu_sepolicy_rule_list, stored padded as ":name:".
+// NULL, "zygote", "app_zygote" and names already in the list are ignored;
+// if an allocation fails the name is simply not added.
+static void ksu_add_shit_to_list(const char *name)
+{
+	if (!name)
+		return;
+
+	if (!strcmp(name, "zygote") || !strcmp(name, "app_zygote"))
+		return;
+
+	struct ksu_hidden_node *node;
+	size_t name_len = strlen(name);
+
+	// check for dupes
+	write_lock(&ksu_sepolicy_shitlist_lock);
+	list_for_each_entry(node, &ksu_sepolicy_rule_list, list) {
+		// ":name:"
+		if (strlen(node->name) == (name_len + 2) && !memcmp(node->name + 1, name, name_len))
+			goto unlock_list;
+	}
+
+	// GFP_ATOMIC: allocating while the rwlock is held, so we must not sleep
+	node = kmalloc(sizeof(*node), GFP_ATOMIC);
+	if (!node)
+		goto unlock_list;
+
+	// ':' + original + ':' + \0
+	node->name = kmalloc(name_len + 3, GFP_ATOMIC);
+	if (!node->name) {
+		kfree(node);
+		goto unlock_list;
+	}
+
+	node->name[0] = ':';
+	memcpy(node->name + 1, name, name_len);
+	node->name[name_len + 1] = ':';
+	node->name[name_len + 2] = '\0';
+
+	list_add(&node->list, &ksu_sepolicy_rule_list);
+
+	if (IS_ENABLED(CONFIG_KSU_DEBUG))
+		pr_info("%s: now tracking type: %s, padded: %s \n", __func__, name, node->name);
+
+unlock_list:
+	write_unlock(&ksu_sepolicy_shitlist_lock);
+	return;
+}
+
+#endif
diff --git a/drivers/kernelsu/feature/sucompat.c b/drivers/kernelsu/feature/sucompat.c
new file mode 100644
index 000000000000..174e170cb146
--- /dev/null
+++ b/drivers/kernelsu/feature/sucompat.c
@@ -0,0 +1,419 @@
+#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE
+#define SUCOMPAT_HOOK_TYPE static __always_inline int
+#else
+#define SUCOMPAT_HOOK_TYPE int
+#endif
+
+#define SU_PATH "/system/bin/su"
+#define SH_PATH "/system/bin/sh"
+
+static bool ksu_su_compat_enabled __read_mostly = true;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+static void __user *userspace_stack_buffer(const void *d, size_t len)
+{
+	/* To avoid having to mmap a page in userspace, just write below the stack
+   * pointer. */
+	char __user *p = (void __user *)current_user_stack_pointer() - len;
+
+	return copy_to_user(p, d, len) ? NULL : p;
+}
+#else
+static void __user *userspace_stack_buffer(const void *d, size_t len)
+{
+	if (!current->mm)
+		return NULL;
+
+	volatile unsigned long start_stack = current->mm->start_stack;
+	unsigned int step = 32;
+	
+start_loop:
+	;
+	char __user *p = (void __user *)(start_stack - step - len);
+	if (IS_ENABLED(CONFIG_KSU_DEBUG))
+		pr_info("%s: start_stack: %lx p: %lx len: %zu\n", __func__, start_stack, (unsigned long)p, len );
+
+	if (!copy_to_user(p, d, len))
+		return p;
+
+	step = step + step;
+
+	if (step <= 2048)
+		goto start_loop;
+
+	return NULL;
+}
+#endif
+
+static char __user *sh_user_path(void)
+{
+	static const char sh_path[] = "/system/bin/sh";
+
+	return userspace_stack_buffer(sh_path, sizeof(sh_path));
+}
+
+static char __user *ksud_user_path(void)
+{
+	static const char ksud_path[] = KSUD_PATH;
+
+	return userspace_stack_buffer(ksud_path, sizeof(ksud_path));
+}
+
+#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) && defined(KSU_CAN_USE_JUMP_LABEL)
+DEFINE_STATIC_KEY_TRUE(ksud_sucompat_key);
+static inline void ksu_sucompat_enable_branch()
+{
+	pr_info("su_compat: enable sucompat branches\n");
+	static_branch_enable(&ksud_sucompat_key);
+	smp_mb();
+}
+static inline void ksu_sucompat_disable_branch()
+{
+	pr_info("su_compat: remove sucompat branches\n");
+	static_branch_disable(&ksud_sucompat_key);
+	smp_mb();
+}
+#else
+static inline void ksu_sucompat_enable_branch() { } // no-op
+static inline void ksu_sucompat_disable_branch() { } // no-op
+#endif
+
+__attribute__((hot))
+static __always_inline bool is_su_allowed(const void **ptr_to_check)
+{
+#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE
+#ifdef KSU_CAN_USE_JUMP_LABEL
+	// read as: if not 'likely' disabled
+	if (!!!static_branch_likely(&ksud_sucompat_key))
+		return false;
+#else
+	if (!ksu_su_compat_enabled)
+		return false;
+#endif // KSU_CAN_USE_JUMP_LABEL
+#endif
+
+	if (likely(test_thread_flag(TIF_SECCOMP)))
+		return false;
+
+	// see seccomp check above
+	// so if its root but not ksu domain, deny, see __ksu_is_allow_uid_for_current
+	// actually, we can likely skip this step?
+	uid_t uid = current_uid().val;
+	if (!!uid)
+		goto uid_check;
+
+	if (!is_ksu_domain())
+		return false;
+	goto check_ptr;
+
+	// NOTE: shell has its seccomp disabled, so we only need to check for this thing
+	// short-circuit if not shell! as we allow apps on setuid lsm by disabling seccomp
+uid_check:
+	if (likely(uid != 2000))
+		goto check_ptr;
+
+	// use internal function, not the macro
+	if (!__ksu_is_allow_uid(uid))
+		return false;
+
+check_ptr:
+	// first check the pointer-to-pointer
+	if (unlikely(!ptr_to_check))
+		return false;
+
+	// now dereference pointer-to-pointer to check actual pointer
+	if (unlikely(!*ptr_to_check))
+		return false;
+
+	return true;
+}
+
+static __always_inline void ksu_sucompat_user_common(const char __user **filename_user,
+				const char *syscall_name,
+				const bool escalate,
+				const uint8_t sym)
+{
+	uintptr_t buf;
+	const char su[] = SU_PATH;
+
+	// sugar prep
+	uintptr_t *su_p = (uintptr_t *)su;
+	uintptr_t __user *fn_p = (uintptr_t *)*(char **)filename_user;
+
+	// assert /system/bin/su\0 = 15 bytes.
+	BUILD_BUG_ON(sizeof(su) > 16); // compielr might to pad
+	BUILD_BUG_ON(sizeof(su) < 15);
+
+	/*
+	 * it seems this is actually the slowest part, we peek last word first to speed it up
+	 * NOTE: get_user rets EFAULT on err, so if we are copying a pointer
+	 * that goes to nothing, we also detect that and ret fast
+	 *
+	 * first read overreads, reading 8 bytes, "bin/su\0?" /  4 bytes, "su\0?" when we only need 7/3
+	 * but this is fine as we are guaranteed alignment, hardware provides trailing garbeg
+	 * if it is specially crafted and hits a page guard, we just get EFAULT anyway
+	 *
+	 * on 64-bit we do this in 2 word compare, 4 on 32-bit
+	 *
+	 * we can do some bitmasking 0xFFFFFF blah blah to do that tail compare (7 or 3 bytes), 
+	 * but hot damn I hate that shit, lets just have __builtin_memcmp do it for us
+	 *
+	 */
+
+#ifdef CONFIG_64BIT
+	if (get_user(buf, &fn_p[1]))
+		return;
+
+	if (likely(!!__builtin_memcmp(&buf, su + sizeof(uintptr_t), sizeof(su) - sizeof(uintptr_t) )))
+		return;
+#else
+	if (get_user(buf, &fn_p[3]))
+		return;
+
+	if (likely(!!__builtin_memcmp(&buf, su +  (3 * sizeof(uintptr_t)), sizeof(su) - (3 * sizeof(uintptr_t)) )))
+		return;
+
+	if (unlikely(get_user(buf, &fn_p[2])))
+		return;
+
+	if (buf != su_p[2])
+		return;
+
+	if (unlikely(get_user(buf, &fn_p[1])))
+		return;
+
+	if (unlikely(buf != su_p[1]))
+		return;
+#endif
+	// last word
+	if (unlikely(get_user(buf, &fn_p[0])))
+		return;
+
+	if (unlikely(buf != su_p[0]))
+		return;
+
+	write_sulog(sym);
+
+	if (!escalate)
+		goto no_escalate;
+
+#ifdef CONFIG_KSU_FEATURE_SULOG
+	ksu_sulog_emit(KSU_SULOG_EVENT_SUCOMPAT, NULL, NULL, GFP_KERNEL);
+#endif
+	if (!!escape_with_root_profile())
+		return;
+
+	// NOTE: we only check file existence, not exec success!
+	struct path kpath;
+	if (!!kern_path("/data/adb/ksud", 0, &kpath))
+		goto no_ksud;
+
+	path_put(&kpath);
+	pr_info("%s su->ksud!\n", syscall_name);
+	*filename_user = ksud_user_path();
+	return;
+
+no_ksud:
+no_escalate:
+	pr_info("%s su->sh!\n", syscall_name);
+	*filename_user = sh_user_path();
+	return;
+
+}
+
+// sys_faccessat
+SUCOMPAT_HOOK_TYPE ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, int *__unused_flags)
+{
+	if (!is_su_allowed((const void **)filename_user))
+		return 0;
+
+	ksu_sucompat_user_common(filename_user, "faccessat", false, 'a');
+	return 0;
+}
+
+// sys_newfstatat, sys_fstat64
+SUCOMPAT_HOOK_TYPE ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags)
+{
+	if (!is_su_allowed((const void **)filename_user))
+		return 0;
+
+	ksu_sucompat_user_common(filename_user, "newfstatat", false, 's');
+	return 0;
+}
+
+// sys_execve, compat_sys_execve
+SUCOMPAT_HOOK_TYPE ksu_handle_execve(const char __user **filename_user, void *argv, void *envp)
+{
+	sys_execve_escape_ksud((void *)filename_user);
+
+#ifdef CONFIG_KSU_FEATURE_ADBROOT
+	ksu_adb_root_handle_execve((void *)filename_user, (void *)envp);
+#endif
+
+	if (!is_su_allowed((const void **)filename_user))
+		return 0;
+
+	ksu_sucompat_user_common(filename_user, "sys_execve", true, 'x');
+	return 0;
+}
+
+#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE
+static __always_inline void ksu_sucompat_kernel_common(void **filename_ptr, void *argv, void *envp, const char *function_name)
+{
+	kernel_execve_escape_ksud((void *)filename_ptr);
+
+#ifdef CONFIG_KSU_FEATURE_ADBROOT
+	ksu_adb_root_handle_execveat((void *)filename_ptr, (void *)envp);
+#endif
+
+	if (!is_su_allowed((const void **)filename_ptr))
+		return;
+
+	// it seems this is actually the slowest part, we peek last word first to speed it up
+	// sugar prep
+	const char su[] = SU_PATH;
+	uintptr_t *su_p = (uintptr_t *)su;
+	uintptr_t *fn_p = (uintptr_t *)*(char **)filename_ptr;
+
+	// assert /system/bin/su\0 = 15 bytes.
+	BUILD_BUG_ON(sizeof(su) > 16); // compielr might to pad
+	BUILD_BUG_ON(sizeof(su) < 15);
+
+	// getname_flags pads this so nothing to worry about, dereference with confidence!
+#ifdef CONFIG_64BIT
+	if (likely(!!__builtin_memcmp(&fn_p[1], &su_p[1], sizeof(su) - sizeof(uintptr_t) )))
+		return;
+#else
+	if (likely(!!__builtin_memcmp(&fn_p[3], &su_p[3], sizeof(su) - (3 * sizeof(uintptr_t)) )))
+		return;
+
+	if (fn_p[2] != su_p[2])
+		return;
+
+	if (fn_p[1] != su_p[1])
+		return;
+#endif
+
+	if (unlikely(fn_p[0] != su_p[0]))
+		return;
+
+	// we only handle execve here after removing vfs_statx hook for >= 6.1
+	write_sulog('x');
+
+#ifdef CONFIG_KSU_FEATURE_SULOG
+	ksu_sulog_emit(KSU_SULOG_EVENT_SUCOMPAT, NULL, NULL, GFP_KERNEL);
+#endif
+	if (!!escape_with_root_profile())
+		return;
+
+	// NOTE: we only check file existence, not exec success!
+	struct path kpath;
+	if (!!kern_path("/data/adb/ksud", 0, &kpath))
+		goto no_ksud;
+
+	path_put(&kpath);
+	pr_info("%s su->ksud!\n", function_name);
+	memcpy(*filename_ptr, KSUD_PATH, sizeof(KSUD_PATH));
+	return;
+
+no_ksud:
+	pr_info("%s su->sh!\n", function_name);
+	memcpy(*filename_ptr, SH_PATH, sizeof(SH_PATH));
+	return;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0)
+// take note: struct filename **filename, for do_execveat_common / do_execve_common on >= 3.14
+int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv, void *envp, int *flags)
+{
+	struct filename *filename = *filename_ptr;
+	if (IS_ERR(filename)) // see getname_flags
+		return 0;
+
+	ksu_sucompat_kernel_common((void **)&filename->name, argv, envp, "do_execveat_common");
+	return 0;
+}
+#else
+// take note: char **filename, for do_execve_common on < 3.14
+int ksu_legacy_execve_sucompat(const char **filename_ptr, void *argv, void *envp)
+{
+	ksu_sucompat_kernel_common((void **)filename_ptr, argv, envp, "do_execve_common");
+	return 0;
+}
+#endif
+#endif // CONFIG_KSU_TAMPER_SYSCALL_TABLE
+
+#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE
+static void syscall_table_sucompat_enable();
+static void syscall_table_sucompat_disable();
+#else
+static inline void syscall_table_sucompat_enable() { } // no-op
+static inline void syscall_table_sucompat_disable() { } // no-op
+#endif
+
+static void ksu_sucompat_enable()
+{
+
+	ksu_sucompat_enable_branch();
+	syscall_table_sucompat_enable();
+
+	ksu_su_compat_enabled = true;
+	pr_info("%s: hooks enabled: exec, faccessat, stat\n", __func__);
+}
+
+static void ksu_sucompat_disable()
+{
+
+	ksu_sucompat_disable_branch();
+	syscall_table_sucompat_disable();
+
+	ksu_su_compat_enabled = false;
+	pr_info("%s: hooks disabled: exec, faccessat, stat\n", __func__);
+}
+
+static int su_compat_feature_get(u64 *value)
+{
+	*value = ksu_su_compat_enabled ? 1 : 0;
+	return 0;
+}
+
+// Feature "set" handler: any non-zero value enables su_compat, zero disables it.
+// Always returns 0; a request matching the current state only logs.
+static int su_compat_feature_set(u64 value)
+{
+	const bool want_enabled = !!value;
+
+	if (want_enabled == ksu_su_compat_enabled) {
+		pr_info("su_compat: no need to change\n");
+		return 0;
+	}
+
+	if (want_enabled)
+		ksu_sucompat_enable();
+	else
+		ksu_sucompat_disable();
+
+	ksu_su_compat_enabled = want_enabled;
+	pr_info("su_compat: set to %d\n", want_enabled);
+	return 0;
+}
+
+static const struct ksu_feature_handler su_compat_handler = {
+	.feature_id = KSU_FEATURE_SU_COMPAT,
+	.name = "su_compat",
+	.get_handler = su_compat_feature_get,
+	.set_handler = su_compat_feature_set,
+};
+
+// sucompat: permited process can execute 'su' to gain root access.
+void __init ksu_sucompat_init()
+{
+	if (ksu_register_feature_handler(&su_compat_handler)) {
+		pr_err("Failed to register su_compat feature handler\n");
+	}
+}
+
+void __exit ksu_sucompat_exit()
+{
+	ksu_unregister_feature_handler(KSU_FEATURE_SU_COMPAT);
+}
diff --git a/drivers/kernelsu/feature/sucompat.h b/drivers/kernelsu/feature/sucompat.h
new file mode 100644
index 000000000000..580384ee9c6c
--- /dev/null
+++ b/drivers/kernelsu/feature/sucompat.h
@@ -0,0 +1,7 @@
+#ifndef __KSU_H_SUCOMPAT
+#define __KSU_H_SUCOMPAT
+
+void ksu_sucompat_init(void);
+void ksu_sucompat_exit(void);
+
+#endif
diff --git a/drivers/kernelsu/feature/sulog.c b/drivers/kernelsu/feature/sulog.c
new file mode 100644
index 000000000000..9f76805ca4f6
--- /dev/null
+++ b/drivers/kernelsu/feature/sulog.c
@@ -0,0 +1,57 @@
+static bool ksu_sulog_enabled __read_mostly = false;
+
+static int sulog_feature_get(u64 *value)
+{
+	*value = ksu_sulog_enabled ? 1 : 0;
+	return 0;
+}
+
+static int sulog_feature_set(u64 value)
+{
+	bool enable = value != 0;
+
+	ksu_sulog_enabled = enable;
+	pr_info("sulog: set to %d\n", enable);
+	return 0;
+}
+
+static const struct ksu_feature_handler sulog_handler = {
+	.feature_id = KSU_FEATURE_SULOG,
+	.name = "sulog",
+	.get_handler = sulog_feature_get,
+	.set_handler = sulog_feature_set,
+};
+
+bool ksu_sulog_is_enabled(void)
+{
+	return ksu_sulog_enabled;
+}
+
+void __init ksu_sulog_init(void)
+{
+	int ret;
+
+	ksu_sulog_enabled = false;
+
+	ret = ksu_register_feature_handler(&sulog_handler);
+	if (ret) {
+		pr_err("Failed to register sulog feature handler\n");
+		return;
+	}
+
+	ret = ksu_sulog_events_init();
+	if (ret) {
+		pr_err("Failed to initialize sulog events: %d\n", ret);
+		ksu_unregister_feature_handler(KSU_FEATURE_SULOG);
+		return;
+	}
+
+	ksu_sulog_fd_init();
+}
+
+void __exit ksu_sulog_exit(void)
+{
+	ksu_sulog_fd_exit();
+	ksu_sulog_events_exit();
+	ksu_unregister_feature_handler(KSU_FEATURE_SULOG);
+}
diff --git a/drivers/kernelsu/feature/sulog.h b/drivers/kernelsu/feature/sulog.h
new file mode 100644
index 000000000000..565f59113cd0
--- /dev/null
+++ b/drivers/kernelsu/feature/sulog.h
@@ -0,0 +1,8 @@
+#ifndef __KSU_H_SULOG
+#define __KSU_H_SULOG
+
+bool ksu_sulog_is_enabled(void);
+void ksu_sulog_init(void);
+void ksu_sulog_exit(void);
+
+#endif
diff --git a/drivers/kernelsu/hook/core_hook.c b/drivers/kernelsu/hook/core_hook.c
new file mode 100644
index 000000000000..54572c2c611f
--- /dev/null
+++ b/drivers/kernelsu/hook/core_hook.c
@@ -0,0 +1,440 @@
+#ifdef CONFIG_KSU_LSM_SECURITY_HOOKS
+#define LSM_HANDLER_TYPE static int
+#else
+#define LSM_HANDLER_TYPE int
+#endif
+
+LSM_HANDLER_TYPE ksu_inode_rename(struct inode *old_inode, struct dentry *old_dentry,
+			    struct inode *new_inode, struct dentry *new_dentry)
+{
+	ksu_rename_observer(old_dentry, new_dentry);
+	return 0;
+}
+
+LSM_HANDLER_TYPE ksu_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
+{
+	// see sys_setresuid
+	if (flags == LSM_SETID_RES)
+		ksu_handle_setresuid_cred(new, old);
+
+	return 0;
+}
+
+LSM_HANDLER_TYPE ksu_bprm_check(struct linux_binprm *bprm)
+{
+
+#ifdef CONFIG_KSU_FEATURE_SULOG
+	ksu_sulog_emit_bprm((const char *)bprm->filename);
+#endif
+
+	return 0;
+}
+
+LSM_HANDLER_TYPE ksu_file_permission(struct file *file, int mask)
+{
+#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE)
+#ifdef KSU_CAN_USE_JUMP_LABEL
+	if (static_branch_likely(&ksud_vfs_read_key))
+		ksu_install_rc_hook(file);
+#else
+	if (unlikely(ksu_vfs_read_hook))
+		ksu_install_rc_hook(file);
+#endif
+#endif
+
+	return 0;
+}
+
+#ifdef CONFIG_KSU_LSM_SECURITY_HOOKS
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
+static struct security_hook_list ksu_hooks[] __ro_after_init = {
+	LSM_HOOK_INIT(inode_rename, ksu_inode_rename),
+	LSM_HOOK_INIT(task_fix_setuid, ksu_task_fix_setuid),
+#ifdef CONFIG_KSU_FEATURE_SULOG
+	LSM_HOOK_INIT(bprm_check_security, ksu_bprm_check),
+#endif
+#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE)
+	LSM_HOOK_INIT(file_permission, ksu_file_permission),
+#endif
+};
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) || defined(KSU_COMPAT_SECURITY_ADD_HOOKS_V2)
+#define ksu_security_add_hooks security_add_hooks
+#else
+#define ksu_security_add_hooks(a, b, c) security_add_hooks(a, b)
+#endif
+
+static __init void ksu_lsm_hook_init(void)
+{
+	// hand the ksu_hooks table to security_add_hooks; ARRAY_SIZE() is a size_t, hence %zu
+	ksu_security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), "ksu");
+	pr_info("core_hook: initialized %zu LSMs \n", ARRAY_SIZE(ksu_hooks));
+}
+
+#else /* < 4.2, LSM */
+
+// selinux_ops (LSM), security_operations struct tampering for ultra legacy
+
+static uintptr_t selinux_ops_addr = 0; // integer type: initialise with 0, not the pointer constant NULL
+
+#ifdef CONFIG_KSU_FEATURE_SELINUX_HIDE
+static int (*orig_setprocattr) (struct task_struct *p, char *name, void *value, size_t size) = NULL;
+static int hook_setprocattr(struct task_struct *p, char *name, void *value, size_t size)
+{
+
+	ksu_hide_setprocattr(name, value, size);
+	return orig_setprocattr(p, name, value, size);
+}
+#endif
+
+static int (*orig_inode_rename) (struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry) = NULL;
+static int hook_inode_rename(struct inode *old_inode, struct dentry *old_dentry,
+			    struct inode *new_inode, struct dentry *new_dentry)
+{
+	ksu_inode_rename(old_inode, old_dentry, new_inode, new_dentry);
+	return orig_inode_rename(old_inode, old_dentry, new_inode, new_dentry);
+}
+
+static int (*orig_task_fix_setuid) (struct cred *new, const struct cred *old, int flags) = NULL;
+static int hook_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
+{
+	ksu_task_fix_setuid(new, old, flags);
+	return orig_task_fix_setuid(new, old, flags);
+}
+
+static int (*orig_bprm_check_security)(struct linux_binprm *bprm) = NULL;
+static int hook_bprm_check_security(struct linux_binprm *bprm)
+{
+	ksu_bprm_check(bprm);
+	return orig_bprm_check_security(bprm);
+}
+
+static int (*orig_file_permission) (struct file *file, int mask) = NULL;
+static int hook_file_permission(struct file *file, int mask)
+{
+
+	ksu_file_permission(file, mask);
+	return orig_file_permission(file, mask);
+}
+
+static inline bool verify_selinux_cred_free(void *fn_ptr)
+{
+	bool success = false;
+
+	if (!fn_ptr)
+		return false;
+
+	// ref: https://elixir.bootlin.com/linux/v3.18.140/source/security/selinux/hooks.c#L3474
+	void (*selinux_cred_free_fn)(struct cred *) = fn_ptr;
+
+	struct cred dummy_cred;
+
+	// explicitly set it to NULL
+	// make sure this happens!
+	// #1. it wont trigger BUG_ON
+	// #2. this way it will kfree(NULL), which does nothing
+	*(volatile void **)&dummy_cred.security = NULL;
+	barrier();
+
+	selinux_cred_free_fn(&dummy_cred);
+
+	// check if selinux_cred_free is successful
+	if ((unsigned long)*(volatile void **)&dummy_cred.security == 0x7UL)
+		success = true;
+
+	pr_info("selinux_cred_free: 0x%lx cred->security: 0x%lx success: %d\n", (unsigned long)fn_ptr, (unsigned long)dummy_cred.security, success);
+
+	return success;
+}
+
+// we should see a lot of pointers that is inside stext && etext
+// basically we check for "pointer density"
+static inline bool is_selinux_ops_valid(uintptr_t addr)
+{
+	extern char _stext[], _etext[];
+	int total_slots = sizeof(struct security_operations) / sizeof(void *);
+	int valid_ptr = 0;
+	int i = 0;
+
+	uintptr_t member_ptr = 0;
+	uintptr_t current_slot_addr;
+
+	// we will be off by one or off by two due to sizeof("selinux")
+	// thats 8 bytes, on 32 bit, this is two pointers worth, not a big deal
+
+density_verify_start:
+	current_slot_addr = addr + (i * sizeof(void *));
+
+	member_ptr = 0;
+	if (copy_from_kernel_nofault(&member_ptr, (void *)current_slot_addr, sizeof(uintptr_t) ))
+		goto next_iter; // if it fails, just try next slot
+
+	// give up early: no pointer into [_stext, _etext] seen once slot 20 is reached
+	if (!valid_ptr && i >= 20)
+		return false;
+
+	// pr_info("%s: member_ptr: 0x%lx \n", __func__, (long)member_ptr);
+	if (member_ptr >= (uintptr_t)_stext && member_ptr <= (uintptr_t)_etext)
+		valid_ptr++;
+
+next_iter:
+	i++;
+	if (i < total_slots)
+		goto density_verify_start;
+	// both counters are plain int, so they are printed with %d
+	pr_info("%s: density: valid: %d slots: %d \n", __func__, valid_ptr, total_slots);
+
+	// maybe increase to 75% or something?
+	return (valid_ptr > (total_slots / 2));
+}
+
+static inline bool check_candidate(uintptr_t addr)
+{
+	struct security_operations *candidate = (struct security_operations *)addr;
+
+	char char_buf[sizeof("selinux")] = { 0 };
+
+	if (copy_from_kernel_nofault(char_buf, (void *)addr, sizeof("selinux") ))
+		return false;
+
+	if (!!memcmp(char_buf, "selinux", sizeof("selinux")))
+		return false;
+
+	// candidate found!
+	pr_info("%s: candidate selinux_ops at 0x%lx\n", __func__, (long)addr);
+
+	// check ptr density	
+	if (!is_selinux_ops_valid(addr))
+		return false;
+
+	if (!candidate->cred_free)
+		return false;
+
+#ifdef CONFIG_KALLSYMS // not always available, can also fail, but it wont hurt to try.
+	uintptr_t ksym_ptr = (uintptr_t)kallsyms_lookup_name("selinux_cred_free");
+	if (unlikely(ksym_ptr != (uintptr_t)candidate->cred_free))
+		goto test_fn;
+
+	pr_info("%s: selinux_cred_free found via ksym_lookup: 0x%lx probe_result: 0x%lx \n", __func__, (long)ksym_ptr, (long)candidate->cred_free);
+	return true;
+
+test_fn:
+#endif
+
+	pr_info("%s: candidate selinux_cred_free at 0x%lx\n", __func__, (long)candidate->cred_free);
+	return verify_selinux_cred_free((void *)candidate->cred_free);
+}
+
+/**
+ * We do this in blocks of 10k sequential pointers:
+ * 10k pointers up, then 10k pointers down.
+ * This is predictable and more cache friendly, with no thrashing.
+ *
+ * A one-up, one-down oscillating scan isn't as friendly to the cache:
+ * once the distance between the up and down cursors exceeds L1, it will thrash.
+ *
+ */
+static noinline void *hunt_for_selinux_ops(void *heuristic_ptr)
+{
+	/*
+	 * Walk pointer-sized steps around heuristic_ptr and hand each address
+	 * to check_candidate(). Returns the first accepted address, or NULL
+	 * when nothing is accepted on either side.
+	 *
+	 * Steps 0 .. max_index - 1 are tested upward, then steps
+	 * 1 .. max_index - 1 downward.
+	 */
+
+	const unsigned long max_index = 10000; // max number of pointers to test, one way
+	const uintptr_t anchor = (uintptr_t)heuristic_ptr;
+	// start / end are only reported in the log line below
+	const uintptr_t start = anchor - max_index * sizeof(void *);
+	const uintptr_t end = anchor + max_index * sizeof(void *);
+	unsigned long iter_count = 0; // candidates tested so far, for the log
+	unsigned long step;
+	uintptr_t curr;
+
+	pr_info("%s: scan range: 0x%lx - 0x%lx anchor: 0x%lx\n", __func__, (long)start, (long)end, (long)anchor);
+
+	// upward block: starts at the anchor itself (step 0)
+	for (step = 0; step < max_index; step++) {
+		curr = anchor + (step * sizeof(void *));
+		iter_count++;
+
+		if (check_candidate(curr))
+			goto found;
+	}
+
+	// downward block: starts one step below, the anchor was tested above
+	for (step = 1; step < max_index; step++) {
+		curr = anchor - (step * sizeof(void *));
+		iter_count++;
+
+		if (check_candidate(curr))
+			goto found;
+	}
+
+	// both blocks exhausted
+	pr_info("%s: selinux_ops not found in range! iter_count: %lu \n", __func__, iter_count);
+	return NULL;
+
+found:
+	// curr still holds the address check_candidate() accepted
+	pr_info("%s: found selinux_ops at 0x%lx iter_count: %lu \n", __func__, curr, iter_count);
+	return (void *)curr;
+}
+
+static inline void set_selinux_ops(void)
+{
+	// Find selinux_ops and store its address in selinux_ops_addr.
+	// Lookups run in order; each one is skipped once ops is non-NULL.
+	extern int selinux_enabled;
+	extern struct security_class_mapping secclass_map[];
+	extern struct list_head crypto_alg_list;
+	extern unsigned int avc_cache_threshold;
+
+	struct security_operations *ops = NULL;
+
+// if user exports selinux_ops, we just go for it!
+#ifdef KSU_HAS_EXPORTED_SELINUX_OPS
+	extern struct security_operations selinux_ops;
+	if (!ops)
+		ops = (struct security_operations *)&selinux_ops;
+#endif
+
+// not always available, can also fail, but it wont hurt to try.
+#ifdef CONFIG_KALLSYMS
+	if (!ops)
+		ops = (struct security_operations *)kallsyms_lookup_name("selinux_ops");
+#endif
+
+	// fallback: scan memory around a few known symbols (presumably laid out near selinux_ops)
+#ifdef CONFIG_KEYS
+	extern struct key_user root_key_user;
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&root_key_user);
+#endif
+
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&avc_cache_threshold);
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&crypto_alg_list);
+
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&selinux_enabled);
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&secclass_map);
+
+	// leave selinux_ops_addr untouched when every lookup failed
+	if (ops)
+		selinux_ops_addr = (uintptr_t)ops;
+}
+
+// stop_machine
+static int ksu_unregister_lsm_hook(void *data)
+{
+	// stop_machine callback: put the saved file_permission handler back.
+	struct security_operations *ops = (struct security_operations *)selinux_ops_addr;
+
+	if (!orig_file_permission)
+		return 0; // nothing was saved, nothing to restore
+	pr_info("%s: restoring file_permission 0x%lx -> 0x%lx\n", __func__, (long)ops->file_permission, (long)orig_file_permission);
+	ops->file_permission = orig_file_permission;
+	return 0;
+}
+
+static int ksu_lsm_hook_restore(void *data)
+{
+	// kthread body: wait for ksu_vfs_read_hook to clear, then run
+	// ksu_unregister_lsm_hook() under stop_machine.
+	struct security_operations *ops = (struct security_operations *)selinux_ops_addr;
+
+	// bail out unless ops is set and begins with the string "selinux"
+	if (!ops || strcmp((char *)ops, "selinux") != 0)
+		return 0;
+
+	// poll once per second; the volatile cast forces a fresh read each pass
+	do {
+		msleep(1000);
+	} while (*(volatile bool *)&ksu_vfs_read_hook);
+
+	pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops );
+
+	// the pointer swap itself happens in the stop_machine callback
+	stop_machine(ksu_unregister_lsm_hook, NULL, NULL);
+
+	return 0;
+}
+
+// stop_machine
+static int ksu_register_lsm_hook(void *data)
+{
+	// stop_machine callback: for each hooked member, save the current
+	// handler in orig_<name> and point the slot at hook_<name>.
+	struct security_operations *ops = (struct security_operations *)selinux_ops_addr;
+#define KSU_SWAP_LSM_HOOK(name)			\
+	do {					\
+		orig_##name = ops->name;	\
+		ops->name = hook_##name;	\
+	} while (0)
+
+	KSU_SWAP_LSM_HOOK(bprm_set_creds);
+	KSU_SWAP_LSM_HOOK(inode_rename);
+
+#ifdef CONFIG_KSU_FEATURE_SELINUX_HIDE
+	KSU_SWAP_LSM_HOOK(setprocattr);
+#endif
+
+	KSU_SWAP_LSM_HOOK(task_fix_setuid);
+
+#ifdef CONFIG_KSU_FEATURE_SULOG
+	KSU_SWAP_LSM_HOOK(bprm_check_security);
+#endif
+
+#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE)
+	KSU_SWAP_LSM_HOOK(file_permission);
+#endif
+#undef KSU_SWAP_LSM_HOOK
+	return 0;
+}
+
+static void ksu_lsm_hook_init(void)
+{
+	struct security_operations *ops;
+
+	// resolve selinux_ops first; selinux_ops_addr is not written on failure
+	set_selinux_ops();
+	ops = (struct security_operations *)selinux_ops_addr;
+
+	// only continue when the struct is set and starts with "selinux"
+	if (!ops || strcmp((char *)ops, "selinux") != 0)
+		return;
+
+	pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops );
+
+	// install the hooks, then start the thread that later restores file_permission
+	stop_machine(ksu_register_lsm_hook, NULL, NULL);
+	kthread_run(ksu_lsm_hook_restore, NULL, "unhook");
+}
+
+#endif // < 4.2
+
+#else /* ! CONFIG_KSU_LSM_SECURITY_HOOKS */
+// TEMP hooks, remove this in a month.
+int ksu_handle_setuid(struct cred *new, const struct cred *old)
+{
+	ksu_handle_setresuid_cred(new, old); // thin wrapper: all work happens in the cred handler
+	return 0; // always reports success
+}
+int ksu_handle_rename(struct dentry *old_dentry, struct dentry *new_dentry)
+{
+	ksu_rename_observer(old_dentry, new_dentry); // thin wrapper around the rename observer
+	return 0; // always reports success
+}
+static inline void ksu_lsm_hook_init(void) { } // nothing, no-op
+#endif // CONFIG_KSU_LSM_SECURITY_HOOKS
+
+void __init ksu_core_init(void)
+{
+	ksu_lsm_hook_init(); // an empty stub when CONFIG_KSU_LSM_SECURITY_HOOKS is off
+}
diff --git a/drivers/kernelsu/hook/kp_ksud.c b/drivers/kernelsu/hook/kp_ksud.c
new file mode 100644
index 000000000000..24ad5c3a14b4
--- /dev/null
+++ b/drivers/kernelsu/hook/kp_ksud.c
@@ -0,0 +1,143 @@
+#include <linux/kprobes.h>
+
+// sys_newfstat rp
+// upstream: https://github.com/tiann/KernelSU/commit/df640917d11dd0eff1b34ea53ec3c0dc49667002
+
+static int sys_newfstat_handler_pre(struct kretprobe_instance *p, struct pt_regs *regs)
+{
+	// read the arguments through PT_REAL_REGS(), as sys_reboot_handler_pre does
+	struct pt_regs *real_regs = PT_REAL_REGS(regs);
+
+	// stash fd and statbuf on entry so the return handler can use them
+	uintptr_t *arg = (uintptr_t *)p->data;
+	arg[0] = (uintptr_t)PT_REGS_PARM1(real_regs);
+	arg[1] = (uintptr_t)PT_REGS_PARM2(real_regs);
+	return 0;
+}
+
+static int sys_newfstat_handler_post(struct kretprobe_instance *p, struct pt_regs *regs)
+{
+	// return probe: replay the fd / statbuf saved by the entry handler
+	uintptr_t *saved = (uintptr_t *)p->data;
+	struct stat __user *statbuf = (struct stat __user *)saved[1];
+	unsigned int fd = (unsigned int)saved[0];
+
+	ksu_handle_newfstat_ret(&fd, &statbuf);
+	return 0;
+}
+
+static struct kretprobe sys_newfstat_rp = {
+	.kp.symbol_name = SYS_NEWFSTAT_SYMBOL,
+	.entry_handler = sys_newfstat_handler_pre, // saves the two arguments into ->data
+	.handler = sys_newfstat_handler_post, // reads them back when the probed function returns
+	.data_size = sizeof(uintptr_t) * 2, // int + ptr, should fit
+};
+
+#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
+static int sys_fstat64_handler_pre(struct kretprobe_instance *p, struct pt_regs *regs)
+{
+	// read the arguments through PT_REAL_REGS(), as sys_reboot_handler_pre does
+	struct pt_regs *real_regs = PT_REAL_REGS(regs);
+
+	// stash fd and statbuf on entry so the return handler can use them
+	uintptr_t *arg = (uintptr_t *)p->data;
+	arg[0] = (uintptr_t)PT_REGS_PARM1(real_regs);
+	arg[1] = (uintptr_t)PT_REGS_PARM2(real_regs);
+	return 0;
+}
+
+static int sys_fstat64_handler_post(struct kretprobe_instance *p, struct pt_regs *regs)
+{
+	// return probe: replay the fd / statbuf saved by the entry handler
+	uintptr_t *saved = (uintptr_t *)p->data;
+	struct stat64 __user *statbuf = (struct stat64 __user *)saved[1];
+	unsigned long fd = (unsigned long)saved[0];
+
+	ksu_handle_fstat64_ret(&fd, &statbuf);
+	return 0;
+}
+
+static struct kretprobe sys_fstat64_rp = {
+	.kp.symbol_name = SYS_FSTAT64_SYMBOL,
+	.entry_handler = sys_fstat64_handler_pre, // saves the two arguments into ->data
+	.handler = sys_fstat64_handler_post, // reads them back when the probed function returns
+	.data_size = sizeof(uintptr_t) * 2, // long + ptr, should fit
+};
+#endif
+
+// sys_reboot
+static int sys_reboot_handler_pre(struct kprobe *p, struct pt_regs *regs)
+{
+	struct pt_regs *real_regs = PT_REAL_REGS(regs);
+	int *magic1 = (int *)&PT_REGS_PARM1(real_regs); // ptr so we can mutate this
+	int magic2 = (int)PT_REGS_PARM2(real_regs);
+	int cmd = (int)PT_REGS_PARM3(real_regs);
+	void __user **arg = (void __user **)&PT_REGS_SYSCALL_PARM4(real_regs);
+	// calls whose first argument is not KSU_INSTALL_MAGIC1 are left alone
+	if (*magic1 != KSU_INSTALL_MAGIC1)
+		return 0;
+
+	// HACK: re-enable preemption around the handler call, then disable it again
+	// NOTE(review): preemptible() is also false with IRQs off; confirm preempt_count > 0 here
+	bool got_flipped = false;
+	if (likely(!preemptible())) {
+		preempt_enable();
+		got_flipped = true;
+	}
+
+	ksu_handle_sys_reboot(*magic1, magic2, cmd, arg);
+
+	if (got_flipped)
+		preempt_disable();
+
+	// clear magic1 in the register frame to prevent double hooking
+	*magic1 = 0;
+
+	return 0;
+}
+
+static struct kprobe sys_reboot_kp = {
+	.symbol_name = SYS_REBOOT_SYMBOL,
+	.pre_handler = sys_reboot_handler_pre, // called before the probed function runs
+};
+
+static int unregister_kprobe_function(void *data)
+{
+	// kthread body: drop the fstat return probes once ksu_vfs_read_hook clears
+	set_user_nice(current, 19); // low prio
+
+	// check once per second; the volatile cast forces a reload on every pass
+	do {
+		msleep(1000);
+	} while (*(volatile bool *)&ksu_vfs_read_hook);
+
+	pr_info("kp_ksud: unregistering kprobes...\n");
+
+	unregister_kretprobe(&sys_newfstat_rp);
+	pr_info("kp_ksud: unregister sys_newfstat_rp!\n");
+
+#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
+	unregister_kretprobe(&sys_fstat64_rp);
+	pr_info("kp_ksud: unregister sys_fstat64_rp!\n");
+#endif
+
+	// sys_reboot_kp is not unregistered here
+	return 0;
+}
+
+static __init int kp_ksud_init(void)
+{
+	// Register the reboot kprobe and the fstat return probes, log each result,
+	// then start the thread that later removes the return probes.
+	int ret = register_kprobe(&sys_reboot_kp); // dont unreg this one
+	pr_info("kp_ksud: sys_reboot_kp: %d\n", ret);
+
+	int ret2 = register_kretprobe(&sys_newfstat_rp);
+	pr_info("kp_ksud: sys_newfstat_rp: %d\n", ret2);
+#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
+	int ret3 = register_kretprobe(&sys_fstat64_rp);
+	pr_info("kp_ksud: sys_fstat64_rp: %d\n", ret3);
+#endif
+	kthread_run(unregister_kprobe_function, NULL, "kp_unreg");
+	return 0; // registration failures are only logged
+}
diff --git a/drivers/kernelsu/hook/setuid_hook.c b/drivers/kernelsu/hook/setuid_hook.c
new file mode 100644
index 000000000000..2c0aeab247ae
--- /dev/null
+++ b/drivers/kernelsu/hook/setuid_hook.c
@@ -0,0 +1,35 @@
+static __always_inline void ksu_handle_setresuid_cred(struct cred *new, const struct cred *old)
+{
+	// Acts only when the old cred's uid is 0; what happens depends on the new uid.
+	uid_t new_uid, old_uid;
+
+	if (!new || !old)
+		return;
+	new_uid = ksu_get_uid_t(new->uid);
+	old_uid = ksu_get_uid_t(old->uid);
+
+	// old process is not root, ignore it.
+	if (unlikely(old_uid != 0))
+		return;
+
+	// uid_t is unsigned, hence %u
+	if (IS_ENABLED(CONFIG_KSU_DEBUG))
+		pr_info("handle_setresuid from %u to %u\n", old_uid, new_uid);
+
+	// we dont have those new fancy things upstream has
+	// lets just do the original thing where we disable seccomp
+	if (unlikely(is_uid_manager(new_uid))) {
+		// the manager gets the ksu fd and has seccomp disabled as well
+		pr_info("install fd for manager: %u\n", new_uid);
+		ksu_install_fd();
+		disable_seccomp();
+		return;
+	}
+	if (ksu_is_allow_uid_for_current(new_uid)) {
+		disable_seccomp();
+		return;
+	}
+
+	// Handle kernel umount
+	ksu_handle_umount(new, old);
+}
diff --git a/drivers/kernelsu/hook/syscall_table_hook_arm.c b/drivers/kernelsu/hook/syscall_table_hook_arm.c
new file mode 100644
index 000000000000..996b3da89a06
--- /dev/null
+++ b/drivers/kernelsu/hook/syscall_table_hook_arm.c
@@ -0,0 +1,398 @@
+#ifndef CONFIG_ARM
+#error "only meant for ARM"
+#endif
+
+// ref: https://elixir.bootlin.com/linux/v4.14.1/source/include/uapi/asm-generic/unistd.h
+// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd32.h
+// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd.h
+
+#define __ARMEABI_reboot	88
+#define __ARMEABI_execve	11
+#define __ARMEABI_faccessat	334
+#define __ARMEABI_fstatat64	327
+#define __ARMEABI_fstat64	197
+#define __ARMEABI_read		3
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0)
+
+// on 4.19+ it is no longer just a void *sys_call_table[]
+// it becomes syscall_fn_t sys_call_table[]; NOTE(review): confirm arch/arm really has syscall_fn_t and pt_regs->regs[]
+
+static syscall_fn_t armeabi_reboot __read_mostly = NULL;
+static long hook_armeabi_reboot(const struct pt_regs *regs)
+{
+	// the four reboot arguments come from the first four register slots
+	void __user **arg = (void __user **)&regs->regs[3];
+	unsigned int cmd = (unsigned int)regs->regs[2];
+	int magic2 = (int)regs->regs[1];
+	int magic1 = (int)regs->regs[0];
+	ksu_handle_sys_reboot(magic1, magic2, cmd, arg);
+	return armeabi_reboot(regs); // then the saved original handler
+}
+
+static syscall_fn_t armeabi_execve __read_mostly = NULL;
+__attribute__((hot))
+static long hook_armeabi_execve(const struct pt_regs *regs)
+{
+	// the handler receives pointers to the register slots of filename,
+	// argv and envp, not copies of their values
+	ksu_handle_execve((const char __user **)&regs->regs[0],
+			  (void ***)&regs->regs[1],
+			  (void ***)&regs->regs[2]);
+	return armeabi_execve(regs);
+}
+
+static syscall_fn_t armeabi_faccessat __read_mostly = NULL;
+__attribute__((hot))
+static long hook_armeabi_faccessat(const struct pt_regs *regs)
+{
+	// only the filename slot (second argument) is forwarded; the rest is NULL
+	ksu_handle_faccessat(NULL, (const char __user **)&regs->regs[1], NULL, NULL);
+
+	return armeabi_faccessat(regs);
+}
+
+static syscall_fn_t armeabi_fstatat64 __read_mostly = NULL;
+__attribute__((hot))
+static long hook_armeabi_fstatat64(const struct pt_regs *regs)
+{
+	// only the filename slot (second argument) is forwarded; the rest is NULL
+	ksu_handle_stat(NULL, (const char __user **)&regs->regs[1], NULL);
+
+	return armeabi_fstatat64(regs);
+}
+
+static syscall_fn_t armeabi_fstat64 __read_mostly = NULL;
+__attribute__((cold))
+static long hook_armeabi_fstat64_ret(const struct pt_regs *regs)
+{
+	// we handle it like rp: the original syscall runs first, the handler
+	// is called afterwards, and the syscall's result is returned as is
+	long ret = armeabi_fstat64(regs);
+
+	ksu_handle_fstat64_ret((unsigned long *)&regs->regs[0],
+			       (struct stat64 __user **)&regs->regs[1]);
+	return ret;
+}
+
+static syscall_fn_t armeabi_read __read_mostly = NULL;
+__attribute__((cold))
+static long hook_armeabi_read(const struct pt_regs *regs)
+{
+	// the handler only gets the fd, which is the first argument
+	ksu_handle_sys_read_fd((unsigned int)regs->regs[0]);
+
+	return armeabi_read(regs);
+}
+
+#else // END OF 4.19+ SYSCALL HANDLERS
+
+/**
+ *  for legacy syscall abi, we straight up call the syscall symbol
+ *  this is easier and maybe a little bit faster
+ *
+ */
+ 
+extern void *sys_call_table[];
+
+static uintptr_t armeabi_reboot __read_mostly = 0; // integer type, so 0 rather than NULL
+static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg)
+{
+	ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); // arg is passed by address
+	return sys_reboot(magic1, magic2, cmd, arg);
+}
+
+static uintptr_t armeabi_execve __read_mostly = 0; // integer type, so 0 rather than NULL
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)
+__attribute__((hot))
+static long hook_armeabi_execve(const char __user * filename,
+				const char __user *const __user * argv,
+				const char __user *const __user * envp)
+{
+	ksu_handle_execve(&filename, (void ***)&argv, (void ***)&envp); // passed by address, so the call below sees any change
+	return sys_execve(filename, argv, envp);
+}
+
+#else /* sys_execve_oabi */
+
+/**
+ *  on 3.0 / 3.4 ARM, sys_execve sc entry accepts 3 args (r0, r1, r2)
+ *  however, sys_execve on that version, needs 4. the kernel does this small wrapper
+ *  where it puts sp + 8 on r3. without it, hook won't work.
+ *
+ * // arch/arm/kernel/entry-common.S
+ *
+ * sys_execve_wrapper:
+ *		add	r3, sp, #S_OFF
+ *		b	sys_execve
+ * ENDPROC(sys_execve_wrapper)
+ *
+ */
+#include <asm/ptrace.h>
+
+__attribute__((used, noipa))
+static long hook_sys_execve(const char __user *filenamei,
+			  const char __user *const __user *argv,
+			  const char __user *const __user *envp, struct pt_regs *regs)
+{
+	ksu_handle_execve(&filenamei, (void ***)&argv, (void ***)&envp); // passed by address, so the call below sees any change
+	return sys_execve(filenamei, argv, envp, regs); // regs arrives in r3, set up by the naked wrapper
+}
+
+#define S_OFF "8"
+__attribute__((naked))
+static noinline void hook_armeabi_execve()
+{
+	asm volatile(
+		"add r3, sp, #" S_OFF "\n" // r3 = sp + 8, mirroring sys_execve_wrapper
+		"b   hook_sys_execve\n" // plain branch, no link: hook_sys_execve returns to our caller
+	);
+}
+
+#endif /* sys_execve_oabi */
+
+
+static uintptr_t armeabi_faccessat __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((hot))
+static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode)
+{
+	ksu_handle_faccessat(&dfd, &filename, &mode, NULL); // by address, so the call below sees any change
+	return sys_faccessat(dfd, filename, mode);
+}
+
+static uintptr_t armeabi_fstatat64 __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((hot))
+static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag)
+{
+	ksu_handle_stat(&dfd, &filename, &flag); // by address, so the call below sees any change
+	return sys_fstatat64(dfd, filename, statbuf, flag);
+}
+
+static uintptr_t armeabi_fstat64 __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((cold))
+static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf)
+{
+	// we handle it like rp: syscall first, handler afterwards
+	long ret = sys_fstat64(fd, statbuf);
+	ksu_handle_fstat64_ret(&fd, &statbuf);
+	return ret;
+}
+
+static uintptr_t armeabi_read __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((cold))
+static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count)
+{
+	ksu_handle_sys_read_fd(fd); // the handler only gets the fd
+	return sys_read(fd, buf, count);
+}
+
+#endif // SYSCALL HANDLERS
+
+// 'vmapping for writable' idea copied from upstream's LSM_HOOK_HACK, override_security_head
+// no more "Unable to handle kernel write to read-only memory at virtual address ffffffuckyou"
+
+// WARNING!!! void * abuse ahead! (type-punning, pointer-hiding!)
+// for 4.19+ old_ptr is actually syscall_fn_t *, which is just long * so we can consider this void **
+// for 4.19- old_ptr is actually void **
+// target_table is void *target_table[];
+static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table)
+{
+	// Store the handler currently in slot syscall_nr of target_table into
+	// *old_ptr, then write new_ptr into that slot through a vmap alias.
+	void **sctable = (void **)target_table;
+	void **syscall_slot_addr = &sctable[syscall_nr];
+	unsigned long irq_flags;
+
+	if (!*syscall_slot_addr)
+		return;
+
+	pr_info("%s: hooking syscall #%lu at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr);
+
+	/*
+	 * basically the trick is
+	 * addr, say 0xffff1234, this is READ-ONLY
+	 * align it, 0xffff0000
+	 * ptrdiff 0xffff1234 - 0xffff0000, 0x00001234
+	 * vmap 0xffff0000, say we get 0xcccc0000 , now WRITABLE
+	 * write on 0xcccc0000 + 0x00001234
+	 *
+	 */
+
+	// prep vmap alias
+	unsigned long addr = (unsigned long)syscall_slot_addr;
+	unsigned long base = addr & PAGE_MASK;
+	unsigned long offset = addr & ~PAGE_MASK; // offset_in_page
+
+	// this is impossible for our case because the page alignment
+	// but be careful for other cases!
+	// BUG_ON(offset + len > PAGE_SIZE);
+	if (offset + sizeof(void *) > PAGE_SIZE) {
+		pr_info("%s: syscall slot crosses page boundary! aborting.\n", __func__);
+		return;
+	}
+
+	// virtual mapping of a physical page
+	struct page *page = phys_to_page(__pa(base));
+	if (!page)
+		return;
+
+	// create a writable address that maps the same physical page
+	void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+	if (!writable_addr)
+		return;
+
+	// swap on the alias
+	void **target_slot = (void **)((unsigned long)writable_addr + offset);
+
+	// save / restore the IRQ state instead of forcing IRQs on at the end
+	preempt_disable();
+	local_irq_save(irq_flags);
+	*(void **)old_ptr = *target_slot;
+	*target_slot = new_ptr;
+	smp_mb(); // ^^
+
+	local_irq_restore(irq_flags);
+	preempt_enable();
+	vunmap(writable_addr);
+	smp_mb();
+}
+
+static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table)
+{
+	// If slot syscall_nr of target_table still holds new_ptr, write the
+	// handler saved in *old_ptr back into it and reset *old_ptr to NULL.
+	void **sctable = (void **)target_table;
+	void **syscall_slot_addr = &sctable[syscall_nr];
+	unsigned long irq_flags;
+
+	if (!*syscall_slot_addr)
+		return;
+
+	/*
+	 * we do this to make sure that old_ptr is filled.
+	 * we risk a dead syscall !!!
+	 * if read_and_replace failed or we restore again, it wont be pointing to anything
+	 * it just copies a word of whatever *old_ptr points at, which must be readable
+	 * yeah it really just dummy copies machine instructions at this point.
+	 *
+	 * normally we use probe_kernel_address / get_kernel_nofault here but the API is
+	 * so inconsistent across kernel versions, and since its just a dummied wrapper
+	 * for copy_from_kernel_nofault we can do it ourselves
+	 *
+	 */
+
+	long dummy = 0;
+	if (copy_from_kernel_nofault((void *)&dummy, *(void **)old_ptr, sizeof(long)))
+		return;
+
+	pr_info("%s: restore syscall #%lu at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr);
+
+	// prep vmap alias
+	unsigned long addr = (unsigned long)syscall_slot_addr;
+	unsigned long base = addr & PAGE_MASK;
+	unsigned long offset = addr & ~PAGE_MASK; // offset_in_page
+
+	// this is impossible for our case because the page alignment
+	// but be careful for other cases!
+	// BUG_ON(offset + len > PAGE_SIZE);
+	if (offset + sizeof(void *) > PAGE_SIZE) {
+		pr_info("%s: syscall slot crosses page boundary! aborting.\n", __func__);
+		return;
+	}
+
+	// virtual mapping of a physical page
+	struct page *page = phys_to_page(__pa(base));
+	if (!page)
+		return;
+
+	// create a writable address that maps the same physical page
+	void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+	if (!writable_addr)
+		return;
+
+	// swap on the alias
+	void **target_slot = (void **)((unsigned long)writable_addr + offset);
+
+	// check if its ours
+	if (*target_slot != new_ptr) {
+		pr_info("%s: syscall is not ours!\n", __func__);
+		goto out;
+	}
+
+	pr_info("%s: syscall is ours! *target_slot: 0x%lx new_ptr: 0x%lx\n", __func__, (long)*target_slot, (long)new_ptr );
+
+	preempt_disable();
+	local_irq_save(irq_flags); // restored below, IRQs are not forced on
+	*target_slot = *(void **)old_ptr;
+	smp_mb(); // ^^
+
+	*(void **)old_ptr = NULL; // explicit reset
+	local_irq_restore(irq_flags);
+	preempt_enable();
+
+out:
+	vunmap(writable_addr);
+	smp_mb();
+}
+
+static int ksu_syscall_table_restore(void *data)
+{
+	// kthread body; data is the (unused) argument handed over by kthread_run()
+	set_user_nice(current, 19); // low prio
+
+	// poll once per second until ksu_vfs_read_hook reads false
+	do {
+		msleep(1000);
+	} while (*(volatile bool *)&ksu_vfs_read_hook);
+
+	// put the original fstat64 and read handlers back
+	restore_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table);
+	restore_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)sys_call_table);
+
+	return 0;
+}
+
+static DEFINE_MUTEX(sucompat_toggle_mutex);
+
+static void syscall_table_sucompat_enable(void)
+{
+	mutex_lock(&sucompat_toggle_mutex); // shared with the disable path
+	read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table);
+	read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table);
+	read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table);
+	mutex_unlock(&sucompat_toggle_mutex);
+}
+
+static void syscall_table_sucompat_disable(void)
+{
+	mutex_lock(&sucompat_toggle_mutex); // shared with the enable path
+	restore_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table);
+	restore_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table);
+	restore_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table);
+	mutex_unlock(&sucompat_toggle_mutex);
+}
+
+static __init int ksu_syscall_table_hook_init(void)
+{
+	// enable on init!
+	syscall_table_sucompat_enable();
+
+	read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)sys_call_table);
+
+	// theres an issue on fstat64 on oabi, so lets not hook it
+	// this is not that much of a loss since 3.0 / 3.4 devices aren't really running A17
+	// TODO: fix and handle this
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)
+	read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table);
+#endif
+
+	read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)sys_call_table);
+
+	// start the kthread that later restores the fstat64 and read slots
+	kthread_run(ksu_syscall_table_restore, NULL, "unhook");
+	return 0;
+}
+device_initcall_sync(ksu_syscall_table_hook_init);
+
+// EOF
diff --git a/drivers/kernelsu/hook/syscall_table_hook_arm64.c b/drivers/kernelsu/hook/syscall_table_hook_arm64.c
new file mode 100644
index 000000000000..ced382be024c
--- /dev/null
+++ b/drivers/kernelsu/hook/syscall_table_hook_arm64.c
@@ -0,0 +1,513 @@
+#ifndef CONFIG_ARM64
+#error "only meant for ARM64"
+#endif
+
+// ref: https://elixir.bootlin.com/linux/v4.14.1/source/include/uapi/asm-generic/unistd.h
+// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd32.h
+// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd.h
+
+#define __AARCH64_reboot	142
+#define __AARCH64_execve	221
+#define __AARCH64_faccessat	48
+#define __AARCH64_newfstatat	79
+#define __AARCH64_newfstat	80
+#define __AARCH64_read		63
+
+// NOTE: CONFIG_COMPAT implies __ARCH_WANT_COMPAT_STAT64 (fstatat64, fstat64)
+#define __ARMEABI_reboot	88
+#define __ARMEABI_execve	11
+#define __ARMEABI_faccessat	334
+#define __ARMEABI_fstatat64	327
+#define __ARMEABI_fstat64	197
+#define __ARMEABI_read		3
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0)
+
+// on 4.19+ it is no longer just a void *sys_call_table[]
+// it becomes syscall_fn_t sys_call_table[];
+
+static syscall_fn_t aarch64_reboot __read_mostly = NULL;
+static long hook_aarch64_reboot(const struct pt_regs *regs)
+{
+	// the four reboot arguments come from the first four register slots
+	void __user **arg = (void __user **)&regs->regs[3];
+	unsigned int cmd = (unsigned int)regs->regs[2];
+	int magic2 = (int)regs->regs[1];
+	int magic1 = (int)regs->regs[0];
+	ksu_handle_sys_reboot(magic1, magic2, cmd, arg);
+	return aarch64_reboot(regs); // then the saved original handler
+}
+
+static syscall_fn_t aarch64_execve __read_mostly = NULL;
+__attribute__((hot))
+static long hook_aarch64_execve(const struct pt_regs *regs)
+{
+	// the handler receives pointers to the register slots of filename,
+	// argv and envp, not copies of their values
+	ksu_handle_execve((const char __user **)&regs->regs[0],
+			  (void ***)&regs->regs[1],
+			  (void ***)&regs->regs[2]);
+	return aarch64_execve(regs);
+}
+
+static syscall_fn_t aarch64_faccessat __read_mostly = NULL;
+__attribute__((hot))
+static long hook_aarch64_faccessat(const struct pt_regs *regs)
+{
+	// only the filename slot (second argument) is forwarded; the rest is NULL
+	ksu_handle_faccessat(NULL, (const char __user **)&regs->regs[1], NULL, NULL);
+
+	return aarch64_faccessat(regs);
+}
+
+static syscall_fn_t aarch64_newfstatat __read_mostly = NULL;
+__attribute__((hot))
+static long hook_aarch64_newfstatat(const struct pt_regs *regs)
+{
+	// only the filename slot (second argument) is forwarded; the rest is NULL
+	ksu_handle_stat(NULL, (const char __user **)&regs->regs[1], NULL);
+
+	return aarch64_newfstatat(regs);
+}
+
+static syscall_fn_t aarch64_newfstat __read_mostly = NULL;
+__attribute__((cold))
+static long hook_aarch64_newfstat_ret(const struct pt_regs *regs)
+{
+	// we handle it like rp: the original syscall runs first, the handler
+	// is called afterwards, and the syscall's result is returned as is
+	long ret = aarch64_newfstat(regs);
+
+	ksu_handle_newfstat_ret((unsigned int *)&regs->regs[0],
+				(struct stat __user **)&regs->regs[1]);
+	return ret;
+}
+
+static syscall_fn_t aarch64_read __read_mostly = NULL;
+__attribute__((cold))
+static long hook_aarch64_read(const struct pt_regs *regs)
+{
+	// the handler only gets the fd, which is the first argument
+	ksu_handle_sys_read_fd((unsigned int)regs->regs[0]);
+
+	return aarch64_read(regs);
+}
+
+#ifdef CONFIG_COMPAT
+static syscall_fn_t armeabi_reboot __read_mostly = NULL;
+static long hook_armeabi_reboot(const struct pt_regs *regs)
+{
+	// the four reboot arguments come from the first four register slots
+	void __user **arg = (void __user **)&regs->regs[3];
+	unsigned int cmd = (unsigned int)regs->regs[2];
+	int magic2 = (int)regs->regs[1];
+	int magic1 = (int)regs->regs[0];
+	ksu_handle_sys_reboot(magic1, magic2, cmd, arg);
+	return armeabi_reboot(regs); // then the saved original handler
+}
+
+static syscall_fn_t armeabi_execve __read_mostly = NULL;
+__attribute__((hot))
+static long hook_armeabi_execve(const struct pt_regs *regs)
+{
+	// the handler receives pointers to the register slots of filename,
+	// argv and envp, not copies of their values
+	ksu_handle_execve((const char __user **)&regs->regs[0],
+			  (void ***)&regs->regs[1],
+			  (void ***)&regs->regs[2]);
+	return armeabi_execve(regs);
+}
+
+static syscall_fn_t armeabi_faccessat __read_mostly = NULL;
+__attribute__((hot))
+static long hook_armeabi_faccessat(const struct pt_regs *regs)
+{
+	// only the filename slot (second argument) is forwarded; the rest is NULL
+	ksu_handle_faccessat(NULL, (const char __user **)&regs->regs[1], NULL, NULL);
+
+	return armeabi_faccessat(regs);
+}
+
+static syscall_fn_t armeabi_fstatat64 __read_mostly = NULL;
+__attribute__((hot))
+static long hook_armeabi_fstatat64(const struct pt_regs *regs)
+{
+	// only the filename slot (second argument) is forwarded; the rest is NULL
+	ksu_handle_stat(NULL, (const char __user **)&regs->regs[1], NULL);
+
+	return armeabi_fstatat64(regs);
+}
+
+static syscall_fn_t armeabi_fstat64 __read_mostly = NULL;
+__attribute__((cold))
+static long hook_armeabi_fstat64_ret(const struct pt_regs *regs)
+{
+	// we handle it like rp: the original syscall runs first, the handler
+	// is called afterwards, and the syscall's result is returned as is
+	long ret = armeabi_fstat64(regs);
+
+	ksu_handle_fstat64_ret((unsigned long *)&regs->regs[0],
+			       (struct stat64 __user **)&regs->regs[1]);
+	return ret;
+}
+
+static syscall_fn_t armeabi_read __read_mostly = NULL;
+__attribute__((cold))
+static long hook_armeabi_read(const struct pt_regs *regs)
+{
+	// the handler only gets the fd, which is the first argument
+	ksu_handle_sys_read_fd((unsigned int)regs->regs[0]);
+
+	return armeabi_read(regs);
+}
+
+#endif // CONFIG_COMPAT
+
+#else // END OF 4.19+ SYSCALL HANDLERS
+
+/**
+ *  for legacy syscall abi, we straight up call the syscall symbol
+ *  this is easier and maybe a little bit faster
+ *
+ */
+
+static uintptr_t aarch64_reboot __read_mostly = 0; // integer type, so 0 rather than NULL
+static long hook_aarch64_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg)
+{
+	ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); // arg is passed by address
+	return sys_reboot(magic1, magic2, cmd, arg);
+}
+
+static uintptr_t aarch64_execve __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((hot))
+static long hook_aarch64_execve(const char __user * filename,
+				const char __user *const __user * argv,
+				const char __user *const __user * envp)
+{
+	ksu_handle_execve(&filename, (void ***)&argv, (void ***)&envp); // by address, so the call below sees any change
+	return sys_execve(filename, argv, envp);
+}
+
+static uintptr_t aarch64_faccessat __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((hot))
+static long hook_aarch64_faccessat(int dfd, const char __user * filename, int mode)
+{
+	ksu_handle_faccessat(&dfd, &filename, &mode, NULL); // by address, so the call below sees any change
+	return sys_faccessat(dfd, filename, mode);
+}
+
+static uintptr_t aarch64_newfstatat __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((hot))
+static long hook_aarch64_newfstatat(int dfd, const char __user * filename, struct stat __user * statbuf, int flag)
+{
+	ksu_handle_stat(&dfd, &filename, &flag); // by address, so the call below sees any change
+	return sys_newfstatat(dfd, filename, statbuf, flag);
+}
+
+static uintptr_t aarch64_newfstat __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((cold))
+static long hook_aarch64_newfstat_ret(unsigned int fd, struct stat __user * statbuf)
+{
+	// we handle it like rp: syscall first, handler afterwards
+	long ret = sys_newfstat(fd, statbuf);
+	ksu_handle_newfstat_ret(&fd, &statbuf);
+	return ret;
+}
+
+static uintptr_t aarch64_read __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((cold))
+static long hook_aarch64_read(unsigned int fd, char __user *buf, size_t count)
+{
+	ksu_handle_sys_read_fd(fd); // the handler only gets the fd
+	return sys_read(fd, buf, count);
+}
+
+#ifdef CONFIG_COMPAT
+extern const void *compat_sys_call_table[];
+
+static uintptr_t armeabi_reboot __read_mostly = 0; // integer type, so 0 rather than NULL
+static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg)
+{
+	ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); // arg is passed by address
+	return sys_reboot(magic1, magic2, cmd, arg);
+}
+
+static uintptr_t armeabi_execve __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((hot))
+static long hook_armeabi_execve(const char __user * filename,
+				const compat_uptr_t __user * argv,
+				const compat_uptr_t __user * envp)
+{
+	ksu_handle_execve(&filename, (void ***)&argv, (void ***)&envp); // by address, so the call below sees any change
+	return compat_sys_execve(filename, argv, envp);
+}
+
+static uintptr_t armeabi_faccessat __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((hot))
+static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode)
+{
+	ksu_handle_faccessat(&dfd, &filename, &mode, NULL); // by address, so the call below sees any change
+	return sys_faccessat(dfd, filename, mode);
+}
+
+static uintptr_t armeabi_fstatat64 __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((hot))
+static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag)
+{
+	ksu_handle_stat(&dfd, &filename, &flag); // by address, so the call below sees any change
+	return sys_fstatat64(dfd, filename, statbuf, flag);
+}
+
+static uintptr_t armeabi_fstat64 __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((cold))
+static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf)
+{
+	// we handle it like rp: syscall first, handler afterwards
+	long ret = sys_fstat64(fd, statbuf);
+	ksu_handle_fstat64_ret(&fd, &statbuf);
+	return ret;
+}
+
+static uintptr_t armeabi_read __read_mostly = 0; // integer type, so 0 rather than NULL
+__attribute__((cold))
+static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count)
+{
+	ksu_handle_sys_read_fd(fd); // the handler only gets the fd
+	return sys_read(fd, buf, count);
+}
+
+#endif // CONFIG_COMPAT
+
+#endif // SYSCALL HANDLERS
+
+// The 'vmap an alias to get a writable view' idea is copied from upstream's LSM_HOOK_HACK (override_security_head).
+// It avoids "Unable to handle kernel write to read-only memory at virtual address ..." oopses.
+
+// WARNING!!! void * abuse ahead! (type-punning, pointer-hiding!)
+// on 4.19+ old_ptr is actually syscall_fn_t *, which is just long *, so we can treat it as void **
+// before 4.19 old_ptr is actually void **
+// target_table is void *target_table[];
+static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table)
+{
+	/*
+	 * Save the current handler of target_table[syscall_nr] into *old_ptr and
+	 * install new_ptr in its place. The table is read-only, so the store goes
+	 * through a temporary writable vmap alias of the page holding the slot.
+	 * Every failure path returns before *old_ptr or the table is written.
+	 */
+	void **sctable = (void **)target_table;
+	void **syscall_slot_addr = &sctable[syscall_nr];
+
+	// empty slot: nothing to hook. already ours: a second swap would save our own hook as the "original"
+	if (!*syscall_slot_addr || *syscall_slot_addr == new_ptr)
+		return;
+
+	pr_info("%s: hooking syscall #%lu at 0x%lx\n", __func__, syscall_nr, (unsigned long)syscall_slot_addr);
+
+	/*
+	 * basically the trick is
+	 * addr, say 0xffff1234, this is READ-ONLY
+	 * align it, 0xffff0000
+	 * ptrdiff 0xffff1234 - 0xffff0000, 0x00001234
+	 * vmap 0xffff0000, say we get 0xcccc0000 , now WRITABLE
+	 * write on 0xcccc0000 + 0x00001234
+	 */
+
+	// prep vmap alias
+	unsigned long addr = (unsigned long)syscall_slot_addr;
+	unsigned long base = addr & PAGE_MASK;
+	unsigned long offset = addr & ~PAGE_MASK; // offset_in_page
+
+	// only one page gets mapped below; an aligned slot never straddles two, but check anyway
+	if (offset + sizeof(void *) > PAGE_SIZE) {
+		pr_info("%s: syscall slot crosses page boundary! aborting.\n", __func__);
+		return;
+	}
+
+	// page backing the read-only slot
+	struct page *page = phys_to_page(__pa(base));
+	if (!page)
+		return;
+
+	// create a "writable address" which is mapped to the same page
+	void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+	if (!writable_addr)
+		return;
+
+	void **target_slot = (void **)((unsigned long)writable_addr + offset);
+
+	// swap on the alias, with preemption and local interrupts off
+	preempt_disable();
+	local_irq_disable();
+	*(void **)old_ptr = *target_slot;
+	*target_slot = new_ptr;
+	smp_mb(); // ^^
+	local_irq_enable();
+	preempt_enable();
+
+	vunmap(writable_addr);
+	smp_mb();
+}
+
+static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table)
+{
+	/*
+	 * Undo read_and_replace_syscall(): if target_table[syscall_nr] still holds
+	 * new_ptr, write the handler saved in *old_ptr back and clear *old_ptr.
+	 * The store goes through a temporary writable vmap alias of the slot's page.
+	 */
+	void **sctable = (void **)target_table;
+	void **syscall_slot_addr = &sctable[syscall_nr];
+
+	if (!*syscall_slot_addr)
+		return;
+
+	/*
+	 * we do this to make sure that old_ptr is filled.
+	 * we risk a dead syscall !!!
+	 * if read_and_replace failed or we restore again, it won't be pointing to anything.
+	 * it just copies one word of whatever *old_ptr points at (there should be at least
+	 * a word there); yeah, it really just dummy-copies machine instructions at this point.
+	 *
+	 * normally we use probe_kernel_address / get_kernel_nofault here but the API is
+	 * so inconsistent across kernel versions, and since it's just a dummied wrapper
+	 * for copy_from_kernel_nofault we can do it ourselves
+	 */
+
+	long dummy = 0;
+	if (copy_from_kernel_nofault((void *)&dummy, *(void **)old_ptr, sizeof(long)))
+		return;
+
+	pr_info("%s: restore syscall #%lu at 0x%lx\n", __func__, syscall_nr, (unsigned long)syscall_slot_addr);
+
+	// prep vmap alias
+	unsigned long addr = (unsigned long)syscall_slot_addr;
+	unsigned long base = addr & PAGE_MASK;
+	unsigned long offset = addr & ~PAGE_MASK; // offset_in_page
+
+	// only one page gets mapped below; an aligned slot never straddles two, but check anyway
+	if (offset + sizeof(void *) > PAGE_SIZE) {
+		pr_info("%s: syscall slot crosses page boundary! aborting.\n", __func__);
+		return;
+	}
+
+	// page backing the read-only slot
+	struct page *page = phys_to_page(__pa(base));
+	if (!page)
+		return;
+
+	// create a "writable address" which is mapped to the same page
+	void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+	if (!writable_addr)
+		return;
+
+	void **target_slot = (void **)((unsigned long)writable_addr + offset);
+
+	// check if it's ours: never clobber a slot that no longer holds our hook
+	if (*target_slot != new_ptr) {
+		pr_info("%s: syscall is not ours!\n", __func__);
+		goto out;
+	}
+
+	pr_info("%s: syscall is ours! *target_slot: 0x%lx new_ptr: 0x%lx\n", __func__, (unsigned long)*target_slot, (unsigned long)new_ptr);
+
+	// put the original back, with preemption and local interrupts off
+	preempt_disable();
+	local_irq_disable();
+
+	*target_slot = *(void **)old_ptr;
+	smp_mb(); // ^^
+	*(void **)old_ptr = NULL; // explicit reset; the probe above is expected to reject a repeated restore
+	local_irq_enable();
+	preempt_enable();
+
+out:
+	vunmap(writable_addr);
+
+	smp_mb();
+}
+
+static int ksu_syscall_table_restore(void *unused)
+{
+	// kthread entry point: kthread_run() invokes it as int (*)(void *), so take the (unused) data pointer
+	set_user_nice(current, 19); // low prio
+
+	// poll once a second until ksu_vfs_read_hook reads false
+	do {
+		msleep(1000);
+	} while (*(volatile bool *)&ksu_vfs_read_hook);
+
+	// then take the newfstat/read hooks back out of the native table...
+	restore_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table);
+	restore_syscall((void *)&aarch64_read, __AARCH64_read, (void *)hook_aarch64_read, (void *)sys_call_table);
+
+#if defined(CONFIG_COMPAT)
+	restore_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table);
+	restore_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table);
+#endif
+
+	return 0;
+}
+
+static DEFINE_MUTEX(sucompat_toggle_mutex);
+
+static void syscall_table_sucompat_enable(void)
+{
+	mutex_lock(&sucompat_toggle_mutex); // enable and disable are serialized by this mutex
+	// native table: execve, faccessat and newfstatat go through the KernelSU hooks
+	read_and_replace_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, (void *)sys_call_table);
+	read_and_replace_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table);
+	read_and_replace_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table);
+
+#if defined(CONFIG_COMPAT)
+	// 32-bit compat table: execve, faccessat and fstatat64
+	read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)compat_sys_call_table);
+	read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)compat_sys_call_table);
+	read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table);
+#endif
+	mutex_unlock(&sucompat_toggle_mutex);
+}
+
+static void syscall_table_sucompat_disable(void)
+{
+	mutex_lock(&sucompat_toggle_mutex); // enable and disable are serialized by this mutex
+	// native table: put back execve, faccessat and newfstatat (only slots still holding our hook are touched)
+	restore_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, (void *)sys_call_table);
+	restore_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table);
+	restore_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table);
+
+#if defined(CONFIG_COMPAT)
+	// 32-bit compat table: execve, faccessat and fstatat64
+	restore_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)compat_sys_call_table);
+	restore_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)compat_sys_call_table);
+	restore_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table);
+#endif
+	mutex_unlock(&sucompat_toggle_mutex);
+}
+
+static __init int ksu_syscall_table_hook_init(void)
+{
+	struct task_struct *unhook_task;
+
+	// late initcall: install every hook. sucompat ones first (enable on init!)
+	syscall_table_sucompat_enable();
+	read_and_replace_syscall((void *)&aarch64_reboot, __AARCH64_reboot, (void *)hook_aarch64_reboot, (void *)sys_call_table);
+	// will be unregged
+	read_and_replace_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table);
+	read_and_replace_syscall((void *)&aarch64_read, __AARCH64_read, (void *)hook_aarch64_read, (void *)sys_call_table);
+#if defined(CONFIG_COMPAT)
+	read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)compat_sys_call_table);
+	// will be unregged
+	read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table);
+	read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table);
+#endif // COMPAT
+
+	// start unreg kthread; if it cannot be created, the "will be unregged" hooks simply stay installed
+	unhook_task = kthread_run(ksu_syscall_table_restore, NULL, "unhook");
+	if (IS_ERR(unhook_task))
+		pr_err("%s: failed to start unhook kthread: %ld\n", __func__, PTR_ERR(unhook_task));
+	return 0;
+}
+late_initcall(ksu_syscall_table_hook_init);
+
+// EOF
diff --git a/drivers/kernelsu/include/arch.h b/drivers/kernelsu/include/arch.h
new file mode 100644
index 000000000000..c80db6632efa
--- /dev/null
+++ b/drivers/kernelsu/include/arch.h
@@ -0,0 +1,134 @@
+#ifndef __KSU_H_ARCH
+#define __KSU_H_ARCH
+
+#if defined(__aarch64__)
+
+#define __PT_PARM1_REG regs[0]
+#define __PT_PARM2_REG regs[1]
+#define __PT_PARM3_REG regs[2]
+#define __PT_SYSCALL_PARM4_REG regs[3]
+#define __PT_CCALL_PARM4_REG regs[3]
+#define __PT_PARM5_REG regs[4]
+#define __PT_PARM6_REG regs[5]
+#define __PT_RET_REG regs[30]
+#define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */
+#define __PT_RC_REG regs[0]
+#define __PT_SP_REG sp
+#define __PT_IP_REG pc
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
+#define SYS_EXECVE_SYMBOL "__arm64_sys_execve"
+#define SYS_REBOOT_SYMBOL "__arm64_sys_reboot"
+#define SYS_NEWFSTAT_SYMBOL "__arm64_sys_newfstat"
+#define SYS_FSTAT64_SYMBOL "__arm64_sys_fstat64"
+#define SYS_READ_SYMBOL "__arm64_sys_read"
+#define SYS_NEWFSTATAT_SYMBOL "__arm64_sys_newfstatat"
+#define SYS_FACCESSAT_SYMBOL "__arm64_sys_faccessat"
+#else
+#define SYS_EXECVE_SYMBOL "sys_execve"
+#define SYS_REBOOT_SYMBOL "sys_reboot"
+#define SYS_NEWFSTAT_SYMBOL "sys_newfstat"
+#define SYS_FSTAT64_SYMBOL "sys_fstat64"
+#define SYS_READ_SYMBOL "sys_read"
+#define SYS_NEWFSTATAT_SYMBOL "sys_newfstatat"
+#define SYS_FACCESSAT_SYMBOL "sys_faccessat"
+#endif
+
+#elif defined(__arm__)
+
+// https://elixir.bootlin.com/linux/v6.17-rc6/source/tools/lib/bpf/bpf_tracing.h
+#define __PT_PARM1_REG uregs[0]
+#define __PT_PARM2_REG uregs[1]
+#define __PT_PARM3_REG uregs[2]
+#define __PT_PARM4_REG uregs[3]
+
+// seems to work at least on 3.0 on a Samsung Galaxy S3
+// not verified on anything else
+#define __PT_SYSCALL_PARM4_REG uregs[3] 
+#define __PT_CCALL_PARM4_REG uregs[3]
+
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM5_SYSCALL_REG uregs[4]
+#define __PT_PARM6_SYSCALL_REG uregs[5]
+#define __PT_PARM7_SYSCALL_REG uregs[6]
+
+#define __PT_RET_REG uregs[14]
+#define __PT_FP_REG uregs[11]	/* Works only with CONFIG_FRAME_POINTER */
+#define __PT_RC_REG uregs[0]
+#define __PT_SP_REG uregs[13]
+#define __PT_IP_REG uregs[12]
+
+#define SYS_EXECVE_SYMBOL "sys_execve"
+#define SYS_REBOOT_SYMBOL "sys_reboot"
+#define SYS_NEWFSTAT_SYMBOL "sys_newfstat"
+#define SYS_FSTAT64_SYMBOL "sys_fstat64"
+#define SYS_READ_SYMBOL "sys_read"
+#define SYS_NEWFSTATAT_SYMBOL "sys_newfstatat"
+#define SYS_FACCESSAT_SYMBOL "sys_faccessat"
+
+#elif defined(__x86_64__)
+
+#define __PT_PARM1_REG di
+#define __PT_PARM2_REG si
+#define __PT_PARM3_REG dx
+/* syscall uses r10 for PARM4 */
+#define __PT_SYSCALL_PARM4_REG r10
+#define __PT_CCALL_PARM4_REG cx
+#define __PT_PARM5_REG r8
+#define __PT_PARM6_REG r9
+#define __PT_RET_REG sp
+#define __PT_FP_REG bp
+#define __PT_RC_REG ax
+#define __PT_SP_REG sp
+#define __PT_IP_REG ip
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) /* syscall-wrapper kernels: arch-prefixed entry points */
+#define SYS_EXECVE_SYMBOL "__x64_sys_execve"
+#define SYS_REBOOT_SYMBOL "__x64_sys_reboot"
+#define SYS_NEWFSTAT_SYMBOL "__x64_sys_newfstat"
+#define SYS_FSTAT64_SYMBOL "__ia32_compat_sys_x86_fstat64"
+#define SYS_READ_SYMBOL "__x64_sys_read" /* both branches must define the same seven names */
+#define SYS_NEWFSTATAT_SYMBOL "__x64_sys_newfstatat"
+#define SYS_FACCESSAT_SYMBOL "__x64_sys_faccessat"
+#else /* older kernels: plain sys_* entry points */
+#define SYS_EXECVE_SYMBOL "sys_execve"
+#define SYS_REBOOT_SYMBOL "sys_reboot"
+#define SYS_NEWFSTAT_SYMBOL "sys_newfstat"
+#define SYS_FSTAT64_SYMBOL "sys_fstat64"
+#define SYS_READ_SYMBOL "sys_read"
+#define SYS_NEWFSTATAT_SYMBOL "sys_newfstatat"
+#define SYS_FACCESSAT_SYMBOL "sys_faccessat"
+#endif
+
+#else
+#error "Unsupported arch"
+#endif
+
+/* allow some architectures to override `struct pt_regs` */
+#ifndef __PT_REGS_CAST
+#define __PT_REGS_CAST(x) (x)
+#endif
+
+#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG)
+#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG)
+#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG)
+#define PT_REGS_SYSCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_SYSCALL_PARM4_REG)
+#define PT_REGS_CCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_CCALL_PARM4_REG)
+#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG)
+#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG)
+#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG)
+#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG)
+#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG)
+#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG)
+#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG)
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
+#define PT_REAL_REGS(regs) ((struct pt_regs *)PT_REGS_PARM1(regs))
+#else
+#define PT_REAL_REGS(regs) ((regs))
+#endif
+
+#endif
diff --git a/drivers/kernelsu/klog.h b/drivers/kernelsu/include/klog.h
similarity index 82%
rename from drivers/kernelsu/klog.h
rename to drivers/kernelsu/include/klog.h
index a934027fbeeb..6de40a66680e 100644
--- a/drivers/kernelsu/klog.h
+++ b/drivers/kernelsu/include/klog.h
@@ -1,8 +1,6 @@
 #ifndef __KSU_H_KLOG
 #define __KSU_H_KLOG
 
-#include <linux/printk.h>
-
 #ifdef pr_fmt
 #undef pr_fmt
 #define pr_fmt(fmt) "KernelSU: " fmt
diff --git a/drivers/kernelsu/ksu.h b/drivers/kernelsu/include/ksu.h
similarity index 75%
rename from drivers/kernelsu/ksu.h
rename to drivers/kernelsu/include/ksu.h
index 32e81d967fff..2f5841290b1d 100644
--- a/drivers/kernelsu/ksu.h
+++ b/drivers/kernelsu/include/ksu.h
@@ -1,11 +1,7 @@
 #ifndef __KSU_H_KSU
 #define __KSU_H_KSU
 
-#include <linux/types.h>
-#include <linux/workqueue.h>
-#include <linux/cred.h>
-
-#define KERNEL_SU_VERSION 32430
+#define KERNEL_SU_VERSION 32485
 
 #define EVENT_POST_FS_DATA 1
 #define EVENT_BOOT_COMPLETED 2
@@ -25,6 +21,6 @@ static inline int endswith(const char *s, const char *t)
 	return strcmp(s + slen - tlen, t);
 }
 
-extern struct cred *ksu_cred;
+extern struct cred* ksu_cred;
 
 #endif
diff --git a/drivers/kernelsu/include/uapi/app_profile.h b/drivers/kernelsu/include/uapi/app_profile.h
new file mode 100644
index 000000000000..7aa29e0f6293
--- /dev/null
+++ b/drivers/kernelsu/include/uapi/app_profile.h
@@ -0,0 +1,61 @@
+#ifndef __KSU_UAPI_APP_PROFILE_H
+#define __KSU_UAPI_APP_PROFILE_H
+
+#define KSU_APP_PROFILE_VER 3
+#define KSU_MAX_PACKAGE_NAME 256
+/* NGROUPS_MAX for Linux is 65535 generally, but we only support 32 groups. */
+#define KSU_MAX_GROUPS 32
+#define KSU_SELINUX_DOMAIN 64
+
+struct root_profile {
+	__s32 uid;
+	__s32 gid;
+
+	__u32 groups_count;
+	__s32 groups[KSU_MAX_GROUPS];
+
+	/* kernel_cap_t is u32[2] for capabilities v3 */
+	struct {
+		__u64 effective;
+		__u64 permitted;
+		__u64 inheritable;
+	} capabilities;
+
+	char selinux_domain[KSU_SELINUX_DOMAIN];
+
+	__s32 namespaces;
+};
+
+struct non_root_profile {
+	bool umount_modules;
+};
+
+struct app_profile {
+	/*
+	 * It may be utilized for backward compatibility, although we have never
+	 * explicitly made any promises regarding this.
+	 */
+	__u32 version;
+
+	/* this is usually the package of the app, but can be other value for special apps */
+	char key[KSU_MAX_PACKAGE_NAME];
+	__s32 curr_uid;
+	bool allow_su;
+
+	union {
+		struct {
+			bool use_default;
+			char template_name[KSU_MAX_PACKAGE_NAME];
+
+			struct root_profile profile;
+		} rp_config;
+
+		struct {
+			bool use_default;
+
+			struct non_root_profile profile;
+		} nrp_config;
+	};
+};
+
+#endif
diff --git a/drivers/kernelsu/include/uapi/feature.h b/drivers/kernelsu/include/uapi/feature.h
new file mode 100644
index 000000000000..b1b92f2fdc48
--- /dev/null
+++ b/drivers/kernelsu/include/uapi/feature.h
@@ -0,0 +1,14 @@
+#ifndef __KSU_UAPI_FEATURE_H
+#define __KSU_UAPI_FEATURE_H
+
+enum ksu_feature_id {
+	KSU_FEATURE_SU_COMPAT = 0,
+	KSU_FEATURE_KERNEL_UMOUNT = 1,
+	KSU_FEATURE_SULOG = 2,
+	KSU_FEATURE_ADB_ROOT = 3,
+	KSU_FEATURE_SELINUX_HIDE = 4,
+
+	KSU_FEATURE_MAX
+};
+
+#endif
diff --git a/drivers/kernelsu/include/uapi/selinux.h b/drivers/kernelsu/include/uapi/selinux.h
new file mode 100644
index 000000000000..960454f7f46a
--- /dev/null
+++ b/drivers/kernelsu/include/uapi/selinux.h
@@ -0,0 +1,29 @@
+#ifndef __KSU_UAPI_SELINUX_H
+#define __KSU_UAPI_SELINUX_H
+
+#define KSU_SEPOLICY_CMD_NORMAL_PERM 1
+#define KSU_SEPOLICY_CMD_XPERM 2
+#define KSU_SEPOLICY_CMD_TYPE_STATE 3
+#define KSU_SEPOLICY_CMD_TYPE 4
+#define KSU_SEPOLICY_CMD_TYPE_ATTR 5
+#define KSU_SEPOLICY_CMD_ATTR 6
+#define KSU_SEPOLICY_CMD_TYPE_TRANSITION 7
+#define KSU_SEPOLICY_CMD_TYPE_CHANGE 8
+#define KSU_SEPOLICY_CMD_GENFSCON 9
+
+#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_ALLOW 1
+#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DENY 2
+#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_AUDITALLOW 3
+#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DONTAUDIT 4
+
+#define KSU_SEPOLICY_SUBCMD_XPERM_ALLOW 1
+#define KSU_SEPOLICY_SUBCMD_XPERM_AUDITALLOW 2
+#define KSU_SEPOLICY_SUBCMD_XPERM_DONTAUDIT 3
+
+#define KSU_SEPOLICY_SUBCMD_TYPE_STATE_PERMISSIVE 1
+#define KSU_SEPOLICY_SUBCMD_TYPE_STATE_ENFORCE 2
+
+#define KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_CHANGE 1
+#define KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_MEMBER 2
+
+#endif
diff --git a/drivers/kernelsu/include/uapi/sulog.h b/drivers/kernelsu/include/uapi/sulog.h
new file mode 100644
index 000000000000..9453a4bd0c16
--- /dev/null
+++ b/drivers/kernelsu/include/uapi/sulog.h
@@ -0,0 +1,32 @@
+#ifndef __KSU_UAPI_SULOG_H
+#define __KSU_UAPI_SULOG_H
+
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#define KSU_SULOG_EVENT_VERSION 1
+#ifndef TASK_COMM_LEN
+#define TASK_COMM_LEN 16
+#endif
+
+enum ksu_sulog_event_type {
+	KSU_SULOG_EVENT_ROOT_EXECVE = 1,
+	KSU_SULOG_EVENT_SUCOMPAT = 2,
+	KSU_SULOG_EVENT_IOCTL_GRANT_ROOT = 3,
+};
+
+struct ksu_sulog_event {
+	__u16 version;
+	__u16 event_type;
+	__s32 retval;
+	__u32 pid;
+	__u32 tgid;
+	__u32 ppid;
+	__u32 uid;
+	__u32 euid;
+	char comm[TASK_COMM_LEN];
+	__u32 filename_len;
+	__u32 argv_len;
+} __packed;
+
+#endif
diff --git a/drivers/kernelsu/include/uapi/supercall.h b/drivers/kernelsu/include/uapi/supercall.h
new file mode 100644
index 000000000000..dbfc5f1158bd
--- /dev/null
+++ b/drivers/kernelsu/include/uapi/supercall.h
@@ -0,0 +1,162 @@
+#ifndef __KSU_UAPI_SUPERCALL_H
+#define __KSU_UAPI_SUPERCALL_H
+
+/* Magic numbers for reboot hook to install fd */
+#define KSU_INSTALL_MAGIC1 0xDEADBEEF
+#define KSU_INSTALL_MAGIC2 0xCAFEBABE
+
+struct ksu_become_daemon_cmd {
+	__u8 token[65]; /* Input: daemon token (null-terminated) */
+};
+
+#define EVENT_POST_FS_DATA 1
+#define EVENT_BOOT_COMPLETED 2
+#define EVENT_MODULE_MOUNTED 3
+
+#define KSU_GET_INFO_FLAG_LKM (1U << 0)
+#define KSU_GET_INFO_FLAG_MANAGER (1U << 1)
+#define KSU_GET_INFO_FLAG_LATE_LOAD (1U << 2)
+#define KSU_GET_INFO_FLAG_PR_BUILD (1U << 3)
+
+struct ksu_get_info_cmd {
+	__u32 version; /* Output: KERNEL_SU_VERSION */
+	__u32 flags; /* Output: KSU_GET_INFO_FLAG_* bits */
+	__u32 features; /* Output: max feature ID supported */
+};
+
+struct ksu_report_event_cmd {
+	__u32 event; /* Input: EVENT_POST_FS_DATA, EVENT_BOOT_COMPLETED, etc. */
+};
+
+struct ksu_set_sepolicy_cmd {
+	__u64 data_len; /* Input: bytes of serialized command payload */
+	__aligned_u64 data; /* Input: pointer to serialized payload */
+};
+
+struct ksu_sepolicy_cmd_hdr {
+	__u32 cmd; /* Input: command type, CMD_* */
+	__u32 subcmd; /* Input: command subtype */
+};
+/*
+ * After each ksu_sepolicy_cmd_hdr, command arguments are encoded sequentially as:
+ * [u32 len][len bytes][\0], where len excludes the trailing '\0'.
+ * len == 0 represents ALL.
+ * Argument count is derived from cmd:
+ * KSU_SEPOLICY_CMD_NORMAL_PERM=4, KSU_SEPOLICY_CMD_XPERM=5,
+ * KSU_SEPOLICY_CMD_TYPE_STATE=1, KSU_SEPOLICY_CMD_TYPE=2,
+ * KSU_SEPOLICY_CMD_TYPE_ATTR=2, KSU_SEPOLICY_CMD_ATTR=1,
+ * KSU_SEPOLICY_CMD_TYPE_TRANSITION=5, KSU_SEPOLICY_CMD_TYPE_CHANGE=4,
+ * KSU_SEPOLICY_CMD_GENFSCON=3.
+ */
+
+struct ksu_check_safemode_cmd {
+	__u8 in_safe_mode; /* Output: true if in safe mode, false otherwise */
+};
+
+/* deprecated */
+struct ksu_get_allow_list_cmd {
+	__u32 uids[128]; /* Output: array of allowed/denied UIDs */
+	__u32 count; /* Output: number of UIDs in array */
+	__u8 allow; /* Input: true for allow list, false for deny list */
+};
+
+struct ksu_new_get_allow_list_cmd {
+	__u16 count; /* Input / Output: number of UIDs in array */
+	__u16 total_count; /* Output: total number of UIDs in requested list */
+	__u32 uids[0]; /* Output: array of allowed/denied UIDs */
+};
+
+struct ksu_uid_granted_root_cmd {
+	__u32 uid; /* Input: target UID to check */
+	__u8 granted; /* Output: true if granted, false otherwise */
+};
+
+struct ksu_uid_should_umount_cmd {
+	__u32 uid; /* Input: target UID to check */
+	__u8 should_umount; /* Output: true if should umount, false otherwise */
+};
+
+struct ksu_get_manager_appid_cmd {
+	__u32 appid; /* Output: manager app id */
+};
+
+struct ksu_get_app_profile_cmd {
+	struct app_profile profile; /* Input/Output: app profile structure */
+};
+
+struct ksu_set_app_profile_cmd {
+	struct app_profile profile; /* Input: app profile structure */
+};
+
+struct ksu_get_feature_cmd {
+	__u32 feature_id; /* Input: feature ID (enum ksu_feature_id) */
+	__u64 value; /* Output: feature value/state */
+	__u8 supported; /* Output: true if feature is supported, false otherwise */
+};
+
+struct ksu_set_feature_cmd {
+	__u32 feature_id; /* Input: feature ID (enum ksu_feature_id) */
+	__u64 value; /* Input: feature value/state to set */
+};
+
+struct ksu_get_wrapper_fd_cmd {
+	__u32 fd; /* Input: userspace fd */
+	__u32 flags; /* Input: flags of userspace fd */
+};
+
+struct ksu_manage_mark_cmd {
+	__u32 operation; /* Input: KSU_MARK_* */
+	__s32 pid; /* Input: target pid (0 for all processes) */
+	__u32 result; /* Output: for get operation - mark status or reg_count */
+};
+
+#define KSU_MARK_GET 1
+#define KSU_MARK_MARK 2
+#define KSU_MARK_UNMARK 3
+#define KSU_MARK_REFRESH 4
+
+struct ksu_nuke_ext4_sysfs_cmd {
+	__aligned_u64 arg; /* Input: mnt pointer */
+};
+
+struct ksu_add_try_umount_cmd {
+	__aligned_u64 arg; /* char ptr, this is the mountpoint */
+	__u32 flags; /* this is the flag we use for it */
+	__u8 mode; /* denotes what to do with it 0:wipe_list 1:add_to_list 2:delete_entry */
+};
+
+struct ksu_get_sulog_fd_cmd {
+	__u32 flags; /* Input: reserved for future use, must be 0 */
+};
+
+#define KSU_UMOUNT_WIPE 0	// ignore everything and wipe list
+#define KSU_UMOUNT_ADD 1	// add entry (path + flags)
+#define KSU_UMOUNT_DEL 2	// delete entry, strcmp
+
+// IOCTL command definitions
+#define KSU_IOCTL_GRANT_ROOT _IOC(_IOC_NONE, 'K', 1, 0)
+#define KSU_IOCTL_GET_INFO _IOC(_IOC_READ, 'K', 2, 0)
+#define KSU_IOCTL_REPORT_EVENT _IOC(_IOC_WRITE, 'K', 3, 0)
+#define KSU_IOCTL_SET_SEPOLICY _IOC(_IOC_READ|_IOC_WRITE, 'K', 4, 0)
+#define KSU_IOCTL_CHECK_SAFEMODE _IOC(_IOC_READ, 'K', 5, 0)
+// deprecated
+#define KSU_IOCTL_GET_ALLOW_LIST _IOC(_IOC_READ|_IOC_WRITE, 'K', 6, 0)
+// deprecated
+#define KSU_IOCTL_GET_DENY_LIST _IOC(_IOC_READ|_IOC_WRITE, 'K', 7, 0)
+#define KSU_IOCTL_NEW_GET_ALLOW_LIST _IOWR('K', 6, struct ksu_new_get_allow_list_cmd)
+#define KSU_IOCTL_NEW_GET_DENY_LIST _IOWR('K', 7, struct ksu_new_get_allow_list_cmd)
+#define KSU_IOCTL_UID_GRANTED_ROOT _IOC(_IOC_READ|_IOC_WRITE, 'K', 8, 0)
+#define KSU_IOCTL_UID_SHOULD_UMOUNT _IOC(_IOC_READ|_IOC_WRITE, 'K', 9, 0)
+#define KSU_IOCTL_GET_MANAGER_APPID _IOC(_IOC_READ, 'K', 10, 0)
+#define KSU_IOCTL_GET_APP_PROFILE _IOC(_IOC_READ|_IOC_WRITE, 'K', 11, 0)
+#define KSU_IOCTL_SET_APP_PROFILE _IOC(_IOC_WRITE, 'K', 12, 0)
+#define KSU_IOCTL_GET_FEATURE _IOC(_IOC_READ|_IOC_WRITE, 'K', 13, 0)
+#define KSU_IOCTL_SET_FEATURE _IOC(_IOC_WRITE, 'K', 14, 0)
+#define KSU_IOCTL_GET_WRAPPER_FD _IOC(_IOC_WRITE, 'K', 15, 0)
+#define KSU_IOCTL_MANAGE_MARK _IOC(_IOC_READ|_IOC_WRITE, 'K', 16, 0)
+#define KSU_IOCTL_NUKE_EXT4_SYSFS _IOC(_IOC_WRITE, 'K', 17, 0)
+#define KSU_IOCTL_ADD_TRY_UMOUNT _IOC(_IOC_WRITE, 'K', 18, 0)
+#define KSU_IOCTL_SET_INIT_PGRP _IO('K', 19)
+#define KSU_IOCTL_GET_SULOG_FD _IOW('K', 20, struct ksu_get_sulog_fd_cmd)
+
+#endif
diff --git a/drivers/kernelsu/infra/event_queue.c b/drivers/kernelsu/infra/event_queue.c
new file mode 100644
index 000000000000..333a10c0c523
--- /dev/null
+++ b/drivers/kernelsu/infra/event_queue.c
@@ -0,0 +1,393 @@
+struct ksu_event_queue_node { /* one queued event record, allocated by ksu_event_queue_push() */
+	struct list_head list; /* link in ksu_event_queue.pending */
+	struct ksu_event_record_hdr hdr; /* copied to the reader ahead of the payload */
+	__u8 payload[]; /* hdr.len bytes of event data */
+};
+
+static size_t ksu_event_queue_record_size(__u32 payload_len)
+{
+	return (size_t)payload_len + sizeof(struct ksu_event_record_hdr); /* bytes per record: header, then payload */
+}
+
+static void ksu_event_queue_note_drop_locked(struct ksu_event_queue *queue, __u64 seq)
+{
+	/* callers hold queue->lock; the first drop of a pending batch pins first_seq */
+	if (queue->dropped_pending == 0)
+		queue->dropped_first_seq = seq;
+	queue->dropped_last_seq = seq;
+	queue->dropped_pending++;
+	queue->dropped_total++;
+}
+
+static bool ksu_event_queue_has_data_locked(const struct ksu_event_queue *queue)
+{
+	return !list_empty(&queue->pending) || queue->dropped_inflight || queue->dropped_pending; /* unreported drops count as readable data */
+}
+
+static void ksu_event_queue_mark_closed(struct ksu_event_queue *queue)
+{
+	unsigned long flags;
+	/* closed is flipped under queue->lock, the same lock push, drop and poll test it under */
+	spin_lock_irqsave(&queue->lock, flags);
+	queue->closed = true;
+	spin_unlock_irqrestore(&queue->lock, flags);
+}
+
+void ksu_event_queue_init(struct ksu_event_queue *queue, __u32 max_queued, __u32 max_payload_len)
+{
+	/* limits supplied by the caller */
+	queue->max_queued = max_queued;
+	queue->max_payload_len = max_payload_len;
+	/* synchronization primitives and the (empty) pending list */
+	spin_lock_init(&queue->lock);
+	mutex_init(&queue->read_lock);
+	init_waitqueue_head(&queue->read_wait);
+	INIT_LIST_HEAD(&queue->pending);
+	/* counters: nothing queued, nothing dropped, sequence numbers start at 1 */
+	queue->queued = 0;
+	queue->next_seq = 1;
+	queue->dropped_total = 0;
+	queue->dropped_pending = queue->dropped_inflight = 0;
+	queue->dropped_first_seq = queue->dropped_last_seq = 0;
+	queue->dropped_inflight_first_seq = queue->dropped_inflight_last_seq = 0;
+	queue->closed = false;
+}
+
+void ksu_event_queue_destroy(struct ksu_event_queue *queue)
+{
+	struct ksu_event_queue_node *entry;
+	unsigned long flags;
+
+	/* stop accepting events first and kick any sleeping reader */
+	ksu_event_queue_mark_closed(queue);
+	wake_up_interruptible_poll(&queue->read_wait, EPOLLHUP | POLLHUP);
+
+	/* read_lock keeps an in-progress read away from the nodes being freed */
+	mutex_lock(&queue->read_lock);
+	spin_lock_irqsave(&queue->lock, flags);
+	while (!list_empty(&queue->pending)) {
+		entry = list_first_entry(&queue->pending, struct ksu_event_queue_node, list);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+	queue->queued = 0;
+	queue->dropped_pending = queue->dropped_inflight = 0;
+	queue->dropped_first_seq = queue->dropped_last_seq = 0;
+	queue->dropped_inflight_first_seq = queue->dropped_inflight_last_seq = 0;
+	spin_unlock_irqrestore(&queue->lock, flags);
+	mutex_unlock(&queue->read_lock);
+
+	wake_up_interruptible_poll(&queue->read_wait, EPOLLHUP | POLLHUP);
+}
+
+int ksu_event_queue_push(struct ksu_event_queue *queue, __u16 type, __u16 flags, const void *payload, __u32 len, gfp_t gfp)
+{
+	struct ksu_event_queue_node *rec;
+	unsigned long lock_flags;
+	bool notify = false;
+	__u64 seq;
+	int err = 0;
+
+	/* reject what can never be queued before allocating anything */
+	if (len > queue->max_payload_len)
+		return -EMSGSIZE;
+	if (len && !payload)
+		return -EINVAL;
+
+	/*
+	 * Build the record outside the spinlock. An allocation failure is not
+	 * returned right away: unless the queue is closed it still consumes a
+	 * sequence number below and is accounted as a dropped event.
+	 */
+	rec = kmalloc(struct_size(rec, payload, len), gfp);
+	if (rec) {
+		INIT_LIST_HEAD(&rec->list);
+		rec->hdr.type = type;
+		rec->hdr.flags = flags;
+		rec->hdr.len = len;
+		rec->hdr.ts_ns = 0;
+		rec->hdr.seq = 0;
+		if (len)
+			memcpy(rec->payload, payload, len);
+	}
+
+	spin_lock_irqsave(&queue->lock, lock_flags);
+	if (queue->closed) {
+		/* a closed queue takes nothing and does not advance next_seq */
+		err = -EPIPE;
+	} else {
+		seq = queue->next_seq++;
+		notify = true; /* readers are woken for drops as well as for data */
+		if (!rec) {
+			ksu_event_queue_note_drop_locked(queue, seq);
+			err = -ENOMEM;
+		} else if (queue->max_queued && queue->queued >= queue->max_queued) {
+			/* queue full (max_queued == 0 disables the limit) */
+			ksu_event_queue_note_drop_locked(queue, seq);
+			err = -ENOSPC;
+		} else {
+			rec->hdr.seq = seq;
+			rec->hdr.ts_ns = ktime_get_ns();
+			list_add_tail(&rec->list, &queue->pending);
+			queue->queued++;
+		}
+	}
+	spin_unlock_irqrestore(&queue->lock, lock_flags);
+
+	/*
+	 * On any failure the record never reached the list, so it is still ours
+	 * to free; kfree(NULL) is a no-op, which covers the failed allocation.
+	 */
+	if (err)
+		kfree(rec);
+	if (notify)
+		wake_up_interruptible_poll(&queue->read_wait, EPOLLIN | EPOLLRDNORM);
+	return err;
+}
+
+void ksu_event_queue_drop(struct ksu_event_queue *queue)
+{
+	unsigned long flags;
+	bool recorded = false;
+
+	/* account one lost event without queueing anything; ignored once closed */
+	spin_lock_irqsave(&queue->lock, flags);
+	if (!queue->closed) {
+		ksu_event_queue_note_drop_locked(queue, queue->next_seq++);
+		recorded = true;
+	}
+	spin_unlock_irqrestore(&queue->lock, flags);
+
+	/* a pending drop counts as readable data, so wake pollers and readers */
+	if (recorded)
+		wake_up_interruptible_poll(&queue->read_wait, EPOLLIN | EPOLLRDNORM);
+}
+
+static int ksu_event_queue_wait_ready(struct ksu_event_queue *queue, int file_flags)
+{
+	int err = 0;
+
+	/*
+	 * Returns 0 once there is something to read or the queue has been
+	 * closed, -EAGAIN when neither holds and O_NONBLOCK is set, or the
+	 * non-zero result of an interrupted sleep.
+	 */
+	while (!ksu_event_queue_has_data(queue) && !READ_ONCE(queue->closed)) {
+		if (file_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		/*
+		 * sleep until woken with data or a close; the loop condition then re-checks the state
+		 */
+		err = wait_event_interruptible(queue->read_wait, queue->closed || ksu_event_queue_has_data(queue));
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+static ssize_t ksu_event_queue_read_drop(struct ksu_event_queue *queue, char __user *buf, size_t count)
+{
+	struct ksu_event_queue_dropped_info info;
+	struct ksu_event_record_hdr hdr;
+	const size_t total = ksu_event_queue_record_size(sizeof(info));
+	unsigned long flags;
+	bool faulted;
+
+	/*
+	 * Emit one synthetic KSU_EVENT_QUEUE_TYPE_DROPPED record summarizing the
+	 * drops noted since the last one. Returns the record size, 0 when no
+	 * drops are pending, -EMSGSIZE when the record does not fit in count
+	 * bytes, or -EFAULT when the copy to user space fails.
+	 */
+	spin_lock_irqsave(&queue->lock, flags);
+	if (!queue->dropped_pending || count < total) {
+		ssize_t early = queue->dropped_pending ? -EMSGSIZE : 0;
+
+		spin_unlock_irqrestore(&queue->lock, flags);
+		return early;
+	}
+
+	info.dropped = queue->dropped_pending;
+	info.first_seq = queue->dropped_first_seq;
+	info.last_seq = queue->dropped_last_seq;
+
+	hdr.type = KSU_EVENT_QUEUE_TYPE_DROPPED;
+	hdr.flags = KSU_EVENT_RECORD_FLAG_INTERNAL;
+	hdr.len = sizeof(info);
+	hdr.seq = info.first_seq;
+	hdr.ts_ns = ktime_get_ns();
+
+	/*
+	 * Park the batch in the inflight fields while the lock is dropped for
+	 * copy_to_user(); drops noted meanwhile start a fresh pending batch.
+	 */
+	queue->dropped_inflight = queue->dropped_pending;
+	queue->dropped_inflight_first_seq = queue->dropped_first_seq;
+	queue->dropped_inflight_last_seq = queue->dropped_last_seq;
+	queue->dropped_pending = 0;
+	queue->dropped_first_seq = 0;
+	queue->dropped_last_seq = 0;
+	spin_unlock_irqrestore(&queue->lock, flags);
+
+	/* header first, payload only if the header made it; done without the spinlock */
+	faulted = copy_to_user(buf, &hdr, sizeof(hdr)) ||
+		  copy_to_user(buf + sizeof(hdr), &info, sizeof(info));
+
+	/* one locked epilogue for both outcomes: the inflight fields are always cleared */
+	spin_lock_irqsave(&queue->lock, flags);
+	if (faulted) {
+		/* hand the batch back, merging with drops that arrived meanwhile */
+		if (queue->dropped_pending) {
+			queue->dropped_pending += queue->dropped_inflight;
+		} else {
+			queue->dropped_pending = queue->dropped_inflight;
+			queue->dropped_last_seq = queue->dropped_inflight_last_seq;
+		}
+		queue->dropped_first_seq = queue->dropped_inflight_first_seq;
+	}
+
+	queue->dropped_inflight = 0;
+	queue->dropped_inflight_first_seq = 0;
+	queue->dropped_inflight_last_seq = 0;
+	spin_unlock_irqrestore(&queue->lock, flags);
+
+	return faulted ? -EFAULT : (ssize_t)total;
+}
+
+static ssize_t ksu_event_queue_read_node(struct ksu_event_queue *queue, char __user *buf, size_t count)
+{
+	struct ksu_event_queue_node *head = NULL;
+	unsigned long flags;
+	size_t total = 0;
+
+	/*
+	 * Copy the oldest queued record (header, then payload) to user space and
+	 * unlink it only after both copies succeeded. Returns the record size,
+	 * 0 when the list is empty, -EMSGSIZE when it does not fit, -EFAULT on a
+	 * failed copy (the record then stays queued).
+	 */
+	spin_lock_irqsave(&queue->lock, flags);
+	if (!list_empty(&queue->pending)) {
+		head = list_first_entry(&queue->pending, struct ksu_event_queue_node, list);
+		total = ksu_event_queue_record_size(head->hdr.len);
+	}
+	spin_unlock_irqrestore(&queue->lock, flags);
+
+	if (!head)
+		return 0;
+	if (count < total)
+		return -EMSGSIZE;
+
+	/* head is used unlocked: ksu_event_queue_read() holds read_lock, which destroy takes before freeing nodes */
+	if (copy_to_user(buf, &head->hdr, sizeof(head->hdr)))
+		return -EFAULT;
+	if (head->hdr.len && copy_to_user(buf + sizeof(head->hdr), head->payload, head->hdr.len))
+		return -EFAULT;
+
+	spin_lock_irqsave(&queue->lock, flags);
+	list_del(&head->list);
+	queue->queued--;
+	spin_unlock_irqrestore(&queue->lock, flags);
+
+	kfree(head);
+	return total;
+}
+
+ssize_t ksu_event_queue_read(struct ksu_event_queue *queue, char __user *buf, size_t count, int file_flags)
+{
+	ssize_t copied = 0;
+	ssize_t ret;
+
+	/*
+	 * read() backend: fill buf with as many whole records as fit in count
+	 * bytes. A pending "dropped" marker is always emitted before the next
+	 * queued record.
+	 *
+	 * Returns the number of bytes copied; 0 for a zero-length request or
+	 * when the queue is closed and drained (end of file); otherwise a
+	 * negative errno, but only if nothing was copied: -EAGAIN (O_NONBLOCK,
+	 * no data), -EMSGSIZE (first record larger than count), -EFAULT, or
+	 * the error of an interrupted wait.
+	 */
+
+	if (!count)
+		return 0;
+
+	/* single reader at a time; also fences against ksu_event_queue_destroy() */
+	ret = mutex_lock_interruptible(&queue->read_lock);
+	if (ret)
+		return ret;
+
+	ret = ksu_event_queue_wait_ready(queue, file_flags);
+	if (ret) {
+		copied = ret;
+		goto out_unlock;
+	}
+
+	while (count > 0) {
+		/* drop marker first; a queued record only when no marker is pending */
+		ret = ksu_event_queue_read_drop(queue, buf, count);
+		if (ret == 0)
+			ret = ksu_event_queue_read_node(queue, buf, count);
+
+		/* 0: nothing left to read, < 0: error */
+		if (ret <= 0)
+			break;
+
+		copied += ret;
+		buf += ret;
+		count -= ret;
+	}
+
+	/* an error is reported only when no record was delivered before it */
+	if (ret < 0 && !copied)
+		copied = ret;
+
+	/*
+	 * No special case is needed for a closed queue: with nothing copied and
+	 * no error, copied is already 0, which the caller sees as end of file.
+	 */
+
+out_unlock:
+	mutex_unlock(&queue->read_lock);
+	return copied;
+}
+
+unsigned __bitwise ksu_event_queue_poll(struct ksu_event_queue *queue, struct file *file, poll_table *wait)
+{
+	unsigned __bitwise events = 0;
+	unsigned long flags;
+	bool readable, hung_up;
+
+	/* register on read_wait, then sample the queue state under the lock */
+	poll_wait(file, &queue->read_wait, wait);
+	spin_lock_irqsave(&queue->lock, flags);
+	readable = ksu_event_queue_has_data_locked(queue);
+	hung_up = queue->closed;
+	spin_unlock_irqrestore(&queue->lock, flags);
+	if (readable)
+		events |= POLLIN | POLLRDNORM;
+	if (hung_up)
+		events |= POLLHUP;
+	return events;
+}
+
+void ksu_event_queue_close(struct ksu_event_queue *queue)
+{
+	ksu_event_queue_mark_closed(queue); /* from here on ksu_event_queue_push() returns -EPIPE */
+	wake_up_interruptible_poll(&queue->read_wait, EPOLLHUP | POLLHUP); /* wake sleepers so they observe closed */
+}
+
+bool ksu_event_queue_has_data(struct ksu_event_queue *queue)
+{
+	unsigned long flags;
+	bool readable;
+
+	/* locking wrapper around ksu_event_queue_has_data_locked() */
+	spin_lock_irqsave(&queue->lock, flags);
+	readable = ksu_event_queue_has_data_locked(queue);
+	spin_unlock_irqrestore(&queue->lock, flags);
+	return readable;
+}
diff --git a/drivers/kernelsu/infra/event_queue.h b/drivers/kernelsu/infra/event_queue.h
new file mode 100644
index 000000000000..2170f64fd8c8
--- /dev/null
+++ b/drivers/kernelsu/infra/event_queue.h
@@ -0,0 +1,54 @@
+#ifndef KSU_EVENT_QUEUE_H
+#define KSU_EVENT_QUEUE_H
+
+#define KSU_EVENT_RECORD_FLAG_INTERNAL (1U << 0)
+#define KSU_EVENT_QUEUE_TYPE_DROPPED ((__u16)0xFFFF)
+
+struct ksu_event_record_hdr {
+	__u16 type; /* same width as the reserved KSU_EVENT_QUEUE_TYPE_DROPPED value */
+	__u16 flags; /* presumably KSU_EVENT_RECORD_FLAG_* bits - TODO confirm */
+	__u32 len; /* presumably the payload length - TODO confirm */
+	__u64 seq;
+	__u64 ts_ns;
+};
+
+struct ksu_event_queue_dropped_info {
+	__u64 dropped;
+	__u64 first_seq;
+	__u64 last_seq;
+};
+
+struct ksu_event_queue {
+	spinlock_t lock; /* taken with irqsave around the has-data and closed checks */
+	/* The first implementation supports a single reader. */
+	struct mutex read_lock; /* taken via mutex_lock_interruptible() by the read path around its copy loop */
+	struct list_head pending;
+	wait_queue_head_t read_wait; /* poll() registers here; ksu_event_queue_close() wakes it */
+	__u32 queued;
+	__u32 max_queued; /* presumably supplied to ksu_event_queue_init() - TODO confirm */
+	__u32 max_payload_len; /* presumably supplied to ksu_event_queue_init() - TODO confirm */
+	__u64 next_seq;
+	__u64 dropped_total;
+	__u64 dropped_pending;
+	__u64 dropped_first_seq;
+	__u64 dropped_last_seq;
+	__u64 dropped_inflight;
+	__u64 dropped_inflight_first_seq;
+	__u64 dropped_inflight_last_seq;
+	bool closed; /* reported as POLLHUP by ksu_event_queue_poll() */
+};
+
+void ksu_event_queue_init(struct ksu_event_queue *queue, __u32 max_queued, __u32 max_payload_len);
+void ksu_event_queue_destroy(struct ksu_event_queue *queue);
+
+int ksu_event_queue_push(struct ksu_event_queue *queue, __u16 type, __u16 flags, const void *payload, __u32 len,
+						 gfp_t gfp);
+void ksu_event_queue_drop(struct ksu_event_queue *queue);
+
+ssize_t ksu_event_queue_read(struct ksu_event_queue *queue, char __user *buf, size_t count, int file_flags);
+unsigned __bitwise ksu_event_queue_poll(struct ksu_event_queue *queue, struct file *file, poll_table *wait);
+
+void ksu_event_queue_close(struct ksu_event_queue *queue);
+bool ksu_event_queue_has_data(struct ksu_event_queue *queue);
+
+#endif // KSU_EVENT_QUEUE_H
diff --git a/drivers/kernelsu/file_wrapper.c b/drivers/kernelsu/infra/file_wrapper.c
similarity index 54%
rename from drivers/kernelsu/file_wrapper.c
rename to drivers/kernelsu/infra/file_wrapper.c
index f2b252334645..98bb2539073a 100644
--- a/drivers/kernelsu/file_wrapper.c
+++ b/drivers/kernelsu/infra/file_wrapper.c
@@ -1,23 +1,3 @@
-#include <linux/gfp.h>
-#include <linux/fdtable.h>
-#include <linux/export.h>
-#include <linux/anon_inodes.h>
-#include <linux/aio.h> // kernel 3.18
-#include <linux/capability.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <linux/file.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include <linux/version.h>
-#include <linux/mount.h>
-
-#include "objsec.h"
-#include "ksud.h"
-
 struct ksu_file_wrapper {
 	struct file *orig;
 	struct file_operations ops;
@@ -28,8 +8,7 @@ static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp);
 static int ksu_wrapper_open(struct inode *ino, struct file *fp)
 {
 	struct path *orig_path = fp->f_path.dentry->d_fsdata;
-	struct file *orig_file =
-		dentry_open(orig_path, fp->f_flags, current_cred());
+	struct file *orig_file = dentry_open(orig_path, fp->f_flags, current_cred());
 	if (IS_ERR(orig_file)) {
 		return PTR_ERR(orig_file);
 	}
@@ -49,151 +28,136 @@ static const struct file_operations ksu_file_wrapper_inode_fops = {
 	.open = ksu_wrapper_open
 };
 
-static loff_t ksu_wrapper_llseek(struct file *fp, loff_t off, int flags)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static loff_t ksu_wrapper_llseek(struct file *fp, loff_t off, int flags) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->llseek(data->orig, off, flags);
 }
 
-static ssize_t ksu_wrapper_read(struct file *fp, char __user *ptr, size_t sz,
-				loff_t *off)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static ssize_t ksu_wrapper_read(struct file *fp, char __user *ptr, size_t sz, loff_t *off) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->read(orig, ptr, sz, off);
 }
 
-static ssize_t ksu_wrapper_write(struct file *fp, const char __user *ptr,
-				 size_t sz, loff_t *off)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static ssize_t ksu_wrapper_write(struct file *fp, const char __user *ptr, size_t sz, loff_t *off) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->write(orig, ptr, sz, off);
 }
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
-static ssize_t ksu_wrapper_read_iter(struct kiocb *iocb, struct iov_iter *iovi)
-{
-	struct ksu_file_wrapper *data = iocb->ki_filp->private_data;
-	struct file *orig = data->orig;
+static ssize_t ksu_wrapper_read_iter(struct kiocb *iocb, struct iov_iter *iovi) {
+	struct ksu_file_wrapper* data = iocb->ki_filp->private_data;
+	struct file* orig = data->orig;
 	iocb->ki_filp = orig;
 	return orig->f_op->read_iter(iocb, iovi);
 }
 
-static ssize_t ksu_wrapper_write_iter(struct kiocb *iocb, struct iov_iter *iovi)
-{
-	struct ksu_file_wrapper *data = iocb->ki_filp->private_data;
-	struct file *orig = data->orig;
+static ssize_t ksu_wrapper_write_iter(struct kiocb *iocb, struct iov_iter *iovi) {
+	struct ksu_file_wrapper* data = iocb->ki_filp->private_data;
+	struct file* orig = data->orig;
 	iocb->ki_filp = orig;
 	return orig->f_op->write_iter(iocb, iovi);
 }
 #endif
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0)
-static int ksu_wrapper_iopoll(struct kiocb *kiocb, struct io_comp_batch *icb,
-			      unsigned int v)
-{
-	struct ksu_file_wrapper *data = kiocb->ki_filp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_iopoll(struct kiocb *kiocb, struct io_comp_batch* icb, unsigned int v) {
+	struct ksu_file_wrapper* data = kiocb->ki_filp->private_data;
+	struct file* orig = data->orig;
 	kiocb->ki_filp = orig;
 	return orig->f_op->iopoll(kiocb, icb, v);
 }
 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)
-static int ksu_wrapper_iopoll(struct kiocb *kiocb, bool spin)
-{
-	struct ksu_file_wrapper *data = kiocb->ki_filp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_iopoll(struct kiocb *kiocb, bool spin) {
+	struct ksu_file_wrapper* data = kiocb->ki_filp->private_data;
+	struct file* orig = data->orig;
 	kiocb->ki_filp = orig;
 	return orig->f_op->iopoll(kiocb, spin);
 }
 #endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
-static int ksu_wrapper_iterate(struct file *fp, struct dir_context *dc)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) && (LINUX_VERSION_CODE > KERNEL_VERSION(3, 11, 0) || defined(KSU_HAS_ITERATE_DIR))
+static int ksu_wrapper_iterate (struct file *fp, struct dir_context *dc) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->iterate(orig, dc);
 }
+#endif 
+
+// Forwards f_op->readdir to the wrapped file: int (*readdir) (struct file *, void *, filldir_t);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR) /* NOTE(review): strict '<' here and strict '>' on the iterate guards leave exactly 3.11.0 without either directory wrapper unless KSU_HAS_ITERATE_DIR is defined */
+static int ksu_wrapper_readdir(struct file *fp, void *ptr, filldir_t filler) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
+	return orig->f_op->readdir(orig, ptr, filler);
+}
 #endif
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
-static int ksu_wrapper_iterate_shared(struct file *fp, struct dir_context *dc)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_iterate_shared(struct file *fp, struct dir_context *dc) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->iterate_shared(orig, dc);
 }
 #endif
 
 // typedef unsigned __bitwise __poll_t;
-static unsigned __bitwise ksu_wrapper_poll(struct file *fp,
-					   struct poll_table_struct *pts)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static unsigned __bitwise ksu_wrapper_poll(struct file *fp, struct poll_table_struct *pts) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->poll(orig, pts);
 }
 
-static long ksu_wrapper_unlocked_ioctl(struct file *fp, unsigned int cmd,
-				       unsigned long arg)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static long ksu_wrapper_unlocked_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->unlocked_ioctl(orig, cmd, arg);
 }
 
-static long ksu_wrapper_compat_ioctl(struct file *fp, unsigned int cmd,
-				     unsigned long arg)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static long ksu_wrapper_compat_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->compat_ioctl(orig, cmd, arg);
 }
 
-static int ksu_wrapper_mmap(struct file *fp, struct vm_area_struct *vma)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_mmap(struct file *fp, struct vm_area_struct * vma) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->mmap(orig, vma);
 }
 
-static int ksu_wrapper_flush(struct file *fp, fl_owner_t id)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_flush(struct file *fp, fl_owner_t id) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->flush(orig, id);
 }
 
-static int ksu_wrapper_fsync(struct file *fp, loff_t off1, loff_t off2,
-			     int datasync)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+
+static int ksu_wrapper_fsync(struct file *fp, loff_t off1, loff_t off2, int datasync) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->fsync(orig, off1, off2, datasync);
 }
 
-static int ksu_wrapper_fasync(int arg, struct file *fp, int arg2)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_fasync(int arg, struct file *fp, int arg2) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->fasync(arg, orig, arg2);
 }
 
-static int ksu_wrapper_lock(struct file *fp, int arg1, struct file_lock *fl)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_lock(struct file *fp, int arg1, struct file_lock *fl) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	return orig->f_op->lock(orig, arg1, fl);
 }
 
+
 #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
-static ssize_t ksu_wrapper_sendpage(struct file *fp, struct page *pg, int arg1,
-				    size_t sz, loff_t *off, int arg2)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static ssize_t ksu_wrapper_sendpage(struct file *fp, struct page *pg, int arg1, size_t sz, loff_t *off, int arg2) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->sendpage) {
 		return orig->f_op->sendpage(orig, pg, arg1, sz, off, arg2);
 	}
@@ -201,51 +165,38 @@ static ssize_t ksu_wrapper_sendpage(struct file *fp, struct page *pg, int arg1,
 }
 #endif
 
-static unsigned long ksu_wrapper_get_unmapped_area(struct file *fp,
-						   unsigned long arg1,
-						   unsigned long arg2,
-						   unsigned long arg3,
-						   unsigned long arg4)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static unsigned long ksu_wrapper_get_unmapped_area(struct file *fp, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->get_unmapped_area) {
-		return orig->f_op->get_unmapped_area(orig, arg1, arg2, arg3,
-						     arg4);
+		return orig->f_op->get_unmapped_area(orig, arg1, arg2, arg3, arg4);
 	}
 	return -EINVAL;
 }
 
 // static int ksu_wrapper_check_flags(int arg) {}
 
-static int ksu_wrapper_flock(struct file *fp, int arg1, struct file_lock *fl)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_flock(struct file *fp, int arg1, struct file_lock *fl) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->flock) {
 		return orig->f_op->flock(orig, arg1, fl);
 	}
 	return -EINVAL;
 }
 
-static ssize_t ksu_wrapper_splice_write(struct pipe_inode_info *pii,
-					struct file *fp, loff_t *off, size_t sz,
-					unsigned int arg1)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static ssize_t ksu_wrapper_splice_write(struct pipe_inode_info * pii, struct file *fp, loff_t *off, size_t sz, unsigned int arg1) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->splice_write) {
 		return orig->f_op->splice_write(pii, orig, off, sz, arg1);
 	}
 	return -EINVAL;
 }
 
-static ssize_t ksu_wrapper_splice_read(struct file *fp, loff_t *off,
-				       struct pipe_inode_info *pii, size_t sz,
-				       unsigned int arg1)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static ssize_t ksu_wrapper_splice_read(struct file *fp, loff_t *off, struct pipe_inode_info *pii, size_t sz, unsigned int arg1) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->splice_read) {
 		return orig->f_op->splice_read(orig, off, pii, sz, arg1);
 	}
@@ -253,10 +204,9 @@ static ssize_t ksu_wrapper_splice_read(struct file *fp, loff_t *off,
 }
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
-void ksu_wrapper_splice_eof(struct file *fp)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+void ksu_wrapper_splice_eof(struct file *fp) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->splice_eof) {
 		return orig->f_op->splice_eof(orig);
 	}
@@ -264,46 +214,36 @@ void ksu_wrapper_splice_eof(struct file *fp)
 #endif
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
-static int ksu_wrapper_setlease(struct file *fp, int arg1,
-				struct file_lease **fl, void **p)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_setlease(struct file *fp, int arg1, struct file_lease **fl, void **p) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->setlease) {
 		return orig->f_op->setlease(orig, arg1, fl, p);
 	}
 	return -EINVAL;
 }
 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
-static int ksu_wrapper_setlease(struct file *fp, int arg1,
-				struct file_lock **fl, void **p)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_setlease(struct file *fp, int arg1, struct file_lock **fl, void **p) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->setlease) {
 		return orig->f_op->setlease(orig, arg1, fl, p);
 	}
 	return -EINVAL;
 }
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0)
-// int (*setlease)(struct file *, long, struct file_lock **, void **);
-static int ksu_wrapper_setlease(struct file *fp, long arg1,
-				struct file_lock **fl, void **p)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) // int (*setlease)(struct file *, long, struct file_lock **, void **);
+static int ksu_wrapper_setlease(struct file *fp, long arg1, struct file_lock **fl, void **p) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->setlease) {
 		return orig->f_op->setlease(orig, arg1, fl, p);
 	}
 	return -EINVAL;
 }
-#else
-// int (*setlease)(struct file *, long, struct file_lock **);
-static int ksu_wrapper_setlease(struct file *fp, long arg1,
-				struct file_lock **fl)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+#else // int (*setlease)(struct file *, long, struct file_lock **);
+static int ksu_wrapper_setlease(struct file *fp, long arg1, struct file_lock **fl) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->setlease) {
 		return orig->f_op->setlease(orig, arg1, fl);
 	}
@@ -311,11 +251,9 @@ static int ksu_wrapper_setlease(struct file *fp, long arg1,
 }
 #endif
 
-static long ksu_wrapper_fallocate(struct file *fp, int mode, loff_t offset,
-				  loff_t len)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static long ksu_wrapper_fallocate(struct file *fp, int mode, loff_t offset, loff_t len) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->fallocate) {
 		return orig->f_op->fallocate(orig, mode, offset, len);
 	}
@@ -323,19 +261,17 @@ static long ksu_wrapper_fallocate(struct file *fp, int mode, loff_t offset,
 }
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0)
-static void ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f)
-{
-	struct ksu_file_wrapper *data = f->private_data;
-	struct file *orig = data->orig;
+static void ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) {
+	struct ksu_file_wrapper* data = f->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->show_fdinfo) {
 		orig->f_op->show_fdinfo(m, orig);
 	}
 }
-#else
-static int ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f)
-{
-	struct ksu_file_wrapper *data = f->private_data;
-	struct file *orig = data->orig;
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+static int ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) {
+	struct ksu_file_wrapper* data = f->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->show_fdinfo) {
 		orig->f_op->show_fdinfo(m, orig);
 	}
@@ -345,15 +281,11 @@ static int ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f)
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
 // https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/read_write.c;l=1593-1606;drc=398da7defe218d3e51b0f3bdff75147e28125b60
-static ssize_t ksu_wrapper_copy_file_range(struct file *file_in, loff_t pos_in,
-					   struct file *file_out,
-					   loff_t pos_out, size_t len,
-					   unsigned int flags)
-{
-	struct ksu_file_wrapper *data = file_out->private_data;
-	struct file *orig = data->orig;
-	return orig->f_op->copy_file_range(file_in, pos_in, orig, pos_out, len,
-					   flags);
+static ssize_t ksu_wrapper_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out,
+		loff_t pos_out, size_t len, unsigned int flags) {
+	struct ksu_file_wrapper* data = file_out->private_data;
+	struct file* orig = data->orig;
+	return orig->f_op->copy_file_range(file_in, pos_in, orig, pos_out, len, flags);
 }
 #endif
 
@@ -364,30 +296,24 @@ static ssize_t ksu_wrapper_copy_file_range(struct file *file_in, loff_t pos_in,
 // REMAP_FILE_DEDUP: use file_out
 // https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/remap_range.c;l=483-484;drc=398da7defe218d3e51b0f3bdff75147e28125b60
 static loff_t ksu_wrapper_remap_file_range(struct file *file_in, loff_t pos_in,
-					   struct file *file_out,
-					   loff_t pos_out, loff_t len,
-					   unsigned int remap_flags)
-{
+				struct file *file_out, loff_t pos_out,
+				loff_t len, unsigned int remap_flags) {
 	if (remap_flags & REMAP_FILE_DEDUP) {
-		struct ksu_file_wrapper *data = file_out->private_data;
-		struct file *orig = data->orig;
-		return orig->f_op->remap_file_range(file_in, pos_in, orig,
-						    pos_out, len, remap_flags);
+		struct ksu_file_wrapper* data = file_out->private_data;
+		struct file* orig = data->orig;
+		return orig->f_op->remap_file_range(file_in, pos_in, orig, pos_out, len, remap_flags);
 	} else {
-		struct ksu_file_wrapper *data = file_in->private_data;
-		struct file *orig = data->orig;
-		return orig->f_op->remap_file_range(orig, pos_in, file_out,
-						    pos_out, len, remap_flags);
+		struct ksu_file_wrapper* data = file_in->private_data;
+		struct file* orig = data->orig;
+		return orig->f_op->remap_file_range(orig, pos_in, file_out, pos_out, len, remap_flags);
 	}
 }
 #endif
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0)
-static int ksu_wrapper_fadvise(struct file *fp, loff_t off1, loff_t off2,
-			       int flags)
-{
-	struct ksu_file_wrapper *data = fp->private_data;
-	struct file *orig = data->orig;
+static int ksu_wrapper_fadvise(struct file *fp, loff_t off1, loff_t off2, int flags) {
+	struct ksu_file_wrapper* data = fp->private_data;
+	struct file* orig = data->orig;
 	if (orig->f_op->fadvise) {
 		return orig->f_op->fadvise(orig, off1, off2, flags);
 	}
@@ -397,8 +323,7 @@ static int ksu_wrapper_fadvise(struct file *fp, loff_t off1, loff_t off2,
 
 static void ksu_release_file_wrapper(struct ksu_file_wrapper *data);
 
-static int ksu_wrapper_release(struct inode *inode, struct file *filp)
-{
+static int ksu_wrapper_release(struct inode *inode, struct file *filp) {
 	// https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/file_table.c;l=467-473;drc=3be0b283b562eabbc2b1f3bb534dc8903079bbaa
 	// f_op->release is called before fops_put(f_op), so we put it manually.
 	fops_put(filp->f_op);
@@ -408,10 +333,8 @@ static int ksu_wrapper_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp)
-{
-	struct ksu_file_wrapper *p =
-		kcalloc(1, sizeof(struct ksu_file_wrapper), GFP_KERNEL);
+static struct ksu_file_wrapper* ksu_create_file_wrapper(struct file* fp) {
+	struct ksu_file_wrapper* p = kcalloc(1, sizeof(struct ksu_file_wrapper), GFP_KERNEL);
 	if (!p) {
 		return ERR_PTR(-ENOMEM);
 	}
@@ -425,24 +348,23 @@ static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp)
 	p->ops.write = fp->f_op->write ? ksu_wrapper_write : NULL;
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
 	p->ops.read_iter = fp->f_op->read_iter ? ksu_wrapper_read_iter : NULL;
-	p->ops.write_iter =
-		fp->f_op->write_iter ? ksu_wrapper_write_iter : NULL;
+	p->ops.write_iter = fp->f_op->write_iter ? ksu_wrapper_write_iter : NULL;
 #endif
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0)
 	p->ops.iopoll = fp->f_op->iopoll ? ksu_wrapper_iopoll : NULL;
 #endif
-#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) && (LINUX_VERSION_CODE > KERNEL_VERSION(3, 11, 0) || defined(KSU_HAS_ITERATE_DIR))
 	p->ops.iterate = fp->f_op->iterate ? ksu_wrapper_iterate : NULL;
 #endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR)
+	p->ops.readdir = fp->f_op->readdir ? ksu_wrapper_readdir : NULL;
+#endif
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
-	p->ops.iterate_shared =
-		fp->f_op->iterate_shared ? ksu_wrapper_iterate_shared : NULL;
+	p->ops.iterate_shared = fp->f_op->iterate_shared ? ksu_wrapper_iterate_shared : NULL;
 #endif
 	p->ops.poll = fp->f_op->poll ? ksu_wrapper_poll : NULL;
-	p->ops.unlocked_ioctl =
-		fp->f_op->unlocked_ioctl ? ksu_wrapper_unlocked_ioctl : NULL;
-	p->ops.compat_ioctl =
-		fp->f_op->compat_ioctl ? ksu_wrapper_compat_ioctl : NULL;
+	p->ops.unlocked_ioctl = fp->f_op->unlocked_ioctl ? ksu_wrapper_unlocked_ioctl : NULL;
+	p->ops.compat_ioctl = fp->f_op->compat_ioctl ? ksu_wrapper_compat_ioctl : NULL;
 	p->ops.mmap = fp->f_op->mmap ? ksu_wrapper_mmap : NULL;
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
 	p->ops.fop_flags = fp->f_op->fop_flags;
@@ -457,34 +379,27 @@ static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp)
 #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
 	p->ops.sendpage = fp->f_op->sendpage ? ksu_wrapper_sendpage : NULL;
 #endif
-	p->ops.get_unmapped_area = fp->f_op->get_unmapped_area ?
-					   ksu_wrapper_get_unmapped_area :
-					   NULL;
+	p->ops.get_unmapped_area = fp->f_op->get_unmapped_area ? ksu_wrapper_get_unmapped_area : NULL;
 	p->ops.check_flags = fp->f_op->check_flags;
 	p->ops.flock = fp->f_op->flock ? ksu_wrapper_flock : NULL;
-	p->ops.splice_write =
-		fp->f_op->splice_write ? ksu_wrapper_splice_write : NULL;
-	p->ops.splice_read =
-		fp->f_op->splice_read ? ksu_wrapper_splice_read : NULL;
+	p->ops.splice_write = fp->f_op->splice_write ? ksu_wrapper_splice_write : NULL;
+	p->ops.splice_read = fp->f_op->splice_read ? ksu_wrapper_splice_read : NULL;
 	p->ops.setlease = fp->f_op->setlease ? ksu_wrapper_setlease : NULL;
 	p->ops.fallocate = fp->f_op->fallocate ? ksu_wrapper_fallocate : NULL;
-	p->ops.show_fdinfo =
-		fp->f_op->show_fdinfo ? ksu_wrapper_show_fdinfo : NULL;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+	p->ops.show_fdinfo = fp->f_op->show_fdinfo ? ksu_wrapper_show_fdinfo : NULL;
+#endif
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
-	p->ops.copy_file_range =
-		fp->f_op->copy_file_range ? ksu_wrapper_copy_file_range : NULL;
+	p->ops.copy_file_range = fp->f_op->copy_file_range ? ksu_wrapper_copy_file_range : NULL;
 #endif
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0)
-	p->ops.remap_file_range = fp->f_op->remap_file_range ?
-					  ksu_wrapper_remap_file_range :
-					  NULL;
+	p->ops.remap_file_range = fp->f_op->remap_file_range ? ksu_wrapper_remap_file_range : NULL;
 #endif
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0)
 	p->ops.fadvise = fp->f_op->fadvise ? ksu_wrapper_fadvise : NULL;
 #endif
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
-	p->ops.splice_eof =
-		fp->f_op->splice_eof ? ksu_wrapper_splice_eof : NULL;
+	p->ops.splice_eof = fp->f_op->splice_eof ? ksu_wrapper_splice_eof : NULL;
 #endif
 
 	return p;
@@ -492,12 +407,12 @@ static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp)
 
 static void ksu_release_file_wrapper(struct ksu_file_wrapper *data)
 {
-	fput((struct file *)data->orig);
+	fput((struct file*) data->orig);
 	kfree(data);
 }
 
 static char *ksu_wrapper_d_dname(struct dentry *dentry, char *buffer,
-				 int buflen)
+								 int buflen)
 {
 	struct path *orig_path = dentry->d_fsdata;
 	return d_path(orig_path, buffer, buflen);
@@ -519,71 +434,8 @@ static const struct dentry_operations ksu_file_wrapper_d_ops = {
 #define ksu_anon_inode_create_getfile_compat anon_inode_create_getfile
 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0)
 #define ksu_anon_inode_create_getfile_compat anon_inode_getfile_secure
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0)
-// There is no anon_inode_create_getfile in 4.19, but it's not difficult to implement it.
-// https://cs.android.com/android/kernel/superproject/+/common-android12-5.10:common/fs/anon_inodes.c;l=58-125;drc=0d34ce8aa78e38affbb501690bcabec4df88620e
-
-// Borrow kernel's anon_inode_mnt, so that we don't need to mount one by ourselves.
-static struct vfsmount *anon_inode_mnt __read_mostly;
-
-static struct inode *
-ksu_anon_inode_make_secure_inode(const char *name,
-				 const struct inode *context_inode)
-{
-	struct inode *inode;
-
-	if (unlikely(!anon_inode_mnt)) {
-		return ERR_PTR(-ENODEV);
-	}
-
-	inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
-	if (IS_ERR(inode))
-		return inode;
-	inode->i_flags &= ~S_PRIVATE;
-
-	return inode;
-}
-
-static struct file *ksu_anon_inode_create_getfile_compat(
-	const char *name, const struct file_operations *fops, void *priv,
-	int flags, const struct inode *context_inode)
-{
-	struct inode *inode;
-	struct file *file;
-
-	if (fops->owner && !try_module_get(fops->owner))
-		return ERR_PTR(-ENOENT);
-
-	inode = ksu_anon_inode_make_secure_inode(name, context_inode);
-	if (IS_ERR(inode)) {
-		file = ERR_CAST(inode);
-		goto err;
-	}
-
-	file = alloc_file_pseudo(inode, anon_inode_mnt, name,
-				 flags & (O_ACCMODE | O_NONBLOCK), fops);
-	if (IS_ERR(file))
-		goto err_iput;
-
-	file->f_mapping = inode->i_mapping;
-
-	file->private_data = priv;
-
-	return file;
-
-err_iput:
-	iput(inode);
-err:
-	module_put(fops->owner);
-	return file;
-}
-#else // KERNEL_VERSION < 4.19
-struct file *ksu_anon_inode_create_getfile_compat(
-	const char *name, const struct file_operations *fops, void *priv,
-	int flags, const struct inode *context_inode)
-{
-	return anon_inode_getfile(name, fops, priv, flags);
-}
+#else
+#define ksu_anon_inode_create_getfile_compat(a, b, c, d, e) anon_inode_getfile(a, b, c, d)
 #endif
 
 int ksu_install_file_wrapper(int fd)
@@ -611,8 +463,7 @@ int ksu_install_file_wrapper(int fd)
 		"[ksu_fdwrapper]", &file_wrapper_data->ops, file_wrapper_data,
 		orig_file->f_flags, NULL);
 	if (IS_ERR(wrapper_file)) {
-		pr_err("ksu_fdwrapper: getfile failed: %ld\n",
-		       PTR_ERR(wrapper_file));
+		pr_err("ksu_fdwrapper: getfile failed: %ld\n", PTR_ERR(wrapper_file));
 		ret = PTR_ERR(wrapper_file);
 		goto out_release_wrapper;
 	}
@@ -623,15 +474,7 @@ int ksu_install_file_wrapper(int fd)
 	struct inode *wrapper_inode = file_inode(wrapper_file);
 	// libc's stdio relies on the fstat() result of the fd to determine its buffer type.
 	wrapper_inode->i_mode = file_inode(orig_file)->i_mode;
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)
-	struct inode_security_struct *wrapper_sec =
-		selinux_inode(wrapper_inode);
-#else
-	struct inode_security_struct *wrapper_sec =
-		(struct inode_security_struct *)wrapper_inode->i_security;
-#endif
-
+	struct inode_security_struct *wrapper_sec = selinux_inode(wrapper_inode);
 	// Use ksu_file_sid to bypass SELinux check.
 	// When we call `su` from terminal app, this is useful.
 	if (wrapper_sec) {
@@ -670,21 +513,4 @@ int ksu_install_file_wrapper(int fd)
 	return ret;
 }
 
-void ksu_file_wrapper_init(void)
-{
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) &&                          \
-	LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0)
-	static const struct file_operations tmp = { .owner = THIS_MODULE };
-	struct file *dummy = anon_inode_getfile("dummy", &tmp, NULL, 0);
-	if (IS_ERR(dummy)) {
-		pr_err("file_wrapper: initialize anon_inode_mnt failed, can't get file: %ld\n",
-		       PTR_ERR(dummy));
-		return;
-	}
-	anon_inode_mnt = dummy->f_path.mnt;
-	if (unlikely(!anon_inode_mnt)) {
-		pr_err("file_wrapper: initialize anon_inode_mnt failed, got NULL\n");
-	}
-	fput(dummy);
-#endif
-}
+void __init ksu_file_wrapper_init(void) { }
diff --git a/drivers/kernelsu/file_wrapper.h b/drivers/kernelsu/infra/file_wrapper.h
similarity index 76%
rename from drivers/kernelsu/file_wrapper.h
rename to drivers/kernelsu/infra/file_wrapper.h
index faae4dded301..ee672312b7aa 100644
--- a/drivers/kernelsu/file_wrapper.h
+++ b/drivers/kernelsu/infra/file_wrapper.h
@@ -1,9 +1,6 @@
 #ifndef KSU_FILE_WRAPPER_H
 #define KSU_FILE_WRAPPER_H
 
-#include <linux/file.h>
-#include <linux/fs.h>
-
 int ksu_install_file_wrapper(int fd);
 void ksu_file_wrapper_init(void);
 
diff --git a/drivers/kernelsu/su_mount_ns.c b/drivers/kernelsu/infra/su_mount_ns.c
similarity index 52%
rename from drivers/kernelsu/su_mount_ns.c
rename to drivers/kernelsu/infra/su_mount_ns.c
index 4a0e4a29b103..7f5651d5de73 100644
--- a/drivers/kernelsu/su_mount_ns.c
+++ b/drivers/kernelsu/infra/su_mount_ns.c
@@ -1,88 +1,38 @@
-#include <linux/dcache.h>
-#include <linux/errno.h>
-#include <linux/fdtable.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/fs_struct.h>
-#include <linux/limits.h>
-#include <linux/namei.h>
-#include <linux/proc_ns.h>
-#include <linux/pid.h>
-#include <linux/slab.h>
-#include <linux/syscalls.h>
-#include <linux/task_work.h>
-#include <linux/version.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
-#include <linux/sched/task.h>
-#else
-#include <linux/sched.h>
-#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)
-#include <uapi/linux/mount.h>
-#else
-#include <uapi/linux/fs.h>
-#endif
-
-#include "klog.h" // IWYU pragma: keep
-#include "ksu.h"
-#include "kernel_compat.h"
-#include "su_mount_ns.h"
-
 extern int path_mount(const char *dev_name, struct path *path,
-		      const char *type_page, unsigned long flags,
-		      void *data_page);
+					  const char *type_page, unsigned long flags,
+					  void *data_page);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
-
-// RKSU: tiny arch.h, avoid depending on real arch.h
-#ifndef __PT_REGS_CAST
-#define __PT_REGS_CAST(x) (x)
-#endif
-
 #if defined(__aarch64__)
-#define PT_PARM1(x) (__PT_REGS_CAST(x)->regs[0])
-#define PT_PARM2(x) (__PT_REGS_CAST(x)->regs[1])
 extern long __arm64_sys_setns(const struct pt_regs *regs);
-#define do_sys_setns(regs) (__arm64_sys_setns(regs))
 #elif defined(__x86_64__)
-#define PT_PARM1(x) (__PT_REGS_CAST(x)->di)
-#define PT_PARM2(x) (__PT_REGS_CAST(x)->si)
 extern long __x64_sys_setns(const struct pt_regs *regs);
-#define do_sys_setns(regs) (__x64_sys_setns(regs))
 #elif defined(__arm__) // https://syscalls.mebeim.net/?table=arm/32/eabi/latest
-// taken from:
-// https://github.com/backslashxx/KernelSU/blob/8b71e8bce199e8ac44538648e298092a9b3ef42b/kernel/arch.h#L29
-#define PT_PARM1(x) (__PT_REGS_CAST(x)->uregs[0])
-#define PT_PARM2(x) (__PT_REGS_CAST(x)->uregs[1])
 extern long sys_setns(const struct pt_regs *regs);
-#define do_sys_setns(regs) (sys_setns(regs))
 #endif
 
 static long ksu_sys_setns(int fd, int flags)
 {
-#ifdef PT_PARM1
 	struct pt_regs regs;
 	memset(&regs, 0, sizeof(regs));
 
-	PT_PARM1(&regs) = fd;
-	PT_PARM2(&regs) = flags;
+	PT_REGS_PARM1(&regs) = fd;
+	PT_REGS_PARM2(&regs) = flags;
 
-	return do_sys_setns(&regs);
+#if defined(__aarch64__)
+	return __arm64_sys_setns(&regs);
+#elif defined(__x86_64__)
+	return __x64_sys_setns(&regs);
+#elif defined(__arm__)
+	return sys_setns(&regs);
 #else
 	return -ENOSYS;
 #endif
 }
 #else
-static long ksu_sys_setns(int fd, int flags)
-{
-	return sys_setns(fd, flags);
-}
-
-int ksys_unshare(unsigned long unshare_flags)
-{
-	return sys_unshare(unshare_flags);
-}
-#endif
+#define ksu_sys_setns sys_setns
+#define ksys_unshare sys_unshare
+#endif // > 4.17
 
 // global mode , need CAP_SYS_ADMIN and CAP_SYS_CHROOT to perform setns
 static void ksu_mnt_ns_global(void)
@@ -103,14 +53,14 @@ static void ksu_mnt_ns_global(void)
 	if (IS_ERR(pwd_path)) {
 		if (PTR_ERR(pwd_path) == -ENAMETOOLONG) {
 			pr_warn("absolute pwd longer than: %d, skip restore pwd!!\n",
-				PATH_MAX);
+					PATH_MAX);
 		} else {
-			pr_warn("get absolute pwd failed: %ld\n",
-				PTR_ERR(pwd_path));
+			pr_warn("get absolute pwd failed: %ld\n", PTR_ERR(pwd_path));
 		}
 		pwd_path = NULL;
 	}
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0)
 try_setns:
 
 	rcu_read_lock();
@@ -130,18 +80,37 @@ static void ksu_mnt_ns_global(void)
 		goto out;
 	}
 	struct path ns_path;
-	long ret = ns_get_path(&ns_path, pid1_task, &mntns_operations);
+	long ret = (long)ns_get_path(&ns_path, pid1_task, &mntns_operations);
 	put_task_struct(pid1_task);
 	if (ret) {
 		pr_warn("failed get path for init mount namespace: %ld\n", ret);
 		goto out;
 	}
+#else
+try_setns:
+	;
+	// on UL kernels we can try to just feed it with struct path of /proc/1/ns/mnt
+	// we do NOT have ns_get_path. if it works, GOOD. if it doesn't I don't care.
+	struct path ns_path;
+	const struct cred *saved = override_creds(ksu_cred);
+
+	// LOOKUP_FOLLOW is needed so the link is resolved to its target:
+	// /proc/1/ns/mnt -> 'mnt:[4026531840]'
+	long ret = kern_path("/proc/1/ns/mnt", LOOKUP_FOLLOW, &ns_path);
+	if (ret) {
+		revert_creds(saved);
+		pr_warn("kern_path /proc/1/ns/mnt fail! ret: %ld\n", ret); // ret is long
+		goto out;
+	}
+	revert_creds(saved);
+#endif
+
 	struct file *ns_file = dentry_open(&ns_path, O_RDONLY, ksu_cred);
 
 	path_put(&ns_path);
 	if (IS_ERR(ns_file)) {
 		pr_warn("failed open file for init mount namespace: %ld\n",
-			PTR_ERR(ns_file));
+				PTR_ERR(ns_file));
 		goto out;
 	}
 
@@ -155,7 +124,7 @@ static void ksu_mnt_ns_global(void)
 	fd_install(fd, ns_file);
 	ret = ksu_sys_setns(fd, CLONE_NEWNS);
 
-	do_close_fd(fd);
+	close_fd(fd);
 
 	if (ret) {
 		pr_warn("call setns failed: %ld\n", ret);
@@ -169,8 +138,7 @@ static void ksu_mnt_ns_global(void)
 			set_fs_pwd(current->fs, &new_pwd);
 			path_put(&new_pwd);
 		} else {
-			pr_warn("restore pwd failed: %d, path: %s\n", err,
-				pwd_path);
+			pr_warn("restore pwd failed: %d, path: %s\n", err, pwd_path);
 		}
 	}
 out:
@@ -189,8 +157,7 @@ static void ksu_mnt_ns_individual(void)
 	// make root mount private
 	struct path root_path;
 	get_fs_root(current->fs, &root_path);
-	int pm_ret =
-		path_mount(NULL, &root_path, NULL, MS_PRIVATE | MS_REC, NULL);
+	int pm_ret = path_mount(NULL, &root_path, NULL, MS_PRIVATE | MS_REC, NULL);
 	path_put(&root_path);
 
 	if (pm_ret < 0) {
@@ -198,54 +165,6 @@ static void ksu_mnt_ns_individual(void)
 	}
 }
 
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-struct ksu_mns_tw {
-	struct callback_head cb;
-	int32_t ns_mode;
-};
-
-static void ksu_setup_mount_ns_tw_func(struct callback_head *cb)
-{
-	struct ksu_mns_tw *tw = container_of(cb, struct ksu_mns_tw, cb);
-	const struct cred *old_cred = override_creds(ksu_cred);
-	if (tw->ns_mode == KSU_NS_GLOBAL) {
-		ksu_mnt_ns_global();
-	} else {
-		ksu_mnt_ns_individual();
-	}
-	revert_creds(old_cred);
-	kfree(tw);
-}
-
-static void ksu_handle_setup_mount_ns(int32_t ns_mode)
-{
-	struct ksu_mns_tw *tw = kzalloc(sizeof(*tw), GFP_ATOMIC);
-	if (!tw) {
-		pr_err("no mem for tw! skip mnt_ns magic for pid: %d.\n",
-		       current->pid);
-		return;
-	}
-	tw->cb.func = ksu_setup_mount_ns_tw_func;
-	tw->ns_mode = ns_mode;
-	if (task_work_add(current, &tw->cb, TWA_RESUME)) {
-		kfree(tw);
-		pr_err("add task work failed! skip mnt_ns magic for pid: %d.\n",
-		       current->pid);
-	}
-}
-#else
-static void ksu_handle_setup_mount_ns(int32_t ns_mode)
-{
-	const struct cred *old_cred = override_creds(ksu_cred);
-	if (ns_mode == KSU_NS_GLOBAL) {
-		ksu_mnt_ns_global();
-	} else {
-		ksu_mnt_ns_individual();
-	}
-	revert_creds(old_cred);
-}
-#endif
-
 void setup_mount_ns(int32_t ns_mode)
 {
 	// inherit mode
@@ -255,16 +174,21 @@ void setup_mount_ns(int32_t ns_mode)
 	}
 
 	if (ns_mode != KSU_NS_GLOBAL && ns_mode != KSU_NS_INDIVIDUAL) {
-		pr_warn("pid: %d ,unknown mount namespace mode: %d\n",
-			current->pid, ns_mode);
+		pr_warn("pid: %d ,unknown mount namespace mode: %d\n", current->pid,
+				ns_mode);
 		return;
 	}
 
 	if (!ksu_cred) {
-		pr_err("no ksu cred! skip mnt_ns magic for pid: %d.\n",
-		       current->pid);
+		pr_err("no ksu cred! skip mnt_ns magic for pid: %d.\n", current->pid);
 		return;
 	}
 
-	ksu_handle_setup_mount_ns(ns_mode);
+	const struct cred *old_cred = override_creds(ksu_cred);
+	if (ns_mode == KSU_NS_GLOBAL) {
+		ksu_mnt_ns_global();
+	} else {
+		ksu_mnt_ns_individual();
+	}
+	revert_creds(old_cred);
 }
diff --git a/drivers/kernelsu/su_mount_ns.h b/drivers/kernelsu/infra/su_mount_ns.h
similarity index 100%
rename from drivers/kernelsu/su_mount_ns.h
rename to drivers/kernelsu/infra/su_mount_ns.h
diff --git a/drivers/kernelsu/kernel_compat.c b/drivers/kernelsu/kernel_compat.c
index 38f0251f08a4..26f4a9471de5 100644
--- a/drivers/kernelsu/kernel_compat.c
+++ b/drivers/kernelsu/kernel_compat.c
@@ -1,199 +1,102 @@
-#include <linux/version.h>
-#include <linux/fs.h>
-#include <linux/dcache.h>
-#include <linux/uaccess.h>
-#include <linux/fdtable.h>
-#include <linux/string.h>
-#include <linux/security.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
-#include <linux/sched/task.h>
-#else
-#include <linux/sched.h>
-#endif
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "klog.h" // IWYU pragma: keep
-#include "kernel_compat.h"
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) ||                           \
-	defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND)
-#include <linux/key.h>
-#include <linux/errno.h>
-#include <linux/cred.h>
-
-extern int install_session_keyring_to_cred(struct cred *, struct key *);
-struct key *init_session_keyring = NULL;
-
-static int install_session_keyring(struct key *keyring)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0)
+__weak int path_mount(const char *dev_name, struct path *path, 
+	const char *type_page, unsigned long flags, void *data_page)
 {
-	struct cred *new;
-	int ret;
-
-	new = prepare_creds();
-	if (!new)
-		return -ENOMEM;
+	// 384 is enough 
+	char buf[384] = {0};
 
-	ret = install_session_keyring_to_cred(new, keyring);
-	if (ret < 0) {
-		abort_creds(new);
-		return ret;
-	}
+	// pass size - 1: buf is zero-initialized, so the last byte stays NUL.
+	// d_path() reports failure as ERR_PTR(), not NULL, so test IS_ERR too.
+	char *realpath = d_path(path, buf, sizeof(buf) - 1);
+	if (IS_ERR_OR_NULL(realpath) || realpath == buf)
+		return -ENOENT;
 
-	return commit_creds(new);
+	mm_segment_t old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	long ret = do_mount(dev_name, (const char __user *)realpath, type_page, flags, data_page);
+	set_fs(old_fs);
+	return ret;
 }
 #endif
 
-struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode)
-{
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) ||                           \
-	defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND)
-	if (init_session_keyring != NULL && !current_cred()->session_keyring &&
-	    (current->flags & PF_WQ_WORKER)) {
-		pr_info("installing init session keyring for older kernel\n");
-		install_session_keyring(init_session_keyring);
-	}
-#endif
-	return filp_open(filename, flags, mode);
-}
-
-ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count,
-			       loff_t *pos)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0)
+__weak int path_umount(struct path *path, int flags)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
-	return kernel_read(p, buf, count, pos);
-#else
-	loff_t offset = pos ? *pos : 0;
-	ssize_t result = kernel_read(p, offset, (char *)buf, count);
-	if (pos && result > 0) {
-		*pos = offset + result;
-	}
-	return result;
-#endif
-}
+	char buf[256] = {0};
+	int ret;
 
-ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count,
-				loff_t *pos)
-{
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
-	return kernel_write(p, buf, count, pos);
-#else
-	loff_t offset = pos ? *pos : 0;
-	ssize_t result = kernel_write(p, buf, count, offset);
-	if (pos && result > 0) {
-		*pos = offset + result;
+	// pass size - 1: buf is zero-initialized, so the last byte stays NUL.
+	// d_path() reports failure as ERR_PTR(), not NULL, so test IS_ERR too.
+	char *usermnt = d_path(path, buf, sizeof(buf) - 1);
+	if (IS_ERR_OR_NULL(usermnt) || usermnt == buf) {
+		ret = -ENOENT;
+		goto out;
 	}
-	return result;
-#endif
-}
 
-static inline long
-do_strncpy_user_nofault(char *dst, const void __user *unsafe_addr, long count)
-{
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
-	return strncpy_from_user_nofault(dst, unsafe_addr, count);
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0)
-	return strncpy_from_unsafe_user(dst, unsafe_addr, count);
-#else
 	mm_segment_t old_fs = get_fs();
-	long ret;
+	set_fs(KERNEL_DS);
 
-	if (unlikely(count <= 0))
-		return 0;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
+	ret = ksys_umount((char __user *)usermnt, flags);
+#else
+	ret = (int)sys_umount((char __user *)usermnt, flags);
+#endif
 
-	set_fs(USER_DS);
-	pagefault_disable();
-	ret = strncpy_from_user(dst, unsafe_addr, count);
-	pagefault_enable();
 	set_fs(old_fs);
 
-	if (ret >= count) {
-		ret = count;
-		dst[ret - 1] = '\0';
-	} else if (ret > 0) {
-		ret++;
-	}
-
+	// release ref here! user_path_at increases it
+	// then only cleans for itself
+out:
+	path_put(path); 
 	return ret;
-#endif
 }
+#endif
 
-long ksu_strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
-				   long count)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0)
+__weak long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
 {
-#ifdef CONFIG_KSU_MANUAL_HOOK
+	// https://elixir.bootlin.com/linux/v5.2.21/source/mm/maccess.c#L27
 	long ret;
+	mm_segment_t old_fs = get_fs();
 
-	ret = do_strncpy_user_nofault(dst, unsafe_addr, count);
-	if (likely(ret >= 0))
-		return ret;
-
-	// we faulted! fallback to slow path
-	if (unlikely(!ksu_access_ok(unsafe_addr, count)))
-		return -EFAULT;
-
-	ret = strncpy_from_user(dst, unsafe_addr, count);
-	if (ret >= count) {
-		ret = count;
-		dst[ret - 1] = '\0';
-	} else if (ret >= 0) {
-		ret++;
-	}
+	set_fs(KERNEL_DS);
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(dst,
+			(__force const void __user *)src, size);
+	pagefault_enable();
+	set_fs(old_fs);
 
-	return ret;
-#else
-	return do_strncpy_user_nofault(dst, unsafe_addr, count);
-#endif
+	return ret ? -EFAULT : 0;
 }
+#endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0)
-int path_mount(const char *dev_name, struct path *path, const char *type_page,
-	       unsigned long flags, void *data_page)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0) 
+__weak long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
 {
-	// 384 is enough
-	char buf[384] = { 0 };
-	mm_segment_t old_fs;
-	long ret;
+	// https://elixir.bootlin.com/linux/v5.8/source/mm/maccess.c#L205
+	long ret = -EFAULT;
+	mm_segment_t old_fs = get_fs();
 
-	// -1 on the size as implicit null termination
-	// as we zero init the thing
-	char *realpath = d_path(path, buf, sizeof(buf) - 1);
-	if (!(realpath && realpath != buf))
-		return -ENOENT;
+	set_fs(USER_DS);
+
+	// normally there's an access_ok check here,
+	// but for what we use it for, it would always be true,
+	// so we skip it
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(dst, src, size);
+	pagefault_enable();
 
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	ret = do_mount(dev_name, (const char __user *)realpath, type_page,
-		       flags, data_page);
 	set_fs(old_fs);
-	return ret;
-}
-#endif
 
-int do_close_fd(unsigned int fd)
-{
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
-	return close_fd(fd);
-#else
-	return __close_fd(current->files, fd);
-#endif
+	if (ret)
+		return -EFAULT;
+	return 0;
 }
+#endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0)
-// https://elixir.bootlin.com/linux/v5.10.247/source/mm/util.c#L664
-void *ksu_compat_kvrealloc(const void *p, size_t oldsize, size_t newsize,
-			   gfp_t flags)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) || !defined(CONFIG_EXT4_FS)
+__weak void ext4_unregister_sysfs(struct super_block *sb)
 {
-	void *newp;
-
-	if (oldsize >= newsize)
-		return (void *)p;
-	newp = kvmalloc(newsize, flags);
-	if (!newp)
-		return NULL;
-	memcpy(newp, p, oldsize);
-	kvfree(p);
-	return newp;
+	pr_info("%s: feature not implemented!\n", __func__);
 }
 #endif
diff --git a/drivers/kernelsu/kernel_compat.h b/drivers/kernelsu/kernel_compat.h
index b8fe8874d17d..147efae61ccf 100644
--- a/drivers/kernelsu/kernel_compat.h
+++ b/drivers/kernelsu/kernel_compat.h
@@ -1,62 +1,375 @@
 #ifndef __KSU_H_KERNEL_COMPAT
 #define __KSU_H_KERNEL_COMPAT
 
-#include <linux/fs.h>
-#include <linux/version.h>
-#include <linux/task_work.h>
-#include <linux/key.h>
-
-/*
- * Adapt to Huawei HISI kernel without affecting other kernels ,
- * Huawei Hisi Kernel EBITMAP Enable or Disable Flag ,
- * From ss/ebitmap.h
+#if defined(CONFIG_KEYS) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0)
+extern int install_session_keyring_to_cred(struct cred *cred, struct key *keyring);
+static struct key *init_session_keyring = NULL;
+
+bool is_init(const struct cred* cred);
+
+static inline int install_session_keyring(struct key *keyring)
+{
+	struct cred *new;
+	int ret;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = install_session_keyring_to_cred(new, keyring);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret;
+	}
+
+	return commit_creds(new);
+}
+
+// up to 5.1 the field is: struct key __rcu *session_keyring; /* keyring inherited over fork */
+// so it is read through rcu_dereference; before 3.8 it is reached via cred->tgcred
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+static inline struct key *ksu_get_current_session_keyring(void) { return rcu_dereference(current->cred->session_keyring); }
+#else
+static inline struct key *ksu_get_current_session_keyring(void) { return rcu_dereference(current->cred->tgcred->session_keyring); }
+#endif
+
+__attribute__((cold))
+static noinline void ksu_grab_init_session_keyring(void)
+{
+	if (init_session_keyring) // already captured, nothing to do
+		return;
+
+	if (strcmp(current->comm, "init") != 0) // only a task whose comm is "init"
+		return;
+
+	if (!is_init(current_cred())) // ...and whose cred passes is_init()
+		return;
+
+	// now we are sure that this is the key we want
+	struct key *keyring = ksu_get_current_session_keyring();
+	if (!keyring)
+		return;
+
+	init_session_keyring = key_get(keyring); // cached for ksu_filp_open_compat()
+
+	pr_info("%s: init_session_keyring: 0x%lx \n", __func__, (uintptr_t)init_session_keyring);
+}
+
+static noinline struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode)
+{
+	// Historically this was gated on (current->flags & PF_WQ_WORKER);
+	// the work that needs the keyring (e.g. allowlist write) has since
+	// moved to a kthread, so PF_KTHREAD is what gets checked now.
+	bool is_kthread = !!(current->flags & PF_KTHREAD);
+
+	// Install init's session keyring only when all three hold:
+	//  - we are running in a kernel thread,
+	//  - current has no session keyring of its own,
+	//  - init's keyring has already been captured.
+	if (is_kthread && !ksu_get_current_session_keyring() && init_session_keyring) {
+		// the return value is not checked (unchanged behavior);
+		// filp_open is attempted regardless of the outcome.
+		install_session_keyring(init_session_keyring);
+	}
+
+	// the filp_open macro is only defined after this function,
+	// so this call reaches the real filp_open.
+	return filp_open(filename, flags, mode);
+}
+#define filp_open ksu_filp_open_compat
+#else
+static inline void ksu_grab_init_session_keyring(void) {} // no-op: compiled when !CONFIG_KEYS or kernel >= 5.2
+#endif // KEYS && < 5.2
+
+#ifndef __ro_after_init
+#define __ro_after_init
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
+#define d_inode(dentry) ((dentry)->d_inode)
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0) && defined(CONFIG_ARM64)
+#ifndef TIF_SECCOMP
+#define TIF_SECCOMP		11
+#endif
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)
+static inline void *ksu_kvmalloc(size_t size, gfp_t flags)
+{
+	void *buf = kmalloc(size, flags);
+	if (!buf)
+		buf = vmalloc(size);
+	
+	return buf;
+}
+
+static inline void ksu_kvfree(void *buf)
+{
+	if (is_vmalloc_addr(buf))
+		vfree(buf);
+	else
+		kfree(buf);
+}
+#define kvmalloc ksu_kvmalloc
+#define kvfree ksu_kvfree
+#endif
+
+// for supercalls.c fd install tw
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) && !defined(TWA_RESUME)
+#define TWA_RESUME 1
+#endif
+
+// this is ksys_close, however that is spotty to use 
+// as 5.10 backported close_fd and rekt ksys_close
+// so we use what it does internally, __close_fd
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)
+#define close_fd(fd) __close_fd(current->files, fd)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0)
+#define close_fd sys_close
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 0)
+static inline struct file *ksu_dentry_open(const struct path *path, int flags, const struct cred *cred)
+{
+	return dentry_open((*path).dentry, (*path).mnt, flags, cred);
+}
+#define dentry_open ksu_dentry_open
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0)
+#ifndef replace_fops
+#define replace_fops(f, fops) \
+	do {	\
+		struct file *__file = (f); \
+		fops_put(__file->f_op); \
+		BUG_ON(!(__file->f_op = (fops))); \
+	} while(0)
+#endif
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0) && defined(CONFIG_JUMP_LABEL)
+#define KSU_CAN_USE_JUMP_LABEL
+
+// https://elixir.bootlin.com/linux/v3.10.108/source/include/linux/jump_label.h#L211
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)
+static inline void ksu_static_key_enable(struct static_key *key)
+{
+	int count = atomic_read(&key->enabled);
+	if (!count)
+		static_key_slow_inc(key);
+}
+
+static inline void ksu_static_key_disable(struct static_key *key)
+{
+	int count = atomic_read(&key->enabled);
+	if (count)
+		static_key_slow_dec(key);
+}
+
+#define static_branch_enable(k)		ksu_static_key_enable(k)
+#define static_branch_disable(k)	ksu_static_key_disable(k)
+
+#define static_branch_unlikely(k)	static_key_false(k)
+#define static_branch_likely(k)		static_key_true(k)
+
+#ifndef DEFINE_STATIC_KEY_FALSE
+#define DEFINE_STATIC_KEY_FALSE(k)	struct static_key k = STATIC_KEY_INIT_FALSE
+#endif
+
+#ifndef DEFINE_STATIC_KEY_TRUE
+#define DEFINE_STATIC_KEY_TRUE(k)	struct static_key k = STATIC_KEY_INIT_TRUE
+#endif
+
+#endif // < 4.3
+#endif // >= 3.4 && CONFIG_JUMP_LABEL
+
+struct user_arg_ptr {
+#ifdef CONFIG_COMPAT
+	bool is_compat;
+#endif
+	union {
+		const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+		const compat_uptr_t __user *compat;
+#endif
+	} ptr;
+};
+
+extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
+
+/**
+ * ksu_copy_from_user_retry
+ * try nofault copy first, if it fails, try with plain
+ * parameters are the same as copy_from_user
+ * 0 = success
  */
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)) &&                         \
-		(LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)) ||             \
-	(LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)) &&                    \
-		(LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0))
-#ifdef HISI_SELINUX_EBITMAP_RO
-#define CONFIG_IS_HW_HISI
+extern long copy_from_user_nofault(void *dst, const void __user *src, size_t size);
+static __always_inline long ksu_copy_from_user_retry(void *to, const void __user *from, unsigned long count)
+{
+	long ret = copy_from_user_nofault(to, from, count);
+	if (likely(!ret))
+		return ret;
+
+	// we faulted! fallback to slow path
+	return copy_from_user(to, from, count);
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) // caller is responsible for sanity (count must be > 0)!
+static inline void ksu_zeroed_strncpy(char *dest, const char *src, size_t count)
+{
+	// zero all `count` bytes of dest, then copy at most count - 1 bytes of src,
+	// so the final byte is always NUL (count == 0 would make count - 1 wrap).
+	__builtin_memset(dest, 0, count); // author's note: faster due to dead store elimination
+	__builtin_strncpy(dest, src, count - 1);
+}
+#define strscpy_pad ksu_zeroed_strncpy // NOTE(review): returns void; callers must not use a return value
 #endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)
+#define strscpy ksu_zeroed_strncpy
 #endif
 
-extern long ksu_strncpy_from_user_nofault(char *dst,
-					  const void __user *unsafe_addr,
-					  long count);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
+#define d_is_reg(dentry) S_ISREG((dentry)->d_inode->i_mode)
+#endif
 
-extern struct file *ksu_filp_open_compat(const char *filename, int flags,
-					 umode_t mode);
-extern ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count,
-				      loff_t *pos);
-extern ssize_t ksu_kernel_write_compat(struct file *p, const void *buf,
-				       size_t count, loff_t *pos);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 5, 0)
+static inline struct user_struct *ksu_alloc_uid(kuid_t uid) { return alloc_uid(current_user_ns(), uid); } // header-defined: keep internal linkage
+#define alloc_uid ksu_alloc_uid // defined after the wrapper, so the call above is the real two-argument alloc_uid
+#endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) ||                           \
-	defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND)
-extern struct key *init_session_keyring;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR)
+struct dir_context { const filldir_t actor; loff_t pos; };
+#define iterate_dir(file, ctx) vfs_readdir(file, (ctx)->actor, ctx)
 #endif
 
-extern int do_close_fd(unsigned int fd);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)
+__weak char *bin2hex(char *dst, const void *src, size_t count)
+{
+	const unsigned char *_src = src;
+	while (count--)
+		dst = pack_hex_byte(dst, *_src++);
+	return dst;
+}
+#endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0)
-extern void *ksu_compat_kvrealloc(const void *p, size_t oldsize, size_t newsize,
-				  gfp_t flags);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
+#define file_inode(f) ((f)->f_path.dentry->d_inode)
 #endif
 
-#ifndef VERIFY_READ
-#define ksu_access_ok(addr, size) access_ok(addr, size)
-#else
-#define ksu_access_ok(addr, size) access_ok(VERIFY_READ, addr, size)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) && !defined(CONFIG_LSM)
+#define selinux_inode(inode) ((inode)->i_security)
+#define selinux_cred(cred) ((cred)->security)
 #endif
 
-// Linux >= 5.7
-// task_work_add (struct, struct, enum)
-// Linux pre-5.7
-// task_work_add (struct, struct, bool)
-#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0)
-#ifndef TWA_RESUME
-#define TWA_RESUME true
+#if LINUX_VERSION_CODE < KERNEL_VERSION (4, 15, 0)
+__weak void groups_sort(struct group_info *group_info) { } // no-op
 #endif
+
+#ifndef U16_MAX
+#define	U16_MAX	((u16)(~0U))
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION (4, 12, 0) && !defined(EPOLLIN)
+#define EPOLLIN		0x00000001
+#define EPOLLPRI	0x00000002
+#define EPOLLOUT	0x00000004
+#define EPOLLERR	0x00000008
+#define EPOLLHUP	0x00000010
+#define EPOLLRDNORM	0x00000040
+#define EPOLLRDBAND	0x00000080
+#define EPOLLWRNORM	0x00000100
+#define EPOLLWRBAND	0x00000200
+#define EPOLLMSG	0x00000400
+#define EPOLLRDHUP	0x00002000
+#endif // < 4.12 && !EPOLLIN
+
+#ifndef READ_ONCE
+#define READ_ONCE(x) (*(const volatile typeof(x) *)&(x))
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 15, 0)
+#define task_ppid_nr(a) (pid_t)sys_getppid()
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 17, 0)
+static inline u64 ksu_ktime_get_ns(void) { return ktime_to_ns(ktime_get()); }
+#define ktime_get_ns ksu_ktime_get_ns
+#endif
+
+// WARNING: no overflow safety!
+#ifndef struct_size
+#define struct_size(p, member, n) (sizeof(*(p)) + (n) * sizeof(*(p)->member))
 #endif
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION (4, 12, 0)
+#ifndef ALIGN_DOWN
+#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a))
 #endif
+#endif
+
+#ifndef untagged_addr
+#define untagged_addr(addr) (addr)
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)
+// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L418
+static noinline ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, loff_t *pos)
+{
+	mm_segment_t old_fs;
+	old_fs = get_fs();
+	set_fs(get_ds());
+	ssize_t result = vfs_read(p, (void __user *)buf, count, pos);
+	set_fs(old_fs);
+	return result;
+}
+// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L512
+static noinline ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count, loff_t *pos)
+{
+	mm_segment_t old_fs;
+	old_fs = get_fs();
+	set_fs(get_ds());
+	ssize_t res = vfs_write(p, (__force const char __user *)buf, count, pos);
+	set_fs(old_fs);
+	return res;
+}
+#define kernel_read ksu_kernel_read_compat
+#define kernel_write ksu_kernel_write_compat
+#endif // < 4.14
+
+static inline void ksu_kfree_byref(void *buf) { kfree(*(void **)buf); }
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 9, 0)
+// hashtable.h, list.h, rculist.h
+// ref: https://github.com/torvalds/linux/commit/b67bfe0d42cac56c512dd5da4b1b347a23f4b70a
+#include "linux_hashtable.h"
+static inline int __must_check ksu_kref_get_unless_zero(struct kref *kref)
+{ 
+	return atomic_add_unless(&kref->refcount, 1, 0); 
+}
+#define kref_get_unless_zero ksu_kref_get_unless_zero
+#endif // < 3.9
+
+/**
+ *  kver agnostic workaround for < 3.14's CONFIG_UIDGID_STRICT_TYPE_CHECKS=n
+ *
+ *  - reinterprets the argument (an lvalue) as an unsigned int (uid_t)
+ *  - redefines current_uid / current_euid macros
+ *
+ * ref
+ *  - https://elixir.bootlin.com/linux/v3.13/source/include/linux/uidgid.h
+ *  - https://elixir.bootlin.com/linux/v3.13/source/include/linux/cred.h#L331
+ */
+#define ksu_get_uid_t(x) (*(unsigned int *)&(x))
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 14, 0)
+#undef current_uid
+#undef current_euid
+typedef struct { uid_t val; } ksu_kuid_t; // single-member wrapper so .val works on these kernels too
+static inline ksu_kuid_t current_uid(void) { return *(ksu_kuid_t *)(&current_cred()->uid); }
+static inline ksu_kuid_t current_euid(void) { return *(ksu_kuid_t *)(&current_cred()->euid); }
+#endif // < 3.14
+
+#endif // __KSU_H_KERNEL_COMPAT
diff --git a/drivers/kernelsu/kernel_includes.h b/drivers/kernelsu/kernel_includes.h
new file mode 100644
index 000000000000..c3ea6cb0db09
--- /dev/null
+++ b/drivers/kernelsu/kernel_includes.h
@@ -0,0 +1,179 @@
+#ifndef __KSU_H_KERNEL_INCLUDES
+#define __KSU_H_KERNEL_INCLUDES
+
+// common
+#include <asm/current.h>
+#include <asm/syscall.h>
+#include <crypto/hash.h>
+#include <linux/aio.h>
+#include <linux/anon_inodes.h>
+#include <linux/atomic.h>
+#include <linux/binfmts.h>
+#include <linux/cache.h>
+#include <linux/capability.h>
+#include <linux/compat.h>
+#include <linux/compiler.h>
+#include <linux/cred.h>
+#include <linux/dcache.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/filter.h>
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/input.h>
+#include <linux/ioctl.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/kref.h>
+#include <linux/kthread.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/lsm_audit.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/nsproxy.h>
+#include <linux/path.h>
+#include <linux/pid.h>
+#include <linux/poll.h>
+#include <linux/printk.h>
+#include <linux/ptrace.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/seccomp.h>
+#include <linux/security.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/syscalls.h>
+#include <linux/thread_info.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/uidgid.h>
+#include <linux/uio.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+
+// versioned / conditional
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)
+#include <linux/hex.h>
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0)
+#include <linux/stop_machine.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0)
+#include <linux/proc_ns.h>
+#else
+#include <linux/proc_fs.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)
+#include <uapi/linux/mount.h>
+#else
+#include <uapi/linux/fs.h>
+#endif
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)
+#include <linux/input-event-codes.h>
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)
+#include <uapi/linux/input.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)
+#include <uapi/asm-generic/errno.h>
+#else
+#include <asm-generic/errno.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
+#include <crypto/sha2.h>
+#else
+#include <crypto/sha.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
+#include <linux/compiler_types.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0)
+#include <uapi/linux/eventpoll.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+#include <linux/sched/task_stack.h>
+#include <uapi/linux/sched/types.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/sched/user.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)
+#include <linux/hashtable.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0)
+#include <linux/task_work.h>
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
+#include <linux/lsm_hooks.h>
+#endif
+
+/**
+ * replace common mem/str functions with builtins
+ * so legacy kernels get better inlining and optimized routines (with newer compilers)
+ * a lot of people rice their flags (mcpu/march), this'll be a good reward for them.
+ * minimum that people use is gcc 4.9 for 3.x kernels, so these are fineee
+ * https://github.com/gcc-mirror/gcc/blob/releases/gcc-4.9/gcc/builtins.def#L562
+ *
+ */
+#if !defined(CONFIG_KSU_DEBUG)
+
+#define memchr		__builtin_memchr
+#define memcmp		__builtin_memcmp
+#define memcpy		__builtin_memcpy
+#define memmove		__builtin_memmove
+#define memset		__builtin_memset
+#define strcasecmp	__builtin_strcasecmp
+#define strcat		__builtin_strcat
+#define strchr		__builtin_strchr
+#define strcmp		__builtin_strcmp
+#define strcpy		__builtin_strcpy
+#define strcspn		__builtin_strcspn
+#define strlen		__builtin_strlen
+#define strncasecmp	__builtin_strncasecmp
+#define strncat		__builtin_strncat
+#define strncmp		__builtin_strncmp
+#define strncpy		__builtin_strncpy
+#define strpbrk		__builtin_strpbrk
+#define strrchr		__builtin_strrchr
+#define strspn		__builtin_strspn
+//#define strstr		__builtin_strstr
+
+#endif // !CONFIG_KSU_DEBUG
+
+#endif // __KSU_H_KERNEL_INCLUDES
diff --git a/drivers/kernelsu/kernel_umount.c b/drivers/kernelsu/kernel_umount.c
deleted file mode 100644
index cd9889ea7f72..000000000000
--- a/drivers/kernelsu/kernel_umount.c
+++ /dev/null
@@ -1,190 +0,0 @@
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/task_work.h>
-#include <linux/cred.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/nsproxy.h>
-#include <linux/path.h>
-#include <linux/printk.h>
-#include <linux/types.h>
-#include <linux/syscalls.h>
-
-#include "kernel_umount.h"
-#include "klog.h" // IWYU pragma: keep
-#include "allowlist.h"
-#include "kernel_compat.h"
-#include "selinux/selinux.h"
-#include "feature.h"
-#include "ksud.h"
-#include "ksu.h"
-
-bool __read_mostly ksu_kernel_umount_enabled = true;
-
-static int kernel_umount_feature_get(u64 *value)
-{
-	*value = ksu_kernel_umount_enabled ? 1 : 0;
-	return 0;
-}
-
-static int kernel_umount_feature_set(u64 value)
-{
-	bool enable = value != 0;
-	ksu_kernel_umount_enabled = enable;
-	pr_info("kernel_umount: set to %d\n", enable);
-	return 0;
-}
-
-static const struct ksu_feature_handler kernel_umount_handler = {
-	.feature_id = KSU_FEATURE_KERNEL_UMOUNT,
-	.name = "kernel_umount",
-	.get_handler = kernel_umount_feature_get,
-	.set_handler = kernel_umount_feature_set,
-};
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
-extern int path_umount(struct path *path, int flags);
-static int ksu_umount_mnt(const char *__never_use_mnt, struct path *path,
-			   int flags)
-{
-	return path_umount(path, flags);
-}
-#else
-static int ksu_sys_umount(const char *mnt, int flags)
-{
-	char __user *usermnt = (char __user *)mnt;
-	mm_segment_t old_fs;
-	int ret = 0;
-
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
-	ret = ksys_umount(usermnt, flags);
-#else
-	// Perhaps its not necessary to cast it
-	ret = (int)sys_umount(usermnt, flags); // cuz asmlinkage long sys##name
-#endif
-	set_fs(old_fs);
-	return ret;
-}
-#define ksu_umount_mnt(mnt, __unused, flags)                                   \
-	({                                                                     \
-		path_put(__unused);                                            \
-		ksu_sys_umount(mnt, flags);                                    \
-	})
-
-#endif
-
-static void try_umount(const char *mnt, int flags)
-{
-	struct path path;
-	int ret = 0;
-	if (kern_path(mnt, 0, &path)) {
-		return;
-	}
-
-	if (path.dentry != path.mnt->mnt_root) {
-		// it is not root mountpoint, maybe umounted by others already.
-		path_put(&path);
-		return;
-	}
-
-	ret = ksu_umount_mnt(mnt, &path, flags);
-	if (ret) {
-		pr_info("%s: umounting %s (flags=0x%x) failed, err: %d\n",
-			__func__, mnt, flags, ret);
-	}
-}
-
-struct umount_tw {
-	struct callback_head cb;
-};
-
-static void umount_tw_func(struct callback_head *cb)
-{
-	struct umount_tw *tw = container_of(cb, struct umount_tw, cb);
-	const struct cred *saved = override_creds(ksu_cred);
-
-	down_read(&mount_list_lock);
-	struct mount_entry *entry;
-	list_for_each_entry (entry, &mount_list, list) {
-		pr_info("%s: unmounting: %s flags 0x%x\n", __func__,
-			entry->umountable, entry->flags);
-		try_umount(entry->umountable, entry->flags);
-	}
-	up_read(&mount_list_lock);
-
-	revert_creds(saved);
-	kfree(tw);
-}
-
-int ksu_handle_umount(uid_t old_uid, uid_t new_uid)
-{
-	// if there isn't any module mounted, just ignore it!
-	if (!ksu_module_mounted) {
-		return 0;
-	}
-
-	if (!ksu_kernel_umount_enabled) {
-		return 0;
-	}
-
-	if (!ksu_cred) {
-		return 0;
-	}
-
-	// There are 5 scenarios:
-	// 1. Normal app: zygote -> appuid
-	// 2. Isolated process forked from zygote: zygote -> isolated_process
-	// 3. App zygote forked from zygote: zygote -> appuid
-	// 4. Isolated process froked from app zygote: appuid -> isolated_process (already handled by 3)
-	// 5. Isolated process froked from webview zygote (no need to handle, app cannot run custom code)
-	if (!is_appuid(new_uid) && !is_isolated_process(new_uid)) {
-		return 0;
-	}
-
-	if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid)) {
-		return 0;
-	}
-
-	// check old process's selinux context, if it is not zygote, ignore it!
-	// because some su apps may setuid to untrusted_app but they are in global mount namespace
-	// when we umount for such process, that is a disaster!
-	// also handle case 4 and 5
-	bool is_zygote_child = is_zygote(get_current_cred());
-	if (!is_zygote_child) {
-		pr_info("handle umount ignore non zygote child: %d\n",
-			current->pid);
-		return 0;
-	}
-	// umount the target mnt
-	pr_info("handle umount for uid: %d, pid: %d\n", new_uid, current->pid);
-
-	struct umount_tw *tw;
-	tw = kzalloc(sizeof(*tw), GFP_ATOMIC);
-	if (!tw)
-		return 0;
-
-	tw->cb.func = umount_tw_func;
-
-	int err = task_work_add(current, &tw->cb, TWA_RESUME);
-	if (err) {
-		kfree(tw);
-		pr_warn("unmount add task_work failed\n");
-	}
-
-	return 0;
-}
-
-void ksu_kernel_umount_init(void)
-{
-	if (ksu_register_feature_handler(&kernel_umount_handler)) {
-		pr_err("Failed to register kernel_umount feature handler\n");
-	}
-}
-
-void ksu_kernel_umount_exit(void)
-{
-	ksu_unregister_feature_handler(KSU_FEATURE_KERNEL_UMOUNT);
-}
diff --git a/drivers/kernelsu/kernel_umount.h b/drivers/kernelsu/kernel_umount.h
deleted file mode 100644
index 96a23fba5bcd..000000000000
--- a/drivers/kernelsu/kernel_umount.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __KSU_H_KERNEL_UMOUNT
-#define __KSU_H_KERNEL_UMOUNT
-
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/rwsem.h>
-
-void ksu_kernel_umount_init(void);
-void ksu_kernel_umount_exit(void);
-
-// Handler function to be called from setresuid hook
-int ksu_handle_umount(uid_t old_uid, uid_t new_uid);
-
-// for the umount list
-struct mount_entry {
-	char *umountable;
-	unsigned int flags;
-	struct list_head list;
-};
-extern struct list_head mount_list;
-extern struct rw_semaphore mount_list_lock;
-
-extern bool __read_mostly ksu_kernel_umount_enabled;
-
-#endif
diff --git a/drivers/kernelsu/kp_hook.c b/drivers/kernelsu/kp_hook.c
deleted file mode 100644
index 23ef72fb14ba..000000000000
--- a/drivers/kernelsu/kp_hook.c
+++ /dev/null
@@ -1,167 +0,0 @@
-#include <linux/kprobes.h>
-#include <linux/compat.h>
-#include <linux/workqueue.h>
-
-#define DECL_KP(name, sym, pre)                                                \
-	struct kprobe name = {                                                 \
-		.symbol_name = sym,                                            \
-		.pre_handler = pre,                                            \
-	}
-
-// ksud.c
-
-static struct work_struct stop_vfs_read_work, stop_execve_hook_work,
-	stop_input_hook_work;
-
-static int sys_execve_handler_pre(struct kprobe *p, struct pt_regs *regs)
-{
-	struct pt_regs *real_regs = PT_REAL_REGS(regs);
-	const char __user **filename_user =
-		(const char **)&PT_REGS_PARM1(real_regs);
-	const char __user *const __user *__argv =
-		(const char __user *const __user *)PT_REGS_PARM2(real_regs);
-	struct user_arg_ptr argv = { .ptr.native = __argv };
-	struct filename filename_in, *filename_p;
-	char path[32];
-
-	if (!filename_user)
-		return 0;
-	if (!ksu_retry_filename_access(filename_user, path, 32, false))
-		return 0;
-
-	filename_in.name = path;
-	filename_p = &filename_in;
-	return ksu_handle_execveat_ksud((int *)AT_FDCWD, &filename_p, &argv,
-					NULL, NULL);
-}
-
-static int sys_read_handler_pre(struct kprobe *p, struct pt_regs *regs)
-{
-	struct pt_regs *real_regs = PT_REAL_REGS(regs);
-	unsigned int fd = PT_REGS_PARM1(real_regs);
-	char __user **buf_ptr = (char __user **)&PT_REGS_PARM2(real_regs);
-	size_t *count_ptr = (size_t *)&PT_REGS_PARM3(real_regs);
-
-	return ksu_handle_sys_read(fd, buf_ptr, count_ptr);
-}
-
-static int input_handle_event_handler_pre(struct kprobe *p,
-					  struct pt_regs *regs)
-{
-	unsigned int *type = (unsigned int *)&PT_REGS_PARM2(regs);
-	unsigned int *code = (unsigned int *)&PT_REGS_PARM3(regs);
-	int *value = (int *)&PT_REGS_CCALL_PARM4(regs);
-	return ksu_handle_input_handle_event(type, code, value);
-}
-
-static DECL_KP(execve_kp, SYS_EXECVE_SYMBOL, sys_execve_handler_pre);
-static DECL_KP(vfs_read_kp, SYS_READ_SYMBOL, sys_read_handler_pre);
-static DECL_KP(input_event_kp, "input_event", input_handle_event_handler_pre);
-
-static void do_stop_vfs_read_hook(struct work_struct *work)
-{
-	unregister_kprobe(&vfs_read_kp);
-}
-
-static void do_stop_execve_hook(struct work_struct *work)
-{
-	unregister_kprobe(&execve_kp);
-}
-
-static void do_stop_input_hook(struct work_struct *work)
-{
-	unregister_kprobe(&input_event_kp);
-}
-
-void kp_handle_ksud_stop(enum ksud_stop_code stop_code)
-{
-	bool ret;
-	switch (stop_code) {
-	case VFS_READ_HOOK_KP: {
-		ret = schedule_work(&stop_vfs_read_work);
-		pr_info("unregister vfs_read kprobe: %d!\n", ret);
-		break;
-	}
-	case EXECVE_HOOK_KP: {
-		ret = schedule_work(&stop_execve_hook_work);
-		pr_info("unregister execve kprobe: %d!\n", ret);
-		break;
-	}
-	case INPUT_EVENT_HOOK_KP: {
-		static bool input_hook_stopped = false;
-		if (input_hook_stopped) {
-			return;
-		}
-		input_hook_stopped = true;
-		ret = schedule_work(&stop_input_hook_work);
-		pr_info("unregister input kprobe: %d!\n", ret);
-		break;
-	}
-	default:
-		return;
-	}
-	return;
-}
-
-void kp_handle_ksud_init(void)
-{
-	int ret;
-
-	ret = register_kprobe(&execve_kp);
-	pr_info("ksud: execve_kp: %d\n", ret);
-
-	ret = register_kprobe(&vfs_read_kp);
-	pr_info("ksud: vfs_read_kp: %d\n", ret);
-
-	ret = register_kprobe(&input_event_kp);
-	pr_info("ksud: input_event_kp: %d\n", ret);
-
-	INIT_WORK(&stop_vfs_read_work, do_stop_vfs_read_hook);
-	INIT_WORK(&stop_execve_hook_work, do_stop_execve_hook);
-	INIT_WORK(&stop_input_hook_work, do_stop_input_hook);
-}
-
-void kp_handle_ksud_exit(void)
-{
-	unregister_kprobe(&execve_kp);
-	// this should be done before unregister vfs_read_kp
-	// unregister_kprobe(&vfs_read_kp);
-	unregister_kprobe(&input_event_kp);
-}
-
-// supercalls.c
-
-extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd,
-				 void __user **arg);
-
-static int reboot_handler_pre(struct kprobe *p, struct pt_regs *regs)
-{
-	struct pt_regs *real_regs = PT_REAL_REGS(regs);
-	int magic1 = (int)PT_REGS_PARM1(real_regs);
-	int magic2 = (int)PT_REGS_PARM2(real_regs);
-	void __user **arg = (void __user **)&PT_REGS_SYSCALL_PARM4(real_regs);
-
-	// cmd is not really used here, so we NULL!
-	if (ksu_handle_sys_reboot(magic1, magic2, NULL, arg)) {
-		pr_err("kp_hook: sys_reboot failure\n");
-	}
-
-	return 0;
-}
-
-static DECL_KP(reboot_kp, REBOOT_SYMBOL, reboot_handler_pre);
-
-void kp_handle_supercalls_init(void)
-{
-	int rc = register_kprobe(&reboot_kp);
-	if (rc) {
-		pr_err("reboot kprobe failed: %d\n", rc);
-		return;
-	}
-	pr_info("reboot kprobe registered successfully\n");
-}
-
-void kp_handle_supercalls_exit(void)
-{
-	unregister_kprobe(&reboot_kp);
-}
diff --git a/drivers/kernelsu/kp_hook.h b/drivers/kernelsu/kp_hook.h
deleted file mode 100644
index 708e78665ba8..000000000000
--- a/drivers/kernelsu/kp_hook.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __KSU_H_KP_HOOK
-#define __KSU_H_KP_HOOK
-
-// ksud.c
-enum ksud_stop_code {
-	VFS_READ_HOOK_KP = 0,
-	EXECVE_HOOK_KP,
-	INPUT_EVENT_HOOK_KP,
-};
-
-int ksu_handle_sys_read(unsigned int fd, char __user **buf_ptr,
-			size_t *count_ptr);
-
-int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code,
-				  int *value);
-
-void kp_handle_ksud_stop(enum ksud_stop_code);
-void kp_handle_ksud_init(void);
-void kp_handle_ksud_exit(void);
-
-// supercalls.c
-void kp_handle_supercalls_init(void);
-void kp_handle_supercalls_exit(void);
-
-#endif
diff --git a/drivers/kernelsu/kp_util.c b/drivers/kernelsu/kp_util.c
deleted file mode 100644
index 05e6715672c8..000000000000
--- a/drivers/kernelsu/kp_util.c
+++ /dev/null
@@ -1,120 +0,0 @@
-#include <linux/mm.h>
-#include <linux/pgtable.h>
-#include <linux/printk.h>
-#include <linux/preempt.h>
-#include <asm/current.h>
-
-static bool try_set_access_flag(unsigned long addr)
-{
-#ifdef CONFIG_ARM64
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep, pte;
-	spinlock_t *ptl;
-	bool ret = false;
-
-	if (!mm)
-		return false;
-
-	if (!mmap_read_trylock(mm))
-		return false;
-
-	vma = find_vma(mm, addr);
-	if (!vma || addr < vma->vm_start)
-		goto out_unlock;
-
-	pgd = pgd_offset(mm, addr);
-	if (!pgd_present(*pgd))
-		goto out_unlock;
-
-	p4d = p4d_offset(pgd, addr);
-	if (!p4d_present(*p4d))
-		goto out_unlock;
-
-	pud = pud_offset(p4d, addr);
-	if (!pud_present(*pud))
-		goto out_unlock;
-
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
-		goto out_unlock;
-
-	if (pmd_trans_huge(*pmd))
-		goto out_unlock;
-
-	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
-	if (!ptep)
-		goto out_unlock;
-
-	pte = *ptep;
-
-	if (!pte_present(pte))
-		goto out_pte_unlock;
-
-	if (pte_young(pte)) {
-		ret = true;
-		goto out_pte_unlock;
-	}
-
-	ptep_set_access_flags(vma, addr, ptep, pte_mkyoung(pte), 0);
-	pr_info("set AF for addr %lx\n", addr);
-	ret = true;
-
-out_pte_unlock:
-	pte_unmap_unlock(ptep, ptl);
-out_unlock:
-	mmap_read_unlock(mm);
-	return ret;
-#else
-	return false;
-#endif
-}
-
-bool ksu_retry_filename_access(const char __user **char_usr_ptr, char *dest,
-			       size_t dest_len, bool exit_atomic_ctx)
-{
-	unsigned long addr;
-	const char __user *fn;
-	long ret;
-
-	if (!char_usr_ptr)
-		return false;
-
-	addr = untagged_addr((unsigned long)*char_usr_ptr);
-#ifdef CONFIG_KSU_DEBUG
-	pr_info("got addr: %lu\n", addr);
-#endif
-	fn = (const char __user *)addr;
-	memset(dest, 0, dest_len);
-	ret = ksu_strncpy_from_user_nofault(dest, fn, dest_len);
-
-	if (ret < 0 && try_set_access_flag(addr)) {
-		ret = ksu_strncpy_from_user_nofault(dest, fn, dest_len);
-	}
-
-	/*
-	 * This is crazy, but we know what we are doing:
-         * Temporarily exit atomic context to handle page faults, then restore it.
-         */
-	if (exit_atomic_ctx) {
-		if (ret < 0 && preempt_count()) {
-#ifdef CONFIG_KSU_DEBUG
-			pr_info("access to pointer failed, attempting to rescue..\n");
-#endif
-			preempt_enable_no_resched_notrace();
-			ret = strncpy_from_user(dest, fn, dest_len);
-			preempt_disable_notrace();
-		}
-	}
-
-	if (ret < 0) {
-		pr_err("all fallback were tried. err: %lu\n", ret);
-		return false;
-	}
-
-	return true;
-}
diff --git a/drivers/kernelsu/kp_util.h b/drivers/kernelsu/kp_util.h
deleted file mode 100644
index b9128964d6a8..000000000000
--- a/drivers/kernelsu/kp_util.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __KSU_H_KP_UTIL
-#define __KSU_H_KP_UTIL
-#include <linux/types.h>
-
-#ifndef preempt_enable_no_resched_notrace
-#define preempt_enable_no_resched_notrace()                                    \
-	do {                                                                   \
-		barrier();                                                     \
-		__preempt_count_dec();                                         \
-	} while (0)
-#endif
-
-#ifndef preempt_disable_notrace
-#define preempt_disable_notrace()                                              \
-	do {                                                                   \
-		__preempt_count_inc();                                         \
-		barrier();                                                     \
-	} while (0)
-#endif
-
-bool ksu_retry_filename_access(const char __user **char_usr_ptr, char *dest,
-			       size_t dest_len, bool exit_atomic_ctx);
-
-#endif
diff --git a/drivers/kernelsu/ksu.c b/drivers/kernelsu/ksu.c
new file mode 100644
index 000000000000..79b98fd73e21
--- /dev/null
+++ b/drivers/kernelsu/ksu.c
@@ -0,0 +1,164 @@
+#include "kernel_includes.h"
+
+// uapi
+#include "include/uapi/app_profile.h"
+#include "include/uapi/feature.h"
+#include "include/uapi/selinux.h"
+#include "include/uapi/supercall.h"
+#include "include/uapi/sulog.h"
+
+// includes
+#include "include/klog.h"
+#include "include/arch.h"
+#include "include/ksu.h"
+
+// selinux includes
+#include "avc_ss.h"
+#include "objsec.h"
+#include "ss/services.h"
+#include "ss/symtab.h"
+#include "xfrm.h"
+#ifndef KSU_COMPAT_USE_SELINUX_STATE
+#include "avc.h"
+#endif
+
+// kernel compat, lite ones
+#include "kernel_compat.h"
+
+#include "policy/app_profile.h"
+#include "policy/allowlist.h"
+#include "policy/feature.h"
+#include "manager/apk_sign.h"
+#include "manager/manager_identity.h"
+#include "manager/throne_tracker.h"
+#include "supercall/internal.h"
+#include "supercall/supercall.h"
+#include "infra/su_mount_ns.h"
+#include "infra/file_wrapper.h"
+#include "infra/event_queue.h"
+#include "feature/adb_root.h"
+#include "feature/kernel_umount.h"
+#include "feature/selinux_hide.h"
+#include "feature/sucompat.h"
+#include "feature/sulog.h"
+#include "runtime/ksud.h"
+#include "runtime/ksud_escape.h"
+#include "sulog/event.h"
+#include "sulog/fd.h"
+
+#include "selinux/selinux.h"
+#include "selinux/sepolicy.h"
+
+// unity build
+#include "tiny_sulog.c"
+#include "policy/allowlist.c"
+#include "policy/app_profile.c"
+#include "policy/feature.c"
+#include "manager/apk_sign.c"
+#include "manager/pkg_observer.c"
+#include "manager/throne_tracker.c"
+
+#include "supercall/perm.c"
+#include "supercall/dispatch.c"
+#include "supercall/supercall.c"
+
+#include "infra/su_mount_ns.c"
+#include "infra/file_wrapper.c"
+#include "infra/event_queue.c"
+
+#include "feature/adb_root.c"
+#include "feature/kernel_umount.c"
+#include "feature/selinux_hide.c"
+#include "feature/sucompat.c"
+#include "feature/sulog.c"
+#include "runtime/ksud.c"
+#include "runtime/ksud_escape.c"
+
+#include "sulog/event.c"
+#include "sulog/fd.c"
+
+#include "hook/setuid_hook.c"
+#include "hook/core_hook.c"	// lsm
+
+#include "selinux/selinux.c"
+#include "selinux/sepolicy.c"
+#include "selinux/rules.c"
+
+#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE
+#ifdef CONFIG_ARM64
+	#include "hook/syscall_table_hook_arm64.c"
+#elif defined(CONFIG_ARM)
+	#include "hook/syscall_table_hook_arm.c"
+#endif
+#endif /* CONFIG_KSU_TAMPER_SYSCALL_TABLE */
+
+#if defined(CONFIG_KSU_KPROBES_KSUD) && !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE)
+#include "hook/kp_ksud.c"
+#endif
+
+// __weak fn's
+#include "kernel_compat.c"
+
+struct cred* ksu_cred;
+
+extern void ksu_supercalls_init(void); // explicit (void): a real prototype, not an old-style declaration
+
+int __init kernelsu_init(void) // boot-time entry point, registered below via device_initcall()
+{
+#ifdef CONFIG_KSU_DEBUG
+	pr_alert("*************************************************************");
+	pr_alert("**     NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE    **");
+	pr_alert("**                                                         **");
+	pr_alert("**         You are running KernelSU in DEBUG mode          **");
+	pr_alert("**                                                         **");
+	pr_alert("**     NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE    **");
+	pr_alert("*************************************************************");
+#endif
+
+	ksu_cred = prepare_creds(); // stored in the global ksu_cred for later use
+	if (!ksu_cred) {
+		pr_err("prepare cred failed!\n");
+	} // NOTE(review): init continues with ksu_cred == NULL; confirm every user checks it
+
+	ksu_feature_init();
+
+	ksu_supercalls_init();
+
+	ksu_sucompat_init(); // so the feature is registered
+
+	ksu_kernel_umount_init(); // so the feature is registered
+
+#ifdef CONFIG_KSU_FEATURE_SULOG
+	ksu_sulog_init(); // so the feature is registered
+#endif
+
+#ifdef CONFIG_KSU_FEATURE_ADBROOT
+	ksu_adb_root_init(); // so the feature is registered
+#endif
+
+#ifdef CONFIG_KSU_FEATURE_SELINUX_HIDE
+	ksu_selinux_hide_init();
+#endif
+
+	ksu_core_init();
+
+#if defined(CONFIG_KSU_KPROBES_KSUD) && !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE)
+	kp_ksud_init(); // only when hook/kp_ksud.c was pulled into this unity build
+#endif
+
+	ksu_allowlist_init();
+
+	ksu_throne_tracker_init();
+
+	ksu_ksud_init();
+
+	ksu_file_wrapper_init();
+
+	return 0; // always reports success, even if prepare_creds() failed above
+}
+
+device_initcall(kernelsu_init);
+
+// MODULE_LICENSE("GPL");
+// MODULE_AUTHOR("weishu");
+// MODULE_DESCRIPTION("Android KernelSU");
diff --git a/drivers/kernelsu/ksud.c b/drivers/kernelsu/ksud.c
deleted file mode 100644
index c880d2270c3a..000000000000
--- a/drivers/kernelsu/ksud.c
+++ /dev/null
@@ -1,644 +0,0 @@
-#include <linux/rcupdate.h>
-#include <linux/slab.h>
-#include <linux/task_work.h>
-#include <asm/current.h>
-#include <linux/compat.h>
-#include <linux/cred.h>
-#include <linux/dcache.h>
-#include <linux/err.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/version.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)
-#include <linux/input-event-codes.h>
-#else
-#include <uapi/linux/input.h>
-#endif
-#include <linux/aio.h>
-#include <linux/printk.h>
-#include <linux/types.h>
-#include <linux/uaccess.h>
-#include <linux/namei.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-#include <linux/sched/signal.h>
-#else
-#include <linux/sched.h>
-#endif
-
-#include "manager.h"
-#include "allowlist.h"
-#include "arch.h"
-#include "kernel_compat.h"
-#include "klog.h" // IWYU pragma: keep
-#include "ksud.h"
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#include "kp_hook.h"
-#endif
-#include "selinux/selinux.h"
-#include "throne_tracker.h"
-
-#if defined(CONFIG_KSU_SYSCALL_HOOK) ||                                        \
-	(LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) &&                      \
-	 defined(CONFIG_KSU_MANUAL_HOOK))
-extern int ksu_observer_init(void);
-#endif
-
-bool ksu_module_mounted __read_mostly = false;
-bool ksu_boot_completed __read_mostly = false;
-
-static const char KERNEL_SU_RC[] =
-	"\n"
-
-	"on post-fs-data\n"
-	"    start logd\n"
-	// We should wait for the post-fs-data finish
-	"    exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH
-	" post-fs-data\n"
-	"\n"
-
-	"on nonencrypted\n"
-	"    exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n"
-	"\n"
-
-	"on property:vold.decrypt=trigger_restart_framework\n"
-	"    exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n"
-	"\n"
-
-	"on property:sys.boot_completed=1\n"
-	"    exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH
-	" boot-completed\n"
-	"\n"
-
-	"\n";
-
-static void stop_vfs_read_hook(void);
-static void stop_execve_hook(void);
-static void stop_input_hook(void);
-
-#ifdef CONFIG_KSU_MANUAL_HOOK
-bool ksu_vfs_read_hook __read_mostly = true;
-bool ksu_execveat_hook __read_mostly = true;
-bool ksu_input_hook __read_mostly = true;
-#endif
-
-void on_post_fs_data(void)
-{
-	static bool already_post_fs_data = false;
-	if (already_post_fs_data) {
-		pr_info("on_post_fs_data already done\n");
-		return;
-	}
-	already_post_fs_data = true;
-	pr_info("on_post_fs_data!\n");
-	ksu_load_allow_list();
-#if defined(CONFIG_KSU_SYSCALL_HOOK) ||                                        \
-	(LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) &&                      \
-	 defined(CONFIG_KSU_MANUAL_HOOK))
-	ksu_observer_init();
-#endif
-	stop_input_hook();
-}
-
-extern void ext4_unregister_sysfs(struct super_block *sb);
-int nuke_ext4_sysfs(const char *mnt)
-{
-	struct path path;
-	int err = kern_path(mnt, 0, &path);
-	if (err) {
-		pr_err("nuke path err: %d\n", err);
-		return err;
-	}
-
-	struct super_block *sb = path.dentry->d_inode->i_sb;
-	const char *name = sb->s_type->name;
-	if (strcmp(name, "ext4") != 0) {
-		pr_info("nuke but module aren't mounted\n");
-		path_put(&path);
-		return -EINVAL;
-	}
-
-	ext4_unregister_sysfs(sb);
-	path_put(&path);
-	return 0;
-}
-
-void on_module_mounted(void)
-{
-	pr_info("on_module_mounted!\n");
-	ksu_module_mounted = true;
-}
-
-void on_boot_completed(void)
-{
-	ksu_boot_completed = true;
-	pr_info("on_boot_completed!\n");
-#if defined(CONFIG_KSU_SYSCALL_HOOK) ||                                        \
-	(LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) &&                      \
-	 defined(CONFIG_KSU_MANUAL_HOOK))
-	track_throne(true);
-#endif
-}
-
-#define MAX_ARG_STRINGS 0x7FFFFFFF
-
-static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
-{
-	const char __user *native;
-
-#ifdef CONFIG_COMPAT
-	if (unlikely(argv.is_compat)) {
-		compat_uptr_t compat;
-
-		if (get_user(compat, argv.ptr.compat + nr))
-			return ERR_PTR(-EFAULT);
-
-		return compat_ptr(compat);
-	}
-#endif
-
-	if (get_user(native, argv.ptr.native + nr))
-		return ERR_PTR(-EFAULT);
-
-	return native;
-}
-
-/*
- * count() counts the number of strings in array ARGV.
- */
-
-/*
- * Make sure old GCC compiler can use __maybe_unused,
- * Test passed in 4.4.x ~ 4.9.x when use GCC.
- */
-
-static int __maybe_unused count(struct user_arg_ptr argv, int max)
-{
-	int i = 0;
-
-	if (argv.ptr.native != NULL) {
-		for (;;) {
-			const char __user *p = get_user_arg_ptr(argv, i);
-
-			if (!p)
-				break;
-
-			if (IS_ERR(p))
-				return -EFAULT;
-
-			if (i >= max)
-				return -E2BIG;
-			++i;
-
-			if (fatal_signal_pending(current))
-				return -ERESTARTNOHAND;
-#ifdef CONFIG_KSU_MANUAL_HOOK
-			cond_resched();
-#endif
-		}
-	}
-	return i;
-}
-
-static void on_post_fs_data_cbfun(struct callback_head *cb)
-{
-	on_post_fs_data();
-}
-
-static struct callback_head on_post_fs_data_cb = {
-	.func = on_post_fs_data_cbfun
-};
-
-static inline void handle_second_stage(void)
-{
-	apply_kernelsu_rules();
-	cache_sid();
-	setup_ksu_cred();
-}
-
-static bool check_argv(struct user_arg_ptr argv, int index,
-		       const char *expected, char *buf, size_t buf_len)
-{
-	const char __user *p;
-	int argc;
-	long ret;
-
-	argc = count(argv, MAX_ARG_STRINGS);
-	if (argc <= index) {
-		return false;
-	}
-
-	p = get_user_arg_ptr(argv, index);
-	if (IS_ERR_OR_NULL(p)) {
-		if (PTR_ERR(p)) {
-			pr_err("check_argv: invalid user pointer, err: %ld\n",
-			       PTR_ERR(p));
-		}
-		return false;
-	}
-
-	ret = ksu_strncpy_from_user_nofault(buf, p, buf_len);
-	if (ret <= 0) {
-		pr_err("check_argv: failed to copy pointer, err: %ld\n", ret);
-		return false;
-	}
-
-	buf[buf_len - 1] = '\0';
-
-	return !strcmp(buf, expected);
-}
-
-// IMPORTANT NOTE: the call from execve_handler_pre WON'T provided correct value for envp and flags in GKI version
-int ksu_handle_execveat_ksud(int *fd, struct filename **filename_ptr,
-			     struct user_arg_ptr *argv,
-			     struct user_arg_ptr *envp, int *flags)
-{
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	if (!ksu_execveat_hook) {
-		return 0;
-	}
-#endif
-	struct filename *filename;
-
-	static const char app_process[] = "/system/bin/app_process";
-	static bool first_zygote = true;
-
-	/* This applies to versions Android 10+ */
-	static const char system_bin_init[] = "/system/bin/init";
-	/* This applies to versions between Android 6 ~ 9  */
-	static const char old_system_init[] = "/init";
-	static bool init_second_stage_executed = false;
-
-	if (!filename_ptr)
-		return 0;
-
-	filename = *filename_ptr;
-	if (IS_ERR(filename)) {
-		return 0;
-	}
-
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	if (current->pid != 1 && is_init(get_current_cred())) {
-		if (unlikely(strcmp(filename->name, KSUD_PATH) == 0)) {
-			pr_info("escape to root for init executing ksud: %d\n",
-				current->pid);
-			escape_to_root_for_init();
-		}
-	}
-#endif
-
-	if (unlikely(!memcmp(filename->name, system_bin_init,
-			     sizeof(system_bin_init) - 1) &&
-		     argv)) {
-		char buf[16];
-		if (!init_second_stage_executed &&
-		    check_argv(*argv, 1, "second_stage", buf, sizeof(buf))) {
-			pr_info("/system/bin/init second_stage executed\n");
-			handle_second_stage();
-			init_second_stage_executed = true;
-		}
-	} else if (unlikely(!memcmp(filename->name, old_system_init,
-				    sizeof(old_system_init) - 1) &&
-			    argv)) {
-		char buf[16];
-		if (!init_second_stage_executed &&
-		    check_argv(*argv, 1, "--second-stage", buf, sizeof(buf))) {
-			/* This applies to versions between Android 6 ~ 7 */
-			pr_info("/init second_stage executed\n");
-			handle_second_stage();
-			init_second_stage_executed = true;
-		} else if (count(*argv, MAX_ARG_STRINGS) == 1 &&
-			   !init_second_stage_executed && envp) {
-			/* This applies to versions between Android 8 ~ 9  */
-			int envc = count(*envp, MAX_ARG_STRINGS);
-			if (envc > 0) {
-				int n;
-				for (n = 1; n <= envc; n++) {
-					const char __user *p =
-						get_user_arg_ptr(*envp, n);
-					if (!p || IS_ERR(p)) {
-						continue;
-					}
-					char env[256];
-					// Reading environment variable strings from user space
-					if (ksu_strncpy_from_user_nofault(
-						    env, p, sizeof(env)) < 0)
-						continue;
-					// Parsing environment variable names and values
-					char *env_name = env;
-					char *env_value = strchr(env, '=');
-					if (env_value == NULL)
-						continue;
-					// Replace equal sign with string terminator
-					*env_value = '\0';
-					env_value++;
-					// Check if the environment variable name and value are matching
-					if (!strcmp(env_name,
-						    "INIT_SECOND_STAGE") &&
-					    (!strcmp(env_value, "1") ||
-					     !strcmp(env_value, "true"))) {
-						pr_info("/init second_stage executed\n");
-						handle_second_stage();
-						init_second_stage_executed =
-							true;
-					}
-				}
-			}
-		}
-	}
-
-	if (unlikely(first_zygote &&
-		     !memcmp(filename->name, app_process,
-			     sizeof(app_process) - 1) &&
-		     argv)) {
-		char buf[16];
-		if (check_argv(*argv, 1, "-Xzygote", buf, sizeof(buf))) {
-			pr_info("exec zygote, /data prepared, second_stage: %d\n",
-				init_second_stage_executed);
-			rcu_read_lock();
-			struct task_struct *init_task =
-				rcu_dereference(current->real_parent);
-			if (init_task)
-				task_work_add(init_task, &on_post_fs_data_cb,
-					      TWA_RESUME);
-			rcu_read_unlock();
-			first_zygote = false;
-			stop_execve_hook();
-		}
-	}
-
-	return 0;
-}
-
-static ssize_t (*orig_read)(struct file *, char __user *, size_t, loff_t *);
-static ssize_t (*orig_read_iter)(struct kiocb *, struct iov_iter *);
-static struct file_operations fops_proxy;
-static ssize_t ksu_rc_pos = 0;
-const size_t ksu_rc_len = sizeof(KERNEL_SU_RC) - 1;
-
-// https://cs.android.com/android/platform/superproject/main/+/main:system/core/init/parser.cpp;l=144;drc=61197364367c9e404c7da6900658f1b16c42d0da
-// https://cs.android.com/android/platform/superproject/main/+/main:system/libbase/file.cpp;l=241-243;drc=61197364367c9e404c7da6900658f1b16c42d0da
-// The system will read init.rc file until EOF, whenever read() returns 0,
-// so we begin append ksu rc when we meet EOF.
-
-static ssize_t read_proxy(struct file *file, char __user *buf, size_t count,
-			  loff_t *pos)
-{
-	ssize_t ret = 0;
-	size_t append_count;
-	if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len)
-		goto append_ksu_rc;
-
-	ret = orig_read(file, buf, count, pos);
-	if (ret != 0 || ksu_rc_pos >= ksu_rc_len) {
-		return ret;
-	} else {
-		pr_info("read_proxy: orig read finished, start append rc\n");
-	}
-append_ksu_rc:
-	append_count = ksu_rc_len - ksu_rc_pos;
-	if (append_count > count - ret)
-		append_count = count - ret;
-	// copy_to_user returns the number of not copied
-	if (copy_to_user(buf + ret, KERNEL_SU_RC + ksu_rc_pos, append_count)) {
-		pr_info("read_proxy: append error, totally appended %zd\n",
-			ksu_rc_pos);
-	} else {
-		pr_info("read_proxy: append %zu\n", append_count);
-
-		ksu_rc_pos += append_count;
-		if (ksu_rc_pos == ksu_rc_len) {
-			pr_info("read_proxy: append done\n");
-		}
-		ret += append_count;
-	}
-
-	return ret;
-}
-
-static ssize_t read_iter_proxy(struct kiocb *iocb, struct iov_iter *to)
-{
-	ssize_t ret = 0;
-	size_t append_count;
-	if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len)
-		goto append_ksu_rc;
-
-	ret = orig_read_iter(iocb, to);
-	if (ret != 0 || ksu_rc_pos >= ksu_rc_len) {
-		return ret;
-	} else {
-		pr_info("read_iter_proxy: orig read finished, start append rc\n");
-	}
-append_ksu_rc:
-	// copy_to_iter returns the number of copied bytes
-	append_count = copy_to_iter(KERNEL_SU_RC + ksu_rc_pos,
-				    ksu_rc_len - ksu_rc_pos, to);
-	if (!append_count) {
-		pr_info("read_iter_proxy: append error, totally appended %zd\n",
-			ksu_rc_pos);
-	} else {
-		pr_info("read_iter_proxy: append %zu\n", append_count);
-
-		ksu_rc_pos += append_count;
-		if (ksu_rc_pos == ksu_rc_len) {
-			pr_info("read_iter_proxy: append done\n");
-		}
-		ret += append_count;
-	}
-	return ret;
-}
-
-static bool check_init_path(char *dpath)
-{
-	const char *valid_paths[] = { "/system/etc/init/hw/init.rc",
-				      "/init.rc" };
-	bool path_match = false;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(valid_paths); i++) {
-		if (strcmp(dpath, valid_paths[i]) == 0) {
-			path_match = true;
-			break;
-		}
-	}
-
-	if (!path_match) {
-		pr_err("vfs_read: couldn't determine init.rc path for %s\n",
-		       dpath);
-		return false;
-	}
-
-	pr_info("vfs_read: got init.rc path: %s\n", dpath);
-	return true;
-}
-
-int ksu_handle_vfs_read(struct file **file_ptr, char __user **buf_ptr,
-			size_t *count_ptr, loff_t **pos)
-{
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	if (!ksu_vfs_read_hook) {
-		return 0;
-	}
-#endif
-
-	struct file *file;
-	size_t count;
-
-	if (strcmp(current->comm, "init")) {
-		// we are only interest in `init` process
-		return 0;
-	}
-
-	file = *file_ptr;
-	if (IS_ERR(file)) {
-		return 0;
-	}
-
-	if (!d_is_reg(file->f_path.dentry)) {
-		return 0;
-	}
-
-	const char *short_name = file->f_path.dentry->d_name.name;
-	if (strcmp(short_name, "init.rc")) {
-		// we are only interest `init.rc` file name file
-		return 0;
-	}
-	char path[256];
-	char *dpath = d_path(&file->f_path, path, sizeof(path));
-
-	if (IS_ERR(dpath)) {
-		return 0;
-	}
-
-	if (!check_init_path(dpath)) {
-		return 0;
-	}
-
-	// we only process the first read
-	static bool rc_hooked = false;
-	if (rc_hooked) {
-		// we don't need this kprobe, unregister it!
-		stop_vfs_read_hook();
-		return 0;
-	}
-	rc_hooked = true;
-
-	// now we can sure that the init process is reading
-	// `/system/etc/init/hw/init.rc` or `/init.rc`
-	count = *count_ptr;
-
-	pr_info("vfs_read: %s, comm: %s, count: %zu, rc_count: %zu\n", dpath,
-		current->comm, count, ksu_rc_len);
-
-	// Now we need to proxy the read and modify the result!
-	// But, we can not modify the file_operations directly, because it's in read-only memory.
-	// We just replace the whole file_operations with a proxy one.
-	memcpy(&fops_proxy, file->f_op, sizeof(struct file_operations));
-	orig_read = file->f_op->read;
-	if (orig_read) {
-		fops_proxy.read = read_proxy;
-	}
-	orig_read_iter = file->f_op->read_iter;
-	if (orig_read_iter) {
-		fops_proxy.read_iter = read_iter_proxy;
-	}
-	// replace the file_operations
-	file->f_op = &fops_proxy;
-
-	return 0;
-}
-
-int ksu_handle_sys_read(unsigned int fd, char __user **buf_ptr,
-			size_t *count_ptr)
-{
-	struct file *file = fget(fd);
-	if (!file) {
-		return 0;
-	}
-	int result = ksu_handle_vfs_read(&file, buf_ptr, count_ptr, NULL);
-	fput(file);
-	return result;
-}
-
-static unsigned int volumedown_pressed_count = 0;
-
-static bool is_volumedown_enough(unsigned int count)
-{
-	return count >= 3;
-}
-
-int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code,
-				  int *value)
-{
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	if (!ksu_input_hook) {
-		return 0;
-	}
-#endif
-
-	if (*type == EV_KEY && *code == KEY_VOLUMEDOWN && *value) {
-		// key pressed, count it
-		volumedown_pressed_count++;
-		pr_info("input_handle_event: vol_down pressed count: %u\n",
-			volumedown_pressed_count);
-		if (is_volumedown_enough(volumedown_pressed_count)) {
-			pr_info("input_handle_event: vol_down pressed MAX! safe mode is active!\n");
-			stop_input_hook();
-		}
-	}
-
-	return 0;
-}
-
-bool ksu_is_safe_mode(void)
-{
-	return is_volumedown_enough(volumedown_pressed_count);
-}
-
-static void stop_vfs_read_hook(void)
-{
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	kp_handle_ksud_stop(VFS_READ_HOOK_KP);
-#else
-	ksu_vfs_read_hook = false;
-	pr_info("stop vfs_read_hook\n");
-#endif
-}
-
-static void stop_execve_hook(void)
-{
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	kp_handle_ksud_stop(EXECVE_HOOK_KP);
-#else
-	ksu_execveat_hook = false;
-	pr_info("stop execve_hook\n");
-#endif
-}
-
-static void stop_input_hook(void)
-{
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	kp_handle_ksud_stop(INPUT_EVENT_HOOK_KP);
-#else
-	// No need to stop when its already stopped.
-	if (!ksu_input_hook) {
-		return;
-	}
-	ksu_input_hook = false;
-	pr_info("stop input_hook\n");
-#endif
-}
-
-// ksud: module support
-void ksu_ksud_init(void)
-{
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	kp_handle_ksud_init();
-#endif
-}
-
-void ksu_ksud_exit(void)
-{
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	kp_handle_ksud_exit();
-#endif
-}
diff --git a/drivers/kernelsu/ksud.h b/drivers/kernelsu/ksud.h
deleted file mode 100644
index 68c545714c24..000000000000
--- a/drivers/kernelsu/ksud.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __KSU_H_KSUD
-#define __KSU_H_KSUD
-
-#include <linux/types.h>
-
-#define KSUD_PATH "/data/adb/ksud"
-
-void ksu_ksud_init(void);
-void ksu_ksud_exit(void);
-
-void on_post_fs_data(void);
-void on_module_mounted(void);
-void on_boot_completed(void);
-
-bool ksu_is_safe_mode(void);
-
-int nuke_ext4_sysfs(const char *mnt);
-
-extern u32 ksu_file_sid;
-extern bool ksu_module_mounted;
-extern bool ksu_boot_completed;
-
-struct user_arg_ptr {
-#ifdef CONFIG_COMPAT
-	bool is_compat;
-#endif
-	union {
-		const char __user *const __user *native;
-#ifdef CONFIG_COMPAT
-		const compat_uptr_t __user *compat;
-#endif
-	} ptr;
-};
-
-int ksu_handle_execveat_ksud(int *fd, struct filename **filename_ptr,
-			     struct user_arg_ptr *argv,
-			     struct user_arg_ptr *envp, int *flags);
-
-#endif
diff --git a/drivers/kernelsu/ksuinit.c b/drivers/kernelsu/ksuinit.c
deleted file mode 100644
index 75cfced0268d..000000000000
--- a/drivers/kernelsu/ksuinit.c
+++ /dev/null
@@ -1,140 +0,0 @@
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/printk.h>
-#include <linux/kobject.h>
-#include <linux/module.h>
-#include <generated/utsrelease.h>
-#include <generated/compile.h>
-#include <linux/version.h> /* LINUX_VERSION_CODE, KERNEL_VERSION macros */
-
-#include "allowlist.h"
-#include "arch.h"
-#include "feature.h"
-#include "klog.h" // IWYU pragma: keep
-#include "ksu.h"
-#include "throne_tracker.h"
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#include "syscall_handler.h"
-#endif
-#ifdef CONFIG_KSU_MANUAL_HOOK
-#include "setuid_hook.h"
-#include "sucompat.h"
-#endif
-#include "ksud.h"
-#include "supercalls.h"
-#include "ksu.h"
-#include "file_wrapper.h"
-
-struct cred *ksu_cred;
-
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(6, 8, 0) &&                           \
-     defined(CONFIG_KSU_MANUAL_HOOK))
-extern void __init ksu_lsm_hook_init(void);
-#endif
-
-int __init kernelsu_init(void)
-{
-#ifdef CONFIG_KSU_DEBUG
-	pr_alert(
-		"*************************************************************");
-	pr_alert(
-		"**     NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE    **");
-	pr_alert(
-		"**                                                         **");
-	pr_alert(
-		"**         You are running KernelSU in DEBUG mode          **");
-	pr_alert(
-		"**                                                         **");
-	pr_alert(
-		"**     NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE    **");
-	pr_alert(
-		"*************************************************************");
-#endif
-
-	ksu_cred = prepare_creds();
-	if (!ksu_cred) {
-		pr_err("prepare cred failed!\n");
-	}
-
-	ksu_feature_init();
-
-	ksu_supercalls_init();
-
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	ksu_syscall_hook_manager_init();
-#endif
-#ifdef CONFIG_KSU_MANUAL_HOOK
-#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 8, 0)
-	ksu_lsm_hook_init();
-#endif
-	ksu_setuid_hook_init();
-	ksu_sucompat_init();
-#endif
-
-	ksu_allowlist_init();
-
-	ksu_throne_tracker_init();
-
-	ksu_ksud_init();
-
-	ksu_file_wrapper_init();
-
-#ifdef MODULE
-#ifndef CONFIG_KSU_DEBUG
-	kobject_del(&THIS_MODULE->mkobj.kobj);
-#endif
-#endif
-	return 0;
-}
-
-#if defined(CONFIG_KSU_SYSCALL_HOOK) ||                                        \
-	(LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) &&                      \
-	 defined(CONFIG_KSU_MANUAL_HOOK))
-extern void ksu_observer_exit(void);
-#endif
-
-void kernelsu_exit(void)
-{
-	ksu_allowlist_exit();
-
-	ksu_throne_tracker_exit();
-
-#if defined(CONFIG_KSU_SYSCALL_HOOK) ||                                        \
-	(LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) &&                      \
-	 defined(CONFIG_KSU_MANUAL_HOOK))
-	ksu_observer_exit();
-#endif
-
-	ksu_ksud_exit();
-
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	ksu_syscall_hook_manager_exit();
-#endif
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	ksu_sucompat_exit();
-	ksu_setuid_hook_exit();
-#endif
-
-	ksu_supercalls_exit();
-
-	ksu_feature_exit();
-
-	if (ksu_cred) {
-		put_cred(ksu_cred);
-	}
-}
-
-module_init(kernelsu_init);
-module_exit(kernelsu_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("weishu");
-MODULE_DESCRIPTION("Android KernelSU");
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 13, 0)
-MODULE_IMPORT_NS("VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver");
-#else
-MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver);
-#endif
-#endif
diff --git a/drivers/kernelsu/linux_hashtable.h b/drivers/kernelsu/linux_hashtable.h
new file mode 100644
index 000000000000..3d4516102bee
--- /dev/null
+++ b/drivers/kernelsu/linux_hashtable.h
@@ -0,0 +1,243 @@
+/*
+ * Statically sized hash table implementation
+ * (C) 2012  Sasha Levin <levinsasha928@gmail.com>
+ */
+
+#ifndef _LINUX_HASHTABLE_H
+#define _LINUX_HASHTABLE_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/hash.h>
+#include <linux/rculist.h>
+
+#define DEFINE_HASHTABLE(name, bits)						\
+	struct hlist_head name[1 << (bits)] =					\
+			{ [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }
+
+#define DECLARE_HASHTABLE(name, bits)                                   	\
+	struct hlist_head name[1 << (bits)]
+
+#define HASH_SIZE(name) (ARRAY_SIZE(name))
+#define HASH_BITS(name) ilog2(HASH_SIZE(name))
+
+/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */
+#define hash_min(val, bits)							\
+	(sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits))
+
+/* Reset each of the @sz buckets of @ht to an empty list head. */
+static inline void __hash_init(struct hlist_head *ht, unsigned int sz)
+{
+	/* buckets are independent, so walk them from the last down to the first */
+	while (sz--)
+		INIT_HLIST_HEAD(&ht[sz]);
+}
+
+/**
+ * hash_init - initialize a hash table
+ * @hashtable: hashtable to be initialized
+ *
+ * Calculates the size of the hashtable from the given array, then initializes
+ * every bucket through __hash_init().
+ *
+ * This has to be a macro since HASH_BITS() will not work on pointers since
+ * it calculates the size during preprocessing.
+ */
+#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable))
+
+/**
+ * hash_add - add an object to a hashtable
+ * @hashtable: hashtable to add to
+ * @node: the &struct hlist_node of the object to be added
+ * @key: the key of the object to be added
+ */
+#define hash_add(hashtable, node, key)						\
+	hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])
+
+/**
+ * hash_add_rcu - add an object to a rcu enabled hashtable
+ * @hashtable: hashtable to add to
+ * @node: the &struct hlist_node of the object to be added
+ * @key: the key of the object to be added
+ */
+#define hash_add_rcu(hashtable, node, key)					\
+	hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])
+
+/**
+ * hash_hashed - check whether an object is in any hashtable
+ * @node: the &struct hlist_node of the object to be checked
+ */
+static inline bool hash_hashed(struct hlist_node *node)
+{
+	return !hlist_unhashed(node);
+}
+
+/* Report whether every one of the @sz buckets of @ht is empty. */
+static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz)
+{
+	unsigned int idx = 0;
+
+	while (idx < sz && hlist_empty(&ht[idx]))
+		idx++;
+	/* idx only reaches sz when no bucket held an entry */
+	return idx == sz;
+}
+
+/**
+ * hash_empty - check whether a hashtable is empty
+ * @hashtable: hashtable to check
+ *
+ * This has to be a macro since HASH_BITS() will not work on pointers since
+ * it calculates the size during preprocessing.
+ */
+#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable))
+
+/**
+ * hash_del - remove an object from a hashtable
+ * @node: &struct hlist_node of the object to remove
+ */
+static inline void hash_del(struct hlist_node *node)
+{
+	hlist_del_init(node);
+}
+
+/**
+ * hash_del_rcu - remove an object from a rcu enabled hashtable
+ * @node: &struct hlist_node of the object to remove
+ */
+static inline void hash_del_rcu(struct hlist_node *node)
+{
+	hlist_del_init_rcu(node);
+}
+
+#undef hlist_entry_safe
+#undef hlist_for_each_entry_rcu
+#undef hlist_for_each_entry
+#undef hlist_for_each_entry_safe
+/* Evaluate @ptr exactly once: a second, racy fetch could see a different value than the NULL check did. */
+#define hlist_entry_safe(ptr, type, member) ({ typeof(ptr) ____ptr = (ptr); \
+	____ptr ? hlist_entry(____ptr, type, member) : NULL; })
+
+/**
+ * hlist_for_each_entry_rcu - iterate over rcu list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+#define hlist_for_each_entry_rcu(pos, head, member)			\
+	for (pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\
+			typeof(*(pos)), member);			\
+		pos;							\
+		pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
+			&(pos)->member)), typeof(*(pos)), member))
+/**
+ * hlist_for_each_entry	- iterate over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(pos, head, member)				\
+	for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
+	     pos;							\
+	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another &struct hlist_node to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(pos, n, head, member) 		\
+	for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\
+	     pos && ({ n = pos->member.next; 1; });			\
+	     pos = hlist_entry_safe(n, typeof(*pos), member))
+
+#undef hash_for_each
+#undef hash_for_each_rcu
+#undef hash_for_each_safe
+#undef hash_for_each_possible
+#undef hash_for_each_possible_rcu
+
+/**
+ * hash_for_each - iterate over a hashtable
+ * @name: hashtable to iterate
+ * @bkt: integer to use as bucket loop cursor
+ * @obj: the type * to use as a loop cursor for each entry
+ * @member: the name of the hlist_node within the struct
+ */
+#define hash_for_each(name, bkt, obj, member)				\
+	for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
+			(bkt)++)\
+		hlist_for_each_entry(obj, &name[bkt], member)
+
+/**
+ * hash_for_each_rcu - iterate over a rcu enabled hashtable
+ * @name: hashtable to iterate
+ * @bkt: integer to use as bucket loop cursor
+ * @obj: the type * to use as a loop cursor for each entry
+ * @member: the name of the hlist_node within the struct
+ */
+#define hash_for_each_rcu(name, bkt, obj, member)			\
+	for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
+			(bkt)++)\
+		hlist_for_each_entry_rcu(obj, &name[bkt], member)
+
+/**
+ * hash_for_each_safe - iterate over a hashtable safe against removal of
+ * hash entry
+ * @name: hashtable to iterate
+ * @bkt: integer to use as bucket loop cursor
+ * @tmp: a &struct hlist_node used for temporary storage
+ * @obj: the type * to use as a loop cursor for each entry
+ * @member: the name of the hlist_node within the struct
+ */
+#define hash_for_each_safe(name, bkt, tmp, obj, member)			\
+	for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
+			(bkt)++)\
+		hlist_for_each_entry_safe(obj, tmp, &name[bkt], member)
+
+/**
+ * hash_for_each_possible - iterate over all possible objects hashing to the
+ * same bucket
+ * @name: hashtable to iterate
+ * @obj: the type * to use as a loop cursor for each entry
+ * @member: the name of the hlist_node within the struct
+ * @key: the key of the objects to iterate over
+ */
+#define hash_for_each_possible(name, obj, member, key)			\
+	hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member)
+
+/**
+ * hash_for_each_possible_rcu - iterate over all possible objects hashing to the
+ * same bucket in an rcu enabled hashtable
+ * (expands to hlist_for_each_entry_rcu() on that bucket)
+ * @name: hashtable to iterate
+ * @obj: the type * to use as a loop cursor for each entry
+ * @member: the name of the hlist_node within the struct
+ * @key: the key of the objects to iterate over
+ */
+#define hash_for_each_possible_rcu(name, obj, member, key)		\
+	hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\
+		member)
+
+/**
+ * hash_for_each_possible_safe - iterate over all possible objects hashing to the
+ * same bucket safe against removals
+ * @name: hashtable to iterate
+ * @obj: the type * to use as a loop cursor for each entry
+ * @tmp: a &struct hlist_node used for temporary storage
+ * @member: the name of the hlist_node within the struct
+ * @key: the key of the objects to iterate over
+ */
+#define hash_for_each_possible_safe(name, obj, tmp, member, key)	\
+	hlist_for_each_entry_safe(obj, tmp,\
+		&name[hash_min(key, HASH_BITS(name))], member)
+
+
+#endif
diff --git a/drivers/kernelsu/lsm_hook.c b/drivers/kernelsu/lsm_hook.c
deleted file mode 100644
index e1c0a76ec5ba..000000000000
--- a/drivers/kernelsu/lsm_hook.c
+++ /dev/null
@@ -1,117 +0,0 @@
-#include <linux/lsm_hooks.h>
-#include <linux/uidgid.h>
-#include <linux/version.h>
-#include <linux/dcache.h>
-#include <linux/err.h>
-#include <linux/uidgid.h>
-#include <linux/string.h>
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) ||                           \
-	defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND)
-static int ksu_key_permission(key_ref_t key_ref, const struct cred *cred,
-			      unsigned perm)
-{
-	if (init_session_keyring != NULL) {
-		return 0;
-	}
-	if (strcmp(current->comm, "init")) {
-		// we are only interested in `init` process
-		return 0;
-	}
-	init_session_keyring = cred->session_keyring;
-	pr_info("kernel_compat: got init_session_keyring\n");
-	return 0;
-}
-#endif
-
-static int ksu_inode_rename(struct inode *old_inode, struct dentry *old_dentry,
-			    struct inode *new_inode, struct dentry *new_dentry)
-{
-	// skip kernel threads
-	if (!current->mm) {
-		return 0;
-	}
-
-	// skip non system uid
-	if (current_uid().val != 1000) {
-		return 0;
-	}
-
-	if (!old_dentry || !new_dentry) {
-		return 0;
-	}
-
-	// /data/system/packages.list.tmp -> /data/system/packages.list
-	if (strcmp(new_dentry->d_iname, "packages.list")) {
-		return 0;
-	}
-
-	char path[128];
-	char *buf = dentry_path_raw(new_dentry, path, sizeof(path));
-	if (IS_ERR(buf)) {
-		pr_err("dentry_path_raw failed.\n");
-		return 0;
-	}
-
-	if (!strstr(buf, "/system/packages.list")) {
-		return 0;
-	}
-
-	pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname,
-		new_dentry->d_iname, buf);
-
-	/*
-	 * RKSU note:
-	 * track_throne(true) only occurs on on_boot_completed event.
-	 * When using this LSM, we must handle it here, else it returns
-	 * ENOENT (-2).
-	 */
-	static bool did = false;
-	if (ksu_boot_completed && !did) {
-		did = true;
-		track_throne(true);
-		return 0;
-	}
-
-	track_throne(false);
-
-	return 0;
-}
-
-static int ksu_task_fix_setuid(struct cred *new, const struct cred *old,
-			       int flags)
-{
-	if (!new || !old)
-		return 0;
-
-	return ksu_handle_setuid_common(new->uid.val, old->uid.val, new->euid.val);
-}
-
-static struct security_hook_list ksu_hooks[] = {
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) ||                           \
-	defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND)
-	LSM_HOOK_INIT(key_permission, ksu_key_permission),
-#endif
-	LSM_HOOK_INIT(inode_rename, ksu_inode_rename),
-	LSM_HOOK_INIT(task_fix_setuid, ksu_task_fix_setuid)
-};
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0)
-static const struct lsm_id ksu_lsmid = {
-	.name = "ksu",
-	.id = 912,
-};
-#endif
-
-void __init ksu_lsm_hook_init(void)
-{
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0)
-	security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), &ksu_lsmid);
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-	security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), "ksu");
-#else
-	// https://elixir.bootlin.com/linux/v4.10.17/source/include/linux/lsm_hooks.h#L1892
-	security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks));
-#endif
-	pr_info("LSM hooks initialized.\n");
-}
diff --git a/drivers/kernelsu/manager.h b/drivers/kernelsu/manager.h
deleted file mode 100644
index a22ac52ec1f2..000000000000
--- a/drivers/kernelsu/manager.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef __KSU_H_KSU_MANAGER
-#define __KSU_H_KSU_MANAGER
-
-#include <linux/cred.h>
-#include <linux/types.h>
-#include "allowlist.h"
-
-#define KSU_INVALID_APPID -1
-
-extern uid_t ksu_manager_appid; // DO NOT DIRECT USE
-
-static inline bool ksu_is_manager_appid_valid(void)
-{
-	return ksu_manager_appid != KSU_INVALID_APPID;
-}
-
-static inline bool is_manager(void)
-{
-	return unlikely(ksu_manager_appid ==
-			current_uid().val % PER_USER_RANGE);
-}
-
-static inline uid_t ksu_get_manager_appid(void)
-{
-	return ksu_manager_appid;
-}
-
-static inline void ksu_set_manager_appid(uid_t appid)
-{
-	ksu_manager_appid = appid;
-}
-
-static inline void ksu_invalidate_manager_uid(void)
-{
-	ksu_manager_appid = KSU_INVALID_APPID;
-}
-
-#endif
diff --git a/drivers/kernelsu/apk_sign.c b/drivers/kernelsu/manager/apk_sign.c
similarity index 73%
rename from drivers/kernelsu/apk_sign.c
rename to drivers/kernelsu/manager/apk_sign.c
index 4c6c63d0d886..b5965842b5e2 100644
--- a/drivers/kernelsu/apk_sign.c
+++ b/drivers/kernelsu/manager/apk_sign.c
@@ -1,34 +1,8 @@
-#include <linux/err.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/version.h>
-#ifdef CONFIG_KSU_DEBUG
-#include <linux/moduleparam.h>
-#endif
-#include <crypto/hash.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
-#include <crypto/sha2.h>
-#else
-#include <crypto/sha.h>
-#endif
-
-#include "apk_sign.h"
-#include "app_profile.h"
-#include "klog.h" // IWYU pragma: keep
-#include "kernel_compat.h"
-#include "manager_sign.h"
-
 struct sdesc {
 	struct shash_desc shash;
 	char ctx[];
 };
 
-static apk_sign_key_t apk_sign_keys[] = {
-	{ EXPECTED_SIZE_RSUNTK, EXPECTED_HASH_RSUNTK }, // RKSU
-};
-
 static struct sdesc *init_sdesc(struct crypto_shash *alg)
 {
 	struct sdesc *sdesc;
@@ -76,42 +50,39 @@ static int ksu_sha256(const unsigned char *data, unsigned int datalen,
 	return ret;
 }
 
-static bool check_block(struct file *fp, u32 *size4, loff_t *pos, u32 *offset)
+static bool check_block(struct file *fp, u32 *size4, loff_t *pos, u32 *offset,
+			unsigned expected_size, const char *expected_sha256)
 {
-	int i;
-	apk_sign_key_t sign_key;
-
-	ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer-sequence length
-	ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer length
-	ksu_kernel_read_compat(fp, size4, 0x4, pos); // signed data length
+	kernel_read(fp, size4, 0x4, pos); // signer-sequence length
+	kernel_read(fp, size4, 0x4, pos); // signer length
+	kernel_read(fp, size4, 0x4, pos); // signed data length
 
 	*offset += 0x4 * 3;
 
-	ksu_kernel_read_compat(fp, size4, 0x4, pos); // digests-sequence length
+	kernel_read(fp, size4, 0x4, pos); // digests-sequence length
 
 	*pos += *size4;
 	*offset += 0x4 + *size4;
 
-	ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificates length
-	ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificate length
+	kernel_read(fp, size4, 0x4, pos); // certificates length
+	kernel_read(fp, size4, 0x4, pos); // certificate length
 	*offset += 0x4 * 2;
 
-	for (i = 0; i < ARRAY_SIZE(apk_sign_keys); i++) {
-		sign_key = apk_sign_keys[i];
-
-		if (*size4 != sign_key.size)
-			continue;
+	if (*size4 == expected_size) {
 		*offset += *size4;
 
 #define CERT_MAX_LENGTH 1024
-		char cert[CERT_MAX_LENGTH];
+		char *cert __attribute__((__cleanup__(ksu_kfree_byref))) = kzalloc(CERT_MAX_LENGTH, GFP_KERNEL);
+		if (!cert)
+			return false;
+
 		if (*size4 > CERT_MAX_LENGTH) {
 			pr_info("cert length overlimit\n");
 			return false;
 		}
-		ksu_kernel_read_compat(fp, cert, *size4, pos);
+		kernel_read(fp, cert, *size4, pos);
 		unsigned char digest[SHA256_DIGEST_SIZE];
-		if (ksu_sha256(cert, *size4, digest) < 0) {
+		if (ksu_sha256(cert, *size4, digest) < 0 ) {
 			pr_info("sha256 error\n");
 			return false;
 		}
@@ -121,8 +92,8 @@ static bool check_block(struct file *fp, u32 *size4, loff_t *pos, u32 *offset)
 
 		bin2hex(hash_str, digest, SHA256_DIGEST_SIZE);
 		pr_info("sha256: %s, expected: %s\n", hash_str,
-			sign_key.sha256);
-		if (strcmp(sign_key.sha256, hash_str) == 0) {
+			expected_sha256);
+		if (strcmp(expected_sha256, hash_str) == 0) {
 			return true;
 		}
 	}
@@ -151,7 +122,7 @@ static bool has_v1_signature_file(struct file *fp)
 
 	loff_t pos = 0;
 
-	while (ksu_kernel_read_compat(fp, &header,
+	while (kernel_read(fp, &header,
 				      sizeof(struct zip_entry_header), &pos) ==
 	       sizeof(struct zip_entry_header)) {
 		if (header.signature != 0x04034b50) {
@@ -161,7 +132,7 @@ static bool has_v1_signature_file(struct file *fp)
 		// Read the entry file name
 		if (header.file_name_length == sizeof(MANIFEST) - 1) {
 			char fileName[sizeof(MANIFEST)];
-			ksu_kernel_read_compat(fp, fileName,
+			kernel_read(fp, fileName,
 					       header.file_name_length, &pos);
 			fileName[header.file_name_length] = '\0';
 
@@ -182,7 +153,9 @@ static bool has_v1_signature_file(struct file *fp)
 	return false;
 }
 
-static __always_inline bool check_v2_signature(char *path)
+static __always_inline bool check_v2_signature(char *path,
+					       unsigned expected_size,
+					       const char *expected_sha256)
 {
 	unsigned char buffer[0x11] = { 0 };
 	u32 size4;
@@ -196,9 +169,27 @@ static __always_inline bool check_v2_signature(char *path)
 	bool v3_1_signing_exist = false;
 
 	int i;
-	struct file *fp = ksu_filp_open_compat(path, O_RDONLY, 0);
+
+	struct path kpath;
+	if (kern_path(path, 0, &kpath))
+		return false;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) 
+	if (inode_is_locked(kpath.dentry->d_inode))
+#else
+	if (mutex_is_locked(&kpath.dentry->d_inode->i_mutex))
+#endif
+	{
+		pr_info("%s: inode is locked for %s\n", __func__, path);
+		path_put(&kpath);
+		return false;
+	}
+
+	path_put(&kpath);
+
+	struct file *fp = filp_open(path, O_RDONLY, 0);
 	if (IS_ERR(fp)) {
-		pr_err("open %s error.\n", path);
+		// pr_err("open %s error.\n", path);
 		return false;
 	}
 
@@ -208,11 +199,11 @@ static __always_inline bool check_v2_signature(char *path)
 	// https://en.wikipedia.org/wiki/Zip_(file_format)#End_of_central_directory_record_(EOCD)
 	for (i = 0;; ++i) {
 		unsigned short n;
-		pos = generic_file_llseek(fp, -i - 2, SEEK_END);
-		ksu_kernel_read_compat(fp, &n, 2, &pos);
+		pos = vfs_llseek(fp, -i - 2, SEEK_END);
+		kernel_read(fp, &n, 2, &pos);
 		if (n == i) {
 			pos -= 22;
-			ksu_kernel_read_compat(fp, &size4, 4, &pos);
+			kernel_read(fp, &size4, 4, &pos);
 			if ((size4 ^ 0xcafebabeu) == 0xccfbf1eeu) {
 				break;
 			}
@@ -225,17 +216,17 @@ static __always_inline bool check_v2_signature(char *path)
 
 	pos += 12;
 	// offset
-	ksu_kernel_read_compat(fp, &size4, 0x4, &pos);
+	kernel_read(fp, &size4, 0x4, &pos);
 	pos = size4 - 0x18;
 
-	ksu_kernel_read_compat(fp, &size8, 0x8, &pos);
-	ksu_kernel_read_compat(fp, buffer, 0x10, &pos);
-	if (strcmp((char *)buffer, "APK Sig Block 42")) {
+	kernel_read(fp, &size8, 0x8, &pos);
+	kernel_read(fp, buffer, 0x10, &pos);
+	if (memcmp(buffer, "APK Sig Block 42", 16)) {
 		goto clean;
 	}
 
 	pos = size4 - (size8 + 0x8);
-	ksu_kernel_read_compat(fp, &size_of_block, 0x8, &pos);
+	kernel_read(fp, &size_of_block, 0x8, &pos);
 	if (size_of_block != size8) {
 		goto clean;
 	}
@@ -244,17 +235,17 @@ static __always_inline bool check_v2_signature(char *path)
 	while (loop_count++ < 10) {
 		uint32_t id;
 		uint32_t offset;
-		ksu_kernel_read_compat(fp, &size8, 0x8,
-				       &pos); // sequence length
+		kernel_read(fp, &size8, 0x8, &pos); // sequence length
 		if (size8 == size_of_block) {
 			break;
 		}
-		ksu_kernel_read_compat(fp, &id, 0x4, &pos); // id
+		kernel_read(fp, &id, 0x4, &pos); // id
 		offset = 4;
 		if (id == 0x7109871au) {
 			v2_signing_blocks++;
 			v2_signing_valid =
-				check_block(fp, &size4, &pos, &offset);
+				check_block(fp, &size4, &pos, &offset,
+					    expected_size, expected_sha256);
 		} else if (id == 0xf05368c0u) {
 			// http://aospxref.com/android-14.0.0_r2/xref/frameworks/base/core/java/android/util/apk/ApkSignatureSchemeV3Verifier.java#73
 			v3_signing_exist = true;
@@ -302,8 +293,6 @@ static __always_inline bool check_v2_signature(char *path)
 
 int ksu_debug_manager_appid = -1;
 
-#include "manager.h"
-
 static int set_expected_size(const char *val, const struct kernel_param *kp)
 {
 	int rv = param_set_uint(val, kp);
@@ -318,7 +307,7 @@ static struct kernel_param_ops expected_size_ops = {
 };
 
 module_param_cb(ksu_debug_manager_appid, &expected_size_ops,
-		&ksu_debug_manager_appid, S_IRUSR | S_IWUSR);
+	&ksu_debug_manager_appid, S_IRUSR | S_IWUSR);
 
 #endif
 
@@ -363,5 +352,7 @@ int get_pkg_from_apk_path(char *pkg, const char *path)
 
 bool is_manager_apk(char *path)
 {
-	return check_v2_signature(path);
+	// accept either known signing certificate: dummy.keystore first, then kernelsu official
+	return check_v2_signature(path, 0x363, "4359c171f32543394cbc23ef908c4bb94cad7c8087002ba164c8230948c21549") ||
+	       check_v2_signature(path, 0x033b, "c371061b19d8c7d7d6133c6a9bafe198fa944e50c1b31c9d8daa8d7f1fc2d2d6");
 }
diff --git a/drivers/kernelsu/apk_sign.h b/drivers/kernelsu/manager/apk_sign.h
similarity index 85%
rename from drivers/kernelsu/apk_sign.h
rename to drivers/kernelsu/manager/apk_sign.h
index b4d4ce3756c4..65b3a1e51cdd 100644
--- a/drivers/kernelsu/apk_sign.h
+++ b/drivers/kernelsu/manager/apk_sign.h
@@ -1,10 +1,7 @@
 #ifndef __KSU_H_APK_V2_SIGN
 #define __KSU_H_APK_V2_SIGN
 
-#include <linux/types.h>
-
 bool is_manager_apk(char *path);
-
 int get_pkg_from_apk_path(char *pkg, const char *path);
 
 #endif
diff --git a/drivers/kernelsu/manager/manager_identity.h b/drivers/kernelsu/manager/manager_identity.h
new file mode 100644
index 000000000000..0891a6a6f571
--- /dev/null
+++ b/drivers/kernelsu/manager/manager_identity.h
@@ -0,0 +1,49 @@
+#ifndef __KSU_H_MANAGER_IDENTITY
+#define __KSU_H_MANAGER_IDENTITY
+
+// #include "allowlist.h"
+
+/* Sentinel meaning "no manager appid recorded". */
+#define KSU_INVALID_APPID (-1)
+/* Divisor used to reduce a uid to an appid before comparing. */
+#define KSU_PER_USER_RANGE 100000
+
+extern uid_t ksu_manager_appid; // DO NOT DIRECT USE
+
+/* True when ksu_manager_appid differs from KSU_INVALID_APPID. */
+static inline bool ksu_is_manager_appid_valid(void)
+{
+	return ksu_manager_appid != KSU_INVALID_APPID;
+}
+
+/* True when the current task's uid modulo KSU_PER_USER_RANGE is the manager appid. */
+static inline bool is_manager(void)
+{
+	return unlikely(ksu_manager_appid == current_uid().val % KSU_PER_USER_RANGE);
+}
+
+/* Same check as is_manager(), but for an explicitly supplied @uid. */
+static inline bool is_uid_manager(uid_t uid)
+{
+	return unlikely(ksu_manager_appid == uid % KSU_PER_USER_RANGE);
+}
+
+/* Accessor for ksu_manager_appid. */
+static inline uid_t ksu_get_manager_appid(void)
+{
+	return ksu_manager_appid;
+}
+
+/* Record @appid as the manager appid. */
+static inline void ksu_set_manager_appid(uid_t appid)
+{
+	ksu_manager_appid = appid;
+}
+
+/* Forget the manager by storing KSU_INVALID_APPID. */
+static inline void ksu_invalidate_manager_uid(void)
+{
+	ksu_manager_appid = KSU_INVALID_APPID;
+}
+
+#endif
diff --git a/drivers/kernelsu/manager/pkg_observer.c b/drivers/kernelsu/manager/pkg_observer.c
new file mode 100644
index 000000000000..3a913a6b5ed0
--- /dev/null
+++ b/drivers/kernelsu/manager/pkg_observer.c
@@ -0,0 +1,96 @@
+/**
+ * ! this hooks inode_rename, NOT fsnotify
+ * we have access to LSM and the overhead is way lower.
+ * we watch one file and check whether it is on the same parent inode:
+ * a few int compares and a ptr compare, that's it.
+ * as for the throne tracker, we make it async by hand
+ * by offloading it to a kthread.
+ */
+
+static uintptr_t system_dir_inode_ptr = 0; // cached /data/system inode address; 0 = not cached (integer, so 0 rather than NULL)
+
+__attribute__((cold))
+static noinline void ksu_grab_data_system_inode(void) // look up /data/system and cache its inode address
+{
+	struct path path;
+	int ret = kern_path("/data/system", LOOKUP_FOLLOW, &path);
+	if (ret) {
+		pr_info("renameat: /data/system not ready? ret: (%d)\n", ret);
+		return; // cache stays empty; ksu_rename_observer() then takes its slow path
+	}
+	// only the address is stored, for pointer comparison; the path reference is dropped below
+	system_dir_inode_ptr = (uintptr_t)d_inode(path.dentry);
+	pr_info("renameat: cached /data/system d_inode: 0x%lx\n", system_dir_inode_ptr);
+	path_put(&path);
+}
+
+__attribute__((cold))
+static noinline void ksu_rename_observer_slow(struct dentry *old_dentry, struct dentry *new_dentry)
+{
+	system_dir_inode_ptr = 0; // reset cached inode (integer cache, so 0 rather than NULL)
+	// slow path: build the path string of new_dentry and match it by name instead of by inode
+	char path[128] = { 0 };
+	char *buf = dentry_path_raw(new_dentry, path, sizeof(path) - 1);
+	if (IS_ERR(buf)) {
+		pr_err("dentry_path_raw failed.\n");
+		return;
+	}
+	// NOTE(review): presumably the path is relative to the filesystem root, hence no /data prefix - confirm
+	if (!strstr(buf, "/system/packages.list"))
+		return;
+
+	pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname, new_dentry->d_iname, buf);
+	track_throne(false);
+	return;
+}
+
+static inline void ksu_rename_observer(struct dentry *old_dentry, struct dentry *new_dentry)
+{
+	// rename hook: calls track_throne() when uid 1000 renames something to "packages.list"; skip kernel threads first
+	if (!current->mm)
+		return;
+
+	if (!old_dentry || !new_dentry)
+		return;
+
+	// only uid 1000 is of interest; skip every other uid
+	if (likely(current_uid().val != 1000))
+		return;
+
+	// cheap length check before the memcmp (d_name.len, see HASH_LEN_DECLARE in dcache.h)
+	if (likely(new_dentry->d_name.len != sizeof("packages.list") - 1  ))
+		return;
+
+	// /data/system/packages.list.tmp -> /data/system/packages.list
+	if (likely(!!__builtin_memcmp(new_dentry->d_iname, "packages.list", sizeof("packages.list") - 1 )))
+		return;
+
+	// cache dir inode, we try to go for fast path, lockless
+	if (unlikely(!system_dir_inode_ptr))
+		ksu_grab_data_system_inode();
+
+	if (unlikely(!system_dir_inode_ptr))
+		goto slow_path;
+
+	if (unlikely(!new_dentry->d_parent || !new_dentry->d_parent->d_inode))
+		goto slow_path;
+
+	/*
+	 * fallback to slow path, but this should NOT change unless someone overlays /data/system
+	 * but then again maybe https://github.com/tiann/KernelSU/pull/2633#discussion_r2141740346
+	 * but /data is casefolded, overlaying is really really unlikely
+	 * we self heal this thing, so on the next run, it will try to grab the d_inode again
+	 * alternatively we can use packages.list inode change as trigger too, however,
+	 * we need to save last state. more writes.
+	 */
+	if (unlikely((uintptr_t)new_dentry->d_parent->d_inode != system_dir_inode_ptr))
+		goto slow_path;
+
+	pr_info("renameat: %s -> %s, /data/system d_inode: 0x%lx \n", old_dentry->d_iname, new_dentry->d_iname, system_dir_inode_ptr);
+	track_throne(false);
+	return;
+
+slow_path:
+	ksu_rename_observer_slow(old_dentry, new_dentry);
+	return;
+}
diff --git a/drivers/kernelsu/throne_tracker.c b/drivers/kernelsu/manager/throne_tracker.c
similarity index 50%
rename from drivers/kernelsu/throne_tracker.c
rename to drivers/kernelsu/manager/throne_tracker.c
index a129fa9f4935..f61bdf3a36b1 100644
--- a/drivers/kernelsu/throne_tracker.c
+++ b/drivers/kernelsu/manager/throne_tracker.c
@@ -1,27 +1,6 @@
-#include <linux/err.h>
-#include <linux/fs.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/version.h>
-
-#include "allowlist.h"
-#include "apk_sign.h"
-#include "klog.h" // IWYU pragma: keep
-#include "manager.h"
-#include "kernel_compat.h"
-#include "throne_tracker.h"
-
 uid_t ksu_manager_appid = KSU_INVALID_APPID;
 
-#if defined(CONFIG_KSU_MANUAL_HOOK)
-#define SYSTEM_PACKAGES_LIST_PATH "/data/system/packages.list.tmp"
-#elif defined(CONFIG_KSU_SYSCALL_HOOK) ||                                        \
-	(LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) &&                      \
-	 defined(CONFIG_KSU_MANUAL_HOOK))
 #define SYSTEM_PACKAGES_LIST_PATH "/data/system/packages.list"
-#endif
 
 struct uid_data {
 	struct list_head list;
@@ -29,7 +8,7 @@ struct uid_data {
 	char package[KSU_MAX_PACKAGE_NAME];
 };
 
-static void crown_manager(const char *apk, struct list_head *uid_data)
+static __always_inline void crown_manager(const char *apk, struct list_head *uid_data)
 {
 	char pkg[KSU_MAX_PACKAGE_NAME];
 	if (get_pkg_from_apk_path(pkg, apk) < 0) {
@@ -65,8 +44,6 @@ struct apk_path_hash {
 	struct list_head list;
 };
 
-static struct list_head apk_path_hash_list;
-
 struct my_dir_context {
 	struct dir_context ctx;
 	struct list_head *data_path_list;
@@ -76,9 +53,7 @@ struct my_dir_context {
 	int *stop;
 };
 // https://docs.kernel.org/filesystems/porting.html
-// filldir_t (readdir callbacks) calling conventions have changed.
-// Instead of returning 0 or -E... it returns bool now. false means "no more" (as -E... used to) and true - "keep going" (as 0 in old calling conventions).
-// Rationale: callers never looked at specific -E... values anyway. -> iterate_shared() instances require no changes at all, all filldir_t ones in the tree converted.
+// filldir_t (readdir callbacks) calling conventions have changed. Instead of returning 0 or -E... it returns bool now. false means "no more" (as -E... used to) and true - "keep going" (as 0 in old calling conventions). Rationale: callers never looked at specific -E... values anyway. -> iterate_shared() instances require no changes at all, all filldir_t ones in the tree converted.
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0)
 #define FILLDIR_RETURN_TYPE bool
 #define FILLDIR_ACTOR_CONTINUE true
@@ -88,25 +63,28 @@ struct my_dir_context {
 #define FILLDIR_ACTOR_CONTINUE 0
 #define FILLDIR_ACTOR_STOP -EINVAL
 #endif
-extern bool is_manager_apk(char *path);
 
-static inline void print_iter(bool is_manager, char *path)
-{
-#ifdef CONFIG_KSU_DEBUG
-	pr_info("Found new base.apk at path: %s, is_manager: %d\n", path,
-		is_manager);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0)
+#define MY_ACTOR_CTX_ARG struct dir_context *ctx
 #else
-	if (is_manager)
-		pr_info("Found KernelSU base.apk at %s\n", path);
+#define MY_ACTOR_CTX_ARG void *ctx_void
 #endif
-}
 
-FILLDIR_RETURN_TYPE my_actor(struct dir_context *ctx, const char *name,
+extern bool is_manager_apk(char *path);
+FILLDIR_RETURN_TYPE my_actor(MY_ACTOR_CTX_ARG, const char *name,
 			     int namelen, loff_t off, u64 ino,
 			     unsigned int d_type)
 {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0)
+	// then pull it out of the void
+	struct dir_context *ctx = (struct dir_context *)ctx_void;
+#endif
 	struct my_dir_context *my_ctx =
 		container_of(ctx, struct my_dir_context, ctx);
+
+	// we put the apk path we collected here
+	char *candidate_path = (char *)my_ctx->private_data;
+
 	char dirpath[DATA_PATH_LEN];
 
 	if (!my_ctx) {
@@ -136,8 +114,7 @@ FILLDIR_RETURN_TYPE my_actor(struct dir_context *ctx, const char *name,
 
 	if (d_type == DT_DIR && my_ctx->depth > 0 &&
 	    (my_ctx->stop && !*my_ctx->stop)) {
-		struct data_path *data =
-			kzalloc(sizeof(struct data_path), GFP_ATOMIC);
+		struct data_path *data = kzalloc(sizeof(struct data_path), GFP_KERNEL);
 
 		if (!data) {
 			pr_err("Failed to allocate memory for %s\n", dirpath);
@@ -147,121 +124,110 @@ FILLDIR_RETURN_TYPE my_actor(struct dir_context *ctx, const char *name,
 		strscpy(data->dirpath, dirpath, DATA_PATH_LEN);
 		data->depth = my_ctx->depth - 1;
 		list_add_tail(&data->list, my_ctx->data_path_list);
-	} else {
-		if ((namelen == 8) &&
-		    (strncmp(name, "base.apk", namelen) == 0)) {
-			struct apk_path_hash *pos;
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0)
-			unsigned int hash =
-				full_name_hash(dirpath, strlen(dirpath));
-#else
-			unsigned int hash =
-				full_name_hash(NULL, dirpath, strlen(dirpath));
-#endif
-			list_for_each_entry (pos, &apk_path_hash_list, list) {
-				if (hash == pos->hash) {
-					pos->exists = true;
-					return FILLDIR_ACTOR_CONTINUE;
-				}
-			}
+		
+		return FILLDIR_ACTOR_CONTINUE;
+	}
 
-			bool is_manager = is_manager_apk(dirpath);
-			print_iter(is_manager, dirpath);
-			if (is_manager) {
-				crown_manager(dirpath, my_ctx->private_data);
-				*my_ctx->stop = 1;
-			}
-		}
+	// now put this on candidate_path
+	if (d_type == DT_REG && namelen == 8 && !memcmp(name, "base.apk", 8)) {
+		snprintf(candidate_path, DATA_PATH_LEN, "%s/%.*s", my_ctx->parent_dir, namelen, name);
 	}
 
 	return FILLDIR_ACTOR_CONTINUE;
 }
 
-static void search_manager(const char *path, int depth,
-			   struct list_head *uid_data)
+// compat: https://elixir.bootlin.com/linux/v3.9/source/include/linux/fs.h#L771
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0)
+#define ksu_get_magic(x) ((x)->f_inode->i_sb->s_magic)
+#else
+#define ksu_get_magic(x) ((x)->f_path.dentry->d_inode->i_sb->s_magic)
+#endif
+
+static noinline void search_manager(const char *path, int depth, struct list_head *uid_data)
 {
 	int i, stop = 0;
 	struct list_head data_path_list;
 	INIT_LIST_HEAD(&data_path_list);
-	INIT_LIST_HEAD(&apk_path_hash_list);
 	unsigned long data_app_magic = 0;
 
-	// Initialize APK cache list
-	struct apk_path_hash *pos, *n;
-	list_for_each_entry (pos, &apk_path_hash_list, list) {
-		pos->exists = false;
-	}
-
 	// First depth
-	struct data_path data;
-	strscpy(data.dirpath, path, DATA_PATH_LEN);
-	data.depth = depth;
-	list_add_tail(&data.list, &data_path_list);
+	struct data_path *data __attribute__((__cleanup__(ksu_kfree_byref))) = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return;
+
+	strscpy(data->dirpath, path, DATA_PATH_LEN);
+	data->depth = depth;
+	list_add_tail(&data->list, &data_path_list);
+
+	// we put the apk path we collected here
+	char candidate_path[DATA_PATH_LEN];
 
 	for (i = depth; i >= 0; i--) {
 		struct data_path *pos, *n;
 
-		list_for_each_entry_safe (pos, n, &data_path_list, list) {
+		list_for_each_entry_safe(pos, n, &data_path_list, list) {
 			struct my_dir_context ctx = { .ctx.actor = my_actor,
-						      .data_path_list =
-							      &data_path_list,
-						      .parent_dir =
-							      pos->dirpath,
-						      .private_data = uid_data,
+						      .data_path_list = &data_path_list,
+						      .parent_dir = pos->dirpath,
+						      .private_data = candidate_path,
 						      .depth = pos->depth,
 						      .stop = &stop };
-			struct file *file;
-
-			if (!stop) {
-				file = ksu_filp_open_compat(
-					pos->dirpath, O_RDONLY | O_NOFOLLOW, 0);
-				if (IS_ERR(file)) {
-					pr_err("Failed to open directory: %s, err: %ld\n",
-					       pos->dirpath, PTR_ERR(file));
-					goto skip_iterate;
-				}
 
-				// grab magic on first folder, which is /data/app
-				if (!data_app_magic) {
-					if (file->f_inode->i_sb->s_magic) {
-						data_app_magic =
-							file->f_inode->i_sb
-								->s_magic;
-						pr_info("%s: dir: %s got magic! 0x%lx\n",
-							__func__, pos->dirpath,
-							data_app_magic);
-					} else {
-						filp_close(file, NULL);
-						goto skip_iterate;
-					}
-				}
+			// make sure to clean buffer on every iteration
+			memset(candidate_path, 0, DATA_PATH_LEN);
+
+			if (stop)
+				goto skip_iterate;
 
-				if (file->f_inode->i_sb->s_magic !=
-				    data_app_magic) {
-					pr_info("%s: skip: %s magic: 0x%lx expected: 0x%lx\n",
-						__func__, pos->dirpath,
-						file->f_inode->i_sb->s_magic,
-						data_app_magic);
+			struct file *file = filp_open(pos->dirpath, O_RDONLY | O_NOFOLLOW | O_DIRECTORY, 0);
+			if (IS_ERR(file)) {
+				pr_err("Failed to open directory: %s, err: %ld\n", pos->dirpath, PTR_ERR(file));
+				goto skip_iterate;
+			}
+
+			// grab magic on first folder, which is /data/app
+			if (!data_app_magic) {
+				if (ksu_get_magic(file)) {
+					data_app_magic = ksu_get_magic(file);
+					pr_info("%s: dir: %s got magic! 0x%lx\n", __func__, pos->dirpath, data_app_magic);
+				} else {
 					filp_close(file, NULL);
 					goto skip_iterate;
 				}
-
-				iterate_dir(file, &ctx.ctx);
+			}
+				
+			if (ksu_get_magic(file) != data_app_magic) {
+				pr_info("%s: skip: %s magic: 0x%lx expected: 0x%lx\n", __func__, pos->dirpath, ksu_get_magic(file), data_app_magic);
 				filp_close(file, NULL);
+				goto skip_iterate;
 			}
-		skip_iterate:
+
+			iterate_dir(file, &ctx.ctx);
+			filp_close(file, NULL);
+
+			// ^ oh so thats the issue!
+			// we were calling is_manager_apk inside iterate_dir
+			// now we defer file opens after iterate_dir
+			// this way we dont open apks while inside that
+			if (!strstarts(candidate_path, "/data/ap") )
+				goto skip_iterate;
+
+			bool is_manager = is_manager_apk(candidate_path);
+			pr_info("Found new base.apk at path: %s, is_manager: %d\n", candidate_path, is_manager);
+
+			if (likely(!is_manager))
+				goto skip_iterate;
+
+			crown_manager(candidate_path, uid_data);
+			stop = 1;
+
+skip_iterate:
 			list_del(&pos->list);
-			if (pos != &data)
+			if (pos != data)
 				kfree(pos);
 		}
 	}
 
-	// clear apk_path_hash_list unconditionally
-	pr_info("Search manager: cleanup!\n");
-	list_for_each_entry_safe (pos, n, &apk_path_hash_list, list) {
-		list_del(&pos->list);
-		kfree(pos);
-	}
 }
 
 static bool is_uid_exist(uid_t uid, char *package, void *data)
@@ -280,13 +246,11 @@ static bool is_uid_exist(uid_t uid, char *package, void *data)
 	return exist;
 }
 
-void track_throne(bool prune_only)
+static void throne_tracker_fn(bool prune_only)
 {
-	struct file *fp =
-		ksu_filp_open_compat(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0);
+	struct file *fp = filp_open(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0);
 	if (IS_ERR(fp)) {
-		pr_err("%s: open " SYSTEM_PACKAGES_LIST_PATH " failed: %ld\n",
-		       __func__, PTR_ERR(fp));
+		pr_err("%s: open " SYSTEM_PACKAGES_LIST_PATH " failed: %ld\n", __func__, PTR_ERR(fp));
 		return;
 	}
 
@@ -298,18 +262,19 @@ void track_throne(bool prune_only)
 	loff_t line_start = 0;
 	char buf[KSU_MAX_PACKAGE_NAME];
 	for (;;) {
-		ssize_t count =
-			ksu_kernel_read_compat(fp, &chr, sizeof(chr), &pos);
+		ssize_t count = kernel_read(fp, &chr, sizeof(chr), &pos);
 		if (count != sizeof(chr))
 			break;
 		if (chr != '\n')
 			continue;
 
-		count = ksu_kernel_read_compat(fp, buf, sizeof(buf),
-					       &line_start);
+		count = kernel_read(fp, buf, sizeof(buf) - 1, &line_start);
+		if (count <= 0) {
+			break;
+		}
+		buf[count] = '\0';
 
-		struct uid_data *data =
-			kzalloc(sizeof(struct uid_data), GFP_ATOMIC);
+		struct uid_data *data = kzalloc(sizeof(struct uid_data), GFP_KERNEL);
 		if (!data) {
 			filp_close(fp, 0);
 			goto out;
@@ -339,13 +304,12 @@ void track_throne(bool prune_only)
 	}
 	filp_close(fp, 0);
 
-	if (prune_only) {
-		pr_info("throne_tracker: prune allowlist only!\n");
-		goto prune;
-	}
-
 	// now update uid list
-	struct uid_data *np, *n;
+	struct uid_data *np;
+	struct uid_data *n;
+
+	if (prune_only)
+		goto prune;
 
 	// first, check if manager_uid exist!
 	bool manager_exist = false;
@@ -364,7 +328,7 @@ void track_throne(bool prune_only)
 		}
 		pr_info("Searching manager...\n");
 		search_manager("/data/app", 2, &uid_list);
-		pr_info("Search manager finished.\n");
+		pr_info("Search manager finished\n");
 	}
 
 prune:
@@ -378,12 +342,73 @@ void track_throne(bool prune_only)
 	}
 }
 
-void ksu_throne_tracker_init(void)
+static DEFINE_MUTEX(throne_tracker_mutex);
+
+static int throne_tracker_thread(void *data)
+{
+	/*
+	 * kthread body started by track_throne(): waits for the packages.list
+	 * rename to settle, then runs throne_tracker_fn() under the mutex.
+	 * data is the prune_only flag that track_throne() cast to void *.
+	 */
+	const bool prune_only = (bool)data;
+
+	pr_info("throne_tracker: pid: %d started\n", current->pid);
+
+	// serialize with every other tracker run; held across the waits below
+	mutex_lock(&throne_tracker_mutex);
+
+	// phase 1: sleep-poll while the temporary file still exists
+	while (is_file_existing("/data/system/packages.list.tmp")) {
+		if (IS_ENABLED(CONFIG_KSU_DEBUG))
+			pr_info("throne_tracker: rename not finished! retry!\n");
+
+		msleep(20); // yield
+	}
+
+	// phase 2: sleep-poll until is_file_stable() accepts the final file
+	while (!is_file_stable(SYSTEM_PACKAGES_LIST_PATH)) {
+		if (IS_ENABLED(CONFIG_KSU_DEBUG))
+			pr_info("throne_tracker: rename not finished! retry!\n");
+
+		msleep(20); // yield
+	}
+
+	// shrink the window where the user opens the manager before it is crowned
+	set_user_nice(current, -10);
+
+	escape_to_root_forced();
+	throne_tracker_fn(prune_only);
+
+	mutex_unlock(&throne_tracker_mutex);
+
+	pr_info("throne_tracker: pid: %d exit!\n", current->pid);
+	return 0;
+}
+
+void track_throne(bool prune_only) // run the throne tracker; prune_only is forwarded to throne_tracker_fn()
+{
+#ifndef CONFIG_KSU_THRONE_TRACKER_ALWAYS_THREADED
+	static bool throne_tracker_first_run __read_mostly = true;
+	if (unlikely(throne_tracker_first_run)) { // the very first call runs synchronously in the caller
+		mutex_lock(&throne_tracker_mutex);
+		throne_tracker_fn(prune_only);
+		mutex_unlock(&throne_tracker_mutex);
+		throne_tracker_first_run = false;
+		return;
+	}
+#endif
+	// HACK: pass prune_only through the void * argument (via uintptr_t, no size-mismatch cast); spawning can fail
+	if (IS_ERR(kthread_run(throne_tracker_thread, (void *)(uintptr_t)prune_only, "ksu_throne")))
+		pr_err("throne_tracker: failed to start ksu_throne kthread\n");
+}
+
+void ksu_throne_tracker_init(void)
 {
 	// nothing to do
 }
 
-void ksu_throne_tracker_exit(void)
+void ksu_throne_tracker_exit(void)
 {
 	// nothing to do
 }
diff --git a/drivers/kernelsu/manager/throne_tracker.h b/drivers/kernelsu/manager/throne_tracker.h
new file mode 100644
index 000000000000..48beebcf8fd9
--- /dev/null
+++ b/drivers/kernelsu/manager/throne_tracker.h
@@ -0,0 +1,60 @@
+#ifndef __KSU_H_UID_OBSERVER
+#define __KSU_H_UID_OBSERVER
+
+void ksu_throne_tracker_init(void); // (void): real prototypes, so calls are argument-checked
+
+void ksu_throne_tracker_exit(void);
+
+void track_throne(bool prune_only); // prune_only is forwarded to the tracker run
+
+/*
+ * Report whether a path can currently be looked up.
+ * true  - kern_path() resolved it
+ * false - kern_path() returned an error
+ * The path obtained by a successful lookup is released before returning.
+ */
+static inline bool is_file_existing(const char *path)
+{
+	struct path found;
+	int err = kern_path(path, 0, &found);
+
+	if (err)
+		return false;
+	path_put(&found);
+	return true;
+}
+
+/*
+ * small helper to check if file is stable
+ * note: if we can hold d_lock ourselves, file is stable
+ * true - file is stable
+ * false - lookup failed (deleted / being renamed) or d_lock is currently held
+ * declared inline like is_file_existing: a plain static in a header warns when unused
+ */
+static inline bool is_file_stable(const char *path)
+{
+	struct path kpath;
+
+	// kern_path returns 0 on success
+	if (kern_path(path, 0, &kpath))
+		return false;
+
+	// just being defensive
+	if (!kpath.dentry) {
+		path_put(&kpath);
+		return false;
+	}
+
+	if (!spin_trylock(&kpath.dentry->d_lock)) {
+		pr_info("%s: lock held for %s, bail out!\n", __func__, path);
+		path_put(&kpath);
+		return false;
+	}
+	// we hold it ourselves here!
+
+	spin_unlock(&kpath.dentry->d_lock);
+	path_put(&kpath);
+	return true;
+}
+
+#endif
diff --git a/drivers/kernelsu/manager_sign.h b/drivers/kernelsu/manager_sign.h
deleted file mode 100644
index 2766b261e311..000000000000
--- a/drivers/kernelsu/manager_sign.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef __KSU_H_MANAGER_SIGN
-#define __KSU_H_MANAGER_SIGN
-
-#include <linux/types.h>
-
-// rsuntk/KernelSU
-#define EXPECTED_SIZE_RSUNTK 0x396
-#define EXPECTED_HASH_RSUNTK                                                   \
-	"f415f4ed9435427e1fdf7f1fccd4dbc07b3d6b8751e4dbcec6f19671f427870b"
-
-typedef struct {
-	u32 size;
-	const char *sha256;
-} apk_sign_key_t;
-
-#endif /* MANAGER_SIGN_H */
diff --git a/drivers/kernelsu/pkg_observer.c b/drivers/kernelsu/pkg_observer.c
deleted file mode 100644
index 049c58e38caf..000000000000
--- a/drivers/kernelsu/pkg_observer.c
+++ /dev/null
@@ -1,126 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/namei.h>
-#include <linux/fsnotify_backend.h>
-#include <linux/slab.h>
-#include <linux/rculist.h>
-#include <linux/version.h>
-
-#define MASK_SYSTEM (FS_CREATE | FS_MOVE | FS_EVENT_ON_CHILD)
-
-struct watch_dir {
-	const char *path;
-	u32 mask;
-	struct path kpath;
-	struct inode *inode;
-	struct fsnotify_mark *mark;
-};
-
-static struct fsnotify_group *g;
-
-static int ksu_handle_inode_event(struct fsnotify_mark *mark, u32 mask,
-				  struct inode *inode, struct inode *dir,
-				  const struct qstr *file_name, u32 cookie)
-{
-	if (!file_name)
-		return 0;
-	if (mask & FS_ISDIR)
-		return 0;
-	if (file_name->len == 13 &&
-	    !memcmp(file_name->name, "packages.list", 13)) {
-		pr_info("packages.list detected: %d\n", mask);
-		track_throne(false);
-	}
-	return 0;
-}
-
-static const struct fsnotify_ops ksu_ops = {
-	.handle_inode_event = ksu_handle_inode_event,
-};
-
-static int add_mark_on_inode(struct inode *inode, u32 mask,
-			     struct fsnotify_mark **out)
-{
-	struct fsnotify_mark *m;
-
-	m = kzalloc(sizeof(*m), GFP_KERNEL);
-	if (!m)
-		return -ENOMEM;
-
-	fsnotify_init_mark(m, g);
-	m->mask = mask;
-
-	if (fsnotify_add_inode_mark(m, inode, 0)) {
-		fsnotify_put_mark(m);
-		return -EINVAL;
-	}
-	*out = m;
-	return 0;
-}
-
-static int watch_one_dir(struct watch_dir *wd)
-{
-	int ret = kern_path(wd->path, LOOKUP_FOLLOW, &wd->kpath);
-	if (ret) {
-		pr_info("path not ready: %s (%d)\n", wd->path, ret);
-		return ret;
-	}
-	wd->inode = d_inode(wd->kpath.dentry);
-	ihold(wd->inode);
-
-	ret = add_mark_on_inode(wd->inode, wd->mask, &wd->mark);
-	if (ret) {
-		pr_err("Add mark failed for %s (%d)\n", wd->path, ret);
-		path_put(&wd->kpath);
-		iput(wd->inode);
-		wd->inode = NULL;
-		return ret;
-	}
-	pr_info("watching %s\n", wd->path);
-	return 0;
-}
-
-static void unwatch_one_dir(struct watch_dir *wd)
-{
-	if (wd->mark) {
-		fsnotify_destroy_mark(wd->mark, g);
-		fsnotify_put_mark(wd->mark);
-		wd->mark = NULL;
-	}
-	if (wd->inode) {
-		iput(wd->inode);
-		wd->inode = NULL;
-	}
-	if (wd->kpath.dentry) {
-		path_put(&wd->kpath);
-		memset(&wd->kpath, 0, sizeof(wd->kpath));
-	}
-}
-
-static struct watch_dir g_watch = { .path = "/data/system",
-				    .mask = MASK_SYSTEM };
-
-int ksu_observer_init(void)
-{
-	int ret = 0;
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 0, 0)
-	g = fsnotify_alloc_group(&ksu_ops, 0);
-#else
-	g = fsnotify_alloc_group(&ksu_ops);
-#endif
-	if (IS_ERR(g))
-		return PTR_ERR(g);
-
-	ret = watch_one_dir(&g_watch);
-	pr_info("observer init done\n");
-	return 0;
-}
-
-void ksu_observer_exit(void)
-{
-	unwatch_one_dir(&g_watch);
-	fsnotify_put_group(g);
-	pr_info("observer exit done\n");
-}
diff --git a/drivers/kernelsu/policy/allowlist.c b/drivers/kernelsu/policy/allowlist.c
new file mode 100644
index 000000000000..f793935f955b
--- /dev/null
+++ b/drivers/kernelsu/policy/allowlist.c
@@ -0,0 +1,543 @@
+#define FILE_MAGIC 0x7f4b5355 // bytes 0x7f 'K' 'S' 'U', u32
+#define FILE_FORMAT_VERSION 3 // u32
+
+#define KSU_APP_PROFILE_PRESERVE_UID 9999 // NOBODY_UID
+#define KSU_DEFAULT_SELINUX_DOMAIN "u:r:" KERNEL_SU_DOMAIN ":s0"
+
+static DEFINE_MUTEX(allowlist_mutex);
+
+// default profiles, these may be used frequently, so we cache it
+static struct root_profile default_root_profile;
+static struct non_root_profile default_non_root_profile;
+
+static void __init init_default_profiles(void) // fill the cached default root / non-root profiles
+{
+	kernel_cap_t full_cap = CAP_FULL_SET;
+
+	default_root_profile.uid = 0;
+	default_root_profile.gid = 0;
+	default_root_profile.groups_count = 1; // exactly one supplementary group: gid 0
+	default_root_profile.groups[0] = 0;
+	memcpy(&default_root_profile.capabilities.effective, &full_cap,
+		   sizeof(default_root_profile.capabilities.effective));
+	default_root_profile.namespaces = KSU_NS_INHERITED;
+	strcpy(default_root_profile.selinux_domain, KSU_DEFAULT_SELINUX_DOMAIN);
+
+	// This means that we will umount modules by default!
+	default_non_root_profile.umount_modules = true;
+}
+
+struct perm_data { // one allowlist entry
+	struct hlist_node list; // linkage in the allow_list hash table
+	struct rcu_head rcu; // used by kfree_rcu() in release_perm_data()
+	struct kref ref; // reference count; entry is freed when it drops to zero
+	struct app_profile profile; // the stored profile; profile.curr_uid is the hash key
+};
+
+// protected by rcu
+#define ALLOW_LIST_BITS 8
+static DEFINE_HASHTABLE(allow_list, ALLOW_LIST_BITS);
+static u16 allow_list_count = 0;
+
+#define KERNEL_SU_ALLOWLIST "/data/adb/ksu/.allowlist"
+
+void ksu_persistent_allow_list(void);
+
+void ksu_show_allow_list(void)
+{
+	struct perm_data *entry = NULL;
+	int bkt;
+
+	pr_info("ksu_show_allow_list\n");
+	rcu_read_lock(); // dump every entry (uid and allow flag) under the RCU read lock
+	hash_for_each_rcu (allow_list, bkt, entry, list)
+		pr_info("uid :%d, allow: %d\n", entry->profile.curr_uid, entry->profile.allow_su);
+	rcu_read_unlock();
+}
+
+struct app_profile *ksu_get_app_profile(uid_t uid) // look up uid's profile and take a reference on it; NULL if absent
+{
+	struct perm_data *p = NULL;
+	bool found;
+
+retry: // no rcu_read_lock() is taken here; ksu_uid_should_umount() holds it around the call
+	found = false;
+	hash_for_each_possible_rcu (allow_list, p, list, uid) {
+		if (uid == p->profile.curr_uid) {
+			// found it, override it with ours
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return NULL;
+
+	if (!kref_get_unless_zero(&p->ref)) { // refcount already hit zero: search again
+		goto retry;
+	}
+
+	return &p->profile; // release with ksu_put_app_profile()
+}
+
+static inline bool forbid_system_uid(uid_t uid) // true for uids that are never granted via the allowlist
+{
+#define SHELL_UID 2000
+#define SYSTEM_UID 1000
+	return uid < SHELL_UID && uid != SYSTEM_UID; // everything below 2000 except 1000
+}
+
+static bool profile_valid(struct app_profile *profile) // sanity-check a profile; NOTE: may upgrade it in place
+{
+	if (!profile) {
+		return false;
+	}
+
+	bool need_migrate_su_domain = false;
+
+	if (unlikely(profile->version == 2)) { // version 2 is accepted and bumped to the current version
+		profile->version = KSU_APP_PROFILE_VER;
+		need_migrate_su_domain = true;
+	}
+
+	if (strnlen(profile->key, sizeof(profile->key)) >= sizeof(profile->key)) { // key must be NUL-terminated
+		pr_err("invalid app_profile key\n");
+		return false;
+	}
+
+	if (profile->version < KSU_APP_PROFILE_VER) {
+		pr_info("Unsupported profile version: %d\n", profile->version);
+		return false;
+	}
+
+	if (profile->allow_su) { // the root-profile fields are only checked for su-granted profiles
+		if (profile->rp_config.profile.groups_count > KSU_MAX_GROUPS) {
+			pr_err("invalid groups_count in app_profile: %s\n", profile->key);
+			return false;
+		}
+
+		char *domain = profile->rp_config.profile.selinux_domain;
+		static const size_t domain_len = sizeof(profile->rp_config.profile.selinux_domain);
+		if (unlikely(need_migrate_su_domain)) {
+			if (strncmp(domain, "u:r:su:s0", domain_len) == 0) { // old default domain -> current default
+				strscpy_pad(domain, KSU_DEFAULT_SELINUX_DOMAIN, domain_len);
+				pr_info("migrated profile domain: %s\n", profile->key);
+			}
+		}
+		size_t len = strnlen(domain, domain_len);
+
+		if (len == 0 || len >= domain_len) { // domain must be non-empty and NUL-terminated
+			pr_err("invalid selinux_domain in app_profile: %s\n", profile->key);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static void release_perm_data(struct kref *ref) // kref release callback used by put_perm_data()
+{
+	struct perm_data *p = container_of(ref, struct perm_data, ref);
+	kfree_rcu(p, rcu); // freeing is deferred through RCU via the embedded rcu head
+}
+
+static void put_perm_data(struct perm_data *data) // drop one reference on an allowlist entry
+{
+	kref_put(&data->ref, release_perm_data); // release_perm_data() runs when the count reaches zero
+}
+
+int ksu_set_app_profile(struct app_profile *profile) // insert or replace the entry for profile->curr_uid; 0 or -errno
+{
+	struct perm_data *p, *np;
+	int result = 0;
+
+	if (!profile_valid(profile)) {
+		pr_err("Failed to set app profile: invalid profile!\n");
+		return -EINVAL;
+	}
+
+	// the reserved uid KSU_APP_PROFILE_PRESERVE_UID may only carry the "$" key (default non root profile)
+	if (unlikely(profile->curr_uid == KSU_APP_PROFILE_PRESERVE_UID && strcmp(profile->key, "$") != 0)) {
+		return -EINVAL;
+	}
+
+	mutex_lock(&allowlist_mutex); // writers are serialized by the mutex; readers walk the table under RCU
+
+	hash_for_each_possible (allow_list, p, list, profile->curr_uid) {
+		if (profile->curr_uid == p->profile.curr_uid) {
+			if (strcmp(profile->key, p->profile.key) != 0) {
+				pr_warn("ksu_set_app_profile: key changed: uid=%d orig=%s new=%s\n", profile->curr_uid, p->profile.key,
+						profile->key);
+			}
+			// found it, just override it all!
+			np = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL);
+			if (!np) {
+				result = -ENOMEM;
+				goto out_unlock;
+			}
+			kref_init(&np->ref);
+			memcpy(&np->profile, profile, sizeof(*profile));
+			hlist_replace_rcu(&p->list, &np->list); // swap the new node in place of the old one
+			put_perm_data(p); // drop the table's reference on the old node
+			goto out;
+		}
+	}
+
+	if (unlikely(allow_list_count == U16_MAX)) { // the entry counter is a u16
+		pr_err("too many app profile\n");
+		result = -E2BIG;
+		goto out_unlock;
+	}
+
+	// not found, alloc a new node!
+	np = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL);
+	if (!np) {
+		pr_err("ksu_set_app_profile alloc failed\n");
+		result = -ENOMEM;
+		goto out_unlock;
+	}
+
+	kref_init(&np->ref);
+	memcpy(&np->profile, profile, sizeof(*profile));
+	if (profile->allow_su) {
+		pr_info("set root profile, key: %s, uid: %d, gid: %d, context: %s\n", profile->key, profile->curr_uid,
+				profile->rp_config.profile.gid, profile->rp_config.profile.selinux_domain);
+	} else {
+		pr_info("set app profile, key: %s, uid: %d, umount modules: %d\n", profile->key, profile->curr_uid,
+				profile->nrp_config.profile.umount_modules);
+	}
+
+	hash_add_rcu(allow_list, &np->list, np->profile.curr_uid);
+	++allow_list_count;
+
+out: // reached after a successful insert or replace
+	result = 0;
+
+	if (unlikely(profile->curr_uid == KSU_APP_PROFILE_PRESERVE_UID)) {
+		// set default non root profile
+		default_non_root_profile.umount_modules = profile->nrp_config.profile.umount_modules;
+	}
+
+out_unlock:
+	mutex_unlock(&allowlist_mutex);
+	return result;
+}
+
+bool __ksu_is_allow_uid(uid_t uid)
+{
+	// Decide whether uid is allowed: fixed rules first, then the allowlist.
+	struct perm_data *entry;
+	bool allowed = false;
+
+	// uids rejected by forbid_system_uid() never need a list walk
+	if (forbid_system_uid(uid))
+		return false;
+
+	// manager is always allowed!
+	if (unlikely(is_uid_manager(uid)))
+		return true;
+
+	// with CONFIG_KSU_DEBUG the shell uid is let through as well
+	if (IS_ENABLED(CONFIG_KSU_DEBUG) && unlikely(uid == SHELL_UID))
+		return true;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu (allow_list, entry, list, uid) {
+		if (entry->profile.curr_uid != uid || !entry->profile.allow_su)
+			continue;
+		allowed = true;
+		break;
+	}
+	rcu_read_unlock();
+	return allowed;
+}
+
+bool __ksu_is_allow_uid_for_current(uid_t uid)
+{
+	// uid 0 is already root: the answer is whatever is_ksu_domain() reports
+	if (unlikely(uid == 0))
+		return is_ksu_domain();
+	// every other uid goes through the regular allowlist check
+	return __ksu_is_allow_uid(uid);
+}
+
+bool ksu_uid_should_umount(uid_t uid) // should modules be unmounted for this uid?
+{
+	struct app_profile *profile;
+	bool res;
+	if (is_uid_manager(uid)) { // shared helper, as in the sibling lookups; the invalid appid never matches a modulo result
+		// we should not umount on manager!
+		return false;
+	}
+	if (unlikely(uid == WEBVIEW_ZYGOTE_UID)) {
+		// we should not umount for webview zygote
+		return false;
+	}
+
+	rcu_read_lock();
+	profile = ksu_get_app_profile(uid); // takes a reference when a profile is found
+	if (!profile) {
+		// no app profile found, it must be non root app
+		res = default_non_root_profile.umount_modules;
+	} else if (profile->allow_su) {
+		// if found and it is granted to su, we shouldn't umount for it
+		res = false;
+	} else {
+		// found an app profile
+		if (profile->nrp_config.use_default) {
+			res = default_non_root_profile.umount_modules;
+		} else {
+			res = profile->nrp_config.profile.umount_modules;
+		}
+	}
+	rcu_read_unlock();
+
+	if (profile)
+		ksu_put_app_profile(profile); // drop the reference taken by the lookup
+	return res;
+}
+
+void ksu_put_app_profile(struct app_profile *profile) // release a profile returned by ksu_get_app_profile()
+{
+	struct perm_data *p = container_of(profile, struct perm_data, profile); // recover the owning entry
+	put_perm_data(p);
+}
+
+struct root_profile *ksu_get_root_profile(uid_t uid) // never NULL: uid's own root profile or the cached default
+{
+	struct perm_data *p = NULL;
+	struct root_profile *res;
+
+	rcu_read_lock();
+	if (is_uid_manager(uid)) { // the manager always gets the default root profile
+		goto use_default;
+	}
+
+	if (IS_ENABLED(CONFIG_KSU_DEBUG) && unlikely(uid == SHELL_UID))
+		goto use_default;
+
+retry:
+	res = NULL;
+	hash_for_each_possible_rcu (allow_list, p, list, uid) {
+		if (uid == p->profile.curr_uid && p->profile.allow_su) {
+			if (!p->profile.rp_config.use_default) {
+				if (!kref_get_unless_zero(&p->ref)) { // refcount already zero: search again
+					goto retry;
+				}
+				res = &p->profile.rp_config.profile; // reference held; release with ksu_put_root_profile()
+			}
+			break;
+		}
+	}
+
+	if (unlikely(!res)) { // no entry, or the entry asks for the default
+	use_default:
+		res = &default_root_profile;
+	}
+
+	rcu_read_unlock();
+	return res;
+}
+
+void ksu_put_root_profile(struct root_profile *profile) // release a profile returned by ksu_get_root_profile()
+{
+	if (likely(profile == &default_root_profile)) // the cached default is static: nothing to release
+		return;
+	struct perm_data *p = container_of(profile, struct perm_data, profile.rp_config.profile);
+	put_perm_data(p);
+}
+
+bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, bool allow)
+{
+	// Collect the uids whose allow_su equals `allow`, the manager excluded.
+	// At most `length` uids are stored; the total match count is still reported.
+	struct perm_data *entry = NULL;
+	u16 matched = 0;
+	u16 stored = 0;
+	int bkt;
+
+	rcu_read_lock();
+	hash_for_each_rcu (allow_list, bkt, entry, list) {
+		if (entry->profile.allow_su != allow || is_uid_manager(entry->profile.curr_uid))
+			continue;
+		if (stored < length)
+			array[stored++] = entry->profile.curr_uid;
+		++matched;
+	}
+	rcu_read_unlock();
+
+	if (out_length)
+		*out_length = stored;
+	if (out_total)
+		*out_total = matched;
+	return true; // unconditional
+}
+
+static void do_persistent_allow_list(void) // write magic, version and every profile to KERNEL_SU_ALLOWLIST
+{
+	u32 magic = FILE_MAGIC;
+	u32 version = FILE_FORMAT_VERSION;
+	struct perm_data *p = NULL;
+	loff_t off = 0;
+	int i;
+
+	struct file *fp = filp_open(KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	if (IS_ERR(fp)) {
+		pr_err("save_allow_list create file failed: %ld\n", PTR_ERR(fp));
+		return;
+	}
+
+	// store magic and version
+	if (kernel_write(fp, &magic, sizeof(magic), &off) != sizeof(magic)) {
+		pr_err("save_allow_list write magic failed.\n");
+		goto close_file;
+	}
+
+	if (kernel_write(fp, &version, sizeof(version), &off) != sizeof(version)) {
+		pr_err("save_allow_list write version failed.\n");
+		goto close_file;
+	}
+
+	hash_for_each (allow_list, i, p, list) {
+		pr_info("save allow list, name: %s uid :%d, allow: %d\n", p->profile.key, p->profile.curr_uid,
+				p->profile.allow_su);
+		if (kernel_write(fp, &p->profile, sizeof(p->profile), &off) != sizeof(p->profile)) {
+			pr_err("save_allow_list write profile failed.\n"); // stop: a short record would misalign the rest
+			goto close_file;
+		}
+	}
+
+close_file:
+	filp_close(fp, 0);
+}
+
+// kthread entry point that saves the allow list to disk.
+// A kthread is a bit heavier than task work / a workqueue, but it gives us a
+// context of our own, which is escalated to root before writing.
+// `data` is unused. Always returns 0.
+static int persistent_allow_list_pre(void *data)
+{
+	pr_info("do_persistent_allow_list: pid: %d started\n", current->pid);
+
+	/*
+	 * Hold allowlist_mutex for the whole save. Everything below boils
+	 * down to kernel_write(), so this one lock gives us both exclusive
+	 * file I/O and one active saver kthread at a time. There is no
+	 * single-instance guard: concurrent savers simply block here until
+	 * it is their turn. A "trylock, return on failure" scheme would be
+	 * detrimental here.
+	 */
+	mutex_lock(&allowlist_mutex);
+
+	escape_to_root_forced(); // give permissions for everything
+	do_persistent_allow_list();
+
+	mutex_unlock(&allowlist_mutex);
+
+	pr_info("do_persistent_allow_list: pid: %d exit\n", current->pid);
+	return 0;
+}
+
+// Asynchronously save the allow list by spawning a one-shot kthread running
+// persistent_allow_list_pre(). A failure to create the thread is logged
+// instead of being silently dropped; the list is simply not saved then.
+void ksu_persistent_allow_list()
+{
+	struct task_struct *tsk = kthread_run(persistent_allow_list_pre, NULL, "allowlist");
+
+	if (IS_ERR(tsk))
+		pr_err("save_allow_list: failed to start kthread: %ld\n", PTR_ERR(tsk));
+}
+
+// Load the allow list from KERNEL_SU_ALLOWLIST.
+//
+// Expects the layout: u32 magic, u32 version, then a sequence of raw
+// `struct app_profile` records. Each complete record is passed to
+// ksu_set_app_profile(). Loading stops at EOF, on a read error, or at a
+// truncated trailing record, which is rejected rather than applied with
+// partly uninitialized contents.
+void ksu_load_allow_list()
+{
+	loff_t off = 0;
+	ssize_t ret = 0;
+	struct file *fp = NULL;
+	// zero-initialized so the error paths below never print stack garbage
+	u32 magic = 0;
+	u32 version = 0;
+
+	// load allowlist now!
+	fp = filp_open(KERNEL_SU_ALLOWLIST, O_RDONLY, 0);
+	if (IS_ERR(fp)) {
+		pr_err("load_allow_list open file failed: %ld\n", PTR_ERR(fp));
+		return;
+	}
+
+	// verify magic
+	if (kernel_read(fp, &magic, sizeof(magic), &off) != sizeof(magic) || magic != FILE_MAGIC) {
+		pr_err("allowlist file invalid: %d!\n", magic);
+		goto exit;
+	}
+
+	if (kernel_read(fp, &version, sizeof(version), &off) != sizeof(version)) {
+		pr_err("allowlist read version failed\n");
+		goto exit;
+	}
+
+	pr_info("allowlist version: %d\n", version);
+
+	while (true) {
+		struct app_profile profile;
+
+		ret = kernel_read(fp, &profile, sizeof(profile), &off);
+
+		// 0 is EOF, negative is an I/O error; both end the load
+		if (ret <= 0) {
+			pr_info("load_allow_list read err: %zd\n", ret);
+			break;
+		}
+
+		// a short read leaves the tail of `profile` uninitialized
+		if ((size_t)ret != sizeof(profile)) {
+			pr_err("load_allow_list: truncated profile record (%zd bytes), ignored\n", ret);
+			break;
+		}
+
+		pr_info("load_allow_uid, name: %s, uid: %d, allow: %d\n", profile.key, profile.curr_uid, profile.allow_su);
+		ksu_set_app_profile(&profile);
+	}
+
+exit:
+	ksu_show_allow_list();
+	filp_close(fp, 0);
+}
+
+// Remove allow-list entries that the caller no longer considers valid.
+//
+// is_uid_valid(uid, package, data) is called for every entry except the one
+// with KSU_APP_PROFILE_PRESERVE_UID; entries for which it returns false are
+// unlinked and released. Does nothing until ksu_boot_completed is set. If
+// anything was removed, a save of the list is scheduled afterwards.
+void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *), void *data)
+{
+	struct perm_data *np = NULL;
+	struct hlist_node *tmp;
+	int i;
+
+	if (!ksu_boot_completed) {
+		pr_info("boot not completed, skip prune\n");
+		return;
+	}
+
+	bool modified = false;
+	mutex_lock(&allowlist_mutex);
+	hash_for_each_safe (allow_list, i, tmp, np, list) {
+		uid_t uid = np->profile.curr_uid;
+		char *package = np->profile.key;
+		// we use this uid for special cases, don't prune it!
+		bool is_preserved_uid = uid == KSU_APP_PROFILE_PRESERVE_UID;
+		if (!is_preserved_uid && !is_uid_valid(uid, package, data)) {
+			modified = true;
+			// log before put_perm_data(): `package` points into np
+			pr_info("prune uid: %d, package: %s\n", uid, package);
+			hlist_del_rcu(&np->list);
+			put_perm_data(np);
+			--allow_list_count;
+		}
+	}
+	mutex_unlock(&allowlist_mutex);
+
+	if (modified) {
+		smp_mb();
+		ksu_persistent_allow_list();
+	}
+}
+
+// Init-time setup of the allow list: only initializes the default profiles.
+void __init ksu_allowlist_init(void)
+{
+	init_default_profiles();
+}
+
+// Exit-time teardown: unlink every allow-list entry and drop its reference.
+void __exit ksu_allowlist_exit(void)
+{
+	struct hlist_node *next;
+	struct perm_data *entry = NULL;
+	int bkt;
+
+	// free allowlist, holding the same mutex the other writers take
+	mutex_lock(&allowlist_mutex);
+	hash_for_each_safe (allow_list, bkt, next, entry, list) {
+		hlist_del(&entry->list);
+		put_perm_data(entry);
+	}
+	mutex_unlock(&allowlist_mutex);
+}
diff --git a/drivers/kernelsu/allowlist.h b/drivers/kernelsu/policy/allowlist.h
similarity index 63%
rename from drivers/kernelsu/allowlist.h
rename to drivers/kernelsu/policy/allowlist.h
index d52795afe866..59809cc7ccd3 100644
--- a/drivers/kernelsu/allowlist.h
+++ b/drivers/kernelsu/policy/allowlist.h
@@ -1,11 +1,10 @@
 #ifndef __KSU_H_ALLOWLIST
 #define __KSU_H_ALLOWLIST
 
-#include <linux/types.h>
-#include <linux/uidgid.h>
 #include "app_profile.h"
 
 #define PER_USER_RANGE 100000
+#define WEBVIEW_ZYGOTE_UID 1053
 #define FIRST_APPLICATION_UID 10000
 #define LAST_APPLICATION_UID 19999
 #define FIRST_ISOLATED_UID 99000
@@ -25,19 +24,23 @@ bool __ksu_is_allow_uid(uid_t uid);
 
 // Check if the uid is in allow list, or current is ksu domain root
 bool __ksu_is_allow_uid_for_current(uid_t uid);
-#define ksu_is_allow_uid_for_current(uid)                                      \
-	unlikely(__ksu_is_allow_uid_for_current(uid))
+#define ksu_is_allow_uid_for_current(uid) unlikely(__ksu_is_allow_uid_for_current(uid))
 
-bool ksu_get_allow_list(int *array, int *length, bool allow);
+bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, bool allow);
 
-void ksu_prune_allowlist(bool (*is_uid_exist)(uid_t, char *, void *),
-			 void *data);
+void ksu_prune_allowlist(bool (*is_uid_exist)(uid_t, char *, void *), void *data);
+void ksu_persistent_allow_list();
 
-bool ksu_get_app_profile(struct app_profile *);
-bool ksu_set_app_profile(struct app_profile *, bool persist);
+// should be called with rcu read lock
+struct app_profile *ksu_get_app_profile(uid_t uid);
+// only used to put the app_profile returned by ksu_get_app_profile
+void ksu_put_app_profile(struct app_profile *);
+int ksu_set_app_profile(struct app_profile *);
 
 bool ksu_uid_should_umount(uid_t uid);
 struct root_profile *ksu_get_root_profile(uid_t uid);
+// only used to put the root_profile returned by ksu_get_root_profile
+void ksu_put_root_profile(struct root_profile *);
 
 static inline bool is_appuid(uid_t uid)
 {
diff --git a/drivers/kernelsu/policy/app_profile.c b/drivers/kernelsu/policy/app_profile.c
new file mode 100644
index 000000000000..7abdbcdf8f40
--- /dev/null
+++ b/drivers/kernelsu/policy/app_profile.c
@@ -0,0 +1,209 @@
+// Statically allocated group_info that setup_groups() installs when a
+// profile asks for exactly one group, gid 0. Only the initializer of `usage`
+// differs between kernel versions (REFCOUNT_INIT from 6.7, ATOMIC_INIT before).
+// NOTE(review): the initial count of 2 presumably keeps this static object
+// from ever being freed through put_group_info() — confirm.
+#if LINUX_VERSION_CODE >= KERNEL_VERSION (6, 7, 0)
+static struct group_info root_groups = { .usage = REFCOUNT_INIT(2) };
+#else
+static struct group_info root_groups = { .usage = ATOMIC_INIT(2) };
+#endif
+
+// Install the supplementary groups listed in `profile` into `cred`.
+//
+// Best effort: on any failure (too many groups, allocation failure, a gid
+// that does not map into the current user namespace) a warning is logged and
+// `cred` keeps its existing group_info.
+static void setup_groups(struct root_profile *profile, struct cred *cred)
+{
+	if (profile->groups_count > KSU_MAX_GROUPS) {
+		pr_warn("Failed to setgroups, too large group: %d!\n",
+			profile->uid);
+		return;
+	}
+
+	if (profile->groups_count == 1 && profile->groups[0] == 0) {
+		// setgroup to root and return early.
+		// swaps in the shared static root_groups instead of allocating
+		if (cred->group_info)
+			put_group_info(cred->group_info);
+		cred->group_info = get_group_info(&root_groups);
+		return;
+	}
+
+	u32 ngroups = profile->groups_count;
+	struct group_info *group_info = groups_alloc(ngroups);
+	if (!group_info) {
+		pr_warn("Failed to setgroups, ENOMEM for: %d\n", profile->uid);
+		return;
+	}
+
+	int i;
+	for (i = 0; i < ngroups; i++) {
+		gid_t gid = profile->groups[i];
+		kgid_t kgid = make_kgid(current_user_ns(), gid);
+		if (!gid_valid(kgid)) {
+			pr_warn("Failed to setgroups, invalid gid: %d\n", gid);
+			put_group_info(group_info);
+			return;
+		}
+		// kernels before 4.9 access the gid array through GROUP_AT()
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
+		group_info->gid[i] = kgid;
+#else
+		GROUP_AT(group_info, i) = kgid;
+#endif
+	}
+
+	groups_sort(group_info);
+	set_groups(cred, group_info);
+	// drop our allocation reference now that set_groups() has been called
+	put_group_info(group_info);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+// Turn seccomp off for the current task (kernels >= 5.9).
+//
+// Clears the seccomp syscall-work/thread flag and resets current->seccomp
+// under siglock. The filter that was attached is released through a
+// temporary byte copy of the task struct ("fake"), taken before the filter
+// pointer is cleared, which is handed to seccomp_filter_release().
+// If the temporary copy cannot be allocated, seccomp is left untouched.
+static void disable_seccomp(void)
+{
+	struct task_struct *fake;
+
+	// allocate before taking the spinlock: GFP_KERNEL may sleep
+	fake = kmalloc(sizeof(*fake), GFP_KERNEL);
+	if (!fake) {
+		pr_warn("failed to alloc fake task_struct\n");
+		return;
+	}
+
+	// Refer to kernel/seccomp.c: seccomp_set_mode_strict
+	// When disabling Seccomp, ensure that current->sighand->siglock is held during the operation.
+	spin_lock_irq(&current->sighand->siglock);
+	// disable seccomp
+#if defined(CONFIG_GENERIC_ENTRY) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
+	clear_syscall_work(SECCOMP);
+#else
+	clear_thread_flag(TIF_SECCOMP);
+#endif
+
+	// snapshot the task while it still carries the old filter pointer
+	memcpy(fake, current, sizeof(*fake));
+
+	current->seccomp.mode = 0;
+	current->seccomp.filter = NULL;
+	atomic_set(&current->seccomp.filter_count, 0);
+	spin_unlock_irq(&current->sighand->siglock);
+
+	// adjust the copy per kernel version before releasing the filter;
+	// presumably this satisfies checks inside seccomp_filter_release(),
+	// see the linked upstream commits
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0)
+	// https://github.com/torvalds/linux/commit/bfafe5efa9754ebc991750da0bcca2a6694f3ed3#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R576-R577
+	fake->flags |= PF_EXITING;
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
+	// https://github.com/torvalds/linux/commit/0d8315dddd2899f519fe1ca3d4d5cdaf44ea421e#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R556-R558
+	fake->sighand = NULL;
+#endif
+
+	seccomp_filter_release(fake);
+	kfree(fake);
+}
+#else /* LINUX_VERSION_CODE < 5.9 */
+/*
+ * For kernels < 5.9 the filter is not released here; free_task is left to
+ * do it for us (put_seccomp_filter). Releasing it ourselves would risk a
+ * double free / double decrement, which isn't safe on old kernels.
+ * I'm not even sure if this thing is needed on newer kernels.
+ */
+static void disable_seccomp(void)
+{
+	spin_lock_irq(&current->sighand->siglock);
+
+	clear_thread_flag(TIF_SECCOMP);
+	current->seccomp.mode = 0;
+	current->seccomp.filter = NULL;
+
+	spin_unlock_irq(&current->sighand->siglock);
+}
+#endif // 5.9
+
+/*
+ * Switch the current task to the credentials described by its root profile.
+ *
+ * The profile is looked up with ksu_get_root_profile() for the task's
+ * current uid; uids/gids, securebits, user accounting, capabilities,
+ * supplementary groups and the SELinux domain are set on a fresh cred which
+ * is then committed. Afterwards seccomp is disabled if it was active and the
+ * mount namespace is set up from the profile.
+ *
+ * @is_forced: skip the "already root" check.
+ *
+ * Returns 0 on success or when the task is already root (and !is_forced),
+ * -ENOMEM when the cred or user_struct cannot be allocated, or the error
+ * from set_cred_ucounts(). On any error the prepared cred is aborted and the
+ * task keeps its previous credentials.
+ */
+static int escape_to_root(bool is_forced)
+{
+	int ret = 0;
+	struct cred *cred;
+	struct root_profile *profile = NULL;
+	struct user_struct *new_user;
+
+	cred = prepare_creds();
+	if (!cred) {
+		pr_warn("prepare_creds failed!\n");
+		return -ENOMEM;
+	}
+
+	if (!is_forced && ksu_get_uid_t(cred->euid) == 0) {
+		pr_warn("Already root, don't escape!\n");
+		goto out_abort_creds;
+	}
+
+	profile = ksu_get_root_profile(ksu_get_uid_t(cred->uid));
+
+	ksu_get_uid_t(cred->uid) = profile->uid;
+	ksu_get_uid_t(cred->suid) = profile->uid;
+	ksu_get_uid_t(cred->euid) = profile->uid;
+	ksu_get_uid_t(cred->fsuid) = profile->uid;
+
+	ksu_get_uid_t(cred->gid) = profile->gid;
+	ksu_get_uid_t(cred->fsgid) = profile->gid;
+	ksu_get_uid_t(cred->sgid) = profile->gid;
+	ksu_get_uid_t(cred->egid) = profile->gid;
+	cred->securebits = 0;
+
+	BUILD_BUG_ON(sizeof(profile->capabilities.effective) != sizeof(kernel_cap_t));
+
+	/*
+	 * Mirror the kernel set*uid path: update cred->user first, then
+	 * cred->ucounts, before commit_creds(). commit_creds() moves
+	 * RLIMIT_NPROC accounting based on cred->user; if uid changes while
+	 * user/ucounts stay stale, the old charge can remain pinned to the
+	 * previous UID.
+	 * See kernel/sys.c:set_user() and kernel/cred.c:set_cred_ucounts() /
+	 * commit_creds():
+	 * https://github.com/torvalds/linux/blob/v5.14/kernel/sys.c
+	 * https://github.com/torvalds/linux/blob/v5.14/kernel/cred.c
+	 */
+	new_user = alloc_uid(cred->uid);
+	if (!new_user) {
+		ret = -ENOMEM;
+		goto out_abort_creds;
+	}
+
+	free_uid(cred->user);
+	cred->user = new_user;
+
+	// v5.14+ added cred->ucounts, so we must refresh it after changing uid/user:
+	// https://github.com/torvalds/linux/commit/905ae01c4ae2ae3df05bb141801b1db4b7d83c61#diff-ff6060da281bd9ef3f24e17b77a9b0b5b2ed2d7208bb69b29107bee69732bd31
+	// on older kernels, per-UID process accounting lives in user_struct.
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+	// propagate the error: bailing out with ret == 0 would report success
+	// although the credentials were never committed
+	ret = set_cred_ucounts(cred);
+	if (ret)
+		goto out_abort_creds;
+#endif
+
+	// setup capabilities
+	// we need CAP_DAC_READ_SEARCH because `/data/adb/ksud` is not accessible for non root process
+	// we add it here but don't add it to cap_inheritable, it would be dropped automatically after exec!
+	// NOTE(review): CAP_DAC_READ_SEARCH is a capability index, not a bit mask,
+	// so this ORs in the raw value rather than (1ULL << CAP_DAC_READ_SEARCH).
+	// Left unchanged because callers may depend on the bit that is actually
+	// set today — confirm the intent before changing it.
+	u64 cap_for_ksud = profile->capabilities.effective | CAP_DAC_READ_SEARCH;
+	memcpy(&cred->cap_effective, &cap_for_ksud, sizeof(cred->cap_effective));
+	memcpy(&cred->cap_permitted, &profile->capabilities.effective, sizeof(cred->cap_permitted));
+	memcpy(&cred->cap_bset, &profile->capabilities.effective, sizeof(cred->cap_bset));
+
+	setup_groups(profile, cred);
+	setup_selinux(profile->selinux_domain, cred);
+
+	commit_creds(cred);
+
+	if (test_thread_flag(TIF_SECCOMP))
+		disable_seccomp();
+
+	setup_mount_ns(profile->namespaces);
+	ksu_put_root_profile(profile);
+	return 0;
+
+out_abort_creds:
+	// profile is still NULL on the "already root" path
+	if (profile)
+		ksu_put_root_profile(profile);
+	abort_creds(cred);
+	return ret;
+}
+
+// Escalate the current task using its root profile, unless it is already
+// root (euid 0). Returns the result of escape_to_root(false).
+int escape_with_root_profile(void)
+{
+	return escape_to_root(false);
+}
+
+// Escalate the current task even when it is already root.
+// The return value of escape_to_root() is discarded, so callers get no
+// indication of failure.
+void escape_to_root_forced(void)
+{
+	// I'm not really sure which permissions are needed
+	// it's just escape to root, but it bypasses the "already root" cred check,
+	// which we likely already pass in the contexts where this will be used.
+	escape_to_root(true);
+}
diff --git a/drivers/kernelsu/policy/app_profile.h b/drivers/kernelsu/policy/app_profile.h
new file mode 100644
index 000000000000..747f550236d7
--- /dev/null
+++ b/drivers/kernelsu/policy/app_profile.h
@@ -0,0 +1,9 @@
+#ifndef __KSU_H_APP_PROFILE
+#define __KSU_H_APP_PROFILE
+
+// Escalate current process to root with the appropriate profile
+int escape_with_root_profile(void);
+
+void escape_to_root_forced(void);
+
+#endif
diff --git a/drivers/kernelsu/feature.c b/drivers/kernelsu/policy/feature.c
similarity index 96%
rename from drivers/kernelsu/feature.c
rename to drivers/kernelsu/policy/feature.c
index a1017aafbb8e..cf9ee4d5e0eb 100644
--- a/drivers/kernelsu/feature.c
+++ b/drivers/kernelsu/policy/feature.c
@@ -1,8 +1,3 @@
-#include "feature.h"
-#include "klog.h" // IWYU pragma: keep
-
-#include <linux/mutex.h>
-
 static const struct ksu_feature_handler *feature_handlers[KSU_FEATURE_MAX];
 
 static DEFINE_MUTEX(feature_mutex);
@@ -149,7 +144,7 @@ int ksu_set_feature(u32 feature_id, u64 value)
 	return ret;
 }
 
-void ksu_feature_init(void)
+void __init ksu_feature_init(void)
 {
 	int i;
 
@@ -160,7 +155,7 @@ void ksu_feature_init(void)
 	pr_info("feature: feature management initialized\n");
 }
 
-void ksu_feature_exit(void)
+void __exit ksu_feature_exit(void)
 {
 	int i;
 
diff --git a/drivers/kernelsu/feature.h b/drivers/kernelsu/policy/feature.h
similarity index 82%
rename from drivers/kernelsu/feature.h
rename to drivers/kernelsu/policy/feature.h
index a5de137a5cfb..1eb12392e617 100644
--- a/drivers/kernelsu/feature.h
+++ b/drivers/kernelsu/policy/feature.h
@@ -1,15 +1,6 @@
 #ifndef __KSU_H_FEATURE
 #define __KSU_H_FEATURE
 
-#include <linux/types.h>
-
-enum ksu_feature_id {
-	KSU_FEATURE_SU_COMPAT = 0,
-	KSU_FEATURE_KERNEL_UMOUNT = 1,
-
-	KSU_FEATURE_MAX
-};
-
 typedef int (*ksu_feature_get_t)(u64 *value);
 typedef int (*ksu_feature_set_t)(u64 value);
 
diff --git a/drivers/kernelsu/runtime/ksud.c b/drivers/kernelsu/runtime/ksud.c
new file mode 100644
index 000000000000..44b3c25d2618
--- /dev/null
+++ b/drivers/kernelsu/runtime/ksud.c
@@ -0,0 +1,563 @@
+// init.rc fragment that the read proxies append to init.rc once init reaches
+// EOF. Each trigger runs KSUD_PATH in the KERNEL_SU_DOMAIN SELinux domain
+// with the matching stage argument (post-fs-data, services, boot-completed).
+static const char KERNEL_SU_RC[] =
+	"\n"
+
+	"on post-fs-data\n"
+	"    start logd\n"
+	// `exec` is used so that init waits for the post-fs-data stage to finish
+	"    exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " post-fs-data\n"
+	"\n"
+
+	"on nonencrypted\n"
+	"    exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n"
+	"\n"
+
+	"on property:vold.decrypt=trigger_restart_framework\n"
+	"    exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n"
+	"\n"
+
+	"on property:sys.boot_completed=1\n"
+	"    exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " boot-completed\n"
+	"\n"
+
+	"\n";
+
+// forward declarations: both are defined near the end of this file
+static void stop_vfs_read_hook();
+static void stop_input_hook();
+
+// set by on_module_mounted()
+static bool ksu_module_mounted __read_mostly = false;
+// set by on_boot_completed(), or by the watchdog when it times out
+static bool ksu_boot_completed __read_mostly = false;
+// cleared by stop_vfs_read_hook()
+static bool ksu_vfs_read_hook __read_mostly = true;
+// cleared by stop_input_hook()
+static bool ksu_input_hook __read_mostly = true;
+
+// With jump labels available, the init.rc hooks are additionally gated by a
+// static key that starts enabled; this helper turns it off. Without jump
+// labels it compiles to a no-op and only the ksu_vfs_read_hook flag is used.
+#ifdef KSU_CAN_USE_JUMP_LABEL
+DEFINE_STATIC_KEY_TRUE(ksud_vfs_read_key);
+static inline void ksu_disable_vfs_read_branch()
+{
+	pr_info("vfs_read_hook: remove vfs_read branches\n");
+	static_branch_disable(&ksud_vfs_read_key);
+	smp_mb();
+}
+#else
+static inline void ksu_disable_vfs_read_branch() { } // no-op
+#endif
+
+// post-fs-data stage hook: loads the allow list from disk and stops the
+// volume-key input hook. Only the first call does any work; later calls
+// just log and return.
+void on_post_fs_data(void)
+{
+	static bool done = false;
+	if (done) {
+		pr_info("on_post_fs_data already done\n");
+		return;
+	}
+	done = true;
+	pr_info("on_post_fs_data!\n");
+
+	ksu_load_allow_list();
+	// make sure the input hook is gone by now: leaving it registered
+	// may affect performance
+	stop_input_hook();
+}
+
+extern void ext4_unregister_sysfs(struct super_block *sb);
+// Unregister the ext4 sysfs entry of the filesystem mounted at `mnt`.
+// Returns 0 on success, the kern_path() error if the path cannot be
+// resolved, or -EINVAL if the filesystem there is not ext4.
+int nuke_ext4_sysfs(const char *mnt)
+{
+	struct path path;
+	struct super_block *sb;
+	int err;
+
+	err = kern_path(mnt, 0, &path);
+	if (err) {
+		pr_err("nuke path err: %d\n", err);
+		return err;
+	}
+
+	sb = path.dentry->d_inode->i_sb;
+	if (strcmp(sb->s_type->name, "ext4") == 0) {
+		ext4_unregister_sysfs(sb);
+	} else {
+		pr_info("nuke but module aren't mounted\n");
+		err = -EINVAL;
+	}
+
+	// single release point for the path reference taken by kern_path()
+	path_put(&path);
+	return err;
+}
+
+// Record that modules have been mounted (sets ksu_module_mounted).
+void on_module_mounted(void)
+{
+	pr_info("on_module_mounted!\n");
+	ksu_module_mounted = true;
+}
+
+// boot-completed stage hook: shuts down the ksud escape path, marks boot as
+// completed and calls track_throne(true).
+void on_boot_completed(void)
+{
+	ksud_escape_exit();
+
+	ksu_boot_completed = true;
+	pr_info("on_boot_completed!\n");
+	track_throne(true);
+}
+
+// original ->read / ->read_iter of the proxied init.rc file
+static ssize_t (*orig_read)(struct file *, char __user *, size_t, loff_t *);
+static ssize_t (*orig_read_iter)(struct kiocb *, struct iov_iter *);
+// private copy of the file's file_operations with the read hooks swapped in
+static struct file_operations fops_proxy;
+// number of KERNEL_SU_RC bytes already handed to the reader
+static ssize_t ksu_rc_pos = 0;
+// length of KERNEL_SU_RC without its terminating NUL
+const size_t ksu_rc_len = sizeof(KERNEL_SU_RC) - 1;
+
+// https://cs.android.com/android/platform/superproject/main/+/main:system/core/init/parser.cpp;l=144;drc=61197364367c9e404c7da6900658f1b16c42d0da
+// https://cs.android.com/android/platform/superproject/main/+/main:system/libbase/file.cpp;l=241-243;drc=61197364367c9e404c7da6900658f1b16c42d0da
+// The system will read init.rc file until EOF, whenever read() returns 0,
+// so we begin append ksu rc when we meet EOF.
+
+// ->read replacement for init.rc.
+//
+// Passes reads through to the original handler until it reports EOF (0),
+// then serves KERNEL_SU_RC from ksu_rc_pos onwards, possibly across several
+// calls. Once the whole fragment has been delivered, EOF is passed through.
+// If copying the fragment to user space fails, nothing is appended and the
+// current return value (0) is returned, so the reader just sees EOF.
+static ssize_t read_proxy(struct file *file, char __user *buf, size_t count, loff_t *pos)
+{
+	ssize_t ret = 0;
+	size_t append_count;
+
+	// an append is already in progress: keep serving the fragment
+	if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len)
+		goto append_ksu_rc;
+
+	ret = orig_read(file, buf, count, pos);
+	if (ret != 0 || ksu_rc_pos >= ksu_rc_len)
+		return ret;
+
+	pr_info("read_proxy: orig read finished, start append rc\n");
+
+append_ksu_rc:
+	// ret is always 0 here, so the whole user buffer is available
+	append_count = min_t(size_t, ksu_rc_len - ksu_rc_pos, count);
+
+	// copy_to_user returns the number of bytes NOT copied
+	if (copy_to_user(buf, KERNEL_SU_RC + ksu_rc_pos, append_count)) {
+		pr_info("read_proxy: append error, totally appended %zd\n", ksu_rc_pos);
+	} else {
+		pr_info("read_proxy: append %zu\n", append_count);
+
+		ksu_rc_pos += append_count;
+		if (ksu_rc_pos == ksu_rc_len) {
+			pr_info("read_proxy: append done\n");
+		}
+		ret += append_count;
+	}
+
+	return ret;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) || defined(KSU_HAS_FOP_READ_ITER)
+// ->read_iter replacement for init.rc; same scheme as read_proxy().
+//
+// Passes reads through until the original handler reports EOF (0), then
+// copies as much of the remaining KERNEL_SU_RC as fits into the iterator.
+// If nothing could be copied, the current return value (0) is returned.
+static ssize_t read_iter_proxy(struct kiocb *iocb, struct iov_iter *to)
+{
+	ssize_t ret = 0;
+	size_t append_count;
+
+	// an append is already in progress: keep serving the fragment
+	if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len)
+		goto append_ksu_rc;
+
+	ret = orig_read_iter(iocb, to);
+	if (ret != 0 || ksu_rc_pos >= ksu_rc_len)
+		return ret;
+
+	pr_info("read_iter_proxy: orig read finished, start append rc\n");
+
+append_ksu_rc:
+	// copy_to_iter returns the number of copied bytes
+	append_count = copy_to_iter((void *)KERNEL_SU_RC + ksu_rc_pos, ksu_rc_len - ksu_rc_pos, to);
+	if (!append_count) {
+		pr_info("read_iter_proxy: append error, totally appended %zd\n", ksu_rc_pos);
+	} else {
+		pr_info("read_iter_proxy: append %zu\n", append_count);
+
+		ksu_rc_pos += append_count;
+		if (ksu_rc_pos == ksu_rc_len) {
+			pr_info("read_iter_proxy: append done\n");
+		}
+		ret += append_count;
+	}
+	return ret;
+}
+#endif
+
+// Return true when the current task is named "init" and `fp` is a regular
+// file whose full path is /init.rc or /system/etc/init/hw/init.rc.
+static bool is_init_rc(struct file *fp)
+{
+	// only the `init` process is of interest
+	if (strcmp(current->comm, "init") != 0)
+		return false;
+
+	if (!d_is_reg(fp->f_path.dentry))
+		return false;
+
+	// cheap basename check before resolving the full path
+	if (strcmp(fp->f_path.dentry->d_name.name, "init.rc") != 0)
+		return false;
+
+	char buf[256] = {0};
+	char *full = d_path(&fp->f_path, buf, sizeof(buf));
+
+	if (IS_ERR(full))
+		return false;
+
+	if (strcmp(full, "/init.rc") != 0 && strcmp(full, "/system/etc/init/hw/init.rc") != 0)
+		return false;
+
+	pr_info("%s: %s \n", __func__, full);
+
+	return true;
+}
+
+// Called on a read by init: when `file` is init.rc, run the one-time
+// second-stage setup and replace the file's file_operations with a proxy
+// whose read/read_iter append KERNEL_SU_RC. A later call for init.rc, after
+// the hook was installed, stops the vfs_read hook instead.
+__attribute__((cold))
+static noinline void ksu_install_rc_hook(struct file *file)
+{
+	if (!is_init(current_cred()))
+		return;
+
+	if (!is_init_rc(file)) {
+		return;
+	}
+
+	// we only process the first read
+	static bool rc_hooked = false;
+	if (rc_hooked) {
+		// the hook is already installed, so this probe is no longer needed
+		stop_vfs_read_hook();
+		return;
+	}
+	rc_hooked = true;
+
+	// since domains already exist at this point, SELinux is initialized and
+	// we can apply our rules
+	// https://github.com/LineageOS/android_system_core_old/blob/ecbcdafc3/init/init.cpp#L669
+	pr_info("%s: init.rc second stage, fp: 0x%lx \n", __func__, (uintptr_t)file);
+	apply_kernelsu_rules();
+	cache_sid();
+	setup_ksu_cred();
+	ksu_grab_init_session_keyring();
+
+	// now we can be sure that the init process is reading init.rc
+
+	pr_info("read init.rc, comm: %s, rc_count: %zu\n", current->comm, ksu_rc_len);
+
+	// Now we need to proxy the read and modify the result!
+	// But, we can not modify the file_operations directly, because it's in read-only memory.
+	// We just replace the whole file_operations with a proxy one.
+	memcpy(&fops_proxy, file->f_op, sizeof(struct file_operations));
+	// only hook the entry points the original file_operations provides
+	orig_read = file->f_op->read;
+	if (orig_read) {
+		fops_proxy.read = read_proxy;
+	}
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) || defined(KSU_HAS_FOP_READ_ITER)
+	orig_read_iter = file->f_op->read_iter;
+	if (orig_read_iter) {
+		fops_proxy.read_iter = read_iter_proxy;
+	}
+#endif
+	// replace the file_operations
+	file->f_op = &fops_proxy;
+
+	return;
+}
+
+// for sys_read kp / syscall table
+// Entry point for read(fd): while the vfs_read hook is active and the caller
+// is init, resolve `fd` to its file and hand it to ksu_install_rc_hook().
+__attribute__((cold))
+static noinline void ksu_handle_sys_read_fd(unsigned int fd)
+{
+	if (likely(!ksu_vfs_read_hook))
+		return;
+
+	if (!is_init(current_cred()))
+		return;
+
+	// fget() takes a file reference that is dropped by fput() below
+	struct file *file = fget(fd);
+	if (!file) {
+		return;
+	}
+	ksu_install_rc_hook(file);
+	fput(file);
+}
+
+#define STAT_NATIVE 0
+#define STAT_STAT64 1
+
+/*
+ * Post-fstat fixup: when init stats init.rc, add ksu_rc_len to the st_size
+ * already written to the user's stat buffer, so the reported size covers the
+ * fragment the read proxies append.
+ *
+ * @fd_int:       fd that was passed to the syscall
+ * @statbuf_ptr:  points at the saved user stat buffer pointer
+ * @type:         STAT_NATIVE for struct stat, STAT_STAT64 for struct stat64
+ * @syscall_name: only used in log messages
+ *
+ * st_size is read and written back through a kernel buffer of exactly the
+ * field's width: `long` for struct stat, `long long` for struct stat64.
+ * Using a single `long` for both would overflow it on 32-bit kernels, where
+ * the stat64 field is 8 bytes wide.
+ */
+__attribute__((cold))
+static noinline void ksu_common_newfstat_ret(unsigned int fd_int, void **statbuf_ptr,
+			const int type, const char *syscall_name)
+{
+	if (!is_init(current_cred()))
+		return;
+
+	struct file *file = fget(fd_int);
+	if (!file)
+		return;
+
+	bool is_rc = is_init_rc(file);
+	fput(file);
+	if (!is_rc)
+		return;
+
+	pr_info("%s: stat init.rc \n", syscall_name);
+
+	uintptr_t statbuf_ptr_local = (uintptr_t)*(void **)statbuf_ptr;
+	void __user *statbuf = (void __user *)statbuf_ptr_local;
+	if (!statbuf)
+		return;
+
+	long native_size = 0;
+	long long wide_size = 0;
+	long long old_size, new_size;
+	bool is_wide = false;
+	void *kbuf = &native_size;
+	size_t len = sizeof(native_size);
+	void __user *st_size_ptr = statbuf + offsetof(struct stat, st_size);
+
+#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
+	if (type) {
+		st_size_ptr = statbuf + offsetof(struct stat64, st_size);
+		kbuf = &wide_size;
+		len = sizeof(wide_size);
+		is_wide = true;
+	}
+#endif
+
+	// we do this for kretprobe's reusability
+	// this is pretty short, so nbd
+	bool got_flipped = false;
+	if (!preemptible()) {
+		preempt_enable();
+		got_flipped = true;
+	}
+
+	if (ksu_copy_from_user_retry(kbuf, st_size_ptr, len)) {
+		pr_info("%s: read statbuf 0x%lx failed \n", syscall_name, (unsigned long)st_size_ptr);
+		goto out;
+	}
+
+	old_size = is_wide ? wide_size : (long long)native_size;
+	new_size = old_size + (long long)ksu_rc_len;
+	if (is_wide)
+		wide_size = new_size;
+	else
+		native_size = (long)new_size;
+	pr_info("%s: adding ksu_rc_len: %lld -> %lld \n", syscall_name, old_size, new_size);
+
+	if (!copy_to_user(st_size_ptr, kbuf, len))
+		pr_info("%s: added ksu_rc_len \n", syscall_name);
+	else
+		pr_info("%s: add ksu_rc_len failed: statbuf 0x%lx \n", syscall_name, (unsigned long)st_size_ptr);
+
+out:
+	// restore the preemption state we found on entry
+	if (got_flipped)
+		preempt_disable();
+
+	return;
+}
+
+// newfstat return hook: while the vfs_read hook is active (static key when
+// jump labels are available, plain flag otherwise), forward to
+// ksu_common_newfstat_ret() for the native struct stat layout.
+void ksu_handle_newfstat_ret(unsigned int *fd, struct stat __user **statbuf_ptr)
+{
+#ifdef KSU_CAN_USE_JUMP_LABEL
+	if (static_branch_likely(&ksud_vfs_read_key))
+		ksu_common_newfstat_ret(*fd, (void **)statbuf_ptr, STAT_NATIVE, "sys_newfstat");
+#else
+	if (unlikely(ksu_vfs_read_hook))
+		ksu_common_newfstat_ret(*fd, (void **)statbuf_ptr, STAT_NATIVE, "sys_newfstat");
+#endif
+}
+
+#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
+// fstat64 return hook: same gating as ksu_handle_newfstat_ret(), but for the
+// struct stat64 layout. `fd` arrives as unsigned long and is read through an
+// unsigned int pointer, which only picks the right bytes on little-endian.
+void ksu_handle_fstat64_ret(unsigned long *fd, struct stat64 __user **statbuf_ptr)
+{
+#ifdef KSU_CAN_USE_JUMP_LABEL
+	if (static_branch_likely(&ksud_vfs_read_key))
+		ksu_common_newfstat_ret(*(unsigned int *)fd, (void **)statbuf_ptr, STAT_STAT64, "sys_fstat64"); // WARNING: LE-only!!!
+#else
+	if (unlikely(ksu_vfs_read_hook))
+		ksu_common_newfstat_ret(*(unsigned int *)fd, (void **)statbuf_ptr, STAT_STAT64, "sys_fstat64"); // WARNING: LE-only!!!
+#endif
+}
+#endif
+
+// set by vol_detector_event() once a volume key hit the press threshold
+static bool safe_mode_flag = false;
+#define VOLUME_PRESS_THRESHOLD_COUNT 3
+
+// Report whether safe mode was requested through the volume keys.
+// The first query stops the input hook; a positive answer is cached because
+// userspace may ask several times.
+bool ksu_is_safe_mode()
+{
+	static bool already_checked = false;
+
+	if (!already_checked) {
+		// stop hook first!
+		stop_input_hook();
+
+		if (!safe_mode_flag)
+			return false;
+
+		pr_info("volume keys pressed max times, safe mode detected!\n");
+		already_checked = true;
+	}
+
+	return true;
+}
+
+// Input event callback: counts volume-up and volume-down presses and raises
+// safe_mode_flag once either count reaches VOLUME_PRESS_THRESHOLD_COUNT.
+// Events with value 0 and non-EV_KEY events are ignored.
+static void vol_detector_event(struct input_handle *handle, unsigned int type, unsigned int code, int value)
+{
+	static int vol_up_cnt = 0;
+	static int vol_down_cnt = 0;
+
+	if (!value || type != EV_KEY)
+		return;
+
+	switch (code) {
+	case KEY_VOLUMEDOWN:
+		vol_down_cnt++;
+		pr_info("KEY_VOLUMEDOWN press detected!\n");
+		break;
+	case KEY_VOLUMEUP:
+		vol_up_cnt++;
+		pr_info("KEY_VOLUMEUP press detected!\n");
+		break;
+	default:
+		break;
+	}
+
+	pr_info("volume_pressed_count: vol_up: %d vol_down: %d\n", vol_up_cnt, vol_down_cnt);
+
+	/*
+	 * Upstream calls stop_input_hook() here, but that caused issues:
+	 * 1. unregistering an input handler from inside its own event
+	 *    callback is a bad idea;
+	 * 2. deferring the unregister to a kthread also caused issues for
+	 *    some users (cause unknown).
+	 * The handler gets unregistered from ksu_is_safe_mode() or
+	 * on_post_fs_data() anyway, so we don't bother here.
+	 */
+	if (vol_up_cnt >= VOLUME_PRESS_THRESHOLD_COUNT || vol_down_cnt >= VOLUME_PRESS_THRESHOLD_COUNT) {
+		pr_info("volume keys pressed max times, safe mode detected!\n");
+		safe_mode_flag = true;
+	}
+}
+
+// Input handler connect callback: allocate a handle for `dev`, register it
+// and open the device. Returns 0 on success or a negative errno; on failure
+// everything set up so far is undone.
+static int vol_detector_connect(struct input_handler *handler, struct input_dev *dev,
+					  const struct input_device_id *id)
+{
+	struct input_handle *handle;
+	int error;
+
+	handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	handle->dev = dev;
+	handle->handler = handler;
+	handle->name = "ksu_handle_input";
+
+	error = input_register_handle(handle);
+	if (error)
+		goto err_free_handle;
+
+	error = input_open_device(handle);
+	if (error)
+		goto err_unregister_handle;
+
+	return 0;
+
+	// unwind in reverse order of setup
+err_unregister_handle:
+	input_unregister_handle(handle);
+err_free_handle:
+	kfree(handle);
+	return error;
+}
+
+// Match any input device that reports EV_KEY with a volume-up or a
+// volume-down key.
+static const struct input_device_id vol_detector_ids[] = { 
+	// volume up is matched in addition to volume down so that
+	// 1. a device with a broken volume-down key is presumably still covered
+	// 2. we can make sure to trigger only ksu safemode, not android's safemode.
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+		.evbit = { BIT_MASK(EV_KEY) },
+		.keybit = { [BIT_WORD(KEY_VOLUMEUP)] = BIT_MASK(KEY_VOLUMEUP) },
+	},
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+		.evbit = { BIT_MASK(EV_KEY) },
+		.keybit = { [BIT_WORD(KEY_VOLUMEDOWN)] = BIT_MASK(KEY_VOLUMEDOWN) },
+	},
+	{ } // terminating empty entry
+};
+
+// Input handler disconnect callback: undoes vol_detector_connect() in
+// reverse order (close the device, unregister and free the handle).
+static void vol_detector_disconnect(struct input_handle *handle)
+{
+	input_close_device(handle);
+	input_unregister_handle(handle);
+	kfree(handle);
+}
+
+MODULE_DEVICE_TABLE(input, vol_detector_ids);
+
+// Input handler tying the volume-key callbacks to the devices matched by
+// vol_detector_ids.
+static struct input_handler vol_detector_handler = {
+        .event =	vol_detector_event,
+        .connect =	vol_detector_connect,
+        .disconnect =	vol_detector_disconnect,
+        .name =		"ksu",
+        .id_table =	vol_detector_ids,
+};
+
+// Register the volume-key input handler.
+// Returns the result of input_register_handler().
+static int vol_detector_init()
+{
+	pr_info("vol_detector: init\n");
+	return input_register_handler(&vol_detector_handler);
+}
+
+// Unregister the volume-key input handler. Always returns 0.
+static int vol_detector_exit()
+{
+	pr_info("vol_detector: exit\n");
+	input_unregister_handler(&vol_detector_handler);
+	return 0;
+}
+
+// Watchdog kthread for the case where no ksud ever calls
+// on_post_fs_data / ksu_is_safe_mode / on_boot_completed: without it the
+// input handler and the extra execve branch would stay around forever.
+// It polls ksu_input_hook every 5 s, up to 12 times (60 s in total, which is
+// more than enough time from second stage to decrypt / post-fs-data). If the
+// hook is still active after that, the hooks are closed from here.
+static int ksu_hook_watchdog(void *data)
+{
+	unsigned int round;
+
+	set_user_nice(current, 19); // low prio
+	pr_info("%s: kthread init!\n", __func__);
+
+	for (round = 0; round < 12; round++) {
+		// somebody already stopped the input hook: nothing left to do
+		if (!*(volatile bool *)&ksu_input_hook)
+			goto bail;
+
+		msleep(5000);
+	}
+
+	// if this path gets triggered, it means there is no ksud
+	pr_info("%s: ksud probably absent, closing hooks!\n", __func__);
+
+	// close down input hook
+	stop_input_hook();
+
+	// close down ksud escape
+	ksud_escape_exit();
+	ksu_boot_completed = true;
+
+bail:
+	pr_info("%s: kthread exit!\n", __func__);
+	return 0;
+}
+
+// Disable the init.rc read/fstat hooks and start the watchdog kthread that
+// closes the remaining hooks if ksud never shows up. A failure to start the
+// watchdog is logged, since those hooks would then stay active until
+// something else stops them.
+static void stop_vfs_read_hook()
+{
+	struct task_struct *tsk;
+
+	ksu_vfs_read_hook = false;
+	pr_info("stop vfs_read_hook\n");
+	ksu_disable_vfs_read_branch();
+
+	tsk = kthread_run(ksu_hook_watchdog, NULL, "watchdog");
+	if (IS_ERR(tsk))
+		pr_err("%s: failed to start watchdog kthread: %ld\n", __func__, PTR_ERR(tsk));
+}
+
+// Unregister the volume-key input handler. Does nothing if
+// ksu_input_hook is already cleared, so repeated calls are harmless.
+static void stop_input_hook()
+{
+	if (!ksu_input_hook) { return; }
+	ksu_input_hook = false;
+	pr_info("stop input_hook\n");
+	
+	vol_detector_exit();
+}
+
+// Init-time setup: initializes the ksud escape path and registers the
+// volume-key detector. The result of vol_detector_init() is not checked.
+void __init ksu_ksud_init()
+{
+	ksud_escape_init();
+	vol_detector_init();
+}
+
diff --git a/drivers/kernelsu/runtime/ksud.h b/drivers/kernelsu/runtime/ksud.h
new file mode 100644
index 000000000000..4461843407c3
--- /dev/null
+++ b/drivers/kernelsu/runtime/ksud.h
@@ -0,0 +1,26 @@
+#ifndef __KSU_H_KSUD
+#define __KSU_H_KSUD
+
+#define KSUD_PATH "/data/adb/ksud"
+
+void ksu_ksud_init();
+void ksu_ksud_exit();
+
+void on_post_fs_data(void);
+void on_module_mounted(void);
+void on_boot_completed(void);
+
+bool ksu_is_safe_mode(void);
+
+int nuke_ext4_sysfs(const char* mnt);
+
+static noinline void ksu_install_rc_hook(struct file *file);
+
+extern u32 ksu_file_sid;
+
+static bool ksu_module_mounted __read_mostly;
+static bool ksu_boot_completed __read_mostly;
+static bool ksu_vfs_read_hook __read_mostly;
+static bool ksu_input_hook __read_mostly;
+
+#endif
diff --git a/drivers/kernelsu/runtime/ksud_escape.c b/drivers/kernelsu/runtime/ksud_escape.c
new file mode 100644
index 000000000000..974d5859eece
--- /dev/null
+++ b/drivers/kernelsu/runtime/ksud_escape.c
@@ -0,0 +1,213 @@
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
+#if defined(CONFIG_KRETPROBES)
+#include <linux/kprobes.h>
+static u32 cached_su_sid __read_mostly;
+static u32 cached_init_sid __read_mostly;
+
+// kretprobe entry hook for: int security_bounded_transition(u32 old_sid, u32 new_sid)
+static int bounded_transition_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	// stash both arguments in the per-instance buffer so the return handler can read them
+	u32 *saved = (u32 *)ri->data;
+
+	saved[0] = PT_REGS_PARM1(regs); // first argument: old_sid
+	saved[1] = PT_REGS_PARM2(regs); // second argument: new_sid
+	return 0;
+}
+
+static int bounded_transition_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	// SIDs recorded by the entry handler: [0] = old_sid, [1] = new_sid
+	u32 *saved = (u32 *)ri->data;
+	u32 old_sid = saved[0];
+	u32 new_sid = saved[1];
+
+	// Leave the probed function's result untouched while the su SID is not
+	// cached yet, and for every SID pair other than init -> ksu.
+	if (!cached_su_sid || old_sid != cached_init_sid || new_sid != cached_su_sid)
+		return 0;
+
+	// init -> ksu: overwrite the return value so the probed function reports 0
+	pr_info("security_bounded_transition: allowing init (%d) -> ksu (%d) \n", old_sid, new_sid);
+	PT_REGS_RC(regs) = 0;
+
+	return 0;
+}
+
+static struct kretprobe bounded_transition_rp = {
+	.kp.symbol_name = "security_bounded_transition", // probed function, resolved by name
+	.handler = bounded_transition_ret_handler, // return side: may rewrite the return value
+	.entry_handler = bounded_transition_entry_handler, // entry side: records the two SIDs
+	.data_size = sizeof(u32) * 2, // need to keep 2x u32's, one per sid
+	.maxactive = 20, // NOTE(review): cap on simultaneously tracked calls; why 20 is not stated here
+};
+
+static int kp_ksud_transition_unregister(void *data)
+{
+	// kthread body started by ksud_escape_exit(); data is unused and 0 is always returned
+	msleep(1000); // one-second delay before the probe is removed
+	unregister_kretprobe(&bounded_transition_rp);
+	pr_info("kp_ksud: unregister rp: security_bounded_transition\n");
+	return 0;
+}
+
+static void kp_ksud_transition_routine_start()
+{
+	static bool already_ran = false; // one-shot latch: only the first call registers
+	int ret;
+
+	if (already_ran)
+		return;
+	ret = register_kretprobe(&bounded_transition_rp);
+	pr_info("kp_ksud: register rp: security_bounded_transition ret: %d\n", ret);
+	already_ran = true; // latched whatever register_kretprobe() returned
+}
+#else
+__attribute__((cold)) static noinline void sys_execve_escape_ksud_internal(void *filename)
+{
+#ifdef KSU_CAN_USE_JUMP_LABEL
+	if (ksu_boot_completed) { // boot flagged complete: switch the static key off and stop here
+		pr_info("sys_execve: boot completed, remove escape branch\n");
+		static_branch_disable(&ksud_escape_key);
+		smp_mb();
+		return;
+	}
+#endif
+	// Slow path of the execve escape: only init exec'ing exactly KSUD_PATH is elevated below.
+	// see if its init
+	if (!is_init(current_cred()))
+		return;
+
+	const char ksud_path[] = KSUD_PATH;
+	char path[sizeof(ksud_path)]; // room for KSUD_PATH plus its NUL and nothing longer
+
+	// filename is the address of the caller's user-space path pointer (char __user **)
+	const char __user **filename_user = (const char __user **)filename;
+
+	// see if it is trying to execute ksud; a nonzero result from the copy helper means give up
+	if (ksu_copy_from_user_retry(path, *filename_user, sizeof(path)))
+		return;
+	// all sizeof(path) bytes are compared, NUL included, so only an exact path match gets through
+	if (likely(!!memcmp(ksud_path, path, sizeof(path))))
+		return;
+
+	pr_info("sys_execve: escape init executing %s with pid: %d\n", path, current->pid);
+	escape_to_root_forced(); // give this context all permissions
+	return;
+}
+
+__attribute__((cold)) static noinline void kernel_execve_escape_ksud_internal(void *filename)
+{
+#ifdef KSU_CAN_USE_JUMP_LABEL
+	if (ksu_boot_completed) { // boot flagged complete: switch the static key off and stop here
+		pr_info("kernel_execve: boot completed, remove escape branch\n");
+		static_branch_disable(&ksud_escape_key);
+		smp_mb();
+		return;
+	}
+#endif
+	// filename is void **: the address of the caller's path pointer, read directly (no user copy)
+	const char **filename_ptr = (const char **)filename;
+
+	// see if its init
+	if (!is_init(current_cred()))
+		return;
+
+	if (!*filename_ptr)
+		return;
+	// strcmp() stops at the terminating NUL, so a path shorter than KSUD_PATH is never read past its end
+	if (likely(!!strcmp(*filename_ptr, KSUD_PATH)))
+		return;
+
+	pr_info("kernel_execve: escape init executing %s with pid: %d\n", *filename_ptr, current->pid);
+	escape_to_root_forced(); // give this context all permissions
+	return;
+}
+#endif // KRETPROBES
+#endif // < 4.14 && >= 4.2
+
+// UL bprm_set_creds handling
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0)
+static uintptr_t selinux_ops_addr;
+static int (*orig_bprm_set_creds)(struct linux_binprm *bprm) = NULL;
+
+static int ksu_unregister_bprm_set_creds(void *data)
+{
+	struct security_operations *ops = (struct security_operations *)selinux_ops_addr; // table address is set outside this view
+	if (orig_bprm_set_creds) { // nothing saved means nothing to put back
+		pr_info("%s: restoring: bprm_set_creds 0x%lx -> 0x%lx\n", __func__, (long)ops->bprm_set_creds, (long)orig_bprm_set_creds);
+		ops->bprm_set_creds = orig_bprm_set_creds;
+	}
+	// used as the stop_machine() callback in hook_bprm_set_creds(); data is unused, result is always 0
+	return 0;
+}
+
+static int hook_bprm_set_creds(struct linux_binprm *bprm)
+{
+	// once boot is flagged complete, put the original hook back and defer to it
+	if (ksu_boot_completed)
+		goto unreg_bprm_set_creds;
+
+	// the override below is only for init exec'ing ksud; every other case goes to the original hook
+	if (!is_init(current_cred()))
+		goto bprm_set_creds;
+	if (!bprm->filename)
+		goto bprm_set_creds;
+	if (!!strcmp(bprm->filename, KSUD_PATH))
+		goto bprm_set_creds;
+
+	struct task_security_struct *old_tsec = current_security();
+	struct task_security_struct *new_tsec = bprm->cred->security;
+
+	// no exec SID set on the current task: leave the decision to the original hook
+	if (!(old_tsec->exec_sid))
+		goto bprm_set_creds;
+
+	// we copy what selinux was doing
+	// ref: https://elixir.bootlin.com/linux/v3.0.101/source/security/selinux/hooks.c#L1971
+
+	/* Default to the current task SID. */
+	new_tsec->sid = old_tsec->sid;
+	new_tsec->osid = old_tsec->sid;
+
+	/* Reset fs, key, and sock SIDs on execve. */
+	new_tsec->create_sid = 0;
+	new_tsec->keycreate_sid = 0;
+	new_tsec->sockcreate_sid = 0;
+
+	/* Adopt the requested exec SID; the original hook is not consulted on this path. */
+	new_tsec->sid = old_tsec->exec_sid;
+	/* Reset exec SID on execve. */
+	new_tsec->exec_sid = 0;
+
+	pr_info("bprm_set_creds: allow init executing %s with pid: %d\n", bprm->filename, current->pid);
+	return 0;
+
+unreg_bprm_set_creds:
+	stop_machine(ksu_unregister_bprm_set_creds, NULL, NULL);
+
+bprm_set_creds:
+	return orig_bprm_set_creds(bprm);
+}
+#endif
+
+static void ksud_escape_init()
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) && defined(CONFIG_KRETPROBES)
+	kp_ksud_transition_routine_start(); // registers the security_bounded_transition kretprobe (first call only)
+#endif
+}
+
+static void ksud_escape_exit()
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) && defined(CONFIG_KRETPROBES)
+	// one-shot latch: the unhook kthread is spawned by the first call only
+	static bool already_ran = false;
+
+	if (!already_ran) {
+		already_ran = true;
+		// the probe is unregistered from its own kthread, not in the caller's context
+		kthread_run(kp_ksud_transition_unregister, NULL, "rp_unhook");
+	}
+#endif
+}
diff --git a/drivers/kernelsu/runtime/ksud_escape.h b/drivers/kernelsu/runtime/ksud_escape.h
new file mode 100644
index 000000000000..13ba5b9a5145
--- /dev/null
+++ b/drivers/kernelsu/runtime/ksud_escape.h
@@ -0,0 +1,41 @@
+#ifndef __KSU_H_KSUD_ESCAPE
+#define __KSU_H_KSUD_ESCAPE
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) && !defined(CONFIG_KRETPROBES)
+__attribute__((cold)) static noinline void sys_execve_escape_ksud_internal(void *filename);
+__attribute__((cold)) static noinline void kernel_execve_escape_ksud_internal(void *filename);
+
+#ifdef KSU_CAN_USE_JUMP_LABEL
+DEFINE_STATIC_KEY_TRUE(ksud_escape_key);
+static inline void sys_execve_escape_ksud(void *filename)
+{
+	if (static_branch_likely(&ksud_escape_key)) // key is defined true; the slow path disables it once boot completes
+		sys_execve_escape_ksud_internal(filename);
+}
+static inline void kernel_execve_escape_ksud(void *filename)
+{
+	if (static_branch_likely(&ksud_escape_key)) // key is defined true; the slow path disables it once boot completes
+		kernel_execve_escape_ksud_internal(filename);
+}
+#else
+static inline void sys_execve_escape_ksud(void *filename)
+{
+	if (unlikely(!ksu_boot_completed)) // no jump-label support: the flag is tested on every call
+		sys_execve_escape_ksud_internal(filename);
+}
+static inline void kernel_execve_escape_ksud(void *filename)
+{
+	if (unlikely(!ksu_boot_completed)) // no jump-label support: the flag is tested on every call
+		kernel_execve_escape_ksud_internal(filename);
+}
+#endif
+
+#else
+static inline void sys_execve_escape_ksud(void *filename) { } // no-op: the execve escape path is not built for this configuration
+static inline void kernel_execve_escape_ksud(void *filename) { } // no-op: the execve escape path is not built for this configuration
+#endif // < 4.14 && >= 4.2 && !KRETPROBES
+
+static void ksud_escape_init();
+static void ksud_escape_exit();
+
+#endif // __KSU_H_KSUD_ESCAPE
diff --git a/drivers/kernelsu/selinux/rules.c b/drivers/kernelsu/selinux/rules.c
index a2b9a7dde728..c51990b6b060 100644
--- a/drivers/kernelsu/selinux/rules.c
+++ b/drivers/kernelsu/selinux/rules.c
@@ -1,51 +1,62 @@
-#include <linux/uaccess.h>
-#include <linux/types.h>
-#include <linux/version.h>
-
-#include "../klog.h" // IWYU pragma: keep
-#include "selinux.h"
-#include "sepolicy.h"
-#include "ss/services.h"
-#include "linux/lsm_audit.h" // IWYU pragma: keep
-#include "xfrm.h"
-
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)
 #define SELINUX_POLICY_INSTEAD_SELINUX_SS
 #endif
 
 #define ALL NULL
 
-static struct policydb *get_policydb(void)
-{
-	struct policydb *db;
-// selinux_state does not exists before 4.19
-#ifdef KSU_COMPAT_USE_SELINUX_STATE
-#ifdef SELINUX_POLICY_INSTEAD_SELINUX_SS
-	struct selinux_policy *policy = selinux_state.policy;
-	db = &policy->policydb;
+#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))
+extern int avc_ss_reset(u32 seqno);
 #else
-	struct selinux_ss *ss = selinux_state.ss;
-	db = &ss->policydb;
+extern int avc_ss_reset(struct selinux_avc *avc, u32 seqno);
 #endif
+// reset avc cache table, otherwise the new rules will not take effect if already denied
+static void reset_avc_cache()
+{
+#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))
+	avc_ss_reset(0);
+	selnl_notify_policyload(0);
+	selinux_status_update_policyload(0);
 #else
-	db = &policydb;
+	struct selinux_avc *avc = selinux_state.avc;
+	avc_ss_reset(avc, 0);
+	selnl_notify_policyload(0);
+	selinux_status_update_policyload(&selinux_state, 0);
 #endif
-	return db;
+	selinux_xfrm_notify_policyload();
 }
 
-static DEFINE_MUTEX(ksu_rules);
-void apply_kernelsu_rules(void)
-{
-	struct policydb *db;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0)
 
-	if (!getenforce()) {
-		pr_info("SELinux permissive or disabled, apply rules!\n");
-	}
+#if defined(KSU_COMPAT_USE_SELINUX_STATE)
+static struct policydb *get_policydb(void) { return &selinux_state.ss->policydb; }
+#else
+static struct policydb *get_policydb(void) { return &policydb; }
+#endif
 
-	mutex_lock(&ksu_rules);
+// rwlock
+#if defined(KSU_COMPAT_USE_SELINUX_STATE)
+static inline rwlock_t *ksu_get_policy_rwlock() { return &selinux_state.ss->policy_rwlock; }
+#elif defined(KSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK)
+static inline rwlock_t *ksu_get_policy_rwlock() { extern rwlock_t policy_rwlock; return &policy_rwlock; }
+#elif defined(CONFIG_KALLSYMS)
+static noinline rwlock_t *ksu_get_policy_rwlock() { return (rwlock_t *)kallsyms_lookup_name("policy_rwlock"); }
+#else
+static inline rwlock_t *ksu_get_policy_rwlock() { return NULL; }
+#endif
 
-	db = get_policydb();
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) || defined(KSU_COMPAT_HAS_BACKPORTED_CPUS_PTR)
+static inline const cpumask_t *ksu_get_current_cpumask_t() { return current->cpus_ptr; }
+#else
+static inline cpumask_t *ksu_get_current_cpumask_t() { return &current->cpus_allowed; }
+#endif
+
+#endif // < 5.10
+
+static int apply_kernelsu_rules_fn(void *ptr)
+{
+	struct policydb *db = (struct policydb *)ptr;
 
+	ksu_type(db, KERNEL_SU_DOMAIN, "domain");
 	ksu_permissive(db, KERNEL_SU_DOMAIN);
 	ksu_typeattribute(db, KERNEL_SU_DOMAIN, "mlstrustedsubject");
 	ksu_typeattribute(db, KERNEL_SU_DOMAIN, "netdomain");
@@ -54,7 +65,7 @@ void apply_kernelsu_rules(void)
 	// Create unconstrained file type
 	ksu_type(db, KERNEL_SU_FILE, "file_type");
 	ksu_typeattribute(db, KERNEL_SU_FILE, "mlstrustedobject");
-	ksu_allow(db, ALL, KERNEL_SU_FILE, ALL, ALL);
+	ksu_allow(db, "domain", KERNEL_SU_FILE, ALL, ALL);
 
 	// allow all!
 	ksu_allow(db, KERNEL_SU_DOMAIN, ALL, ALL, ALL);
@@ -69,10 +80,10 @@ void apply_kernelsu_rules(void)
 
 	// our ksud triggered by init
 	ksu_allow(db, "init", KERNEL_SU_DOMAIN, ALL, ALL);
-#ifdef CONFIG_KSU_MANUAL_HOOK
+
+	// restored from https://github.com/tiann/KernelSU/pull/3031 
 	ksu_allow(db, "init", "adb_data_file", "file", ALL);
 	ksu_allow(db, "init", "adb_data_file", "dir", ALL); // #1289
-#endif
 
 	// copied from Magisk rules
 	// suRights
@@ -81,7 +92,7 @@ void apply_kernelsu_rules(void)
 	ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "open");
 	ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "read");
 	ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "process", "getattr");
-	ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "process", "sigchld");
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "process", "sigchld");
 
 	// allowLog
 	ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "dir", "search");
@@ -89,407 +100,606 @@ void apply_kernelsu_rules(void)
 	ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "open");
 	ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "getattr");
 
-	// dumpsys
-	ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fd", "use");
-	ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "write");
-	ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "read");
-	ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "open");
-	ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "getattr");
+	// dumpsys, send fd
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fd", "use");
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "write");
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "read");
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "open");
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "getattr");
 
 	// bootctl
 	ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "dir", "search");
 	ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "file", "read");
 	ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "file", "open");
-	ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "process",
-		  "getattr");
+	ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "process", "getattr");
 
 	// Allow all binder transactions
-	ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "binder", ALL);
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "binder", ALL);
 
 	// Allow system server kill su process
 	ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "getpgid");
 	ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "sigkill");
 
-	mutex_unlock(&ksu_rules);
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "read");
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "write");
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "connectto");
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "getopt");
+	ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "getattr");
+
+	return 0;
 }
 
-#define MAX_SEPOL_LEN 128
+void apply_kernelsu_rules()
+{
+	struct policydb *db;
+	// Applies the built-in KernelSU rule set to the SELinux policy, then resets the AVC cache.
+	if (!getenforce()) {
+		pr_info("SELinux permissive or disabled, apply rules!\n");
+	}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)
+	struct selinux_policy *pol, *old_pol;
+	mutex_lock(&selinux_state.policy_mutex);
+	// read the live policy only under policy_mutex, so a concurrent policy swap cannot free it under us
+	old_pol = rcu_dereference_protected(selinux_state.policy, lockdep_is_held(&selinux_state.policy_mutex));
+	pol = ksu_dup_sepolicy(old_pol);
+	if (IS_ERR(pol)) {
+		pr_err("failed to dup selinux_policy: %ld\n", PTR_ERR(pol));
+		goto out_unlock;
+	}
+	db = &pol->policydb;
+	apply_kernelsu_rules_fn((void *)db);
+	// publish the patched copy, wait for RCU readers of the old policy, then destroy the old one
+	rcu_assign_pointer(selinux_state.policy, pol);
+	synchronize_rcu();
+	ksu_destroy_sepolicy(old_pol);
+	reset_avc_cache();
+out_unlock:
+	mutex_unlock(&selinux_state.policy_mutex);
+#else
+
+	db = get_policydb();
 
-#define CMD_NORMAL_PERM 1
-#define CMD_XPERM 2
-#define CMD_TYPE_STATE 3
-#define CMD_TYPE 4
-#define CMD_TYPE_ATTR 5
-#define CMD_ATTR 6
-#define CMD_TYPE_TRANSITION 7
-#define CMD_TYPE_CHANGE 8
-#define CMD_GENFSCON 9
+	rwlock_t *lock = ksu_get_policy_rwlock();
+	if (!lock)
+		goto do_stop_machine;
+
+	/*
+	 * HACK: write_lock() is held with preempt enabled. DO NOT let the
+	 * task be migrated to any other CPU than the current CPU. And since
+	 * set_cpus_allowed_ptr() can sleep, use raw_smp_processor_id() to get
+	 * current CPU and bypass preemption checks.
+	 */
+	cpumask_t old_mask;
+	cpumask_copy(&old_mask, ksu_get_current_cpumask_t());
+	set_cpus_allowed_ptr(current, cpumask_of(raw_smp_processor_id()));
+
+	pr_info("%s: type: policy_rwlock \n", __func__);
+	write_lock(lock);
+	preempt_enable();
+
+	apply_kernelsu_rules_fn((void *)db);
+
+	preempt_disable();
+	write_unlock(lock);
+	set_cpus_allowed_ptr(current, &old_mask);
+	goto out_flush;
+
+do_stop_machine:
+	pr_info("%s: type: stop_machine()\n", __func__);
+	stop_machine(apply_kernelsu_rules_fn, (void *)db, NULL);
+
+out_flush:
+	smp_mb();
+	reset_avc_cache();
+#endif
+}
+
+#define KSU_SEPOLICY_MAX_BATCH_SIZE (8U * 1024U * 1024U)
+#define KSU_SEPOLICY_MAX_ARGS 5
 
 struct sepol_data {
 	u32 cmd;
 	u32 subcmd;
-	u64 sepol1;
-	u64 sepol2;
-	u64 sepol3;
-	u64 sepol4;
-	u64 sepol5;
-	u64 sepol6;
-	u64 sepol7;
 };
 
-static int get_object(char *buf, char __user *user_object, size_t buf_sz,
-		      char **object)
+struct sepol_batch_cursor {
+	const u8 *cur;
+	const u8 *end;
+};
+
+static size_t sepol_remaining(const struct sepol_batch_cursor *cursor)
 {
-	if (!user_object) {
-		*object = ALL;
-		return 0;
-	}
+	return (size_t)(cursor->end - cursor->cur);
+}
 
-	if (strncpy_from_user(buf, user_object, buf_sz) < 0) {
+static int sepol_read_cmd_header(struct sepol_batch_cursor *cursor, struct sepol_data *header)
+{
+	if (sepol_remaining(cursor) < sizeof(*header)) {
 		return -EINVAL;
 	}
 
-	*object = buf;
+	memcpy(header, cursor->cur, sizeof(*header));
+	cursor->cur += sizeof(*header);
 
 	return 0;
 }
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) ||                           \
-	!defined(KSU_COMPAT_USE_SELINUX_STATE)
-extern int avc_ss_reset(u32 seqno);
-#else
-extern int avc_ss_reset(struct selinux_avc *avc, u32 seqno);
-#endif
-// reset avc cache table, otherwise the new rules will not take effect if already denied
-static void reset_avc_cache(void)
+static int sepol_read_string(struct sepol_batch_cursor *cursor, const char **out)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) ||                           \
-	!defined(KSU_COMPAT_USE_SELINUX_STATE)
-	avc_ss_reset(0);
-	selnl_notify_policyload(0);
-	selinux_status_update_policyload(0);
-#else
-	struct selinux_avc *avc = selinux_state.avc;
-	avc_ss_reset(avc, 0);
-	selnl_notify_policyload(0);
-	selinux_status_update_policyload(&selinux_state, 0);
-#endif
-	selinux_xfrm_notify_policyload();
-}
-
-int handle_sepolicy(unsigned long arg3, void __user *arg4)
-{
-	struct policydb *db;
+	u32 len;
+	const char *str;
 
-	if (!arg4) {
+	if (sepol_remaining(cursor) < sizeof(len)) {
 		return -EINVAL;
 	}
 
-	if (!getenforce()) {
-		pr_info("SELinux permissive or disabled when handle policy!\n");
+	memcpy(&len, cursor->cur, sizeof(len));
+	cursor->cur += sizeof(len);
+
+	if (len >= sepol_remaining(cursor)) {
+		return -EINVAL;
 	}
 
-	struct sepol_data data;
-	if (copy_from_user(&data, arg4, sizeof(struct sepol_data))) {
-		pr_err("sepol: copy sepol_data failed.\n");
+	str = (const char *)cursor->cur;
+	if (memchr(str, '\0', len) != NULL || str[len] != '\0') {
 		return -EINVAL;
 	}
 
-	u32 cmd = data.cmd;
-	u32 subcmd = data.subcmd;
+	cursor->cur += len + 1;
+	if (len == 0) {
+		*out = ALL;
+		return 0;
+	}
+
+	*out = str;
+	return 0;
+}
 
-	mutex_lock(&ksu_rules);
+static int sepol_require_not_all(const char *value, const char *name)
+{
+	if (value != ALL) {
+		return 0;
+	}
 
-	db = get_policydb();
+	pr_err("sepol: %s cannot be ALL.\n", name);
+	return -EINVAL;
+}
 
-	int ret = -EINVAL;
+static int sepol_expected_argc(u32 cmd)
+{
 	switch (cmd) {
-	case CMD_NORMAL_PERM: {
-		char src_buf[MAX_SEPOL_LEN];
-		char tgt_buf[MAX_SEPOL_LEN];
-		char cls_buf[MAX_SEPOL_LEN];
-		char perm_buf[MAX_SEPOL_LEN];
+	case KSU_SEPOLICY_CMD_NORMAL_PERM:
+		return 4;
+	case KSU_SEPOLICY_CMD_XPERM:
+		return 5;
+	case KSU_SEPOLICY_CMD_TYPE_STATE:
+		return 1;
+	case KSU_SEPOLICY_CMD_TYPE:
+	case KSU_SEPOLICY_CMD_TYPE_ATTR:
+		return 2;
+	case KSU_SEPOLICY_CMD_ATTR:
+		return 1;
+	case KSU_SEPOLICY_CMD_TYPE_TRANSITION:
+		return 5;
+	case KSU_SEPOLICY_CMD_TYPE_CHANGE:
+		return 4;
+	case KSU_SEPOLICY_CMD_GENFSCON:
+		return 3;
+	default:
+		return -EINVAL;
+	}
+}
 
-		char *s, *t, *c, *p;
-		if (get_object(src_buf, (void __user *)data.sepol1,
-			       sizeof(src_buf), &s) < 0) {
-			pr_err("sepol: copy src failed.\n");
-			goto exit;
+static int apply_one_sepolicy_cmd(struct policydb *db, const struct sepol_data *header, const char **args)
+{
+	bool success = false;
+	int ret;
+
+	switch (header->cmd) {
+	case KSU_SEPOLICY_CMD_NORMAL_PERM:
+		if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_ALLOW) {
+			success = ksu_allow(db, args[0], args[1], args[2], args[3]);
+		} else if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DENY) {
+			success = ksu_deny(db, args[0], args[1], args[2], args[3]);
+		} else if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_AUDITALLOW) {
+			success = ksu_auditallow(db, args[0], args[1], args[2], args[3]);
+		} else if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DONTAUDIT) {
+			success = ksu_dontaudit(db, args[0], args[1], args[2], args[3]);
+		} else {
+			pr_err("sepol: unknown subcmd: %d\n", header->subcmd);
 		}
+		return success ? 0 : -EINVAL;
 
-		if (get_object(tgt_buf, (void __user *)data.sepol2,
-			       sizeof(tgt_buf), &t) < 0) {
-			pr_err("sepol: copy tgt failed.\n");
-			goto exit;
+	case KSU_SEPOLICY_CMD_XPERM:
+		ret = sepol_require_not_all(args[3], "operation");
+		if (ret < 0) {
+			return ret;
 		}
-
-		if (get_object(cls_buf, (void __user *)data.sepol3,
-			       sizeof(cls_buf), &c) < 0) {
-			pr_err("sepol: copy cls failed.\n");
-			goto exit;
+		ret = sepol_require_not_all(args[4], "perm_set");
+		if (ret < 0) {
+			return ret;
 		}
 
-		if (get_object(perm_buf, (void __user *)data.sepol4,
-			       sizeof(perm_buf), &p) < 0) {
-			pr_err("sepol: copy perm failed.\n");
-			goto exit;
+		if (header->subcmd == KSU_SEPOLICY_SUBCMD_XPERM_ALLOW) {
+			success = ksu_allowxperm(db, args[0], args[1], args[2], args[4]);
+		} else if (header->subcmd == KSU_SEPOLICY_SUBCMD_XPERM_AUDITALLOW) {
+			success = ksu_auditallowxperm(db, args[0], args[1], args[2], args[4]);
+		} else if (header->subcmd == KSU_SEPOLICY_SUBCMD_XPERM_DONTAUDIT) {
+			success = ksu_dontauditxperm(db, args[0], args[1], args[2], args[4]);
+		} else {
+			pr_err("sepol: unknown subcmd: %d\n", header->subcmd);
 		}
+		return success ? 0 : -EINVAL;
 
-		bool success = false;
+	case KSU_SEPOLICY_CMD_TYPE_STATE:
+		ret = sepol_require_not_all(args[0], "type");
+		if (ret < 0) {
+			return ret;
+		}
 
-		if (subcmd == 1) {
-			success = ksu_allow(db, s, t, c, p);
-		} else if (subcmd == 2) {
-			success = ksu_deny(db, s, t, c, p);
-		} else if (subcmd == 3) {
-			success = ksu_auditallow(db, s, t, c, p);
-		} else if (subcmd == 4) {
-			success = ksu_dontaudit(db, s, t, c, p);
+		if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_STATE_PERMISSIVE) {
+			success = ksu_permissive(db, args[0]);
+		} else if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_STATE_ENFORCE) {
+			success = ksu_enforce(db, args[0]);
 		} else {
-			pr_err("sepol: unknown subcmd: %d\n", subcmd);
+			pr_err("sepol: unknown subcmd: %d\n", header->subcmd);
 		}
-		ret = success ? 0 : -EINVAL;
-		break;
-	}
-	case CMD_XPERM: {
-		char src_buf[MAX_SEPOL_LEN];
-		char tgt_buf[MAX_SEPOL_LEN];
-		char cls_buf[MAX_SEPOL_LEN];
-
-		char __maybe_unused
-			operation[MAX_SEPOL_LEN]; // it is always ioctl now!
-		char perm_set[MAX_SEPOL_LEN];
-
-		char *s, *t, *c;
-		if (get_object(src_buf, (void __user *)data.sepol1,
-			       sizeof(src_buf), &s) < 0) {
-			pr_err("sepol: copy src failed.\n");
-			goto exit;
-		}
-		if (get_object(tgt_buf, (void __user *)data.sepol2,
-			       sizeof(tgt_buf), &t) < 0) {
-			pr_err("sepol: copy tgt failed.\n");
-			goto exit;
-		}
-		if (get_object(cls_buf, (void __user *)data.sepol3,
-			       sizeof(cls_buf), &c) < 0) {
-			pr_err("sepol: copy cls failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(operation, (void __user *)data.sepol4,
-				      sizeof(operation)) < 0) {
-			pr_err("sepol: copy operation failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(perm_set, (void __user *)data.sepol5,
-				      sizeof(perm_set)) < 0) {
-			pr_err("sepol: copy perm_set failed.\n");
-			goto exit;
-		}
-
-		bool success = false;
-		if (subcmd == 1) {
-			success = ksu_allowxperm(db, s, t, c, perm_set);
-		} else if (subcmd == 2) {
-			success = ksu_auditallowxperm(db, s, t, c, perm_set);
-		} else if (subcmd == 3) {
-			success = ksu_dontauditxperm(db, s, t, c, perm_set);
+		return success ? 0 : -EINVAL;
+
+	case KSU_SEPOLICY_CMD_TYPE:
+	case KSU_SEPOLICY_CMD_TYPE_ATTR:
+		ret = sepol_require_not_all(args[0], "type");
+		if (ret < 0) {
+			return ret;
+		}
+		ret = sepol_require_not_all(args[1], "attribute");
+		if (ret < 0) {
+			return ret;
+		}
+
+		if (header->cmd == KSU_SEPOLICY_CMD_TYPE) {
+			success = ksu_type(db, args[0], args[1]);
 		} else {
-			pr_err("sepol: unknown subcmd: %d\n", subcmd);
+			success = ksu_typeattribute(db, args[0], args[1]);
 		}
-		ret = success ? 0 : -EINVAL;
-		break;
-	}
-	case CMD_TYPE_STATE: {
-		char src[MAX_SEPOL_LEN];
+		if (!success) {
+			pr_err("sepol: %d failed.\n", header->cmd);
+			return -EINVAL;
+		}
+		return 0;
 
-		if (strncpy_from_user(src, (void __user *)data.sepol1,
-				      sizeof(src)) < 0) {
-			pr_err("sepol: copy src failed.\n");
-			goto exit;
+	case KSU_SEPOLICY_CMD_ATTR:
+		ret = sepol_require_not_all(args[0], "attribute");
+		if (ret < 0) {
+			return ret;
 		}
 
-		bool success = false;
-		if (subcmd == 1) {
-			success = ksu_permissive(db, src);
-		} else if (subcmd == 2) {
-			success = ksu_enforce(db, src);
-		} else {
-			pr_err("sepol: unknown subcmd: %d\n", subcmd);
+		if (!ksu_attribute(db, args[0])) {
+			pr_err("sepol: %d failed.\n", header->cmd);
+			return -EINVAL;
+		}
+		return 0;
+
+	case KSU_SEPOLICY_CMD_TYPE_TRANSITION: {
+		const char *object = ALL;
+
+		ret = sepol_require_not_all(args[0], "src");
+		if (ret < 0) {
+			return ret;
 		}
-		if (success)
-			ret = 0;
-		break;
+		ret = sepol_require_not_all(args[1], "tgt");
+		if (ret < 0) {
+			return ret;
+		}
+		ret = sepol_require_not_all(args[2], "cls");
+		if (ret < 0) {
+			return ret;
+		}
+		ret = sepol_require_not_all(args[3], "default_type");
+		if (ret < 0) {
+			return ret;
+		}
+
+		object = args[4];
+
+		success = ksu_type_transition(db, args[0], args[1], args[2], args[3], object);
+		return success ? 0 : -EINVAL;
 	}
-	case CMD_TYPE:
-	case CMD_TYPE_ATTR: {
-		char type[MAX_SEPOL_LEN];
-		char attr[MAX_SEPOL_LEN];
 
-		if (strncpy_from_user(type, (void __user *)data.sepol1,
-				      sizeof(type)) < 0) {
-			pr_err("sepol: copy type failed.\n");
-			goto exit;
+	case KSU_SEPOLICY_CMD_TYPE_CHANGE:
+		ret = sepol_require_not_all(args[0], "src");
+		if (ret < 0) {
+			return ret;
+		}
+		ret = sepol_require_not_all(args[1], "tgt");
+		if (ret < 0) {
+			return ret;
 		}
-		if (strncpy_from_user(attr, (void __user *)data.sepol2,
-				      sizeof(attr)) < 0) {
-			pr_err("sepol: copy attr failed.\n");
-			goto exit;
+		ret = sepol_require_not_all(args[2], "cls");
+		if (ret < 0) {
+			return ret;
+		}
+		ret = sepol_require_not_all(args[3], "default_type");
+		if (ret < 0) {
+			return ret;
 		}
 
-		bool success = false;
-		if (cmd == CMD_TYPE) {
-			success = ksu_type(db, type, attr);
+		if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_CHANGE) {
+			success = ksu_type_change(db, args[0], args[1], args[2], args[3]);
+		} else if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_MEMBER) {
+			success = ksu_type_member(db, args[0], args[1], args[2], args[3]);
 		} else {
-			success = ksu_typeattribute(db, type, attr);
-		}
-		if (!success) {
-			pr_err("sepol: %d failed.\n", cmd);
-			goto exit;
+			pr_err("sepol: unknown subcmd: %d\n", header->subcmd);
 		}
-		ret = 0;
-		break;
-	}
-	case CMD_ATTR: {
-		char attr[MAX_SEPOL_LEN];
+		return success ? 0 : -EINVAL;
 
-		if (strncpy_from_user(attr, (void __user *)data.sepol1,
-				      sizeof(attr)) < 0) {
-			pr_err("sepol: copy attr failed.\n");
-			goto exit;
+	case KSU_SEPOLICY_CMD_GENFSCON:
+		ret = sepol_require_not_all(args[0], "name");
+		if (ret < 0) {
+			return ret;
+		}
+		ret = sepol_require_not_all(args[1], "path");
+		if (ret < 0) {
+			return ret;
+		}
+		ret = sepol_require_not_all(args[2], "context");
+		if (ret < 0) {
+			return ret;
 		}
-		if (!ksu_attribute(db, attr)) {
-			pr_err("sepol: %d failed.\n", cmd);
-			goto exit;
+
+		if (!ksu_genfscon(db, args[0], args[1], args[2])) {
+			pr_err("sepol: %d failed.\n", header->cmd);
+			return -EINVAL;
 		}
-		ret = 0;
-		break;
+		return 0;
+
+	default:
+		pr_err("sepol: unknown cmd: %d\n", header->cmd);
+		return -EINVAL;
 	}
-	case CMD_TYPE_TRANSITION: {
-		char src[MAX_SEPOL_LEN];
-		char tgt[MAX_SEPOL_LEN];
-		char cls[MAX_SEPOL_LEN];
-		char default_type[MAX_SEPOL_LEN];
-		char object[MAX_SEPOL_LEN];
-
-		if (strncpy_from_user(src, (void __user *)data.sepol1,
-				      sizeof(src)) < 0) {
-			pr_err("sepol: copy src failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(tgt, (void __user *)data.sepol2,
-				      sizeof(tgt)) < 0) {
-			pr_err("sepol: copy tgt failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(cls, (void __user *)data.sepol3,
-				      sizeof(cls)) < 0) {
-			pr_err("sepol: copy cls failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(default_type, (void __user *)data.sepol4,
-				      sizeof(default_type)) < 0) {
-			pr_err("sepol: copy default_type failed.\n");
-			goto exit;
-		}
-		char *real_object;
-		if ((void __user *)data.sepol5 == NULL) {
-			real_object = NULL;
-		} else {
-			if (strncpy_from_user(object,
-					      (void __user *)data.sepol5,
-					      sizeof(object)) < 0) {
-				pr_err("sepol: copy object failed.\n");
-				goto exit;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)
+int handle_sepolicy(void __user *user_data, u64 data_len)
+{
+	struct selinux_policy *pol, *old_pol;
+	struct policydb *db;
+	struct sepol_batch_cursor cursor;
+	u8 *payload;
+	int ret;
+	int success_cmd_count;
+	u32 cmd_index;
+
+	if (!user_data || !data_len) {
+		return -EINVAL;
+	}
+
+	if (data_len > KSU_SEPOLICY_MAX_BATCH_SIZE) {
+		return -E2BIG;
+	}
+
+	payload = kvmalloc((size_t)data_len, GFP_KERNEL);
+	if (!payload) {
+		return -ENOMEM;
+	}
+
+	if (copy_from_user(payload, user_data, (size_t)data_len)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	if (!getenforce()) {
+		pr_info("SELinux permissive or disabled when handle policy!\n");
+	}
+
+	mutex_lock(&selinux_state.policy_mutex);
+
+	old_pol = selinux_state.policy;
+	pol = ksu_dup_sepolicy(rcu_dereference_protected(old_pol, lockdep_is_held(&selinux_state.policy_mutex)));
+	if (IS_ERR(pol)) {
+		ret = PTR_ERR(pol);
+		pr_err("ksu_dup_sepolicy err: %d\n", ret);
+		goto out_unlock;
+	}
+	db = &pol->policydb;
+
+	cursor.cur = payload;
+	cursor.end = payload + (size_t)data_len;
+
+	ret = 0;
+	success_cmd_count = 0;
+	cmd_index = 0;
+	while (cursor.cur < cursor.end) {
+		struct sepol_data header;
+		const char *args[KSU_SEPOLICY_MAX_ARGS] = { 0 };
+		int expected_argc;
+		u32 arg_index;
+
+		ret = sepol_read_cmd_header(&cursor, &header);
+		if (ret < 0) {
+			pr_err("sepol: failed to read cmd header #%u.\n", cmd_index);
+			goto out_drop_new_policy;
+		}
+
+		expected_argc = sepol_expected_argc(header.cmd);
+		if (expected_argc < 0 || expected_argc > KSU_SEPOLICY_MAX_ARGS) {
+			ret = -EINVAL;
+			pr_err("sepol: invalid cmd header #%u.\n", cmd_index);
+			goto out_drop_new_policy;
+		}
+
+		for (arg_index = 0; arg_index < (u32)expected_argc; arg_index++) {
+			ret = sepol_read_string(&cursor, &args[arg_index]);
+			if (ret < 0) {
+				pr_err("sepol: failed to read cmd #%u arg #%u.\n", cmd_index, arg_index);
+				goto out_drop_new_policy;
 			}
-			real_object = object;
 		}
 
-		bool success = ksu_type_transition(db, src, tgt, cls,
-						   default_type, real_object);
-		if (success)
-			ret = 0;
-		break;
-	}
-	case CMD_TYPE_CHANGE: {
-		char src[MAX_SEPOL_LEN];
-		char tgt[MAX_SEPOL_LEN];
-		char cls[MAX_SEPOL_LEN];
-		char default_type[MAX_SEPOL_LEN];
-
-		if (strncpy_from_user(src, (void __user *)data.sepol1,
-				      sizeof(src)) < 0) {
-			pr_err("sepol: copy src failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(tgt, (void __user *)data.sepol2,
-				      sizeof(tgt)) < 0) {
-			pr_err("sepol: copy tgt failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(cls, (void __user *)data.sepol3,
-				      sizeof(cls)) < 0) {
-			pr_err("sepol: copy cls failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(default_type, (void __user *)data.sepol4,
-				      sizeof(default_type)) < 0) {
-			pr_err("sepol: copy default_type failed.\n");
-			goto exit;
-		}
-		bool success = false;
-		if (subcmd == 1) {
-			success = ksu_type_change(db, src, tgt, cls,
-						  default_type);
-		} else if (subcmd == 2) {
-			success = ksu_type_member(db, src, tgt, cls,
-						  default_type);
+		ret = apply_one_sepolicy_cmd(db, &header, args);
+		if (ret < 0) {
+			pr_err("sepol: cmd #%u failed, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd);
 		} else {
-			pr_err("sepol: unknown subcmd: %d\n", subcmd);
+			success_cmd_count++;
+			int argc = sepol_expected_argc(header.cmd);
+			int i;
+			for (i = 0; i < argc; i++)
+				ksu_add_shit_to_list(args[i]);
 		}
-		if (success)
-			ret = 0;
-		break;
+		cmd_index++;
 	}
-	case CMD_GENFSCON: {
-		char name[MAX_SEPOL_LEN];
-		char path[MAX_SEPOL_LEN];
-		char context[MAX_SEPOL_LEN];
-		if (strncpy_from_user(name, (void __user *)data.sepol1,
-				      sizeof(name)) < 0) {
-			pr_err("sepol: copy name failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(path, (void __user *)data.sepol2,
-				      sizeof(path)) < 0) {
-			pr_err("sepol: copy path failed.\n");
-			goto exit;
-		}
-		if (strncpy_from_user(context, (void __user *)data.sepol3,
-				      sizeof(context)) < 0) {
-			pr_err("sepol: copy context failed.\n");
-			goto exit;
-		}
-
-		if (!ksu_genfscon(db, name, path, context)) {
-			pr_err("sepol: %d failed.\n", cmd);
-			goto exit;
-		}
-		ret = 0;
-		break;
+
+	rcu_assign_pointer(selinux_state.policy, pol);
+	synchronize_rcu();
+	ksu_destroy_sepolicy(old_pol);
+
+	reset_avc_cache();
+	ret = success_cmd_count;
+	goto out_unlock;
+
+out_drop_new_policy:
+	ksu_destroy_sepolicy(pol);
+out_unlock:
+	mutex_unlock(&selinux_state.policy_mutex);
+out_free:
+	kvfree(payload);
+
+	return ret;
+}
+#else
+
+struct handle_sepolicy_args {
+	void *ctx_success_cmd_count;
+	void *ctx_payload;
+	u64 ctx_data_len;
+};
+
+/*
+ * Batch worker for handle_sepolicy(): applies each command in @data's payload.
+ * Returns 0 if the payload parsed fully, else the error that aborted parsing.
+ */
+static int handle_sepolicy_fn(void *data)
+{
+	struct sepol_batch_cursor cursor;
+	int ret = 0;
+	u32 cmd_index = 0;
+	int success_cmd_count = 0;
+	struct policydb *db = get_policydb();
+	struct handle_sepolicy_args *ctx = (struct handle_sepolicy_args *)data;
+	u8 *payload = (u8 *)ctx->ctx_payload;
+	u64 data_len = ctx->ctx_data_len;
+
+	cursor.cur = payload;
+	cursor.end = payload + (size_t)data_len;
+
+	while (cursor.cur < cursor.end) {
+		struct sepol_data header;
+		const char *args[KSU_SEPOLICY_MAX_ARGS] = { 0 };
+		int expected_argc;
+		u32 arg_index;
+
+		ret = sepol_read_cmd_header(&cursor, &header);
+		if (ret < 0) {
+			pr_err("sepol: failed to read cmd header #%u.\n", cmd_index);
+			goto out;
+		}
+
+		expected_argc = sepol_expected_argc(header.cmd);
+		if (expected_argc < 0 || expected_argc > KSU_SEPOLICY_MAX_ARGS) {
+			ret = -EINVAL;
+			pr_err("sepol: invalid cmd header #%u.\n", cmd_index);
+			goto out;
+		}
+
+		for (arg_index = 0; arg_index < (u32)expected_argc; arg_index++) {
+			ret = sepol_read_string(&cursor, &args[arg_index]);
+			if (ret < 0) {
+				pr_err("sepol: failed to read cmd #%u arg #%u.\n", cmd_index, arg_index);
+				goto out;
+			}
+		}
+
+		if (apply_one_sepolicy_cmd(db, &header, args) < 0) {
+			pr_err("sepol: cmd #%u failed, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd);
+		} else {
+			pr_info("sepol: cmd #%u success, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd);
+			success_cmd_count++;
+			for (arg_index = 0; arg_index < (u32)expected_argc; arg_index++)
+				ksu_add_shit_to_list(args[arg_index]);
+		}
+		/* A rejected rule is only logged; it must not fail the whole batch. */
+		ret = 0;
+		cmd_index++;
 	}
-	default: {
-		pr_err("sepol: unknown cmd: %d\n", cmd);
-		break;
+
+out:
+	*(int *)(ctx->ctx_success_cmd_count) = success_cmd_count;
+	return ret;
+}
+
+/*
+ * Apply a userspace batch of sepolicy commands, under the policy rwlock if one
+ * exists, else via stop_machine(). Returns the applied count or an error code.
+ */
+int handle_sepolicy(void __user *user_data, u64 data_len)
+{
+	u8 *payload;
+	int ret = 0;
+	int success_cmd_count = 0;
+
+	if (!user_data || !data_len)
+		return -EINVAL;
+	if (data_len > KSU_SEPOLICY_MAX_BATCH_SIZE)
+		return -E2BIG;
+
+	payload = kvmalloc((size_t)data_len, GFP_KERNEL);
+	if (!payload)
+		return -ENOMEM;
+
+	if (copy_from_user(payload, user_data, (size_t)data_len)) {
+		ret = -EFAULT;
+		goto out_free;
 	}
+
+	if (!getenforce()) {
+		pr_info("SELinux permissive or disabled when handle policy!\n");
 	}
 
-exit:
-	mutex_unlock(&ksu_rules);
+	struct handle_sepolicy_args ctx = { 0 };
+	ctx.ctx_success_cmd_count = (void *)&success_cmd_count;
+	ctx.ctx_payload = (void *)payload;
+	ctx.ctx_data_len = (u64)data_len;
+
+	rwlock_t *lock = ksu_get_policy_rwlock();
+	if (!lock)
+		goto do_stop_machine;
+
+	cpumask_t old_mask;
+	cpumask_copy(&old_mask, ksu_get_current_cpumask_t());
+	set_cpus_allowed_ptr(current, cpumask_of(raw_smp_processor_id()));
+	write_lock(lock);
+	preempt_enable();
 
-	// only allow and xallow needs to reset avc cache, but we cannot do that because
-	// we are in atomic context. so we just reset it every time.
+	ret = handle_sepolicy_fn((void *)&ctx);
+	preempt_disable();
+	write_unlock(lock);
+	set_cpus_allowed_ptr(current, &old_mask);
+	goto out_done;
+
+do_stop_machine:
+	ret = stop_machine(handle_sepolicy_fn, (void *)&ctx, NULL);
+out_done:
+	/* Commands that succeeded before a failure changed the policydb; flush the AVC too. */
+	if (ret && !success_cmd_count)
+		goto out_free;
+	smp_mb();
 	reset_avc_cache();
+	ret = ret ? ret : success_cmd_count;
+
+out_free:
+	kvfree(payload);
 
 	return ret;
 }
+#endif
diff --git a/drivers/kernelsu/selinux/selinux.c b/drivers/kernelsu/selinux/selinux.c
index 010732dffd9b..d7c6a71d20c6 100644
--- a/drivers/kernelsu/selinux/selinux.c
+++ b/drivers/kernelsu/selinux/selinux.c
@@ -1,11 +1,3 @@
-#include "linux/cred.h"
-#include "linux/sched.h"
-#include "linux/security.h"
-#include "linux/version.h"
-#include "selinux_defs.h"
-#include "../klog.h" // IWYU pragma: keep
-#include "../ksu.h"
-
 /*
  * Cached SID values for frequently checked contexts.
  * These are resolved once at init and used for fast u32 comparison
@@ -23,60 +15,40 @@ static u32 cached_zygote_sid __read_mostly = 0;
 static u32 cached_init_sid __read_mostly = 0;
 u32 ksu_file_sid __read_mostly = 0;
 
-static int transive_to_domain(const char *domain, struct cred *cred)
+static int transive_to_domain(const char *domain, struct cred *cred, bool clear_exec_sid)
 {
-	taskcred_sec_t *tsec;
 	u32 sid;
 	int error;
-
-	tsec = (taskcred_sec_t *)selinux_cred(cred);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 18, 0)
+	struct task_security_struct *tsec;
+#else
+	struct cred_security_struct *tsec;
+#endif
+	tsec = selinux_cred(cred);
 	if (!tsec) {
 		pr_err("tsec == NULL!\n");
 		return -1;
 	}
 	error = security_secctx_to_secid(domain, strlen(domain), &sid);
 	if (error) {
-		pr_info("security_secctx_to_secid %s -> sid: %d, error: %d\n",
-			domain, sid, error);
+		pr_info("security_secctx_to_secid %s -> sid: %d, error: %d\n", domain,
+				sid, error);
 	}
 	if (!error) {
 		tsec->sid = sid;
 		tsec->create_sid = 0;
 		tsec->keycreate_sid = 0;
 		tsec->sockcreate_sid = 0;
+		if (clear_exec_sid) {
+			tsec->exec_sid = 0;
+		}
 	}
 	return error;
 }
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(4, 19, 0)
-bool __maybe_unused
-is_ksu_transition(const struct task_security_struct *old_tsec,
-		  const struct task_security_struct *new_tsec)
+void setup_selinux(const char *domain, struct cred *cred)
 {
-	static u32 ksu_sid;
-	char *secdata;
-	int err;
-	u32 seclen;
-	bool allowed = false;
-
-	if (!ksu_sid) {
-		err = security_secctx_to_secid(
-			KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &ksu_sid);
-		pr_err("failed to get ksu_sid: %d\n", err);
-	}
-
-	if (security_secid_to_secctx(old_tsec->sid, &secdata, &seclen))
-		return false;
-
-	allowed = (!strcmp("u:r:init:s0", secdata) && new_tsec->sid == ksu_sid);
-	security_release_secctx(secdata, seclen);
-	return allowed;
-}
-#endif
-
-void setup_selinux(const char *domain)
-{
-	if (transive_to_domain(domain, (struct cred *)__task_cred(current))) {
+	if (transive_to_domain(domain, cred, false)) {
 		pr_err("transive domain failed.\n");
 		return;
 	}
@@ -84,24 +56,65 @@ void setup_selinux(const char *domain)
 
 void setup_ksu_cred(void)
 {
-	if (ksu_cred && transive_to_domain(KERNEL_SU_CONTEXT, ksu_cred)) {
+	if (ksu_cred && transive_to_domain(KERNEL_SU_CONTEXT, ksu_cred, false)) {
 		pr_err("setup ksu cred failed.\n");
 	}
 }
 
 void setenforce(bool enforce)
 {
-	do_setenforce(enforce);
+#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
+#ifdef KSU_COMPAT_USE_SELINUX_STATE
+	selinux_state.enforcing = enforce;
+#else
+	selinux_enforcing = enforce;
+#endif
+#endif
 }
 
 bool getenforce(void)
 {
-	if (is_selinux_disabled()) {
+#ifdef CONFIG_SECURITY_SELINUX_DISABLE
+#ifdef KSU_COMPAT_USE_SELINUX_STATE
+	if (selinux_state.disabled) {
 		return false;
 	}
+#else
+	if (selinux_disabled) {
+		return false;
+	}
+#endif // KSU_COMPAT_USE_SELINUX_STATE
+#endif // CONFIG_SECURITY_SELINUX_DISABLE
+
+#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
+#ifdef KSU_COMPAT_USE_SELINUX_STATE
+	return selinux_state.enforcing;
+#else
+	return selinux_enforcing;
+#endif
+#else
+	return true;
+#endif
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 14, 0)
+struct lsm_context {
+	char *context;
+	u32 len;
+};
 
-	return is_selinux_enforcing();
+static int __security_secid_to_secctx(u32 secid, struct lsm_context *cp)
+{
+	return security_secid_to_secctx(secid, &cp->context, &cp->len);
+}
+static void __security_release_secctx(struct lsm_context *cp)
+{
+	security_release_secctx(cp->context, cp->len);
 }
+#else
+#define __security_secid_to_secctx security_secid_to_secctx
+#define __security_release_secctx security_release_secctx
+#endif
 
 /*
  * Initialize cached SID values for frequently checked SELinux contexts.
@@ -112,8 +125,7 @@ void cache_sid(void)
 {
 	int err;
 
-	err = security_secctx_to_secid(
-		KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &cached_su_sid);
+	err = security_secctx_to_secid(KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &cached_su_sid);
 	if (err) {
 		pr_warn("Failed to cache kernel su domain SID: %d\n", err);
 		cached_su_sid = 0;
@@ -121,8 +133,7 @@ void cache_sid(void)
 		pr_info("Cached su SID: %u\n", cached_su_sid);
 	}
 
-	err = security_secctx_to_secid(ZYGOTE_CONTEXT, strlen(ZYGOTE_CONTEXT),
-				       &cached_zygote_sid);
+	err = security_secctx_to_secid(ZYGOTE_CONTEXT, strlen(ZYGOTE_CONTEXT), &cached_zygote_sid);
 	if (err) {
 		pr_warn("Failed to cache zygote SID: %d\n", err);
 		cached_zygote_sid = 0;
@@ -130,8 +141,7 @@ void cache_sid(void)
 		pr_info("Cached zygote SID: %u\n", cached_zygote_sid);
 	}
 
-	err = security_secctx_to_secid(INIT_CONTEXT, strlen(INIT_CONTEXT),
-				       &cached_init_sid);
+	err = security_secctx_to_secid(INIT_CONTEXT, strlen(INIT_CONTEXT), &cached_init_sid);
 	if (err) {
 		pr_warn("Failed to cache init SID: %d\n", err);
 		cached_init_sid = 0;
@@ -139,8 +149,7 @@ void cache_sid(void)
 		pr_info("Cached init SID: %u\n", cached_init_sid);
 	}
 
-	err = security_secctx_to_secid(KSU_FILE_CONTEXT,
-				       strlen(KSU_FILE_CONTEXT), &ksu_file_sid);
+	err = security_secctx_to_secid(KSU_FILE_CONTEXT, strlen(KSU_FILE_CONTEXT), &ksu_file_sid);
 	if (err) {
 		pr_warn("Failed to cache ksu_file SID: %d\n", err);
 		ksu_file_sid = 0;
@@ -153,15 +162,16 @@ void cache_sid(void)
  * Fast path: compare task's SID directly against cached value.
  * Falls back to string comparison if cache is not initialized.
  */
-static bool is_sid_match(const struct cred *cred, u32 cached_sid,
-			 const char *fallback_context)
+static bool is_sid_match(const struct cred *cred, u32 cached_sid, const char *fallback_context)
 {
-	const taskcred_sec_t *tsec;
 	if (!cred) {
 		return false;
 	}
-
-	tsec = (const taskcred_sec_t *)selinux_cred(cred);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 18, 0)
+	const struct task_security_struct *tsec = selinux_cred(cred);
+#else
+	const struct cred_security_struct *tsec = selinux_cred(cred);
+#endif
 	if (!tsec) {
 		return false;
 	}
@@ -172,10 +182,9 @@ static bool is_sid_match(const struct cred *cred, u32 cached_sid,
 	}
 
 	// Slow path fallback: string comparison (only before cache is initialized)
-	struct lsm_context ctx = { 0 };
+	struct lsm_context ctx;
 	bool result;
-	int err = __security_secid_to_secctx(tsec->sid, &ctx);
-	if (err) {
+	if (__security_secid_to_secctx(tsec->sid, &ctx)) {
 		return false;
 	}
 	result = strncmp(fallback_context, ctx.context, ctx.len) == 0;
@@ -202,3 +211,19 @@ bool is_init(const struct cred *cred)
 {
 	return is_sid_match(cred, cached_init_sid, INIT_CONTEXT);
 }
+
+void escape_to_root_for_adb_root(void)
+{
+	struct cred *cred = prepare_creds();
+	if (!cred) {
+		pr_err("Failed to prepare adbd's creds!\n");
+		return;
+	}
+	/* clear_exec_sid=true: transive_to_domain() also zeroes exec_sid on the new creds */
+	if (transive_to_domain(KERNEL_SU_CONTEXT, cred, true)) {
+		pr_err("transive domain failed.\n");
+		abort_creds(cred);
+		return;
+	}
+	commit_creds(cred);
+}
diff --git a/drivers/kernelsu/selinux/selinux.h b/drivers/kernelsu/selinux/selinux.h
index cf8c414ee0ea..cbeac553d20a 100644
--- a/drivers/kernelsu/selinux/selinux.h
+++ b/drivers/kernelsu/selinux/selinux.h
@@ -1,12 +1,11 @@
 #ifndef __KSU_H_SELINUX
 #define __KSU_H_SELINUX
 
-#include "linux/types.h"
-#include "linux/version.h"
-#include "linux/cred.h"
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)) || defined(KSU_COMPAT_HAS_SELINUX_STATE)
+#define KSU_COMPAT_USE_SELINUX_STATE
+#endif
 
-// TODO: rename to "ksu"
-#define KERNEL_SU_DOMAIN "su"
+#define KERNEL_SU_DOMAIN "ksu"
 #define KERNEL_SU_FILE "ksu_file"
 
 #define KERNEL_SU_CONTEXT "u:r:" KERNEL_SU_DOMAIN ":s0"
@@ -14,30 +13,28 @@
 #define ZYGOTE_CONTEXT "u:r:zygote:s0"
 #define INIT_CONTEXT "u:r:init:s0"
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)
-#define KSU_COMPAT_USE_SELINUX_STATE
-#endif
-
-void setup_selinux(const char *);
+void setup_selinux(const char *, struct cred *);
 
 void setenforce(bool);
 
-bool getenforce(void);
+bool getenforce(void);
 
 void cache_sid(void);
 
-bool is_task_ksu_domain(const struct cred *cred);
+bool is_task_ksu_domain(const struct cred* cred);
+
+bool is_ksu_domain(void);
 
-bool is_ksu_domain(void);
+bool is_zygote(const struct cred* cred);
 
-bool is_zygote(const struct cred *cred);
+bool is_init(const struct cred* cred);
 
-bool is_init(const struct cred *cred);
+void apply_kernelsu_rules(void);
 
-void apply_kernelsu_rules(void);
+int handle_sepolicy(void __user *user_data, u64 data_len);
 
-int handle_sepolicy(unsigned long arg3, void __user *arg4);
+void setup_ksu_cred(void);
 
-void setup_ksu_cred(void);
+void escape_to_root_for_adb_root(void);
 
 #endif
diff --git a/drivers/kernelsu/selinux/selinux_defs.h b/drivers/kernelsu/selinux/selinux_defs.h
deleted file mode 100644
index b8e47e7d77f1..000000000000
--- a/drivers/kernelsu/selinux/selinux_defs.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef __KSU_H_SELINUX_DEFS
-#define __KSU_H_SELINUX_DEFS
-
-#include "selinux.h"
-#include "objsec.h"
-#ifndef KSU_COMPAT_USE_SELINUX_STATE
-#include "avc.h"
-#endif
-
-static inline bool is_selinux_disabled(void)
-{
-#ifdef CONFIG_SECURITY_SELINUX_DISABLE
-#ifdef KSU_COMPAT_USE_SELINUX_STATE
-	return selinux_state.disabled;
-#else
-	return selinux_disabled;
-#endif
-#else
-	return false;
-#endif
-}
-
-static inline bool is_selinux_enforcing(void)
-{
-#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
-#ifdef KSU_COMPAT_USE_SELINUX_STATE
-	return selinux_state.enforcing;
-#else
-	return selinux_enforcing;
-#endif
-#else
-	return true;
-#endif
-}
-
-static inline void do_setenforce(bool val)
-{
-#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
-#ifdef KSU_COMPAT_USE_SELINUX_STATE
-	selinux_state.enforcing = val;
-#else
-	selinux_enforcing = val;
-#endif
-#else
-	/* do nothing */
-#endif
-}
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 18, 0)
-typedef struct task_security_struct taskcred_sec_t;
-#else
-typedef struct cred_security_struct taskcred_sec_t;
-#endif
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0)
-static inline taskcred_sec_t *selinux_cred(const struct cred *cred)
-{
-	return (taskcred_sec_t *)cred->security;
-}
-#endif
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 14, 0)
-struct lsm_context {
-	char *context;
-	u32 len;
-};
-
-static inline int __security_secid_to_secctx(u32 secid, struct lsm_context *cp)
-{
-	return security_secid_to_secctx(secid, &cp->context, &cp->len);
-}
-static inline void __security_release_secctx(struct lsm_context *cp)
-{
-	security_release_secctx(cp->context, cp->len);
-}
-#else
-#define __security_secid_to_secctx security_secid_to_secctx
-#define __security_release_secctx security_release_secctx
-#endif
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0)
-/*
- * get the subjective security ID of the current task
- */
-static inline u32 current_sid(void)
-{
-	const taskcred_sec_t *sec = current_security();
-
-	return sec->sid;
-}
-#endif
-
-#endif
diff --git a/drivers/kernelsu/selinux/sepolicy.c b/drivers/kernelsu/selinux/sepolicy.c
index 1d3ec397030f..a97c7430efcf 100644
--- a/drivers/kernelsu/selinux/sepolicy.c
+++ b/drivers/kernelsu/selinux/sepolicy.c
@@ -1,58 +1,55 @@
-#include <linux/gfp.h>
-#include <linux/printk.h>
-#include <linux/slab.h>
-#include <linux/version.h>
-
-#include "sepolicy.h"
-#include "../klog.h" // IWYU pragma: keep
-#include "ss/symtab.h"
-#include "../kernel_compat.h" // Add check Huawei Device
-
 #define KSU_SUPPORT_ADD_TYPE
 
+/*
+ * Detect Huawei HiSilicon kernels without affecting other kernels.
+ * HISI_SELINUX_EBITMAP_RO is the Huawei HiSilicon ebitmap enable/disable
+ * flag that comes from ss/ebitmap.h.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) &&                           \
+		LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) ||               \
+	LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) &&                      \
+		LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)
+#ifdef HISI_SELINUX_EBITMAP_RO
+#define CONFIG_IS_HW_HISI
+#endif
+#endif
+
 //////////////////////////////////////////////////////
 // Declaration
 //////////////////////////////////////////////////////
 
-static struct avtab_node *get_avtab_node(struct policydb *db,
-					 struct avtab_key *key,
-					 struct avtab_extended_perms *xperms);
+static struct avtab_node *get_avtab_node(struct policydb *db, struct avtab_key *key,
+                                         struct avtab_extended_perms *xperms);
 
-static bool add_rule(struct policydb *db, const char *s, const char *t,
-		     const char *c, const char *p, int effect, bool invert);
+static bool is_redundant_avtab_node(struct avtab_node *node);
 
-static void add_rule_raw(struct policydb *db, struct type_datum *src,
-			 struct type_datum *tgt, struct class_datum *cls,
-			 struct perm_datum *perm, int effect, bool invert);
+static bool remove_avtab_node(struct policydb *db, struct avtab_node *node);
 
-static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src,
-			       struct type_datum *tgt, struct class_datum *cls,
-			       uint16_t low, uint16_t high, int effect,
-			       bool invert);
-static bool add_xperm_rule(struct policydb *db, const char *s, const char *t,
-			   const char *c, const char *range, int effect,
-			   bool invert);
+static bool add_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *p, int effect,
+			bool invert);
 
-static bool add_type_rule(struct policydb *db, const char *s, const char *t,
-			  const char *c, const char *d, int effect);
+static bool add_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, struct class_datum *cls,
+				struct perm_datum *perm, int effect, bool invert);
 
-static bool add_filename_trans(struct policydb *db, const char *s,
-			       const char *t, const char *c, const char *d,
-			       const char *o);
+static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt,
+				struct class_datum *cls, uint16_t low, uint16_t high, int effect, bool invert);
+static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *range,
+				int effect, bool invert);
+
+static bool add_type_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *d, int effect);
 
-static bool add_genfscon(struct policydb *db, const char *fs_name,
-			 const char *path, const char *context);
+static bool add_filename_trans(struct policydb *db, const char *s, const char *t, const char *c, const char *d,
+				const char *o);
+
+static bool add_genfscon(struct policydb *db, const char *fs_name, const char *path, const char *context);
 
 static bool add_type(struct policydb *db, const char *type_name, bool attr);
 
-static bool set_type_state(struct policydb *db, const char *type_name,
-			   bool permissive);
+static bool set_type_state(struct policydb *db, const char *type_name, bool permissive);
 
-static void add_typeattribute_raw(struct policydb *db, struct type_datum *type,
-				  struct type_datum *attr);
+static void add_typeattribute_raw(struct policydb *db, struct type_datum *type, struct type_datum *attr);
 
-static bool add_typeattribute(struct policydb *db, const char *type,
-			      const char *attr);
+static bool add_typeattribute(struct policydb *db, const char *type, const char *attr);
 
 //////////////////////////////////////////////////////
 // Implementation
@@ -70,11 +67,9 @@ static bool add_typeattribute(struct policydb *db, const char *type,
 // htable is a struct instead of pointer above 5.8.0:
 // https://elixir.bootlin.com/linux/v5.8-rc1/source/security/selinux/ss/symtab.h
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
-#define ksu_hashtab_for_each(htab, cur)                                        \
-	ksu_hash_for_each(htab.htable, htab.size, cur)
+#define ksu_hashtab_for_each(htab, cur) ksu_hash_for_each(htab.htable, htab.size, cur)
 #else
-#define ksu_hashtab_for_each(htab, cur)                                        \
-	ksu_hash_for_each(htab->htable, htab->size, cur)
+#define ksu_hashtab_for_each(htab, cur) ksu_hash_for_each(htab->htable, htab->size, cur)
 #endif
 
 // symtab_search is introduced on 5.9.0:
@@ -84,8 +79,7 @@ static bool add_typeattribute(struct policydb *db, const char *type,
 #define symtab_insert(s, name, datum) hashtab_insert((s)->table, name, datum)
 #endif
 
-#define avtab_for_each(avtab, cur)                                             \
-	ksu_hash_for_each(avtab.htable, avtab.nslot, cur);
+#define avtab_for_each(avtab, cur) ksu_hash_for_each(avtab.htable, avtab.nslot, cur);
 
 static struct avtab_node *get_avtab_node(struct policydb *db,
 					 struct avtab_key *key,
@@ -126,6 +120,8 @@ static struct avtab_node *get_avtab_node(struct policydb *db,
 		}
 		/* this is used to get the node - insertion is actually unique */
 		node = avtab_insert_nonunique(&db->te_avtab, key, &avdatum);
+		if (!node)
+			return NULL;
 
 		int grow_size = sizeof(struct avtab_key);
 		grow_size += sizeof(struct avtab_datum);
@@ -141,8 +137,93 @@ static struct avtab_node *get_avtab_node(struct policydb *db,
 	return node;
 }
 
-static bool add_rule(struct policydb *db, const char *s, const char *t,
-		     const char *c, const char *p, int effect, bool invert)
+static bool is_redundant_avtab_node(struct avtab_node *node)
+{
+	if (node->key.specified & AVTAB_XPERMS)
+		return node->datum.u.xperms == NULL;
+	if (!(node->key.specified & AVTAB_AV))
+		return false;
+	if (node->key.specified & AVTAB_AUDITDENY)
+		return node->datum.u.data == ~0U;
+	return node->datum.u.data == 0U;
+}
+
+// 4.1, https://github.com/torvalds/linux/commit/ba39db6e0519aa8362dbda6523ceb69349a18dc3
+// 5.1, https://github.com/torvalds/linux/commit/acdf52d97f824019888422842757013b37441dd1
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) || LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) || defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY)
+static inline struct avtab_node *avtab_get_slot(struct avtab *ab, int i)
+{
+	// htable is **
+	// struct avtab_node **htable;
+	return ab->htable[i];
+}
+static inline void avtab_set_slot(struct avtab *ab, int i, struct avtab_node *node)
+{
+	ab->htable[i] = node;
+}
+#else
+static inline struct avtab_node *avtab_get_slot(struct avtab *ab, int i)
+{
+	// in this branch htable is read through flex_array_get(), not indexed directly
+	// flex_array_get() can return NULL, so check before dereferencing
+	struct avtab_node **p = flex_array_get(ab->htable, i);
+	if (!p)
+		return NULL;
+
+	return *p;
+}
+static inline void avtab_set_slot(struct avtab *ab, int i, struct avtab_node *node)
+{
+	flex_array_put_ptr(ab->htable, i, node, GFP_KERNEL | __GFP_ZERO);
+}
+#endif
+
+// Unlink @node from db->te_avtab, release it and shrink db->len to match.
+// Returns true when the node was found and removed, false otherwise.
+static bool remove_avtab_node(struct policydb *db, struct avtab_node *node)
+{
+	int i;
+	int ret;
+	int shrink_size = sizeof(struct avtab_key) + sizeof(struct avtab_datum);
+	struct avtab removed = {};
+	struct avtab_node *n;
+	struct avtab_node *prev;
+
+	ret = avtab_alloc(&removed, 1);
+	if (ret < 0)
+		return false;
+
+	for (i = 0; i < db->te_avtab.nslot; i++) {
+		prev = NULL;
+		for (n = avtab_get_slot(&db->te_avtab, i); n; prev = n, n = n->next) {
+			if (n != node)
+				continue;
+			if (prev)
+				prev->next = n->next;
+			else
+				avtab_set_slot(&db->te_avtab, i, n->next);
+			if (db->te_avtab.nel > 0)
+				db->te_avtab.nel--;
+			if ((n->key.specified & AVTAB_XPERMS) && n->datum.u.xperms) {
+				shrink_size += sizeof(u8) + sizeof(u8) + sizeof(u32) * ARRAY_SIZE(n->datum.u.xperms->perms.p);
+			}
+			// park the unlinked node in the scratch table and hand it to avtab_destroy()
+			n->next = NULL;
+			avtab_set_slot(&removed, 0, n);
+			removed.nel = 1;
+			avtab_destroy(&removed);
+			if (db->len >= shrink_size)
+				db->len -= shrink_size;
+			return true;
+		}
+	}
+
+	avtab_destroy(&removed);
+	return false;
+}
+
+static bool add_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *p, int effect,
+		bool invert)
 {
 	struct type_datum *src = NULL, *tgt = NULL;
 	struct class_datum *cls = NULL;
@@ -188,31 +269,27 @@ static bool add_rule(struct policydb *db, const char *s, const char *t,
 			return false;
 		}
 	}
-	add_rule_raw(db, src, tgt, cls, perm, effect, invert);
-	return true;
+	return add_rule_raw(db, src, tgt, cls, perm, effect, invert);
 }
 
-static void add_rule_raw(struct policydb *db, struct type_datum *src,
-			 struct type_datum *tgt, struct class_datum *cls,
-			 struct perm_datum *perm, int effect, bool invert)
+static bool add_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, struct class_datum *cls,
+						 struct perm_datum *perm, int effect, bool invert)
 {
+	bool success = true;
+
 	if (src == NULL) {
 		struct hashtab_node *node;
 		if (strip_av(effect, invert)) {
 			ksu_hashtab_for_each(db->p_types.table, node)
 			{
-				add_rule_raw(db,
-					     (struct type_datum *)node->datum,
-					     tgt, cls, perm, effect, invert);
+				success &= add_rule_raw(db, (struct type_datum *)node->datum, tgt, cls, perm, effect, invert);
 			};
 		} else {
 			ksu_hashtab_for_each(db->p_types.table, node)
 			{
-				struct type_datum *type =
-					(struct type_datum *)(node->datum);
+				struct type_datum *type = (struct type_datum *)(node->datum);
 				if (type->attribute) {
-					add_rule_raw(db, type, tgt, cls, perm,
-						     effect, invert);
+					success &= add_rule_raw(db, type, tgt, cls, perm, effect, invert);
 				}
 			};
 		}
@@ -221,18 +298,14 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src,
 		if (strip_av(effect, invert)) {
 			ksu_hashtab_for_each(db->p_types.table, node)
 			{
-				add_rule_raw(db, src,
-					     (struct type_datum *)node->datum,
-					     cls, perm, effect, invert);
+				success &= add_rule_raw(db, src, (struct type_datum *)node->datum, cls, perm, effect, invert);
 			};
 		} else {
 			ksu_hashtab_for_each(db->p_types.table, node)
 			{
-				struct type_datum *type =
-					(struct type_datum *)(node->datum);
+				struct type_datum *type = (struct type_datum *)(node->datum);
 				if (type->attribute) {
-					add_rule_raw(db, src, type, cls, perm,
-						     effect, invert);
+					success &= add_rule_raw(db, src, type, cls, perm, effect, invert);
 				}
 			};
 		}
@@ -240,22 +313,30 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src,
 		struct hashtab_node *node;
 		ksu_hashtab_for_each(db->p_classes.table, node)
 		{
-			add_rule_raw(db, src, tgt,
-				     (struct class_datum *)node->datum, perm,
-				     effect, invert);
+			success &= add_rule_raw(db, src, tgt, (struct class_datum *)node->datum, perm, effect, invert);
 		}
 	} else {
 		struct avtab_key key;
+		struct avtab_node *node;
+
 		key.source_type = src->value;
 		key.target_type = tgt->value;
 		key.target_class = cls->value;
 		key.specified = effect;
 
-		struct avtab_node *node = get_avtab_node(db, &key, NULL);
+		if (invert && effect != AVTAB_AUDITDENY) {
+			node = avtab_search_node(&db->te_avtab, &key);
+			if (!node)
+				return true;
+		} else {
+			node = get_avtab_node(db, &key, NULL);
+			if (!node)
+				return false;
+		}
+
 		if (invert) {
 			if (perm)
-				node->datum.u.data &=
-					~(1U << (perm->value - 1));
+				node->datum.u.data &= ~(1U << (perm->value - 1));
 			else
 				node->datum.u.data = 0U;
 		} else {
@@ -264,7 +345,11 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src,
 			else
 				node->datum.u.data = ~0U;
 		}
+		if (is_redundant_avtab_node(node))
+			return remove_avtab_node(db, node);
 	}
+
+	return success;
 }
 
 #define ioctl_driver(x) (x >> 8 & 0xFF)
@@ -274,40 +359,32 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src,
 #define xperm_set(x, p) (p[x >> 5] |= (1 << (x & 0x1f)))
 #define xperm_clear(x, p) (p[x >> 5] &= ~(1 << (x & 0x1f)))
 
-static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src,
-			       struct type_datum *tgt, struct class_datum *cls,
-			       uint16_t low, uint16_t high, int effect,
-			       bool invert)
+static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt,
+				struct class_datum *cls, uint16_t low, uint16_t high, int effect, bool invert)
 {
 	if (src == NULL) {
 		struct hashtab_node *node;
 		ksu_hashtab_for_each(db->p_types.table, node)
 		{
-			struct type_datum *type =
-				(struct type_datum *)(node->datum);
+			struct type_datum *type = (struct type_datum *)(node->datum);
 			if (type->attribute) {
-				add_xperm_rule_raw(db, type, tgt, cls, low,
-						   high, effect, invert);
+				add_xperm_rule_raw(db, type, tgt, cls, low, high, effect, invert);
 			}
 		};
 	} else if (tgt == NULL) {
 		struct hashtab_node *node;
 		ksu_hashtab_for_each(db->p_types.table, node)
 		{
-			struct type_datum *type =
-				(struct type_datum *)(node->datum);
+			struct type_datum *type = (struct type_datum *)(node->datum);
 			if (type->attribute) {
-				add_xperm_rule_raw(db, src, type, cls, low,
-						   high, effect, invert);
+				add_xperm_rule_raw(db, src, type, cls, low, high, effect, invert);
 			}
 		};
 	} else if (cls == NULL) {
 		struct hashtab_node *node;
 		ksu_hashtab_for_each(db->p_classes.table, node)
 		{
-			add_xperm_rule_raw(db, src, tgt,
-					   (struct class_datum *)(node->datum),
-					   low, high, effect, invert);
+			add_xperm_rule_raw(db, src, tgt, (struct class_datum *)(node->datum), low, high, effect, invert);
 		};
 	} else {
 		struct avtab_key key;
@@ -330,8 +407,7 @@ static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src,
 		}
 		int i;
 		if (xperms.specified == AVTAB_XPERMS_IOCTLDRIVER) {
-			for (i = ioctl_driver(low); i <= ioctl_driver(high);
-			     ++i) {
+			for (i = ioctl_driver(low); i <= ioctl_driver(high); ++i) {
 				if (invert)
 					xperm_clear(i, xperms.perms.p);
 				else
@@ -354,9 +430,7 @@ static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src,
 		datum = &node->datum;
 
 		if (datum->u.xperms == NULL) {
-			datum->u.xperms =
-				(struct avtab_extended_perms *)(kzalloc(
-					sizeof(xperms), GFP_ATOMIC));
+			datum->u.xperms = (struct avtab_extended_perms *)(kzalloc(sizeof(xperms), GFP_ATOMIC));
 			if (!datum->u.xperms) {
 				pr_err("alloc xperms failed\n");
 				return;
@@ -366,9 +440,8 @@ static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src,
 	}
 }
 
-static bool add_xperm_rule(struct policydb *db, const char *s, const char *t,
-			   const char *c, const char *range, int effect,
-			   bool invert)
+static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *range,
+			int effect, bool invert)
 {
 	struct type_datum *src = NULL, *tgt = NULL;
 	struct class_datum *cls = NULL;
@@ -415,8 +488,7 @@ static bool add_xperm_rule(struct policydb *db, const char *s, const char *t,
 	return true;
 }
 
-static bool add_type_rule(struct policydb *db, const char *s, const char *t,
-			  const char *c, const char *d, int effect)
+static bool add_type_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *d, int effect)
 {
 	struct type_datum *src, *tgt, *def;
 	struct class_datum *cls;
@@ -449,6 +521,8 @@ static bool add_type_rule(struct policydb *db, const char *s, const char *t,
 	key.specified = effect;
 
 	struct avtab_node *node = get_avtab_node(db, &key, NULL);
+	if (!node)
+		return false;
 	node->datum.u.data = def->value;
 
 	return true;
@@ -533,11 +607,9 @@ static bool add_filename_trans(struct policydb *db, const char *s,
 	struct filename_trans_datum *last = NULL;
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
-	struct filename_trans_datum *trans =
-		policydb_filenametr_search(db, &key);
+	struct filename_trans_datum *trans = policydb_filenametr_search(db, &key);
 #else
-	struct filename_trans_datum *trans =
-		hashtab_search(&db->filename_trans, &key);
+	struct filename_trans_datum *trans = hashtab_search(&db->filename_trans, &key);
 #endif
 	while (trans) {
 		if (ebitmap_get_bit(&trans->stypes, src->value - 1)) {
@@ -552,17 +624,13 @@ static bool add_filename_trans(struct policydb *db, const char *s,
 	}
 
 	if (trans == NULL) {
-		trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans),
-							       GFP_ATOMIC);
-		struct filename_trans_key *new_key =
-			(struct filename_trans_key *)kzalloc(sizeof(*new_key),
-							     GFP_ATOMIC);
+		trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans), GFP_KERNEL);
+		struct filename_trans_key *new_key = (struct filename_trans_key *)kzalloc(sizeof(*new_key), GFP_KERNEL);
 		*new_key = key;
-		new_key->name = kstrdup(key.name, GFP_ATOMIC);
+		new_key->name = kstrdup(key.name, GFP_KERNEL);
 		trans->next = last;
 		trans->otype = def->value;
-		hashtab_insert(&db->filename_trans, new_key, trans,
-			       filenametr_key_params);
+		hashtab_insert(&db->filename_trans, new_key, trans, filenametr_key_params);
 	}
 
 	db->compat_filename_trans_count++;
@@ -578,42 +646,52 @@ static bool add_filename_trans(struct policydb *db, const char *s,
 		hashtab_search(db->filename_trans, &key);
 
 	if (trans == NULL) {
-		trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans),
-							       GFP_ATOMIC);
+		trans = (struct filename_trans_datum *)kcalloc(sizeof(*trans), 1, GFP_KERNEL);
 		if (!trans) {
 			pr_err("add_filename_trans: Failed to alloc datum\n");
 			return false;
 		}
 		struct filename_trans *new_key =
-			(struct filename_trans *)kzalloc(sizeof(*new_key),
-							 GFP_ATOMIC);
+			(struct filename_trans *)kmalloc(sizeof(*new_key), GFP_KERNEL);
 		if (!new_key) {
 			pr_err("add_filename_trans: Failed to alloc new_key\n");
 			return false;
 		}
 		*new_key = key;
-		new_key->name = kstrdup(key.name, GFP_ATOMIC);
+		new_key->name = kstrdup(key.name, GFP_KERNEL);
 		trans->otype = def->value;
 		hashtab_insert(db->filename_trans, new_key, trans);
 	}
 
-	return ebitmap_set_bit(&db->filename_trans_ttypes, src->value - 1, 1) ==
-	       0;
+	return ebitmap_set_bit(&db->filename_trans_ttypes, src->value - 1, 1) == 0;
 #endif
 }
 
-static bool add_genfscon(struct policydb *db, const char *fs_name,
-			 const char *path, const char *context)
+static bool add_genfscon(struct policydb *db, const char *fs_name, const char *path, const char *context)
 {
 	return false;
 }
 
 // https://github.com/torvalds/linux/commit/590b9d576caec6b4c46bba49ed36223a399c3fc5#diff-cc9aa90e094e6e0f47bd7300db4f33cf4366b98b55d8753744f31eb69c691016R844-R845
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
-#define ksu_kvrealloc(p, new_size, _old_size) kvrealloc(p, new_size, GFP_ATOMIC)
-#else
-#define ksu_kvrealloc(p, new_size, old_size)                                   \
-	ksu_compat_kvrealloc(p, old_size, new_size, GFP_ATOMIC)
+#define ksu_kvrealloc(p, new_size, _old_size) kvrealloc(p, new_size, GFP_KERNEL)
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) || defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY)
+// https://cs.android.com/android/_/android/kernel/common/+/f5f3e54f811679761c33526e695bd296190faade
+// Some 5.10 kernels don't have this backport, so carry a local copy (grow-only realloc on top of kvmalloc).
+static void *ksu_kvrealloc_compat(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+{
+	void *newp;
+
+	if (oldsize >= newsize) // never shrinks: the existing buffer is handed back as is
+		return (void *)p;
+	newp = kvmalloc(newsize, flags);
+	if (!newp) // on failure p is NOT freed, so the caller's old buffer stays valid
+		return NULL;
+	__builtin_memcpy(newp, p, oldsize); // bypass fortify_source, kasan
+	kvfree(p);
+	return newp;
+}
+#define ksu_kvrealloc(p, new_size, old_size) ksu_kvrealloc_compat(p, old_size, new_size, GFP_KERNEL)
 #endif
 
 static bool add_type(struct policydb *db, const char *type_name, bool attr)
@@ -627,7 +705,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 
 	u32 value = ++db->p_types.nprim;
 	type = (struct type_datum *)kzalloc(sizeof(struct type_datum),
-					    GFP_ATOMIC);
+					    GFP_KERNEL);
 	if (!type) {
 		pr_err("add_type: alloc type_datum failed.\n");
 		return false;
@@ -637,7 +715,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 	type->value = value;
 	type->attribute = attr;
 
-	char *key = kstrdup(type_name, GFP_ATOMIC);
+	char *key = kstrdup(type_name, GFP_KERNEL);
 	if (!key) {
 		pr_err("add_type: alloc key failed.\n");
 		return false;
@@ -648,11 +726,11 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 		return false;
 	}
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || defined(KSU_TYPE_VAL_TO_STRUCT)
 	struct ebitmap *new_type_attr_map_array =
 		ksu_kvrealloc(db->type_attr_map_array,
-			      value * sizeof(struct ebitmap),
-			      (value - 1) * sizeof(struct ebitmap));
+			    value * sizeof(struct ebitmap),
+			    (value - 1) * sizeof(struct ebitmap));
 
 	if (!new_type_attr_map_array) {
 		pr_err("add_type: alloc type_attr_map_array failed\n");
@@ -661,8 +739,8 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 
 	struct type_datum **new_type_val_to_struct =
 		ksu_kvrealloc(db->type_val_to_struct,
-			      sizeof(*db->type_val_to_struct) * value,
-			      sizeof(*db->type_val_to_struct) * (value - 1));
+			    sizeof(*db->type_val_to_struct) * value,
+			    sizeof(*db->type_val_to_struct) * (value - 1));
 
 	if (!new_type_val_to_struct) {
 		pr_err("add_type: alloc type_val_to_struct failed\n");
@@ -671,8 +749,8 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 
 	char **new_val_to_name_types =
 		ksu_kvrealloc(db->sym_val_to_name[SYM_TYPES],
-			      sizeof(char *) * value,
-			      sizeof(char *) * (value - 1));
+			    sizeof(char *) * value,
+			    sizeof(char *) * (value - 1));
 	if (!new_val_to_name_types) {
 		pr_err("add_type: alloc val_to_name failed\n");
 		return false;
@@ -695,6 +773,55 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 	}
 
 	return true;
+
+#elif defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY)
+	struct ebitmap *new_type_attr_map_array =
+		ksu_kvrealloc(db->type_attr_map_array,
+			    value * sizeof(struct ebitmap),
+			    (value - 1) * sizeof(struct ebitmap));
+
+	if (!new_type_attr_map_array) {
+		pr_err("add_type: alloc type_attr_map_array failed\n");
+		return false;
+	}
+
+	struct type_datum **new_type_val_to_struct =
+		ksu_kvrealloc(db->type_val_to_struct_array,
+			    sizeof(*db->type_val_to_struct_array) * value,
+			    sizeof(*db->type_val_to_struct_array) * (value - 1));
+
+	if (!new_type_val_to_struct) {
+		pr_err("add_type: alloc type_val_to_struct failed\n");
+		return false;
+	}
+
+	char **new_val_to_name_types =
+		ksu_kvrealloc(db->sym_val_to_name[SYM_TYPES],
+			    sizeof(char *) * value,
+			    sizeof(char *) * (value - 1));
+	if (!new_val_to_name_types) {
+		pr_err("add_type: alloc val_to_name failed\n");
+		return false;
+	}
+
+	db->type_attr_map_array = new_type_attr_map_array;
+	ebitmap_init(&db->type_attr_map_array[value - 1]);
+	ebitmap_set_bit(&db->type_attr_map_array[value - 1], value - 1, 1);
+
+	db->type_val_to_struct_array = new_type_val_to_struct;
+	db->type_val_to_struct_array[value - 1] = type;
+
+	db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types;
+	db->sym_val_to_name[SYM_TYPES][value - 1] = key;
+
+	int i;
+	for (i = 0; i < db->p_roles.nprim; ++i) {
+		ebitmap_set_bit(&db->role_val_to_struct[i]->types, value - 1,
+				1);
+	}
+
+	return true;
+
 #elif defined(CONFIG_IS_HW_HISI)
 	/*
    * Huawei use type_attr_map and type_val_to_struct.
@@ -702,12 +829,12 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
    */
 	size_t new_size = sizeof(struct ebitmap) * db->p_types.nprim;
 	struct ebitmap *new_type_attr_map =
-		(krealloc(db->type_attr_map, new_size, GFP_ATOMIC));
+		(krealloc(db->type_attr_map, new_size, GFP_KERNEL));
 
 	struct type_datum **new_type_val_to_struct =
 		krealloc(db->type_val_to_struct,
 			 sizeof(*db->type_val_to_struct) * db->p_types.nprim,
-			 GFP_ATOMIC);
+			 GFP_KERNEL);
 
 	if (!new_type_attr_map) {
 		pr_err("add_type: alloc type_attr_map failed\n");
@@ -749,15 +876,15 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 	// flex_array is not extensible, we need to create a new bigger one instead
 	struct flex_array *new_type_attr_map_array =
 		flex_array_alloc(sizeof(struct ebitmap), db->p_types.nprim,
-				 GFP_ATOMIC | __GFP_ZERO);
+				 GFP_KERNEL | __GFP_ZERO);
 
 	struct flex_array *new_type_val_to_struct =
 		flex_array_alloc(sizeof(struct type_datum *), db->p_types.nprim,
-				 GFP_ATOMIC | __GFP_ZERO);
+				 GFP_KERNEL | __GFP_ZERO);
 
 	struct flex_array *new_val_to_name_types =
 		flex_array_alloc(sizeof(char *), db->symtab[SYM_TYPES].nprim,
-				 GFP_ATOMIC | __GFP_ZERO);
+				 GFP_KERNEL | __GFP_ZERO);
 
 	if (!new_type_attr_map_array) {
 		pr_err("add_type: alloc type_attr_map_array failed\n");
@@ -776,20 +903,20 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 
 	// preallocate so we don't have to worry about the put ever failing
 	if (flex_array_prealloc(new_type_attr_map_array, 0, db->p_types.nprim,
-				GFP_ATOMIC | __GFP_ZERO)) {
+				GFP_KERNEL | __GFP_ZERO)) {
 		pr_err("add_type: prealloc type_attr_map_array failed\n");
 		return false;
 	}
 
 	if (flex_array_prealloc(new_type_val_to_struct, 0, db->p_types.nprim,
-				GFP_ATOMIC | __GFP_ZERO)) {
+				GFP_KERNEL | __GFP_ZERO)) {
 		pr_err("add_type: prealloc type_val_to_struct_array failed\n");
 		return false;
 	}
 
 	if (flex_array_prealloc(new_val_to_name_types, 0,
 				db->symtab[SYM_TYPES].nprim,
-				GFP_ATOMIC | __GFP_ZERO)) {
+				GFP_KERNEL | __GFP_ZERO)) {
 		pr_err("add_type: prealloc val_to_name_types failed\n");
 		return false;
 	}
@@ -801,14 +928,14 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 		old_elem = flex_array_get(db->type_attr_map_array, j);
 		if (old_elem)
 			flex_array_put(new_type_attr_map_array, j, old_elem,
-				       GFP_ATOMIC | __GFP_ZERO);
+				       GFP_KERNEL | __GFP_ZERO);
 	}
 
 	for (j = 0; j < db->type_val_to_struct_array->total_nr_elements; j++) {
 		old_elem = flex_array_get_ptr(db->type_val_to_struct_array, j);
 		if (old_elem)
 			flex_array_put_ptr(new_type_val_to_struct, j, old_elem,
-					   GFP_ATOMIC | __GFP_ZERO);
+					   GFP_KERNEL | __GFP_ZERO);
 	}
 
 	for (j = 0; j < db->symtab[SYM_TYPES].nprim; j++) {
@@ -816,7 +943,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 			flex_array_get_ptr(db->sym_val_to_name[SYM_TYPES], j);
 		if (old_elem)
 			flex_array_put_ptr(new_val_to_name_types, j, old_elem,
-					   GFP_ATOMIC | __GFP_ZERO);
+					   GFP_KERNEL | __GFP_ZERO);
 	}
 
 	// store the pointer of old flex arrays first, when assigning new ones we
@@ -839,7 +966,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 		flex_array_free(old_fa);
 	}
 	flex_array_put_ptr(db->type_val_to_struct_array, value - 1, type,
-			   GFP_ATOMIC | __GFP_ZERO);
+			   GFP_KERNEL | __GFP_ZERO);
 
 	old_fa = db->sym_val_to_name[SYM_TYPES];
 	db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types;
@@ -847,7 +974,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr)
 		flex_array_free(old_fa);
 	}
 	flex_array_put_ptr(db->sym_val_to_name[SYM_TYPES], value - 1, key,
-			   GFP_ATOMIC | __GFP_ZERO);
+			   GFP_KERNEL | __GFP_ZERO);
 
 	int i;
 	for (i = 0; i < db->p_roles.nprim; ++i) {
@@ -894,7 +1021,7 @@ static bool set_type_state(struct policydb *db, const char *type_name,
 static void add_typeattribute_raw(struct policydb *db, struct type_datum *type,
 				  struct type_datum *attr)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) || defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY)
 	struct ebitmap *sattr = &db->type_attr_map_array[type->value - 1];
 #elif defined(CONFIG_IS_HW_HISI)
 	/*
@@ -1060,3 +1187,92 @@ bool ksu_genfscon(struct policydb *db, const char *fs_name, const char *path,
 {
 	return add_genfscon(db, fs_name, path, ctx);
 }
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)
+#include "ss/avtab.h"
+#include "ss/constraint.h"
+#include "ss/ebitmap.h"
+#include "ss/hashtab.h"
+#include "ss/policydb.h"
+#include "ss/services.h"
+
+void ksu_destroy_sepolicy(struct selinux_policy *pol) // pol must be non-NULL: it is dereferenced below
+{
+	policydb_destroy(&pol->policydb); // release what the embedded policydb holds first
+	kfree(pol); // then the container; NOTE(review): no other member of *pol is released here - confirm that is intended
+}
+
+/*
+ * Clone a policy by serializing old_pol's policydb into a scratch buffer with
+ * policydb_write() and parsing that image back with policydb_read().
+ * Returns the new selinux_policy on success or an ERR_PTR() on failure.
+ */
+struct selinux_policy *ksu_dup_sepolicy(struct selinux_policy *old_pol)
+{
+	int ret;
+	size_t len;
+	struct selinux_policy *new_pol;
+	void *data;
+	struct policy_file fp;
+
+	len = old_pol->policydb.len;
+	data = vmalloc(len);
+	if (!data) {
+		pr_err("alloc policy len %zu\n", len); // len is a size_t, so %zu (not %ld)
+		return ERR_PTR(-ENOMEM); // nothing has been allocated yet, no unwinding needed
+	}
+
+	fp.data = data;
+	fp.len = len;
+
+	ret = policydb_write(&old_pol->policydb, &fp);
+	if (ret) {
+		pr_err("sepolicy: policydb_write: %d\n", ret);
+		goto out_free_data;
+	}
+
+	// https://android-review.googlesource.com/c/kernel/common/+/3009995/11/security/selinux/ss/policydb.c
+	// fixup config: 4*2+8+4 = 20 bytes precede the u32 config word patched below
+	static const size_t kConfigOff = 20;
+	if (len >= kConfigOff + sizeof(u32)) {
+		u32 *config_ptr = (u32 *)((unsigned long)data + kConfigOff);
+		pr_info("old config: %u\n", *config_ptr);
+		if (old_pol->policydb.android_netlink_route) {
+			pr_info("adding POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE\n");
+			*config_ptr |= POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE;
+		}
+		if (old_pol->policydb.android_netlink_getneigh) {
+			pr_info("adding POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH\n");
+			*config_ptr |= POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH;
+		}
+		pr_info("new config: %u\n", *config_ptr);
+	}
+
+	new_pol = kmemdup(old_pol, sizeof(*old_pol), GFP_KERNEL);
+	if (!new_pol) {
+		ret = -ENOMEM;
+		pr_err("sepolicy: dup old pol\n");
+		goto out_free_data;
+	}
+	memset(&new_pol->policydb, 0, sizeof(new_pol->policydb)); // only the policydb is rebuilt; other members stay as copied
+
+	// rewind fp
+	fp.data = data;
+	fp.len = len;
+
+	ret = policydb_read(&new_pol->policydb, &fp);
+	if (ret) {
+		pr_err("sepolicy: policydb_read: %d\n", ret);
+		goto out_free_policydb;
+	}
+	new_pol->policydb.len = old_pol->policydb.len;
+	kvfree(data);
+	return new_pol;
+
+out_free_policydb:
+	kfree(new_pol);
+out_free_data:
+	kvfree(data);
+	return ERR_PTR(ret);
+}
+#endif
diff --git a/drivers/kernelsu/selinux/sepolicy.h b/drivers/kernelsu/selinux/sepolicy.h
index 675d1499e46d..8ae79e3dc3b3 100644
--- a/drivers/kernelsu/selinux/sepolicy.h
+++ b/drivers/kernelsu/selinux/sepolicy.h
@@ -1,10 +1,14 @@
 #ifndef __KSU_H_SEPOLICY
 #define __KSU_H_SEPOLICY
 
-#include <linux/types.h>
-
 #include "ss/policydb.h"
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)
+struct selinux_policy *ksu_dup_sepolicy(struct selinux_policy *old_pol);
+
+void ksu_destroy_sepolicy(struct selinux_policy *orig);
+#endif
+
 // Operation on types
 bool ksu_type(struct policydb *db, const char *name, const char *attr);
 bool ksu_attribute(struct policydb *db, const char *name);
diff --git a/drivers/kernelsu/setuid_hook.c b/drivers/kernelsu/setuid_hook.c
deleted file mode 100644
index c15123101c45..000000000000
--- a/drivers/kernelsu/setuid_hook.c
+++ /dev/null
@@ -1,112 +0,0 @@
-#include <linux/compiler.h>
-#include <linux/version.h>
-#include <linux/slab.h>
-#include <linux/task_work.h>
-#include <linux/thread_info.h>
-#include <linux/seccomp.h>
-#include <linux/printk.h>
-#include <linux/sched.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
-#include <linux/sched/signal.h>
-#endif
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/uaccess.h>
-#include <linux/uidgid.h>
-
-#include "allowlist.h"
-#include "setuid_hook.h"
-#include "klog.h" // IWYU pragma: keep
-#include "manager.h"
-#include "selinux/selinux.h"
-#include "supercalls.h"
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#include "syscall_handler.h"
-#endif
-#include "kernel_umount.h"
-#include "kernel_compat.h"
-
-static void ksu_install_manager_fd_tw_func(struct callback_head *cb)
-{
-	ksu_install_fd();
-	kfree(cb);
-}
-
-static void do_install_manager_fd(void)
-{
-	struct callback_head *cb = kzalloc(sizeof(*cb), GFP_ATOMIC);
-	if (!cb)
-		return;
-
-	cb->func = ksu_install_manager_fd_tw_func;
-	if (task_work_add(current, cb, TWA_RESUME)) {
-		kfree(cb);
-		pr_warn("install manager fd add task_work failed\n");
-	}
-}
-
-// force_sig kcompat, TODO: move it out of core_hook.c
-// https://elixir.bootlin.com/linux/v5.3-rc1/source/kernel/signal.c#L1613
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0)
-#define send_sig(sig) force_sig(sig)
-#else
-#define send_sig(sig) force_sig(sig, current)
-#endif
-
-extern void disable_seccomp(void);
-int ksu_handle_setuid_common(uid_t new_uid, uid_t old_uid, uid_t new_euid)
-{
-#ifdef CONFIG_KSU_DEBUG
-	pr_info("handle_setuid from %d to %d\n", old_uid, new_uid);
-#endif
-
-	if (likely(ksu_is_manager_appid_valid()) &&
-	    unlikely(ksu_get_manager_appid() == new_uid % PER_USER_RANGE)) {
-		disable_seccomp();
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-		ksu_set_task_tracepoint_flag(current);
-#endif
-		pr_info("install fd for manager (uid=%d)\n", new_uid);
-		do_install_manager_fd();
-		return 0;
-	}
-
-	if (ksu_is_allow_uid_for_current(new_uid)) {
-		disable_seccomp();
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-		ksu_set_task_tracepoint_flag(current);
-	} else {
-		ksu_clear_task_tracepoint_flag_if_needed(current);
-#endif
-	}
-
-	// Handle kernel umount
-	ksu_handle_umount(old_uid, new_uid);
-
-	return 0;
-}
-
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) &&                          \
-     defined(CONFIG_KSU_MANUAL_HOOK))
-int ksu_handle_setresuid(uid_t ruid, uid_t euid, uid_t suid)
-{
-	if (!is_zygote(current_cred())) {
-#ifdef CONFIG_KSU_DEBUG
-		pr_info("setresuid: disallow non zygote sid!\n");
-#endif
-		return 0;
-	}
-	return ksu_handle_setuid_common(ruid, current_uid().val, euid);
-}
-#endif
-
-void ksu_setuid_hook_init(void)
-{
-	ksu_kernel_umount_init();
-}
-
-void ksu_setuid_hook_exit(void)
-{
-	pr_info("ksu setuid exit\n");
-	ksu_kernel_umount_exit();
-}
diff --git a/drivers/kernelsu/setuid_hook.h b/drivers/kernelsu/setuid_hook.h
deleted file mode 100644
index 7c4eda71c1c0..000000000000
--- a/drivers/kernelsu/setuid_hook.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef __KSU_H_KSU_SETUID_HOOK
-#define __KSU_H_KSU_SETUID_HOOK
-
-#include <linux/init.h>
-#include <linux/types.h>
-
-void ksu_setuid_hook_init(void);
-void ksu_setuid_hook_exit(void);
-
-int ksu_handle_setuid_common(uid_t new_uid, uid_t old_uid, uid_t new_euid);
-
-#endif
diff --git a/drivers/kernelsu/shim.c b/drivers/kernelsu/shim.c
deleted file mode 100644
index 75d5542a87aa..000000000000
--- a/drivers/kernelsu/shim.c
+++ /dev/null
@@ -1,36 +0,0 @@
-#include <linux/version.h>
-#include <linux/compat.h>
-#include <linux/fs.h>
-
-// unity build idea from backslashxx, not full, we only use it for shim ksu hooks
-
-#include "allowlist.h"
-#include "arch.h"
-#include "kp_hook.h"
-#include "ksu.h"
-#include "klog.h" // IWYU pragma: keep
-#include "ksud.h"
-#include "kernel_compat.h"
-#include "kp_util.h"
-#include "supercalls.h"
-#include "sucompat.h"
-#include "setuid_hook.h"
-#include "syscall_handler.h"
-#include "selinux/selinux.h"
-#include "throne_tracker.h"
-
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#include "pkg_observer.c"
-#include "kp_hook.c"
-#include "kp_util.c"
-#include "syscall_handler.c"
-#endif
-
-#if (defined(CONFIG_KSU_MANUAL_HOOK) &&                                        \
-     LINUX_VERSION_CODE < KERNEL_VERSION(6, 8, 0))
-#include "lsm_hook.c"
-#elif (defined(CONFIG_KSU_MANUAL_HOOK) &&                                      \
-       LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0))
-// + ksu_handle_setresuid hook for 6.8+
-#include "pkg_observer.c"
-#endif
diff --git a/drivers/kernelsu/sucompat.c b/drivers/kernelsu/sucompat.c
deleted file mode 100644
index 2bb1a9fba702..000000000000
--- a/drivers/kernelsu/sucompat.c
+++ /dev/null
@@ -1,217 +0,0 @@
-#include <linux/uaccess.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/version.h>
-#include <linux/ptrace.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
-#include <linux/compiler.h>
-#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-#include <linux/sched/task_stack.h>
-#else
-#include <linux/sched.h>
-#endif
-#include <asm/current.h>
-
-#include "allowlist.h"
-#include "feature.h"
-#include "klog.h" // IWYU pragma: keep
-#include "ksud.h"
-#include "kernel_compat.h"
-#include "sucompat.h"
-#include "app_profile.h"
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#include "kp_util.h"
-#endif
-
-#define SU_PATH "/system/bin/su"
-#define SH_PATH "/system/bin/sh"
-
-bool ksu_su_compat_enabled __read_mostly = true;
-
-static const char su_path[] = SU_PATH;
-static const char ksud_path[] = KSUD_PATH;
-static const char sh_path[] = SH_PATH;
-
-static int su_compat_feature_get(u64 *value)
-{
-	*value = ksu_su_compat_enabled ? 1 : 0;
-	return 0;
-}
-
-static int su_compat_feature_set(u64 value)
-{
-	bool enable = value != 0;
-	ksu_su_compat_enabled = enable;
-	pr_info("su_compat: set to %d\n", enable);
-	return 0;
-}
-
-static const struct ksu_feature_handler su_compat_handler = {
-	.feature_id = KSU_FEATURE_SU_COMPAT,
-	.name = "su_compat",
-	.get_handler = su_compat_feature_get,
-	.set_handler = su_compat_feature_set,
-};
-
-static void __user *userspace_stack_buffer(const void *d, size_t len)
-{
-	// To avoid having to mmap a page in userspace, just write below the stack
-	// pointer.
-	char __user *p = (void __user *)current_user_stack_pointer() - len;
-
-	return copy_to_user(p, d, len) ? NULL : p;
-}
-
-static char __user *sh_user_path(void)
-{
-	return userspace_stack_buffer(sh_path, sizeof(sh_path));
-}
-
-static char __user *ksud_user_path(void)
-{
-	return userspace_stack_buffer(ksud_path, sizeof(ksud_path));
-}
-
-static inline bool is_su_allowed(void)
-{
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	if (!ksu_su_compat_enabled)
-		return false;
-#endif
-#ifdef CONFIG_SECCOMP
-	if (likely(!!current->seccomp.mode))
-		return false;
-#endif
-	if (!ksu_is_allow_uid_for_current(current_uid().val))
-		return false;
-
-	return true;
-}
-
-static int ksu_sucompat_user_common(const char __user **filename_user,
-				    const char *syscall_name,
-				    const bool escalate)
-{
-	char path[sizeof(su_path) + 1];
-
-	if (unlikely(!filename_user))
-		return 0;
-	if (!is_su_allowed())
-		return 0;
-
-	memset(path, 0, sizeof(path));
-	ksu_strncpy_from_user_nofault(path, *filename_user, sizeof(path));
-
-	if (memcmp(path, su_path, sizeof(su_path)))
-		return 0;
-
-	if (escalate) {
-		pr_info("%s su found\n", syscall_name);
-		*filename_user = ksud_user_path();
-		escape_with_root_profile(); // escalate !!
-	} else {
-		pr_info("%s su->sh!\n", syscall_name);
-		*filename_user = sh_user_path();
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-static int do_execve_sucompat_for_kp(const char __user **filename_user)
-{
-	char path[sizeof(su_path) + 1];
-
-	if (unlikely(!filename_user))
-		return 0;
-	if (!is_su_allowed())
-		return 0;
-	if (!ksu_retry_filename_access(filename_user, path, sizeof(path), true))
-		return 0;
-	if (likely(memcmp(path, su_path, sizeof(su_path))))
-		return 0;
-
-	pr_info("sys_execve su found\n");
-	*filename_user = ksud_user_path();
-
-	escape_with_root_profile();
-
-	return 0;
-}
-#define handle_execve_sucompat(filename_ptr)                                   \
-	(do_execve_sucompat_for_kp(filename_ptr))
-#else
-#define handle_execve_sucompat(filename_ptr)                                   \
-	(ksu_sucompat_user_common(filename_ptr, "sys_execve", true))
-#endif
-
-int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode,
-			 int *__unused_flags)
-{
-	return ksu_sucompat_user_common(filename_user, "faccessat", false);
-}
-
-int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags)
-{
-	return ksu_sucompat_user_common(filename_user, "newfstatat", false);
-}
-
-int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user,
-			       void *__never_use_argv, void *__never_use_envp,
-			       int *__never_use_flags)
-{
-	return handle_execve_sucompat(filename_user);
-}
-
-int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr,
-				 void *__never_use_argv, void *__never_use_envp,
-				 int *__never_use_flags)
-{
-	struct filename *filename;
-
-	if (unlikely(!filename_ptr))
-		return 0;
-	if (!is_su_allowed())
-		return 0;
-
-	filename = *filename_ptr;
-	if (IS_ERR(filename))
-		return 0;
-	if (likely(memcmp(filename->name, su_path, sizeof(su_path))))
-		return 0;
-
-	pr_info("do_execveat_common su found\n");
-	memcpy((void *)filename->name, ksud_path, sizeof(ksud_path));
-
-	escape_with_root_profile();
-
-	return 0;
-}
-
-int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv,
-			void *envp, int *flags)
-{
-	return ksu_handle_execveat_ksud(fd, filename_ptr, argv, envp, flags);
-}
-
-// dead code: devpts handling
-int __maybe_unused ksu_handle_devpts(struct inode *inode)
-{
-	return 0;
-}
-
-// sucompat: permitted process can execute 'su' to gain root access.
-void ksu_sucompat_init(void)
-{
-	if (ksu_register_feature_handler(&su_compat_handler)) {
-		pr_err("Failed to register su_compat feature handler\n");
-	}
-}
-
-void ksu_sucompat_exit(void)
-{
-	ksu_unregister_feature_handler(KSU_FEATURE_SU_COMPAT);
-}
diff --git a/drivers/kernelsu/sucompat.h b/drivers/kernelsu/sucompat.h
deleted file mode 100644
index de4bcfe037fa..000000000000
--- a/drivers/kernelsu/sucompat.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef __KSU_H_SUCOMPAT
-#define __KSU_H_SUCOMPAT
-#include <linux/types.h>
-
-extern bool ksu_su_compat_enabled;
-
-void ksu_sucompat_init(void);
-void ksu_sucompat_exit(void);
-
-// Handler functions exported for hook_manager
-int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode,
-			 int *__unused_flags);
-int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags);
-int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user,
-			       void *__never_use_argv, void *__never_use_envp,
-			       int *__never_use_flags);
-#endif
diff --git a/drivers/kernelsu/sulog/event.c b/drivers/kernelsu/sulog/event.c
new file mode 100644
index 000000000000..95f532979df5
--- /dev/null
+++ b/drivers/kernelsu/sulog/event.c
@@ -0,0 +1,267 @@
+#define KSU_SULOG_MAX_QUEUED 256U
+#define KSU_SULOG_MAX_PAYLOAD_LEN 2048U
+#define KSU_SULOG_MAX_ARG_STRINGS 0x7FFFFFFF
+#define KSU_SULOG_MAX_ARG_CHUNK 256U
+#define KSU_SULOG_MAX_FILENAME_LEN 256U
+
+static struct ksu_event_queue sulog_queue;
+
+struct ksu_sulog_pending_event { // a captured event that has not been pushed to the queue yet
+	__u16 event_type; // event type value handed to ksu_sulog_capture()
+	void *payload; // kzalloc'ed buffer that starts with a struct ksu_sulog_event
+	__u32 payload_len; // bytes of payload in use: header + filename + argv
+};
+
+struct ksu_sulog_identity { // uid/euid pair that ksu_sulog_set_identity() writes over the sampled values
+	__u32 uid;
+	__u32 euid;
+};
+
+static void ksu_sulog_fill_task_info(struct ksu_sulog_event *event, __u16 event_type, int retval) // fills the fixed header from `current`
+{
+	event->version = KSU_SULOG_EVENT_VERSION;
+	event->event_type = event_type;
+	event->retval = retval; // provisional: ksu_sulog_emit_pending() stores the final value
+	event->pid = task_pid_nr(current);
+	event->tgid = task_tgid_nr(current);
+	event->ppid = task_ppid_nr(current);
+	event->uid = current_uid().val; // uid/euid may be replaced later by ksu_sulog_set_identity()
+	event->euid = current_euid().val;
+	get_task_comm(event->comm, current);
+}
+
+static void ksu_sulog_set_identity(struct ksu_sulog_event *event, const struct ksu_sulog_identity *identity)
+{
+	if (!identity) // no override requested: keep what is already stored in event
+		return;
+	// replace the recorded uid/euid with the caller-supplied identity
+	event->uid = identity->uid;
+	event->euid = identity->euid;
+}
+
+/*
+ * Build a pending sulog event describing the current task.
+ *
+ * GRANT_ROOT and SUCOMPAT events are header-only. Every other type copies
+ * bprm_argv (argv0, its NUL, then the remaining arguments) right behind the
+ * fixed header, truncated so the result fits KSU_SULOG_MAX_PAYLOAD_LEN.
+ *
+ * Returns NULL without touching the drop counter when sulog is disabled or no
+ * argv was supplied; returns NULL after ksu_event_queue_drop() when an
+ * allocation fails or the final length check rejects the event.
+ * The caller owns the result and hands it to ksu_sulog_emit_pending().
+ */
+static struct ksu_sulog_pending_event *ksu_sulog_capture(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp)
+{
+	struct ksu_sulog_pending_event *pending = NULL;
+	struct ksu_sulog_event *event;
+	void *payload = NULL;
+	__u32 payload_len;
+	__u32 filename_len = 0;
+	__u32 argv_len = 0;
+	bool has_strings;
+
+	if (!ksu_sulog_is_enabled())
+		return NULL;
+
+	/* grant-root and sucompat events carry no filename/argv strings */
+	has_strings = event_type != KSU_SULOG_EVENT_IOCTL_GRANT_ROOT &&
+		      event_type != KSU_SULOG_EVENT_SUCOMPAT;
+
+	/* bprm_argv_len is unsigned: a single zero test replaces "!len" plus "len <= 0" */
+	if (has_strings && (!bprm_argv || !bprm_argv_len))
+		return NULL;
+
+	pending = kzalloc(sizeof(*pending), gfp);
+	if (!pending)
+		goto out_drop;
+
+	/* the payload buffer is always allocated at its maximum size */
+	payload = kzalloc(KSU_SULOG_MAX_PAYLOAD_LEN, gfp);
+	if (!payload)
+		goto out_free_pending;
+
+	event = payload;
+	ksu_sulog_fill_task_info(event, event_type, 0);
+
+	if (has_strings) {
+		char *filename_buf = (char *)payload + sizeof(*event);
+		__u32 remaining = KSU_SULOG_MAX_PAYLOAD_LEN - sizeof(*event);
+		size_t actual_copy_len = bprm_argv_len;
+
+		/* always leave one byte for the terminator written below */
+		if (actual_copy_len > remaining - 1)
+			actual_copy_len = remaining - 1;
+
+		memcpy(filename_buf, bprm_argv, actual_copy_len);
+		filename_buf[actual_copy_len] = '\0';
+
+		filename_len = strlen(filename_buf) + 1; // argv0 + null terminator
+
+		/* whatever was copied past argv0's terminator is the argv string */
+		if (actual_copy_len > filename_len)
+			argv_len = actual_copy_len - filename_len;
+	}
+
+	event->filename_len = filename_len;
+	event->argv_len = argv_len;
+
+	payload_len = (__u32)sizeof(*event) + filename_len + argv_len;
+
+	// unlikely: purely defensive, the clamp above already bounds the
+	// string lengths to the space left behind the header
+	if (payload_len > KSU_SULOG_MAX_PAYLOAD_LEN || (__u32)sizeof(*event) > payload_len)
+		goto out_free_payload;
+
+	pending->event_type = event_type;
+	pending->payload = payload;
+	pending->payload_len = payload_len;
+	return pending;
+
+out_free_payload:
+	kfree(payload);
+out_free_pending:
+	kfree(pending);
+out_drop:
+	ksu_event_queue_drop(&sulog_queue);
+	return NULL;
+}
+
+/* Capture a GRANT_ROOT event and stamp it with the caller-supplied uid/euid; NULL if capture fails. */
+static struct ksu_sulog_pending_event *ksu_sulog_capture_grant_root(const struct ksu_sulog_identity *identity, gfp_t gfp)
+{
+	struct ksu_sulog_pending_event *pending;
+
+	/* this event type has no argv: NULL buffer and a length of 0 (the length is a size_t, not a pointer) */
+	pending = ksu_sulog_capture(KSU_SULOG_EVENT_IOCTL_GRANT_ROOT, NULL, 0, gfp);
+	if (!pending)
+		return NULL;
+
+	ksu_sulog_set_identity(pending->payload, identity);
+	return pending;
+}
+
+int ksu_sulog_events_init(void) // sets up the file-wide sulog queue; always returns 0
+{
+	ksu_event_queue_init(&sulog_queue, KSU_SULOG_MAX_QUEUED, KSU_SULOG_MAX_PAYLOAD_LEN);
+	return 0;
+}
+
+void ksu_sulog_events_exit(void) // counterpart of ksu_sulog_events_init()
+{
+	ksu_event_queue_destroy(&sulog_queue);
+}
+
+static void ksu_sulog_free_pending(struct ksu_sulog_pending_event *pending) // frees the payload and the wrapper
+{
+	if (!pending) // NULL is accepted and ignored
+		return;
+	kfree(pending->payload);
+	kfree(pending);
+}
+
+void ksu_sulog_emit_pending(struct ksu_sulog_pending_event *pending, int retval, gfp_t gfp)
+{
+	struct ksu_sulog_event *event;
+
+	if (!pending) // NULL is a no-op, so a failed capture can be passed straight through
+		return;
+	// record the final return value in the header before the event is queued
+	event = pending->payload;
+	event->retval = retval;
+	ksu_event_queue_push(&sulog_queue, pending->event_type, 0, pending->payload, pending->payload_len, gfp); // result is not checked
+	ksu_sulog_free_pending(pending); // pending is always consumed; the queue presumably copies the payload - confirm
+}
+
+/*
+ * Log a grant-root event on behalf of uid/euid with the given result code.
+ * Always returns 0; a failed capture is simply not emitted.
+ */
+static int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp)
+{
+	struct ksu_sulog_identity who = { .uid = uid, .euid = euid };
+	struct ksu_sulog_pending_event *captured;
+
+	if (!ksu_sulog_is_enabled())
+		return 0;
+
+	captured = ksu_sulog_capture_grant_root(&who, gfp);
+	/* ksu_sulog_emit_pending() ignores NULL, so a failed capture needs no extra branch */
+	ksu_sulog_emit_pending(captured, retval, gfp);
+
+	return 0;
+}
+
+/* Capture an event of the given type and queue it right away; always returns 0. */
+static int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp)
+{
+	struct ksu_sulog_pending_event *captured;
+
+	/* early-out before any capture work is attempted */
+	if (!ksu_sulog_is_enabled())
+		return 0;
+
+	captured = ksu_sulog_capture(event_type, bprm_argv, bprm_argv_len, gfp);
+	/* a NULL capture is a no-op inside ksu_sulog_emit_pending() */
+	ksu_sulog_emit_pending(captured, 0, gfp);
+	return 0;
+}
+
+/*
+ * Log a ROOT_EXECVE event for the current task when it runs in the KSU domain.
+ * Up to ARGV_MAX_BPRM bytes of the argument area are copied from userspace and
+ * argv[1..] is joined with spaces (argv0 keeps its NUL). filename is unused.
+ */
+static void ksu_sulog_emit_bprm(const char *filename)
+{
+	if (!ksu_sulog_is_enabled())
+		return;
+
+	// maybe tag the process instead?
+	if (!is_ksu_domain())
+		return;
+
+	if (!current->mm)
+		return;
+
+	unsigned long arg_start = current->mm->arg_start;
+	unsigned long arg_end = current->mm->arg_end;
+
+	// compare before subtracting: the unsigned difference can never be negative
+	if (arg_end <= arg_start)
+		return;
+	size_t arg_len = arg_end - arg_start;
+
+#define ARGV_MAX_BPRM 128
+	char args[ARGV_MAX_BPRM] = {0};
+	size_t argv_copy_len = (arg_len > ARGV_MAX_BPRM) ? ARGV_MAX_BPRM : arg_len;
+
+	// we can't use strncpy here, else it will truncate once it sees \0
+	if (ksu_copy_from_user_retry(args, (void __user *)arg_start, argv_copy_len))
+		return;
+
+	// force termination; this last byte is never turned into a space below
+	char *end = args + argv_copy_len - 1;
+	*end = '\0';
+
+	// we grab strlen of argv0 as that needs to be kept as \0, basically to skip it
+	char *buf = args + strnlen(args, argv_copy_len) + 1;
+
+	// join every remaining argument up to the end of the copy, however long argv0 is
+	while (buf < end) {
+		size_t len = strlen(buf);
+		if (!len || buf + len >= end) // empty argument (as before) or final terminator: stop
+			break;
+		buf[len] = ' ';
+		buf += len + 1;
+	}
+
+	//	this should look like
+	//      /system/bin/sh\0-c sh -c id
+	ksu_sulog_emit(KSU_SULOG_EVENT_ROOT_EXECVE, args, argv_copy_len, GFP_KERNEL);
+}
+
+struct ksu_event_queue *ksu_sulog_get_queue(void) // accessor for the file-static sulog queue
+{
+	return &sulog_queue;
+}
diff --git a/drivers/kernelsu/sulog/event.h b/drivers/kernelsu/sulog/event.h
new file mode 100644
index 000000000000..92563ded6d10
--- /dev/null
+++ b/drivers/kernelsu/sulog/event.h
@@ -0,0 +1,18 @@
+#ifndef __KSU_H_SULOG_EVENT
+#define __KSU_H_SULOG_EVENT
+
+struct ksu_event_queue;
+struct ksu_sulog_pending_event;
+
+int ksu_sulog_events_init(void);
+void ksu_sulog_events_exit(void);
+
+void ksu_sulog_emit_pending(struct ksu_sulog_pending_event *pending, int retval, gfp_t gfp);
+
+static int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp);
+static int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp);
+static void ksu_sulog_emit_bprm(const char *filename);
+
+struct ksu_event_queue *ksu_sulog_get_queue(void);
+
+#endif
diff --git a/drivers/kernelsu/sulog/fd.c b/drivers/kernelsu/sulog/fd.c
new file mode 100644
index 000000000000..70da685e73ea
--- /dev/null
+++ b/drivers/kernelsu/sulog/fd.c
@@ -0,0 +1,83 @@
+static DEFINE_MUTEX(ksu_sulog_fd_lock);
+static bool ksu_sulog_fd_active;
+
+static ssize_t ksu_sulog_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	return ksu_event_queue_read(ksu_sulog_get_queue(), buf, count, file->f_flags);
+}
+
+static unsigned __bitwise ksu_sulog_poll(struct file *file, poll_table *wait)
+{
+	return ksu_event_queue_poll(ksu_sulog_get_queue(), file, wait);
+}
+
+static int ksu_sulog_release(struct inode *inode, struct file *file)
+{
+	mutex_lock(&ksu_sulog_fd_lock);
+	ksu_sulog_fd_active = false;
+	mutex_unlock(&ksu_sulog_fd_lock);
+
+	pr_info("sulog: fd released\n");
+	return 0;
+}
+
+static const struct file_operations ksu_sulog_fops = {
+	.owner = THIS_MODULE,
+	.read = ksu_sulog_read,
+	.poll = ksu_sulog_poll,
+	.release = ksu_sulog_release,
+	.llseek = noop_llseek,
+};
+
+int ksu_install_sulog_fd(void)
+{
+	struct file *filp;
+	int fd;
+	// Create the read-only "[ksu_sulog]" anon-inode fd for the caller; returns the fd or a negative errno.
+	mutex_lock(&ksu_sulog_fd_lock);
+	// only one sulog fd may be live at a time; ksu_sulog_release() clears the flag again
+	if (ksu_sulog_fd_active) {
+		fd = -EBUSY;
+		goto out_unlock;
+	}
+	// no new reader once the event queue has been marked closed
+	if (READ_ONCE(ksu_sulog_get_queue()->closed)) {
+		fd = -EPIPE;
+		goto out_unlock;
+	}
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		goto out_unlock;
+
+	filp = anon_inode_getfile("[ksu_sulog]", &ksu_sulog_fops, NULL, O_RDONLY | O_CLOEXEC);
+	if (IS_ERR(filp)) {
+		put_unused_fd(fd);
+		fd = PTR_ERR(filp);
+		goto out_unlock;
+	}
+	// claim the single slot and publish the fd while still holding ksu_sulog_fd_lock
+	ksu_sulog_fd_active = true;
+	fd_install(fd, filp);
+	pr_info("sulog: fd installed %d for pid %d\n", fd, current->pid);
+
+out_unlock:
+	mutex_unlock(&ksu_sulog_fd_lock);
+	return fd;
+}
+
+void ksu_sulog_fd_init(void)
+{
+	mutex_lock(&ksu_sulog_fd_lock);
+	ksu_sulog_fd_active = false;
+	mutex_unlock(&ksu_sulog_fd_lock);
+}
+
+void ksu_sulog_fd_exit(void)
+{
+	mutex_lock(&ksu_sulog_fd_lock);
+	ksu_sulog_fd_active = false;
+	mutex_unlock(&ksu_sulog_fd_lock);
+
+	ksu_event_queue_close(ksu_sulog_get_queue());
+}
diff --git a/drivers/kernelsu/sulog/fd.h b/drivers/kernelsu/sulog/fd.h
new file mode 100644
index 000000000000..6a117fedc0a9
--- /dev/null
+++ b/drivers/kernelsu/sulog/fd.h
@@ -0,0 +1,8 @@
+#ifndef __KSU_H_SULOG_FD
+#define __KSU_H_SULOG_FD
+
+int ksu_install_sulog_fd(void);
+void ksu_sulog_fd_init(void);
+void ksu_sulog_fd_exit(void);
+
+#endif
diff --git a/drivers/kernelsu/supercall/dispatch.c b/drivers/kernelsu/supercall/dispatch.c
new file mode 100644
index 000000000000..2ea7d8b4cbff
--- /dev/null
+++ b/drivers/kernelsu/supercall/dispatch.c
@@ -0,0 +1,729 @@
+static int do_grant_root(void __user *arg)
+{
+	int ret;
+	__u32 audit_uid = current_uid().val;
+	__u32 audit_euid = current_euid().val;
+
+	// we already check uid above on allowed_for_su()
+
+	write_sulog('i'); // log ioctl escalation
+
+	pr_info("allow root for: %d\n", audit_uid);
+	ret = escape_with_root_profile();
+
+#ifdef CONFIG_KSU_FEATURE_SULOG
+	ksu_sulog_emit_grant_root(ret, audit_uid, audit_euid, GFP_KERNEL);
+#endif
+	return ret;
+}
+
+static uint32_t ksuver_override = 0;
+static uint32_t ksuflags_override = 0;
+
+static int do_get_info(void __user *arg)
+{
+	struct ksu_get_info_cmd cmd = {.version = KERNEL_SU_VERSION, .flags = 0};
+
+	// NOTE: we do not have LKM support so we don't bother with its flags or late-load
+	if (is_manager()) {
+		cmd.flags |= KSU_GET_INFO_FLAG_MANAGER;
+	}
+	cmd.features = KSU_FEATURE_MAX;
+
+	if (ksuver_override)
+		cmd.version = ksuver_override;
+
+	if (ksuflags_override)
+		cmd.flags = ksuflags_override;
+
+	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
+		pr_err("get_version: copy_to_user failed\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int do_report_event(void __user *arg)
+{
+	struct ksu_report_event_cmd cmd;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		return -EFAULT;
+	}
+
+	switch (cmd.event) {
+	case EVENT_POST_FS_DATA: {
+		static bool post_fs_data_lock = false;
+		if (!post_fs_data_lock) {
+			post_fs_data_lock = true;
+			pr_info("post-fs-data triggered\n");
+			on_post_fs_data();
+		}
+		break;
+	}
+	case EVENT_BOOT_COMPLETED: {
+		static bool boot_complete_lock = false;
+		if (!boot_complete_lock) {
+			boot_complete_lock = true;
+			pr_info("boot_complete triggered\n");
+			on_boot_completed();
+		}
+		break;
+	}
+	case EVENT_MODULE_MOUNTED: {
+		ksu_module_mounted = true;
+		pr_info("module mounted!\n");
+		on_module_mounted();
+		break;
+	}
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int do_set_sepolicy(void __user *arg)
+{
+	struct ksu_set_sepolicy_cmd cmd;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		return -EFAULT;
+	}
+
+	return handle_sepolicy((void __user *)cmd.data, cmd.data_len);
+}
+
+static int do_check_safemode(void __user *arg)
+{
+	struct ksu_check_safemode_cmd cmd;
+
+	cmd.in_safe_mode = ksu_is_safe_mode();
+
+	if (cmd.in_safe_mode) {
+		pr_warn("safemode enabled!\n");
+	}
+
+	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
+		pr_err("check_safemode: copy_to_user failed\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+// Common handler for the NEW_GET_ALLOW_LIST / NEW_GET_DENY_LIST ioctls: takes the
+// caller's capacity from cmd.count, fills a kernel array via ksu_get_allow_list(),
+// then copies the updated header and cmd.count uids back. Returns 0 or -errno.
+static int do_new_get_allow_list_common(void __user *arg, bool allow)
+{
+	struct ksu_new_get_allow_list_cmd cmd;
+	int *arr = NULL;
+	int err = 0;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd)))
+		return -EFAULT;
+
+	if (cmd.count) {
+		// cmd.count comes from userspace; kmalloc_array() is the
+		// overflow-checked way to size count * sizeof(int)
+		arr = kmalloc_array(cmd.count, sizeof(int), GFP_KERNEL);
+		if (!arr)
+			return -ENOMEM;
+	}
+
+	bool success = ksu_get_allow_list(arr, cmd.count, &cmd.count, &cmd.total_count, allow);
+
+	if (!success) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
+		pr_err("new_get_allow_list: copy_to_user count failed\n");
+		err = -EFAULT;
+		goto out;
+	}
+
+	if (cmd.count && copy_to_user(&((struct ksu_new_get_allow_list_cmd *)arg)->uids, arr, sizeof(int) * cmd.count)) {
+		pr_err("new_get_allow_list: copy_to_user uids failed\n");
+		err = -EFAULT;
+	}
+out:
+	kfree(arr); // kfree(NULL) is a no-op
+	return err;
+}
+
+static int do_new_get_deny_list(void __user *arg)
+{
+	return do_new_get_allow_list_common(arg, false);
+}
+
+static int do_new_get_allow_list(void __user *arg)
+{
+	return do_new_get_allow_list_common(arg, true);
+}
+
+static int do_get_allow_list_common(void __user *arg, bool allow)
+{
+	int *arr = NULL;
+	int err = 0;
+	u16 count;
+	u32 out_count;
+	static const u16 kSize = 128;
+	// Legacy list ioctl: at most kSize uids are fetched into a temporary kernel array.
+	arr = kmalloc(sizeof(int) * kSize, GFP_KERNEL);
+	if (!arr) {
+		return -ENOMEM;
+	}
+
+	bool success = ksu_get_allow_list(arr, kSize, &count, NULL, allow);
+
+	if (!success) {
+		err = -EFAULT;
+		goto out;
+	}
+	// the user-visible count field is written as a u32, so widen the u16 first
+	out_count = count;
+
+	if (copy_to_user(arg + offsetof(struct ksu_get_allow_list_cmd, count),
+					 &out_count, sizeof(u32))) {
+		pr_err("get_allow_list: copy_to_user count failed\n");
+		err = -EFAULT;
+		goto out;
+	}
+	// uids go to offset 0 of the user struct - presumably uids is its first member; verify against the header
+	if (copy_to_user(arg, arr, sizeof(u32) * count)) {
+		pr_err("get_allow_list: copy_to_user uids failed\n");
+		err = -EFAULT;
+	}
+
+out:
+	if (arr) {
+		kfree(arr);
+	}
+	return err;
+}
+
+static int do_get_deny_list(void __user *arg)
+{
+	return do_get_allow_list_common(arg, false);
+}
+
+static int do_get_allow_list(void __user *arg)
+{
+	return do_get_allow_list_common(arg, true);
+}
+
+static int do_uid_granted_root(void __user *arg)
+{
+	struct ksu_uid_granted_root_cmd cmd;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		return -EFAULT;
+	}
+
+	cmd.granted = ksu_is_allow_uid_for_current(cmd.uid);
+
+	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
+		pr_err("uid_granted_root: copy_to_user failed\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int do_uid_should_umount(void __user *arg)
+{
+	struct ksu_uid_should_umount_cmd cmd;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		return -EFAULT;
+	}
+
+	cmd.should_umount = ksu_uid_should_umount(cmd.uid);
+
+	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
+		pr_err("uid_should_umount: copy_to_user failed\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int do_get_manager_appid(void __user *arg)
+{
+	struct ksu_get_manager_appid_cmd cmd;
+
+	cmd.appid = ksu_get_manager_appid();
+
+	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
+		pr_err("get_manager_appid: copy_to_user failed\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int do_get_app_profile(void __user *arg)
+{
+	uid_t uid;
+	struct app_profile *profile;
+	int ret = 0;
+
+	if (copy_from_user(&uid, (char __user *)arg + offsetof(struct ksu_get_app_profile_cmd, profile.curr_uid), sizeof(uid_t))) {
+		pr_err("get_app_profile: copy_from_user failed\n");
+		return -EFAULT;
+	}
+
+	rcu_read_lock();
+	profile = ksu_get_app_profile(uid);
+	rcu_read_unlock();
+	if (!profile) {
+		ret = -ENOENT;
+	} else {
+		if (copy_to_user((char __user *)arg + offsetof(struct ksu_get_app_profile_cmd, profile), profile, sizeof(struct app_profile))) {
+			pr_err("get_app_profile: copy_to_user failed\n");
+			ret = -EFAULT;
+		}
+		ksu_put_app_profile(profile);
+	}
+	return ret;
+}
+
+static int do_set_app_profile(void __user *arg)
+{
+	struct ksu_set_app_profile_cmd cmd;
+	int ret;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		pr_err("set_app_profile: copy_from_user failed\n");
+		return -EFAULT;
+	}
+
+	ret = ksu_set_app_profile(&cmd.profile);
+	if (!ret) {
+		ksu_persistent_allow_list();
+	}
+
+	return ret;
+}
+
+static int do_get_feature(void __user *arg)
+{
+	struct ksu_get_feature_cmd cmd;
+	bool supported;
+	int ret;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		pr_err("get_feature: copy_from_user failed\n");
+		return -EFAULT;
+	}
+
+	ret = ksu_get_feature(cmd.feature_id, &cmd.value, &supported);
+	cmd.supported = supported ? 1 : 0;
+
+	if (ret && supported) {
+		pr_err("get_feature: failed for feature %u: %d\n", cmd.feature_id, ret);
+		return ret;
+	}
+
+	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
+		pr_err("get_feature: copy_to_user failed\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int do_set_feature(void __user *arg)
+{
+	struct ksu_set_feature_cmd cmd;
+	int ret;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		pr_err("set_feature: copy_from_user failed\n");
+		return -EFAULT;
+	}
+
+	ret = ksu_set_feature(cmd.feature_id, cmd.value);
+	if (ret) {
+		pr_err("set_feature: failed for feature %u: %d\n", cmd.feature_id, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int do_get_wrapper_fd(void __user *arg) {
+	if (!ksu_file_sid) {
+		return -EINVAL;
+	}
+
+	struct ksu_get_wrapper_fd_cmd cmd;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		pr_err("get_wrapper_fd: copy_from_user failed\n");
+		return -EFAULT;
+	}
+
+	return ksu_install_file_wrapper(cmd.fd);
+}
+
+// Report the seccomp state of the task with the given pid.
+// BRICKPORT: despite the "mark" in the name, this port returns 1 when the
+// task has seccomp disabled (mode == 0) and 0 when seccomp is enabled.
+// Returns -ESRCH when no task with that pid is found.
+static int ksu_get_task_mark(pid_t pid)
+{
+	struct task_struct *target;
+	int status;
+
+	// the task is looked up and read inside a single RCU read-side section
+	rcu_read_lock();
+	target = find_task_by_vpid(pid);
+	if (target)
+		status = !target->seccomp.mode;
+	else
+		status = -ESRCH;
+	rcu_read_unlock();
+
+	return status;
+}
+
+static int do_manage_mark(void __user *arg)
+{
+	struct ksu_manage_mark_cmd cmd;
+	int ret = 0;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		pr_err("manage_mark: copy_from_user failed\n");
+		return -EFAULT;
+	}
+
+	switch (cmd.operation) {
+		case KSU_MARK_GET: {
+			// on this one, we return seccomp status of a pid instead
+			// at the very least we have partial featureset
+			ret = ksu_get_task_mark(cmd.pid);
+			if (ret < 0) {
+			    pr_err("manage_mark: get failed for pid %d: %d\n", cmd.pid, ret);
+			    return ret;
+			}
+			cmd.result = (u32)ret;
+			break;
+		}
+#if 0 // TODO: revisit this sometime
+		case KSU_MARK_MARK: { break; }
+		case KSU_MARK_UNMARK: { break; }
+		case KSU_MARK_REFRESH: { break; }
+#endif
+		default: {
+			pr_err("manage_mark: invalid operation %u\n", cmd.operation);
+			return -EINVAL;
+		}
+	}
+
+	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
+		pr_err("manage_mark: copy_to_user failed\n");
+		return -EFAULT;
+	}
+
+
+	return 0;
+}
+
+// Copy the mount path named by cmd.arg from user space and pass it to nuke_ext4_sysfs().
+static int do_nuke_ext4_sysfs(void __user *arg)
+{
+	struct ksu_nuke_ext4_sysfs_cmd cmd;
+	char mnt[256];
+	long ret;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd)))
+		return -EFAULT;
+
+	if (!cmd.arg)
+		return -EINVAL;
+
+	memset(mnt, 0, sizeof(mnt));
+
+	ret = strncpy_from_user(mnt, (void __user *)cmd.arg, sizeof(mnt));
+	if (ret < 0) {
+		pr_err("nuke ext4 copy mnt failed: %ld\n", ret);
+		return -EFAULT;   // or return ret;
+	}
+	// a return equal to the buffer size means no NUL fit: the path was cut off
+	if (ret == sizeof(mnt)) {
+		pr_err("nuke ext4 mnt path too long\n");
+		return -ENAMETOOLONG;
+	}
+
+	pr_info("do_nuke_ext4_sysfs: %s\n", mnt);
+	return nuke_ext4_sysfs(mnt);
+}
+
+struct list_head mount_list = LIST_HEAD_INIT(mount_list);
+DECLARE_RWSEM(mount_list_lock);
+
+// Maintain the try-umount path list according to cmd.mode:
+//   WIPE clears it, ADD appends a unique path, DEL removes matching paths,
+//   GETSIZE reports the number of bytes GETLIST will write, GETLIST copies
+//   every path (each with its NUL terminator) to the user buffer at cmd.arg.
+// Returns 0 on success or a negative errno.
+static int add_try_umount(void __user *arg)
+{
+	struct mount_entry *new_entry, *entry, *tmp;
+	struct ksu_add_try_umount_cmd cmd;
+	char buf[256] = {0};
+
+	if (copy_from_user(&cmd, arg, sizeof cmd))
+		return -EFAULT;
+
+	switch (cmd.mode) {
+		case KSU_UMOUNT_WIPE: {
+			down_write(&mount_list_lock);
+			list_for_each_entry_safe(entry, tmp, &mount_list, list) {
+				pr_info("wipe_umount_list: removing entry: %s\n", entry->umountable);
+				list_del(&entry->list);
+				kfree(entry->umountable);
+				kfree(entry);
+			}
+			up_write(&mount_list_lock);
+
+			return 0;
+		}
+
+		case KSU_UMOUNT_ADD: {
+			long len = strncpy_from_user(buf, (const char __user *)cmd.arg, sizeof(buf));
+			if (len <= 0)
+				return -EFAULT;
+			if (len == sizeof(buf)) // no NUL fit: refuse instead of storing a truncated path
+				return -ENAMETOOLONG;
+
+			new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
+			if (!new_entry)
+				return -ENOMEM;
+
+			new_entry->umountable = kstrdup(buf, GFP_KERNEL);
+			if (!new_entry->umountable) {
+				kfree(new_entry);
+				return -ENOMEM;
+			}
+
+			down_write(&mount_list_lock);
+
+			// disallow dupes
+			// if this gets too many, we can consider moving this whole task to a kthread
+			list_for_each_entry(entry, &mount_list, list) {
+				if (!strcmp(entry->umountable, buf)) {
+					pr_info("cmd_add_try_umount: %s is already here!\n", buf);
+					up_write(&mount_list_lock);
+					kfree(new_entry->umountable);
+					kfree(new_entry);
+					return -EEXIST;
+				}
+			}
+
+			// flags are stored as given; 0 simply means no flags
+			new_entry->flags = cmd.flags;
+
+			list_add(&new_entry->list, &mount_list);
+			up_write(&mount_list_lock);
+			pr_info("cmd_add_try_umount: %s added!\n", buf);
+
+			return 0;
+		}
+
+		// this is just strcmp'd wipe anyway
+		case KSU_UMOUNT_DEL: {
+			long len = strncpy_from_user(buf, (const char __user *)cmd.arg, sizeof(buf));
+			if (len <= 0)
+				return -EFAULT;
+			if (len == sizeof(buf)) // a truncated path must not match a shorter entry
+				return -ENAMETOOLONG;
+
+			down_write(&mount_list_lock);
+			list_for_each_entry_safe(entry, tmp, &mount_list, list) {
+				if (!strcmp(entry->umountable, buf)) {
+					pr_info("cmd_add_try_umount: entry removed: %s\n", entry->umountable);
+					list_del(&entry->list);
+					kfree(entry->umountable);
+					kfree(entry);
+				}
+			}
+			up_write(&mount_list_lock);
+
+			return 0;
+		}
+
+		// this way userspace can deduce the memory it has to prepare.
+		case KSU_UMOUNT_GETSIZE: {
+			// check for pointer first
+			if (!cmd.arg)
+				return -EFAULT;
+
+			size_t total_size = 0; // size of list in bytes
+
+			down_read(&mount_list_lock);
+			list_for_each_entry(entry, &mount_list, list) {
+				total_size = total_size + strlen(entry->umountable) + 1; // + 1 for \0
+			}
+			up_read(&mount_list_lock);
+
+			pr_info("cmd_add_try_umount: total_size: %zu\n", total_size);
+
+			if (copy_to_user((size_t __user *)cmd.arg, &total_size, sizeof(total_size)))
+				return -EFAULT;
+
+			return 0;
+		}
+
+		// WARNING! this is straight up pointerwalking.
+		// this way we dont need to redefine the ioctl defs.
+		// this also avoids us needing to kmalloc
+		// userspace have to send pointer to memory (malloc/alloca) or pointer to a VLA.
+		// NOTE: nothing bounds the write here; the caller must size the buffer from GETSIZE.
+		case KSU_UMOUNT_GETLIST: {
+			if (!cmd.arg)
+				return -EFAULT;
+
+			char __user *user_buf = (char __user *)cmd.arg;
+
+			down_read(&mount_list_lock);
+			list_for_each_entry(entry, &mount_list, list) {
+				pr_info("cmd_add_try_umount: entry: %s\n", entry->umountable);
+
+				if (copy_to_user(user_buf, entry->umountable, strlen(entry->umountable) + 1 )) {
+					up_read(&mount_list_lock);
+					return -EFAULT;
+				}
+
+				// walk it! +1 for null terminator
+				user_buf = user_buf + strlen(entry->umountable) + 1;
+			}
+			up_read(&mount_list_lock);
+
+			return 0;
+		}
+
+		default: {
+			pr_err("cmd_add_try_umount: invalid operation %u\n", cmd.mode);
+			return -EINVAL;
+		}
+
+	} // switch(cmd.mode)
+
+	return 0;
+}
+
+static int do_set_init_pgrp(void __user *arg)
+{
+	int err;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0)
+	struct pid *pids[PIDTYPE_MAX] = { 0 };
+#endif
+	write_lock_irq(&tasklist_lock);
+	struct task_struct *p = current->group_leader;
+	struct pid *init_group = task_pgrp(&init_task);
+	// Move the caller's thread-group leader into init_task's process group; arg is unused.
+	err = -EPERM;
+	if (task_session(p) != task_session(&init_task))
+		goto out;
+	// same session as init_task: succeed, changing the pgrp only if it differs
+	err = 0;
+	if (task_pgrp(p) != init_group) {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0)
+		change_pid(pids, p, PIDTYPE_PGID, init_group);
+#else
+		change_pid(p, PIDTYPE_PGID, init_group);
+#endif
+	}
+out:
+	write_unlock_irq(&tasklist_lock);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0)
+	free_pids(pids);
+#endif
+	return err;
+}
+
+static int do_get_sulog_fd(void __user *arg)
+{
+	struct ksu_get_sulog_fd_cmd cmd;
+
+	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
+		pr_err("get_sulog_fd: copy_from_user failed\n");
+		return -EFAULT;
+	}
+
+	if (cmd.flags) {
+		pr_err("get_sulog_fd: unsupported flags 0x%x\n", cmd.flags);
+		return -EINVAL;
+	}
+
+	return ksu_install_sulog_fd();
+}
+
+// IOCTL handlers mapping table
+static const struct ksu_ioctl_cmd_map ksu_ioctl_handlers[] = {
+	{ .cmd = KSU_IOCTL_GRANT_ROOT, .name = "GRANT_ROOT", .handler = do_grant_root, .perm_check = allowed_for_su },
+	{ .cmd = KSU_IOCTL_GET_INFO, .name = "GET_INFO", .handler = do_get_info, .perm_check = always_allow },
+	{ .cmd = KSU_IOCTL_REPORT_EVENT, .name = "REPORT_EVENT", .handler = do_report_event, .perm_check = only_root },
+	{ .cmd = KSU_IOCTL_SET_SEPOLICY, .name = "SET_SEPOLICY", .handler = do_set_sepolicy, .perm_check = only_root },
+	{ .cmd = KSU_IOCTL_CHECK_SAFEMODE, .name = "CHECK_SAFEMODE", .handler = do_check_safemode, .perm_check = always_allow },
+	{ .cmd = KSU_IOCTL_GET_ALLOW_LIST, .name = "GET_ALLOW_LIST", .handler = do_get_allow_list, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_GET_DENY_LIST, .name = "GET_DENY_LIST", .handler = do_get_deny_list, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_NEW_GET_ALLOW_LIST, .name = "NEW_GET_ALLOW_LIST", .handler = do_new_get_allow_list, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_NEW_GET_DENY_LIST, .name = "NEW_GET_DENY_LIST", .handler = do_new_get_deny_list, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_UID_GRANTED_ROOT, .name = "UID_GRANTED_ROOT", .handler = do_uid_granted_root, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_UID_SHOULD_UMOUNT, .name = "UID_SHOULD_UMOUNT", .handler = do_uid_should_umount, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_GET_MANAGER_APPID, .name = "GET_MANAGER_APPID", .handler = do_get_manager_appid, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_GET_APP_PROFILE, .name = "GET_APP_PROFILE", .handler = do_get_app_profile, .perm_check = only_manager },
+	{ .cmd = KSU_IOCTL_SET_APP_PROFILE, .name = "SET_APP_PROFILE", .handler = do_set_app_profile, .perm_check = only_manager },
+	{ .cmd = KSU_IOCTL_GET_FEATURE, .name = "GET_FEATURE", .handler = do_get_feature, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_SET_FEATURE, .name = "SET_FEATURE", .handler = do_set_feature, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_GET_WRAPPER_FD, .name = "GET_WRAPPER_FD", .handler = do_get_wrapper_fd, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_MANAGE_MARK, .name = "MANAGE_MARK", .handler = do_manage_mark, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_NUKE_EXT4_SYSFS, .name = "NUKE_EXT4_SYSFS", .handler = do_nuke_ext4_sysfs, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_ADD_TRY_UMOUNT, .name = "ADD_TRY_UMOUNT", .handler = add_try_umount, .perm_check = manager_or_root },
+	{ .cmd = KSU_IOCTL_SET_INIT_PGRP, .name = "SET_INIT_PGRP", .handler = do_set_init_pgrp, .perm_check = only_root },
+	{ .cmd = KSU_IOCTL_GET_SULOG_FD, .name = "GET_SULOG_FD", .handler = do_get_sulog_fd, .perm_check = only_root },
+	{ .cmd = 0, .name = NULL, .handler = NULL, .perm_check = NULL } // Sentinel
+};
+
+// Look up cmd in ksu_ioctl_handlers, run its permission check, then its handler.
+// Returns the handler's result, -EPERM if the check fails, -ENOTTY if cmd is unknown.
+long ksu_supercall_handle_ioctl(unsigned int cmd, void __user *argp)
+{
+	const struct ksu_ioctl_cmd_map *entry;
+
+#ifdef CONFIG_KSU_DEBUG
+	pr_info("ksu ioctl: cmd=0x%x from uid=%d\n", cmd, current_uid().val);
+#endif
+	// the table ends with a sentinel whose handler is NULL
+	for (entry = ksu_ioctl_handlers; entry->handler; entry++) {
+		if (entry->cmd != cmd)
+			continue;
+		if (entry->perm_check && !entry->perm_check()) {
+			pr_warn("ksu ioctl: permission denied for cmd=0x%x uid=%d\n", cmd, current_uid().val);
+			return -EPERM;
+		}
+		return entry->handler(argp);
+	}
+
+	pr_warn("ksu ioctl: unsupported command 0x%x\n", cmd);
+	return -ENOTTY;
+}
+
+void __init ksu_supercall_dump_commands(void)
+{
+	int i;
+
+	pr_info("KernelSU IOCTL Commands:\n");
+	for (i = 0; ksu_ioctl_handlers[i].handler; i++) {
+		pr_info("  %-18s = 0x%08x\n", ksu_ioctl_handlers[i].name, ksu_ioctl_handlers[i].cmd);
+	}
+}
+
+void ksu_supercall_cleanup_state(void) {}
diff --git a/drivers/kernelsu/supercall/internal.h b/drivers/kernelsu/supercall/internal.h
new file mode 100644
index 000000000000..5287f2e5affe
--- /dev/null
+++ b/drivers/kernelsu/supercall/internal.h
@@ -0,0 +1,14 @@
+#ifndef __KSU_H_SUPERCALL_INTERNAL
+#define __KSU_H_SUPERCALL_INTERNAL
+
+bool only_manager(void);
+bool only_root(void);
+bool manager_or_root(void);
+bool always_allow(void);
+bool allowed_for_su(void);
+
+long ksu_supercall_handle_ioctl(unsigned int cmd, void __user *argp);
+void ksu_supercall_dump_commands(void);
+void ksu_supercall_cleanup_state(void);
+
+#endif // __KSU_H_SUPERCALL_INTERNAL
diff --git a/drivers/kernelsu/supercall/perm.c b/drivers/kernelsu/supercall/perm.c
new file mode 100644
index 000000000000..89b674885072
--- /dev/null
+++ b/drivers/kernelsu/supercall/perm.c
@@ -0,0 +1,25 @@
+bool only_manager(void)
+{
+	return is_manager();
+}
+
+bool only_root(void)
+{
+	return current_uid().val == 0;
+}
+
+bool manager_or_root(void)
+{
+	return current_uid().val == 0 || is_manager();
+}
+
+bool always_allow(void)
+{
+	return true;
+}
+
+bool allowed_for_su(void)
+{
+	// permission gate used by GRANT_ROOT: the manager, or a uid the allow-list check accepts
+	return is_manager() || ksu_is_allow_uid_for_current(current_uid().val);
+}
diff --git a/drivers/kernelsu/supercall/supercall.c b/drivers/kernelsu/supercall/supercall.c
new file mode 100644
index 000000000000..9bfd347d3d2c
--- /dev/null
+++ b/drivers/kernelsu/supercall/supercall.c
@@ -0,0 +1,212 @@
+static int anon_ksu_release(struct inode *inode, struct file *filp)
+{
+	pr_info("ksu fd released\n");
+	return 0;
+}
+
+static long anon_ksu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	return ksu_supercall_handle_ioctl(cmd, (void __user *)arg);
+}
+
+// File operations structure
+static const struct file_operations anon_ksu_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = anon_ksu_ioctl,
+	.compat_ioctl = anon_ksu_ioctl,
+	.release = anon_ksu_release,
+};
+
+// Install KSU fd to current process
+int ksu_install_fd(void)
+{
+	struct file *filp;
+	int fd;
+
+	// Get unused fd
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		pr_err("ksu_install_fd: failed to get unused fd\n");
+		return fd;
+	}
+
+	// Create anonymous inode file
+	filp = anon_inode_getfile("[ksu_driver]", &anon_ksu_fops, NULL, O_RDWR | O_CLOEXEC);
+	if (IS_ERR(filp)) {
+		pr_err("ksu_install_fd: failed to create anon inode file\n");
+		put_unused_fd(fd);
+		return PTR_ERR(filp);
+	}
+
+	// Install fd
+	fd_install(fd, filp);
+
+	pr_info("ksu fd installed: %d for pid %d\n", fd, current->pid);
+
+	return fd;
+}
+
+static inline int ksu_handle_fd_request(void __user *arg4)
+{
+	// Reply with the new fd, or with the negative errno if installation failed.
+	int fd = ksu_install_fd();
+	pr_info("[%d] install ksu fd: %d\n", current->pid, fd);
+	if (copy_to_user(arg4, &fd, sizeof(fd))) {
+		pr_err("install ksu fd reply err\n");
+		if (fd >= 0) // nothing to close when ksu_install_fd() failed
+			close_fd(fd);
+	}
+	return 0;
+}
+
+// reboot(2) side channel, always returns 0. downstream: arg is passed by reference so this can be extended.
+int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg)
+{
+	if (magic1 != KSU_INSTALL_MAGIC1)
+		return 0;
+
+	// when ternary on fmt?
+	// cold syscall, we can splurge xD
+	if (magic2 == KSU_INSTALL_MAGIC2)
+		pr_info("sys_reboot: magic: 0x%x id: 0x%x pid: %d comm: %s \n", magic1, magic2, current->pid, current->comm);
+	else
+		pr_info("sys_reboot: magic: 0x%x id: %d pid: %d comm: %s \n", magic1, magic2, current->pid, current->comm);
+
+	// arg4 = (unsigned long)PT_REGS_SYSCALL_PARM4(real_regs);
+	// downstream: dereference arg as arg4 so we can be inline to upstream
+	void __user *arg4 = (void __user *)*arg;
+
+	// Check if this is a request to install KSU fd
+	if (magic2 == KSU_INSTALL_MAGIC2) {
+		return ksu_handle_fd_request(arg4);
+	}
+
+	// only root is allowed for these commands
+	if (current_uid().val != 0)
+		return 0;
+
+	// extensions: each one confirms by writing the user pointer's own value back to where it points
+	u64 reply = (u64)*arg;
+
+	if (magic2 == CHANGE_MANAGER_UID) {
+		pr_info("sys_reboot: ksu_set_manager_appid to: %d\n", cmd);
+		ksu_set_manager_appid(cmd);
+
+		if (cmd == ksu_get_manager_appid()) {
+			if (copy_to_user((void __user *)*arg, &reply, sizeof(reply)))
+				pr_info("sys_reboot: reply fail\n");
+		}
+
+		return 0;
+	}
+
+	if (magic2 == GET_SULOG_DUMP_V2) {
+		// the confirmation is only written when send_sulog_dump() returns 0
+		int ret = send_sulog_dump(*arg);
+		if (ret)
+			return 0;
+
+		if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) ))
+			return 0;
+	}
+
+	if (magic2 == CHANGE_KSUVER) {
+		pr_info("sys_reboot: ksu_change_ksuver to: %d\n", cmd);
+		ksuver_override = cmd;
+
+		if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) ))
+			return 0;
+	}
+
+	// WARNING!!! triple ptr zone! ***
+	// https://wiki.c2.com/?ThreeStarProgrammer
+	if (magic2 == CHANGE_SPOOF_UNAME) {
+		// payload: *arg -> user pointer -> user pointer -> "release\0version\0"
+		char release_buf[65];
+		char version_buf[65];
+		static char original_release_buf[65] = {0};
+		static char original_version_buf[65] = {0};
+
+		// basically void * void __user * void __user *arg
+		void ***ppptr = (void ***)(uintptr_t)arg;
+
+		// user pointer storage
+		// init this as zero so this works on 32-on-64 compat (LE)
+		uint64_t u_pptr = 0;
+		uint64_t u_ptr = 0;
+
+		pr_info("sys_reboot: ppptr: 0x%lx \n", (uintptr_t)ppptr);
+
+		// arg here is ***, dereference to pull out **
+		if (copy_from_user(&u_pptr, (void __user *)*ppptr, sizeof(u_pptr)))
+			return 0;
+
+		pr_info("sys_reboot: u_pptr: 0x%lx \n", (uintptr_t)u_pptr);
+
+		// now we got the __user **
+		// we cannot dereference this as this is __user
+		// we just do another copy_from_user to get it
+		if (copy_from_user(&u_ptr, (void __user *)u_pptr, sizeof(u_ptr)))
+			return 0;
+
+		pr_info("sys_reboot: u_ptr: 0x%lx \n", (uintptr_t)u_ptr);
+
+		// for release
+		if (strncpy_from_user(release_buf, (char __user *)u_ptr, sizeof(release_buf)) < 0)
+			return 0;
+		release_buf[sizeof(release_buf) - 1] = '\0';
+
+		// for version
+		if (strncpy_from_user(version_buf, (char __user *)(u_ptr + strlen(release_buf) + 1), sizeof(version_buf)) < 0)
+			return 0;
+		version_buf[sizeof(version_buf) - 1] = '\0';
+
+		if (original_release_buf[0] == '\0') {
+			struct new_utsname *u_curr = utsname();
+			// we save current version as the original before modifying
+			strncpy(original_release_buf, u_curr->release, sizeof(original_release_buf));
+			strncpy(original_version_buf, u_curr->version, sizeof(original_version_buf));
+			pr_info("sys_reboot: original uname saved: %s %s\n", original_release_buf, original_version_buf);
+		}
+
+		// so user can reset
+		if (!strcmp(release_buf, "default")) {
+			memcpy(release_buf, original_release_buf, sizeof(release_buf));
+		}
+		if (!strcmp(version_buf, "default")) {
+			memcpy(version_buf, original_version_buf, sizeof(version_buf));
+		}
+
+		pr_info("sys_reboot: spoofing kernel to: %s - %s\n", release_buf, version_buf);
+
+		struct new_utsname *u = utsname();
+
+		down_write(&uts_sem);
+		strncpy(u->release, release_buf, sizeof(u->release));
+		strncpy(u->version, version_buf, sizeof(u->version));
+		up_write(&uts_sem);
+
+		// we write our confirmation on **
+		if (copy_to_user((void __user *)*arg, &reply, sizeof(reply)))
+			return 0;
+	}
+
+	if (magic2 == CHANGE_KSUFLAGS) {
+		pr_info("sys_reboot: ksu_change_ksuflags to: %d\n", cmd);
+		ksuflags_override = cmd;
+
+		if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) ))
+			return 0;
+	}
+
+	return 0;
+}
+
+void __init ksu_supercalls_init(void)
+{
+	ksu_supercall_dump_commands();
+	
+	tiny_sulog_init_heap(); // grab heap memory for sulog
+}
+
+void __exit ksu_supercalls_exit(void) { }
diff --git a/drivers/kernelsu/supercall/supercall.h b/drivers/kernelsu/supercall/supercall.h
new file mode 100644
index 000000000000..1c9e5a0a27ed
--- /dev/null
+++ b/drivers/kernelsu/supercall/supercall.h
@@ -0,0 +1,32 @@
+#ifndef __KSU_H_SUPERCALL
+#define __KSU_H_SUPERCALL
+
+// IOCTL handler types
+typedef int (*ksu_ioctl_handler_t)(void __user *arg);
+typedef bool (*ksu_perm_check_t)(void);
+
+// IOCTL command mapping
+struct ksu_ioctl_cmd_map {
+	unsigned int cmd;
+	const char *name;
+	ksu_ioctl_handler_t handler;
+	ksu_perm_check_t perm_check; // Permission check function
+};
+
+// Install KSU fd to current process
+int ksu_install_fd(void);
+
+void ksu_supercalls_init(void);
+void ksu_supercalls_exit(void);
+
+// extensions
+#define CHANGE_MANAGER_UID 10006
+#define KSU_UMOUNT_GETSIZE 107   // get list size // shit is u8 we cant fit 10k+ on it
+#define KSU_UMOUNT_GETLIST 108   // get list
+#define GET_SULOG_DUMP 10009     // get sulog dump, max, last 100 escalations
+#define GET_SULOG_DUMP_V2 10010     // get sulog dump, timestamped, last 250 escalations
+#define CHANGE_KSUVER 10011     // change ksu version
+#define CHANGE_SPOOF_UNAME 10012 // spoof uname
+#define CHANGE_KSUFLAGS 10013     // change ksuflags, do the bit calc on your own, 0 + 1 + 2 + 4 + 8 blah
+
+#endif // __KSU_H_SUPERCALL
diff --git a/drivers/kernelsu/supercalls.c b/drivers/kernelsu/supercalls.c
deleted file mode 100644
index 12c7e284cfd1..000000000000
--- a/drivers/kernelsu/supercalls.c
+++ /dev/null
@@ -1,847 +0,0 @@
-#include <linux/anon_inodes.h>
-#include <linux/capability.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <linux/fdtable.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/syscalls.h>
-#include <linux/uaccess.h>
-#include <linux/version.h>
-#include <linux/kprobes.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
-#include <linux/sched/task.h>
-#else
-#include <linux/sched.h>
-#endif
-
-#include "supercalls.h"
-#include "arch.h"
-#include "allowlist.h"
-#include "feature.h"
-#include "klog.h" // IWYU pragma: keep
-#include "ksu.h"
-#include "ksud.h"
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#include "kp_hook.h"
-#include "syscall_handler.h"
-#endif
-#include "kernel_compat.h"
-#include "kernel_umount.h"
-#include "manager.h"
-#include "selinux/selinux.h"
-#include "file_wrapper.h"
-
-// Permission check functions
-bool only_manager(void)
-{
-	return is_manager();
-}
-
-bool only_root(void)
-{
-	return current_uid().val == 0;
-}
-
-bool manager_or_root(void)
-{
-	return current_uid().val == 0 || is_manager();
-}
-
-bool always_allow(void)
-{
-	return true; // No permission check
-}
-
-bool allowed_for_su(void)
-{
-	return is_manager() || ksu_is_allow_uid_for_current(current_uid().val);
-}
-
-static int do_grant_root(void __user *arg)
-{
-	// we already check uid above on allowed_for_su()
-
-	pr_info("allow root for: %d\n", current_uid().val);
-	escape_with_root_profile();
-
-	return 0;
-}
-
-static int do_get_info(void __user *arg)
-{
-	struct ksu_get_info_cmd cmd = { .version = KERNEL_SU_VERSION,
-					.flags = 0 };
-
-#ifdef MODULE
-	cmd.flags |= 0x1;
-#endif
-
-	if (is_manager()) {
-		cmd.flags |= 0x2;
-	}
-	cmd.features = KSU_FEATURE_MAX;
-
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("get_version: copy_to_user failed\n");
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_report_event(void __user *arg)
-{
-	struct ksu_report_event_cmd cmd;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		return -EFAULT;
-	}
-
-	switch (cmd.event) {
-	case EVENT_POST_FS_DATA: {
-		static bool post_fs_data_lock = false;
-		if (!post_fs_data_lock) {
-			post_fs_data_lock = true;
-			pr_info("post-fs-data triggered\n");
-			on_post_fs_data();
-		}
-		break;
-	}
-	case EVENT_BOOT_COMPLETED: {
-		static bool boot_complete_lock = false;
-		if (!boot_complete_lock) {
-			boot_complete_lock = true;
-			pr_info("boot_complete triggered\n");
-			on_boot_completed();
-		}
-		break;
-	}
-	case EVENT_MODULE_MOUNTED: {
-		pr_info("module mounted!\n");
-		on_module_mounted();
-		break;
-	}
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-static int do_set_sepolicy(void __user *arg)
-{
-	struct ksu_set_sepolicy_cmd cmd;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		return -EFAULT;
-	}
-
-	return handle_sepolicy(cmd.cmd, (void __user *)cmd.arg);
-}
-
-static int do_check_safemode(void __user *arg)
-{
-	struct ksu_check_safemode_cmd cmd;
-
-	cmd.in_safe_mode = ksu_is_safe_mode();
-
-	if (cmd.in_safe_mode) {
-		pr_warn("safemode enabled!\n");
-	}
-
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("check_safemode: copy_to_user failed\n");
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_get_allow_list(void __user *arg)
-{
-	struct ksu_get_allow_list_cmd cmd;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		return -EFAULT;
-	}
-
-	bool success =
-		ksu_get_allow_list((int *)cmd.uids, (int *)&cmd.count, true);
-
-	if (!success) {
-		return -EFAULT;
-	}
-
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("get_allow_list: copy_to_user failed\n");
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_get_deny_list(void __user *arg)
-{
-	struct ksu_get_allow_list_cmd cmd;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		return -EFAULT;
-	}
-
-	bool success =
-		ksu_get_allow_list((int *)cmd.uids, (int *)&cmd.count, false);
-
-	if (!success) {
-		return -EFAULT;
-	}
-
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("get_deny_list: copy_to_user failed\n");
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_uid_granted_root(void __user *arg)
-{
-	struct ksu_uid_granted_root_cmd cmd;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		return -EFAULT;
-	}
-
-	cmd.granted = ksu_is_allow_uid_for_current(cmd.uid);
-
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("uid_granted_root: copy_to_user failed\n");
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_uid_should_umount(void __user *arg)
-{
-	struct ksu_uid_should_umount_cmd cmd;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		return -EFAULT;
-	}
-
-	cmd.should_umount = ksu_uid_should_umount(cmd.uid);
-
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("uid_should_umount: copy_to_user failed\n");
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_get_manager_appid(void __user *arg)
-{
-	struct ksu_get_manager_appid_cmd cmd;
-
-	cmd.appid = ksu_get_manager_appid();
-
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("get_manager_appid: copy_to_user failed\n");
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_get_app_profile(void __user *arg)
-{
-	struct ksu_get_app_profile_cmd cmd;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		pr_err("get_app_profile: copy_from_user failed\n");
-		return -EFAULT;
-	}
-
-	if (!ksu_get_app_profile(&cmd.profile)) {
-		return -ENOENT;
-	}
-
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("get_app_profile: copy_to_user failed\n");
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_set_app_profile(void __user *arg)
-{
-	struct ksu_set_app_profile_cmd cmd;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		pr_err("set_app_profile: copy_from_user failed\n");
-		return -EFAULT;
-	}
-
-	if (!ksu_set_app_profile(&cmd.profile, true)) {
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_get_feature(void __user *arg)
-{
-	struct ksu_get_feature_cmd cmd;
-	bool supported;
-	int ret;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		pr_err("get_feature: copy_from_user failed\n");
-		return -EFAULT;
-	}
-
-	ret = ksu_get_feature(cmd.feature_id, &cmd.value, &supported);
-	cmd.supported = supported ? 1 : 0;
-
-	if (ret && supported) {
-		pr_err("get_feature: failed for feature %u: %d\n",
-		       cmd.feature_id, ret);
-		return ret;
-	}
-
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("get_feature: copy_to_user failed\n");
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int do_set_feature(void __user *arg)
-{
-	struct ksu_set_feature_cmd cmd;
-	int ret;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		pr_err("set_feature: copy_from_user failed\n");
-		return -EFAULT;
-	}
-
-	ret = ksu_set_feature(cmd.feature_id, cmd.value);
-	if (ret) {
-		pr_err("set_feature: failed for feature %u: %d\n",
-		       cmd.feature_id, ret);
-		return ret;
-	}
-
-	return 0;
-}
-
-static int do_get_wrapper_fd(void __user *arg)
-{
-	if (!ksu_file_sid) {
-		return -EINVAL;
-	}
-
-	struct ksu_get_wrapper_fd_cmd cmd;
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		pr_err("get_wrapper_fd: copy_from_user failed\n");
-		return -EFAULT;
-	}
-
-	return ksu_install_file_wrapper(cmd.fd);
-}
-
-static int do_manage_mark(void __user *arg)
-{
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	struct ksu_manage_mark_cmd cmd;
-	int ret = 0;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		pr_err("manage_mark: copy_from_user failed\n");
-		return -EFAULT;
-	}
-
-	switch (cmd.operation) {
-	case KSU_MARK_GET: {
-		// Get task mark status
-		ret = ksu_get_task_mark(cmd.pid);
-		if (ret < 0) {
-			pr_err("manage_mark: get failed for pid %d: %d\n",
-			       cmd.pid, ret);
-			return ret;
-		}
-		cmd.result = (u32)ret;
-		break;
-	}
-	case KSU_MARK_MARK: {
-		if (cmd.pid == 0) {
-			ksu_mark_all_process();
-		} else {
-			ret = ksu_set_task_mark(cmd.pid, true);
-			if (ret < 0) {
-				pr_err("manage_mark: set_mark failed for pid %d: %d\n",
-				       cmd.pid, ret);
-				return ret;
-			}
-		}
-		break;
-	}
-	case KSU_MARK_UNMARK: {
-		if (cmd.pid == 0) {
-			ksu_unmark_all_process();
-		} else {
-			ret = ksu_set_task_mark(cmd.pid, false);
-			if (ret < 0) {
-				pr_err("manage_mark: set_unmark failed for pid %d: %d\n",
-				       cmd.pid, ret);
-				return ret;
-			}
-		}
-		break;
-	}
-	case KSU_MARK_REFRESH: {
-		ksu_mark_running_process();
-		pr_info("manage_mark: refreshed running processes\n");
-		break;
-	}
-	default: {
-		pr_err("manage_mark: invalid operation %u\n", cmd.operation);
-		return -EINVAL;
-	}
-	}
-	if (copy_to_user(arg, &cmd, sizeof(cmd))) {
-		pr_err("manage_mark: copy_to_user failed\n");
-		return -EFAULT;
-	}
-	return 0;
-#else
-	// We don't care, just return -ENOTSUPP
-	pr_warn("manage_mark: this supercalls is not implemented for manual hook.\n");
-	return -ENOTSUPP;
-#endif
-}
-
-struct list_head mount_list = LIST_HEAD_INIT(mount_list);
-DECLARE_RWSEM(mount_list_lock);
-
-static int add_try_umount(void __user *arg)
-{
-	struct mount_entry *new_entry, *entry, *tmp;
-	struct ksu_add_try_umount_cmd cmd;
-	char buf[256] = { 0 };
-
-	// When userspace disable kernel_umount, don't do anything.
-	if (!ksu_kernel_umount_enabled) {
-		pr_warn("add_try_umount supercall is not available when kernel_umount is disabled!\n");
-		return -ENOTSUPP;
-	}
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd))) {
-		return -EFAULT;
-	}
-
-	switch (cmd.mode) {
-	case KSU_UMOUNT_WIPE: {
-		struct mount_entry *entry, *tmp;
-		down_write(&mount_list_lock);
-		list_for_each_entry_safe (entry, tmp, &mount_list, list) {
-			pr_info("wipe_umount_list: removing entry: %s\n",
-				entry->umountable);
-			list_del(&entry->list);
-			kfree(entry->umountable);
-			kfree(entry);
-		}
-		up_write(&mount_list_lock);
-
-		return 0;
-	}
-
-	case KSU_UMOUNT_ADD: {
-		long len = strncpy_from_user(buf, (const char __user *)cmd.arg,
-					     256);
-		if (len <= 0)
-			return -EFAULT;
-
-		buf[sizeof(buf) - 1] = '\0';
-
-		new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
-		if (!new_entry)
-			return -ENOMEM;
-
-		new_entry->umountable = kstrdup(buf, GFP_KERNEL);
-		if (!new_entry->umountable) {
-			kfree(new_entry);
-			return -1;
-		}
-
-		down_write(&mount_list_lock);
-
-		// disallow dupes
-		// if this gets too many, we can consider moving this whole task to a kthread
-		list_for_each_entry (entry, &mount_list, list) {
-			if (!strcmp(entry->umountable, buf)) {
-				pr_info("cmd_add_try_umount: %s is already here!\n",
-					buf);
-				up_write(&mount_list_lock);
-				kfree(new_entry->umountable);
-				kfree(new_entry);
-				return -1;
-			}
-		}
-
-		// now check flags and add
-		// this also serves as a null check
-		if (cmd.flags)
-			new_entry->flags = cmd.flags;
-		else
-			new_entry->flags = 0;
-
-		// debug
-		list_add(&new_entry->list, &mount_list);
-		up_write(&mount_list_lock);
-		pr_info("cmd_add_try_umount: %s added!\n", buf);
-
-		return 0;
-	}
-
-	// this is just strcmp'd wipe anyway
-	case KSU_UMOUNT_DEL: {
-		long len = strncpy_from_user(buf, (const char __user *)cmd.arg,
-					     sizeof(buf) - 1);
-		if (len <= 0)
-			return -EFAULT;
-
-		buf[sizeof(buf) - 1] = '\0';
-
-		down_write(&mount_list_lock);
-		list_for_each_entry_safe (entry, tmp, &mount_list, list) {
-			if (!strcmp(entry->umountable, buf)) {
-				pr_info("cmd_add_try_umount: entry removed: %s\n",
-					entry->umountable);
-				list_del(&entry->list);
-				kfree(entry->umountable);
-				kfree(entry);
-			}
-		}
-		up_write(&mount_list_lock);
-
-		return 0;
-	}
-
-	// this way userspace can deduce the memory it has to prepare.
-	case KSU_UMOUNT_GETSIZE: {
-		// check for pointer first
-		if (!cmd.arg)
-			return -EFAULT;
-
-		size_t total_size = 0; // size of list in bytes
-
-		down_read(&mount_list_lock);
-		list_for_each_entry (entry, &mount_list, list) {
-			// + 1 for \0
-			total_size = total_size + strlen(entry->umountable) + 1;
-		}
-		up_read(&mount_list_lock);
-
-		pr_info("cmd_add_try_umount: total_size: %zu\n", total_size);
-
-		if (copy_to_user((size_t __user *)cmd.arg, &total_size,
-				 sizeof(total_size)))
-			return -EFAULT;
-
-		return 0;
-	}
-
-	// WARNING! this is straight up pointerwalking.
-	// this way we dont need to redefine the ioctl defs.
-	// this also avoids us needing to kmalloc
-	// userspace have to send pointer to memory (malloc/alloca) or pointer to a VLA.
-	case KSU_UMOUNT_GETLIST: {
-		if (!cmd.arg)
-			return -EFAULT;
-
-		char *user_buf = (char *)cmd.arg;
-
-		down_read(&mount_list_lock);
-		list_for_each_entry (entry, &mount_list, list) {
-			pr_info("cmd_add_try_umount: entry: %s\n",
-				entry->umountable);
-
-			if (copy_to_user((char __user *)user_buf,
-					 entry->umountable,
-					 strlen(entry->umountable) + 1)) {
-				up_read(&mount_list_lock);
-				return -EFAULT;
-			}
-
-			// walk it! +1 for null terminator
-			user_buf = user_buf + strlen(entry->umountable) + 1;
-		}
-		up_read(&mount_list_lock);
-
-		return 0;
-	}
-
-	default: {
-		pr_err("cmd_add_try_umount: invalid operation %u\n", cmd.mode);
-		return -EINVAL;
-	}
-
-	} // switch(cmd.mode)
-
-	return 0;
-}
-
-static int do_nuke_ext4_sysfs(void __user *arg)
-{
-	struct ksu_nuke_ext4_sysfs_cmd cmd;
-	char mnt[256];
-	long ret;
-
-	if (copy_from_user(&cmd, arg, sizeof(cmd)))
-		return -EFAULT;
-
-	if (!cmd.arg)
-		return -EINVAL;
-
-	memset(mnt, 0, sizeof(mnt));
-
-	ret = strncpy_from_user(mnt, cmd.arg, sizeof(mnt));
-	if (ret < 0) {
-		pr_err("nuke ext4 copy mnt failed: %ld\n", ret);
-		return -EFAULT; // 或者 return ret;
-	}
-
-	if (ret == sizeof(mnt)) {
-		pr_err("nuke ext4 mnt path too long\n");
-		return -ENAMETOOLONG;
-	}
-
-	pr_info("do_nuke_ext4_sysfs: %s\n", mnt);
-
-	return nuke_ext4_sysfs(mnt);
-}
-
-// IOCTL handlers mapping table
-static const struct ksu_ioctl_cmd_map ksu_ioctl_handlers[] = {
-	KSU_IOCTL(GRANT_ROOT, "GRANT_ROOT", do_grant_root, allowed_for_su),
-	KSU_IOCTL(GET_INFO, "GET_INFO", do_get_info, always_allow),
-	KSU_IOCTL(REPORT_EVENT, "REPORT_EVENT", do_report_event, only_root),
-	KSU_IOCTL(SET_SEPOLICY, "SET_SEPOLICY", do_set_sepolicy, only_root),
-	KSU_IOCTL(CHECK_SAFEMODE, "CHECK_SAFEMODE", do_check_safemode,
-		  always_allow),
-	KSU_IOCTL(GET_ALLOW_LIST, "GET_ALLOW_LIST", do_get_allow_list,
-		  manager_or_root),
-	KSU_IOCTL(GET_DENY_LIST, "GET_DENY_LIST", do_get_deny_list,
-		  manager_or_root),
-	KSU_IOCTL(UID_GRANTED_ROOT, "UID_GRANTED_ROOT", do_uid_granted_root,
-		  manager_or_root),
-	KSU_IOCTL(UID_SHOULD_UMOUNT, "UID_SHOULD_UMOUNT", do_uid_should_umount,
-		  manager_or_root),
-	KSU_IOCTL(GET_MANAGER_APPID, "GET_MANAGER_APPID", do_get_manager_appid,
-		  manager_or_root),
-	KSU_IOCTL(GET_APP_PROFILE, "GET_APP_PROFILE", do_get_app_profile,
-		  only_manager),
-	KSU_IOCTL(SET_APP_PROFILE, "SET_APP_PROFILE", do_set_app_profile,
-		  only_manager),
-	KSU_IOCTL(GET_FEATURE, "GET_FEATURE", do_get_feature, manager_or_root),
-	KSU_IOCTL(SET_FEATURE, "SET_FEATURE", do_set_feature, manager_or_root),
-	KSU_IOCTL(GET_WRAPPER_FD, "GET_WRAPPER_FD", do_get_wrapper_fd,
-		  manager_or_root),
-	KSU_IOCTL(MANAGE_MARK, "MANAGE_MARK", do_manage_mark, manager_or_root),
-	KSU_IOCTL(NUKE_EXT4_SYSFS, "NUKE_EXT4_SYSFS", do_nuke_ext4_sysfs,
-		  manager_or_root),
-	KSU_IOCTL(ADD_TRY_UMOUNT, "ADD_TRY_UMOUNT", add_try_umount,
-		  manager_or_root),
-
-	// Sentinel
-	{ .cmd = 0, .name = NULL, .handler = NULL, .perm_check = NULL }
-};
-
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-struct ksu_install_fd_tw {
-	struct callback_head cb;
-	int __user *outp;
-};
-
-static void ksu_install_fd_tw_func(struct callback_head *cb)
-{
-	struct ksu_install_fd_tw *tw =
-		container_of(cb, struct ksu_install_fd_tw, cb);
-	int fd = ksu_install_fd();
-
-	if (copy_to_user(tw->outp, &fd, sizeof(fd))) {
-		pr_err("install ksu fd reply err\n");
-		do_close_fd(fd);
-	}
-
-	kfree(tw);
-}
-
-static int ksu_handle_fd_request(void __user *arg)
-{
-	struct ksu_install_fd_tw *tw;
-
-	tw = kzalloc(sizeof(*tw), GFP_ATOMIC);
-	if (!tw)
-		return -ENOMEM;
-
-	tw->outp = (int __user *)arg;
-	tw->cb.func = ksu_install_fd_tw_func;
-
-	if (task_work_add(current, &tw->cb, TWA_RESUME)) {
-		kfree(tw);
-		pr_warn("install fd add task_work failed\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-#else
-static int ksu_handle_fd_request(void __user *arg)
-{
-	int fd = ksu_install_fd();
-
-	if (copy_to_user(arg, &fd, sizeof(fd))) {
-		pr_err("install ksu fd reply err\n");
-		do_close_fd(fd);
-		return -EFAULT;
-	}
-
-	return 0;
-}
-#endif
-
-int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd,
-			  void __user **arg)
-{
-	if (magic1 != KSU_INSTALL_MAGIC1)
-		return -EINVAL;
-
-	// Rare case that unlikely to happen
-	if (unlikely(!arg))
-		return -EINVAL;
-
-#ifdef CONFIG_KSU_DEBUG
-	pr_info("sys_reboot: magic: 0x%x (id: %d)\n", magic1, magic2);
-#endif
-
-	// Dereference **arg.. with IS_ERR check.
-	void __user *argp = (void __user *)*arg;
-	if (IS_ERR(argp)) {
-		pr_err("Failed to deref user arg, err: %lu\n", PTR_ERR(argp));
-		return -EINVAL;
-	}
-
-	// Check if this is a request to install KSU fd
-	if (magic2 == KSU_INSTALL_MAGIC2) {
-		return ksu_handle_fd_request(argp);
-	}
-
-	return 0;
-}
-
-void ksu_supercalls_init(void)
-{
-	int i;
-
-	pr_info("KernelSU IOCTL Commands:\n");
-	for (i = 0; ksu_ioctl_handlers[i].handler; i++) {
-		pr_info("  %-18s = 0x%08x\n", ksu_ioctl_handlers[i].name,
-			ksu_ioctl_handlers[i].cmd);
-	}
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	kp_handle_supercalls_init();
-#endif
-}
-
-void ksu_supercalls_exit(void)
-{
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-	kp_handle_supercalls_exit();
-#endif
-}
-
-// IOCTL dispatcher
-static long anon_ksu_ioctl(struct file *filp, unsigned int cmd,
-			   unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-	int i;
-
-#ifdef CONFIG_KSU_DEBUG
-	pr_info("ksu ioctl: cmd=0x%x from uid=%d\n", cmd, current_uid().val);
-#endif
-
-	for (i = 0; ksu_ioctl_handlers[i].handler; i++) {
-		if (cmd == ksu_ioctl_handlers[i].cmd) {
-			// Check permission first
-			if (ksu_ioctl_handlers[i].perm_check &&
-			    !ksu_ioctl_handlers[i].perm_check()) {
-				pr_warn("ksu ioctl: permission denied for cmd=0x%x uid=%d\n",
-					cmd, current_uid().val);
-				return -EPERM;
-			}
-			// Execute handler
-			return ksu_ioctl_handlers[i].handler(argp);
-		}
-	}
-
-	pr_warn("ksu ioctl: unsupported command 0x%x\n", cmd);
-	return -ENOTTY;
-}
-
-// File release handler
-static int anon_ksu_release(struct inode *inode, struct file *filp)
-{
-#ifdef CONFIG_KSU_DEBUG
-	pr_info("ksu fd released\n");
-#endif
-	return 0;
-}
-
-// File operations structure
-static const struct file_operations anon_ksu_fops = {
-	.owner = THIS_MODULE,
-	.unlocked_ioctl = anon_ksu_ioctl,
-	.compat_ioctl = anon_ksu_ioctl,
-	.release = anon_ksu_release,
-};
-
-// Install KSU fd to current process
-int ksu_install_fd(void)
-{
-	struct file *filp;
-	int fd;
-
-	// Get unused fd
-	fd = get_unused_fd_flags(O_CLOEXEC);
-	if (fd < 0) {
-		pr_err("ksu_install_fd: failed to get unused fd\n");
-		return fd;
-	}
-
-	// Create anonymous inode file
-	filp = anon_inode_getfile("[ksu_driver]", &anon_ksu_fops, NULL,
-				  O_RDWR | O_CLOEXEC);
-	if (IS_ERR(filp)) {
-		pr_err("ksu_install_fd: failed to create anon inode file\n");
-		put_unused_fd(fd);
-		return PTR_ERR(filp);
-	}
-
-	// Install fd
-	fd_install(fd, filp);
-
-#ifdef CONFIG_KSU_DEBUG
-	pr_info("ksu fd[%d] installed for %s/%d\n", fd, current->comm,
-		current->pid);
-#endif
-
-	return fd;
-}
diff --git a/drivers/kernelsu/supercalls.h b/drivers/kernelsu/supercalls.h
deleted file mode 100644
index f6ba38c498d3..000000000000
--- a/drivers/kernelsu/supercalls.h
+++ /dev/null
@@ -1,152 +0,0 @@
-#ifndef __KSU_H_SUPERCALLS
-#define __KSU_H_SUPERCALLS
-
-#include <linux/types.h>
-#include <linux/ioctl.h>
-#include "app_profile.h"
-
-// Magic numbers for reboot hook to install fd
-#define KSU_INSTALL_MAGIC1 0xDEADBEEF
-#define KSU_INSTALL_MAGIC2 0xCAFEBABE
-
-// Command structures for ioctl
-
-struct ksu_become_daemon_cmd {
-	__u8 token[65]; // Input: daemon token (null-terminated)
-};
-
-struct ksu_get_info_cmd {
-	__u32 version; // Output: KERNEL_SU_VERSION
-	__u32 flags; // Output: flags (bit 0: MODULE mode)
-	__u32 features; // Output: max feature ID supported
-};
-
-struct ksu_report_event_cmd {
-	__u32 event; // Input: EVENT_POST_FS_DATA, EVENT_BOOT_COMPLETED, etc.
-};
-
-struct ksu_set_sepolicy_cmd {
-	__u64 cmd; // Input: sepolicy command
-	__aligned_u64 arg; // Input: sepolicy argument pointer
-};
-
-struct ksu_check_safemode_cmd {
-	__u8 in_safe_mode; // Output: true if in safe mode, false otherwise
-};
-
-struct ksu_get_allow_list_cmd {
-	__u32 uids[128]; // Output: array of allowed/denied UIDs
-	__u32 count; // Output: number of UIDs in array
-	__u8 allow; // Input: true for allow list, false for deny list
-};
-
-struct ksu_uid_granted_root_cmd {
-	__u32 uid; // Input: target UID to check
-	__u8 granted; // Output: true if granted, false otherwise
-};
-
-struct ksu_uid_should_umount_cmd {
-	__u32 uid; // Input: target UID to check
-	__u8 should_umount; // Output: true if should umount, false otherwise
-};
-
-struct ksu_get_manager_appid_cmd {
-	__u32 appid; // Output: manager app id
-};
-
-struct ksu_get_app_profile_cmd {
-	struct app_profile profile; // Input/Output: app profile structure
-};
-
-struct ksu_set_app_profile_cmd {
-	struct app_profile profile; // Input: app profile structure
-};
-
-struct ksu_get_feature_cmd {
-	__u32 feature_id; // Input: feature ID (enum ksu_feature_id)
-	__u64 value; // Output: feature value/state
-	__u8 supported; // Output: true if feature is supported, false otherwise
-};
-
-struct ksu_set_feature_cmd {
-	__u32 feature_id; // Input: feature ID (enum ksu_feature_id)
-	__u64 value; // Input: feature value/state to set
-};
-
-struct ksu_get_wrapper_fd_cmd {
-	__u32 fd; // Input: userspace fd
-	__u32 flags; // Input: flags of userspace fd
-};
-
-struct ksu_manage_mark_cmd {
-	__u32 operation; // Input: KSU_MARK_*
-	__s32 pid; // Input: target pid (0 for all processes)
-	__u32 result; // Output: for get operation - mark status or reg_count
-};
-
-struct ksu_nuke_ext4_sysfs_cmd {
-	__aligned_u64 arg; // Input: mnt pointer
-};
-
-#define KSU_MARK_GET 1
-#define KSU_MARK_MARK 2
-#define KSU_MARK_UNMARK 3
-#define KSU_MARK_REFRESH 4
-
-struct ksu_add_try_umount_cmd {
-	__aligned_u64 arg; // char ptr, this is the mountpoint
-	__u32 flags; // this is the flag we use for it
-	__u8 mode; // denotes what to do with it 0:wipe_list 1:add_to_list 2:delete_entry
-};
-
-#define KSU_UMOUNT_WIPE 0 // ignore everything and wipe list
-#define KSU_UMOUNT_ADD 1 // add entry (path + flags)
-#define KSU_UMOUNT_DEL 2 // delete entry, strcmp
-#define KSU_UMOUNT_GETSIZE 3 // get list size
-#define KSU_UMOUNT_GETLIST 4 // get list
-
-// IOCTL command definitions
-#define KSU_IOCTL_GRANT_ROOT _IOC(_IOC_NONE, 'K', 1, 0)
-#define KSU_IOCTL_GET_INFO _IOC(_IOC_READ, 'K', 2, 0)
-#define KSU_IOCTL_REPORT_EVENT _IOC(_IOC_WRITE, 'K', 3, 0)
-#define KSU_IOCTL_SET_SEPOLICY _IOC(_IOC_READ | _IOC_WRITE, 'K', 4, 0)
-#define KSU_IOCTL_CHECK_SAFEMODE _IOC(_IOC_READ, 'K', 5, 0)
-#define KSU_IOCTL_GET_ALLOW_LIST _IOC(_IOC_READ | _IOC_WRITE, 'K', 6, 0)
-#define KSU_IOCTL_GET_DENY_LIST _IOC(_IOC_READ | _IOC_WRITE, 'K', 7, 0)
-#define KSU_IOCTL_UID_GRANTED_ROOT _IOC(_IOC_READ | _IOC_WRITE, 'K', 8, 0)
-#define KSU_IOCTL_UID_SHOULD_UMOUNT _IOC(_IOC_READ | _IOC_WRITE, 'K', 9, 0)
-#define KSU_IOCTL_GET_MANAGER_APPID _IOC(_IOC_READ, 'K', 10, 0)
-#define KSU_IOCTL_GET_APP_PROFILE _IOC(_IOC_READ | _IOC_WRITE, 'K', 11, 0)
-#define KSU_IOCTL_SET_APP_PROFILE _IOC(_IOC_WRITE, 'K', 12, 0)
-#define KSU_IOCTL_GET_FEATURE _IOC(_IOC_READ | _IOC_WRITE, 'K', 13, 0)
-#define KSU_IOCTL_SET_FEATURE _IOC(_IOC_WRITE, 'K', 14, 0)
-#define KSU_IOCTL_GET_WRAPPER_FD _IOC(_IOC_WRITE, 'K', 15, 0)
-#define KSU_IOCTL_MANAGE_MARK _IOC(_IOC_READ | _IOC_WRITE, 'K', 16, 0)
-#define KSU_IOCTL_NUKE_EXT4_SYSFS _IOC(_IOC_WRITE, 'K', 17, 0)
-#define KSU_IOCTL_ADD_TRY_UMOUNT _IOC(_IOC_WRITE, 'K', 18, 0)
-
-// IOCTL handler types
-typedef int (*ksu_ioctl_handler_t)(void __user *arg);
-typedef bool (*ksu_perm_check_t)(void);
-
-// IOCTL command mapping
-struct ksu_ioctl_cmd_map {
-	unsigned int cmd;
-	const char *name;
-	ksu_ioctl_handler_t handler;
-	ksu_perm_check_t perm_check; // Permission check function
-};
-
-#define KSU_IOCTL(CMD, NAME, HANDLER, PERM)                                    \
-	{                                                                      \
-		.cmd = KSU_IOCTL_##CMD, .name = NAME, .handler = HANDLER,      \
-		.perm_check = PERM                                             \
-	}
-
-// Install KSU fd to current process
-int ksu_install_fd(void);
-
-void ksu_supercalls_init(void);
-void ksu_supercalls_exit(void);
-
-#endif // __KSU_H_SUPERCALLS
diff --git a/drivers/kernelsu/syscall_handler.c b/drivers/kernelsu/syscall_handler.c
deleted file mode 100644
index 499967165bce..000000000000
--- a/drivers/kernelsu/syscall_handler.c
+++ /dev/null
@@ -1,374 +0,0 @@
-#include <linux/compiler.h>
-#include <linux/cred.h>
-#include <linux/printk.h>
-#include <linux/spinlock.h>
-#include <linux/kprobes.h>
-#include <linux/tracepoint.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <asm/syscall.h>
-
-#include <trace/events/syscalls.h>
-
-// Tracepoint registration count management
-// == 1: just us
-// >  1: someone else is also using syscall tracepoint e.g. ftrace
-static int tracepoint_reg_count = 0;
-static DEFINE_SPINLOCK(tracepoint_reg_lock);
-
-void ksu_clear_task_tracepoint_flag_if_needed(struct task_struct *t)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&tracepoint_reg_lock, flags);
-	if (tracepoint_reg_count <= 1) {
-		ksu_clear_task_tracepoint_flag(t);
-	}
-	spin_unlock_irqrestore(&tracepoint_reg_lock, flags);
-}
-
-// Process marking management
-static void handle_process_mark(bool mark)
-{
-	struct task_struct *p, *t;
-	read_lock(&tasklist_lock);
-	for_each_process_thread (p, t) {
-		if (mark)
-			ksu_set_task_tracepoint_flag(t);
-		else
-			ksu_clear_task_tracepoint_flag(t);
-	}
-	read_unlock(&tasklist_lock);
-}
-
-void ksu_mark_all_process(void)
-{
-	handle_process_mark(true);
-	pr_info("hook_manager: mark all user process done!\n");
-}
-
-void ksu_unmark_all_process(void)
-{
-	handle_process_mark(false);
-	pr_info("hook_manager: unmark all user process done!\n");
-}
-
-static void ksu_mark_running_process_locked(void)
-{
-	struct task_struct *p, *t;
-	read_lock(&tasklist_lock);
-	for_each_process_thread (p, t) {
-		if (!t->mm) { // only user processes
-			continue;
-		}
-		int uid = task_uid(t).val;
-		const struct cred *cred = get_task_cred(t);
-		bool ksu_root_process = uid == 0 && is_task_ksu_domain(cred);
-		bool is_zygote_process = is_zygote(cred);
-		bool is_shell = uid == 2000;
-		// before boot completed, we shall mark init for marking zygote
-		bool is_init = t->pid == 1;
-		if (ksu_root_process || is_zygote_process || is_shell ||
-		    is_init || ksu_is_allow_uid(uid)) {
-			ksu_set_task_tracepoint_flag(t);
-			pr_info("hook_manager: mark process: pid:%d, uid: %d, comm:%s\n",
-				t->pid, uid, t->comm);
-		} else {
-			ksu_clear_task_tracepoint_flag(t);
-			pr_info("hook_manager: unmark process: pid:%d, uid: %d, comm:%s\n",
-				t->pid, uid, t->comm);
-		}
-		put_cred(cred);
-	}
-	read_unlock(&tasklist_lock);
-}
-
-void ksu_mark_running_process(void)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&tracepoint_reg_lock, flags);
-	if (tracepoint_reg_count <= 1) {
-		ksu_mark_running_process_locked();
-	} else {
-		pr_info("hook_manager: not mark running process since syscall tracepoint is in use\n");
-	}
-	spin_unlock_irqrestore(&tracepoint_reg_lock, flags);
-}
-
-// Get task mark status
-// Returns: 1 if marked, 0 if not marked, -ESRCH if task not found
-int ksu_get_task_mark(pid_t pid)
-{
-	struct task_struct *task;
-	int marked = -ESRCH;
-
-	rcu_read_lock();
-	task = find_task_by_vpid(pid);
-	if (task) {
-		get_task_struct(task);
-		rcu_read_unlock();
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
-		marked = test_task_syscall_work(task, SYSCALL_TRACEPOINT) ? 1 : 0;
-#else
-		marked = test_tsk_thread_flag(task, TIF_SYSCALL_TRACEPOINT) ? 1 : 0;
-#endif
-		put_task_struct(task);
-	} else {
-		rcu_read_unlock();
-	}
-
-	return marked;
-}
-
-// Set task mark status
-// Returns: 0 on success, -ESRCH if task not found
-int ksu_set_task_mark(pid_t pid, bool mark)
-{
-	struct task_struct *task;
-	int ret = -ESRCH;
-
-	rcu_read_lock();
-	task = find_task_by_vpid(pid);
-	if (task) {
-		get_task_struct(task);
-		rcu_read_unlock();
-		if (mark) {
-			ksu_set_task_tracepoint_flag(task);
-			pr_info("hook_manager: marked task pid=%d comm=%s\n",
-				pid, task->comm);
-		} else {
-			ksu_clear_task_tracepoint_flag(task);
-			pr_info("hook_manager: unmarked task pid=%d comm=%s\n",
-				pid, task->comm);
-		}
-		put_task_struct(task);
-		ret = 0;
-	} else {
-		rcu_read_unlock();
-	}
-
-	return ret;
-}
-
-#ifdef CONFIG_KRETPROBES
-
-static struct kretprobe *init_kretprobe(const char *name,
-					kretprobe_handler_t handler)
-{
-	struct kretprobe *rp = kzalloc(sizeof(struct kretprobe), GFP_KERNEL);
-	if (!rp)
-		return NULL;
-	rp->kp.symbol_name = name;
-	rp->handler = handler;
-	rp->data_size = 0;
-	rp->maxactive = 0;
-
-	int ret = register_kretprobe(rp);
-	pr_info("hook_manager: register_%s kretprobe: %d\n", name, ret);
-	if (ret) {
-		kfree(rp);
-		return NULL;
-	}
-
-	return rp;
-}
-
-static void destroy_kretprobe(struct kretprobe **rp_ptr)
-{
-	struct kretprobe *rp = *rp_ptr;
-	if (!rp)
-		return;
-	unregister_kretprobe(rp);
-	synchronize_rcu();
-	kfree(rp);
-	*rp_ptr = NULL;
-}
-
-static int syscall_regfunc_handler(struct kretprobe_instance *ri,
-				   struct pt_regs *regs)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&tracepoint_reg_lock, flags);
-	if (tracepoint_reg_count < 1) {
-		// while install our tracepoint, mark our processes
-		ksu_mark_running_process_locked();
-	} else if (tracepoint_reg_count == 1) {
-		// while other tracepoint first added, mark all processes
-		ksu_mark_all_process();
-	}
-	tracepoint_reg_count++;
-	spin_unlock_irqrestore(&tracepoint_reg_lock, flags);
-	return 0;
-}
-
-static int syscall_unregfunc_handler(struct kretprobe_instance *ri,
-				     struct pt_regs *regs)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&tracepoint_reg_lock, flags);
-	tracepoint_reg_count--;
-	if (tracepoint_reg_count <= 0) {
-		// while no tracepoint left, unmark all processes
-		ksu_unmark_all_process();
-	} else if (tracepoint_reg_count == 1) {
-		// while just our tracepoint left, unmark disallowed processes
-		ksu_mark_running_process_locked();
-	}
-	spin_unlock_irqrestore(&tracepoint_reg_lock, flags);
-	return 0;
-}
-
-static struct kretprobe *syscall_regfunc_rp = NULL;
-static struct kretprobe *syscall_unregfunc_rp = NULL;
-#endif
-
-static inline bool check_syscall_fastpath(int nr)
-{
-	switch (nr) {
-	case __NR_newfstatat:
-	case __NR_faccessat:
-	case __NR_execve:
-	case __NR_setresuid:
-		return true;
-	default:
-		return false;
-	}
-}
-
-// Unmark init's child that are not zygote, adbd or ksud
-int ksu_handle_init_mark_tracker(const char __user **filename_user)
-{
-	char path[64];
-
-	if (unlikely(!filename_user))
-		return 0;
-	if (!ksu_retry_filename_access(filename_user, path, sizeof(path),
-				       false))
-		return 0;
-
-	if (unlikely(strcmp(path, KSUD_PATH) == 0)) {
-		pr_info("hook_manager: escape to root for init executing ksud: %d\n",
-			current->pid);
-		escape_to_root_for_init();
-	} else if (likely(strstr(path, "/app_process") == NULL &&
-			  strstr(path, "/adbd") == NULL)) {
-		pr_info("hook_manager: unmark %d exec %s\n", current->pid,
-			path);
-		ksu_clear_task_tracepoint_flag_if_needed(current);
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
-static int ksu_handle_setresuid(uid_t ruid, uid_t euid, uid_t suid)
-{
-	return ksu_handle_setuid_common(ruid, current_uid().val, euid);
-}
-
-// Generic sys_enter handler that dispatches to specific handlers
-static void ksu_sys_enter_handler(void *data, struct pt_regs *regs, long id)
-{
-	if (unlikely(check_syscall_fastpath(id))) {
-		if (ksu_su_compat_enabled) {
-			// Handle newfstatat
-			if (id == __NR_newfstatat) {
-				int *dfd = (int *)&PT_REGS_PARM1(regs);
-				const char __user **filename_user =
-					(const char __user **)&PT_REGS_PARM2(
-						regs);
-				int *flags =
-					(int *)&PT_REGS_SYSCALL_PARM4(regs);
-				ksu_handle_stat(dfd, filename_user, flags);
-				return;
-			}
-
-			// Handle faccessat
-			if (id == __NR_faccessat) {
-				int *dfd = (int *)&PT_REGS_PARM1(regs);
-				const char __user **filename_user =
-					(const char __user **)&PT_REGS_PARM2(
-						regs);
-				int *mode = (int *)&PT_REGS_PARM3(regs);
-				ksu_handle_faccessat(dfd, filename_user, mode,
-						     NULL);
-				return;
-			}
-
-			// Handle execve
-			if (id == __NR_execve) {
-				const char __user **filename_user =
-					(const char __user **)&PT_REGS_PARM1(
-						regs);
-				if (current->pid != 1 &&
-				    is_init(get_current_cred())) {
-					ksu_handle_init_mark_tracker(
-						filename_user);
-				} else {
-					ksu_handle_execve_sucompat(
-						NULL, filename_user, NULL, NULL,
-						NULL);
-				}
-				return;
-			}
-		}
-
-		// Handle setresuid
-		if (id == __NR_setresuid) {
-			uid_t ruid = (uid_t)PT_REGS_PARM1(regs);
-			uid_t euid = (uid_t)PT_REGS_PARM2(regs);
-			uid_t suid = (uid_t)PT_REGS_PARM3(regs);
-			ksu_handle_setresuid(ruid, euid, suid);
-			return;
-		}
-	}
-}
-#endif
-
-void ksu_syscall_hook_manager_init(void)
-{
-	int ret;
-	pr_info("hook_manager: ksu_hook_manager_init called\n");
-
-#ifdef CONFIG_KRETPROBES
-	// Register kretprobe for syscall_regfunc
-	syscall_regfunc_rp =
-		init_kretprobe("syscall_regfunc", syscall_regfunc_handler);
-	// Register kretprobe for syscall_unregfunc
-	syscall_unregfunc_rp =
-		init_kretprobe("syscall_unregfunc", syscall_unregfunc_handler);
-#endif
-
-#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
-	ret = register_trace_sys_enter(ksu_sys_enter_handler, NULL);
-#ifndef CONFIG_KRETPROBES
-	ksu_mark_running_process_locked();
-#endif
-	if (ret) {
-		pr_err("hook_manager: failed to register sys_enter tracepoint: %d\n",
-		       ret);
-	} else {
-		pr_info("hook_manager: sys_enter tracepoint registered\n");
-	}
-#endif
-
-	ksu_setuid_hook_init();
-	ksu_sucompat_init();
-}
-
-void ksu_syscall_hook_manager_exit(void)
-{
-	pr_info("hook_manager: ksu_hook_manager_exit called\n");
-#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
-	unregister_trace_sys_enter(ksu_sys_enter_handler, NULL);
-	tracepoint_synchronize_unregister();
-	pr_info("hook_manager: sys_enter tracepoint unregistered\n");
-#endif
-
-#ifdef CONFIG_KRETPROBES
-	destroy_kretprobe(&syscall_regfunc_rp);
-	destroy_kretprobe(&syscall_unregfunc_rp);
-#endif
-
-	ksu_sucompat_exit();
-	ksu_setuid_hook_exit();
-}
diff --git a/drivers/kernelsu/syscall_handler.h b/drivers/kernelsu/syscall_handler.h
deleted file mode 100644
index 463617fd97d9..000000000000
--- a/drivers/kernelsu/syscall_handler.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef __KSU_H_HOOK_MANAGER
-#define __KSU_H_HOOK_MANAGER
-
-#include <linux/version.h>
-#include <linux/sched.h>
-#include <linux/thread_info.h>
-
-// Hook manager initialization and cleanup
-void ksu_syscall_hook_manager_init(void);
-void ksu_syscall_hook_manager_exit(void);
-
-// Process marking for tracepoint
-void ksu_mark_all_process(void);
-void ksu_unmark_all_process(void);
-void ksu_mark_running_process(void);
-
-// Per-task mark operations
-int ksu_get_task_mark(pid_t pid);
-int ksu_set_task_mark(pid_t pid, bool mark);
-
-static inline void ksu_set_task_tracepoint_flag(struct task_struct *t)
-{
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
-	set_task_syscall_work(t, SYSCALL_TRACEPOINT);
-#else
-	set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
-#endif
-}
-
-static inline void ksu_clear_task_tracepoint_flag(struct task_struct *t)
-{
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
-	clear_task_syscall_work(t, SYSCALL_TRACEPOINT);
-#else
-	clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
-#endif
-}
-
-void ksu_clear_task_tracepoint_flag_if_needed(struct task_struct *t);
-#endif
diff --git a/drivers/kernelsu/throne_tracker.h b/drivers/kernelsu/throne_tracker.h
deleted file mode 100644
index 8bb3b9a29b51..000000000000
--- a/drivers/kernelsu/throne_tracker.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __KSU_H_THRONE_TRACKER
-#define __KSU_H_THRONE_TRACKER
-
-void ksu_throne_tracker_init(void);
-
-void ksu_throne_tracker_exit(void);
-
-void track_throne(bool prune_only);
-
-#endif
diff --git a/drivers/kernelsu/tiny_sulog.c b/drivers/kernelsu/tiny_sulog.c
new file mode 100644
index 000000000000..1fc8a5b1e3dd
--- /dev/null
+++ b/drivers/kernelsu/tiny_sulog.c
@@ -0,0 +1,125 @@
+// One slot of the fixed-size su log ring; the packed layout below is 8 bytes.
+// Slots are filled in order and reused after SULOG_ENTRY_MAX entries.
+struct sulog_entry {
+	uint32_t s_time; // uptime in seconds, as returned by boottime_s_get()
+	uint32_t data; // bytes 0..2 = low 24 bits of the uid, byte 3 = symbol (little-endian layout)
+} __attribute__((packed));
+
+#define SULOG_ENTRY_MAX 250 /* ring slots; the running index must fit the uint8_t sulog_index_next */
+#define SULOG_BUFSIZ (SULOG_ENTRY_MAX * sizeof(struct sulog_entry)) /* parenthesized so it expands safely in any expression */
+
+static void *sulog_buf_ptr = NULL; // ring storage; NULL until tiny_sulog_init_heap() succeeds
+static uint8_t sulog_index_next = 0; // slot the next write_sulog() call will fill
+
+static DEFINE_SPINLOCK(sulog_lock); // taken by write_sulog() and send_sulog_dump()
+
+// Allocate the zeroed sulog ring; sulog_buf_ptr stays NULL if kzalloc fails.
+// The size is a size_t, hence %zu (%lu mismatches on 32-bit builds).
+static void tiny_sulog_init_heap(void)
+{
+	sulog_buf_ptr = kzalloc(SULOG_BUFSIZ, GFP_KERNEL);
+	if (sulog_buf_ptr)
+		pr_info("sulog_init: allocated %zu bytes on 0x%p \n", SULOG_BUFSIZ, sulog_buf_ptr);
+}
+
+/**
+ * boottime_s_get - kernel uptime in whole seconds
+ *
+ * Reads ktime_get_boottime() and converts the nanosecond count to seconds.
+ *
+ * - ktime_to_ns() is the accessor for ktime_t both before 4.10 (when ktime_t
+ *   was a union) and after (plain s64), so no pointer cast is needed for
+ *   compat.
+ * - div_u64() performs the 64-bit by 32-bit division safely on 32-bit
+ *   kernels and is a plain divide on 64-bit ones, so no CONFIG_64BIT split
+ *   is needed either.
+ *
+ * Return: seconds since boot, truncated to 32 bits.
+ */
+static inline uint32_t boottime_s_get(void)
+{
+	ktime_t boottime_kt = ktime_get_boottime();
+	uint64_t boottime_ns;
+	uint64_t boottime_s;
+
+	boottime_ns = (uint64_t)ktime_to_ns(boottime_kt);
+	boottime_s = div_u64(boottime_ns, NSEC_PER_SEC);
+
+	return (uint32_t)boottime_s;
+}
+
+/*
+ * write_sulog - append one entry carrying the caller's uid to the ring.
+ * @sym: event symbol, stored in byte 3 of the entry's data word.
+ *
+ * Does nothing until the ring has been allocated. The slot offset is derived
+ * from sulog_index_next inside the critical section, so two concurrent
+ * callers can never pick the same slot.
+ */
+static void write_sulog(uint8_t sym)
+{
+	struct sulog_entry entry = {0};
+	unsigned int offset;
+
+	if (!sulog_buf_ptr)
+		return;
+
+	// WARNING!!! this is LE only!
+	entry.s_time = boottime_s_get();
+	entry.data = (uint32_t)current_uid().val;
+	*((char *)&entry.data + 3) = sym;
+
+	// locked for exclusion against other writers and the dump reader
+	spin_lock(&sulog_lock);
+	offset = sulog_index_next * sizeof(struct sulog_entry);
+#ifdef CONFIG_64BIT
+	*(volatile uint64_t *)(sulog_buf_ptr + offset) = *(uint64_t *)&entry;
+#else
+	__builtin_memcpy(sulog_buf_ptr + offset, &entry, sizeof(entry));
+#endif
+	// advance, wrapping back to slot 0 at the end of the ring
+	if (++sulog_index_next >= SULOG_ENTRY_MAX)
+		sulog_index_next = 0;
+	spin_unlock(&sulog_lock);
+}
+
+struct sulog_entry_rcv_ptr { // request block read from userspace by send_sulog_dump()
+	uint64_t index_ptr; // user address that receives sulog_index_next (1 byte)
+	uint64_t buf_ptr; // user address that receives the whole ring (SULOG_BUFSIZ bytes)
+	uint64_t uptime_ptr; // user address that receives the uptime (uint32_t, seconds)
+};
+
+/*
+ * send_sulog_dump - copy uptime, next index and the whole ring to userspace.
+ * @uptr: user pointer to a struct sulog_entry_rcv_ptr with the three targets.
+ * The ring and index are snapshotted under sulog_lock and copied out after
+ * it is dropped: copy_to_user() may fault and sleep, which is not allowed
+ * while holding a spinlock. Returns 0 on success, 1 on any failure.
+ */
+static int send_sulog_dump(void __user *uptr)
+{
+	struct sulog_entry_rcv_ptr sbuf = {0};
+	uint32_t uptime;
+	uint8_t index;
+	void *snap;
+	int ret = 1;
+
+	if (!sulog_buf_ptr || copy_from_user(&sbuf, uptr, sizeof(sbuf)))
+		return 1;
+	if (!sbuf.index_ptr || !sbuf.buf_ptr || !sbuf.uptime_ptr)
+		return 1;
+	snap = kmalloc(SULOG_BUFSIZ, GFP_KERNEL);
+	if (!snap)
+		return 1;
+	spin_lock(&sulog_lock);
+	index = sulog_index_next;
+	memcpy(snap, sulog_buf_ptr, SULOG_BUFSIZ);
+	spin_unlock(&sulog_lock);
+	uptime = boottime_s_get();
+	if (!copy_to_user((void __user *)(uintptr_t)sbuf.uptime_ptr, &uptime, sizeof(uptime)) &&
+	    !copy_to_user((void __user *)(uintptr_t)sbuf.index_ptr, &index, sizeof(index)) &&
+	    !copy_to_user((void __user *)(uintptr_t)sbuf.buf_ptr, snap, SULOG_BUFSIZ))
+		ret = 0;
+	kfree(snap);
+	return ret;
+}
diff --git a/drivers/rekernel/Kconfig b/drivers/rekernel/Kconfig
deleted file mode 100644
index dadf14779fde..000000000000
--- a/drivers/rekernel/Kconfig
+++ /dev/null
@@ -1,15 +0,0 @@
-menu "Re:Kernel"
-
-config REKERNEL
-	bool "Re:Kernel support"
-	default n
-	help
-	  Make tombstone users get a better experience.
-
-config REKERNEL_NETWORK
-	bool "Re:Kernel NetReceive unfreeze support"
-	depends on REKERNEL
-	default n
-	help
-	  Make tombstone users get a better experience.
-endmenu
diff --git a/drivers/rekernel/Makefile b/drivers/rekernel/Makefile
deleted file mode 100644
index bb613644a5f4..000000000000
--- a/drivers/rekernel/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-$(CONFIG_REKERNEL) += rekernel.o
diff --git a/drivers/rekernel/rekernel.c b/drivers/rekernel/rekernel.c
deleted file mode 100644
index d3783225c33c..000000000000
--- a/drivers/rekernel/rekernel.c
+++ /dev/null
@@ -1,333 +0,0 @@
-#include <linux/init.h>
-#include <linux/types.h>
-
-#include <net/sock.h>
-#include <net/ip.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#endif /* IS_ENABLED(CONFIG_IPV6) */
-#include <linux/tcp.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <linux/netfilter_ipv6.h>
-#endif /* IS_ENABLED(CONFIG_IPV6) */
-
-#include <linux/proc_fs.h>
-#include <uapi/linux/android/binder.h>
-#include "rekernel.h"
-
-#define MIN_USERAPP_UID				10000
-#define MAX_SYSTEM_UID				2000
-#define SYSTEM_APP_UID				1000
-#define INTERFACETOKEN_BUFF_SIZE	140
-#define PARCEL_OFFSET				16
-#define LINE_ERROR					1
-#define LINE_SUCCESS				0
-
-#define NETLINK_REKERNEL_MAX		26
-#define NETLINK_REKERNEL_MIN		22
-#define USER_PORT					100
-#define PACKET_SIZE					256
-
-static const char* binder_type[] = {
-	"reply",
-	"transaction",
-	"free_buffer_full",
-};
-static const char* rpc_type[] = {
-	"SYNC_BINDER_REPLY",
-	"SYNC_BINDER",
-	"FREE_BUFFER_FULL",
-};
-static struct sock* netlink_socket;
-extern struct net init_net;
-static unsigned long netlink_unit = 0;
-#ifdef CONFIG_PROC_FS
-static struct proc_dir_entry* rekernel_dir, * rekernel_unit_entry;
-#endif /* CONFIG_PROC_FS */
-
-static int sendMessage(char* packet_buffer, uint16_t len) {
-	struct sk_buff* socket_buffer;
-	struct nlmsghdr* netlink_hdr;
-
-	socket_buffer = nlmsg_new(len, GFP_ATOMIC);
-	if (!socket_buffer) {
-		pr_err("netlink alloc failure.\n");
-		return -LINE_ERROR;
-	}
-
-	netlink_hdr = nlmsg_put(socket_buffer, 0, 0, netlink_unit, len, 0);
-	if (!netlink_hdr) {
-		pr_err("nlmsg_put failaure.\n");
-		nlmsg_free(socket_buffer);
-		return -LINE_ERROR;
-	}
-
-	memcpy(nlmsg_data(netlink_hdr), packet_buffer, len);
-	return netlink_unicast(netlink_socket, socket_buffer, USER_PORT, MSG_DONTWAIT);
-}
-static void netlink_rcv_msg(struct sk_buff* socket_buffer) {
-	struct nlmsghdr* nlhdr = NULL;
-	char* umsg = NULL;
-
-	if (socket_buffer->len >= nlmsg_total_size(0)) {
-		nlhdr = nlmsg_hdr(socket_buffer);
-		umsg = nlmsg_data(nlhdr);
-		if (umsg) {
-#ifdef CONFIG_PROC_FS
-			if (!memcmp(umsg, "#proc_remove", nlmsg_len(nlhdr))) {
-				if (rekernel_dir) {
-					proc_remove(rekernel_dir);
-				}
-			}
-#endif /* CONFIG_PROC_FS */
-		}
-	}
-}
-#ifdef CONFIG_REKERNEL_NETWORK
-static unsigned int rekernel_pkg_ipv4_ipv6_in(void* priv, struct sk_buff* socket_buffer,
-	const struct nf_hook_state* state) {
-	struct sock* sk;
-	unsigned int thoff = 0;
-	unsigned short frag_off = 0;
-	uid_t uid;
-	uint hook;
-	struct net_device* dev = NULL;
-	struct tcphdr *th;
-	int data_len = 0;
-
-	if (!socket_buffer || !socket_buffer->len || !state)
-		return NF_ACCEPT;
-
-	hook = state->hook;
-	if (NF_INET_LOCAL_IN == hook)
-		dev = state->in;
-
-	if (NULL == dev)
-		return NF_ACCEPT;
-
-	if (ip_hdr(socket_buffer)->version == 4) {
-		struct iphdr *iph4 = ip_hdr(socket_buffer);
-		if (iph4->protocol != IPPROTO_TCP)
-			return NF_ACCEPT;
-		if (!pskb_may_pull(socket_buffer, (iph4->ihl << 2) + sizeof(struct tcphdr)))
-			return NF_ACCEPT;
-		th = (struct tcphdr *)((unsigned char *)iph4 + (iph4->ihl << 2));
-		data_len = ntohs(iph4->tot_len) - (iph4->ihl << 2) - (th->doff << 2);
-#if IS_ENABLED(CONFIG_IPV6)
-	} else if (ip_hdr(socket_buffer)->version == 6) {
-		struct ipv6hdr *iph6 = ipv6_hdr(socket_buffer);
-		if (ipv6_find_hdr(socket_buffer, &thoff, -1, &frag_off, NULL) != IPPROTO_TCP)
-			return NF_ACCEPT;
-		if (!pskb_may_pull(socket_buffer, thoff + sizeof(struct tcphdr)))
-			return NF_ACCEPT;
-		th = (struct tcphdr *)(skb_network_header(socket_buffer) + thoff);
-		data_len = ntohs(iph6->payload_len) - (thoff - sizeof(struct ipv6hdr)) - (th->doff << 2);
-#endif
-	} else {
-		return NF_ACCEPT;
-	}
-
-	sk = skb_to_full_sk(socket_buffer);
-	if (sk == NULL || !sk_fullsock(sk))
-		return NF_ACCEPT;
-
-	uid = sock_i_uid(sk).val;
-	if (uid < MIN_USERAPP_UID)
-		return NF_ACCEPT;
-
-	if (data_len <= 0 && !th->syn && !th->fin && !th->rst)
-		return NF_ACCEPT;
-
-	rekernel_report(NETWORK, ip_hdr(socket_buffer)->version, data_len, NULL, uid, NULL, true, NULL);
-	return NF_ACCEPT;
-}
-/* Only monitor input network packages */
-static struct nf_hook_ops rekernel_nf_ops[] = {
-	{
-		.hook = rekernel_pkg_ipv4_ipv6_in,
-		.pf = NFPROTO_IPV4,
-		.hooknum = NF_INET_LOCAL_IN,
-		.priority = NF_IP_PRI_SELINUX_LAST + 1,
-	},
-#if IS_ENABLED(CONFIG_IPV6)
-	{
-		.hook = rekernel_pkg_ipv4_ipv6_in,
-		.pf = NFPROTO_IPV6,
-		.hooknum = NF_INET_LOCAL_IN,
-		.priority = NF_IP6_PRI_SELINUX_LAST + 1,
-	}
-#endif
-};
-
-int register_netfilter(void) {
-	int rc;
-	struct net* net = NULL;
-	for_each_net(net) {
-		rc = nf_register_net_hooks(net, rekernel_nf_ops, ARRAY_SIZE(rekernel_nf_ops));
-		if (rc) {
-			pr_err("register netfilter hooks failed, rc=%d\n", rc);
-			break;
-		}
-	}
-	if (rc) {
-		for_each_net(net) {
-			nf_unregister_net_hooks(net, rekernel_nf_ops, ARRAY_SIZE(rekernel_nf_ops));
-		}
-		return -1;
-	}
-
-	return LINE_SUCCESS;
-}
-#endif /* CONFIG_REKERNEL_NETWORK */
-struct netlink_kernel_cfg cfg = {
-	.input = netlink_rcv_msg, // set recv callback
-};
-#ifdef CONFIG_PROC_FS
-static int rekernel_unit_show(struct seq_file* m, void* v) {
-	seq_printf(m, "%d\n", netlink_unit);
-	return LINE_SUCCESS;
-}
-static int rekernel_unit_open(struct inode* inode, struct file* file) {
-	return single_open(file, rekernel_unit_show, NULL);
-}
-static const struct file_operations rekernel_unit_fops = {
-	.open = rekernel_unit_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release
-};
-#endif /* CONFIG_PROC_FS */
-// init
-static int start_rekernel(void) {
-	if (netlink_unit)
-		return 0;
-
-	pr_info("Thank you for choosing Re:Kernel!\n");
-#ifdef CONFIG_REKERNEL_NETWORK
-	pr_info("NetFilter is enabled!\n");
-#endif
-	pr_info("Re:Kernel v8.6 | DEVELOPER: Sakion Team | Timeline | USER PORT: %d\n", USER_PORT);
-	pr_info("Trying to create Re:Kernel Server......\n");
-
-	for (netlink_unit = NETLINK_REKERNEL_MIN; netlink_unit < NETLINK_REKERNEL_MAX; netlink_unit++) {
-		netlink_socket = netlink_kernel_create(&init_net, netlink_unit, &cfg);
-		if (netlink_socket != NULL)
-			break;
-	}
-	if (netlink_socket == NULL) {
-		netlink_unit = 0;
-		pr_err("Failed to create Re:Kernel server!\n");
-		return -LINE_ERROR;
-	}
-	pr_info("Created Re:Kernel server! NETLINK UNIT: %d\n", netlink_unit);
-
-#ifdef CONFIG_PROC_FS
-	rekernel_dir = proc_mkdir("rekernel", NULL);
-	if (!rekernel_dir) {
-		pr_err("create /proc/rekernel failed!\n");
-	} else {
-		char buff[32];
-		sprintf(buff, "%d", netlink_unit);
-		rekernel_unit_entry = proc_create(buff, 0644, rekernel_dir, &rekernel_unit_fops);
-		if (!rekernel_unit_entry) {
-			pr_err("create rekernel unit failed!\n");
-		}
-	}
-#endif /* CONFIG_PROC_FS */
-#ifdef CONFIG_REKERNEL_NETWORK
-	if (register_netfilter()) {
-		pr_err("%s: Failed to hook netfilter!\n", __func__);
-		return -LINE_ERROR;
-	}
-#endif /* CONFIG_REKERNEL_NETWORK */
-	return LINE_SUCCESS;
-}
-
-void rekernel_report(int reporttype, int type, pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) {
-	char binder_kmsg[PACKET_SIZE];
-	char buf_data[INTERFACETOKEN_BUFF_SIZE];
-	size_t buf_data_size;
-	char buf[INTERFACETOKEN_BUFF_SIZE] = { 0 };
-	char* p;
-	int i = 0;
-	int j = 0;
-
-	if (start_rekernel())
-		return;
-
-#ifdef CONFIG_REKERNEL_NETWORK
-	if (reporttype == NETWORK) {
-		char binder_kmsg[PACKET_SIZE];
-		snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Network,target=%d,proto=ipv%d,data_len=%d;", dst_pid, type, src_pid);
-		sendMessage(binder_kmsg, strlen(binder_kmsg));
-		return;
-	}
-#endif /* CONFIG_REKERNEL_NETWORK */
-
-	if (!frozen_task_group(dst))
-		return;
-
-	if (task_uid(src).val == task_uid(dst).val)
-		return;
-
-	switch (reporttype) {
-	case BINDER:
-		if (oneway && type == TRANSACTION) {
-			if (tr->code < 29 || tr->code > 32)
-				return;
-			buf_data_size = tr->data_size > INTERFACETOKEN_BUFF_SIZE ? INTERFACETOKEN_BUFF_SIZE : tr->data_size;
-			if (copy_from_user(buf_data, (char*)tr->data.ptr.buffer, buf_data_size))
-				return;
-			j = PARCEL_OFFSET + 1;
-			p = (char*)(buf_data)+PARCEL_OFFSET;
-			while (i < INTERFACETOKEN_BUFF_SIZE && j < buf_data_size && *p != '\0') {
-				buf[i++] = *p;
-				j += 2;
-				p += 2;
-			}
-			if (i == INTERFACETOKEN_BUFF_SIZE) {
-				buf[i - 1] = '\0';
-			}
-			snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Binder,bindertype=%s,oneway=%d,from_pid=%d,from=%d,target_pid=%d,target=%d,rpc_name=%s,code=%d;", binder_type[type], oneway, src_pid, task_uid(src).val, dst_pid, task_uid(dst).val, buf, tr->code);
-		} else {
-			snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Binder,bindertype=%s,oneway=%d,from_pid=%d,from=%d,target_pid=%d,target=%d;", binder_type[type], oneway, src_pid, task_uid(src).val, dst_pid, task_uid(dst).val, rpc_type[type], -1);
-		}
-		break;
-	case SIGNAL:
-		snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Signal,signal=%d,killer_pid=%d,killer=%d,dst_pid=%d,dst=%d;", type, src_pid, task_uid(src).val, dst_pid, task_uid(dst).val);
-		break;
-	default:
-		return;
-	}
-	sendMessage(binder_kmsg, strlen(binder_kmsg));
-}
-
-void binder_reply_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) {
-	if (unlikely(!dst))
-		return;
-	if (task_uid(dst).val > MAX_SYSTEM_UID || src_pid == dst_pid)
-		return;
-
-	// oneway=0
-	rekernel_report(BINDER, REPLY, src_pid, src, dst_pid, dst, oneway, tr);
-}
-
-void binder_trans_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) {
-	if (unlikely(!dst))
-		return;
-	if ((task_uid(dst).val <= MIN_USERAPP_UID) || src_pid == dst_pid)
-		return;
-
-	rekernel_report(BINDER, TRANSACTION, src_pid, src, dst_pid, dst, oneway, tr);
-}
-
-void binder_overflow_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) {
-	if (unlikely(!dst))
-		return;
-
-	// oneway=1
-	rekernel_report(BINDER, OVERFLOW, src_pid, src, dst_pid, dst, oneway, tr);
-}
diff --git a/drivers/rekernel/rekernel.h b/drivers/rekernel/rekernel.h
deleted file mode 100644
index af7022a8535c..000000000000
--- a/drivers/rekernel/rekernel.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __REKERNEL_H
-#define __REKERNEL_H
-
-#include <linux/types.h>
-#include <linux/cgroup.h>
-#include <linux/freezer.h>
-#include <uapi/linux/android/binder.h>
-
-enum report_type {
-	BINDER,
-	SIGNAL,
-#ifdef CONFIG_REKERNEL_NETWORK
-	NETWORK,
-#endif /* CONFIG_REKERNEL_NETWORK */
-};
-enum binder_type {
-	REPLY,
-	TRANSACTION,
-	OVERFLOW,
-};
-
-static inline bool jobctl_frozen(struct task_struct* task) {
-	return ((task->jobctl & JOBCTL_TRAP_FREEZE) != 0);
-}
-static inline bool frozen_task_group(struct task_struct* task) {
-	return (jobctl_frozen(task) || cgroup_freezing(task));
-}
-
-extern void rekernel_report(int reporttype, int type, pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr);
-extern void binder_reply_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr);
-extern void binder_trans_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr);
-extern void binder_overflow_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr);
-
-#endif /* __REKERNEL_H */
diff --git a/fs/exec.c b/fs/exec.c
index c2530fed584d..351ce34f1226 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1676,13 +1676,6 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-#ifdef CONFIG_KSU_MANUAL_HOOK
-extern bool ksu_execveat_hook __read_mostly;
-extern int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv,
-			void *envp, int *flags);
-extern int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr,
-				 void *argv, void *envp, int *flags);
-#endif
 static int do_execveat_common(int fd, struct filename *filename,
 			      struct user_arg_ptr argv,
 			      struct user_arg_ptr envp,
@@ -1694,13 +1687,6 @@ static int do_execveat_common(int fd, struct filename *filename,
 	struct files_struct *displaced;
 	int retval;
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	if (unlikely(ksu_execveat_hook))
-		ksu_handle_execveat(&fd, &filename, &argv, &envp, &flags);
-	else
-		ksu_handle_execveat_sucompat(&fd, &filename, &argv, &envp, &flags);
-#endif
-
 	if (IS_ERR(filename))
 		return PTR_ERR(filename);
 
@@ -1851,12 +1837,21 @@ static int do_execveat_common(int fd, struct filename *filename,
 	return retval;
 }
 
+#ifdef CONFIG_KSU
+__attribute__((hot))
+extern int ksu_handle_execveat(int *fd, struct filename **filename_ptr,
+				void *argv, void *envp, int *flags);
+#endif
+
 int do_execve(struct filename *filename,
 	const char __user *const __user *__argv,
 	const char __user *const __user *__envp)
 {
 	struct user_arg_ptr argv = { .ptr.native = __argv };
 	struct user_arg_ptr envp = { .ptr.native = __envp };
+#ifdef CONFIG_KSU
+	ksu_handle_execveat((int *)AT_FDCWD, &filename, &argv, &envp, 0);
+#endif
 	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
 }
 
@@ -1884,6 +1879,9 @@ static int compat_do_execve(struct filename *filename,
 		.is_compat = true,
 		.ptr.compat = __envp,
 	};
+#ifdef CONFIG_KSU // 32-bit ksud and 32-on-64 support
+	ksu_handle_execveat((int *)AT_FDCWD, &filename, &argv, &envp, 0);
+#endif
 	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
 }
 
diff --git a/fs/file.c b/fs/file.c
index 73b85f676357..be0792c0a231 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -656,37 +656,6 @@ int __close_fd(struct files_struct *files, unsigned fd)
 	return -EBADF;
 }
 
-/*
- * variant of close_fd that gets a ref on the file for later fput.
- * The caller must ensure that filp_close() called on the file, and then
- * an fput().
- */
-int close_fd_get_file(unsigned int fd, struct file **res)
-{
-	struct files_struct *files = current->files;
-	struct file *file;
-	struct fdtable *fdt;
-
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	if (fd >= fdt->max_fds)
-		goto out_unlock;
-	file = fdt->fd[fd];
-	if (!file)
-		goto out_unlock;
-	rcu_assign_pointer(fdt->fd[fd], NULL);
-	__put_unused_fd(files, fd);
-	spin_unlock(&files->file_lock);
-	get_file(file);
-	*res = file;
-	return 0;
-
-out_unlock:
-	spin_unlock(&files->file_lock);
-	*res = NULL;
-	return -ENOENT;
-}
-
 void do_close_on_exec(struct files_struct *files)
 {
 	unsigned i;
diff --git a/fs/internal.h b/fs/internal.h
index 380bae4c5ff7..3e58863de514 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -68,7 +68,6 @@ extern int finish_automount(struct vfsmount *, struct path *);
 extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
-int path_umount(struct path *path, int flags);
 
 extern int __mnt_want_write(struct vfsmount *);
 extern int __mnt_want_write_file(struct file *);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index b70288a713b3..27358c854203 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -516,7 +516,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
 		goto out_put;
 
 	rc = 0;
-	of->mmapped = true;
+	of->mmapped = 1;
 	of->vm_ops = vma->vm_ops;
 	vma->vm_ops = &kernfs_vm_ops;
 out_put:
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 1c2ea6ca0381..d5b149a45be1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -123,10 +123,8 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
 		return dentry;
 
 	knparent = find_next_ancestor(kn, NULL);
-	if (WARN_ON(!knparent)) {
-		dput(dentry);
+	if (WARN_ON(!knparent))
 		return ERR_PTR(-EINVAL);
-	}
 
 	do {
 		struct dentry *dtmp;
@@ -135,11 +133,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
 		if (kn == knparent)
 			return dentry;
 		kntmp = find_next_ancestor(kn, knparent);
-		if (WARN_ON(!kntmp)) {
-			dput(dentry);
+		if (WARN_ON(!kntmp))
 			return ERR_PTR(-EINVAL);
-		}
-		dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name));
+		dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
+					       strlen(kntmp->name));
 		dput(dentry);
 		if (IS_ERR(dtmp))
 			return dtmp;
diff --git a/fs/open.c b/fs/open.c
index 66fadbdfd17a..7dc516777071 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -355,15 +355,17 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
 	return error;
 }
 
+#ifdef CONFIG_KSU
+__attribute__((hot)) 
+extern int ksu_handle_faccessat(int *dfd, const char __user **filename_user,
+				int *mode, int *flags);
+#endif
+
 /*
  * access() needs to use the real uid/gid, not the effective uid/gid.
  * We do this by temporarily clearing all FS-related capabilities and
  * switching the fsuid/fsgid around to the real ones.
  */
-#ifdef CONFIG_KSU_MANUAL_HOOK
-extern int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode,
-			 int *flags);
-#endif
 SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 {
 	const struct cred *old_cred;
@@ -373,7 +375,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 	struct vfsmount *mnt;
 	int res;
 	unsigned int lookup_flags = LOOKUP_FOLLOW;
-#ifdef CONFIG_KSU_MANUAL_HOOK
+
+#ifdef CONFIG_KSU
 	ksu_handle_faccessat(&dfd, &filename, &mode, NULL);
 #endif
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 4f892b7649d5..901231269242 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -456,19 +456,10 @@ ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 }
 EXPORT_SYMBOL(__vfs_read);
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
-extern bool ksu_vfs_read_hook __read_mostly;
-extern int ksu_handle_vfs_read(struct file **file_ptr, char __user **buf_ptr,
-			size_t *count_ptr, loff_t **pos);
-#endif
 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 {
 	ssize_t ret;
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	if (unlikely(ksu_vfs_read_hook))
-		ksu_handle_vfs_read(&file, &buf, &count, &pos);
-#endif
 	if (!(file->f_mode & FMODE_READ))
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_READ))
diff --git a/fs/stat.c b/fs/stat.c
index 0d099fff8b82..6c795dd237bc 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -87,9 +87,6 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
 }
 EXPORT_SYMBOL(vfs_fstat);
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
-extern int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags);
-#endif
 int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
 		int flag)
 {
@@ -97,9 +94,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
 	int error = -EINVAL;
 	unsigned int lookup_flags = 0;
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	ksu_handle_stat(&dfd, &filename, &flag);
-#endif
 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
 		      AT_EMPTY_PATH)) != 0)
 		goto out;
@@ -293,6 +287,12 @@ SYSCALL_DEFINE2(newlstat, const char __user *, filename,
 	return cp_new_stat(&stat, statbuf);
 }
 
+#ifdef CONFIG_KSU
+__attribute__((hot)) 
+extern int ksu_handle_stat(int *dfd, const char __user **filename_user,
+				int *flags);
+#endif
+
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
 SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
 		struct stat __user *, statbuf, int, flag)
@@ -300,6 +300,9 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
 	struct kstat stat;
 	int error;
 
+#ifdef CONFIG_KSU
+	ksu_handle_stat(&dfd, &filename, &flag);
+#endif
 	error = vfs_fstatat(dfd, filename, &stat, flag);
 	if (error)
 		return error;
@@ -307,6 +310,13 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
 }
 #endif
 
+#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD)
+extern void ksu_handle_newfstat_ret(unsigned int *fd, struct stat __user **statbuf_ptr);
+#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
+extern void ksu_handle_fstat64_ret(unsigned long *fd, struct stat64 __user **statbuf_ptr); // for 32-bit
+#endif
+#endif
+
 SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
 {
 	struct kstat stat;
@@ -315,6 +325,9 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
 	if (!error)
 		error = cp_new_stat(&stat, statbuf);
 
+#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD)
+	ksu_handle_newfstat_ret(&fd, &statbuf);
+#endif
 	return error;
 }
 
@@ -433,6 +446,9 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 	if (!error)
 		error = cp_new_stat64(&stat, statbuf);
 
+#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) // for 32-bit
+	ksu_handle_fstat64_ret(&fd, &statbuf);
+#endif
 	return error;
 }
 
@@ -442,6 +458,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
 	struct kstat stat;
 	int error;
 
+#ifdef CONFIG_KSU // 32-bit su
+	ksu_handle_stat(&dfd, &filename, &flag); 
+#endif
 	error = vfs_fstatat(dfd, filename, &stat, flag);
 	if (error)
 		return error;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1778f36ac1ce..9c41956dc9ca 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -45,7 +45,7 @@ struct pr_ops;
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
-#define BLKCG_MAX_POLS		3
+#define BLKCG_MAX_POLS		2
 
 typedef void (rq_end_io_fn)(struct request *, int);
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ab429b48f8bd..35a28e4fb2dd 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -13,7 +13,6 @@
 #include <linux/wait.h>
 #include <linux/mutex.h>
 #include <linux/rcupdate.h>
-#include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
@@ -63,38 +62,18 @@ enum {
 	 * specified at mount time and thus is implemented here.
 	 */
 	CGRP_CPUSET_CLONE_CHILDREN,
-
-	/* Control group has to be frozen. */
-	CGRP_FREEZE,
-
-	/* Cgroup is frozen. */
-	CGRP_FROZEN,
 };
 
 /* cgroup_root->flags */
 enum {
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
-
-	/*
-	 * Consider namespaces as delegation boundaries.  If this flag is
-	 * set, controller specific interface files in a namespace root
-	 * aren't writeable from inside the namespace.
-	 */
-	CGRP_ROOT_NS_DELEGATE	= (1 << 3),
-
-	/*
-	 * Enable cpuset controller in v1 cgroup to use v2 behavior.
-	 */
-	CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
 };
 
 /* cftype->flags */
 enum {
 	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
 	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
-	CFTYPE_NS_DELEGATABLE	= (1 << 2),	/* writeable beyond delegation boundaries */
-
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 	CFTYPE_PRESSURE		= (1 << 6),	/* only if pressure feature is enabled */
@@ -131,6 +110,9 @@ struct cgroup_subsys_state {
 	/* reference count - access via css_[try]get() and css_put() */
 	struct percpu_ref refcnt;
 
+	/* PI: the parent css */
+	struct cgroup_subsys_state *parent;
+
 	/* siblings list anchored at the parent's ->children */
 	struct list_head sibling;
 	struct list_head children;
@@ -160,12 +142,6 @@ struct cgroup_subsys_state {
 	/* percpu_ref killing and RCU release */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
-
-	/*
-	 * PI: the parent css.	Placed here for cache proximity to following
-	 * fields of the containing structure.
-	 */
-	struct cgroup_subsys_state *parent;
 };
 
 /*
@@ -176,29 +152,14 @@ struct cgroup_subsys_state {
  * set for a task.
  */
 struct css_set {
-	/*
-	 * Set of subsystem states, one for each subsystem. This array is
-	 * immutable after creation apart from the init_css_set during
-	 * subsystem registration (at boot time).
-	 */
-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
-
-	/* reference count */
-	refcount_t refcount;
+	/* Reference count */
+	atomic_t refcount;
 
 	/*
-	 * For a domain cgroup, the following points to self.  If threaded,
-	 * to the matching cset of the nearest domain ancestor.  The
-	 * dom_cset provides access to the domain cgroup and its csses to
-	 * which domain level resource consumptions should be charged.
+	 * List running through all cgroup groups in the same hash
+	 * slot. Protected by css_set_lock
 	 */
-	struct css_set *dom_cset;
-
-	/* the default cgroup associated with this css_set */
-	struct cgroup *dfl_cgrp;
-
-	/* internal task count, protected by css_set_lock */
-	int nr_tasks;
+	struct hlist_node hlist;
 
 	/*
 	 * Lists running through all tasks using this cgroup group.
@@ -209,42 +170,28 @@ struct css_set {
 	 */
 	struct list_head tasks;
 	struct list_head mg_tasks;
-	struct list_head dying_tasks;
-
-	/* all css_task_iters currently walking this cset */
-	struct list_head task_iters;
 
 	/*
-	 * On the default hierarhcy, ->subsys[ssid] may point to a css
-	 * attached to an ancestor instead of the cgroup this css_set is
-	 * associated with.  The following node is anchored at
-	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
-	 * iterate through all css's attached to a given cgroup.
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set.  Protected by css_set_lock.
 	 */
-	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
-	/* all threaded csets whose ->dom_cset points to this cset */
-	struct list_head threaded_csets;
-	struct list_head threaded_csets_node;
+	struct list_head cgrp_links;
 
-	/*
-	 * List running through all cgroup groups in the same hash
-	 * slot. Protected by css_set_lock
-	 */
-	struct hlist_node hlist;
+	/* the default cgroup associated with this css_set */
+	struct cgroup *dfl_cgrp;
 
 	/*
-	 * List of cgrp_cset_links pointing at cgroups referenced from this
-	 * css_set.  Protected by css_set_lock.
+	 * Set of subsystem states, one for each subsystem. This array is
+	 * immutable after creation apart from the init_css_set during
+	 * subsystem registration (at boot time).
 	 */
-	struct list_head cgrp_links;
+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
 	/*
 	 * List of csets participating in the on-going migration either as
 	 * source or destination.  Protected by cgroup_mutex.
 	 */
-	struct list_head mg_src_preload_node;
-	struct list_head mg_dst_preload_node;
+	struct list_head mg_preload_node;
 	struct list_head mg_node;
 
 	/*
@@ -258,6 +205,18 @@ struct css_set {
 	struct cgroup *mg_dst_cgrp;
 	struct css_set *mg_dst_cset;
 
+	/*
+	 * On the default hierarchy, ->subsys[ssid] may point to a css
+	 * attached to an ancestor instead of the cgroup this css_set is
+	 * associated with.  The following node is anchored at
+	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+	 * iterate through all css's attached to a given cgroup.
+	 */
+	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
+
+	/* all css_task_iters currently walking this cset */
+	struct list_head task_iters;
+
 	/* dead and being drained, ignore for migration */
 	bool dead;
 
@@ -265,25 +224,6 @@ struct css_set {
 	struct rcu_head rcu_head;
 };
 
-struct cgroup_freezer_state {
-	/* Should the cgroup and its descendants be frozen. */
-	bool freeze;
-
-	/* Should the cgroup actually be frozen? */
-	int e_freeze;
-
-	/* Fields below are protected by css_set_lock */
-
-	/* Number of frozen descendant cgroups */
-	int nr_frozen_descendants;
-
-	/*
-	 * Number of tasks, which are counted as frozen:
-	 * frozen, SIGSTOPped, and PTRACEd.
-	 */
-	int nr_frozen_tasks;
-};
-
 struct cgroup {
 	/* self css with NULL ->ss, points back to this cgroup */
 	struct cgroup_subsys_state self;
@@ -308,40 +248,13 @@ struct cgroup {
 	 */
 	int level;
 
-	/* Maximum allowed descent tree depth */
-	int max_depth;
-
-	/*
-	 * Keep track of total numbers of visible and dying descent cgroups.
-	 * Dying cgroups are cgroups which were deleted by a user,
-	 * but are still existing because someone else is holding a reference.
-	 * max_descendants is a maximum allowed number of descent cgroups.
-	 *
-	 * nr_descendants and nr_dying_descendants are protected
-	 * by cgroup_mutex and css_set_lock. It's fine to read them holding
-	 * any of cgroup_mutex and css_set_lock; for writing both locks
-	 * should be held.
-	 */
-	int nr_descendants;
-	int nr_dying_descendants;
-	int max_descendants;
-
 	/*
 	 * Each non-empty css_set associated with this cgroup contributes
-	 * one to nr_populated_csets.  The counter is zero iff this cgroup
-	 * doesn't have any tasks.
-	 *
-	 * All children which have non-zero nr_populated_csets and/or
-	 * nr_populated_children of their own contribute one to either
-	 * nr_populated_domain_children or nr_populated_threaded_children
-	 * depending on their type.  Each counter is zero iff all cgroups
-	 * of the type in the subtree proper don't have any tasks.
+	 * one to populated_cnt.  All children with non-zero populated_cnt
+	 * of their own contribute one.  The count is zero iff there's no
+	 * task in this cgroup or its subtree.
 	 */
-	int nr_populated_csets;
-	int nr_populated_domain_children;
-	int nr_populated_threaded_children;
-
-	int nr_threaded_children;	/* # of live threaded child cgroups */
+	int populated_cnt;
 
 	struct kernfs_node *kn;		/* cgroup kernfs entry */
 	struct cgroup_file procs_file;	/* handle for "cgroup.procs" */
@@ -379,16 +292,6 @@ struct cgroup {
 	 */
 	struct list_head e_csets[CGROUP_SUBSYS_COUNT];
 
-	/*
-	 * If !threaded, self.  If threaded, it points to the nearest
-	 * domain ancestor.  Inside a threaded subtree, cgroups are exempt
-	 * from process granularity and no-internal-task constraint.
-	 * Domain level resource consumptions which aren't tied to a
-	 * specific task are charged to the dom_cgrp.
-	 */
-	struct cgroup *dom_cgrp;
-	struct cgroup *old_dom_cgrp;		/* used while enabling threaded */
-
 	/*
 	 * list of pidlists, up to two for each namespace (one for procs, one
 	 * for tasks); created on demand.
@@ -408,9 +311,6 @@ struct cgroup {
 	/* used to store eBPF programs */
 	struct cgroup_bpf bpf;
 
-	/* Used to store internal freezer state */
-	struct cgroup_freezer_state freezer;
-
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
@@ -548,7 +448,7 @@ struct cftype {
 
 /*
  * Control Group subsystem type.
- * See Documentation/cgroup-v1/cgroups.txt for details
+ * See Documentation/cgroups/cgroups.txt for details
  */
 struct cgroup_subsys {
 	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
@@ -567,7 +467,7 @@ struct cgroup_subsys {
 	void (*cancel_fork)(struct task_struct *task);
 	void (*fork)(struct task_struct *task);
 	void (*exit)(struct task_struct *task);
-	void (*release)(struct task_struct *task);
+	void (*free)(struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
 
 	bool early_init:1;
@@ -585,18 +485,6 @@ struct cgroup_subsys {
 	 */
 	bool implicit_on_dfl:1;
 
-	/*
-	 * If %true, the controller, supports threaded mode on the default
-	 * hierarchy.  In a threaded subtree, both process granularity and
-	 * no-internal-process constraint are ignored and a threaded
-	 * controllers should be able to handle that.
-	 *
-	 * Note that as an implicit controller is automatically enabled on
-	 * all cgroups on the default hierarchy, it should also be
-	 * threaded.  implicit && !threaded is not supported.
-	 */
-	bool threaded:1;
-
 	/*
 	 * If %false, this subsystem is properly hierarchical -
 	 * configuration, resource accounting and restriction on a parent
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 55a878aebe21..4e93ff0e45ba 100755
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -17,11 +17,11 @@
 #include <linux/seq_file.h>
 #include <linux/kernfs.h>
 #include <linux/jump_label.h>
+#include <linux/nsproxy.h>
 #include <linux/types.h>
 #include <linux/ns_common.h>
 #include <linux/nsproxy.h>
 #include <linux/user_namespace.h>
-#include <linux/refcount.h>
 
 #include <linux/cgroup-defs.h>
 
@@ -36,33 +36,18 @@
 #define CGROUP_WEIGHT_DFL		100
 #define CGROUP_WEIGHT_MAX		10000
 
-/* walk only threadgroup leaders */
-#define CSS_TASK_ITER_PROCS		(1U << 0)
-/* walk all threaded css_sets in the domain */
-#define CSS_TASK_ITER_THREADED		(1U << 1)
-
-/* internal flags */
-#define CSS_TASK_ITER_SKIPPED		(1U << 16)
-
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
 	struct cgroup_subsys		*ss;
-	unsigned int			flags;
 
 	struct list_head		*cset_pos;
 	struct list_head		*cset_head;
 
-	struct list_head		*tcset_pos;
-	struct list_head		*tcset_head;
-
 	struct list_head		*task_pos;
 	struct list_head		*tasks_head;
 	struct list_head		*mg_tasks_head;
-	struct list_head		*dying_tasks_head;
 
-	struct list_head		*cur_tasks_head;
 	struct css_set			*cur_cset;
-	struct css_set			*cur_dcset;
 	struct task_struct		*cur_task;
 	struct list_head		iters_node;	/* css_set->task_iters */
 };
@@ -122,7 +107,6 @@ extern int cgroup_can_fork(struct task_struct *p);
 extern void cgroup_cancel_fork(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
 void cgroup_exit(struct task_struct *p);
-void cgroup_release(struct task_struct *p);
 void cgroup_free(struct task_struct *p);
 
 int cgroup_init_early(void);
@@ -145,7 +129,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 					struct cgroup_subsys_state **dst_cssp);
 
-void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
+void css_task_iter_start(struct cgroup_subsys_state *css,
 			 struct css_task_iter *it);
 struct task_struct *css_task_iter_next(struct css_task_iter *it);
 void css_task_iter_end(struct css_task_iter *it);
@@ -282,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it);
  * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
  * @leader: the loop cursor
  * @dst_css: the destination css
- * @tset: taskset to iterate
+ * @tset: taskset to iterate
  *
  * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
  * may not contain any.
@@ -563,27 +547,6 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
 	return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
 }
 
-/**
- * cgroup_ancestor - find ancestor of cgroup
- * @cgrp: cgroup to find ancestor of
- * @ancestor_level: level of ancestor to find starting from root
- *
- * Find ancestor of cgroup at specified level starting from root if it exists
- * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at
- * @ancestor_level.
- *
- * This function is safe to call as long as @cgrp is accessible.
- */
-static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
-					     int ancestor_level)
-{
-	if (cgrp->level < ancestor_level)
-		return NULL;
-	while (cgrp && cgrp->level > ancestor_level)
-		cgrp = cgroup_parent(cgrp);
-	return cgrp;
-}
-
 /**
  * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
  * @task: the task to be tested
@@ -604,8 +567,7 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 /* no synchronization, the result can only be used as a hint */
 static inline bool cgroup_is_populated(struct cgroup *cgrp)
 {
-	return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children +
-		cgrp->nr_populated_threaded_children;
+	return cgrp->populated_cnt;
 }
 
 /* returns ino associated with a cgroup */
@@ -709,7 +671,6 @@ static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
 static inline void cgroup_cancel_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p) {}
-static inline void cgroup_release(struct task_struct *p) {}
 static inline void cgroup_free(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
@@ -788,7 +749,7 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
 #endif	/* CONFIG_CGROUP_DATA */
 
 struct cgroup_namespace {
-	refcount_t		count;
+	atomic_t		count;
 	struct ns_common	ns;
 	struct user_namespace	*user_ns;
 	struct ucounts		*ucounts;
@@ -823,56 +784,13 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
 static inline void get_cgroup_ns(struct cgroup_namespace *ns)
 {
 	if (ns)
-		refcount_inc(&ns->count);
+		atomic_inc(&ns->count);
 }
 
 static inline void put_cgroup_ns(struct cgroup_namespace *ns)
 {
-	if (ns && refcount_dec_and_test(&ns->count))
+	if (ns && atomic_dec_and_test(&ns->count))
 		free_cgroup_ns(ns);
 }
 
-#ifdef CONFIG_CGROUPS
-
-void cgroup_enter_frozen(void);
-void cgroup_leave_frozen(bool always_leave);
-void cgroup_update_frozen(struct cgroup *cgrp);
-void cgroup_freeze(struct cgroup *cgrp, bool freeze);
-void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
-				 struct cgroup *dst);
-void cgroup_freezer_frozen_exit(struct task_struct *task);
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	bool ret;
-
-	if (task->flags & PF_KTHREAD)
-		return false;
-
-	rcu_read_lock();
-	ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags);
-	rcu_read_unlock();
-
-	return ret;
-}
-
-static inline bool cgroup_task_frozen(struct task_struct *task)
-{
-	return task->frozen;
-}
-
-#else /* !CONFIG_CGROUPS */
-
-static inline void cgroup_enter_frozen(void) { }
-static inline void cgroup_leave_frozen(bool always_leave) { }
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	return false;
-}
-static inline bool cgroup_task_frozen(struct task_struct *task)
-{
-	return false;
-}
-
-#endif /* !CONFIG_CGROUPS */
-
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
deleted file mode 100644
index e94290b29e99..000000000000
--- a/include/linux/cgroup_rdma.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
- */
-
-#ifndef _CGROUP_RDMA_H
-#define _CGROUP_RDMA_H
-
-#include <linux/cgroup.h>
-
-enum rdmacg_resource_type {
-	RDMACG_RESOURCE_HCA_HANDLE,
-	RDMACG_RESOURCE_HCA_OBJECT,
-	RDMACG_RESOURCE_MAX,
-};
-
-#ifdef CONFIG_CGROUP_RDMA
-
-struct rdma_cgroup {
-	struct cgroup_subsys_state	css;
-
-	/*
-	 * head to keep track of all resource pools
-	 * that belongs to this cgroup.
-	 */
-	struct list_head		rpools;
-};
-
-struct rdmacg_device {
-	struct list_head	dev_node;
-	struct list_head	rpools;
-	char			*name;
-};
-
-/*
- * APIs for RDMA/IB stack to publish when a device wants to
- * participate in resource accounting
- */
-int rdmacg_register_device(struct rdmacg_device *device);
-void rdmacg_unregister_device(struct rdmacg_device *device);
-
-/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
-int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
-		      struct rdmacg_device *device,
-		      enum rdmacg_resource_type index);
-void rdmacg_uncharge(struct rdma_cgroup *cg,
-		     struct rdmacg_device *device,
-		     enum rdmacg_resource_type index);
-#endif	/* CONFIG_CGROUP_RDMA */
-#endif	/* _CGROUP_RDMA_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ff4cad3a2275..7f4a2a5a2a77 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -60,10 +60,6 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
-#if IS_ENABLED(CONFIG_CGROUP_RDMA)
-SUBSYS(rdma)
-#endif
-
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3cfe2d27811b..d807fa9b2051 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -56,7 +56,7 @@ static inline void cpuset_dec(void)
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_force_rebuild(void);
-extern void cpuset_update_active_cpus(void);
+extern void cpuset_update_active_cpus(bool cpu_online);
 extern void cpuset_wait_for_hotplug(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
@@ -172,7 +172,7 @@ static inline void cpuset_init_smp(void) {}
 
 static inline void cpuset_force_rebuild(void) { }
 
-static inline void cpuset_update_active_cpus(void)
+static inline void cpuset_update_active_cpus(bool cpu_online)
 {
 	partition_sched_domains(1, NULL, NULL);
 }
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 796dc4380de0..09debf2e047f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -252,18 +252,6 @@ static inline const struct cred *get_cred(const struct cred *cred)
 	return get_new_cred(nonconst_cred);
 }
 
-static inline const struct cred *get_cred_rcu(const struct cred *cred)
-{
-	struct cred *nonconst_cred = (struct cred *) cred;
-	if (!cred)
-		return NULL;
-	if (!atomic_inc_not_zero(&nonconst_cred->usage))
-		return NULL;
-	validate_creds(cred);
-	nonconst_cred->non_rcu = 0;
-	return cred;
-}
-
 /**
  * put_cred - Release a reference to a set of credentials
  * @cred: The credentials to release
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index b01be50dbb24..442b54a14cbc 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -120,7 +120,6 @@ extern void __fd_install(struct files_struct *files,
 		      unsigned int fd, struct file *file);
 extern int __close_fd(struct files_struct *files,
 		      unsigned int fd);
-extern int close_fd_get_file(unsigned int fd, struct file **res);
 
 extern struct kmem_cache *files_cachep;
 
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 4df9b50cb1c3..44e529353b6b 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -188,7 +188,7 @@ struct kernfs_open_file {
 	char			*prealloc_buf;
 
 	size_t			atomic_write_len;
-	bool			mmapped:1;
+	bool			mmapped;
 	bool			released:1;
 	const struct vm_operations_struct *vm_ops;
 };
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1a548961be39..d2d7208b2274 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -32,28 +32,6 @@ struct user_struct;
 struct writeback_control;
 struct bdi_writeback;
 
-/**
- * mmgrab() - Pin a &struct mm_struct.
- * @mm: The &struct mm_struct to pin.
- *
- * Make sure that @mm will not get freed even after the owning task
- * exits. This doesn't guarantee that the associated address space
- * will still exist later on and mmget_not_zero() has to be used before
- * accessing it.
- *
- * This is a preferred way to pin @mm for a longer/unbounded amount
- * of time.
- *
- * Use mmdrop() to release the reference acquired by mmgrab().
- *
- * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
- * of &mm_struct.mm_count vs &mm_struct.mm_users.
- */
-static inline void mmgrab(struct mm_struct *mm)
-{
-	atomic_inc(&mm->mm_count);
-}
-
 #ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d8106413464f..32111634c69b 100755
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1829,8 +1829,6 @@ struct task_struct {
 #ifdef CONFIG_CGROUPS
 	/* disallow userland-initiated cgroup migration */
 	unsigned no_cgroup_migration:1;
-	/* task is frozen/stopped (used by the cgroup freezer) */
-	unsigned			frozen:1;
 #endif
 
 	unsigned long atomic_flags; /* Flags needing atomic access. */
@@ -2650,7 +2648,6 @@ TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
 #define JOBCTL_TRAP_NOTIFY_BIT	20	/* trap for NOTIFY */
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
 #define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
-#define JOBCTL_TRAP_FREEZE_BIT	23	/* trap for cgroup freezer */
 
 #define JOBCTL_STOP_DEQUEUED	(1UL << JOBCTL_STOP_DEQUEUED_BIT)
 #define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)
@@ -2659,7 +2656,6 @@ TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
 #define JOBCTL_TRAP_NOTIFY	(1UL << JOBCTL_TRAP_NOTIFY_BIT)
 #define JOBCTL_TRAPPING		(1UL << JOBCTL_TRAPPING_BIT)
 #define JOBCTL_LISTENING	(1UL << JOBCTL_LISTENING_BIT)
-#define JOBCTL_TRAP_FREEZE	(1UL << JOBCTL_TRAP_FREEZE_BIT)
 
 #define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
 #define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 975be862e083..9089a2ae913d 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,7 +1,5 @@
-#ifndef _LINUX_SCHED_DEADLINE_H
-#define _LINUX_SCHED_DEADLINE_H
-
-#include <linux/sched.h>
+#ifndef _SCHED_DEADLINE_H
+#define _SCHED_DEADLINE_H
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -28,4 +26,4 @@ static inline bool dl_time_before(u64 a, u64 b)
 	return (s64)(a - b) < 0;
 }
 
-#endif /* _LINUX_SCHED_DEADLINE_H */
+#endif /* _SCHED_DEADLINE_H */
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index 2cc450f6ec54..d9cf5a5762d9 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -1,5 +1,5 @@
-#ifndef _LINUX_SCHED_PRIO_H
-#define _LINUX_SCHED_PRIO_H
+#ifndef _SCHED_PRIO_H
+#define _SCHED_PRIO_H
 
 #define MAX_NICE	19
 #define MIN_NICE	-20
@@ -57,4 +57,4 @@ static inline long rlimit_to_nice(long prio)
 	return (MAX_NICE - prio + 1);
 }
 
-#endif /* _LINUX_SCHED_PRIO_H */
+#endif /* _SCHED_PRIO_H */
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 3bd668414f61..a30b172df6e1 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -1,9 +1,7 @@
-#ifndef _LINUX_SCHED_RT_H
-#define _LINUX_SCHED_RT_H
+#ifndef _SCHED_RT_H
+#define _SCHED_RT_H
 
-#include <linux/sched.h>
-
-struct task_struct;
+#include <linux/sched/prio.h>
 
 static inline int rt_prio(int prio)
 {
@@ -59,4 +57,4 @@ extern void normalize_rt_tasks(void);
  */
 #define RR_TIMESLICE		(100 * HZ / 1000)
 
-#endif /* _LINUX_SCHED_RT_H */
+#endif /* _SCHED_RT_H */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 07207044b5f4..9ff03d20b986 100755
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -1,9 +1,5 @@
-#ifndef _LINUX_SCHED_SYSCTL_H
-#define _LINUX_SCHED_SYSCTL_H
-
-#include <linux/types.h>
-
-struct ctl_table;
+#ifndef _SCHED_SYSCTL_H
+#define _SCHED_SYSCTL_H
 
 #ifdef CONFIG_DETECT_HUNG_TASK
 extern int	     sysctl_hung_task_check_count;
@@ -152,4 +148,4 @@ extern int sched_little_cluster_coloc_fmin_khz_handler(struct ctl_table *table,
 extern char sched_lib_name[LIB_PATH_LENGTH];
 extern unsigned int sched_lib_mask_force;
 extern bool is_sched_lib_based_app(pid_t pid);
-#endif /* _LINUX_SCHED_SYSCTL_H */
+#endif /* _SCHED_SYSCTL_H */
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index d75248d81499..e305b66a9fb9 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -139,20 +139,6 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_release_private(struct inode *, struct file *);
 
-#define DEFINE_SHOW_ATTRIBUTE(__name)					\
-static int __name ## _open(struct inode *inode, struct file *file)	\
-{									\
-	return single_open(file, __name ## _show, inode->i_private);	\
-}									\
-									\
-static const struct file_operations __name ## _fops = {			\
-	.owner		= THIS_MODULE,					\
-	.open		= __name ## _open,				\
-	.read		= seq_read,					\
-	.llseek		= seq_lseek,					\
-	.release	= single_release,				\
-}
-
 static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
 {
 #ifdef CONFIG_USER_NS
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index dab24c19c82a..3558b58da3e4 100755
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -1,4 +1,3 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
  * Copyright (C) 2008 Google, Inc.
  *
@@ -67,7 +66,6 @@ enum flat_binder_object_flags {
 	 * @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds.
 	 */
 	FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100,
-
 	/**
 	 * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy
 	 *
@@ -89,6 +87,7 @@ enum flat_binder_object_flags {
 	 * scheduling policy from the caller (for synchronous transactions).
 	 */
 	FLAT_BINDER_FLAG_INHERIT_RT = 0x800,
+#ifdef __KERNEL__
 
 	/**
 	 * @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts
@@ -97,6 +96,7 @@ enum flat_binder_object_flags {
 	 * context
 	 */
 	FLAT_BINDER_FLAG_TXN_SECURITY_CTX = 0x1000,
+#endif /* __KERNEL__ */
 };
 
 #ifdef BINDER_IPC_32BIT
@@ -265,25 +265,6 @@ struct binder_node_info_for_ref {
 	__u32            reserved3;
 };
 
-struct binder_freeze_info {
-	__u32            pid;
-	__u32            enable;
-	__u32            timeout_ms;
-};
-
-struct binder_frozen_status_info {
-	__u32            pid;
-
-	/* process received sync transactions since last frozen
-	 * bit 0: received sync transaction after being frozen
-	 * bit 1: new pending sync transaction during freezing
-	 */
-	__u32            sync_recv;
-
-	/* process received async transactions since last frozen */
-	__u32            async_recv;
-};
-
 #define BINDER_WRITE_READ		_IOWR('b', 1, struct binder_write_read)
 #define BINDER_SET_IDLE_TIMEOUT		_IOW('b', 3, __s64)
 #define BINDER_SET_MAX_THREADS		_IOW('b', 5, __u32)
@@ -294,9 +275,6 @@ struct binder_frozen_status_info {
 #define BINDER_GET_NODE_DEBUG_INFO	_IOWR('b', 11, struct binder_node_debug_info)
 #define BINDER_GET_NODE_INFO_FOR_REF	_IOWR('b', 12, struct binder_node_info_for_ref)
 #define BINDER_SET_CONTEXT_MGR_EXT	_IOW('b', 13, struct flat_binder_object)
-#define BINDER_FREEZE			_IOW('b', 14, struct binder_freeze_info)
-#define BINDER_GET_FROZEN_INFO		_IOWR('b', 15, struct binder_frozen_status_info)
-#define BINDER_ENABLE_ONEWAY_SPAM_DETECTION	_IOW('b', 16, __u32)
 
 /*
  * NOTE: Two special error codes you should check for when calling
@@ -319,7 +297,6 @@ enum transaction_flags {
 	TF_STATUS_CODE	= 0x08,	/* contents are a 32-bit status code */
 	TF_ACCEPT_FDS	= 0x10,	/* allow replies with file descriptors */
 	TF_CLEAR_BUF	= 0x20,	/* clear buffer on txn complete */
-	TF_UPDATE_TXN	= 0x40,	/* update the outdated pending async txn */
 };
 
 struct binder_transaction_data {
@@ -357,11 +334,13 @@ struct binder_transaction_data {
 	} data;
 };
 
+#ifdef __KERNEL__
 struct binder_transaction_data_secctx {
 	struct binder_transaction_data transaction_data;
 	binder_uintptr_t secctx;
 };
 
+#endif /* __KERNEL__ */
 struct binder_transaction_data_sg {
 	struct binder_transaction_data transaction_data;
 	binder_size_t buffers_size;
@@ -397,11 +376,13 @@ enum binder_driver_return_protocol {
 	BR_OK = _IO('r', 1),
 	/* No parameters! */
 
+#ifdef __KERNEL__
 	BR_TRANSACTION_SEC_CTX = _IOR('r', 2,
 				      struct binder_transaction_data_secctx),
 	/*
 	 * binder_transaction_data_secctx: the received command.
 	 */
+#endif /* __KERNEL__ */
 	BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data),
 	BR_REPLY = _IOR('r', 3, struct binder_transaction_data),
 	/*
@@ -476,22 +457,9 @@ enum binder_driver_return_protocol {
 
 	BR_FAILED_REPLY = _IO('r', 17),
 	/*
-	 * The last transaction (either a bcTRANSACTION or
+	 * The last transaction (either a bcTRANSACTION or
 	 * a bcATTEMPT_ACQUIRE) failed (e.g. out of memory).  No parameters.
 	 */
-
-	BR_FROZEN_REPLY = _IO('r', 18),
-	/*
-	 * The target of the last transaction (either a bcTRANSACTION or
-	 * a bcATTEMPT_ACQUIRE) is frozen.  No parameters.
-	 */
-
-	BR_ONEWAY_SPAM_SUSPECT = _IO('r', 19),
-	/*
-	 * Current process sent too many oneway calls to target, and the last
-	 * asynchronous transaction makes the allocated async buffer size exceed
-	 * detection threshold.  No parameters.
-	 */
 };
 
 enum binder_driver_command_protocol {
@@ -578,3 +546,4 @@ enum binder_driver_command_protocol {
 };
 
 #endif /* _UAPI_LINUX_BINDER_H */
+
diff --git a/include/uapi/linux/android/binderfs.h b/include/uapi/linux/android/binderfs.h
deleted file mode 100644
index 87410477aea9..000000000000
--- a/include/uapi/linux/android/binderfs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 2018 Canonical Ltd.
- *
- */
-
-#ifndef _UAPI_LINUX_BINDERFS_H
-#define _UAPI_LINUX_BINDERFS_H
-
-#include <linux/android/binder.h>
-#include <linux/types.h>
-#include <linux/ioctl.h>
-
-#define BINDERFS_MAX_NAME 255
-
-/**
- * struct binderfs_device - retrieve information about a new binder device
- * @name:   the name to use for the new binderfs binder device
- * @major:  major number allocated for binderfs binder devices
- * @minor:  minor number allocated for the new binderfs binder device
- *
- */
-struct binderfs_device {
-	char name[BINDERFS_MAX_NAME + 1];
-	__u32 major;
-	__u32 minor;
-};
-
-/**
- * Allocate a new binder device.
- */
-#define BINDER_CTL_ADD _IOWR('b', 1, struct binderfs_device)
-
-#endif /* _UAPI_LINUX_BINDERFS_H */
-
diff --git a/init/Kconfig b/init/Kconfig
index bd93c3f5015a..25fb46dd2b56 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -858,16 +858,6 @@ config CGROUP_PIDS
 	  since the PIDs limit only affects a process's ability to fork, not to
 	  attach to a cgroup.
 
-config CGROUP_RDMA
-	bool "RDMA controller"
-	help
-	  Provides enforcement of RDMA resources defined by IB stack.
-	  It is fairly easy for consumers to exhaust RDMA resources, which
-	  can result into resource unavailability to other consumers.
-	  RDMA controller is designed to stop this from happening.
-	  Attaching processes with active RDMA resources to the cgroup
-	  hierarchy is allowed even if can cross the hierarchy's limit.
-
 config CGROUP_FREEZER
 	bool "Freezer controller"
 	help
@@ -946,14 +936,11 @@ config CGROUP_BPF
 	  inet sockets.
 
 config CGROUP_DEBUG
-	bool "Debug controller"
+	bool "Example controller"
 	default n
-	depends on DEBUG_KERNEL
 	help
 	  This option enables a simple controller that exports
-	  debugging information about the cgroups framework. This
-	  controller is for control cgroup debugging only. Its
-	  interfaces are not stable.
+	  debugging information about the cgroups framework.
 
 	  Say N.
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 9fec7d39f4b0..f3a91fa080bf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -65,7 +65,10 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
-obj-$(CONFIG_CGROUPS) += cgroup/
+obj-$(CONFIG_CGROUPS) += cgroup.o
+obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup.c
similarity index 70%
rename from kernel/cgroup/cgroup.c
rename to kernel/cgroup.c
index 37cdbeb85a92..5c6deb033c96 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup.c
@@ -28,13 +28,15 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include "cgroup-internal.h"
-
+#include <linux/cgroup.h>
 #include <linux/cred.h>
+#include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/init_task.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
 #include <linux/magic.h>
+#include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
@@ -45,11 +47,16 @@
 #include <linux/spinlock.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/string.h>
+#include <linux/sort.h>
+#include <linux/kmod.h>
+#include <linux/delayacct.h>
+#include <linux/cgroupstats.h>
 #include <linux/hashtable.h>
+#include <linux/pid_namespace.h>
 #include <linux/idr.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/kthread.h>
 #include <linux/delay.h>
-#include <linux/cpuset.h>
 #include <linux/atomic.h>
 #include <linux/cpuset.h>
 #include <linux/proc_ns.h>
@@ -61,6 +68,14 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/cgroup.h>
 
+/*
+ * pidlists linger the following amount before being destroyed.  The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls.
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
+
 #define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
 					 MAX_CFTYPE_NAME + 2)
 
@@ -74,12 +89,14 @@
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  * cgroup.h can use them for lockdep annotations.
  */
+#ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
 DEFINE_SPINLOCK(css_set_lock);
-
-#ifdef CONFIG_PROVE_RCU
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_lock);
+#else
+static DEFINE_MUTEX(cgroup_mutex);
+static DEFINE_SPINLOCK(css_set_lock);
 #endif
 
 /*
@@ -94,6 +111,12 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 
+/*
+ * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
+ * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
 struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 
 #define cgroup_assert_mutex_or_rcu_locked()				\
@@ -109,9 +132,15 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
  */
 static struct workqueue_struct *cgroup_destroy_wq;
 
+/*
+ * pidlist destructions need to be flushed on cgroup destruction.  Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
 /* generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
-struct cgroup_subsys *cgroup_subsys[] = {
+static struct cgroup_subsys *cgroup_subsys[] = {
 #include <linux/cgroup_subsys.h>
 };
 #undef SUBSYS
@@ -158,17 +187,18 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  */
 static bool cgrp_dfl_visible;
 
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
 /* some controllers are not supported in the default hierarchy */
 static u16 cgrp_dfl_inhibit_ss_mask;
 
 /* some controllers are implicitly enabled on the default hierarchy */
-static u16 cgrp_dfl_implicit_ss_mask;
-
-/* some controllers can be threaded on the default hierarchy */
-static u16 cgrp_dfl_threaded_ss_mask;
+static unsigned long cgrp_dfl_implicit_ss_mask;
 
 /* The list of hierarchy roots */
-LIST_HEAD(cgroup_roots);
+
+static LIST_HEAD(cgroup_roots);
 static int cgroup_root_count;
 
 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
@@ -184,25 +214,29 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
 static u64 css_serial_nr_next = 1;
 
 /*
- * These bitmasks identify subsystems with specific features to avoid
- * having to do iterative checks repeatedly.
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
  */
 static u16 have_fork_callback __read_mostly;
 static u16 have_exit_callback __read_mostly;
-static u16 have_release_callback __read_mostly;
-static u16 have_canfork_callback __read_mostly;
+static u16 have_free_callback __read_mostly;
 
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
-	.count		= REFCOUNT_INIT(2),
+	.count		= { .counter = 2, },
 	.user_ns	= &init_user_ns,
 	.ns.ops		= &cgroupns_operations,
 	.ns.inum	= PROC_CGROUP_INIT_INO,
 	.root_cset	= &init_css_set,
 };
 
+/* Ditto for the can_fork callback. */
+static u16 have_canfork_callback __read_mostly;
+
 static struct file_system_type cgroup2_fs_type;
-static struct cftype cgroup_base_files[];
+static struct cftype cgroup_dfl_base_files[];
+static struct cftype cgroup_legacy_base_files[];
 
 /* cgroup optional features */
 enum cgroup_opt_features {
@@ -220,10 +254,11 @@ static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
 
 static u16 cgroup_feature_disable_mask __read_mostly;
 
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
-static void css_task_iter_skip(struct css_task_iter *it,
-				struct task_struct *task);
+static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 					      struct cgroup_subsys *ss);
@@ -241,7 +276,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
  * is fine for individual subsystems but unsuitable for cgroup core.  This
  * is slower static_key_enabled() based test indexed by @ssid.
  */
-bool cgroup_ssid_enabled(int ssid)
+static bool cgroup_ssid_enabled(int ssid)
 {
 	if (CGROUP_SUBSYS_COUNT == 0)
 		return false;
@@ -249,6 +284,11 @@ bool cgroup_ssid_enabled(int ssid)
 	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 }
 
+static bool cgroup_ssid_no_v1(int ssid)
+{
+	return cgroup_no_v1_mask & (1 << ssid);
+}
+
 /**
  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
  * @cgrp: the cgroup of interest
@@ -302,7 +342,7 @@ bool cgroup_ssid_enabled(int ssid)
  *
  * - debug: disallowed on the default hierarchy.
  */
-bool cgroup_on_dfl(const struct cgroup *cgrp)
+static bool cgroup_on_dfl(const struct cgroup *cgrp)
 {
 	return cgrp->root == &cgrp_dfl_root;
 }
@@ -338,103 +378,14 @@ static void cgroup_idr_remove(struct idr *idr, int id)
 	spin_unlock_bh(&cgroup_idr_lock);
 }
 
-static bool cgroup_has_tasks(struct cgroup *cgrp)
-{
-	return cgrp->nr_populated_csets;
-}
-
-bool cgroup_is_threaded(struct cgroup *cgrp)
-{
-	return cgrp->dom_cgrp != cgrp;
-}
-
-/* can @cgrp host both domain and threaded children? */
-static bool cgroup_is_mixable(struct cgroup *cgrp)
-{
-	/*
-	 * Root isn't under domain level resource control exempting it from
-	 * the no-internal-process constraint, so it can serve as a thread
-	 * root and a parent of resource domains at the same time.
-	 */
-	return !cgroup_parent(cgrp);
-}
-
-/* can @cgrp become a thread root? should always be true for a thread root */
-static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
-{
-	/* mixables don't care */
-	if (cgroup_is_mixable(cgrp))
-		return true;
-
-	/* domain roots can't be nested under threaded */
-	if (cgroup_is_threaded(cgrp))
-		return false;
-
-	/* can only have either domain or threaded children */
-	if (cgrp->nr_populated_domain_children)
-		return false;
-
-	/* and no domain controllers can be enabled */
-	if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
-		return false;
-
-	return true;
-}
-
-/* is @cgrp root of a threaded subtree? */
-bool cgroup_is_thread_root(struct cgroup *cgrp)
-{
-	/* thread root should be a domain */
-	if (cgroup_is_threaded(cgrp))
-		return false;
-
-	/* a domain w/ threaded children is a thread root */
-	if (cgrp->nr_threaded_children)
-		return true;
-
-	/*
-	 * A domain which has tasks and explicit threaded controllers
-	 * enabled is a thread root.
-	 */
-	if (cgroup_has_tasks(cgrp) &&
-	    (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
-		return true;
-
-	return false;
-}
-
-/* a domain which isn't connected to the root w/o brekage can't be used */
-static bool cgroup_is_valid_domain(struct cgroup *cgrp)
-{
-	/* the cgroup itself can be a thread root */
-	if (cgroup_is_threaded(cgrp))
-		return false;
-
-	/* but the ancestors can't be unless mixable */
-	while ((cgrp = cgroup_parent(cgrp))) {
-		if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
-			return false;
-		if (cgroup_is_threaded(cgrp))
-			return false;
-	}
-
-	return true;
-}
-
 /* subsystems visibly enabled on a cgroup */
 static u16 cgroup_control(struct cgroup *cgrp)
 {
 	struct cgroup *parent = cgroup_parent(cgrp);
 	u16 root_ss_mask = cgrp->root->subsys_mask;
 
-	if (parent) {
-		u16 ss_mask = parent->subtree_control;
-
-		/* threaded cgroups can only have threaded controllers */
-		if (cgroup_is_threaded(cgrp))
-			ss_mask &= cgrp_dfl_threaded_ss_mask;
-		return ss_mask;
-	}
+	if (parent)
+		return parent->subtree_control;
 
 	if (cgroup_on_dfl(cgrp))
 		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -447,14 +398,8 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
 {
 	struct cgroup *parent = cgroup_parent(cgrp);
 
-	if (parent) {
-		u16 ss_mask = parent->subtree_ss_mask;
-
-		/* threaded cgroups can only have threaded controllers */
-		if (cgroup_is_threaded(cgrp))
-			ss_mask &= cgrp_dfl_threaded_ss_mask;
-		return ss_mask;
-	}
+	if (parent)
+		return parent->subtree_ss_mask;
 
 	return cgrp->root->subsys_mask;
 }
@@ -544,37 +489,10 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
 	return css;
 }
 
-/**
- * __cgroup_task_count - count the number of tasks in a cgroup. The caller
- * is responsible for taking the css_set_lock.
- * @cgrp: the cgroup in question
- */
-int __cgroup_task_count(const struct cgroup *cgrp)
-{
-	int count = 0;
-	struct cgrp_cset_link *link;
-
-	lockdep_assert_held(&css_set_lock);
-
-	list_for_each_entry(link, &cgrp->cset_links, cset_link)
-		count += link->cset->nr_tasks;
-
-	return count;
-}
-
-/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- */
-int cgroup_task_count(const struct cgroup *cgrp)
+/* convenient tests for these bits */
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
-	int count;
-
-	spin_lock_irq(&css_set_lock);
-	count = __cgroup_task_count(cgrp);
-	spin_unlock_irq(&css_set_lock);
-
-	return count;
+	return !(cgrp->self.flags & CSS_ONLINE);
 }
 
 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
@@ -597,6 +515,11 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 }
 EXPORT_SYMBOL_GPL(of_css);
 
+static int notify_on_release(const struct cgroup *cgrp)
+{
+	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
 /**
  * for_each_css - iterate all css's of a cgroup
  * @css: the iteration cursor
@@ -626,6 +549,15 @@ EXPORT_SYMBOL_GPL(of_css);
 			;						\
 		else
 
+/**
+ * for_each_subsys - iterate all cgroup subsystems in cgroup_subsys[]
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid)					\
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
 /**
  * do_each_subsys_mask - filter for_each_subsys with a bitmask
  * @ss: the iteration cursor
@@ -650,6 +582,10 @@ EXPORT_SYMBOL_GPL(of_css);
 	}								\
 } while (false)
 
+/* iterate across the hierarchies */
+#define for_each_root(root)						\
+	list_for_each_entry((root), &cgroup_roots, root_list)
+
 /* iterate over child cgrps, lock should be held throughout iteration */
 #define cgroup_for_each_live_child(child, cgrp)				\
 	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
@@ -676,6 +612,29 @@ EXPORT_SYMBOL_GPL(of_css);
 			;						\
 		else
 
+static void cgroup_release_agent(struct work_struct *work);
+static void check_for_release(struct cgroup *cgrp);
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup		*cgrp;
+	struct css_set		*cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head	cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head	cgrp_link;
+};
+
 /*
  * The default css_set - used by init and its children prior to any
  * hierarchies being mounted. It contains a pointer to the root state
@@ -684,42 +643,20 @@ EXPORT_SYMBOL_GPL(of_css);
  * haven't been created.
  */
 struct css_set init_css_set = {
-	.refcount		= REFCOUNT_INIT(1),
-	.dom_cset		= &init_css_set,
+	.refcount		= ATOMIC_INIT(1),
+	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
-	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
-	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
-	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
-	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
-	.mg_src_preload_node	= LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
-	.mg_dst_preload_node	= LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
+	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
-
-	/*
-	 * The following field is re-initialized when this cset gets linked
-	 * in cgroup_init().  However, let's initialize the field
-	 * statically too so that the default cgroup can be accessed safely
-	 * early during boot.
-	 */
-	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
+	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
 
-static bool css_set_threaded(struct css_set *cset)
-{
-	return cset->dom_cset != cset;
-}
-
 /**
  * css_set_populated - does a css_set contain any tasks?
  * @cset: target css_set
- *
- * css_set_populated() should be the same as !!cset->nr_tasks at steady
- * state. However, css_set_populated() can be called while a task is being
- * added to or removed from the linked list before the nr_tasks is
- * properly updated. Hence, we can't just look at ->nr_tasks here.
  */
 static bool css_set_populated(struct css_set *cset)
 {
@@ -729,48 +666,39 @@ static bool css_set_populated(struct css_set *cset)
 }
 
 /**
- * cgroup_update_populated - update the populated count of a cgroup
+ * cgroup_update_populated - update populated count of a cgroup
  * @cgrp: the target cgroup
  * @populated: inc or dec populated count
  *
  * One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
- * count is propagated towards root so that a given cgroup's
- * nr_populated_children is zero iff none of its descendants contain any
- * tasks.
- *
- * @cgrp's interface file "cgroup.populated" is zero if both
- * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
- * 1 otherwise.  When the sum changes from or to zero, userland is notified
- * that the content of the interface file has changed.  This can be used to
- * detect when @cgrp and its descendants become populated or empty.
+ * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
+ * count is propagated towards root so that a given cgroup's populated_cnt
+ * is zero iff the cgroup and all its descendants don't contain any tasks.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed.  This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
  */
 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 {
-	struct cgroup *child = NULL;
-	int adj = populated ? 1 : -1;
-
 	lockdep_assert_held(&css_set_lock);
 
 	do {
-		bool was_populated = cgroup_is_populated(cgrp);
+		bool trigger;
 
-		if (!child) {
-			cgrp->nr_populated_csets += adj;
-		} else {
-			if (cgroup_is_threaded(child))
-				cgrp->nr_populated_threaded_children += adj;
-			else
-				cgrp->nr_populated_domain_children += adj;
-		}
+		if (populated)
+			trigger = !cgrp->populated_cnt++;
+		else
+			trigger = !--cgrp->populated_cnt;
 
-		if (was_populated == cgroup_is_populated(cgrp))
+		if (!trigger)
 			break;
 
-		cgroup1_check_for_release(cgrp);
+		check_for_release(cgrp);
 		cgroup_file_notify(&cgrp->events_file);
 
-		child = cgrp;
 		cgrp = cgroup_parent(cgrp);
 	} while (cgrp);
 }
@@ -781,7 +709,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
  * @populated: whether @cset is populated or depopulated
  *
  * @cset is either getting the first task or losing the last.  Update the
- * populated counters of all associated cgroups accordingly.
+ * ->populated_cnt of all associated cgroups accordingly.
  */
 static void css_set_update_populated(struct css_set *cset, bool populated)
 {
@@ -793,21 +721,6 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
 		cgroup_update_populated(link->cgrp, populated);
 }
 
-/*
- * @task is leaving, advance task iterators which are pointing to it so
- * that they can resume at the next position.  Advancing an iterator might
- * remove it from the list, use safe walk.  See css_task_iter_skip() for
- * details.
- */
-static void css_set_skip_task_iters(struct css_set *cset,
-				    struct task_struct *task)
-{
-	struct css_task_iter *it, *pos;
-
-	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
-		css_task_iter_skip(it, task);
-}
-
 /**
  * css_set_move_task - move a task from one css_set to another
  * @task: task being moved
@@ -819,7 +732,7 @@ static void css_set_skip_task_iters(struct css_set *cset,
  * css_set, @from_cset can be NULL.  If @task is being disassociated
  * instead of moved, @to_cset can be NULL.
  *
- * This function automatically handles populated counter updates and
+ * This function automatically handles populated_cnt updates and
  * css_task_iter adjustments but the caller is responsible for managing
  * @from_cset and @to_cset's reference counts.
  */
@@ -833,9 +746,22 @@ static void css_set_move_task(struct task_struct *task,
 		css_set_update_populated(to_cset, true);
 
 	if (from_cset) {
+		struct css_task_iter *it, *pos;
+
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
-		css_set_skip_task_iters(from_cset, task);
+		/*
+		 * @task is leaving, advance task iterators which are
+		 * pointing to it so that they can resume at the next
+		 * position.  Advancing an iterator might remove it from
+		 * the list, use safe walk.  See css_task_iter_advance*()
+		 * for details.
+		 */
+		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+					 iters_node)
+			if (it->task_pos == &task->cg_list)
+				css_task_iter_advance(it);
+
 		list_del_init(&task->cg_list);
 		if (!css_set_populated(from_cset))
 			css_set_update_populated(from_cset, false);
@@ -879,7 +805,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 	return key;
 }
 
-void put_css_set_locked(struct css_set *cset)
+static void put_css_set_locked(struct css_set *cset)
 {
 	struct cgrp_cset_link *link, *tmp_link;
 	struct cgroup_subsys *ss;
@@ -887,11 +813,9 @@ void put_css_set_locked(struct css_set *cset)
 
 	lockdep_assert_held(&css_set_lock);
 
-	if (!refcount_dec_and_test(&cset->refcount))
+	if (!atomic_dec_and_test(&cset->refcount))
 		return;
 
-	WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
-
 	/* This css_set is dead. unlink it and release cgroup and css refs */
 	for_each_subsys(ss, ssid) {
 		list_del(&cset->e_cset_node[ssid]);
@@ -908,14 +832,34 @@ void put_css_set_locked(struct css_set *cset)
 		kfree(link);
 	}
 
-	if (css_set_threaded(cset)) {
-		list_del(&cset->threaded_csets_node);
-		put_css_set_locked(cset->dom_cset);
-	}
-
 	kfree_rcu(cset, rcu_head);
 }
 
+static void put_css_set(struct css_set *cset)
+{
+	unsigned long flags;
+
+	/*
+	 * Drop a reference locklessly unless it is the last one; the
+	 * final put goes through put_css_set_locked() with css_set_lock
+	 * (a spinlock, taken irqsave) held, like atomic_dec_and_lock().
+	 */
+	if (atomic_add_unless(&cset->refcount, -1, 1))
+		return;
+
+	spin_lock_irqsave(&css_set_lock, flags);
+	put_css_set_locked(cset);
+	spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+	atomic_inc(&cset->refcount);
+}
+
 /**
  * compare_css_sets - helper function for find_existing_css_set().
  * @cset: candidate css_set being tested
@@ -931,7 +875,6 @@ static bool compare_css_sets(struct css_set *cset,
 			     struct cgroup *new_cgrp,
 			     struct cgroup_subsys_state *template[])
 {
-	struct cgroup *new_dfl_cgrp;
 	struct list_head *l1, *l2;
 
 	/*
@@ -942,16 +885,6 @@ static bool compare_css_sets(struct css_set *cset,
 	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
 		return false;
 
-
-	/* @cset's domain should match the default cgroup's */
-	if (cgroup_on_dfl(new_cgrp))
-		new_dfl_cgrp = new_cgrp;
-	else
-		new_dfl_cgrp = old_cset->dfl_cgrp;
-
-	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
-		return false;
-
 	/*
 	 * Compare cgroup pointers in order to distinguish between
 	 * different cgroups in hierarchies.  As different cgroups may
@@ -1158,18 +1091,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 		return NULL;
 	}
 
-	refcount_set(&cset->refcount, 1);
-	cset->dom_cset = cset;
+	atomic_set(&cset->refcount, 1);
+	INIT_LIST_HEAD(&cset->cgrp_links);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
-	INIT_LIST_HEAD(&cset->dying_tasks);
+	INIT_LIST_HEAD(&cset->mg_preload_node);
+	INIT_LIST_HEAD(&cset->mg_node);
 	INIT_LIST_HEAD(&cset->task_iters);
-	INIT_LIST_HEAD(&cset->threaded_csets);
 	INIT_HLIST_NODE(&cset->hlist);
-	INIT_LIST_HEAD(&cset->cgrp_links);
-	INIT_LIST_HEAD(&cset->mg_src_preload_node);
-	INIT_LIST_HEAD(&cset->mg_dst_preload_node);
-	INIT_LIST_HEAD(&cset->mg_node);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
@@ -1203,32 +1132,10 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 	spin_unlock_irq(&css_set_lock);
 
-	/*
-	 * If @cset should be threaded, look up the matching dom_cset and
-	 * link them up.  We first fully initialize @cset then look for the
-	 * dom_cset.  It's simpler this way and safe as @cset is guaranteed
-	 * to stay empty until we return.
-	 */
-	if (cgroup_is_threaded(cset->dfl_cgrp)) {
-		struct css_set *dcset;
-
-		dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
-		if (!dcset) {
-			put_css_set(cset);
-			return NULL;
-		}
-
-		spin_lock_irq(&css_set_lock);
-		cset->dom_cset = dcset;
-		list_add_tail(&cset->threaded_csets_node,
-			      &dcset->threaded_csets);
-		spin_unlock_irq(&css_set_lock);
-	}
-
 	return cset;
 }
 
-struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
 
@@ -1256,7 +1163,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 }
 
-void cgroup_free_root(struct cgroup_root *root)
+static void cgroup_free_root(struct cgroup_root *root)
 {
 	if (root) {
 		idr_destroy(&root->cgroup_idr);
@@ -1352,8 +1259,6 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
 	if (cset == &init_css_set) {
 		res = &root->cgrp;
-	} else if (root == &cgrp_dfl_root) {
-		res = cset->dfl_cgrp;
 	} else {
 		struct cgrp_cset_link *link;
 
@@ -1375,8 +1280,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
  * Return the cgroup for "task" from the given hierarchy. Must be
  * called with cgroup_mutex and css_set_lock held.
  */
-struct cgroup *task_cgroup_from_root(struct task_struct *task,
-				     struct cgroup_root *root)
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+					    struct cgroup_root *root)
 {
 	/*
 	 * No need to lock the task - since we hold cgroup_mutex the
@@ -1413,6 +1318,7 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task,
  */
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
+static const struct file_operations proc_cgroupstats_operations;
 
 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			      char *buf)
@@ -1425,7 +1331,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
 			 cft->name);
 	else
-		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
 }
 
@@ -1506,7 +1412,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
  * inaccessible any time.  If the caller intends to continue to access the
  * cgroup, it should pin it before invoking this function.
  */
-void cgroup_kn_unlock(struct kernfs_node *kn)
+static void cgroup_kn_unlock(struct kernfs_node *kn)
 {
 	struct cgroup *cgrp;
 
@@ -1538,7 +1444,8 @@ void cgroup_kn_unlock(struct kernfs_node *kn)
  * locking under kernfs active protection and allows all kernfs operations
  * including self-removal.
  */
-struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
+					  bool drain_offline)
 {
 	struct cgroup *cgrp;
 
@@ -1601,17 +1508,8 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
 
 	css->flags &= ~CSS_VISIBLE;
 
-	if (!css->ss) {
-		if (cgroup_on_dfl(cgrp))
-			cfts = cgroup_base_files;
-		else
-			cfts = cgroup1_base_files;
-
+	list_for_each_entry(cfts, &css->ss->cfts, node)
 		cgroup_addrm_files(css, cgrp, cfts, false);
-	} else {
-		list_for_each_entry(cfts, &css->ss->cfts, node)
-			cgroup_addrm_files(css, cgrp, cfts, false);
-	}
 }
 
 /**
@@ -1631,20 +1529,18 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 
 	if (!css->ss) {
 		if (cgroup_on_dfl(cgrp))
-			cfts = cgroup_base_files;
+			cfts = cgroup_dfl_base_files;
 		else
-			cfts = cgroup1_base_files;
+			cfts = cgroup_legacy_base_files;
 
-		ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
-		if (ret < 0)
-			return ret;
-	} else {
-		list_for_each_entry(cfts, &css->ss->cfts, node) {
-			ret = cgroup_addrm_files(css, cgrp, cfts, true);
-			if (ret < 0) {
-				failed_cfts = cfts;
-				goto err;
-			}
+		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
+	}
+
+	list_for_each_entry(cfts, &css->ss->cfts, node) {
+		ret = cgroup_addrm_files(css, cgrp, cfts, true);
+		if (ret < 0) {
+			failed_cfts = cfts;
+			goto err;
 		}
 	}
 
@@ -1660,7 +1556,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 	return ret;
 }
 
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 {
 	struct cgroup *dcgrp = &dst_root->cgrp;
 	struct cgroup_subsys *ss;
@@ -1753,8 +1649,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 	return 0;
 }
 
-int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
-		     struct kernfs_root *kf_root)
+static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+			    struct kernfs_root *kf_root)
 {
 	int len = 0;
 	char *buf = NULL;
@@ -1780,56 +1676,245 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	return len;
 }
 
-static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
+static int cgroup_show_options(struct seq_file *seq,
+			       struct kernfs_root *kf_root)
 {
-	char *token;
+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+	struct cgroup_subsys *ss;
+	int ssid;
+
+	if (root != &cgrp_dfl_root)
+		for_each_subsys(ss, ssid)
+			if (root->subsys_mask & (1 << ssid))
+				seq_show_option(seq, ss->legacy_name, NULL);
+	if (root->flags & CGRP_ROOT_NOPREFIX)
+		seq_puts(seq, ",noprefix");
+	if (root->flags & CGRP_ROOT_XATTR)
+		seq_puts(seq, ",xattr");
+
+	spin_lock(&release_agent_path_lock);
+	if (strlen(root->release_agent_path))
+		seq_show_option(seq, "release_agent",
+				root->release_agent_path);
+	spin_unlock(&release_agent_path_lock);
+
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+		seq_puts(seq, ",clone_children");
+	if (strlen(root->name))
+		seq_show_option(seq, "name", root->name);
+	return 0;
+}
 
-	*root_flags = 0;
+struct cgroup_sb_opts {
+	u16 subsys_mask;
+	unsigned int flags;
+	char *release_agent;
+	bool cpuset_clone_children;
+	char *name;
+	/* User explicitly requested empty subsystem */
+	bool none;
+};
 
-	if (!data || *data == '\0')
-		return 0;
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
+{
+	char *token, *o = data;
+	bool all_ss = false, one_ss = false;
+	u16 mask = U16_MAX;
+	struct cgroup_subsys *ss;
+	int nr_opts = 0;
+	int i;
+
+#ifdef CONFIG_CPUSETS
+	mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
 
-	while ((token = strsep(&data, ",")) != NULL) {
-		if (!strcmp(token, "nsdelegate")) {
-			*root_flags |= CGRP_ROOT_NS_DELEGATE;
+	memset(opts, 0, sizeof(*opts));
+
+	while ((token = strsep(&o, ",")) != NULL) {
+		nr_opts++;
+
+		if (!*token)
+			return -EINVAL;
+		if (!strcmp(token, "none")) {
+			/* Explicitly have no subsystems */
+			opts->none = true;
+			continue;
+		}
+		if (!strcmp(token, "all")) {
+			/* Mutually exclusive option 'all' + subsystem name */
+			if (one_ss)
+				return -EINVAL;
+			all_ss = true;
+			continue;
+		}
+		if (!strcmp(token, "noprefix")) {
+			opts->flags |= CGRP_ROOT_NOPREFIX;
+			continue;
+		}
+		if (!strcmp(token, "clone_children")) {
+			opts->cpuset_clone_children = true;
+			continue;
+		}
+		if (!strcmp(token, "xattr")) {
+			opts->flags |= CGRP_ROOT_XATTR;
+			continue;
+		}
+		if (!strncmp(token, "release_agent=", 14)) {
+			/* Specifying two release agents is forbidden */
+			if (opts->release_agent)
+				return -EINVAL;
+			opts->release_agent =
+				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
+			if (!opts->release_agent)
+				return -ENOMEM;
 			continue;
 		}
+		if (!strncmp(token, "name=", 5)) {
+			const char *name = token + 5;
+			/* Can't specify an empty name */
+			if (!strlen(name))
+				return -EINVAL;
+			/* Must match [\w.-]+ */
+			for (i = 0; i < strlen(name); i++) {
+				char c = name[i];
+				if (isalnum(c))
+					continue;
+				if ((c == '.') || (c == '-') || (c == '_'))
+					continue;
+				return -EINVAL;
+			}
+			/* Specifying two names is forbidden */
+			if (opts->name)
+				return -EINVAL;
+			opts->name = kstrndup(name,
+					      MAX_CGROUP_ROOT_NAMELEN - 1,
+					      GFP_KERNEL);
+			if (!opts->name)
+				return -ENOMEM;
 
-		pr_err("cgroup2: unknown option \"%s\"\n", token);
-		return -EINVAL;
-	}
+			continue;
+		}
 
-	return 0;
-}
+		for_each_subsys(ss, i) {
+			if (strcmp(token, ss->legacy_name))
+				continue;
+			if (!cgroup_ssid_enabled(i))
+				continue;
+			if (cgroup_ssid_no_v1(i))
+				continue;
 
-static void apply_cgroup_root_flags(unsigned int root_flags)
-{
-	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
-		if (root_flags & CGRP_ROOT_NS_DELEGATE)
-			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
-		else
-			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+			/* Mutually exclusive option 'all' + subsystem name */
+			if (all_ss)
+				return -EINVAL;
+			opts->subsys_mask |= (1 << i);
+			one_ss = true;
+
+			break;
+		}
+		if (i == CGROUP_SUBSYS_COUNT)
+			return -ENOENT;
 	}
-}
 
-static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
-{
-	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
-		seq_puts(seq, ",nsdelegate");
+	/*
+	 * If the 'all' option was specified, select all the subsystems;
+	 * otherwise, if neither 'none', 'name=' nor any subsystem name
+	 * was specified, default to 'all'.
+	 */
+	if (all_ss || (!one_ss && !opts->none && !opts->name))
+		for_each_subsys(ss, i)
+			if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
+				opts->subsys_mask |= (1 << i);
+
+	/*
+	 * We either have to specify by name or by subsystems. (So all
+	 * empty hierarchies must have a name).
+	 */
+	if (!opts->subsys_mask && !opts->name)
+		return -EINVAL;
+
+	/*
+	 * Option noprefix was introduced just for backward compatibility
+	 * with the old cpuset, so we allow noprefix only if mounting just
+	 * the cpuset subsystem.
+	 */
+	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
+		return -EINVAL;
+
+	/* Can't specify "none" and some subsystems */
+	if (opts->subsys_mask && opts->none)
+		return -EINVAL;
+
 	return 0;
 }
 
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
-	unsigned int root_flags;
-	int ret;
-
-	ret = parse_cgroup_root_flags(data, &root_flags);
-	if (ret)
-		return ret;
+	int ret = 0;
+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+	struct cgroup_sb_opts opts;
+	u16 added_mask, removed_mask;
 
-	apply_cgroup_root_flags(root_flags);
-	return 0;
+	if (root == &cgrp_dfl_root) {
+		pr_err("remount is not allowed\n");
+		return -EINVAL;
+	}
+
+	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+	/* See what subsystems are wanted */
+	ret = parse_cgroupfs_options(data, &opts);
+	if (ret)
+		goto out_unlock;
+
+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+			task_tgid_nr(current), current->comm);
+
+	/* See cgroup_mount release_agent handling */
+	if (opts.release_agent &&
+	    ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	added_mask = opts.subsys_mask & ~root->subsys_mask;
+	removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+	/* Don't allow flags or name to change at remount */
+	if ((opts.flags ^ root->flags) ||
+	    (opts.name && strcmp(opts.name, root->name))) {
+		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+		       opts.flags, opts.name ?: "", root->flags, root->name);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* remounting is not allowed for hierarchies with child cgroups */
+	if (!list_empty(&root->cgrp.self.children)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	ret = rebind_subsystems(root, added_mask);
+	if (ret)
+		goto out_unlock;
+
+	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
+
+	if (opts.release_agent) {
+		spin_lock(&release_agent_path_lock);
+		strcpy(root->release_agent_path, opts.release_agent);
+		spin_unlock(&release_agent_path_lock);
+	}
+
+	trace_cgroup_remount(root);
+
+ out_unlock:
+	kfree(opts.release_agent);
+	kfree(opts.name);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
 }
 
 /*
@@ -1882,7 +1967,6 @@ static void cgroup_enable_task_cg_lists(void)
 				css_set_update_populated(cset, true);
 			list_add_tail(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
-			cset->nr_tasks++;
 		}
 		spin_unlock(&p->sighand->siglock);
 	} while_each_thread(g, p);
@@ -1903,18 +1987,16 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	mutex_init(&cgrp->pidlist_mutex);
 	cgrp->self.cgroup = cgrp;
 	cgrp->self.flags |= CSS_ONLINE;
-	cgrp->dom_cgrp = cgrp;
-	cgrp->max_descendants = INT_MAX;
-	cgrp->max_depth = INT_MAX;
 
 	for_each_subsys(ss, ssid)
 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
 	init_waitqueue_head(&cgrp->offline_waitq);
-	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
+	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
 }
 
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
+static void init_cgroup_root(struct cgroup_root *root,
+			     struct cgroup_sb_opts *opts)
 {
 	struct cgroup *cgrp = &root->cgrp;
 
@@ -1926,18 +2008,17 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 
 	root->flags = opts->flags;
 	if (opts->release_agent)
 		strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
 	if (opts->name)
 		strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
 	if (opts->cpuset_clone_children)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
-	struct kernfs_syscall_ops *kf_sops;
 	struct css_set *cset;
 	int i, ret;
 
@@ -1949,8 +2030,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	root_cgrp->id = ret;
 	root_cgrp->ancestor_ids[0] = ret;
 
-	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
-			      0, GFP_KERNEL);
+	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
+			      GFP_KERNEL);
 	if (ret)
 		goto out;
 
@@ -1969,10 +2050,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	if (ret)
 		goto cancel_ref;
 
-	kf_sops = root == &cgrp_dfl_root ?
-		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
-
-	root->kf_root = kernfs_create_root(kf_sops,
+	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
 					   KERNFS_ROOT_CREATE_DEACTIVATED,
 					   root_cgrp);
 	if (IS_ERR(root->kf_root)) {
@@ -2033,52 +2111,20 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	return ret;
 }
 
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
-			       struct cgroup_root *root, unsigned long magic,
-			       struct cgroup_namespace *ns)
-{
-	struct dentry *dentry;
-	bool new_sb = false;
-
-	dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
-
-	/*
-	 * In non-init cgroup namespace, instead of root cgroup's dentry,
-	 * we return the dentry corresponding to the cgroupns->root_cgrp.
-	 */
-	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
-		struct dentry *nsdentry;
-		struct super_block *sb = dentry->d_sb;
-		struct cgroup *cgrp;
-
-		mutex_lock(&cgroup_mutex);
-		spin_lock_bh(&css_set_lock);
-
-		cgrp = cset_cgroup_from_root(ns->root_cset, root);
-
-		spin_unlock_bh(&css_set_lock);
-		mutex_unlock(&cgroup_mutex);
-
-		nsdentry = kernfs_node_dentry(cgrp->kn, sb);
-		dput(dentry);
-		if (IS_ERR(nsdentry))
-			deactivate_locked_super(sb);
-		dentry = nsdentry;
-	}
-
-	if (!new_sb)
-		cgroup_put(&root->cgrp);
-
-	return dentry;
-}
-
 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 			 int flags, const char *unused_dev_name,
 			 void *data)
 {
+	bool is_v2 = fs_type == &cgroup2_fs_type;
+	struct super_block *pinned_sb = NULL;
 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+	struct cgroup_subsys *ss;
+	struct cgroup_root *root;
+	struct cgroup_sb_opts opts;
 	struct dentry *dentry;
 	int ret;
+	int i;
+	bool new_sb = false;
 
 	get_cgroup_ns(ns);
 
@@ -2095,25 +2141,190 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	if (!use_task_css_set_links)
 		cgroup_enable_task_cg_lists();
 
-	if (fs_type == &cgroup2_fs_type) {
-		unsigned int root_flags;
-
-		ret = parse_cgroup_root_flags(data, &root_flags);
-		if (ret) {
+	if (is_v2) {
+		if (data) {
+			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
 			put_cgroup_ns(ns);
-			return ERR_PTR(ret);
+			return ERR_PTR(-EINVAL);
 		}
-
 		cgrp_dfl_visible = true;
-		cgroup_get(&cgrp_dfl_root.cgrp);
+		root = &cgrp_dfl_root;
+		cgroup_get(&root->cgrp);
+		goto out_mount;
+	}
 
-		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
-					 CGROUP2_SUPER_MAGIC, ns);
-		if (!IS_ERR(dentry))
-			apply_cgroup_root_flags(root_flags);
-	} else {
-		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
-				       CGROUP_SUPER_MAGIC, ns);
+	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+	/* First find the desired set of subsystems */
+	ret = parse_cgroupfs_options(data, &opts);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Destruction of cgroup root is asynchronous, so subsystems may
+	 * still be dying after the previous unmount.  Let's drain the
+	 * dying subsystems.  We just need to ensure that the ones
+	 * unmounted previously finish dying and don't care about new ones
+	 * starting.  Testing ref liveness is good enough.
+	 */
+	for_each_subsys(ss, i) {
+		if (!(opts.subsys_mask & (1 << i)) ||
+		    ss->root == &cgrp_dfl_root)
+			continue;
+
+		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+			mutex_unlock(&cgroup_mutex);
+			msleep(10);
+			ret = restart_syscall();
+			goto out_free;
+		}
+		cgroup_put(&ss->root->cgrp);
+	}
+
+	for_each_root(root) {
+		bool name_match = false;
+
+		if (root == &cgrp_dfl_root)
+			continue;
+
+		/*
+		 * If we asked for a name then it must match.  Also, if
+		 * name matches but subsys_mask doesn't, we should fail.
+		 * Remember whether name matched.
+		 */
+		if (opts.name) {
+			if (strcmp(opts.name, root->name))
+				continue;
+			name_match = true;
+		}
+
+		/*
+		 * If we asked for subsystems (or explicitly for no
+		 * subsystems) then they must match.
+		 */
+		if ((opts.subsys_mask || opts.none) &&
+		    (opts.subsys_mask != root->subsys_mask)) {
+			if (!name_match)
+				continue;
+			ret = -EBUSY;
+			goto out_unlock;
+		}
+
+		if (root->flags ^ opts.flags)
+			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+
+		/*
+		 * We want to reuse @root whose lifetime is governed by its
+		 * ->cgrp.  Let's check whether @root is alive and keep it
+		 * that way.  As cgroup_kill_sb() can happen anytime, we
+		 * want to block it by pinning the sb so that @root doesn't
+		 * get killed before mount is complete.
+		 *
+		 * With the sb pinned, tryget_live can reliably indicate
+		 * whether @root can be reused.  If it's being killed,
+		 * drain it.  We can use wait_queue for the wait but this
+		 * path is super cold.  Let's just sleep a bit and retry.
+		 */
+		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+		if (IS_ERR(pinned_sb) ||
+		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+			mutex_unlock(&cgroup_mutex);
+			if (!IS_ERR_OR_NULL(pinned_sb))
+				deactivate_super(pinned_sb);
+			msleep(10);
+			ret = restart_syscall();
+			goto out_free;
+		}
+
+		ret = 0;
+		goto out_unlock;
+	}
+
+	/*
+	 * No such thing, create a new one.  name= matching without subsys
+	 * specification is allowed for already existing hierarchies but we
+	 * can't create a new one without subsys specification.
+	 */
+	if (!opts.subsys_mask && !opts.none) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Hierarchies may only be created in the initial cgroup namespace. */
+	if (ns != &init_cgroup_ns) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	/*
+	 * Release agent gets called with all capabilities,
+	 * require capabilities to set release agent.
+	 */
+	if (opts.release_agent &&
+	    ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	root = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!root) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	init_cgroup_root(root, &opts);
+
+	ret = cgroup_setup_root(root, opts.subsys_mask);
+	if (ret)
+		cgroup_free_root(root);
+
+out_unlock:
+	mutex_unlock(&cgroup_mutex);
+out_free:
+	kfree(opts.release_agent);
+	kfree(opts.name);
+
+	if (ret) {
+		put_cgroup_ns(ns);
+		return ERR_PTR(ret);
+	}
+out_mount:
+	dentry = kernfs_mount(fs_type, flags, root->kf_root,
+			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+			      &new_sb);
+
+	/*
+	 * In non-init cgroup namespace, instead of root cgroup's
+	 * dentry, we return the dentry corresponding to the
+	 * cgroupns->root_cgrp.
+	 */
+	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+		struct dentry *nsdentry;
+		struct cgroup *cgrp;
+
+		mutex_lock(&cgroup_mutex);
+		spin_lock_irq(&css_set_lock);
+
+		cgrp = cset_cgroup_from_root(ns->root_cset, root);
+
+		spin_unlock_irq(&css_set_lock);
+		mutex_unlock(&cgroup_mutex);
+
+		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+		dput(dentry);
+		dentry = nsdentry;
+	}
+
+	if (IS_ERR(dentry) || !new_sb)
+		cgroup_put(&root->cgrp);
+
+	/*
+	 * If @pinned_sb, we're reusing an existing root and holding an
+	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
+	 */
+	if (pinned_sb) {
+		WARN_ON(new_sb);
+		deactivate_super(pinned_sb);
 	}
 
 	put_cgroup_ns(ns);
@@ -2126,20 +2337,22 @@ static void cgroup_kill_sb(struct super_block *sb)
 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 
 	/*
-	 * If @root doesn't have any children, start killing it.
+	 * If @root doesn't have any mounts or children, start killing it.
 	 * This prevents new mounts by disabling percpu_ref_tryget_live().
 	 * cgroup_mount() may wait for @root's release.
 	 *
 	 * And don't kill the default root.
 	 */
-	if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
-	    !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+	if (!list_empty(&root->cgrp.self.children) ||
+	    root == &cgrp_dfl_root)
+		cgroup_put(&root->cgrp);
+	else
 		percpu_ref_kill(&root->cgrp.self.refcnt);
-	cgroup_put(&root->cgrp);
+
 	kernfs_kill_sb(sb);
 }
 
-struct file_system_type cgroup_fs_type = {
+static struct file_system_type cgroup_fs_type = {
 	.name = "cgroup",
 	.mount = cgroup_mount,
 	.kill_sb = cgroup_kill_sb,
@@ -2153,8 +2366,8 @@ static struct file_system_type cgroup2_fs_type = {
 	.fs_flags = FS_USERNS_MOUNT,
 };
 
-int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
-			  struct cgroup_namespace *ns)
+static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+				 struct cgroup_namespace *ns)
 {
 	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
 
@@ -2217,18 +2430,49 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
+/* Tracks tasks and other necessary state during a migration. */
+struct cgroup_taskset {
+	/* the src and dst cset lists, linked through cset->mg_node */
+	struct list_head	src_csets;
+	struct list_head	dst_csets;
+
+	/* id of the subsystem currently being processed */
+	int			ssid;
+
+	/*
+	 * Fields for cgroup_taskset_*() iteration.
+	 *
+	 * Before migration is committed, the target migration tasks are on
+	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
+	 * the csets on ->dst_csets.  ->csets points to either ->src_csets
+	 * or ->dst_csets depending on whether migration is committed.
+	 *
+	 * ->cur_cset and ->cur_task point to the current task position
+	 * during iteration.
+	 */
+	struct list_head	*csets;
+	struct css_set		*cur_cset;
+	struct task_struct	*cur_task;
+};
+
+#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
+	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
+	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
+	.csets			= &tset.src_csets,			\
+}
+
 /**
- * cgroup_migrate_add_task - add a migration target task to a migration context
+ * cgroup_taskset_add - try to add a migration target task to a taskset
  * @task: target task
- * @mgctx: target migration context
+ * @tset: target taskset
  *
- * Add @task, which is a migration target, to @mgctx->tset.  This function
- * becomes noop if @task doesn't need to be migrated.  @task's css_set
- * should have been added as a migration source and @task->cg_list will be
- * moved from the css_set's tasks list to mg_tasks one.
+ * Add @task, which is a migration target, to @tset.  This function becomes
+ * noop if @task doesn't need to be migrated.  @task's css_set should have
+ * been added as a migration source and @task->cg_list will be moved from
+ * the css_set's tasks list to mg_tasks one.
  */
-static void cgroup_migrate_add_task(struct task_struct *task,
-				    struct cgroup_mgctx *mgctx)
+static void cgroup_taskset_add(struct task_struct *task,
+			       struct cgroup_taskset *tset)
 {
 	struct css_set *cset;
 
@@ -2246,15 +2490,12 @@ static void cgroup_migrate_add_task(struct task_struct *task,
 	if (!cset->mg_src_cgrp)
 		return;
 
-	mgctx->tset.nr_tasks++;
-
 	list_move_tail(&task->cg_list, &cset->mg_tasks);
 	if (list_empty(&cset->mg_node))
-		list_add_tail(&cset->mg_node,
-			      &mgctx->tset.src_csets);
+		list_add_tail(&cset->mg_node, &tset->src_csets);
 	if (list_empty(&cset->mg_dst_cset->mg_node))
-		list_add_tail(&cset->mg_dst_cset->mg_node,
-			      &mgctx->tset.dst_csets);
+		list_move_tail(&cset->mg_dst_cset->mg_node,
+			       &tset->dst_csets);
 }
 
 /**
@@ -2321,34 +2562,37 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 
 /**
  * cgroup_taskset_migrate - migrate a taskset
- * @mgctx: migration context
+ * @tset: target taskset
+ * @root: cgroup root the migration is taking place on
  *
- * Migrate tasks in @mgctx as setup by migration preparation functions.
+ * Migrate tasks in @tset as setup by migration preparation functions.
  * This function fails iff one of the ->can_attach callbacks fails and
- * guarantees that either all or none of the tasks in @mgctx are migrated.
- * @mgctx is consumed regardless of success.
+ * guarantees that either all or none of the tasks in @tset are migrated.
+ * @tset is consumed regardless of success.
  */
-static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
+static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
+				  struct cgroup_root *root)
 {
-	struct cgroup_taskset *tset = &mgctx->tset;
 	struct cgroup_subsys *ss;
 	struct task_struct *task, *tmp_task;
 	struct css_set *cset, *tmp_cset;
 	int ssid, failed_ssid, ret;
 
+	/* methods shouldn't be called if no task is actually migrating */
+	if (list_empty(&tset->src_csets))
+		return 0;
+
 	/* check that we can legitimately attach to the cgroup */
-	if (tset->nr_tasks) {
-		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
-			if (ss->can_attach) {
-				tset->ssid = ssid;
-				ret = ss->can_attach(tset);
-				if (ret) {
-					failed_ssid = ssid;
-					goto out_cancel_attach;
-				}
+	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+		if (ss->can_attach) {
+			tset->ssid = ssid;
+			ret = ss->can_attach(tset);
+			if (ret) {
+				failed_ssid = ssid;
+				goto out_cancel_attach;
 			}
-		} while_each_subsys_mask();
-	}
+		}
+	} while_each_subsys_mask();
 
 	/*
 	 * Now that we're guaranteed success, proceed to move all tasks to
@@ -2362,17 +2606,8 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 			struct css_set *to_cset = cset->mg_dst_cset;
 
 			get_css_set(to_cset);
-			to_cset->nr_tasks++;
 			css_set_move_task(task, from_cset, to_cset, true);
-			from_cset->nr_tasks--;
-			/*
-			 * If the source or destination cgroup is frozen,
-			 * the task might require to change its state.
-			 */
-			cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
-						    to_cset->dfl_cgrp);
 			put_css_set_locked(from_cset);
-
 		}
 	}
 	spin_unlock_irq(&css_set_lock);
@@ -2384,29 +2619,25 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 	 */
 	tset->csets = &tset->dst_csets;
 
-	if (tset->nr_tasks) {
-		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
-			if (ss->attach) {
-				tset->ssid = ssid;
-				ss->attach(tset);
-			}
-		} while_each_subsys_mask();
-	}
+	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+		if (ss->attach) {
+			tset->ssid = ssid;
+			ss->attach(tset);
+		}
+	} while_each_subsys_mask();
 
 	ret = 0;
 	goto out_release_tset;
 
 out_cancel_attach:
-	if (tset->nr_tasks) {
-		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
-			if (ssid == failed_ssid)
-				break;
-			if (ss->cancel_attach) {
-				tset->ssid = ssid;
-				ss->cancel_attach(tset);
-			}
-		} while_each_subsys_mask();
-	}
+	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+		if (ssid == failed_ssid)
+			break;
+		if (ss->cancel_attach) {
+			tset->ssid = ssid;
+			ss->cancel_attach(tset);
+		}
+	} while_each_subsys_mask();
 out_release_tset:
 	spin_lock_irq(&css_set_lock);
 	list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2415,87 +2646,44 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 		list_del_init(&cset->mg_node);
 	}
 	spin_unlock_irq(&css_set_lock);
-
-	/*
-	 * Re-initialize the cgroup_taskset structure in case it is reused
-	 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
-	 * iteration.
-	 */
-	tset->nr_tasks = 0;
-	tset->csets    = &tset->src_csets;
 	return ret;
 }
 
 /**
- * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
+ * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
  * @dst_cgrp: destination cgroup to test
  *
- * On the default hierarchy, except for the mixable, (possible) thread root
- * and threaded cgroups, subtree_control must be zero for migration
- * destination cgroups with tasks so that child cgroups don't compete
- * against tasks.
+ * On the default hierarchy, except for the root, subtree_control must be
+ * zero for migration destination cgroups with tasks so that child cgroups
+ * don't compete against tasks.
  */
-int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
+static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
 {
-	/* v1 doesn't have any restriction */
-	if (!cgroup_on_dfl(dst_cgrp))
-		return 0;
-
-	/* verify @dst_cgrp can host resources */
-	if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
-		return -EOPNOTSUPP;
-
-	/* mixables don't care */
-	if (cgroup_is_mixable(dst_cgrp))
-		return 0;
-
-	/*
-	 * If @dst_cgrp is already or can become a thread root or is
-	 * threaded, it doesn't matter.
-	 */
-	if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
-		return 0;
-
-	/* apply no-internal-process constraint */
-	if (dst_cgrp->subtree_control)
-		return -EBUSY;
-
-	return 0;
+	return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
+		!dst_cgrp->subtree_control;
 }
 
 /**
  * cgroup_migrate_finish - cleanup after attach
- * @mgctx: migration context
+ * @preloaded_csets: list of preloaded css_sets
  *
  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
  * those functions for details.
  */
-void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
+static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 {
 	struct css_set *cset, *tmp_cset;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
-
-	list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
-				 mg_src_preload_node) {
-		cset->mg_src_cgrp = NULL;
-		cset->mg_dst_cgrp = NULL;
-		cset->mg_dst_cset = NULL;
-		list_del_init(&cset->mg_src_preload_node);
-		put_css_set_locked(cset);
-	}
-
-	list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
-				 mg_dst_preload_node) {
+	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cgrp = NULL;
 		cset->mg_dst_cset = NULL;
-		list_del_init(&cset->mg_dst_preload_node);
+		list_del_init(&cset->mg_preload_node);
 		put_css_set_locked(cset);
 	}
-
 	spin_unlock_irq(&css_set_lock);
 }
 
@@ -2503,10 +2691,10 @@ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
  * cgroup_migrate_add_src - add a migration source css_set
  * @src_cset: the source css_set to add
  * @dst_cgrp: the destination cgroup
- * @mgctx: migration context
+ * @preloaded_csets: list of preloaded css_sets
  *
  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
- * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
+ * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
  * This function may be called without holding cgroup_threadgroup_rwsem
@@ -2515,9 +2703,9 @@ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
  * into play and the preloaded css_sets are guaranteed to cover all
  * migrations.
  */
-void cgroup_migrate_add_src(struct css_set *src_cset,
-			    struct cgroup *dst_cgrp,
-			    struct cgroup_mgctx *mgctx)
+static void cgroup_migrate_add_src(struct css_set *src_cset,
+				   struct cgroup *dst_cgrp,
+				   struct list_head *preloaded_csets)
 {
 	struct cgroup *src_cgrp;
 
@@ -2534,7 +2722,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
 
 	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 
-	if (!list_empty(&src_cset->mg_src_preload_node))
+	if (!list_empty(&src_cset->mg_preload_node))
 		return;
 
 	WARN_ON(src_cset->mg_src_cgrp);
@@ -2545,39 +2733,37 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
 	src_cset->mg_src_cgrp = src_cgrp;
 	src_cset->mg_dst_cgrp = dst_cgrp;
 	get_css_set(src_cset);
-	list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
+	list_add(&src_cset->mg_preload_node, preloaded_csets);
 }
 
 /**
  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @mgctx: migration context
+ * @preloaded_csets: list of preloaded source css_sets
  *
  * Tasks are about to be moved and all the source css_sets have been
- * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
- * pins all destination css_sets, links each to its source, and append them
- * to @mgctx->preloaded_dst_csets.
+ * preloaded to @preloaded_csets.  This function looks up and pins all
+ * destination css_sets, links each to its source, and appends them to
+ * @preloaded_csets.
  *
  * This function must be called after cgroup_migrate_add_src() has been
  * called on each migration source css_set.  After migration is performed
  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
- * @mgctx.
+ * @preloaded_csets.
  */
-int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
+static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
 {
+	LIST_HEAD(csets);
 	struct css_set *src_cset, *tmp_cset;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	/* look up the dst cset for each src cset and link it to src */
-	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
-				 mg_src_preload_node) {
+	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
 		struct css_set *dst_cset;
-		struct cgroup_subsys *ss;
-		int ssid;
 
 		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
 		if (!dst_cset)
-			return -ENOMEM;
+			goto err;
 
 		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
 
@@ -2589,7 +2775,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 		if (src_cset == dst_cset) {
 			src_cset->mg_src_cgrp = NULL;
 			src_cset->mg_dst_cgrp = NULL;
-			list_del_init(&src_cset->mg_src_preload_node);
+			list_del_init(&src_cset->mg_preload_node);
 			put_css_set(src_cset);
 			put_css_set(dst_cset);
 			continue;
@@ -2597,25 +2783,24 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 
 		src_cset->mg_dst_cset = dst_cset;
 
-		if (list_empty(&dst_cset->mg_dst_preload_node))
-			list_add_tail(&dst_cset->mg_dst_preload_node,
-				      &mgctx->preloaded_dst_csets);
+		if (list_empty(&dst_cset->mg_preload_node))
+			list_add(&dst_cset->mg_preload_node, &csets);
 		else
 			put_css_set(dst_cset);
-
-		for_each_subsys(ss, ssid)
-			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
-				mgctx->ss_mask |= 1 << ssid;
 	}
 
+	list_splice_tail(&csets, preloaded_csets);
 	return 0;
+err:
+	cgroup_migrate_finish(&csets);
+	return -ENOMEM;
 }
 
 /**
  * cgroup_migrate - migrate a process or task to a cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
- * @mgctx: migration context
+ * @root: cgroup root migration is taking place on
  *
  * Migrate a process or task denoted by @leader.  If migrating a process,
  * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
@@ -2629,9 +2814,10 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
  * decided for all targets by invoking group_migrate_prepare_dst() before
  * actually starting migrating.
  */
-int cgroup_migrate(struct task_struct *leader, bool threadgroup,
-		   struct cgroup_mgctx *mgctx)
+static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+			  struct cgroup_root *root)
 {
+	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
 	struct task_struct *task;
 
 	/*
@@ -2643,14 +2829,14 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 	rcu_read_lock();
 	task = leader;
 	do {
-		cgroup_migrate_add_task(task, mgctx);
+		cgroup_taskset_add(task, &tset);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
 	spin_unlock_irq(&css_set_lock);
 
-	return cgroup_migrate_execute(mgctx);
+	return cgroup_taskset_migrate(&tset, root);
 }
 
 /**
@@ -2661,23 +2847,23 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
  *
  * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
-int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
-		       bool threadgroup)
+static int cgroup_attach_task(struct cgroup *dst_cgrp,
+			      struct task_struct *leader, bool threadgroup)
 {
-	DEFINE_CGROUP_MGCTX(mgctx);
+	LIST_HEAD(preloaded_csets);
 	struct task_struct *task;
 	int ret;
 
-	ret = cgroup_migrate_vet_dst(dst_cgrp);
-	if (ret)
-		return ret;
+	if (!cgroup_may_migrate_to(dst_cgrp))
+		return -EBUSY;
 
 	/* look up all src csets */
 	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
-		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
+		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
+				       &preloaded_csets);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
@@ -2685,11 +2871,11 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 	spin_unlock_irq(&css_set_lock);
 
 	/* prepare dst csets and commit */
-	ret = cgroup_migrate_prepare_dst(&mgctx);
+	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
 	if (!ret)
-		ret = cgroup_migrate(leader, threadgroup, &mgctx);
+		ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
 
-	cgroup_migrate_finish(&mgctx);
+	cgroup_migrate_finish(&preloaded_csets);
 
 	if (!ret)
 		trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
@@ -2697,65 +2883,222 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 	return ret;
 }
 
-struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
-	__acquires(&cgroup_threadgroup_rwsem)
+int subsys_cgroup_allow_attach(struct cgroup_taskset *tset)
 {
-	struct task_struct *tsk;
-	pid_t pid;
+	const struct cred *cred = current_cred(), *tcred;
+	struct task_struct *task;
+	struct cgroup_subsys_state *css;
 
-	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
-		return ERR_PTR(-EINVAL);
+	if (capable(CAP_SYS_NICE))
+		return 0;
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
+	cgroup_taskset_for_each(task, css, tset) {
+		tcred = __task_cred(task);
 
-	rcu_read_lock();
-	if (pid) {
-		tsk = find_task_by_vpid(pid);
-		if (!tsk) {
-			tsk = ERR_PTR(-ESRCH);
-			goto out_unlock_threadgroup;
-		}
-	} else {
-		tsk = current;
+		if (current != task && !uid_eq(cred->euid, tcred->uid) &&
+		    !uid_eq(cred->euid, tcred->suid))
+			return -EACCES;
 	}
 
-	if (threadgroup)
-		tsk = tsk->group_leader;
+	return 0;
+}
 
-	/*
+static int cgroup_procs_write_permission(struct task_struct *task,
+					 struct cgroup *dst_cgrp,
+					 struct kernfs_open_file *of)
+{
+	const struct cred *cred = current_cred();
+	const struct cred *tcred = get_task_cred(task);
+	int ret = 0;
+
+	/*
+	 * even if we're attaching all tasks in the thread group, we only
+	 * need to check permissions on one of them.
+	 */
+	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+	    !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->euid, tcred->suid) &&
+	    !ns_capable(tcred->user_ns, CAP_SYS_NICE))
+		ret = -EACCES;
+
+	if (!ret && cgroup_on_dfl(dst_cgrp)) {
+		struct super_block *sb = of->file->f_path.dentry->d_sb;
+		struct cgroup *cgrp;
+		struct inode *inode;
+
+		spin_lock_irq(&css_set_lock);
+		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+		spin_unlock_irq(&css_set_lock);
+
+		while (!cgroup_is_descendant(dst_cgrp, cgrp))
+			cgrp = cgroup_parent(cgrp);
+
+		ret = -ENOMEM;
+		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+		if (inode) {
+			ret = inode_permission(inode, MAY_WRITE);
+			iput(inode);
+		}
+	}
+
+	put_cred(tcred);
+	return ret;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will lock
+ * cgroup_mutex and threadgroup.
+ */
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+				    size_t nbytes, loff_t off, bool threadgroup)
+{
+	struct task_struct *tsk;
+	struct cgroup_subsys *ss;
+	struct cgroup *cgrp;
+	pid_t pid;
+	int ssid, ret;
+
+	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+		return -EINVAL;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+	rcu_read_lock();
+	if (pid) {
+		tsk = find_task_by_vpid(pid);
+		if (!tsk) {
+			ret = -ESRCH;
+			goto out_unlock_rcu;
+		}
+	} else {
+		tsk = current;
+	}
+
+	if (threadgroup)
+		tsk = tsk->group_leader;
+
+	/*
 	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
 	 * If userland migrates such a kthread to a non-root cgroup, it can
 	 * become trapped in a cpuset, or RT kthread may be born in a
 	 * cgroup with no rt_runtime allocated.  Just say no.
 	 */
 	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
-		tsk = ERR_PTR(-EINVAL);
-		goto out_unlock_threadgroup;
+		ret = -EINVAL;
+		goto out_unlock_rcu;
 	}
 
 	get_task_struct(tsk);
-	goto out_unlock_rcu;
+	rcu_read_unlock();
+
+	ret = cgroup_procs_write_permission(tsk, cgrp, of);
+	if (!ret)
+		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
+	put_task_struct(tsk);
+	goto out_unlock_threadgroup;
 
-out_unlock_threadgroup:
-	percpu_up_write(&cgroup_threadgroup_rwsem);
 out_unlock_rcu:
 	rcu_read_unlock();
-	return tsk;
+out_unlock_threadgroup:
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	for_each_subsys(ss, ssid)
+		if (ss->post_attach)
+			ss->post_attach();
+	cgroup_kn_unlock(of->kn);
+	return ret ?: nbytes;
 }
 
-void cgroup_procs_write_finish(struct task_struct *task)
-	__releases(&cgroup_threadgroup_rwsem)
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 {
-	struct cgroup_subsys *ss;
-	int ssid;
+	struct cgroup_root *root;
+	int retval = 0;
 
-	/* release reference from cgroup_procs_write_start() */
-	put_task_struct(task);
+	mutex_lock(&cgroup_mutex);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+	for_each_root(root) {
+		struct cgroup *from_cgrp;
 
+		if (root == &cgrp_dfl_root)
+			continue;
+
+		spin_lock_irq(&css_set_lock);
+		from_cgrp = task_cgroup_from_root(from, root);
+		spin_unlock_irq(&css_set_lock);
+
+		retval = cgroup_attach_task(from_cgrp, tsk, false);
+		if (retval)
+			break;
+	}
 	percpu_up_write(&cgroup_threadgroup_rwsem);
-	for_each_subsys(ss, ssid)
-		if (ss->post_attach)
-			ss->post_attach();
+	mutex_unlock(&cgroup_mutex);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes, loff_t off)
+{
+	return __cgroup_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes, loff_t off)
+{
+	return __cgroup_procs_write(of, buf, nbytes, off, true);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes, loff_t off)
+{
+	struct cgroup *cgrp;
+
+	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+	/*
+	 * Release agent gets called with all capabilities,
+	 * require capabilities to set release agent.
+	 */
+	if ((of->file->f_cred->user_ns != &init_user_ns) ||
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+	spin_lock(&release_agent_path_lock);
+	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+		sizeof(cgrp->root->release_agent_path));
+	spin_unlock(&release_agent_path_lock);
+	cgroup_kn_unlock(of->kn);
+	return nbytes;
+}
+
+/* Print the root's release agent path, followed by a newline, to @seq. */
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
+{
+	spin_lock(&release_agent_path_lock);
+	/* writers update the path under this same lock */
+	seq_puts(seq, seq_css(seq)->cgroup->root->release_agent_path);
+	spin_unlock(&release_agent_path_lock);
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
+{
+	seq_puts(seq, "0\n");
+	return 0;
 }
 
 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -2803,7 +3146,8 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
  */
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
-	DEFINE_CGROUP_MGCTX(mgctx);
+	LIST_HEAD(preloaded_csets);
+	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
 	struct cgroup_subsys_state *d_css;
 	struct cgroup *dsct;
 	struct css_set *src_cset;
@@ -2819,29 +3163,33 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		struct cgrp_cset_link *link;
 
 		list_for_each_entry(link, &dsct->cset_links, cset_link)
-			cgroup_migrate_add_src(link->cset, dsct, &mgctx);
+			cgroup_migrate_add_src(link->cset, dsct,
+					       &preloaded_csets);
 	}
 	spin_unlock_irq(&css_set_lock);
 
 	/* NULL dst indicates self on default hierarchy */
-	ret = cgroup_migrate_prepare_dst(&mgctx);
+	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
 	if (ret)
 		goto out_finish;
 
 	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
-			    mg_src_preload_node) {
+	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
 		struct task_struct *task, *ntask;
 
+		/* src_csets precede dst_csets, break on the first dst_cset */
+		if (!src_cset->mg_src_cgrp)
+			break;
+
 		/* all tasks in src_csets need to be migrated */
 		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
-			cgroup_migrate_add_task(task, &mgctx);
+			cgroup_taskset_add(task, &tset);
 	}
 	spin_unlock_irq(&css_set_lock);
 
-	ret = cgroup_migrate_execute(&mgctx);
+	ret = cgroup_taskset_migrate(&tset, cgrp->root);
 out_finish:
-	cgroup_migrate_finish(&mgctx);
+	cgroup_migrate_finish(&preloaded_csets);
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
@@ -2854,7 +3202,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
  * controller while the previous css is still around.  This function grabs
  * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
  */
-void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
 	__acquires(&cgroup_mutex)
 {
 	struct cgroup *dsct;
@@ -2888,12 +3236,11 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
 }
 
 /**
- * cgroup_save_control - save control masks and dom_cgrp of a subtree
+ * cgroup_save_control - save control masks of a subtree
  * @cgrp: root of the target subtree
  *
- * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
- * respective old_ prefixed fields for @cgrp's subtree including @cgrp
- * itself.
+ * Save ->subtree_control and ->subtree_ss_mask to the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
  */
 static void cgroup_save_control(struct cgroup *cgrp)
 {
@@ -2903,7 +3250,6 @@ static void cgroup_save_control(struct cgroup *cgrp)
 	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
 		dsct->old_subtree_control = dsct->subtree_control;
 		dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
-		dsct->old_dom_cgrp = dsct->dom_cgrp;
 	}
 }
 
@@ -2929,12 +3275,11 @@ static void cgroup_propagate_control(struct cgroup *cgrp)
 }
 
 /**
- * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
+ * cgroup_restore_control - restore control masks of a subtree
  * @cgrp: root of the target subtree
  *
- * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
- * respective old_ prefixed fields for @cgrp's subtree including @cgrp
- * itself.
+ * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
  */
 static void cgroup_restore_control(struct cgroup *cgrp)
 {
@@ -2944,7 +3289,6 @@ static void cgroup_restore_control(struct cgroup *cgrp)
 	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
 		dsct->subtree_control = dsct->old_subtree_control;
 		dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
-		dsct->dom_cgrp = dsct->old_dom_cgrp;
 	}
 }
 
@@ -2984,6 +3328,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
 		for_each_subsys(ss, ssid) {
 			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
 
+			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
 			if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
 				continue;
 
@@ -2993,8 +3339,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
 					return PTR_ERR(css);
 			}
 
-			WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
-
 			if (css_visible(css)) {
 				ret = css_populate_dir(css);
 				if (ret)
@@ -3030,11 +3374,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
 		for_each_subsys(ss, ssid) {
 			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
 
+			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
 			if (!css)
 				continue;
 
-			WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
-
 			if (css->parent &&
 			    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
 				kill_css(css);
@@ -3103,46 +3447,6 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
 	cgroup_apply_control_disable(cgrp);
 }
 
-static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
-{
-	u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
-
-	/* if nothing is getting enabled, nothing to worry about */
-	if (!enable)
-		return 0;
-
-	/* can @cgrp host any resources? */
-	if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
-		return -EOPNOTSUPP;
-
-	/* mixables don't care */
-	if (cgroup_is_mixable(cgrp))
-		return 0;
-
-	if (domain_enable) {
-		/* can't enable domain controllers inside a thread subtree */
-		if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
-			return -EOPNOTSUPP;
-	} else {
-		/*
-		 * Threaded controllers can handle internal competitions
-		 * and are always allowed inside a (prospective) thread
-		 * subtree.
-		 */
-		if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
-			return 0;
-	}
-
-	/*
-	 * Controllers can't be enabled for a cgroup with tasks to avoid
-	 * child cgroups competing against tasks.
-	 */
-	if (cgroup_has_tasks(cgrp))
-		return -EBUSY;
-
-	return 0;
-}
-
 /* change the enabled child controllers for a cgroup in the default hierarchy */
 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 					    char *buf, size_t nbytes,
@@ -3218,9 +3522,33 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 		goto out_unlock;
 	}
 
-	ret = cgroup_vet_subtree_control_enable(cgrp, enable);
-	if (ret)
-		goto out_unlock;
+	/*
+	 * Except for the root, subtree_control must be zero for a cgroup
+	 * with tasks so that child cgroups don't compete against tasks.
+	 */
+	if (enable && cgroup_parent(cgrp)) {
+		struct cgrp_cset_link *link;
+
+		/*
+		 * Because namespaces pin csets too, @cgrp->cset_links
+		 * might not be empty even when @cgrp is empty.  Walk and
+		 * verify each cset.
+		 */
+		spin_lock_irq(&css_set_lock);
+
+		ret = 0;
+		list_for_each_entry(link, &cgrp->cset_links, cset_link) {
+			if (css_set_populated(link->cset)) {
+				ret = -EBUSY;
+				break;
+			}
+		}
+
+		spin_unlock_irq(&css_set_lock);
+
+		if (ret)
+			goto out_unlock;
+	}
 
 	/* save and update control masks and prepare csses */
 	cgroup_save_control(cgrp);
@@ -3239,193 +3567,10 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 	return ret ?: nbytes;
 }
 
-/**
- * cgroup_enable_threaded - make @cgrp threaded
- * @cgrp: the target cgroup
- *
- * Called when "threaded" is written to the cgroup.type interface file and
- * tries to make @cgrp threaded and join the parent's resource domain.
- * This function is never called on the root cgroup as cgroup.type doesn't
- * exist on it.
- */
-static int cgroup_enable_threaded(struct cgroup *cgrp)
-{
-	struct cgroup *parent = cgroup_parent(cgrp);
-	struct cgroup *dom_cgrp = parent->dom_cgrp;
-	struct cgroup *dsct;
-	struct cgroup_subsys_state *d_css;
-	int ret;
-
-	lockdep_assert_held(&cgroup_mutex);
-
-	/* noop if already threaded */
-	if (cgroup_is_threaded(cgrp))
-		return 0;
-
-	/*
-	 * If @cgroup is populated or has domain controllers enabled, it
-	 * can't be switched.  While the below cgroup_can_be_thread_root()
-	 * test can catch the same conditions, that's only when @parent is
-	 * not mixable, so let's check it explicitly.
-	 */
-	if (cgroup_is_populated(cgrp) ||
-	    cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
-		return -EOPNOTSUPP;
-
-	/* we're joining the parent's domain, ensure its validity */
-	if (!cgroup_is_valid_domain(dom_cgrp) ||
-	    !cgroup_can_be_thread_root(dom_cgrp))
-		return -EOPNOTSUPP;
-
-	/*
-	 * The following shouldn't cause actual migrations and should
-	 * always succeed.
-	 */
-	cgroup_save_control(cgrp);
-
-	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
-		if (dsct == cgrp || cgroup_is_threaded(dsct))
-			dsct->dom_cgrp = dom_cgrp;
-
-	ret = cgroup_apply_control(cgrp);
-	if (!ret)
-		parent->nr_threaded_children++;
-
-	cgroup_finalize_control(cgrp, ret);
-	return ret;
-}
-
-static int cgroup_type_show(struct seq_file *seq, void *v)
-{
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-	if (cgroup_is_threaded(cgrp))
-		seq_puts(seq, "threaded\n");
-	else if (!cgroup_is_valid_domain(cgrp))
-		seq_puts(seq, "domain invalid\n");
-	else if (cgroup_is_thread_root(cgrp))
-		seq_puts(seq, "domain threaded\n");
-	else
-		seq_puts(seq, "domain\n");
-
-	return 0;
-}
-
-static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
-				 size_t nbytes, loff_t off)
-{
-	struct cgroup *cgrp;
-	int ret;
-
-	/* only switching to threaded mode is supported */
-	if (strcmp(strstrip(buf), "threaded"))
-		return -EINVAL;
-
-	/* drain dying csses before we re-apply (threaded) subtree control */
-	cgrp = cgroup_kn_lock_live(of->kn, true);
-	if (!cgrp)
-		return -ENOENT;
-
-	/* threaded can only be enabled */
-	ret = cgroup_enable_threaded(cgrp);
-
-	cgroup_kn_unlock(of->kn);
-	return ret ?: nbytes;
-}
-
-static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
-{
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	int descendants = READ_ONCE(cgrp->max_descendants);
-
-	if (descendants == INT_MAX)
-		seq_puts(seq, "max\n");
-	else
-		seq_printf(seq, "%d\n", descendants);
-
-	return 0;
-}
-
-static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
-					   char *buf, size_t nbytes, loff_t off)
-{
-	struct cgroup *cgrp;
-	int descendants;
-	ssize_t ret;
-
-	buf = strstrip(buf);
-	if (!strcmp(buf, "max")) {
-		descendants = INT_MAX;
-	} else {
-		ret = kstrtoint(buf, 0, &descendants);
-		if (ret)
-			return ret;
-	}
-
-	if (descendants < 0)
-		return -ERANGE;
-
-	cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!cgrp)
-		return -ENOENT;
-
-	cgrp->max_descendants = descendants;
-
-	cgroup_kn_unlock(of->kn);
-
-	return nbytes;
-}
-
-static int cgroup_max_depth_show(struct seq_file *seq, void *v)
-{
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	int depth = READ_ONCE(cgrp->max_depth);
-
-	if (depth == INT_MAX)
-		seq_puts(seq, "max\n");
-	else
-		seq_printf(seq, "%d\n", depth);
-
-	return 0;
-}
-
-static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
-				      char *buf, size_t nbytes, loff_t off)
-{
-	struct cgroup *cgrp;
-	ssize_t ret;
-	int depth;
-
-	buf = strstrip(buf);
-	if (!strcmp(buf, "max")) {
-		depth = INT_MAX;
-	} else {
-		ret = kstrtoint(buf, 0, &depth);
-		if (ret)
-			return ret;
-	}
-
-	if (depth < 0)
-		return -ERANGE;
-
-	cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!cgrp)
-		return -ENOENT;
-
-	cgrp->max_depth = depth;
-
-	cgroup_kn_unlock(of->kn);
-
-	return nbytes;
-}
-
 static int cgroup_events_show(struct seq_file *seq, void *v)
 {
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-	seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
-	seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
-
+	seq_printf(seq, "populated %d\n",
+		   cgroup_is_populated(seq_css(seq)->cgroup));
 	return 0;
 }
 
@@ -3519,108 +3664,31 @@ bool cgroup_psi_enabled(void)
 
 #endif /* CONFIG_PSI */
 
-static int cgroup_stat_show(struct seq_file *seq, void *v)
-{
-	struct cgroup *cgroup = seq_css(seq)->cgroup;
-
-	seq_printf(seq, "nr_descendants %d\n",
-		   cgroup->nr_descendants);
-	seq_printf(seq, "nr_dying_descendants %d\n",
-		   cgroup->nr_dying_descendants);
-
-	return 0;
-}
-
-static int cgroup_freeze_show(struct seq_file *seq, void *v)
-{
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-	seq_printf(seq, "%d\n", cgrp->freezer.freeze);
-
-	return 0;
-}
-
-static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
-				   char *buf, size_t nbytes, loff_t off)
-{
-	struct cgroup *cgrp;
-	ssize_t ret;
-	int freeze;
-
-	ret = kstrtoint(strstrip(buf), 0, &freeze);
-	if (ret)
-		return ret;
-
-	if (freeze < 0 || freeze > 1)
-		return -ERANGE;
-
-	cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!cgrp)
-		return -ENOENT;
-
-	cgroup_freeze(cgrp, freeze);
-
-	cgroup_kn_unlock(of->kn);
-
-	return nbytes;
-}
-
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of->kn->priv;
-	struct cgroup_file_ctx *ctx;
-	int ret;
-
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-	if (!ctx)
-		return -ENOMEM;
-
-	ctx->ns = current->nsproxy->cgroup_ns;
-	get_cgroup_ns(ctx->ns);
-	of->priv = ctx;
 
-	if (!cft->open)
-		return 0;
-
-	ret = cft->open(of);
-	if (ret) {
-		put_cgroup_ns(ctx->ns);
-		kfree(ctx);
-	}
-	return ret;
+	if (cft->open)
+		return cft->open(of);
+	return 0;
 }
 
 static void cgroup_file_release(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of->kn->priv;
-	struct cgroup_file_ctx *ctx = of->priv;
 
 	if (cft->release)
 		cft->release(of);
-	put_cgroup_ns(ctx->ns);
-	kfree(ctx);
 }
 
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
-	struct cgroup_file_ctx *ctx = of->priv;
 	struct cgroup *cgrp = of->kn->parent->priv;
 	struct cftype *cft = of->kn->priv;
 	struct cgroup_subsys_state *css;
 	int ret;
 
-	/*
-	 * If namespaces are delegation boundaries, disallow writes to
-	 * files in an non-init namespace root from inside the namespace
-	 * except for the files explicitly marked delegatable -
-	 * cgroup.procs and cgroup.subtree_control.
-	 */
-	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
-	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
-	    ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
-		return -EPERM;
-
 	if (cft->write)
 		return cft->write(of, buf, nbytes, off);
 
@@ -3715,6 +3783,52 @@ static struct kernfs_ops cgroup_kf_ops = {
 	.seq_show		= cgroup_seqfile_show,
 };
 
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+			 const char *new_name_str)
+{
+	struct cgroup *cgrp = kn->priv;
+	int ret;
+
+	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+	if (strchr(new_name_str, '\n'))
+		return -EINVAL;
+
+	if (kernfs_type(kn) != KERNFS_DIR)
+		return -ENOTDIR;
+	if (kn->parent != new_parent)
+		return -EIO;
+
+	/*
+	 * This isn't a proper migration and its usefulness is very
+	 * limited.  Disallow on the default hierarchy.
+	 */
+	if (cgroup_on_dfl(cgrp))
+		return -EPERM;
+
+	/*
+	 * We're gonna grab cgroup_mutex which nests outside kernfs
+	 * active_ref.  kernfs_rename() doesn't require active_ref
+	 * protection.  Break them before grabbing cgroup_mutex.
+	 */
+	kernfs_break_active_protection(new_parent);
+	kernfs_break_active_protection(kn);
+
+	mutex_lock(&cgroup_mutex);
+
+	ret = kernfs_rename(kn, new_parent, new_name_str);
+	if (!ret)
+		trace_cgroup_rename(cgrp);
+
+	mutex_unlock(&cgroup_mutex);
+
+	kernfs_unbreak_active_protection(kn);
+	kernfs_unbreak_active_protection(new_parent);
+	return ret;
+}
+
 /* set uid and gid of cgroup dirs and files to that of the creator */
 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
 {
@@ -3814,6 +3928,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 
 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
 {
+	LIST_HEAD(pending);
 	struct cgroup_subsys *ss = cfts[0].ss;
 	struct cgroup *root = &ss->root->cgrp;
 	struct cgroup_subsys_state *css;
@@ -4014,6 +4129,26 @@ void cgroup_file_notify(struct cgroup_file *cfile)
 	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
 }
 
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.  The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
+{
+	int count = 0;
+	struct cgrp_cset_link *link;
+
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &cgrp->cset_links, cset_link)
+		count += atomic_read(&link->cset->refcount);
+	spin_unlock_irq(&css_set_lock);
+	return count;
+}
+
 /**
  * css_next_child - find the next child of a given css
  * @pos: the current position (%NULL to initiate traversal)
@@ -4241,58 +4376,6 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
 	return ret;
 }
 
-static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
-{
-	struct list_head *l;
-	struct cgrp_cset_link *link;
-	struct css_set *cset;
-
-	lockdep_assert_held(&css_set_lock);
-
-	/* find the next threaded cset */
-	if (it->tcset_pos) {
-		l = it->tcset_pos->next;
-
-		if (l != it->tcset_head) {
-			it->tcset_pos = l;
-			return container_of(l, struct css_set,
-					    threaded_csets_node);
-		}
-
-		it->tcset_pos = NULL;
-	}
-
-	/* find the next cset */
-	l = it->cset_pos;
-	l = l->next;
-	if (l == it->cset_head) {
-		it->cset_pos = NULL;
-		return NULL;
-	}
-
-	if (it->ss) {
-		cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
-	} else {
-		link = list_entry(l, struct cgrp_cset_link, cset_link);
-		cset = link->cset;
-	}
-
-	it->cset_pos = l;
-
-	/* initialize threaded css_set walking */
-	if (it->flags & CSS_TASK_ITER_THREADED) {
-		if (it->cur_dcset)
-			put_css_set_locked(it->cur_dcset);
-		it->cur_dcset = cset;
-		get_css_set(cset);
-
-		it->tcset_head = &cset->threaded_csets;
-		it->tcset_pos = &cset->threaded_csets;
-	}
-
-	return cset;
-}
-
 /**
  * css_task_iter_advance_css_set - advance a task itererator to the next css_set
  * @it: the iterator to advance
@@ -4301,33 +4384,39 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
  */
 static void css_task_iter_advance_css_set(struct css_task_iter *it)
 {
+	struct list_head *l = it->cset_pos;
+	struct cgrp_cset_link *link;
 	struct css_set *cset;
 
 	lockdep_assert_held(&css_set_lock);
 
 	/* Advance to the next non-empty css_set */
 	do {
-		cset = css_task_iter_next_css_set(it);
-		if (!cset) {
+		l = l->next;
+		if (l == it->cset_head) {
+			it->cset_pos = NULL;
 			it->task_pos = NULL;
 			return;
 		}
-	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
 
-	if (!list_empty(&cset->tasks)) {
+		if (it->ss) {
+			cset = container_of(l, struct css_set,
+					    e_cset_node[it->ss->id]);
+		} else {
+			link = list_entry(l, struct cgrp_cset_link, cset_link);
+			cset = link->cset;
+		}
+	} while (!css_set_populated(cset));
+
+	it->cset_pos = l;
+
+	if (!list_empty(&cset->tasks))
 		it->task_pos = cset->tasks.next;
-		it->cur_tasks_head = &cset->tasks;
-	} else if (!list_empty(&cset->mg_tasks)) {
+	else
 		it->task_pos = cset->mg_tasks.next;
-		it->cur_tasks_head = &cset->mg_tasks;
-	} else {
-		it->task_pos = cset->dying_tasks.next;
-		it->cur_tasks_head = &cset->dying_tasks;
-	}
 
 	it->tasks_head = &cset->tasks;
 	it->mg_tasks_head = &cset->mg_tasks;
-	it->dying_tasks_head = &cset->dying_tasks;
 
 	/*
 	 * We don't keep css_sets locked across iteration steps and thus
@@ -4353,74 +4442,32 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 	list_add(&it->iters_node, &cset->task_iters);
 }
 
-static void css_task_iter_skip(struct css_task_iter *it,
-			       struct task_struct *task)
-{
-	lockdep_assert_held(&css_set_lock);
-
-	if (it->task_pos == &task->cg_list) {
-		it->task_pos = it->task_pos->next;
-		it->flags |= CSS_TASK_ITER_SKIPPED;
-	}
-}
-
 static void css_task_iter_advance(struct css_task_iter *it)
 {
-	struct task_struct *task;
+	struct list_head *l = it->task_pos;
 
 	lockdep_assert_held(&css_set_lock);
-repeat:
-	if (it->task_pos) {
-		/*
-		 * Advance iterator to find next entry.  cset->tasks is
-		 * consumed first and then ->mg_tasks.  After ->mg_tasks,
-		 * we move onto the next cset.
-		 */
-		if (it->flags & CSS_TASK_ITER_SKIPPED)
-			it->flags &= ~CSS_TASK_ITER_SKIPPED;
-		else
-			it->task_pos = it->task_pos->next;
+	WARN_ON_ONCE(!l);
 
-		if (it->task_pos == it->tasks_head) {
-			it->task_pos = it->mg_tasks_head->next;
-			it->cur_tasks_head = it->mg_tasks_head;
-		}
-		if (it->task_pos == it->mg_tasks_head) {
-			it->task_pos = it->dying_tasks_head->next;
-			it->cur_tasks_head = it->dying_tasks_head;
-		}
-		if (it->task_pos == it->dying_tasks_head)
-			css_task_iter_advance_css_set(it);
-	} else {
-		/* called from start, proceed to the first cset */
-		css_task_iter_advance_css_set(it);
-	}
-
-	if (!it->task_pos)
-		return;
-
-	task = list_entry(it->task_pos, struct task_struct, cg_list);
+	/*
+	 * Advance iterator to find next entry.  cset->tasks is consumed
+	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
+	 * next cset.
+	 */
+	l = l->next;
 
-	if (it->flags & CSS_TASK_ITER_PROCS) {
-		/* if PROCS, skip over tasks which aren't group leaders */
-		if (!thread_group_leader(task))
-			goto repeat;
+	if (l == it->tasks_head)
+		l = it->mg_tasks_head->next;
 
-		/* and dying leaders w/o live member threads */
-		if (it->cur_tasks_head == it->dying_tasks_head &&
-		    !atomic_read(&task->signal->live))
-			goto repeat;
-	} else {
-		/* skip all dying ones */
-		if (it->cur_tasks_head == it->dying_tasks_head)
-			goto repeat;
-	}
+	if (l == it->mg_tasks_head)
+		css_task_iter_advance_css_set(it);
+	else
+		it->task_pos = l;
 }
 
 /**
  * css_task_iter_start - initiate task iteration
  * @css: the css to walk tasks of
- * @flags: CSS_TASK_ITER_* flags
  * @it: the task iterator to use
  *
  * Initiate iteration through the tasks of @css.  The caller can call
@@ -4428,7 +4475,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
  * returns NULL.  On completion of iteration, css_task_iter_end() must be
  * called.
  */
-void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
+void css_task_iter_start(struct cgroup_subsys_state *css,
 			 struct css_task_iter *it)
 {
 	/* no one should try to iterate before mounting cgroups */
@@ -4439,7 +4486,6 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 	spin_lock_irq(&css_set_lock);
 
 	it->ss = css->ss;
-	it->flags = flags;
 
 	if (it->ss)
 		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@ -4448,7 +4494,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 
 	it->cset_head = it->cset_pos;
 
-	css_task_iter_advance(it);
+	css_task_iter_advance_css_set(it);
 
 	spin_unlock_irq(&css_set_lock);
 }
@@ -4470,10 +4516,6 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 
 	spin_lock_irq(&css_set_lock);
 
-	/* @it may be half-advanced by skips, finish advancing */
-	if (it->flags & CSS_TASK_ITER_SKIPPED)
-		css_task_iter_advance(it);
-
 	if (it->task_pos) {
 		it->cur_task = list_entry(it->task_pos, struct task_struct,
 					  cg_list);
@@ -4501,276 +4543,576 @@ void css_task_iter_end(struct css_task_iter *it)
 		spin_unlock_irq(&css_set_lock);
 	}
 
-	if (it->cur_dcset)
-		put_css_set(it->cur_dcset);
-
 	if (it->cur_task)
 		put_task_struct(it->cur_task);
 }
 
-static void cgroup_procs_release(struct kernfs_open_file *of)
+/**
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is either visible in the source cgroup after the parent's migration is
+ * complete or put into the target cgroup.  No task can slip out of
+ * migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+	LIST_HEAD(preloaded_csets);
+	struct cgrp_cset_link *link;
+	struct css_task_iter it;
+	struct task_struct *task;
+	int ret;
+
+	if (!cgroup_may_migrate_to(to))
+		return -EBUSY;
+
+	mutex_lock(&cgroup_mutex);
+
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
+	/* all tasks in @from are being moved, all csets are source */
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &from->cset_links, cset_link)
+		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
+	spin_unlock_irq(&css_set_lock);
+
+	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+	if (ret)
+		goto out_err;
+
+	/*
+	 * Migrate tasks one-by-one until @from is empty.  This fails iff
+	 * ->can_attach() fails.
+	 */
+	do {
+		css_task_iter_start(&from->self, &it);
+
+		do {
+			task = css_task_iter_next(&it);
+		} while (task && (task->flags & PF_EXITING));
+
+		if (task)
+			get_task_struct(task);
+		css_task_iter_end(&it);
+
+		if (task) {
+			ret = cgroup_migrate(task, false, to->root);
+			if (!ret)
+				trace_cgroup_transfer_tasks(to, task, false);
+			put_task_struct(task);
+		}
+	} while (task && !ret);
+out_err:
+	cgroup_migrate_finish(&preloaded_csets);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+	CGROUP_FILE_PROCS,
+	CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+	/*
+	 * used to find which pidlist is wanted. doesn't change as long as
+	 * this particular list stays in the list.
+	*/
+	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+	/* array of xids */
+	pid_t *list;
+	/* how many elements the above list has */
+	int length;
+	/* each of these stored in a list by its cgroup */
+	struct list_head links;
+	/* pointer to the cgroup we belong to, for list removal purposes */
+	struct cgroup *owner;
+	/* for delayed destruction */
+	struct delayed_work destroy_dwork;
+};
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
 {
-	struct cgroup_file_ctx *ctx = of->priv;
+	if (PIDLIST_TOO_LARGE(count))
+		return vmalloc(count * sizeof(pid_t));
+	else
+		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
 
-	if (ctx->procs.started)
-		css_task_iter_end(&ctx->procs.iter);
+static void pidlist_free(void *p)
+{
+	kvfree(p);
 }
 
-static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer.  None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
 {
-	struct kernfs_open_file *of = s->private;
-	struct cgroup_file_ctx *ctx = of->priv;
+	struct cgroup_pidlist *l, *tmp_l;
 
-	if (pos)
-		(*pos)++;
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+	mutex_unlock(&cgrp->pidlist_mutex);
 
-	return css_task_iter_next(&ctx->procs.iter);
+	flush_workqueue(cgroup_pidlist_destroy_wq);
+	BUG_ON(!list_empty(&cgrp->pidlists));
 }
 
-static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
-				  unsigned int iter_flags)
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
 {
-	struct kernfs_open_file *of = s->private;
-	struct cgroup *cgrp = seq_css(s)->cgroup;
-	struct cgroup_file_ctx *ctx = of->priv;
-	struct css_task_iter *it = &ctx->procs.iter;
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+						destroy_dwork);
+	struct cgroup_pidlist *tofree = NULL;
+
+	mutex_lock(&l->owner->pidlist_mutex);
 
 	/*
-	 * When a seq_file is seeked, it's always traversed sequentially
-	 * from position 0, so we can simply keep iterating on !0 *pos.
+	 * Destroy iff we didn't get queued again.  The state won't change
+	 * as destroy_dwork can only be queued while locked.
 	 */
-	if (!ctx->procs.started) {
-		if (WARN_ON_ONCE((*pos)))
-			return ERR_PTR(-EINVAL);
-		css_task_iter_start(&cgrp->self, iter_flags, it);
-		ctx->procs.started = true;
-	} else if (!(*pos)) {
-		css_task_iter_end(it);
-		css_task_iter_start(&cgrp->self, iter_flags, it);
-	} else
-		return it->cur_task;
+	if (!delayed_work_pending(dwork)) {
+		list_del(&l->links);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		tofree = l;
+	}
 
-	return cgroup_procs_next(s, NULL, NULL);
+	mutex_unlock(&l->owner->pidlist_mutex);
+	kfree(tofree);
 }
 
-static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
 {
-	struct cgroup *cgrp = seq_css(s)->cgroup;
+	int src, dest = 1;
 
 	/*
-	 * All processes of a threaded subtree belong to the domain cgroup
-	 * of the subtree.  Only threads can be distributed across the
-	 * subtree.  Reject reads on cgroup.procs in the subtree proper.
-	 * They're always empty anyway.
+	 * we presume the 0th element is unique, so src starts at 1. trivial
+	 * edge cases first; no work needs to be done for either
 	 */
-	if (cgroup_is_threaded(cgrp))
-		return ERR_PTR(-EOPNOTSUPP);
+	if (length == 0 || length == 1)
+		return length;
+	/* src and dest walk down the list; dest counts unique elements */
+	for (src = 1; src < length; src++) {
+		/* find next unique element */
+		while (list[src] == list[src-1]) {
+			src++;
+			if (src == length)
+				goto after;
+		}
+		/* dest always points to where the next unique element goes */
+		list[dest] = list[src];
+		dest++;
+	}
+after:
+	return dest;
+}
+
+/*
+ * The two pid files - tasks and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco.  As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer.  As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property.  In the long term, we
+ * want to do away with it.  Explicitly scramble sort order if on the
+ * default hierarchy so that no such expectation exists in the new
+ * interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is a
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
+ */
+static pid_t pid_fry(pid_t pid)
+{
+	unsigned a = pid & 0x55555555;
+	unsigned b = pid & 0xAAAAAAAA;
 
-	return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
-					    CSS_TASK_ITER_THREADED);
+	return (a << 1) | (b >> 1);
 }
 
-static int cgroup_procs_show(struct seq_file *s, void *v)
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
 {
-	seq_printf(s, "%d\n", task_pid_vnr(v));
-	return 0;
+	if (cgroup_on_dfl(cgrp))
+		return pid_fry(pid);
+	else
+		return pid;
 }
 
-int subsys_cgroup_allow_attach(struct cgroup_taskset *tset)
+static int cmppid(const void *a, const void *b)
 {
-	const struct cred *cred = current_cred(), *tcred;
-	struct task_struct *task;
-	struct cgroup_subsys_state *css;
+	return *(pid_t *)a - *(pid_t *)b;
+}
 
-	if (capable(CAP_SYS_NICE))
-		return 0;
+static int fried_cmppid(const void *a, const void *b)
+{
+	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
 
-	cgroup_taskset_for_each(task, css, tset) {
-		tcred = __task_cred(task);
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = task_active_pid_ns(current);
 
-		if (current != task && !uid_eq(cred->euid, tcred->uid) &&
-		    !uid_eq(cred->euid, tcred->suid))
-			return -EACCES;
-	}
+	lockdep_assert_held(&cgrp->pidlist_mutex);
 
-	return 0;
+	list_for_each_entry(l, &cgrp->pidlists, links)
+		if (l->key.type == type && l->key.ns == ns)
+			return l;
+	return NULL;
 }
 
-static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
-					 struct cgroup *dst_cgrp,
-					 struct super_block *sb,
-					 struct cgroup_namespace *ns)
+/*
+ * Find the pidlist on @cgrp matching @type and the caller's active pid
+ * namespace; if there is none, allocate a new one and link it onto
+ * @cgrp->pidlists.  The caller must hold @cgrp->pidlist_mutex.  Returns
+ * NULL if a new pidlist is needed but cannot be allocated.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+						enum cgroup_filetype type)
 {
-	struct cgroup *com_cgrp = src_cgrp;
-	struct inode *inode;
-	int ret;
+	struct cgroup_pidlist *l;
 
-	lockdep_assert_held(&cgroup_mutex);
+	lockdep_assert_held(&cgrp->pidlist_mutex);
 
-	/* find the common ancestor */
-	while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
-		com_cgrp = cgroup_parent(com_cgrp);
+	l = cgroup_pidlist_find(cgrp, type);
+	if (l)
+		return l;
 
-	/* %current should be authorized to migrate to the common ancestor */
-	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
-	if (!inode)
-		return -ENOMEM;
+	/* entry not found; create a new one */
+	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+	if (!l)
+		return l;
 
-	ret = inode_permission(inode, MAY_WRITE);
-	iput(inode);
-	if (ret)
-		return ret;
+	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+	l->key.type = type;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	l->key.ns = get_pid_ns(task_active_pid_ns(current));
+	l->owner = cgrp;
+	list_add(&l->links, &cgrp->pidlists);
+	return l;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+			      struct cgroup_pidlist **lp)
+{
+	pid_t *array;
+	int length;
+	int pid, n = 0; /* used for populating the array */
+	struct css_task_iter it;
+	struct task_struct *tsk;
+	struct cgroup_pidlist *l;
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
 
 	/*
-	 * If namespaces are delegation boundaries, %current must be able
-	 * to see both source and destination cgroups from its namespace.
+	 * If cgroup gets more users after we read count, we won't have
+	 * enough space - tough.  This race is indistinguishable to the
+	 * caller from the case that the additional cgroup users didn't
+	 * show up until sometime later on.
 	 */
-	if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
-	    (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
-	     !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
-		return -ENOENT;
+	length = cgroup_task_count(cgrp);
+	array = pidlist_allocate(length);
+	if (!array)
+		return -ENOMEM;
+	/* now, populate the array */
+	css_task_iter_start(&cgrp->self, &it);
+	while ((tsk = css_task_iter_next(&it))) {
+		if (unlikely(n == length))
+			break;
+		/* get tgid or pid for procs or tasks file respectively */
+		if (type == CGROUP_FILE_PROCS)
+			pid = task_tgid_vnr(tsk);
+		else
+			pid = task_pid_vnr(tsk);
+		if (pid > 0) /* make sure to only use valid results */
+			array[n++] = pid;
+	}
+	css_task_iter_end(&it);
+	length = n;
+	/* now sort & (if procs) strip out duplicates */
+	if (cgroup_on_dfl(cgrp))
+		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+	else
+		sort(array, length, sizeof(pid_t), cmppid, NULL);
+	if (type == CGROUP_FILE_PROCS)
+		length = pidlist_uniq(array, length);
 
+	l = cgroup_pidlist_find_create(cgrp, type);
+	if (!l) {
+		pidlist_free(array);
+		return -ENOMEM;
+	}
+
+	/* store array, freeing old if necessary */
+	pidlist_free(l->list);
+	l->list = array;
+	l->length = length;
+	*lp = l;
 	return 0;
 }
 
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
-				  char *buf, size_t nbytes, loff_t off)
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 {
-	struct cgroup_file_ctx *ctx = of->priv;
-	struct cgroup *src_cgrp, *dst_cgrp;
-	struct task_struct *task;
-	const struct cred *saved_cred;
-	ssize_t ret;
-
-	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!dst_cgrp)
-		return -ENODEV;
+	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+	struct cgroup *cgrp;
+	struct css_task_iter it;
+	struct task_struct *tsk;
 
-	task = cgroup_procs_write_start(buf, true);
-	ret = PTR_ERR_OR_ZERO(task);
-	if (ret)
-		goto out_unlock;
+	/* it should be kernfs_node belonging to cgroupfs and is a directory */
+	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+	    kernfs_type(kn) != KERNFS_DIR)
+		return -EINVAL;
 
-	/* find the source cgroup */
-	spin_lock_irq(&css_set_lock);
-	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-	spin_unlock_irq(&css_set_lock);
+	mutex_lock(&cgroup_mutex);
 
 	/*
-	 * Process and thread migrations follow same delegation rule. Check
-	 * permissions using the credentials from file open to protect against
-	 * inherited fd attacks.
+	 * We aren't being called from kernfs and there's no guarantee on
+	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
+	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
 	 */
-	saved_cred = override_creds(of->file->f_cred);
-	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-					    of->file->f_path.dentry->d_sb,
-					    ctx->ns);
-	revert_creds(saved_cred);
-	if (ret)
-		goto out_finish;
-
-	ret = cgroup_attach_task(dst_cgrp, task, true);
+	rcu_read_lock();
+	cgrp = rcu_dereference(kn->priv);
+	if (!cgrp || cgroup_is_dead(cgrp)) {
+		rcu_read_unlock();
+		mutex_unlock(&cgroup_mutex);
+		return -ENOENT;
+	}
+	rcu_read_unlock();
 
-out_finish:
-	cgroup_procs_write_finish(task);
-out_unlock:
-	cgroup_kn_unlock(of->kn);
+	css_task_iter_start(&cgrp->self, &it);
+	while ((tsk = css_task_iter_next(&it))) {
+		switch (tsk->state) {
+		case TASK_RUNNING:
+			stats->nr_running++;
+			break;
+		case TASK_INTERRUPTIBLE:
+			stats->nr_sleeping++;
+			break;
+		case TASK_UNINTERRUPTIBLE:
+			stats->nr_uninterruptible++;
+			break;
+		case TASK_STOPPED:
+			stats->nr_stopped++;
+			break;
+		default:
+			if (delayacct_is_task_waiting_on_io(tsk))
+				stats->nr_io_wait++;
+			break;
+		}
+	}
+	css_task_iter_end(&it);
 
-	return ret ?: nbytes;
+	mutex_unlock(&cgroup_mutex);
+	return 0;
 }
 
-static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
-{
-	return __cgroup_procs_start(s, pos, 0);
-}
 
-static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
-				    char *buf, size_t nbytes, loff_t off)
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display, passed through cgroup_pid_fry(); the seq_file
+ * iterator points at that pid's entry in the of->priv pidlist's array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 {
-	struct cgroup_file_ctx *ctx = of->priv;
-	struct cgroup *src_cgrp, *dst_cgrp;
-	struct task_struct *task;
-	const struct cred *saved_cred;
-	ssize_t ret;
+	/*
+	 * Initially we receive a position value that corresponds to
+	 * one more than the last pid shown (or 0 on the first call or
+	 * after a seek to the start). Use a binary-search to find the
+	 * next pid to display, if any
+	 */
+	struct kernfs_open_file *of = s->private;
+	struct cgroup *cgrp = seq_css(s)->cgroup;
+	struct cgroup_pidlist *l;
+	enum cgroup_filetype type = seq_cft(s)->private;
+	int index = 0, pid = *pos;
+	int *iter, ret;
 
-	buf = strstrip(buf);
+	mutex_lock(&cgrp->pidlist_mutex);
 
-	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!dst_cgrp)
-		return -ENODEV;
+	/*
+	 * !NULL @of->priv indicates that this isn't the first start()
+	 * after open.  If the matching pidlist is around, we can use that.
+	 * Look for it.  Note that @of->priv can't be used directly.  It
+	 * could already have been destroyed.
+	 */
+	if (of->priv)
+		of->priv = cgroup_pidlist_find(cgrp, type);
 
-	task = cgroup_procs_write_start(buf, false);
-	ret = PTR_ERR_OR_ZERO(task);
-	if (ret)
-		goto out_unlock;
+	/*
+	 * Either this is the first start() after open or the matching
+	 * pidlist has been destroyed inbetween.  Create a new one.
+	 */
+	if (!of->priv) {
+		ret = pidlist_array_load(cgrp, type,
+					 (struct cgroup_pidlist **)&of->priv);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+	l = of->priv;
 
-	/* find the source cgroup */
-	spin_lock_irq(&css_set_lock);
-	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-	spin_unlock_irq(&css_set_lock);
+	if (pid) {
+		int end = l->length;
+
+		while (index < end) {
+			int mid = (index + end) / 2;
+			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
+				index = mid;
+				break;
+			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
+				index = mid + 1;
+			else
+				end = mid;
+		}
+	}
+	/* If we're off the end of the array, we're done */
+	if (index >= l->length)
+		return NULL;
+	/* Update the abstract position to be the actual pid that we found */
+	iter = l->list + index;
+	*pos = cgroup_pid_fry(cgrp, *iter);
+	return iter;
+}
+
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+	struct kernfs_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->priv;
+
+	if (l)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+				 CGROUP_PIDLIST_DESTROY_DELAY);
+	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
+}
 
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct kernfs_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->priv;
+	pid_t *p = v;
+	pid_t *end = l->list + l->length;
 	/*
-	 * Process and thread migrations follow same delegation rule. Check
-	 * permissions using the credentials from file open to protect against
-	 * inherited fd attacks.
+	 * Advance to the next pid in the array. If this goes off the
+	 * end, we're done
 	 */
-	saved_cred = override_creds(of->file->f_cred);
-	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-					    of->file->f_path.dentry->d_sb,
-					    ctx->ns);
-	revert_creds(saved_cred);
-	if (ret)
-		goto out_finish;
+	p++;
+	if (p >= end) {
+		return NULL;
+	} else {
+		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
+		return p;
+	}
+}
 
-	/* and must be contained in the same domain */
-	ret = -EOPNOTSUPP;
-	if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
-		goto out_finish;
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+	seq_printf(s, "%d\n", *(int *)v);
 
-	ret = cgroup_attach_task(dst_cgrp, task, false);
+	return 0;
+}
 
-out_finish:
-	cgroup_procs_write_finish(task);
-out_unlock:
-	cgroup_kn_unlock(of->kn);
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	return notify_on_release(css->cgroup);
+}
 
-	return ret ?: nbytes;
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+					  struct cftype *cft, u64 val)
+{
+	if (val)
+		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+	else
+		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+	return 0;
+}
+
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+}
+
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+				       struct cftype *cft, u64 val)
+{
+	if (val)
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+	else
+		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+	return 0;
 }
 
 /* cgroup core interface files for the default hierarchy */
-static struct cftype cgroup_base_files[] = {
-	{
-		.name = "cgroup.type",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cgroup_type_show,
-		.write = cgroup_type_write,
-	},
+static struct cftype cgroup_dfl_base_files[] = {
 	{
 		.name = "cgroup.procs",
-		.flags = CFTYPE_NS_DELEGATABLE,
 		.file_offset = offsetof(struct cgroup, procs_file),
-		.release = cgroup_procs_release,
-		.seq_start = cgroup_procs_start,
-		.seq_next = cgroup_procs_next,
-		.seq_show = cgroup_procs_show,
+		.seq_start = cgroup_pidlist_start,
+		.seq_next = cgroup_pidlist_next,
+		.seq_stop = cgroup_pidlist_stop,
+		.seq_show = cgroup_pidlist_show,
+		.private = CGROUP_FILE_PROCS,
 		.write = cgroup_procs_write,
 	},
-	{
-		.name = "cgroup.threads",
-		.flags = CFTYPE_NS_DELEGATABLE,
-		.release = cgroup_procs_release,
-		.seq_start = cgroup_threads_start,
-		.seq_next = cgroup_procs_next,
-		.seq_show = cgroup_procs_show,
-		.write = cgroup_threads_write,
-	},
 	{
 		.name = "cgroup.controllers",
 		.seq_show = cgroup_controllers_show,
 	},
 	{
 		.name = "cgroup.subtree_control",
-		.flags = CFTYPE_NS_DELEGATABLE,
 		.seq_show = cgroup_subtree_control_show,
 		.write = cgroup_subtree_control_write,
 	},
@@ -4806,25 +5148,50 @@ static struct cftype cgroup_base_files[] = {
 		.release = cgroup_pressure_release,
 	},
 #endif /* CONFIG_PSI */
+	{ }	/* terminate */
+};
+
+/* cgroup core interface files for the legacy hierarchies */
+static struct cftype cgroup_legacy_base_files[] = {
+	{
+		.name = "cgroup.procs",
+		.seq_start = cgroup_pidlist_start,
+		.seq_next = cgroup_pidlist_next,
+		.seq_stop = cgroup_pidlist_stop,
+		.seq_show = cgroup_pidlist_show,
+		.private = CGROUP_FILE_PROCS,
+		.write = cgroup_procs_write,
+	},
 	{
-		.name = "cgroup.max.descendants",
-		.seq_show = cgroup_max_descendants_show,
-		.write = cgroup_max_descendants_write,
+		.name = "cgroup.clone_children",
+		.read_u64 = cgroup_clone_children_read,
+		.write_u64 = cgroup_clone_children_write,
 	},
 	{
-		.name = "cgroup.max.depth",
-		.seq_show = cgroup_max_depth_show,
-		.write = cgroup_max_depth_write,
+		.name = "cgroup.sane_behavior",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = cgroup_sane_behavior_show,
 	},
 	{
-		.name = "cgroup.stat",
-		.seq_show = cgroup_stat_show,
+		.name = "tasks",
+		.seq_start = cgroup_pidlist_start,
+		.seq_next = cgroup_pidlist_next,
+		.seq_stop = cgroup_pidlist_stop,
+		.seq_show = cgroup_pidlist_show,
+		.private = CGROUP_FILE_TASKS,
+		.write = cgroup_tasks_write,
 	},
 	{
-		.name = "cgroup.freeze",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cgroup_freeze_show,
-		.write = cgroup_freeze_write,
+		.name = "notify_on_release",
+		.read_u64 = cgroup_read_notify_on_release,
+		.write_u64 = cgroup_write_notify_on_release,
+	},
+	{
+		.name = "release_agent",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = cgroup_release_agent_show,
+		.write = cgroup_release_agent_write,
+		.max_write_len = PATH_MAX - 1,
 	},
 	{ }	/* terminate */
 };
@@ -4874,7 +5241,7 @@ static void css_free_work_fn(struct work_struct *work)
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
-		cgroup1_pidlist_destroy_all(cgrp);
+		cgroup_pidlist_destroy_all(cgrp);
 		cancel_work_sync(&cgrp->release_agent_work);
 
 		if (cgroup_parent(cgrp)) {
@@ -4927,17 +5294,9 @@ static void css_release_work_fn(struct work_struct *work)
 		if (ss->css_released)
 			ss->css_released(css);
 	} else {
-		struct cgroup *tcgrp;
-
 		/* cgroup release path */
 		trace_cgroup_release(cgrp);
 
-		spin_lock_irq(&css_set_lock);
-		for (tcgrp = cgroup_parent(cgrp); tcgrp;
-		     tcgrp = cgroup_parent(tcgrp))
-			tcgrp->nr_dying_descendants--;
-		spin_unlock_irq(&css_set_lock);
-
 		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
 		cgrp->id = -1;
 
@@ -5024,6 +5383,9 @@ static void offline_css(struct cgroup_subsys_state *css)
 	if (!(css->flags & CSS_ONLINE))
 		return;
 
+	if (ss->css_reset)
+		ss->css_reset(css);
+
 	if (ss->css_offline)
 		ss->css_offline(css);
 
@@ -5136,40 +5498,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (ret)
 		goto out_idr_free;
 
-	/*
-	 * New cgroup inherits effective freeze counter, and
-	 * if the parent has to be frozen, the child has too.
-	 */
-	cgrp->freezer.e_freeze = parent->freezer.e_freeze;
-	if (cgrp->freezer.e_freeze) {
-		/*
-		 * Set the CGRP_FREEZE flag, so when a process will be
-		 * attached to the child cgroup, it will become frozen.
-		 * At this point the new cgroup is unpopulated, so we can
-		 * consider it frozen immediately.
-		 */
-		set_bit(CGRP_FREEZE, &cgrp->flags);
-		set_bit(CGRP_FROZEN, &cgrp->flags);
-	}
-
-	spin_lock_irq(&css_set_lock);
-	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
 		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
 
-		if (tcgrp != cgrp) {
-			tcgrp->nr_descendants++;
-
-			/*
-			 * If the new cgroup is frozen, all ancestor cgroups
-			 * get a new frozen descendant, but their state can't
-			 * change because of this.
-			 */
-			if (cgrp->freezer.e_freeze)
-				tcgrp->freezer.nr_frozen_descendants++;
-		}
-	}
-	spin_unlock_irq(&css_set_lock);
-
 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 
@@ -5215,30 +5546,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	return ERR_PTR(ret);
 }
 
-static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
-{
-	struct cgroup *cgroup;
-	int ret = false;
-	int level = 1;
-
-	lockdep_assert_held(&cgroup_mutex);
-
-	for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
-		if (cgroup->nr_descendants >= cgroup->max_descendants)
-			goto fail;
-
-		if (level > cgroup->max_depth)
-			goto fail;
-
-		level++;
-	}
-
-	ret = true;
-fail:
-	return ret;
-}
-
-int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+			umode_t mode)
 {
 	struct cgroup *parent, *cgrp;
 	struct kernfs_node *kn;
@@ -5252,11 +5561,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 	if (!parent)
 		return -ENODEV;
 
-	if (!cgroup_check_hierarchy_limits(parent)) {
-		ret = -EAGAIN;
-		goto out_unlock;
-	}
-
 	cgrp = cgroup_create(parent);
 	if (IS_ERR(cgrp)) {
 		ret = PTR_ERR(cgrp);
@@ -5408,7 +5712,6 @@ static void kill_css(struct cgroup_subsys_state *css)
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
-	struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
 	struct cgroup_subsys_state *css;
 	struct cgrp_cset_link *link;
 	int ssid;
@@ -5447,27 +5750,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	for_each_css(css, ssid, cgrp)
 		kill_css(css);
 
-	/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
-	css_clear_dir(&cgrp->self);
+	/*
+	 * Remove @cgrp directory along with the base files.  @cgrp has an
+	 * extra ref on its kn.
+	 */
 	kernfs_remove(cgrp->kn);
 
-	if (parent && cgroup_is_threaded(cgrp))
-		parent->nr_threaded_children--;
-
-	spin_lock_irq(&css_set_lock);
-	for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
-		tcgrp->nr_descendants--;
-		tcgrp->nr_dying_descendants++;
-		/*
-		 * If the dying cgroup is frozen, decrease frozen descendants
-		 * counters of ancestor cgroups.
-		 */
-		if (test_bit(CGRP_FROZEN, &cgrp->flags))
-			tcgrp->freezer.nr_frozen_descendants--;
-	}
-	spin_unlock_irq(&css_set_lock);
-
-	cgroup1_check_for_release(parent);
+	check_for_release(cgroup_parent(cgrp));
 
 	/* put the base reference */
 	percpu_ref_kill(&cgrp->self.refcnt);
@@ -5475,7 +5764,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	return 0;
 };
 
-int cgroup_rmdir(struct kernfs_node *kn)
+static int cgroup_rmdir(struct kernfs_node *kn)
 {
 	struct cgroup *cgrp;
 	int ret = 0;
@@ -5494,10 +5783,11 @@ int cgroup_rmdir(struct kernfs_node *kn)
 }
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
-	.show_options		= cgroup_show_options,
 	.remount_fs		= cgroup_remount,
+	.show_options		= cgroup_show_options,
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,
+	.rename			= cgroup_rename,
 	.show_path		= cgroup_show_path,
 };
 
@@ -5541,7 +5831,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	have_fork_callback |= (bool)ss->fork << ss->id;
 	have_exit_callback |= (bool)ss->exit << ss->id;
-	have_release_callback |= (bool)ss->release << ss->id;
+	have_free_callback |= (bool)ss->free << ss->id;
 	have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
 	/* At system boot, before all subsystems have been
@@ -5603,8 +5893,8 @@ int __init cgroup_init(void)
 
 	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
 	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
-	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
-	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
+	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
@@ -5650,23 +5940,17 @@ int __init cgroup_init(void)
 		if (!cgroup_ssid_enabled(ssid))
 			continue;
 
-		if (cgroup1_ssid_disabled(ssid))
+		if (cgroup_ssid_no_v1(ssid))
 			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
 			       ss->name);
 
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
-		/* implicit controllers must be threaded too */
-		WARN_ON(ss->implicit_on_dfl && !ss->threaded);
-
 		if (ss->implicit_on_dfl)
 			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
 		else if (!ss->dfl_cftypes)
 			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
 
-		if (ss->threaded)
-			cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
-
 		if (ss->dfl_cftypes == ss->legacy_cftypes) {
 			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
 		} else {
@@ -5707,6 +5991,15 @@ static int __init cgroup_wq_init(void)
 	 */
 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
 	BUG_ON(!cgroup_destroy_wq);
+
+	/*
+	 * Used to destroy pidlists and separate to serve as flush domain.
+	 * Cap @max_active to 1 too.
+	 */
+	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+						    0, 1);
+	BUG_ON(!cgroup_pidlist_destroy_wq);
+
 	return 0;
 }
 core_initcall(cgroup_wq_init);
@@ -5789,6 +6082,42 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 	return retval;
 }
 
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+	struct cgroup_subsys *ss;
+	int i;
+
+	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+	/*
+	 * ideally we don't want subsystems moving around while we do this.
+	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+	 * subsys/hierarchy state.
+	 */
+	mutex_lock(&cgroup_mutex);
+
+	for_each_subsys(ss, i)
+		seq_printf(m, "%s\t%d\t%d\t%d\n",
+			   ss->legacy_name, ss->root->hierarchy_id,
+			   atomic_read(&ss->root->nr_cgrps),
+			   cgroup_ssid_enabled(i));
+
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+static const struct file_operations proc_cgroupstats_operations = {
+	.open = cgroupstats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
 /**
  * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of forking parent process.
@@ -5895,29 +6224,8 @@ void cgroup_post_fork(struct task_struct *child)
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
 			get_css_set(cset);
-			cset->nr_tasks++;
 			css_set_move_task(child, NULL, cset, false);
 		}
-
-		/*
-		 * If the cgroup has to be frozen, the new task has too.
-		 * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
-		 * the task into the frozen state.
-		 */
-		if (unlikely(cgroup_task_freeze(child))) {
-			spin_lock(&child->sighand->siglock);
-			WARN_ON_ONCE(child->frozen);
-			child->jobctl |= JOBCTL_TRAP_FREEZE;
-			spin_unlock(&child->sighand->siglock);
-
-			/*
-			 * Calling cgroup_update_frozen() isn't required here,
-			 * because it will be called anyway a bit later
-			 * from do_freezer_trap(). So we avoid cgroup's
-			 * transient switch from the frozen state and back.
-			 */
-		}
-
 		spin_unlock_irq(&css_set_lock);
 	}
 
@@ -5965,13 +6273,6 @@ void cgroup_exit(struct task_struct *tsk)
 	if (!list_empty(&tsk->cg_list)) {
 		spin_lock_irq(&css_set_lock);
 		css_set_move_task(tsk, cset, NULL, false);
-		list_add_tail(&tsk->cg_list, &cset->dying_tasks);
-		cset->nr_tasks--;
-
-		if (unlikely(cgroup_task_frozen(tsk)))
-			cgroup_freezer_frozen_exit(tsk);
-		else if (unlikely(cgroup_task_freeze(tsk)))
-			cgroup_update_frozen(task_dfl_cgroup(tsk));
 		spin_unlock_irq(&css_set_lock);
 	} else {
 		get_css_set(cset);
@@ -5983,27 +6284,87 @@ void cgroup_exit(struct task_struct *tsk)
 	} while_each_subsys_mask();
 }
 
-void cgroup_release(struct task_struct *task)
+void cgroup_free(struct task_struct *task)
 {
+	struct css_set *cset = task_css_set(task);
 	struct cgroup_subsys *ss;
 	int ssid;
 
-	do_each_subsys_mask(ss, ssid, have_release_callback) {
-		ss->release(task);
+	do_each_subsys_mask(ss, ssid, have_free_callback) {
+		ss->free(task);
 	} while_each_subsys_mask();
 
-	if (use_task_css_set_links) {
-		spin_lock_irq(&css_set_lock);
-		css_set_skip_task_iters(task_css_set(task), task);
-		list_del_init(&task->cg_list);
-		spin_unlock_irq(&css_set_lock);
-	}
+	put_css_set(cset);
 }
 
-void cgroup_free(struct task_struct *task)
+static void check_for_release(struct cgroup *cgrp)
 {
-	struct css_set *cset = task_css_set(task);
-	put_css_set(cset);
+	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
+	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+		schedule_work(&cgrp->release_agent_work);
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence.  Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d.  The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task.  We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
+static void cgroup_release_agent(struct work_struct *work)
+{
+	struct cgroup *cgrp =
+		container_of(work, struct cgroup, release_agent_work);
+	char *pathbuf = NULL, *agentbuf = NULL;
+	char *argv[3], *envp[3];
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+
+	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+	if (!pathbuf || !agentbuf)
+		goto out;
+
+	spin_lock_irq(&css_set_lock);
+	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+	spin_unlock_irq(&css_set_lock);
+	if (ret < 0 || ret >= PATH_MAX)
+		goto out;
+
+	argv[0] = agentbuf;
+	argv[1] = pathbuf;
+	argv[2] = NULL;
+
+	/* minimal command environment */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	mutex_unlock(&cgroup_mutex);
+	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+	goto out_free;
+out:
+	mutex_unlock(&cgroup_mutex);
+out_free:
+	kfree(agentbuf);
+	kfree(pathbuf);
 }
 
 static int __init cgroup_disable(char *str)
@@ -6039,6 +6400,33 @@ static int __init cgroup_disable(char *str)
 }
 __setup("cgroup_disable=", cgroup_disable);
 
+static int __init cgroup_no_v1(char *str)
+{
+	struct cgroup_subsys *ss;
+	char *token;
+	int i;
+
+	while ((token = strsep(&str, ",")) != NULL) {
+		if (!*token)
+			continue;
+
+		if (!strcmp(token, "all")) {
+			cgroup_no_v1_mask = U16_MAX;
+			break;
+		}
+
+		for_each_subsys(ss, i) {
+			if (strcmp(token, ss->name) &&
+			    strcmp(token, ss->legacy_name))
+				continue;
+
+			cgroup_no_v1_mask |= 1 << i;
+		}
+	}
+	return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
@@ -6068,7 +6456,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 	 * have been or be removed at any point.  @kn->priv is RCU
 	 * protected for this access.  See css_release_work_fn() for details.
 	 */
-	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+	cgrp = rcu_dereference(kn->priv);
 	if (cgrp)
 		css = cgroup_css(cgrp, ss);
 
@@ -6238,6 +6626,154 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
 
+/* cgroup namespaces */
+/* Take one UCOUNT_CGROUP_NAMESPACES count in @ns for the caller's euid. */
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+/* Give back one UCOUNT_CGROUP_NAMESPACES count held on @ucounts. */
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+/* Returns a new namespace holding one reference, or ERR_PTR() on failure. */
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+	struct cgroup_namespace *cgns = kzalloc(sizeof(*cgns), GFP_KERNEL);
+	int err;
+
+	if (cgns == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = ns_alloc_inum(&cgns->ns);
+	if (err != 0) {
+		kfree(cgns);
+		return ERR_PTR(err);
+	}
+	cgns->ns.ops = &cgroupns_operations;
+	atomic_set(&cgns->count, 1);	/* the caller holds the only reference */
+	return cgns;
+}
+/* Drop @ns's cset ref, ucount, user_ns ref and inum, then free @ns itself. */
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+	put_css_set(ns->root_cset);
+	dec_cgroup_namespaces(ns->ucounts);
+	put_user_ns(ns->user_ns);
+	ns_free_inum(&ns->ns);
+	kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+/* Reuse @old_ns, or for CLONE_NEWCGROUP create one rooted at current's cset. */
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+					struct user_namespace *user_ns,
+					struct cgroup_namespace *old_ns)
+{
+	struct css_set *root_cset;
+	struct ucounts *charge;
+	struct cgroup_namespace *cgns;
+
+	BUG_ON(old_ns == NULL);
+
+	/* no new namespace requested: hand back @old_ns with an extra ref */
+	if ((flags & CLONE_NEWCGROUP) == 0) {
+		get_cgroup_ns(old_ns);
+		return old_ns;
+	}
+
+	/* creating a cgroup namespace requires CAP_SYS_ADMIN over @user_ns */
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	charge = inc_cgroup_namespaces(user_ns);
+	if (charge == NULL)
+		return ERR_PTR(-ENOSPC);
+
+	/* cgroup_mutex is not safe here; take the cset ref under css_set_lock */
+	spin_lock_irq(&css_set_lock);
+	root_cset = task_css_set(current);
+	get_css_set(root_cset);
+	spin_unlock_irq(&css_set_lock);
+
+	cgns = alloc_cgroup_ns();
+	if (!IS_ERR(cgns)) {
+		cgns->root_cset = root_cset;
+		cgns->ucounts = charge;
+		cgns->user_ns = get_user_ns(user_ns);
+		return cgns;
+	}
+	/* allocation failed: undo the cset ref and the ucount charge */
+	put_css_set(root_cset);
+	dec_cgroup_namespaces(charge);
+	return cgns;
+}
+/* Map an embedded ns_common back to the cgroup_namespace that contains it. */
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+	return container_of(ns, struct cgroup_namespace, ns);
+}
+/* proc_ns ->install: make @ns the cgroup namespace recorded in @nsproxy. */
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+	struct cgroup_namespace *target = to_cg_ns(ns);
+
+	/* CAP_SYS_ADMIN is needed in the caller's userns and in the target's */
+	if (!(ns_capable(current_user_ns(), CAP_SYS_ADMIN) &&
+	      ns_capable(target->user_ns, CAP_SYS_ADMIN)))
+		return -EPERM;
+
+	/* switching to a different namespace: take the new ref, drop the old */
+	if (nsproxy->cgroup_ns != target) {
+		get_cgroup_ns(target);
+		put_cgroup_ns(nsproxy->cgroup_ns);
+		nsproxy->cgroup_ns = target;
+	}
+
+	return 0;
+}
+/* proc_ns ->get: ref @task's cgroup namespace; NULL if it has no nsproxy. */
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+	struct cgroup_namespace *cgns = NULL;
+	struct nsproxy *proxy;
+
+	task_lock(task);
+	proxy = task->nsproxy;
+	if (proxy != NULL) {
+		cgns = proxy->cgroup_ns;
+		get_cgroup_ns(cgns);
+	}
+	task_unlock(task);
+
+	return cgns == NULL ? NULL : &cgns->ns;
+}
+/* proc_ns ->put: drop one reference on the cgroup namespace containing @ns. */
+static void cgroupns_put(struct ns_common *ns)
+{
+	put_cgroup_ns(to_cg_ns(ns));
+}
+/* proc_ns ->owner: the user namespace stored in the cgroup namespace of @ns. */
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+	return to_cg_ns(ns)->user_ns;
+}
+/* Namespace ops for cgroup namespaces; installed by alloc_cgroup_ns(). */
+const struct proc_ns_operations cgroupns_operations = {
+	.name		= "cgroup",
+	.type		= CLONE_NEWCGROUP,
+	.get		= cgroupns_get,
+	.put		= cgroupns_put,
+	.install	= cgroupns_install,
+	.owner		= cgroupns_owner,
+};
+/* Registered as a subsys_initcall; currently does no work and returns 0. */
+static __init int cgroup_namespaces_init(void)
+{
+	return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
+
 #ifdef CONFIG_CGROUP_BPF
 int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags)
@@ -6261,69 +6797,148 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 }
 #endif /* CONFIG_CGROUP_BPF */
 
-#ifdef CONFIG_SYSFS
-static ssize_t show_delegatable_files(struct cftype *files, char *buf,
-				      ssize_t size, const char *prefix)
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
 {
-	struct cftype *cft;
-	ssize_t ret = 0;
+	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
 
-	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
-		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
-			continue;
+	if (!css)
+		return ERR_PTR(-ENOMEM);
 
-		if (prefix)
-			ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+	return css;
+}
 
-		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css);	/* @css was kzalloc()ed by debug_css_alloc() */
+}
 
-		if (WARN_ON(ret >= size))
-			break;
-	}
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return cgroup_task_count(css->cgroup);	/* backs the "taskcount" file */
+}
 
-	return ret;
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return (u64)(unsigned long)current->cgroups;
 }
 
-static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
-			      char *buf)
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
 {
-	struct cgroup_subsys *ss;
-	int ssid;
-	ssize_t ret = 0;
+	u64 count;
 
-	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
-				     NULL);
+	rcu_read_lock();
+	count = atomic_read(&task_css_set(current)->refcount);
+	rcu_read_unlock();
+	return count;
+}
 
-	for_each_subsys(ss, ssid)
-		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
-					      PAGE_SIZE - ret,
-					      cgroup_subsys_name[ssid]);
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+	struct cgrp_cset_link *link;
+	struct css_set *cset;
+	char *name_buf;
 
-	return ret;
+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name_buf)
+		return -ENOMEM;
+
+	spin_lock_irq(&css_set_lock);
+	rcu_read_lock();
+	cset = rcu_dereference(current->cgroups);
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		struct cgroup *c = link->cgrp;
+
+		cgroup_name(c, name_buf, NAME_MAX + 1);
+		seq_printf(seq, "Root %d group %s\n",
+			   c->root->hierarchy_id, name_buf);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&css_set_lock);
+	kfree(name_buf);
+	return 0;
 }
-static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
 
-static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
-			     char *buf)
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
 {
-	return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
-}
-static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+	struct cgroup_subsys_state *css = seq_css(seq);
+	struct cgrp_cset_link *link;
 
-static struct attribute *cgroup_sysfs_attrs[] = {
-	&cgroup_delegate_attr.attr,
-	&cgroup_features_attr.attr,
-	NULL,
-};
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+		struct css_set *cset = link->cset;
+		struct task_struct *task;
+		int count = 0;
 
-static const struct attribute_group cgroup_sysfs_attr_group = {
-	.attrs = cgroup_sysfs_attrs,
-	.name = "cgroup",
-};
+		seq_printf(seq, "css_set %pK\n", cset);
+
+		list_for_each_entry(task, &cset->tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+
+		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+		continue;
+	overflow:
+		seq_puts(seq, "  ...\n");
+	}
+	spin_unlock_irq(&css_set_lock);
+	return 0;
+}
 
-static int __init cgroup_sysfs_init(void)
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
-	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+	return (!cgroup_is_populated(css->cgroup) &&
+		!css_has_online_children(&css->cgroup->self));
 }
-subsys_initcall(cgroup_sysfs_init);
-#endif /* CONFIG_SYSFS */
+/* Control files of the debug controller; wired up as its legacy_cftypes. */
+static struct cftype debug_files[] =  {
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.read_u64 = current_css_set_read,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.seq_show = current_css_set_cg_links_read,
+	},
+
+	{
+		.name = "cgroup_css_links",
+		.seq_show = cgroup_css_links_read,
+	},
+
+	{
+		.name = "releasable",
+		.read_u64 = releasable_read,
+	},
+
+	{ }	/* terminate */
+};
+/* The debug controller: css alloc/free hooks plus the debug_files table. */
+struct cgroup_subsys debug_cgrp_subsys = {
+	.css_alloc = debug_css_alloc,
+	.css_free = debug_css_free,
+	.legacy_cftypes = debug_files,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
deleted file mode 100644
index 0a3e87cc648d..000000000000
--- a/kernel/cgroup/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-obj-y := cgroup.o namespace.o cgroup-v1.o freezer.o
-
-obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
-obj-$(CONFIG_CGROUP_PIDS) += pids.o
-obj-$(CONFIG_CGROUP_RDMA) += rdma.o
-obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
deleted file mode 100644
index 90104f82593d..000000000000
--- a/kernel/cgroup/cgroup-internal.h
+++ /dev/null
@@ -1,242 +0,0 @@
-#ifndef __CGROUP_INTERNAL_H
-#define __CGROUP_INTERNAL_H
-
-#include <linux/cgroup.h>
-#include <linux/kernfs.h>
-#include <linux/workqueue.h>
-#include <linux/list.h>
-#include <linux/refcount.h>
-
-struct cgroup_pidlist;
-
-struct cgroup_file_ctx {
-	struct cgroup_namespace	*ns;
-
-	struct {
-		void			*trigger;
-	} psi;
-
-	struct {
-		bool			started;
-		struct css_task_iter	iter;
-	} procs;
-
-	struct {
-		struct cgroup_pidlist	*pidlist;
-	} procs1;
-};
-
-/*
- * A cgroup can be associated with multiple css_sets as different tasks may
- * belong to different cgroups on different hierarchies.  In the other
- * direction, a css_set is naturally associated with multiple cgroups.
- * This M:N relationship is represented by the following link structure
- * which exists for each association and allows traversing the associations
- * from both sides.
- */
-struct cgrp_cset_link {
-	/* the cgroup and css_set this link associates */
-	struct cgroup		*cgrp;
-	struct css_set		*cset;
-
-	/* list of cgrp_cset_links anchored at cgrp->cset_links */
-	struct list_head	cset_link;
-
-	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
-	struct list_head	cgrp_link;
-};
-
-/* used to track tasks and csets during migration */
-struct cgroup_taskset {
-	/* the src and dst cset list running through cset->mg_node */
-	struct list_head	src_csets;
-	struct list_head	dst_csets;
-
-	/* the number of tasks in the set */
-	int			nr_tasks;
-
-	/* the subsys currently being processed */
-	int			ssid;
-
-	/*
-	 * Fields for cgroup_taskset_*() iteration.
-	 *
-	 * Before migration is committed, the target migration tasks are on
-	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
-	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
-	 * or ->dst_csets depending on whether migration is committed.
-	 *
-	 * ->cur_csets and ->cur_task point to the current task position
-	 * during iteration.
-	 */
-	struct list_head	*csets;
-	struct css_set		*cur_cset;
-	struct task_struct	*cur_task;
-};
-
-/* migration context also tracks preloading */
-struct cgroup_mgctx {
-	/*
-	 * Preloaded source and destination csets.  Used to guarantee
-	 * atomic success or failure on actual migration.
-	 */
-	struct list_head	preloaded_src_csets;
-	struct list_head	preloaded_dst_csets;
-
-	/* tasks and csets to migrate */
-	struct cgroup_taskset	tset;
-
-	/* subsystems affected by migration */
-	u16			ss_mask;
-};
-
-#define CGROUP_TASKSET_INIT(tset)						\
-{										\
-	.src_csets		= LIST_HEAD_INIT(tset.src_csets),		\
-	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),		\
-	.csets			= &tset.src_csets,				\
-}
-
-#define CGROUP_MGCTX_INIT(name)							\
-{										\
-	LIST_HEAD_INIT(name.preloaded_src_csets),				\
-	LIST_HEAD_INIT(name.preloaded_dst_csets),				\
-	CGROUP_TASKSET_INIT(name.tset),						\
-}
-
-#define DEFINE_CGROUP_MGCTX(name)						\
-	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-
-struct cgroup_sb_opts {
-	u16 subsys_mask;
-	unsigned int flags;
-	char *release_agent;
-	bool cpuset_clone_children;
-	char *name;
-	/* User explicitly requested empty subsystem */
-	bool none;
-};
-
-extern struct mutex cgroup_mutex;
-extern spinlock_t css_set_lock;
-extern struct cgroup_subsys *cgroup_subsys[];
-extern struct list_head cgroup_roots;
-extern struct file_system_type cgroup_fs_type;
-
-/* iterate across the hierarchies */
-#define for_each_root(root)						\
-	list_for_each_entry((root), &cgroup_roots, root_list)
-
-/**
- * for_each_subsys - iterate all enabled cgroup subsystems
- * @ss: the iteration cursor
- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- */
-#define for_each_subsys(ss, ssid)					\
-	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
-	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
-	return !(cgrp->self.flags & CSS_ONLINE);
-}
-
-static inline bool notify_on_release(const struct cgroup *cgrp)
-{
-	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-}
-
-void put_css_set_locked(struct css_set *cset);
-
-static inline void put_css_set(struct css_set *cset)
-{
-	unsigned long flags;
-
-	/*
-	 * Ensure that the refcount doesn't hit zero while any readers
-	 * can see it. Similar to atomic_dec_and_lock(), but for an
-	 * rwlock
-	 */
-	if (refcount_dec_not_one(&cset->refcount))
-		return;
-
-	spin_lock_irqsave(&css_set_lock, flags);
-	put_css_set_locked(cset);
-	spin_unlock_irqrestore(&css_set_lock, flags);
-}
-
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cset)
-{
-	refcount_inc(&cset->refcount);
-}
-
-bool cgroup_ssid_enabled(int ssid);
-bool cgroup_on_dfl(const struct cgroup *cgrp);
-bool cgroup_is_thread_root(struct cgroup *cgrp);
-bool cgroup_is_threaded(struct cgroup *cgrp);
-
-struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
-struct cgroup *task_cgroup_from_root(struct task_struct *task,
-				     struct cgroup_root *root);
-struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
-void cgroup_kn_unlock(struct kernfs_node *kn);
-int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
-			  struct cgroup_namespace *ns);
-
-void cgroup_free_root(struct cgroup_root *root);
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
-			       struct cgroup_root *root, unsigned long magic,
-			       struct cgroup_namespace *ns);
-
-int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
-void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
-void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
-			    struct cgroup_mgctx *mgctx);
-int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
-int cgroup_migrate(struct task_struct *leader, bool threadgroup,
-		   struct cgroup_mgctx *mgctx);
-
-int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
-		       bool threadgroup);
-struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
-	__acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task)
-	__releases(&cgroup_threadgroup_rwsem);
-
-void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
-
-int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
-int cgroup_rmdir(struct kernfs_node *kn);
-int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
-		     struct kernfs_root *kf_root);
-
-int __cgroup_task_count(const struct cgroup *cgrp);
-int cgroup_task_count(const struct cgroup *cgrp);
-
-/*
- * namespace.c
- */
-extern const struct proc_ns_operations cgroupns_operations;
-
-/*
- * cgroup-v1.c
- */
-extern struct cftype cgroup1_base_files[];
-extern const struct file_operations proc_cgroupstats_operations;
-extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
-
-bool cgroup1_ssid_disabled(int ssid);
-void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
-void cgroup1_release_agent(struct work_struct *work);
-void cgroup1_check_for_release(struct cgroup *cgrp);
-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
-			     void *data, unsigned long magic,
-			     struct cgroup_namespace *ns);
-
-#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
deleted file mode 100644
index fc576131fdf2..000000000000
--- a/kernel/cgroup/cgroup-v1.c
+++ /dev/null
@@ -1,1314 +0,0 @@
-#include "cgroup-internal.h"
-
-#include <linux/ctype.h>
-#include <linux/kmod.h>
-#include <linux/sort.h>
-#include <linux/delay.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/delayacct.h>
-#include <linux/pid_namespace.h>
-#include <linux/cgroupstats.h>
-
-#include <trace/events/cgroup.h>
-
-/*
- * pidlists linger the following amount before being destroyed.  The goal
- * is avoiding frequent destruction in the middle of consecutive read calls
- * Expiring in the middle is a performance problem not a correctness one.
- * 1 sec should be enough.
- */
-#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
-
-/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
-
-/* disable named v1 mounts */
-static bool cgroup_no_v1_named;
-
-/*
- * pidlist destructions need to be flushed on cgroup destruction.  Use a
- * separate workqueue as flush domain.
- */
-static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-
-/*
- * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
- * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
- */
-static DEFINE_SPINLOCK(release_agent_path_lock);
-
-bool cgroup1_ssid_disabled(int ssid)
-{
-	return cgroup_no_v1_mask & (1 << ssid);
-}
-
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
-	struct cgroup_root *root;
-	int retval = 0;
-
-	mutex_lock(&cgroup_mutex);
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-	for_each_root(root) {
-		struct cgroup *from_cgrp;
-
-		if (root == &cgrp_dfl_root)
-			continue;
-
-		spin_lock_irq(&css_set_lock);
-		from_cgrp = task_cgroup_from_root(from, root);
-		spin_unlock_irq(&css_set_lock);
-
-		retval = cgroup_attach_task(from_cgrp, tsk, false);
-		if (retval)
-			break;
-	}
-	percpu_up_write(&cgroup_threadgroup_rwsem);
-	mutex_unlock(&cgroup_mutex);
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- *
- * Locking rules between cgroup_post_fork() and the migration path
- * guarantee that, if a task is forking while being migrated, the new child
- * is guaranteed to be either visible in the source cgroup after the
- * parent's migration is complete or put into the target cgroup.  No task
- * can slip out of migration through forking.
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
-{
-	DEFINE_CGROUP_MGCTX(mgctx);
-	struct cgrp_cset_link *link;
-	struct css_task_iter it;
-	struct task_struct *task;
-	int ret;
-
-	if (cgroup_on_dfl(to))
-		return -EINVAL;
-
-	ret = cgroup_migrate_vet_dst(to);
-	if (ret)
-		return ret;
-
-	mutex_lock(&cgroup_mutex);
-
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-
-	/* all tasks in @from are being moved, all csets are source */
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &from->cset_links, cset_link)
-		cgroup_migrate_add_src(link->cset, to, &mgctx);
-	spin_unlock_irq(&css_set_lock);
-
-	ret = cgroup_migrate_prepare_dst(&mgctx);
-	if (ret)
-		goto out_err;
-
-	/*
-	 * Migrate tasks one-by-one until @from is empty.  This fails iff
-	 * ->can_attach() fails.
-	 */
-	do {
-		css_task_iter_start(&from->self, 0, &it);
-
-		do {
-			task = css_task_iter_next(&it);
-		} while (task && (task->flags & PF_EXITING));
-
-		if (task)
-			get_task_struct(task);
-		css_task_iter_end(&it);
-
-		if (task) {
-			ret = cgroup_migrate(task, false, &mgctx);
-			if (!ret)
-				trace_cgroup_transfer_tasks(to, task, false);
-			put_task_struct(task);
-		}
-	} while (task && !ret);
-out_err:
-	cgroup_migrate_finish(&mgctx);
-	percpu_up_write(&cgroup_threadgroup_rwsem);
-	mutex_unlock(&cgroup_mutex);
-	return ret;
-}
-
-/*
- * Stuff for reading the 'tasks'/'procs' files.
- *
- * Reading this file can return large amounts of data if a cgroup has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
- *
- */
-
-/* which pidlist file are we talking about? */
-enum cgroup_filetype {
-	CGROUP_FILE_PROCS,
-	CGROUP_FILE_TASKS,
-};
-
-/*
- * A pidlist is a list of pids that virtually represents the contents of one
- * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
- * a pair (one each for procs, tasks) for each pid namespace that's relevant
- * to the cgroup.
- */
-struct cgroup_pidlist {
-	/*
-	 * used to find which pidlist is wanted. doesn't change as long as
-	 * this particular list stays in the list.
-	*/
-	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
-	/* array of xids */
-	pid_t *list;
-	/* how many elements the above list has */
-	int length;
-	/* each of these stored in a list by its cgroup */
-	struct list_head links;
-	/* pointer to the cgroup we belong to, for list removal purposes */
-	struct cgroup *owner;
-	/* for delayed destruction */
-	struct delayed_work destroy_dwork;
-};
-
-/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
-{
-	if (PIDLIST_TOO_LARGE(count))
-		return vmalloc(count * sizeof(pid_t));
-	else
-		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
-	kvfree(p);
-}
-
-/*
- * Used to destroy all pidlists lingering waiting for destroy timer.  None
- * should be left afterwards.
- */
-void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
-{
-	struct cgroup_pidlist *l, *tmp_l;
-
-	mutex_lock(&cgrp->pidlist_mutex);
-	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
-		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
-	mutex_unlock(&cgrp->pidlist_mutex);
-
-	flush_workqueue(cgroup_pidlist_destroy_wq);
-	BUG_ON(!list_empty(&cgrp->pidlists));
-}
-
-static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
-{
-	struct delayed_work *dwork = to_delayed_work(work);
-	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
-						destroy_dwork);
-	struct cgroup_pidlist *tofree = NULL;
-
-	mutex_lock(&l->owner->pidlist_mutex);
-
-	/*
-	 * Destroy iff we didn't get queued again.  The state won't change
-	 * as destroy_dwork can only be queued while locked.
-	 */
-	if (!delayed_work_pending(dwork)) {
-		list_del(&l->links);
-		pidlist_free(l->list);
-		put_pid_ns(l->key.ns);
-		tofree = l;
-	}
-
-	mutex_unlock(&l->owner->pidlist_mutex);
-	kfree(tofree);
-}
-
-/*
- * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * Returns the number of unique elements.
- */
-static int pidlist_uniq(pid_t *list, int length)
-{
-	int src, dest = 1;
-
-	/*
-	 * we presume the 0th element is unique, so i starts at 1. trivial
-	 * edge cases first; no work needs to be done for either
-	 */
-	if (length == 0 || length == 1)
-		return length;
-	/* src and dest walk down the list; dest counts unique elements */
-	for (src = 1; src < length; src++) {
-		/* find next unique element */
-		while (list[src] == list[src-1]) {
-			src++;
-			if (src == length)
-				goto after;
-		}
-		/* dest always points to where the next unique element goes */
-		list[dest] = list[src];
-		dest++;
-	}
-after:
-	return dest;
-}
-
-/*
- * The two pid files - task and cgroup.procs - guaranteed that the result
- * is sorted, which forced this whole pidlist fiasco.  As pid order is
- * different per namespace, each namespace needs differently sorted list,
- * making it impossible to use, for example, single rbtree of member tasks
- * sorted by task pointer.  As pidlists can be fairly large, allocating one
- * per open file is dangerous, so cgroup had to implement shared pool of
- * pidlists keyed by cgroup and namespace.
- */
-static int cmppid(const void *a, const void *b)
-{
-	return *(pid_t *)a - *(pid_t *)b;
-}
-
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
-						  enum cgroup_filetype type)
-{
-	struct cgroup_pidlist *l;
-	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = task_active_pid_ns(current);
-
-	lockdep_assert_held(&cgrp->pidlist_mutex);
-
-	list_for_each_entry(l, &cgrp->pidlists, links)
-		if (l->key.type == type && l->key.ns == ns)
-			return l;
-	return NULL;
-}
-
-/*
- * find the appropriate pidlist for our purpose (given procs vs tasks)
- * returns with the lock on that pidlist already held, and takes care
- * of the use count, or returns NULL with no locks held if we're out of
- * memory.
- */
-static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
-						enum cgroup_filetype type)
-{
-	struct cgroup_pidlist *l;
-
-	lockdep_assert_held(&cgrp->pidlist_mutex);
-
-	l = cgroup_pidlist_find(cgrp, type);
-	if (l)
-		return l;
-
-	/* entry not found; create a new one */
-	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
-	if (!l)
-		return l;
-
-	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
-	l->key.type = type;
-	/* don't need task_nsproxy() if we're looking at ourself */
-	l->key.ns = get_pid_ns(task_active_pid_ns(current));
-	l->owner = cgrp;
-	list_add(&l->links, &cgrp->pidlists);
-	return l;
-}
-
-/*
- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
- */
-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
-			      struct cgroup_pidlist **lp)
-{
-	pid_t *array;
-	int length;
-	int pid, n = 0; /* used for populating the array */
-	struct css_task_iter it;
-	struct task_struct *tsk;
-	struct cgroup_pidlist *l;
-
-	lockdep_assert_held(&cgrp->pidlist_mutex);
-
-	/*
-	 * If cgroup gets more users after we read count, we won't have
-	 * enough space - tough.  This race is indistinguishable to the
-	 * caller from the case that the additional cgroup users didn't
-	 * show up until sometime later on.
-	 */
-	length = cgroup_task_count(cgrp);
-	array = pidlist_allocate(length);
-	if (!array)
-		return -ENOMEM;
-	/* now, populate the array */
-	css_task_iter_start(&cgrp->self, 0, &it);
-	while ((tsk = css_task_iter_next(&it))) {
-		if (unlikely(n == length))
-			break;
-		/* get tgid or pid for procs or tasks file respectively */
-		if (type == CGROUP_FILE_PROCS)
-			pid = task_tgid_vnr(tsk);
-		else
-			pid = task_pid_vnr(tsk);
-		if (pid > 0) /* make sure to only use valid results */
-			array[n++] = pid;
-	}
-	css_task_iter_end(&it);
-	length = n;
-	/* now sort & (if procs) strip out duplicates */
-	sort(array, length, sizeof(pid_t), cmppid, NULL);
-	if (type == CGROUP_FILE_PROCS)
-		length = pidlist_uniq(array, length);
-
-	l = cgroup_pidlist_find_create(cgrp, type);
-	if (!l) {
-		pidlist_free(array);
-		return -ENOMEM;
-	}
-
-	/* store array, freeing old if necessary */
-	pidlist_free(l->list);
-	l->list = array;
-	l->length = length;
-	*lp = l;
-	return 0;
-}
-
-/*
- * seq_file methods for the tasks/procs files. The seq_file position is the
- * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->l->list array.
- */
-
-static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
-{
-	/*
-	 * Initially we receive a position value that corresponds to
-	 * one more than the last pid shown (or 0 on the first call or
-	 * after a seek to the start). Use a binary-search to find the
-	 * next pid to display, if any
-	 */
-	struct kernfs_open_file *of = s->private;
-	struct cgroup_file_ctx *ctx = of->priv;
-	struct cgroup *cgrp = seq_css(s)->cgroup;
-	struct cgroup_pidlist *l;
-	enum cgroup_filetype type = seq_cft(s)->private;
-	int index = 0, pid = *pos;
-	int *iter, ret;
-
-	mutex_lock(&cgrp->pidlist_mutex);
-
-	/*
-	 * !NULL @ctx->procs1.pidlist indicates that this isn't the first
-	 * start() after open. If the matching pidlist is around, we can use
-	 * that. Look for it. Note that @ctx->procs1.pidlist can't be used
-	 * directly. It could already have been destroyed.
-	 */
-	if (ctx->procs1.pidlist)
-		ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
-
-	/*
-	 * Either this is the first start() after open or the matching
-	 * pidlist has been destroyed inbetween.  Create a new one.
-	 */
-	if (!ctx->procs1.pidlist) {
-		ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-	l = ctx->procs1.pidlist;
-
-	if (pid) {
-		int end = l->length;
-
-		while (index < end) {
-			int mid = (index + end) / 2;
-			if (l->list[mid] == pid) {
-				index = mid;
-				break;
-			} else if (l->list[mid] <= pid)
-				index = mid + 1;
-			else
-				end = mid;
-		}
-	}
-	/* If we're off the end of the array, we're done */
-	if (index >= l->length)
-		return NULL;
-	/* Update the abstract position to be the actual pid that we found */
-	iter = l->list + index;
-	*pos = *iter;
-	return iter;
-}
-
-static void cgroup_pidlist_stop(struct seq_file *s, void *v)
-{
-	struct kernfs_open_file *of = s->private;
-	struct cgroup_file_ctx *ctx = of->priv;
-	struct cgroup_pidlist *l = ctx->procs1.pidlist;
-
-	if (l)
-		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
-				 CGROUP_PIDLIST_DESTROY_DELAY);
-	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
-}
-
-static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	struct kernfs_open_file *of = s->private;
-	struct cgroup_file_ctx *ctx = of->priv;
-	struct cgroup_pidlist *l = ctx->procs1.pidlist;
-	pid_t *p = v;
-	pid_t *end = l->list + l->length;
-	/*
-	 * Advance to the next pid in the array. If this goes off the
-	 * end, we're done
-	 */
-	p++;
-	if (p >= end) {
-		(*pos)++;
-		return NULL;
-	} else {
-		*pos = *p;
-		return p;
-	}
-}
-
-static int cgroup_pidlist_show(struct seq_file *s, void *v)
-{
-	seq_printf(s, "%d\n", *(int *)v);
-
-	return 0;
-}
-
-static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
-				     char *buf, size_t nbytes, loff_t off,
-				     bool threadgroup)
-{
-	struct cgroup *cgrp;
-	struct task_struct *task;
-	const struct cred *cred, *tcred;
-	ssize_t ret;
-
-	cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!cgrp)
-		return -ENODEV;
-
-	task = cgroup_procs_write_start(buf, threadgroup);
-	ret = PTR_ERR_OR_ZERO(task);
-	if (ret)
-		goto out_unlock;
-
-	/*
-	 * Even if we're attaching all tasks in the thread group, we only need
-	 * to check permissions on one of them. Check permissions using the
-	 * credentials from file open to protect against inherited fd attacks.
-	 */
-	cred = of->file->f_cred;
-	tcred = get_task_cred(task);
-	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-	    !uid_eq(cred->euid, tcred->uid) &&
-	    !uid_eq(cred->euid, tcred->suid) &&
-	    !ns_capable(tcred->user_ns, CAP_SYS_NICE))
-		ret = -EACCES;
-	put_cred(tcred);
-	if (ret)
-		goto out_finish;
-
-	ret = cgroup_attach_task(cgrp, task, threadgroup);
-
-out_finish:
-	cgroup_procs_write_finish(task);
-out_unlock:
-	cgroup_kn_unlock(of->kn);
-
-	return ret ?: nbytes;
-}
-
-static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
-				   char *buf, size_t nbytes, loff_t off)
-{
-	return __cgroup1_procs_write(of, buf, nbytes, off, true);
-}
-
-static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
-				   char *buf, size_t nbytes, loff_t off)
-{
-	return __cgroup1_procs_write(of, buf, nbytes, off, false);
-}
-
-static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
-					  char *buf, size_t nbytes, loff_t off)
-{
-	struct cgroup *cgrp;
-
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
-	/*
-	 * Release agent gets called with all capabilities,
-	 * require capabilities to set release agent.
-	 */
-	if ((of->file->f_cred->user_ns != &init_user_ns) ||
-	    !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!cgrp)
-		return -ENODEV;
-	spin_lock(&release_agent_path_lock);
-	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
-		sizeof(cgrp->root->release_agent_path));
-	spin_unlock(&release_agent_path_lock);
-	cgroup_kn_unlock(of->kn);
-	return nbytes;
-}
-
-static int cgroup_release_agent_show(struct seq_file *seq, void *v)
-{
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-	spin_lock(&release_agent_path_lock);
-	seq_puts(seq, cgrp->root->release_agent_path);
-	spin_unlock(&release_agent_path_lock);
-	seq_putc(seq, '\n');
-	return 0;
-}
-
-static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
-{
-	seq_puts(seq, "0\n");
-	return 0;
-}
-
-static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
-					 struct cftype *cft)
-{
-	return notify_on_release(css->cgroup);
-}
-
-static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
-					  struct cftype *cft, u64 val)
-{
-	if (val)
-		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
-	else
-		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
-	return 0;
-}
-
-static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
-				      struct cftype *cft)
-{
-	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-}
-
-static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
-				       struct cftype *cft, u64 val)
-{
-	if (val)
-		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-	else
-		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-	return 0;
-}
-
-/* cgroup core interface files for the legacy hierarchies */
-struct cftype cgroup1_base_files[] = {
-	{
-		.name = "cgroup.procs",
-		.seq_start = cgroup_pidlist_start,
-		.seq_next = cgroup_pidlist_next,
-		.seq_stop = cgroup_pidlist_stop,
-		.seq_show = cgroup_pidlist_show,
-		.private = CGROUP_FILE_PROCS,
-		.write = cgroup1_procs_write,
-	},
-	{
-		.name = "cgroup.clone_children",
-		.read_u64 = cgroup_clone_children_read,
-		.write_u64 = cgroup_clone_children_write,
-	},
-	{
-		.name = "cgroup.sane_behavior",
-		.flags = CFTYPE_ONLY_ON_ROOT,
-		.seq_show = cgroup_sane_behavior_show,
-	},
-	{
-		.name = "tasks",
-		.seq_start = cgroup_pidlist_start,
-		.seq_next = cgroup_pidlist_next,
-		.seq_stop = cgroup_pidlist_stop,
-		.seq_show = cgroup_pidlist_show,
-		.private = CGROUP_FILE_TASKS,
-		.write = cgroup1_tasks_write,
-	},
-	{
-		.name = "notify_on_release",
-		.read_u64 = cgroup_read_notify_on_release,
-		.write_u64 = cgroup_write_notify_on_release,
-	},
-	{
-		.name = "release_agent",
-		.flags = CFTYPE_ONLY_ON_ROOT,
-		.seq_show = cgroup_release_agent_show,
-		.write = cgroup_release_agent_write,
-		.max_write_len = PATH_MAX - 1,
-	},
-	{ }	/* terminate */
-};
-
-/* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
-{
-	struct cgroup_subsys *ss;
-	int i;
-
-	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
-	/*
-	 * ideally we don't want subsystems moving around while we do this.
-	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
-	 * subsys/hierarchy state.
-	 */
-	mutex_lock(&cgroup_mutex);
-
-	for_each_subsys(ss, i)
-		seq_printf(m, "%s\t%d\t%d\t%d\n",
-			   ss->legacy_name, ss->root->hierarchy_id,
-			   atomic_read(&ss->root->nr_cgrps),
-			   cgroup_ssid_enabled(i));
-
-	mutex_unlock(&cgroup_mutex);
-	return 0;
-}
-
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-const struct file_operations proc_cgroupstats_operations = {
-	.open = cgroupstats_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-/**
- * cgroupstats_build - build and fill cgroupstats
- * @stats: cgroupstats to fill information into
- * @dentry: A dentry entry belonging to the cgroup for which stats have
- * been requested.
- *
- * Build and fill cgroupstats so that taskstats can export it to user
- * space.
- */
-int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
-{
-	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
-	struct cgroup *cgrp;
-	struct css_task_iter it;
-	struct task_struct *tsk;
-
-	/* it should be kernfs_node belonging to cgroupfs and is a directory */
-	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
-	    kernfs_type(kn) != KERNFS_DIR)
-		return -EINVAL;
-
-	mutex_lock(&cgroup_mutex);
-
-	/*
-	 * We aren't being called from kernfs and there's no guarantee on
-	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
-	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
-	 */
-	rcu_read_lock();
-	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
-	if (!cgrp || cgroup_is_dead(cgrp)) {
-		rcu_read_unlock();
-		mutex_unlock(&cgroup_mutex);
-		return -ENOENT;
-	}
-	rcu_read_unlock();
-
-	css_task_iter_start(&cgrp->self, 0, &it);
-	while ((tsk = css_task_iter_next(&it))) {
-		switch (tsk->state) {
-		case TASK_RUNNING:
-			stats->nr_running++;
-			break;
-		case TASK_INTERRUPTIBLE:
-			stats->nr_sleeping++;
-			break;
-		case TASK_UNINTERRUPTIBLE:
-			stats->nr_uninterruptible++;
-			break;
-		case TASK_STOPPED:
-			stats->nr_stopped++;
-			break;
-		default:
-			if (delayacct_is_task_waiting_on_io(tsk))
-				stats->nr_io_wait++;
-			break;
-		}
-	}
-	css_task_iter_end(&it);
-
-	mutex_unlock(&cgroup_mutex);
-	return 0;
-}
-
-void cgroup1_check_for_release(struct cgroup *cgrp)
-{
-	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
-	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
-		schedule_work(&cgrp->release_agent_work);
-}
-
-/*
- * Notify userspace when a cgroup is released, by running the
- * configured release agent with the name of the cgroup (path
- * relative to the root of cgroup file system) as the argument.
- *
- * Most likely, this user command will try to rmdir this cgroup.
- *
- * This races with the possibility that some other task will be
- * attached to this cgroup before it is removed, or that some other
- * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
- * The presumed 'rmdir' will fail quietly if this cgroup is no longer
- * unused, and this cgroup will be reprieved from its death sentence,
- * to continue to serve a useful existence.  Next time it's released,
- * we will get notified again, if it still has 'notify_on_release' set.
- *
- * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
- * means only wait until the task is successfully execve()'d.  The
- * separate release agent task is forked by call_usermodehelper(),
- * then control in this thread returns here, without waiting for the
- * release agent task.  We don't bother to wait because the caller of
- * this routine has no use for the exit status of the release agent
- * task, so no sense holding our caller up for that.
- */
-void cgroup1_release_agent(struct work_struct *work)
-{
-	struct cgroup *cgrp =
-		container_of(work, struct cgroup, release_agent_work);
-	char *pathbuf = NULL, *agentbuf = NULL;
-	char *argv[3], *envp[3];
-	int ret;
-
-	mutex_lock(&cgroup_mutex);
-
-	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
-	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-	if (!pathbuf || !agentbuf || !strlen(agentbuf))
-		goto out;
-
-	spin_lock_irq(&css_set_lock);
-	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	spin_unlock_irq(&css_set_lock);
-	if (ret < 0 || ret >= PATH_MAX)
-		goto out;
-
-	argv[0] = agentbuf;
-	argv[1] = pathbuf;
-	argv[2] = NULL;
-
-	/* minimal command environment */
-	envp[0] = "HOME=/";
-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-	envp[2] = NULL;
-
-	mutex_unlock(&cgroup_mutex);
-	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-	goto out_free;
-out:
-	mutex_unlock(&cgroup_mutex);
-out_free:
-	kfree(agentbuf);
-	kfree(pathbuf);
-}
-
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
- */
-static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
-			  const char *new_name_str)
-{
-	struct cgroup *cgrp = kn->priv;
-	int ret;
-
-	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
-	if (strchr(new_name_str, '\n'))
-		return -EINVAL;
-
-	if (kernfs_type(kn) != KERNFS_DIR)
-		return -ENOTDIR;
-	if (kn->parent != new_parent)
-		return -EIO;
-
-	/*
-	 * We're gonna grab cgroup_mutex which nests outside kernfs
-	 * active_ref.  kernfs_rename() doesn't require active_ref
-	 * protection.  Break them before grabbing cgroup_mutex.
-	 */
-	kernfs_break_active_protection(new_parent);
-	kernfs_break_active_protection(kn);
-
-	mutex_lock(&cgroup_mutex);
-
-	ret = kernfs_rename(kn, new_parent, new_name_str);
-	if (!ret)
-		trace_cgroup_rename(cgrp);
-
-	mutex_unlock(&cgroup_mutex);
-
-	kernfs_unbreak_active_protection(kn);
-	kernfs_unbreak_active_protection(new_parent);
-	return ret;
-}
-
-static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
-{
-	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-	struct cgroup_subsys *ss;
-	int ssid;
-
-	for_each_subsys(ss, ssid)
-		if (root->subsys_mask & (1 << ssid))
-			seq_show_option(seq, ss->legacy_name, NULL);
-	if (root->flags & CGRP_ROOT_NOPREFIX)
-		seq_puts(seq, ",noprefix");
-	if (root->flags & CGRP_ROOT_XATTR)
-		seq_puts(seq, ",xattr");
-	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
-		seq_puts(seq, ",cpuset_v2_mode");
-
-	spin_lock(&release_agent_path_lock);
-	if (strlen(root->release_agent_path))
-		seq_show_option(seq, "release_agent",
-				root->release_agent_path);
-	spin_unlock(&release_agent_path_lock);
-
-	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
-		seq_puts(seq, ",clone_children");
-	if (strlen(root->name))
-		seq_show_option(seq, "name", root->name);
-	return 0;
-}
-
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
-	char *token, *o = data;
-	bool all_ss = false, one_ss = false;
-	u16 mask = U16_MAX;
-	struct cgroup_subsys *ss;
-	int nr_opts = 0;
-	int i;
-
-#ifdef CONFIG_CPUSETS
-	mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
-	memset(opts, 0, sizeof(*opts));
-
-	while ((token = strsep(&o, ",")) != NULL) {
-		nr_opts++;
-
-		if (!*token)
-			return -EINVAL;
-		if (!strcmp(token, "none")) {
-			/* Explicitly have no subsystems */
-			opts->none = true;
-			continue;
-		}
-		if (!strcmp(token, "all")) {
-			/* Mutually exclusive option 'all' + subsystem name */
-			if (one_ss)
-				return -EINVAL;
-			all_ss = true;
-			continue;
-		}
-		if (!strcmp(token, "noprefix")) {
-			opts->flags |= CGRP_ROOT_NOPREFIX;
-			continue;
-		}
-		if (!strcmp(token, "clone_children")) {
-			opts->cpuset_clone_children = true;
-			continue;
-		}
-		if (!strcmp(token, "cpuset_v2_mode")) {
-			opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
-			continue;
-		}
-		if (!strcmp(token, "xattr")) {
-			opts->flags |= CGRP_ROOT_XATTR;
-			continue;
-		}
-		if (!strncmp(token, "release_agent=", 14)) {
-			/* Specifying two release agents is forbidden */
-			if (opts->release_agent)
-				return -EINVAL;
-			opts->release_agent =
-				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
-			if (!opts->release_agent)
-				return -ENOMEM;
-			continue;
-		}
-		if (!strncmp(token, "name=", 5)) {
-			const char *name = token + 5;
-
-			/* blocked by boot param? */
-			if (cgroup_no_v1_named)
-				return -ENOENT;
-			/* Can't specify an empty name */
-			if (!strlen(name))
-				return -EINVAL;
-			/* Must match [\w.-]+ */
-			for (i = 0; i < strlen(name); i++) {
-				char c = name[i];
-				if (isalnum(c))
-					continue;
-				if ((c == '.') || (c == '-') || (c == '_'))
-					continue;
-				return -EINVAL;
-			}
-			/* Specifying two names is forbidden */
-			if (opts->name)
-				return -EINVAL;
-			opts->name = kstrndup(name,
-					      MAX_CGROUP_ROOT_NAMELEN - 1,
-					      GFP_KERNEL);
-			if (!opts->name)
-				return -ENOMEM;
-
-			continue;
-		}
-
-		for_each_subsys(ss, i) {
-			if (strcmp(token, ss->legacy_name))
-				continue;
-			if (!cgroup_ssid_enabled(i))
-				continue;
-			if (cgroup1_ssid_disabled(i))
-				continue;
-
-			/* Mutually exclusive option 'all' + subsystem name */
-			if (all_ss)
-				return -EINVAL;
-			opts->subsys_mask |= (1 << i);
-			one_ss = true;
-
-			break;
-		}
-		if (i == CGROUP_SUBSYS_COUNT)
-			return -ENOENT;
-	}
-
-	/*
-	 * If the 'all' option was specified select all the subsystems,
-	 * otherwise if 'none', 'name=' and a subsystem name options were
-	 * not specified, let's default to 'all'
-	 */
-	if (all_ss || (!one_ss && !opts->none && !opts->name))
-		for_each_subsys(ss, i)
-			if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
-				opts->subsys_mask |= (1 << i);
-
-	/*
-	 * We either have to specify by name or by subsystems. (So all
-	 * empty hierarchies must have a name).
-	 */
-	if (!opts->subsys_mask && !opts->name)
-		return -EINVAL;
-
-	/*
-	 * Option noprefix was introduced just for backward compatibility
-	 * with the old cpuset, so we allow noprefix only if mounting just
-	 * the cpuset subsystem.
-	 */
-	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
-		return -EINVAL;
-
-	/* Can't specify "none" and some subsystems */
-	if (opts->subsys_mask && opts->none)
-		return -EINVAL;
-
-	return 0;
-}
-
-static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
-{
-	int ret = 0;
-	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
-	struct cgroup_sb_opts opts;
-	u16 added_mask, removed_mask;
-
-	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
-	/* See what subsystems are wanted */
-	ret = parse_cgroupfs_options(data, &opts);
-	if (ret)
-		goto out_unlock;
-
-	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
-		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
-			task_tgid_nr(current), current->comm);
-	/* See cgroup1_mount release_agent handling */
-	if (opts.release_agent &&
-	    ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	added_mask = opts.subsys_mask & ~root->subsys_mask;
-	removed_mask = root->subsys_mask & ~opts.subsys_mask;
-
-	/* Don't allow flags or name to change at remount */
-	if ((opts.flags ^ root->flags) ||
-	    (opts.name && strcmp(opts.name, root->name))) {
-		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
-		       opts.flags, opts.name ?: "", root->flags, root->name);
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* remounting is not allowed for populated hierarchies */
-	if (!list_empty(&root->cgrp.self.children)) {
-		ret = -EBUSY;
-		goto out_unlock;
-	}
-
-	ret = rebind_subsystems(root, added_mask);
-	if (ret)
-		goto out_unlock;
-
-	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
-
-	if (opts.release_agent) {
-		spin_lock(&release_agent_path_lock);
-		strcpy(root->release_agent_path, opts.release_agent);
-		spin_unlock(&release_agent_path_lock);
-	}
-
-	trace_cgroup_remount(root);
-
- out_unlock:
-	kfree(opts.release_agent);
-	kfree(opts.name);
-	mutex_unlock(&cgroup_mutex);
-	return ret;
-}
-
-struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
-	.rename			= cgroup1_rename,
-	.show_options		= cgroup1_show_options,
-	.remount_fs		= cgroup1_remount,
-	.mkdir			= cgroup_mkdir,
-	.rmdir			= cgroup_rmdir,
-	.show_path		= cgroup_show_path,
-};
-
-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
-			     void *data, unsigned long magic,
-			     struct cgroup_namespace *ns)
-{
-	struct cgroup_sb_opts opts;
-	struct cgroup_root *root = NULL;
-	struct cgroup_subsys *ss;
-	struct dentry *dentry;
-	int i, ret;
-
-	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
-	/* First find the desired set of subsystems */
-	ret = parse_cgroupfs_options(data, &opts);
-	if (ret)
-		goto out_unlock;
-
-	/*
-	 * Destruction of cgroup root is asynchronous, so subsystems may
-	 * still be dying after the previous unmount.  Let's drain the
-	 * dying subsystems.  We just need to ensure that the ones
-	 * unmounted previously finish dying and don't care about new ones
-	 * starting.  Testing ref liveliness is good enough.
-	 */
-	for_each_subsys(ss, i) {
-		if (!(opts.subsys_mask & (1 << i)) ||
-		    ss->root == &cgrp_dfl_root)
-			continue;
-
-		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
-			mutex_unlock(&cgroup_mutex);
-			msleep(10);
-			ret = restart_syscall();
-			goto out_free;
-		}
-		cgroup_put(&ss->root->cgrp);
-	}
-
-	for_each_root(root) {
-		bool name_match = false;
-
-		if (root == &cgrp_dfl_root)
-			continue;
-
-		/*
-		 * If we asked for a name then it must match.  Also, if
-		 * name matches but sybsys_mask doesn't, we should fail.
-		 * Remember whether name matched.
-		 */
-		if (opts.name) {
-			if (strcmp(opts.name, root->name))
-				continue;
-			name_match = true;
-		}
-
-		/*
-		 * If we asked for subsystems (or explicitly for no
-		 * subsystems) then they must match.
-		 */
-		if ((opts.subsys_mask || opts.none) &&
-		    (opts.subsys_mask != root->subsys_mask)) {
-			if (!name_match)
-				continue;
-			ret = -EBUSY;
-			goto out_unlock;
-		}
-
-		if (root->flags ^ opts.flags)
-			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-
-		ret = 0;
-		goto out_unlock;
-	}
-
-	/*
-	 * No such thing, create a new one.  name= matching without subsys
-	 * specification is allowed for already existing hierarchies but we
-	 * can't create new one without subsys specification.
-	 */
-	if (!opts.subsys_mask && !opts.none) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* Hierarchies may only be created in the initial cgroup namespace. */
-	if (ns != &init_cgroup_ns) {
-		ret = -EPERM;
-		goto out_unlock;
-	}
-	/*
-	 * Release agent gets called with all capabilities,
-	 * require capabilities to set release agent.
-	 */
-	if (opts.release_agent &&
-	    ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	root = kzalloc(sizeof(*root), GFP_KERNEL);
-	if (!root) {
-		ret = -ENOMEM;
-		goto out_unlock;
-	}
-
-	init_cgroup_root(root, &opts);
-
-	ret = cgroup_setup_root(root, opts.subsys_mask);
-	if (ret)
-		cgroup_free_root(root);
-
-out_unlock:
-	if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
-		mutex_unlock(&cgroup_mutex);
-		msleep(10);
-		ret = restart_syscall();
-		goto out_free;
-	}
-	mutex_unlock(&cgroup_mutex);
-out_free:
-	kfree(opts.release_agent);
-	kfree(opts.name);
-
-	if (ret)
-		return ERR_PTR(ret);
-
-	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
-				 CGROUP_SUPER_MAGIC, ns);
-
-	if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
-		struct super_block *sb = dentry->d_sb;
-		dput(dentry);
-		deactivate_locked_super(sb);
-		msleep(10);
-		dentry = ERR_PTR(restart_syscall());
-	}
-	return dentry;
-}
-
-static int __init cgroup1_wq_init(void)
-{
-	/*
-	 * Used to destroy pidlists and separate to serve as flush domain.
-	 * Cap @max_active to 1 too.
-	 */
-	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
-						    0, 1);
-	BUG_ON(!cgroup_pidlist_destroy_wq);
-	return 0;
-}
-core_initcall(cgroup1_wq_init);
-
-static int __init cgroup_no_v1(char *str)
-{
-	struct cgroup_subsys *ss;
-	char *token;
-	int i;
-
-	while ((token = strsep(&str, ",")) != NULL) {
-		if (!*token)
-			continue;
-
-		if (!strcmp(token, "all")) {
-			cgroup_no_v1_mask = U16_MAX;
-			continue;
-		}
-
-		if (!strcmp(token, "named")) {
-			cgroup_no_v1_named = true;
-			continue;
-		}
-
-		for_each_subsys(ss, i) {
-			if (strcmp(token, ss->name) &&
-			    strcmp(token, ss->legacy_name))
-				continue;
-
-			cgroup_no_v1_mask |= 1 << i;
-		}
-	}
-	return 1;
-}
-__setup("cgroup_no_v1=", cgroup_no_v1);
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
deleted file mode 100644
index f661b4cc5efd..000000000000
--- a/kernel/cgroup/debug.c
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- * Debug controller
- *
- * WARNING: This controller is for cgroup core debugging only.
- * Its interfaces are unstable and subject to changes at any time.
- */
-#include <linux/ctype.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-
-#include "cgroup-internal.h"
-
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
-	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-	if (!css)
-		return ERR_PTR(-ENOMEM);
-
-	return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
-	kfree(css);
-}
-
-/*
- * debug_taskcount_read - return the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- */
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return cgroup_task_count(css->cgroup);
-}
-
-static int current_css_set_read(struct seq_file *seq, void *v)
-{
-	struct kernfs_open_file *of = seq->private;
-	struct css_set *cset;
-	struct cgroup_subsys *ss;
-	struct cgroup_subsys_state *css;
-	int i, refcnt;
-
-	if (!cgroup_kn_lock_live(of->kn, false))
-		return -ENODEV;
-
-	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
-	cset = rcu_dereference(current->cgroups);
-	refcnt = refcount_read(&cset->refcount);
-	seq_printf(seq, "css_set %pK %d", cset, refcnt);
-	if (refcnt > cset->nr_tasks)
-		seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
-	seq_puts(seq, "\n");
-
-	/*
-	 * Print the css'es stored in the current css_set.
-	 */
-	for_each_subsys(ss, i) {
-		css = cset->subsys[ss->id];
-		if (!css)
-			continue;
-		seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
-			  (unsigned long)css, css->id);
-	}
-	rcu_read_unlock();
-	spin_unlock_irq(&css_set_lock);
-	cgroup_kn_unlock(of->kn);
-	return 0;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
-					 struct cftype *cft)
-{
-	u64 count;
-
-	rcu_read_lock();
-	count = refcount_read(&task_css_set(current)->refcount);
-	rcu_read_unlock();
-	return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
-	struct cgrp_cset_link *link;
-	struct css_set *cset;
-	char *name_buf;
-
-	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-	if (!name_buf)
-		return -ENOMEM;
-
-	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
-	cset = rcu_dereference(current->cgroups);
-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
-		struct cgroup *c = link->cgrp;
-
-		cgroup_name(c, name_buf, NAME_MAX + 1);
-		seq_printf(seq, "Root %d group %s\n",
-			   c->root->hierarchy_id, name_buf);
-	}
-	rcu_read_unlock();
-	spin_unlock_irq(&css_set_lock);
-	kfree(name_buf);
-	return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
-	struct cgroup_subsys_state *css = seq_css(seq);
-	struct cgrp_cset_link *link;
-	int dead_cnt = 0, extra_refs = 0, threaded_csets = 0;
-
-	spin_lock_irq(&css_set_lock);
-
-	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
-		struct css_set *cset = link->cset;
-		struct task_struct *task;
-		int count = 0;
-		int refcnt = refcount_read(&cset->refcount);
-
-		/*
-		 * Print out the proc_cset and threaded_cset relationship
-		 * and highlight difference between refcount and task_count.
-		 */
-		seq_printf(seq, "css_set %pK", cset);
-		if (rcu_dereference_protected(cset->dom_cset, 1) != cset) {
-			threaded_csets++;
-			seq_printf(seq, "=>%pK", cset->dom_cset);
-		}
-		if (!list_empty(&cset->threaded_csets)) {
-			struct css_set *tcset;
-			int idx = 0;
-
-			list_for_each_entry(tcset, &cset->threaded_csets,
-					    threaded_csets_node) {
-				seq_puts(seq, idx ? "," : "<=");
-				seq_printf(seq, "%pK", tcset);
-				idx++;
-			}
-		} else {
-			seq_printf(seq, " %d", refcnt);
-			if (refcnt - cset->nr_tasks > 0) {
-				int extra = refcnt - cset->nr_tasks;
-
-				seq_printf(seq, " +%d", extra);
-				/*
-				 * Take out the one additional reference in
-				 * init_css_set.
-				 */
-				if (cset == &init_css_set)
-					extra--;
-				extra_refs += extra;
-			}
-		}
-		seq_puts(seq, "\n");
-
-		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
-				seq_printf(seq, "  task %d\n",
-					   task_pid_vnr(task));
-		}
-
-		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
-				seq_printf(seq, "  task %d\n",
-					   task_pid_vnr(task));
-		}
-		/* show # of overflowed tasks */
-		if (count > MAX_TASKS_SHOWN_PER_CSS)
-			seq_printf(seq, "  ... (%d)\n",
-				   count - MAX_TASKS_SHOWN_PER_CSS);
-
-		if (cset->dead) {
-			seq_puts(seq, "    [dead]\n");
-			dead_cnt++;
-		}
-
-		WARN_ON(count != cset->nr_tasks);
-	}
-	spin_unlock_irq(&css_set_lock);
-
-	if (!dead_cnt && !extra_refs && !threaded_csets)
-		return 0;
-
-	seq_puts(seq, "\n");
-	if (threaded_csets)
-		seq_printf(seq, "threaded css_sets = %d\n", threaded_csets);
-	if (extra_refs)
-		seq_printf(seq, "extra references = %d\n", extra_refs);
-	if (dead_cnt)
-		seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
-
-	return 0;
-}
-
-static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
-{
-	struct kernfs_open_file *of = seq->private;
-	struct cgroup *cgrp;
-	struct cgroup_subsys *ss;
-	struct cgroup_subsys_state *css;
-	char pbuf[16];
-	int i;
-
-	cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!cgrp)
-		return -ENODEV;
-
-	for_each_subsys(ss, i) {
-		css = rcu_dereference_check(cgrp->subsys[ss->id], true);
-		if (!css)
-			continue;
-
-		pbuf[0] = '\0';
-
-		/* Show the parent CSS if applicable*/
-		if (css->parent)
-			snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
-				 css->parent->id);
-		seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
-			  (unsigned long)css, css->id,
-			  atomic_read(&css->online_cnt), pbuf);
-	}
-
-	cgroup_kn_unlock(of->kn);
-	return 0;
-}
-
-static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
-				  u16 mask)
-{
-	struct cgroup_subsys *ss;
-	int ssid;
-	bool first = true;
-
-	seq_printf(seq, "%-17s: ", name);
-	for_each_subsys(ss, ssid) {
-		if (!(mask & (1 << ssid)))
-			continue;
-		if (!first)
-			seq_puts(seq, ", ");
-		seq_puts(seq, ss->name);
-		first = false;
-	}
-	seq_putc(seq, '\n');
-}
-
-static int cgroup_masks_read(struct seq_file *seq, void *v)
-{
-	struct kernfs_open_file *of = seq->private;
-	struct cgroup *cgrp;
-
-	cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!cgrp)
-		return -ENODEV;
-
-	cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
-	cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
-
-	cgroup_kn_unlock(of->kn);
-	return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
-	return (!cgroup_is_populated(css->cgroup) &&
-		!css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_legacy_files[] =  {
-	{
-		.name = "taskcount",
-		.read_u64 = debug_taskcount_read,
-	},
-
-	{
-		.name = "current_css_set",
-		.seq_show = current_css_set_read,
-		.flags = CFTYPE_ONLY_ON_ROOT,
-	},
-
-	{
-		.name = "current_css_set_refcount",
-		.read_u64 = current_css_set_refcount_read,
-		.flags = CFTYPE_ONLY_ON_ROOT,
-	},
-
-	{
-		.name = "current_css_set_cg_links",
-		.seq_show = current_css_set_cg_links_read,
-		.flags = CFTYPE_ONLY_ON_ROOT,
-	},
-
-	{
-		.name = "cgroup_css_links",
-		.seq_show = cgroup_css_links_read,
-	},
-
-	{
-		.name = "cgroup_subsys_states",
-		.seq_show = cgroup_subsys_states_read,
-	},
-
-	{
-		.name = "cgroup_masks",
-		.seq_show = cgroup_masks_read,
-	},
-
-	{
-		.name = "releasable",
-		.read_u64 = releasable_read,
-	},
-
-	{ }	/* terminate */
-};
-
-static struct cftype debug_files[] =  {
-	{
-		.name = "taskcount",
-		.read_u64 = debug_taskcount_read,
-	},
-
-	{
-		.name = "current_css_set",
-		.seq_show = current_css_set_read,
-		.flags = CFTYPE_ONLY_ON_ROOT,
-	},
-
-	{
-		.name = "current_css_set_refcount",
-		.read_u64 = current_css_set_refcount_read,
-		.flags = CFTYPE_ONLY_ON_ROOT,
-	},
-
-	{
-		.name = "current_css_set_cg_links",
-		.seq_show = current_css_set_cg_links_read,
-		.flags = CFTYPE_ONLY_ON_ROOT,
-	},
-
-	{
-		.name = "css_links",
-		.seq_show = cgroup_css_links_read,
-	},
-
-	{
-		.name = "csses",
-		.seq_show = cgroup_subsys_states_read,
-	},
-
-	{
-		.name = "masks",
-		.seq_show = cgroup_masks_read,
-	},
-
-	{ }	/* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
-	.css_alloc	= debug_css_alloc,
-	.css_free	= debug_css_free,
-	.legacy_cftypes	= debug_legacy_files,
-};
-
-/*
- * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
- * parameter.
- */
-static int __init enable_cgroup_debug(char *str)
-{
-	debug_cgrp_subsys.dfl_cftypes = debug_files;
-	debug_cgrp_subsys.implicit_on_dfl = true;
-	debug_cgrp_subsys.threaded = true;
-	return 1;
-}
-__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
deleted file mode 100644
index 103938f25757..000000000000
--- a/kernel/cgroup/freezer.c
+++ /dev/null
@@ -1,315 +0,0 @@
-//SPDX-License-Identifier: GPL-2.0
-#include <linux/cgroup.h>
-#include <linux/sched.h>
-
-#include "cgroup-internal.h"
-
-/*
- * Propagate the cgroup frozen state upwards by the cgroup tree.
- */
-static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
-{
-	int desc = 1;
-
-	/*
-	 * If the new state is frozen, some freezing ancestor cgroups may change
-	 * their state too, depending on if all their descendants are frozen.
-	 *
-	 * Otherwise, all ancestor cgroups are forced into the non-frozen state.
-	 */
-	while ((cgrp = cgroup_parent(cgrp))) {
-		if (frozen) {
-			cgrp->freezer.nr_frozen_descendants += desc;
-			if (!test_bit(CGRP_FROZEN, &cgrp->flags) &&
-			    test_bit(CGRP_FREEZE, &cgrp->flags) &&
-			    cgrp->freezer.nr_frozen_descendants ==
-			    cgrp->nr_descendants) {
-				set_bit(CGRP_FROZEN, &cgrp->flags);
-				cgroup_file_notify(&cgrp->events_file);
-				desc++;
-			}
-		} else {
-			cgrp->freezer.nr_frozen_descendants -= desc;
-			if (test_bit(CGRP_FROZEN, &cgrp->flags)) {
-				clear_bit(CGRP_FROZEN, &cgrp->flags);
-				cgroup_file_notify(&cgrp->events_file);
-				desc++;
-			}
-		}
-	}
-}
-
-/*
- * Revisit the cgroup frozen state.
- * Checks if the cgroup is really frozen and perform all state transitions.
- */
-void cgroup_update_frozen(struct cgroup *cgrp)
-{
-	bool frozen;
-
-	lockdep_assert_held(&css_set_lock);
-
-	/*
-	 * If the cgroup has to be frozen (CGRP_FREEZE bit set),
-	 * and all tasks are frozen and/or stopped, let's consider
-	 * the cgroup frozen. Otherwise it's not frozen.
-	 */
-	frozen = test_bit(CGRP_FREEZE, &cgrp->flags) &&
-		cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp);
-
-	if (frozen) {
-		/* Already there? */
-		if (test_bit(CGRP_FROZEN, &cgrp->flags))
-			return;
-
-		set_bit(CGRP_FROZEN, &cgrp->flags);
-	} else {
-		/* Already there? */
-		if (!test_bit(CGRP_FROZEN, &cgrp->flags))
-			return;
-
-		clear_bit(CGRP_FROZEN, &cgrp->flags);
-	}
-	cgroup_file_notify(&cgrp->events_file);
-
-	/* Update the state of ancestor cgroups. */
-	cgroup_propagate_frozen(cgrp, frozen);
-}
-
-/*
- * Increment cgroup's nr_frozen_tasks.
- */
-static void cgroup_inc_frozen_cnt(struct cgroup *cgrp)
-{
-	cgrp->freezer.nr_frozen_tasks++;
-}
-
-/*
- * Decrement cgroup's nr_frozen_tasks.
- */
-static void cgroup_dec_frozen_cnt(struct cgroup *cgrp)
-{
-	cgrp->freezer.nr_frozen_tasks--;
-	WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0);
-}
-
-/*
- * Enter frozen/stopped state, if not yet there. Update cgroup's counters,
- * and revisit the state of the cgroup, if necessary.
- */
-void cgroup_enter_frozen(void)
-{
-	struct cgroup *cgrp;
-
-	if (current->frozen)
-		return;
-
-	spin_lock_irq(&css_set_lock);
-	current->frozen = true;
-	cgrp = task_dfl_cgroup(current);
-	cgroup_inc_frozen_cnt(cgrp);
-	cgroup_update_frozen(cgrp);
-	spin_unlock_irq(&css_set_lock);
-}
-
-/*
- * Conditionally leave frozen/stopped state. Update cgroup's counters,
- * and revisit the state of the cgroup, if necessary.
- *
- * If always_leave is not set, and the cgroup is freezing,
- * we're racing with the cgroup freezing. In this case, we don't
- * drop the frozen counter to avoid a transient switch to
- * the unfrozen state.
- */
-void cgroup_leave_frozen(bool always_leave)
-{
-	struct cgroup *cgrp;
-
-	spin_lock_irq(&css_set_lock);
-	cgrp = task_dfl_cgroup(current);
-	if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) {
-		cgroup_dec_frozen_cnt(cgrp);
-		cgroup_update_frozen(cgrp);
-		WARN_ON_ONCE(!current->frozen);
-		current->frozen = false;
-	}
-	spin_unlock_irq(&css_set_lock);
-
-	if (unlikely(current->frozen)) {
-		/*
-		 * If the task remained in the frozen state,
-		 * make sure it won't reach userspace without
-		 * entering the signal handling loop.
-		 */
-		spin_lock_irq(&current->sighand->siglock);
-		recalc_sigpending();
-		spin_unlock_irq(&current->sighand->siglock);
-	}
-}
-
-/*
- * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE
- * jobctl bit.
- */
-static void cgroup_freeze_task(struct task_struct *task, bool freeze)
-{
-	unsigned long flags;
-
-	/* If the task is about to die, don't bother with freezing it. */
-	if (!lock_task_sighand(task, &flags))
-		return;
-
-	if (freeze) {
-		task->jobctl |= JOBCTL_TRAP_FREEZE;
-		signal_wake_up(task, false);
-	} else {
-		task->jobctl &= ~JOBCTL_TRAP_FREEZE;
-		wake_up_process(task);
-	}
-
-	unlock_task_sighand(task, &flags);
-}
-
-/*
- * Freeze or unfreeze all tasks in the given cgroup.
- */
-static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
-{
-	struct css_task_iter it;
-	struct task_struct *task;
-
-	lockdep_assert_held(&cgroup_mutex);
-
-	spin_lock_irq(&css_set_lock);
-	if (freeze)
-		set_bit(CGRP_FREEZE, &cgrp->flags);
-	else
-		clear_bit(CGRP_FREEZE, &cgrp->flags);
-	spin_unlock_irq(&css_set_lock);
-
-	css_task_iter_start(&cgrp->self, 0, &it);
-	while ((task = css_task_iter_next(&it))) {
-		/*
-		 * Ignore kernel threads here. Freezing cgroups containing
-		 * kthreads isn't supported.
-		 */
-		if (task->flags & PF_KTHREAD)
-			continue;
-		cgroup_freeze_task(task, freeze);
-	}
-	css_task_iter_end(&it);
-
-	/*
-	 * Cgroup state should be revisited here to cover empty leaf cgroups
-	 * and cgroups which descendants are already in the desired state.
-	 */
-	spin_lock_irq(&css_set_lock);
-	if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants)
-		cgroup_update_frozen(cgrp);
-	spin_unlock_irq(&css_set_lock);
-}
-
-/*
- * Adjust the task state (freeze or unfreeze) and revisit the state of
- * source and destination cgroups.
- */
-void cgroup_freezer_migrate_task(struct task_struct *task,
-				 struct cgroup *src, struct cgroup *dst)
-{
-	lockdep_assert_held(&css_set_lock);
-
-	/*
-	 * Kernel threads are not supposed to be frozen at all.
-	 */
-	if (task->flags & PF_KTHREAD)
-		return;
-
-	/*
-	 * Adjust counters of freezing and frozen tasks.
-	 * Note, that if the task is frozen, but the destination cgroup is not
-	 * frozen, we bump both counters to keep them balanced.
-	 */
-	if (task->frozen) {
-		cgroup_inc_frozen_cnt(dst);
-		cgroup_dec_frozen_cnt(src);
-	}
-	cgroup_update_frozen(dst);
-	cgroup_update_frozen(src);
-
-	/*
-	 * Force the task to the desired state.
-	 */
-	cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags));
-}
-
-void cgroup_freezer_frozen_exit(struct task_struct *task)
-{
-	struct cgroup *cgrp = task_dfl_cgroup(task);
-
-	lockdep_assert_held(&css_set_lock);
-
-	cgroup_dec_frozen_cnt(cgrp);
-	cgroup_update_frozen(cgrp);
-}
-
-void cgroup_freeze(struct cgroup *cgrp, bool freeze)
-{
-	struct cgroup_subsys_state *css;
-	struct cgroup *dsct;
-	bool applied = false;
-
-	lockdep_assert_held(&cgroup_mutex);
-
-	/*
-	 * Nothing changed? Just exit.
-	 */
-	if (cgrp->freezer.freeze == freeze)
-		return;
-
-	cgrp->freezer.freeze = freeze;
-
-	/*
-	 * Propagate changes downwards the cgroup tree.
-	 */
-	css_for_each_descendant_pre(css, &cgrp->self) {
-		dsct = css->cgroup;
-
-		if (cgroup_is_dead(dsct))
-			continue;
-
-		if (freeze) {
-			dsct->freezer.e_freeze++;
-			/*
-			 * Already frozen because of ancestor's settings?
-			 */
-			if (dsct->freezer.e_freeze > 1)
-				continue;
-		} else {
-			dsct->freezer.e_freeze--;
-			/*
-			 * Still frozen because of ancestor's settings?
-			 */
-			if (dsct->freezer.e_freeze > 0)
-				continue;
-
-			WARN_ON_ONCE(dsct->freezer.e_freeze < 0);
-		}
-
-		/*
-		 * Do change actual state: freeze or unfreeze.
-		 */
-		cgroup_do_freeze(dsct, freeze);
-		applied = true;
-	}
-
-	/*
-	 * Even if the actual state hasn't changed, let's notify a user.
-	 * The state can be enforced by an ancestor cgroup: the cgroup
-	 * can already be in the desired state or it can be locked in the
-	 * opposite state, so that the transition will never happen.
-	 * In both cases it's better to notify a user, that there is
-	 * nothing to wait for.
-	 */
-	if (!applied)
-		cgroup_file_notify(&cgrp->events_file);
-}
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
deleted file mode 100644
index 86e9bbeb57ec..000000000000
--- a/kernel/cgroup/namespace.c
+++ /dev/null
@@ -1,155 +0,0 @@
-#include "cgroup-internal.h"
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/nsproxy.h>
-#include <linux/proc_ns.h>
-
-
-/* cgroup namespaces */
-
-static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
-{
-	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
-}
-
-static void dec_cgroup_namespaces(struct ucounts *ucounts)
-{
-	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
-}
-
-static struct cgroup_namespace *alloc_cgroup_ns(void)
-{
-	struct cgroup_namespace *new_ns;
-	int ret;
-
-	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
-	if (!new_ns)
-		return ERR_PTR(-ENOMEM);
-	ret = ns_alloc_inum(&new_ns->ns);
-	if (ret) {
-		kfree(new_ns);
-		return ERR_PTR(ret);
-	}
-	refcount_set(&new_ns->count, 1);
-	new_ns->ns.ops = &cgroupns_operations;
-	return new_ns;
-}
-
-void free_cgroup_ns(struct cgroup_namespace *ns)
-{
-	put_css_set(ns->root_cset);
-	dec_cgroup_namespaces(ns->ucounts);
-	put_user_ns(ns->user_ns);
-	ns_free_inum(&ns->ns);
-	kfree(ns);
-}
-EXPORT_SYMBOL(free_cgroup_ns);
-
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
-					struct user_namespace *user_ns,
-					struct cgroup_namespace *old_ns)
-{
-	struct cgroup_namespace *new_ns;
-	struct ucounts *ucounts;
-	struct css_set *cset;
-
-	BUG_ON(!old_ns);
-
-	if (!(flags & CLONE_NEWCGROUP)) {
-		get_cgroup_ns(old_ns);
-		return old_ns;
-	}
-
-	/* Allow only sysadmin to create cgroup namespace. */
-	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	ucounts = inc_cgroup_namespaces(user_ns);
-	if (!ucounts)
-		return ERR_PTR(-ENOSPC);
-
-	/* It is not safe to take cgroup_mutex here */
-	spin_lock_irq(&css_set_lock);
-	cset = task_css_set(current);
-	get_css_set(cset);
-	spin_unlock_irq(&css_set_lock);
-
-	new_ns = alloc_cgroup_ns();
-	if (IS_ERR(new_ns)) {
-		put_css_set(cset);
-		dec_cgroup_namespaces(ucounts);
-		return new_ns;
-	}
-
-	new_ns->user_ns = get_user_ns(user_ns);
-	new_ns->ucounts = ucounts;
-	new_ns->root_cset = cset;
-
-	return new_ns;
-}
-
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
-	return container_of(ns, struct cgroup_namespace, ns);
-}
-
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
-{
-	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
-
-	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
-	    !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
-
-	/* Don't need to do anything if we are attaching to our own cgroupns. */
-	if (cgroup_ns == nsproxy->cgroup_ns)
-		return 0;
-
-	get_cgroup_ns(cgroup_ns);
-	put_cgroup_ns(nsproxy->cgroup_ns);
-	nsproxy->cgroup_ns = cgroup_ns;
-
-	return 0;
-}
-
-static struct ns_common *cgroupns_get(struct task_struct *task)
-{
-	struct cgroup_namespace *ns = NULL;
-	struct nsproxy *nsproxy;
-
-	task_lock(task);
-	nsproxy = task->nsproxy;
-	if (nsproxy) {
-		ns = nsproxy->cgroup_ns;
-		get_cgroup_ns(ns);
-	}
-	task_unlock(task);
-
-	return ns ? &ns->ns : NULL;
-}
-
-static void cgroupns_put(struct ns_common *ns)
-{
-	put_cgroup_ns(to_cg_ns(ns));
-}
-
-static struct user_namespace *cgroupns_owner(struct ns_common *ns)
-{
-	return to_cg_ns(ns)->user_ns;
-}
-
-const struct proc_ns_operations cgroupns_operations = {
-	.name		= "cgroup",
-	.type		= CLONE_NEWCGROUP,
-	.get		= cgroupns_get,
-	.put		= cgroupns_put,
-	.install	= cgroupns_install,
-	.owner		= cgroupns_owner,
-};
-
-static __init int cgroup_namespaces_init(void)
-{
-	return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
deleted file mode 100644
index defad3c5e7dc..000000000000
--- a/kernel/cgroup/rdma.c
+++ /dev/null
@@ -1,619 +0,0 @@
-/*
- * RDMA resource limiting controller for cgroups.
- *
- * Used to allow a cgroup hierarchy to stop processes from consuming
- * additional RDMA resources after a certain limit is reached.
- *
- * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
- */
-
-#include <linux/bitops.h>
-#include <linux/slab.h>
-#include <linux/seq_file.h>
-#include <linux/cgroup.h>
-#include <linux/parser.h>
-#include <linux/cgroup_rdma.h>
-
-#define RDMACG_MAX_STR "max"
-
-/*
- * Protects list of resource pools maintained on per cgroup basis
- * and rdma device list.
- */
-static DEFINE_MUTEX(rdmacg_mutex);
-static LIST_HEAD(rdmacg_devices);
-
-enum rdmacg_file_type {
-	RDMACG_RESOURCE_TYPE_MAX,
-	RDMACG_RESOURCE_TYPE_STAT,
-};
-
-/*
- * resource table definition as to be seen by the user.
- * Need to add entries to it when more resources are
- * added/defined at IB verb/core layer.
- */
-static char const *rdmacg_resource_names[] = {
-	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
-	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
-};
-
-/* resource tracker for each resource of rdma cgroup */
-struct rdmacg_resource {
-	int max;
-	int usage;
-};
-
-/*
- * resource pool object which represents per cgroup, per device
- * resources. There are multiple instances of this object per cgroup,
- * therefore it cannot be embedded within rdma_cgroup structure. It
- * is maintained as list.
- */
-struct rdmacg_resource_pool {
-	struct rdmacg_device	*device;
-	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
-
-	struct list_head	cg_node;
-	struct list_head	dev_node;
-
-	/* count active user tasks of this pool */
-	u64			usage_sum;
-	/* total number counts which are set to max */
-	int			num_max_cnt;
-};
-
-static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
-{
-	return container_of(css, struct rdma_cgroup, css);
-}
-
-static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
-{
-	return css_rdmacg(cg->css.parent);
-}
-
-static inline struct rdma_cgroup *get_current_rdmacg(void)
-{
-	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
-}
-
-static void set_resource_limit(struct rdmacg_resource_pool *rpool,
-			       int index, int new_max)
-{
-	if (new_max == S32_MAX) {
-		if (rpool->resources[index].max != S32_MAX)
-			rpool->num_max_cnt++;
-	} else {
-		if (rpool->resources[index].max == S32_MAX)
-			rpool->num_max_cnt--;
-	}
-	rpool->resources[index].max = new_max;
-}
-
-static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
-{
-	int i;
-
-	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
-		set_resource_limit(rpool, i, S32_MAX);
-}
-
-static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
-{
-	lockdep_assert_held(&rdmacg_mutex);
-
-	list_del(&rpool->cg_node);
-	list_del(&rpool->dev_node);
-	kfree(rpool);
-}
-
-static struct rdmacg_resource_pool *
-find_cg_rpool_locked(struct rdma_cgroup *cg,
-		     struct rdmacg_device *device)
-
-{
-	struct rdmacg_resource_pool *pool;
-
-	lockdep_assert_held(&rdmacg_mutex);
-
-	list_for_each_entry(pool, &cg->rpools, cg_node)
-		if (pool->device == device)
-			return pool;
-
-	return NULL;
-}
-
-static struct rdmacg_resource_pool *
-get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
-{
-	struct rdmacg_resource_pool *rpool;
-
-	rpool = find_cg_rpool_locked(cg, device);
-	if (rpool)
-		return rpool;
-
-	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
-	if (!rpool)
-		return ERR_PTR(-ENOMEM);
-
-	rpool->device = device;
-	set_all_resource_max_limit(rpool);
-
-	INIT_LIST_HEAD(&rpool->cg_node);
-	INIT_LIST_HEAD(&rpool->dev_node);
-	list_add_tail(&rpool->cg_node, &cg->rpools);
-	list_add_tail(&rpool->dev_node, &device->rpools);
-	return rpool;
-}
-
-/**
- * uncharge_cg_locked - uncharge resource for rdma cgroup
- * @cg: pointer to cg to uncharge and all parents in hierarchy
- * @device: pointer to rdmacg device
- * @index: index of the resource to uncharge in cg (resource pool)
- *
- * It also frees the resource pool which was created as part of
- * charging operation when there are no resources attached to
- * resource pool.
- */
-static void
-uncharge_cg_locked(struct rdma_cgroup *cg,
-		   struct rdmacg_device *device,
-		   enum rdmacg_resource_type index)
-{
-	struct rdmacg_resource_pool *rpool;
-
-	rpool = find_cg_rpool_locked(cg, device);
-
-	/*
-	 * rpool cannot be null at this stage. Let kernel operate in case
-	 * if there a bug in IB stack or rdma controller, instead of crashing
-	 * the system.
-	 */
-	if (unlikely(!rpool)) {
-		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
-		return;
-	}
-
-	rpool->resources[index].usage--;
-
-	/*
-	 * A negative count (or overflow) is invalid,
-	 * it indicates a bug in the rdma controller.
-	 */
-	WARN_ON_ONCE(rpool->resources[index].usage < 0);
-	rpool->usage_sum--;
-	if (rpool->usage_sum == 0 &&
-	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
-		/*
-		 * No user of the rpool and all entries are set to max, so
-		 * safe to delete this rpool.
-		 */
-		free_cg_rpool_locked(rpool);
-	}
-}
-
-/**
- * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
- * @device: pointer to rdmacg device
- * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
- *           stop uncharging
- * @index: index of the resource to uncharge in cg in given resource pool
- */
-static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
-				     struct rdmacg_device *device,
-				     struct rdma_cgroup *stop_cg,
-				     enum rdmacg_resource_type index)
-{
-	struct rdma_cgroup *p;
-
-	mutex_lock(&rdmacg_mutex);
-
-	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
-		uncharge_cg_locked(p, device, index);
-
-	mutex_unlock(&rdmacg_mutex);
-
-	css_put(&cg->css);
-}
-
-/**
- * rdmacg_uncharge - hierarchically uncharge rdma resource count
- * @device: pointer to rdmacg device
- * @index: index of the resource to uncharge in cgroup in given resource pool
- */
-void rdmacg_uncharge(struct rdma_cgroup *cg,
-		     struct rdmacg_device *device,
-		     enum rdmacg_resource_type index)
-{
-	if (index >= RDMACG_RESOURCE_MAX)
-		return;
-
-	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
-}
-EXPORT_SYMBOL(rdmacg_uncharge);
-
-/**
- * rdmacg_try_charge - hierarchically try to charge the rdma resource
- * @rdmacg: pointer to rdma cgroup which will own this resource
- * @device: pointer to rdmacg device
- * @index: index of the resource to charge in cgroup (resource pool)
- *
- * This function follows charging resource in hierarchical way.
- * It will fail if the charge would cause the new value to exceed the
- * hierarchical limit.
- * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
- * Returns pointer to rdmacg for this resource when charging is successful.
- *
- * Charger needs to account resources on two criteria.
- * (a) per cgroup & (b) per device resource usage.
- * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
- * the configured limits. Per device provides granular configuration
- * in multi device usage. It allocates resource pool in the hierarchy
- * for each parent it come across for first resource. Later on resource
- * pool will be available. Therefore it will be much faster thereon
- * to charge/uncharge.
- */
-int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
-		      struct rdmacg_device *device,
-		      enum rdmacg_resource_type index)
-{
-	struct rdma_cgroup *cg, *p;
-	struct rdmacg_resource_pool *rpool;
-	s64 new;
-	int ret = 0;
-
-	if (index >= RDMACG_RESOURCE_MAX)
-		return -EINVAL;
-
-	/*
-	 * hold on to css, as cgroup can be removed but resource
-	 * accounting happens on css.
-	 */
-	cg = get_current_rdmacg();
-
-	mutex_lock(&rdmacg_mutex);
-	for (p = cg; p; p = parent_rdmacg(p)) {
-		rpool = get_cg_rpool_locked(p, device);
-		if (IS_ERR(rpool)) {
-			ret = PTR_ERR(rpool);
-			goto err;
-		} else {
-			new = rpool->resources[index].usage + 1;
-			if (new > rpool->resources[index].max) {
-				ret = -EAGAIN;
-				goto err;
-			} else {
-				rpool->resources[index].usage = new;
-				rpool->usage_sum++;
-			}
-		}
-	}
-	mutex_unlock(&rdmacg_mutex);
-
-	*rdmacg = cg;
-	return 0;
-
-err:
-	mutex_unlock(&rdmacg_mutex);
-	rdmacg_uncharge_hierarchy(cg, device, p, index);
-	return ret;
-}
-EXPORT_SYMBOL(rdmacg_try_charge);
-
-/**
- * rdmacg_register_device - register rdmacg device to rdma controller.
- * @device: pointer to rdmacg device whose resources need to be accounted.
- *
- * If IB stack wish a device to participate in rdma cgroup resource
- * tracking, it must invoke this API to register with rdma cgroup before
- * any user space application can start using the RDMA resources.
- * Returns 0 on success or EINVAL when table length given is beyond
- * supported size.
- */
-int rdmacg_register_device(struct rdmacg_device *device)
-{
-	INIT_LIST_HEAD(&device->dev_node);
-	INIT_LIST_HEAD(&device->rpools);
-
-	mutex_lock(&rdmacg_mutex);
-	list_add_tail(&device->dev_node, &rdmacg_devices);
-	mutex_unlock(&rdmacg_mutex);
-	return 0;
-}
-EXPORT_SYMBOL(rdmacg_register_device);
-
-/**
- * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
- * @device: pointer to rdmacg device which was previously registered with rdma
- *          controller using rdmacg_register_device().
- *
- * IB stack must invoke this after all the resources of the IB device
- * are destroyed and after ensuring that no more resources will be created
- * when this API is invoked.
- */
-void rdmacg_unregister_device(struct rdmacg_device *device)
-{
-	struct rdmacg_resource_pool *rpool, *tmp;
-
-	/*
-	 * Synchronize with any active resource settings,
-	 * usage query happening via configfs.
-	 */
-	mutex_lock(&rdmacg_mutex);
-	list_del_init(&device->dev_node);
-
-	/*
-	 * Now that this device is off the cgroup list, its safe to free
-	 * all the rpool resources.
-	 */
-	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
-		free_cg_rpool_locked(rpool);
-
-	mutex_unlock(&rdmacg_mutex);
-}
-EXPORT_SYMBOL(rdmacg_unregister_device);
-
-static int parse_resource(char *c, int *intval)
-{
-	substring_t argstr;
-	const char **table = &rdmacg_resource_names[0];
-	char *name, *value = c;
-	size_t len;
-	int ret, i = 0;
-
-	name = strsep(&value, "=");
-	if (!name || !value)
-		return -EINVAL;
-
-	len = strlen(value);
-
-	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
-		if (strcmp(table[i], name))
-			continue;
-
-		argstr.from = value;
-		argstr.to = value + len;
-
-		ret = match_int(&argstr, intval);
-		if (ret >= 0) {
-			if (*intval < 0)
-				break;
-			return i;
-		}
-		if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
-			*intval = S32_MAX;
-			return i;
-		}
-		break;
-	}
-	return -EINVAL;
-}
-
-static int rdmacg_parse_limits(char *options,
-			       int *new_limits, unsigned long *enables)
-{
-	char *c;
-	int err = -EINVAL;
-
-	/* parse resource options */
-	while ((c = strsep(&options, " ")) != NULL) {
-		int index, intval;
-
-		index = parse_resource(c, &intval);
-		if (index < 0)
-			goto err;
-
-		new_limits[index] = intval;
-		*enables |= BIT(index);
-	}
-	return 0;
-
-err:
-	return err;
-}
-
-static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
-{
-	struct rdmacg_device *device;
-
-	lockdep_assert_held(&rdmacg_mutex);
-
-	list_for_each_entry(device, &rdmacg_devices, dev_node)
-		if (!strcmp(name, device->name))
-			return device;
-
-	return NULL;
-}
-
-static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
-				       char *buf, size_t nbytes, loff_t off)
-{
-	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
-	const char *dev_name;
-	struct rdmacg_resource_pool *rpool;
-	struct rdmacg_device *device;
-	char *options = strstrip(buf);
-	int *new_limits;
-	unsigned long enables = 0;
-	int i = 0, ret = 0;
-
-	/* extract the device name first */
-	dev_name = strsep(&options, " ");
-	if (!dev_name) {
-		ret = -EINVAL;
-		goto err;
-	}
-
-	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
-	if (!new_limits) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	ret = rdmacg_parse_limits(options, new_limits, &enables);
-	if (ret)
-		goto parse_err;
-
-	/* acquire lock to synchronize with hot plug devices */
-	mutex_lock(&rdmacg_mutex);
-
-	device = rdmacg_get_device_locked(dev_name);
-	if (!device) {
-		ret = -ENODEV;
-		goto dev_err;
-	}
-
-	rpool = get_cg_rpool_locked(cg, device);
-	if (IS_ERR(rpool)) {
-		ret = PTR_ERR(rpool);
-		goto dev_err;
-	}
-
-	/* now set the new limits of the rpool */
-	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
-		set_resource_limit(rpool, i, new_limits[i]);
-
-	if (rpool->usage_sum == 0 &&
-	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
-		/*
-		 * No user of the rpool and all entries are set to max, so
-		 * safe to delete this rpool.
-		 */
-		free_cg_rpool_locked(rpool);
-	}
-
-dev_err:
-	mutex_unlock(&rdmacg_mutex);
-
-parse_err:
-	kfree(new_limits);
-
-err:
-	return ret ?: nbytes;
-}
-
-static void print_rpool_values(struct seq_file *sf,
-			       struct rdmacg_resource_pool *rpool)
-{
-	enum rdmacg_file_type sf_type;
-	int i;
-	u32 value;
-
-	sf_type = seq_cft(sf)->private;
-
-	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
-		seq_puts(sf, rdmacg_resource_names[i]);
-		seq_putc(sf, '=');
-		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
-			if (rpool)
-				value = rpool->resources[i].max;
-			else
-				value = S32_MAX;
-		} else {
-			if (rpool)
-				value = rpool->resources[i].usage;
-			else
-				value = 0;
-		}
-
-		if (value == S32_MAX)
-			seq_puts(sf, RDMACG_MAX_STR);
-		else
-			seq_printf(sf, "%d", value);
-		seq_putc(sf, ' ');
-	}
-}
-
-static int rdmacg_resource_read(struct seq_file *sf, void *v)
-{
-	struct rdmacg_device *device;
-	struct rdmacg_resource_pool *rpool;
-	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
-
-	mutex_lock(&rdmacg_mutex);
-
-	list_for_each_entry(device, &rdmacg_devices, dev_node) {
-		seq_printf(sf, "%s ", device->name);
-
-		rpool = find_cg_rpool_locked(cg, device);
-		print_rpool_values(sf, rpool);
-
-		seq_putc(sf, '\n');
-	}
-
-	mutex_unlock(&rdmacg_mutex);
-	return 0;
-}
-
-static struct cftype rdmacg_files[] = {
-	{
-		.name = "max",
-		.write = rdmacg_resource_set_max,
-		.seq_show = rdmacg_resource_read,
-		.private = RDMACG_RESOURCE_TYPE_MAX,
-		.flags = CFTYPE_NOT_ON_ROOT,
-	},
-	{
-		.name = "current",
-		.seq_show = rdmacg_resource_read,
-		.private = RDMACG_RESOURCE_TYPE_STAT,
-		.flags = CFTYPE_NOT_ON_ROOT,
-	},
-	{ }	/* terminate */
-};
-
-static struct cgroup_subsys_state *
-rdmacg_css_alloc(struct cgroup_subsys_state *parent)
-{
-	struct rdma_cgroup *cg;
-
-	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
-	if (!cg)
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&cg->rpools);
-	return &cg->css;
-}
-
-static void rdmacg_css_free(struct cgroup_subsys_state *css)
-{
-	struct rdma_cgroup *cg = css_rdmacg(css);
-
-	kfree(cg);
-}
-
-/**
- * rdmacg_css_offline - cgroup css_offline callback
- * @css: css of interest
- *
- * This function is called when @css is about to go away and responsible
- * for shooting down all rdmacg associated with @css. As part of that it
- * marks all the resource pool entries to max value, so that when resources are
- * uncharged, associated resource pool can be freed as well.
- */
-static void rdmacg_css_offline(struct cgroup_subsys_state *css)
-{
-	struct rdma_cgroup *cg = css_rdmacg(css);
-	struct rdmacg_resource_pool *rpool;
-
-	mutex_lock(&rdmacg_mutex);
-
-	list_for_each_entry(rpool, &cg->rpools, cg_node)
-		set_all_resource_max_limit(rpool);
-
-	mutex_unlock(&rdmacg_mutex);
-}
-
-struct cgroup_subsys rdma_cgrp_subsys = {
-	.css_alloc	= rdmacg_css_alloc,
-	.css_free	= rdmacg_css_free,
-	.css_offline	= rdmacg_css_offline,
-	.legacy_cftypes	= rdmacg_files,
-	.dfl_cftypes	= rdmacg_files,
-};
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup_freezer.c
similarity index 99%
rename from kernel/cgroup/legacy_freezer.c
rename to kernel/cgroup_freezer.c
index 08236798d173..1b72d56edce5 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
 	rcu_read_unlock();
 
 	/* are all tasks frozen? */
-	css_task_iter_start(css, 0, &it);
+	css_task_iter_start(css, &it);
 
 	while ((task = css_task_iter_next(&it))) {
 		if (freezing(task)) {
@@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer)
 	struct css_task_iter it;
 	struct task_struct *task;
 
-	css_task_iter_start(&freezer->css, 0, &it);
+	css_task_iter_start(&freezer->css, &it);
 	while ((task = css_task_iter_next(&it)))
 		freeze_task(task);
 	css_task_iter_end(&it);
@@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer)
 	struct css_task_iter it;
 	struct task_struct *task;
 
-	css_task_iter_start(&freezer->css, 0, &it);
+	css_task_iter_start(&freezer->css, &it);
 	while ((task = css_task_iter_next(&it)))
 		__thaw_task(task);
 	css_task_iter_end(&it);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup_pids.c
similarity index 98%
rename from kernel/cgroup/pids.c
rename to kernel/cgroup_pids.c
index 6f064cce257a..b8b898e21c19 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup_pids.c
@@ -248,7 +248,7 @@ static void pids_cancel_fork(struct task_struct *task)
 	pids_uncharge(pids, 1);
 }
 
-static void pids_release(struct task_struct *task)
+static void pids_free(struct task_struct *task)
 {
 	struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
 
@@ -343,8 +343,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
 	.cancel_attach 	= pids_cancel_attach,
 	.can_fork	= pids_can_fork,
 	.cancel_fork	= pids_cancel_fork,
-	.release	= pids_release,
+	.free		= pids_free,
 	.legacy_cftypes	= pids_files,
 	.dfl_cftypes	= pids_files,
-	.threaded	= true,
 };
diff --git a/kernel/cgroup/cpuset.c b/kernel/cpuset.c
similarity index 97%
rename from kernel/cgroup/cpuset.c
rename to kernel/cpuset.c
index 4890211f5709..df64cb9ba63a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cpuset.c
@@ -298,16 +298,6 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 
-/*
- * Cgroup v2 behavior is used when on default hierarchy or the
- * cgroup_v2_mode flag is set.
- */
-static inline bool is_in_v2_mode(void)
-{
-	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
-	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
-}
-
 /*
  * This is ugly, but preserves the userspace API for existing cpuset
  * users. If someone tries to mount the "cpuset" filesystem, we
@@ -504,7 +494,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 
 	/* On legacy hiearchy, we must be a subset of our parent cpuset. */
 	ret = -EACCES;
-	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
+	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+	    !is_cpuset_subset(trial, par))
 		goto out;
 
 	/*
@@ -889,7 +880,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
 	struct css_task_iter it;
 	struct task_struct *task;
 
-	css_task_iter_start(&cs->css, 0, &it);
+	css_task_iter_start(&cs->css, &it);
 	while ((task = css_task_iter_next(&it)))
 		update_cpus_allowed(cs, task, cs->effective_cpus);
 	css_task_iter_end(&it);
@@ -922,7 +913,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some CPUs.
 		 */
-		if (is_in_v2_mode() && cpumask_empty(new_cpus))
+		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+		    cpumask_empty(new_cpus))
 			cpumask_copy(new_cpus, parent->effective_cpus);
 
 		/* Skip the whole subtree if the cpumask remains the same. */
@@ -939,7 +931,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		cpumask_copy(cp->effective_cpus, new_cpus);
 		spin_unlock_irq(&callback_lock);
 
-		WARN_ON(!is_in_v2_mode() &&
+		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
 
 		update_tasks_cpumask(cp);
@@ -1134,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
 	 * is idempotent.  Also migrate pages in each mm to new nodes.
 	 */
-	css_task_iter_start(&cs->css, 0, &it);
+	css_task_iter_start(&cs->css, &it);
 	while ((task = css_task_iter_next(&it))) {
 		struct mm_struct *mm;
 		bool migrate;
@@ -1192,7 +1184,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some MEMs.
 		 */
-		if (is_in_v2_mode() && nodes_empty(*new_mems))
+		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+		    nodes_empty(*new_mems))
 			*new_mems = parent->effective_mems;
 
 		/* Skip the whole subtree if the nodemask remains the same. */
@@ -1209,7 +1202,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 		cp->effective_mems = *new_mems;
 		spin_unlock_irq(&callback_lock);
 
-		WARN_ON(!is_in_v2_mode() &&
+		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 			!nodes_equal(cp->mems_allowed, cp->effective_mems));
 
 		update_tasks_nodemask(cp);
@@ -1326,7 +1319,7 @@ static void update_tasks_flags(struct cpuset *cs)
 	struct css_task_iter it;
 	struct task_struct *task;
 
-	css_task_iter_start(&cs->css, 0, &it);
+	css_task_iter_start(&cs->css, &it);
 	while ((task = css_task_iter_next(&it)))
 		cpuset_update_task_spread_flag(cs, task);
 	css_task_iter_end(&it);
@@ -1500,7 +1493,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 
 	/* allow moving tasks into an empty cpuset if on default hierarchy */
 	ret = -ENOSPC;
-	if (!is_in_v2_mode() &&
+	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
 		goto out_unlock;
 
@@ -1557,7 +1550,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
-	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 
 	/* prepare for attach */
@@ -1613,7 +1605,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		wake_up(&cpuset_attach_wq);
 
 	mutex_unlock(&cpuset_mutex);
-	put_online_cpus();
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -2032,7 +2023,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cpuset_inc();
 
 	spin_lock_irq(&callback_lock);
-	if (is_in_v2_mode()) {
+	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
 	}
@@ -2113,7 +2104,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 	mutex_lock(&cpuset_mutex);
 	spin_lock_irq(&callback_lock);
 
-	if (is_in_v2_mode()) {
+	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
 		top_cpuset.mems_allowed = node_possible_map;
 	} else {
@@ -2183,9 +2174,12 @@ int __init cpuset_init(void)
 {
 	int err = 0;
 
-	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
-	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
-	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
+	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+		BUG();
+	if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+		BUG();
+	if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL))
+		BUG();
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	cpumask_setall(top_cpuset.cpus_requested);
@@ -2201,7 +2195,8 @@ int __init cpuset_init(void)
 	if (err < 0)
 		return err;
 
-	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
+	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
+		BUG();
 
 	return 0;
 }
@@ -2327,7 +2322,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
 	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
 
-	if (is_in_v2_mode())
+	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
 		hotplug_update_tasks(cs, &new_cpus, &new_mems,
 				     cpus_updated, mems_updated);
 	else
@@ -2365,7 +2360,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
 	bool cpus_updated, mems_updated;
-	bool on_dfl = is_in_v2_mode();
+	bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
 
 	mutex_lock(&cpuset_mutex);
 
@@ -2424,7 +2419,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	}
 }
 
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
 	/*
 	 * We're inside cpu hotplug critical region which usually nests
@@ -2469,11 +2464,8 @@ static struct notifier_block cpuset_track_online_nodes_nb = {
  */
 void __init cpuset_init_smp(void)
 {
-	/*
-	 * cpus_allowd/mems_allowed set to v2 values in the initial
-	 * cpuset_bind() call will be reset to v1 values in another
-	 * cpuset_bind() call when v1 cpuset is mounted.
-	 */
+	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
+	top_cpuset.mems_allowed = node_states[N_MEMORY];
 	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
 
 	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
@@ -2507,23 +2499,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 	spin_unlock_irqrestore(&callback_lock, flags);
 }
 
-/**
- * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
- * @tsk: pointer to task_struct with which the scheduler is struggling
- *
- * Description: In the case that the scheduler cannot find an allowed cpu in
- * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
- * mode however, this value is the same as task_cs(tsk)->effective_cpus,
- * which will not contain a sane cpumask during cases such as cpu hotplugging.
- * This is the absolute last resort for the scheduler and it is only used if
- * _every_ other avenue has been traveled.
- **/
-
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
 	rcu_read_lock();
-	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
-		task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+	do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
 	rcu_read_unlock();
 
 	/*
diff --git a/kernel/cred.c b/kernel/cred.c
index ad24a4cb25c0..d63a2d861ac2 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -196,7 +196,7 @@ const struct cred *get_task_cred(struct task_struct *task)
 	do {
 		cred = __task_cred((task));
 		BUG_ON(!cred);
-	} while (!get_cred_rcu(cred));
+	} while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
 
 	rcu_read_unlock();
 	return cred;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 540256086e91..547184b71dce 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11458,12 +11458,5 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
 	.css_alloc	= perf_cgroup_css_alloc,
 	.css_free	= perf_cgroup_css_free,
 	.attach		= perf_cgroup_attach,
-	/*
-	 * Implicitly enable on dfl hierarchy so that perf events can
-	 * always be filtered by cgroup2 path as long as perf_event
-	 * controller is not mounted on a legacy hierarchy.
-	 */
-	.implicit_on_dfl = true,
-	.threaded	= true,
 };
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/exit.c b/kernel/exit.c
index 5e0ca9c806a6..09beccfb0977 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,7 +178,6 @@ void release_task(struct task_struct *p)
 	rcu_read_unlock();
 
 	proc_flush_task(p);
-	cgroup_release(p);
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index cbce7b33193b..00f93deb2829 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1069,9 +1069,7 @@ static int wait_for_vfork_done(struct task_struct *child,
 	int killed;
 
 	freezer_do_not_count();
-	cgroup_enter_frozen();
 	killed = wait_for_completion_killable(vfork);
-	cgroup_leave_frozen(false);
 	freezer_count();
 
 	if (killed) {
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 48445cb61e8f..a5ff5d0ef572 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -269,6 +269,10 @@ EXPORT_SYMBOL_GPL(kernel_power_off);
 
 static DEFINE_MUTEX(reboot_mutex);
 
+#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD)
+extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg);
+#endif
+
 /*
  * Reboot system call: for obvious reasons only root may call it,
  * and even root needs to set up some magic numbers in the registers
@@ -277,9 +281,6 @@ static DEFINE_MUTEX(reboot_mutex);
  *
  * reboot doesn't sync: do that yourself before calling this.
  */
-#ifdef CONFIG_KSU_MANUAL_HOOK
-extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg);
-#endif
 SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		void __user *, arg)
 {
@@ -287,10 +288,9 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 	char buffer[256];
 	int ret = 0;
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
+#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD)
 	ksu_handle_sys_reboot(magic1, magic2, cmd, &arg);
 #endif
-
 	/* We only trust the superuser with rebooting the system. */
 	if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
 		return -EPERM;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 29e696d490f1..f312d7a3b914 100755
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8055,7 +8055,7 @@ static void cpuset_cpu_active(void)
 		 */
 		cpuset_force_rebuild();
 	}
-	cpuset_update_active_cpus();
+	cpuset_update_active_cpus(true);
 }
 
 static int cpuset_cpu_inactive(unsigned int cpu)
@@ -8078,7 +8078,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 
 		if (overflow)
 			return -EBUSY;
-		cpuset_update_active_cpus();
+		cpuset_update_active_cpus(false);
 	} else {
 		num_cpus_frozen++;
 		partition_sched_domains(1, NULL, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index 5b2edc6341f8..8051e3741aed 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -38,7 +38,6 @@
 #include <linux/compiler.h>
 #include <linux/oom.h>
 #include <linux/capability.h>
-#include <linux/cgroup.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
@@ -48,10 +47,6 @@
 #include <asm/unistd.h>
 #include <asm/siginfo.h>
 #include <asm/cacheflush.h>
-#ifdef CONFIG_REKERNEL
-#include <uapi/asm/signal.h>
-#include <../drivers/rekernel/rekernel.h>
-#endif /* CONFIG_REKERNEL */
 #include "audit.h"	/* audit_signal_info() */
 
 /*
@@ -151,10 +146,9 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 
 static int recalc_sigpending_tsk(struct task_struct *t)
 {
-	if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
+	if ((t->jobctl & JOBCTL_PENDING_MASK) ||
 	    PENDING(&t->pending, &t->blocked) ||
-	    PENDING(&t->signal->shared_pending, &t->blocked) ||
-	    cgroup_task_frozen(t)) {
+	    PENDING(&t->signal->shared_pending, &t->blocked)) {
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
 		return 1;
 	}
@@ -1212,10 +1206,6 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
 {
 	unsigned long flags;
 	int ret = -ESRCH;
-#ifdef CONFIG_REKERNEL
-	if (sig == SIGKILL || sig == SIGTERM || sig == SIGABRT || sig == SIGQUIT)
-		rekernel_report(SIGNAL, sig, task_tgid_nr(current), current, task_tgid_nr(p), p, false, NULL);
-#endif /* CONFIG_REKERNEL */
 
 	if (lock_task_sighand(p, &flags)) {
 		ret = send_signal(sig, info, p, group);
@@ -1939,10 +1929,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 		 */
 		preempt_disable();
 		read_unlock(&tasklist_lock);
-		cgroup_enter_frozen();
 		preempt_enable_no_resched();
 		freezable_schedule();
-		cgroup_leave_frozen(true);
 	} else {
 		/*
 		 * By the time we got the lock, our tracer went away.
@@ -2120,7 +2108,6 @@ static bool do_signal_stop(int signr)
 		}
 
 		/* Now we don't run again until woken by SIGCONT or SIGKILL */
-		cgroup_enter_frozen();
 		freezable_schedule();
 		return true;
 	} else {
@@ -2167,43 +2154,6 @@ static void do_jobctl_trap(void)
 	}
 }
 
-/**
- * do_freezer_trap - handle the freezer jobctl trap
- *
- * Puts the task into frozen state, if only the task is not about to quit.
- * In this case it drops JOBCTL_TRAP_FREEZE.
- *
- * CONTEXT:
- * Must be called with @current->sighand->siglock held,
- * which is always released before returning.
- */
-static void do_freezer_trap(void)
-	__releases(&current->sighand->siglock)
-{
-	/*
-	 * If there are other trap bits pending except JOBCTL_TRAP_FREEZE,
-	 * let's make another loop to give it a chance to be handled.
-	 * In any case, we'll return back.
-	 */
-	if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) !=
-	     JOBCTL_TRAP_FREEZE) {
-		spin_unlock_irq(&current->sighand->siglock);
-		return;
-	}
-
-	/*
-	 * Now we're sure that there is no pending fatal signal and no
-	 * pending traps. Clear TIF_SIGPENDING to not get out of schedule()
-	 * immediately (if there is a non-fatal signal pending), and
-	 * put the task into sleep.
-	 */
-	__set_current_state(TASK_INTERRUPTIBLE);
-	clear_thread_flag(TIF_SIGPENDING);
-	spin_unlock_irq(&current->sighand->siglock);
-	cgroup_enter_frozen();
-	freezable_schedule();
-}
-
 static int ptrace_signal(int signr, siginfo_t *info)
 {
 	ptrace_signal_deliver();
@@ -2316,10 +2266,6 @@ int get_signal(struct ksignal *ksig)
 		trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
 				&sighand->action[SIGKILL - 1]);
 		recalc_sigpending();
-		current->jobctl &= ~JOBCTL_TRAP_FREEZE;
-		spin_unlock_irq(&sighand->siglock);
-		if (unlikely(cgroup_task_frozen(current)))
-			cgroup_leave_frozen(true);
 		goto fatal;
 	}
 
@@ -2330,24 +2276,9 @@ int get_signal(struct ksignal *ksig)
 		    do_signal_stop(0))
 			goto relock;
 
-		if (unlikely(current->jobctl &
-			     (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
-			if (current->jobctl & JOBCTL_TRAP_MASK) {
-				do_jobctl_trap();
-				spin_unlock_irq(&sighand->siglock);
-			} else if (current->jobctl & JOBCTL_TRAP_FREEZE)
-				do_freezer_trap();
-
-			goto relock;
-		}
-
-		/*
-		 * If the task is leaving the frozen state, let's update
-		 * cgroup counters and reset the frozen bit.
-		 */
-		if (unlikely(cgroup_task_frozen(current))) {
+		if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
+			do_jobctl_trap();
 			spin_unlock_irq(&sighand->siglock);
-			cgroup_leave_frozen(true);
 			goto relock;
 		}
 
@@ -2441,8 +2372,8 @@ int get_signal(struct ksignal *ksig)
 			continue;
 		}
 
-		spin_unlock_irq(&sighand->siglock);
 	fatal:
+		spin_unlock_irq(&sighand->siglock);
 
 		/*
 		 * Anything else is fatal, maybe with a core dump.
@@ -2477,7 +2408,7 @@ int get_signal(struct ksignal *ksig)
 }
 
 /**
- * signal_delivered -
+ * signal_delivered - 
  * @ksig:		kernel signal struct
  * @stepping:		nonzero if debugger single-step or block-step in use
  *
@@ -3540,7 +3471,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
  */
 SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
 {
-	return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
+	return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); 
 }
 
 #endif
@@ -3665,7 +3596,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 	if (!ret && oact) {
 		sigset_to_compat(&mask, &old_ka.sa.sa_mask);
-		ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
+		ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), 
 			       &oact->sa_handler);
 		ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
 		ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
@@ -3843,7 +3774,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
 		return -EFAULT;
 	return sigsuspend(&newset);
 }
-
+ 
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a49cdf33a62e..b508b47ae3ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -966,7 +966,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 		struct css_task_iter it;
 		struct task_struct *task;
 
-		css_task_iter_start(&iter->css, 0, &it);
+		css_task_iter_start(&iter->css, &it);
 		while (!ret && (task = css_task_iter_next(&it)))
 			ret = fn(task, arg);
 		css_task_iter_end(&it);
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index dad43d1924db..db65b0cdfc4c 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -128,7 +128,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
 
 	cs->classid = (u32)value;
 
-	css_task_iter_start(css, 0, &it);
+	css_task_iter_start(css, &it);
 	while ((p = css_task_iter_next(&it))) {
 		update_classid_task(p, cs->classid);
 		cond_resched();
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7e1caf9ee106..ac0c60389581 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2314,10 +2314,6 @@ static u32 ptrace_parent_sid(struct task_struct *task)
 	return sid;
 }
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
-extern bool is_ksu_transition(const struct task_security_struct *old_tsec,
-			      const struct task_security_struct *new_tsec);
-#endif
 static int check_nnp_nosuid(const struct linux_binprm *bprm,
 			    const struct task_security_struct *old_tsec,
 			    const struct task_security_struct *new_tsec)
@@ -2332,11 +2328,6 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm,
 	if (new_tsec->sid == old_tsec->sid)
 		return 0; /* No change in credentials */
 
-#ifdef CONFIG_KSU_MANUAL_HOOK
-	if (is_ksu_transition(old_tsec, new_tsec))
-		return 0;
-#endif
-
 	/*
 	 * The only transitions we permit under NNP or nosuid
 	 * are transitions to bounded SIDs, i.e. SIDs that are
@@ -5916,6 +5907,10 @@ static int selinux_getprocattr(struct task_struct *p,
 	return -EINVAL;
 }
 
+#ifdef CONFIG_KSU
+extern int ksu_hide_setprocattr(const char *name, void *value, size_t size);
+#endif
+
 static int selinux_setprocattr(struct task_struct *p,
 			       char *name, void *value, size_t size)
 {
@@ -5925,6 +5920,10 @@ static int selinux_setprocattr(struct task_struct *p,
 	int error;
 	char *str = value;
 
+#ifdef CONFIG_KSU
+	ksu_hide_setprocattr(name, value, size);
+#endif
+
 	if (current != p) {
 		/* SELinux only allows a process to change its own
 		   security attributes. */
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index b818410d2418..58e5ccf6b1e9 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -76,7 +76,11 @@ int selinux_policycap_netpeer;
 int selinux_policycap_openperm;
 int selinux_policycap_alwaysnetwork;
 
+#ifdef CONFIG_KSU
+DEFINE_RWLOCK(policy_rwlock);
+#else
 static DEFINE_RWLOCK(policy_rwlock);
+#endif
 
 static struct sidtab sidtab;
 struct policydb policydb;
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 34156826c14f..5bc2b92ace6d 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 {
 	FILE *fp;
 	char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
-	char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path;
 	char *token, *saved_ptr = NULL;
+	int found = 0;
 
 	fp = fopen("/proc/mounts", "r");
 	if (!fp)
@@ -24,43 +24,31 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 	 * and inspect every cgroupfs mount point to find one that has
 	 * perf_event subsystem
 	 */
-	path_v1[0] = '\0';
-	path_v2[0] = '\0';
-
 	while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
 				STR(PATH_MAX)"s %*d %*d\n",
 				mountpoint, type, tokens) == 3) {
 
-		if (!path_v1[0] && !strcmp(type, "cgroup")) {
+		if (!strcmp(type, "cgroup")) {
 
 			token = strtok_r(tokens, ",", &saved_ptr);
 
 			while (token != NULL) {
 				if (!strcmp(token, "perf_event")) {
-					strcpy(path_v1, mountpoint);
+					found = 1;
 					break;
 				}
 				token = strtok_r(NULL, ",", &saved_ptr);
 			}
 		}
-
-		if (!path_v2[0] && !strcmp(type, "cgroup2"))
-			strcpy(path_v2, mountpoint);
-
-		if (path_v1[0] && path_v2[0])
+		if (found)
 			break;
 	}
 	fclose(fp);
-
-	if (path_v1[0])
-		path = path_v1;
-	else if (path_v2[0])
-		path = path_v2;
-	else
+	if (!found)
 		return -1;
 
-	if (strlen(path) < maxlen) {
-		strcpy(buf, path);
+	if (strlen(mountpoint) < maxlen) {
+		strcpy(buf, mountpoint);
 		return 0;
 	}
 	return -1;