diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 948e5d193a8c..e35f5377f681 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -27,9 +27,9 @@ jobs: - ursa steps: - name: 安装软件包 + if: env.PACKAGES != '' env: PACKAGES: - ccache binutils-aarch64-linux-gnu binutils-arm-linux-gnueabi run: | @@ -39,7 +39,7 @@ jobs: - name: 安装make4.4.1-2 run: | curl -LSs http://ftp.debian.org/debian/pool/main/m/make-dfsg/make_4.4.1-2_amd64.deb -o make.deb - sudo apt-get install -y ./make.deb + sudo apt-get install -y -q ./make.deb rm ./make.deb - name: 同步仓库 @@ -49,17 +49,25 @@ jobs: - name: 缓存Clang id: cache-clang - uses: actions/cache@main + uses: actions/cache/restore@main with: path: clang key: clang-${{ env.AOSP_TOOLCHAIN_BRANCH }}-${{ env.AOSP_CLANG_VERSION }} - name: 下载Clang + id: download_clang if: steps.cache-clang.outputs.cache-hit != 'true' - run: - mkdir -p clang && - curl -LSs "https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86/+archive/refs/heads/${{ env.AOSP_TOOLCHAIN_BRANCH }}-release/clang-${{ env.AOSP_CLANG_VERSION }}.tar.gz" | - tar xz -C clang + run: | + mkdir -p clang + wget -c -t 10 "https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86/+archive/refs/heads/${{ env.AOSP_TOOLCHAIN_BRANCH }}-release/clang-${{ env.AOSP_CLANG_VERSION }}.tar.gz" -O clang.tgz + tar -zxvf clang.tgz -C clang/ + + - name: 保存Clang + if: always() && steps.cache-clang.outputs.cache-hit != 'true' && steps.download_clang.outcome == 'success' + uses: actions/cache/save@main + with: + path: clang + key: clang-${{ env.AOSP_TOOLCHAIN_BRANCH }}-${{ env.AOSP_CLANG_VERSION }} - name: 缓存ccache uses: hendrikmuhs/ccache-action@main @@ -73,17 +81,18 @@ jobs: env: MAKE_ARGS: -j$(nproc --all) + O=out + LLVM=1 + LLVM_IAS=1 CC="ccache clang" LD=ld.lld ARCH=arm64 - LLVM=1 - LLVM_IAS=1 - O=out CROSS_COMPILE=aarch64-linux-gnu- CROSS_COMPILE_ARM32=arm-linux-gnueabi- CONFIG_FILES: vendor/xiaomi/mi845_defconfig 
vendor/xiaomi/${{ matrix.device }}.config + lxc.config run: | export PATH=$GITHUB_WORKSPACE/clang/bin:$PATH export KBUILD_BUILD_USER=${{ github.repository_owner }} @@ -112,31 +121,15 @@ jobs: EOF zip -qr9 Anykernel3-${{ matrix.device }}.zip * -x .git .github README.md *placeholder - - name: 打包(boot) - run: | - git clone https://android.googlesource.com/platform/system/tools/mkbootimg --depth=1 mkbootimg - cp kernel/out/arch/arm64/boot/Image.gz-dtb mkbootimg/ - cd mkbootimg - boot_url=$(curl -LSs https://download.lineageos.org/api/v2/devices/${{ matrix.device }}/builds | jq -r '.[0].files[1].url') - curl -LSs $boot_url -o boot.img - mkbootimg_args=$(./unpack_bootimg.py --out out --boot_img boot.img --format mkbootimg) - mv Image.gz-dtb out/kernel - eval "./mkbootimg.py $mkbootimg_args -o boot-lineage-${{ matrix.device }}.img" - - name: 上传文件 uses: actions/upload-artifact@main with: name: kernel-${{ matrix.device }}-ak3 path: ak3/Anykernel3-${{ matrix.device }}.zip - - name: 上传文件 - uses: actions/upload-artifact@main - with: - name: kernel-${{ matrix.device }}-boot - path: mkbootimg/boot-lineage-${{ matrix.device }}.img - release: name: 发布 + if: github.event_name == 'push' permissions: { contents: write } runs-on: ubuntu-latest needs: build @@ -151,54 +144,61 @@ jobs: - name: 获取当前时间 id: time run: | - echo "time=$(TZ='Asia/Shanghai' date -u +'%Y%m%d%H%M')" >> $GITHUB_OUTPUT - echo "timestamp=$(date +%s)" >> $GITHUB_OUTPUT - - - name: 下载ci管理器 - continue-on-error: true - uses: dawidd6/action-download-artifact@master - with: - repo: rsuntk/KernelSU - workflow_conclusion: success - name: manager - workflow: build-manager.yml - path: manager - check_artifacts: true - search_artifacts: true + NOW=$(date +%s) + TIME_STR=$(TZ='Asia/Shanghai' date -d "@$NOW" +'%Y%m%d%H%M') + echo "timestamp=$NOW" >> $GITHUB_OUTPUT + echo "time=$TIME_STR" >> $GITHUB_OUTPUT - name: 发布 uses: softprops/action-gh-release@master + id: release with: tag_name: rel-${{ steps.time.outputs.timestamp }} 
name: Kernel build ${{ steps.time.outputs.time }} prerelease: ${{ startsWith(github.ref_name, 'dev/') }} files: | kernel/* - manager/* - name: 发送Telegram通知 continue-on-error: true - env: - COMMIT_MESSAGE: ${{ github.event.head_commit.message }} - COMMIT_URL: ${{ github.event.head_commit.url }} - RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - RELEASE_URL: ${{ github.server_url }}/${{ github.repository }}/releases/tag/rel-${{ steps.time.outputs.timestamp }} run: | - msg="*CI ${{ steps.time.outputs.time }}* - > Branch/分支: \`${{ github.ref_name }}\` - \`\`\` - $COMMIT_MESSAGE - \`\`\` - [Download/下载]($RELEASE_URL) - [Commit/提交]($COMMIT_URL) - [Run/工作流]($RUN_URL) + IDS=(${{ join(github.event.commits.*.id, ' ') }}) + MAX=6 + if [ "${#IDS[@]}" -gt "$MAX" ]; then + COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]:0:$MAX}"; echo "......")" + else + COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]}")" + fi + MSG="\ + CI ${{ steps.time.outputs.time }} +
\
+          项目: ${{ github.repository }}
+          分支: ${{ github.ref_name }}\
+          
+ 提交ID: +
$COMMIT_IDS_TEXT
\ " - curl -LSs https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \ - -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \ - -F 'message_thread_id=${{ secrets.TELEGRAM_MESSAGE_THREAD_ID }}' \ - -F 'parse_mode="markdownv2"' \ - -F "text=\"$msg\"" | tee Markdown.txt - ! ${{ startsWith(github.ref_name, 'stable/') }} || \ - curl https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \ - -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \ - -F message_id=$(jq '.result.message_id' Markdown.txt) + PREVIEW_OPTIONS="{ \ + \"url\": \"${{ steps.release.outputs.url }}\", \ + \"prefer_small_media\": true, \ + \"show_above_text\": true \ + }" + BUTTONS="{\"inline_keyboard\": [ [ \ + { \"text\": \"下载链接\", \"url\": \"${{ steps.release.outputs.url }}\" }, \ + { \"text\": \"对比差异\", \"url\": \"${{ github.event.compare }}\" } \ + ] ] }" + curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \ + -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \ + -d "message_thread_id=${{ vars.TELEGRAM_MESSAGE_THREAD_ID }}" \ + -d "parse_mode=HTML" \ + --data-urlencode "text=$MSG" \ + -d "link_preview_options=$PREVIEW_OPTIONS" \ + -d "reply_markup=$BUTTONS" \ + -o response.txt && \ + (! 
${{ startsWith(github.ref_name, 'stable/') }} || \ + curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \ + -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \ + -d "message_id=$(jq '.result.message_id' response.txt)") + if [ "${{ runner.debug }}" = "1" ]; then + cat response.txt + fi diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index f8614b3d49f9..a542b9f2a30d 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -1,7 +1,5 @@ 00-INDEX - This file -bfq-iosched.txt - - BFQ IO scheduler and its tunables biodoc.txt - Notes on the Generic Block Layer Rewrite in Linux 2.5 biovecs.txt diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt deleted file mode 100644 index 0539e87962ed..000000000000 --- a/Documentation/block/bfq-iosched.txt +++ /dev/null @@ -1,545 +0,0 @@ -BFQ (Budget Fair Queueing) -========================== - -BFQ is a proportional-share I/O scheduler, with some extra -low-latency capabilities. In addition to cgroups support (blkio or io -controllers), BFQ's main features are: -- BFQ guarantees a high system and application responsiveness, and a - low latency for time-sensitive applications, such as audio or video - players; -- BFQ distributes bandwidth, and not just time, among processes or - groups (switching back to time distribution when needed to keep - throughput high). - -In its default configuration, BFQ privileges latency over -throughput. So, when needed for achieving a lower latency, BFQ builds -schedules that may lead to a lower throughput. If your main or only -goal, for a given device, is to achieve the maximum-possible -throughput at all times, then do switch off all low-latency heuristics -for that device, by setting low_latency to 0. Full details in Section 3. - -On average CPUs, the current version of BFQ can handle devices -performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. 
As a -reference, 30-50 KIOPS correspond to very high bandwidths with -sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and -to 120-200 MB/s with 4KB random I/O. - -The table of contents follow. Impatients can just jump to Section 3. - -CONTENTS - -1. When may BFQ be useful? - 1-1 Personal systems - 1-2 Server systems -2. How does BFQ work? -3. What are BFQ's tunable? -4. BFQ group scheduling - 4-1 Service guarantees provided - 4-2 Interface - -1. When may BFQ be useful? -========================== - -BFQ provides the following benefits on personal and server systems. - -1-1 Personal systems --------------------- - -Low latency for interactive applications - -Regardless of the actual background workload, BFQ guarantees that, for -interactive tasks, the storage device is virtually as responsive as if -it was idle. For example, even if one or more of the following -background workloads are being executed: -- one or more large files are being read, written or copied, -- a tree of source files is being compiled, -- one or more virtual machines are performing I/O, -- a software update is in progress, -- indexing daemons are scanning filesystems and updating their - databases, -starting an application or loading a file from within an application -takes about the same time as if the storage device was idle. As a -comparison, with CFQ, NOOP or DEADLINE, and in the same conditions, -applications experience high latencies, or even become unresponsive -until the background workload terminates (also on SSDs). - -Low latency for soft real-time applications - -Also soft real-time applications, such as audio and video -players/streamers, enjoy a low latency and a low drop rate, regardless -of the background I/O workload. As a consequence, these applications -do not suffer from almost any glitch due to the background workload. 
- -Higher speed for code-development tasks - -If some additional workload happens to be executed in parallel, then -BFQ executes the I/O-related components of typical code-development -tasks (compilation, checkout, merge, ...) much more quickly than CFQ, -NOOP or DEADLINE. - -High throughput - -On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and -up to 150% higher throughput than DEADLINE and NOOP, with all the -sequential workloads considered in our tests. With random workloads, -and with all the workloads on flash-based devices, BFQ achieves, -instead, about the same throughput as the other schedulers. - -Strong fairness, bandwidth and delay guarantees - -BFQ distributes the device throughput, and not just the device time, -among I/O-bound applications in proportion their weights, with any -workload and regardless of the device parameters. From these bandwidth -guarantees, it is possible to compute tight per-I/O-request delay -guarantees by a simple formula. If not configured for strict service -guarantees, BFQ switches to time-based resource sharing (only) for -applications that would otherwise cause a throughput loss. - -1-2 Server systems ------------------- - -Most benefits for server systems follow from the same service -properties as above. In particular, regardless of whether additional, -possibly heavy workloads are being served, BFQ guarantees: - -. audio and video-streaming with zero or very low jitter and drop - rate; - -. fast retrieval of WEB pages and embedded objects; - -. real-time recording of data in live-dumping applications (e.g., - packet logging); - -. responsiveness in local and remote access to a server. - - -2. How does BFQ work? -===================== - -BFQ is a proportional-share I/O scheduler, whose general structure, -plus a lot of code, are borrowed from CFQ. - -- Each process doing I/O on a device is associated with a weight and a - (bfq_)queue. 
- -- BFQ grants exclusive access to the device, for a while, to one queue - (process) at a time, and implements this service model by - associating every queue with a budget, measured in number of - sectors. - - - After a queue is granted access to the device, the budget of the - queue is decremented, on each request dispatch, by the size of the - request. - - - The in-service queue is expired, i.e., its service is suspended, - only if one of the following events occurs: 1) the queue finishes - its budget, 2) the queue empties, 3) a "budget timeout" fires. - - - The budget timeout prevents processes doing random I/O from - holding the device for too long and dramatically reducing - throughput. - - - Actually, as in CFQ, a queue associated with a process issuing - sync requests may not be expired immediately when it empties. In - contrast, BFQ may idle the device for a short time interval, - giving the process the chance to go on being served if it issues - a new request in time. Device idling typically boosts the - throughput on rotational devices, if processes do synchronous - and sequential I/O. In addition, under BFQ, device idling is - also instrumental in guaranteeing the desired throughput - fraction to processes issuing sync requests (see the description - of the slice_idle tunable in this document, or [1, 2], for more - details). - - - With respect to idling for service guarantees, if several - processes are competing for the device at the same time, but - all processes (and groups, after the following commit) have - the same weight, then BFQ guarantees the expected throughput - distribution without ever idling the device. Throughput is - thus as high as possible in this common scenario. - - - If low-latency mode is enabled (default configuration), BFQ - executes some special heuristics to detect interactive and soft - real-time applications (e.g., video or audio players/streamers), - and to reduce their latency. 
The most important action taken to - achieve this goal is to give to the queues associated with these - applications more than their fair share of the device - throughput. For brevity, we call just "weight-raising" the whole - sets of actions taken by BFQ to privilege these queues. In - particular, BFQ provides a milder form of weight-raising for - interactive applications, and a stronger form for soft real-time - applications. - - - BFQ automatically deactivates idling for queues born in a burst of - queue creations. In fact, these queues are usually associated with - the processes of applications and services that benefit mostly - from a high throughput. Examples are systemd during boot, or git - grep. - - - As CFQ, BFQ merges queues performing interleaved I/O, i.e., - performing random I/O that becomes mostly sequential if - merged. Differently from CFQ, BFQ achieves this goal with a more - reactive mechanism, called Early Queue Merge (EQM). EQM is so - responsive in detecting interleaved I/O (cooperating processes), - that it enables BFQ to achieve a high throughput, by queue - merging, even for queues for which CFQ needs a different - mechanism, preemption, to get a high throughput. As such EQM is a - unified mechanism to achieve a high throughput with interleaved - I/O. - - - Queues are scheduled according to a variant of WF2Q+, named - B-WF2Q+, and implemented using an augmented rb-tree to preserve an - O(log N) overall complexity. See [2] for more details. B-WF2Q+ is - also ready for hierarchical scheduling. However, for a cleaner - logical breakdown, the code that enables and completes - hierarchical support is provided in the next commit, which focuses - exactly on this feature. - - - B-WF2Q+ guarantees a tight deviation with respect to an ideal, - perfectly fair, and smooth service. 
In particular, B-WF2Q+ - guarantees that each queue receives a fraction of the device - throughput proportional to its weight, even if the throughput - fluctuates, and regardless of: the device parameters, the current - workload and the budgets assigned to the queue. - - - The last, budget-independence, property (although probably - counterintuitive in the first place) is definitely beneficial, for - the following reasons: - - - First, with any proportional-share scheduler, the maximum - deviation with respect to an ideal service is proportional to - the maximum budget (slice) assigned to queues. As a consequence, - BFQ can keep this deviation tight not only because of the - accurate service of B-WF2Q+, but also because BFQ *does not* - need to assign a larger budget to a queue to let the queue - receive a higher fraction of the device throughput. - - - Second, BFQ is free to choose, for every process (queue), the - budget that best fits the needs of the process, or best - leverages the I/O pattern of the process. In particular, BFQ - updates queue budgets with a simple feedback-loop algorithm that - allows a high throughput to be achieved, while still providing - tight latency guarantees to time-sensitive applications. When - the in-service queue expires, this algorithm computes the next - budget of the queue so as to: - - - Let large budgets be eventually assigned to the queues - associated with I/O-bound applications performing sequential - I/O: in fact, the longer these applications are served once - got access to the device, the higher the throughput is. - - - Let small budgets be eventually assigned to the queues - associated with time-sensitive applications (which typically - perform sporadic and short I/O), because, the smaller the - budget assigned to a queue waiting for service is, the sooner - B-WF2Q+ will serve that queue (Subsec 3.3 in [2]). 
- -- If several processes are competing for the device at the same time, - but all processes and groups have the same weight, then BFQ - guarantees the expected throughput distribution without ever idling - the device. It uses preemption instead. Throughput is then much - higher in this common scenario. - -- ioprio classes are served in strict priority order, i.e., - lower-priority queues are not served as long as there are - higher-priority queues. Among queues in the same class, the - bandwidth is distributed in proportion to the weight of each - queue. A very thin extra bandwidth is however guaranteed to - the Idle class, to prevent it from starving. - - -3. What are BFQ's tunable? -========================== - -The tunables back_seek-max, back_seek_penalty, fifo_expire_async and -fifo_expire_sync below are the same as in CFQ. Their description is -just copied from that for CFQ. Some considerations in the description -of slice_idle are copied from CFQ too. - -per-process ioprio and weight ------------------------------ - -Unless the cgroups interface is used (see "4. BFQ group scheduling"), -weights can be assigned to processes only indirectly, through I/O -priorities, and according to the relation: -weight = (IOPRIO_BE_NR - ioprio) * 10. - -Beware that, if low-latency is set, then BFQ automatically raises the -weight of the queues associated with interactive and soft real-time -applications. Unset this tunable if you need/want to control weights. - -slice_idle ----------- - -This parameter specifies how long BFQ should idle for next I/O -request, when certain sync BFQ queues become empty. By default -slice_idle is a non-zero value. Idling has a double purpose: boosting -throughput and making sure that the desired throughput distribution is -respected (see the description of how BFQ works, and, if needed, the -papers referred there). 
- -As for throughput, idling can be very helpful on highly seeky media -like single spindle SATA/SAS disks where we can cut down on overall -number of seeks and see improved throughput. - -Setting slice_idle to 0 will remove all the idling on queues and one -should see an overall improved throughput on faster storage devices -like multiple SATA/SAS disks in hardware RAID configuration. - -So depending on storage and workload, it might be useful to set -slice_idle=0. In general for SATA/SAS disks and software RAID of -SATA/SAS disks keeping slice_idle enabled should be useful. For any -configurations where there are multiple spindles behind single LUN -(Host based hardware RAID controller or for storage arrays), setting -slice_idle=0 might end up in better throughput and acceptable -latencies. - -Idling is however necessary to have service guarantees enforced in -case of differentiated weights or differentiated I/O-request lengths. -To see why, suppose that a given BFQ queue A must get several I/O -requests served for each request served for another queue B. Idling -ensures that, if A makes a new I/O request slightly after becoming -empty, then no request of B is dispatched in the middle, and thus A -does not lose the possibility to get more than one request dispatched -before the next request of B is dispatched. Note that idling -guarantees the desired differentiated treatment of queues only in -terms of I/O-request dispatches. To guarantee that the actual service -order then corresponds to the dispatch order, the strict_guarantees -tunable must be set too. - -There is an important flipside for idling: apart from the above cases -where it is beneficial also for throughput, idling can severely impact -throughput. One important case is random workload. Because of this -issue, BFQ tends to avoid idling as much as possible, when it is not -beneficial also for throughput. 
As a consequence of this behavior, and -of further issues described for the strict_guarantees tunable, -short-term service guarantees may be occasionally violated. And, in -some cases, these guarantees may be more important than guaranteeing -maximum throughput. For example, in video playing/streaming, a very -low drop rate may be more important than maximum throughput. In these -cases, consider setting the strict_guarantees parameter. - -strict_guarantees ------------------ - -If this parameter is set (default: unset), then BFQ - -- always performs idling when the in-service queue becomes empty; - -- forces the device to serve one I/O request at a time, by dispatching a - new request only if there is no outstanding request. - -In the presence of differentiated weights or I/O-request sizes, both -the above conditions are needed to guarantee that every BFQ queue -receives its allotted share of the bandwidth. The first condition is -needed for the reasons explained in the description of the slice_idle -tunable. The second condition is needed because all modern storage -devices reorder internally-queued requests, which may trivially break -the service guarantees enforced by the I/O scheduler. - -Setting strict_guarantees may evidently affect throughput. - -back_seek_max -------------- - -This specifies, given in Kbytes, the maximum "distance" for backward seeking. -The distance is the amount of space from the current head location to the -sectors that are backward in terms of distance. - -This parameter allows the scheduler to anticipate requests in the "backward" -direction and consider them as being the "next" if they are within this -distance from the current head location. - -back_seek_penalty ------------------ - -This parameter is used to compute the cost of backward seeking. If the -backward distance of request is just 1/back_seek_penalty from a "front" -request, then the seeking cost of two requests is considered equivalent. 
- -So scheduler will not bias toward one or the other request (otherwise scheduler -will bias toward front request). Default value of back_seek_penalty is 2. - -fifo_expire_async ------------------ - -This parameter is used to set the timeout of asynchronous requests. Default -value of this is 248ms. - -fifo_expire_sync ----------------- - -This parameter is used to set the timeout of synchronous requests. Default -value of this is 124ms. In case to favor synchronous requests over asynchronous -one, this value should be decreased relative to fifo_expire_async. - -low_latency ------------ - -This parameter is used to enable/disable BFQ's low latency mode. By -default, low latency mode is enabled. If enabled, interactive and soft -real-time applications are privileged and experience a lower latency, -as explained in more detail in the description of how BFQ works. - -DISABLE this mode if you need full control on bandwidth -distribution. In fact, if it is enabled, then BFQ automatically -increases the bandwidth share of privileged applications, as the main -means to guarantee a lower latency to them. - -In addition, as already highlighted at the beginning of this document, -DISABLE this mode if your only goal is to achieve a high throughput. -In fact, privileging the I/O of some application over the rest may -entail a lower throughput. To achieve the highest-possible throughput -on a non-rotational device, setting slice_idle to 0 may be needed too -(at the cost of giving up any strong guarantee on fairness and low -latency). - -timeout_sync ------------- - -Maximum amount of device time that can be given to a task (queue) once -it has been selected for service. On devices with costly seeks, -increasing this time usually increases maximum throughput. On the -opposite end, increasing this time coarsens the granularity of the -short-term bandwidth and latency guarantees, especially if the -following parameter is set to zero. 
- -max_budget ----------- - -Maximum amount of service, measured in sectors, that can be provided -to a BFQ queue once it is set in service (of course within the limits -of the above timeout). According to what said in the description of -the algorithm, larger values increase the throughput in proportion to -the percentage of sequential I/O requests issued. The price of larger -values is that they coarsen the granularity of short-term bandwidth -and latency guarantees. - -The default value is 0, which enables auto-tuning: BFQ sets max_budget -to the maximum number of sectors that can be served during -timeout_sync, according to the estimated peak rate. - -weights -------- - -Read-only parameter, used to show the weights of the currently active -BFQ queues. - - -wr_ tunables ------------- - -BFQ exports a few parameters to control/tune the behavior of -low-latency heuristics. - -wr_coeff - -Factor by which the weight of a weight-raised queue is multiplied. If -the queue is deemed soft real-time, then the weight is further -multiplied by an additional, constant factor. - -wr_max_time - -Maximum duration of a weight-raising period for an interactive task -(ms). If set to zero (default value), then this value is computed -automatically, as a function of the peak rate of the device. In any -case, when the value of this parameter is read, it always reports the -current duration, regardless of whether it has been set manually or -computed automatically. - -wr_max_softrt_rate - -Maximum service rate below which a queue is deemed to be associated -with a soft real-time application, and is then weight-raised -accordingly (sectors/sec). - -wr_min_idle_time - -Minimum idle period after which interactive weight-raising may be -reactivated for a queue (in ms). - -wr_rt_max_time - -Maximum weight-raising duration for soft real-time queues (in ms). 
The -start time from which this duration is considered is automatically -moved forward if the queue is detected to be still soft real-time -before the current soft real-time weight-raising period finishes. - -wr_min_inter_arr_async - -Minimum period between I/O request arrivals after which weight-raising -may be reactivated for an already busy async queue (in ms). - - -4. Group scheduling with BFQ -============================ - -BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely -blkio and io. In particular, BFQ supports weight-based proportional -share. To activate cgroups support, set BFQ_GROUP_IOSCHED. - -4-1 Service guarantees provided -------------------------------- - -With BFQ, proportional share means true proportional share of the -device bandwidth, according to group weights. For example, a group -with weight 200 gets twice the bandwidth, and not just twice the time, -of a group with weight 100. - -BFQ supports hierarchies (group trees) of any depth. Bandwidth is -distributed among groups and processes in the expected way: for each -group, the children of the group share the whole bandwidth of the -group in proportion to their weights. In particular, this implies -that, for each leaf group, every process of the group receives the -same share of the whole group bandwidth, unless the ioprio of the -process is modified. - -The resource-sharing guarantee for a group may partially or totally -switch from bandwidth to time, if providing bandwidth guarantees to -the group lowers the throughput too much. This switch occurs on a -per-process basis: if a process of a leaf group causes throughput loss -if served in such a way to receive its share of the bandwidth, then -BFQ switches back to just time-based proportional share for that -process. - -4-2 Interface -------------- - -To get proportional sharing of bandwidth with BFQ for a given device, -BFQ must of course be the active scheduler for that device. 
- -Within each group directory, the names of the files associated with -BFQ-specific cgroup parameters and stats begin with the "bfq." -prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for -BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group -parameter to set the weight of a group with BFQ is blkio.bfq.weight -or io.bfq.weight. - -Parameters to set ------------------ - -For each group, there is only the following parameter to set. - -weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the -group inside its parent. Available values: 1..10000 (default 100). The -linear mapping between ioprio and weights, described at the beginning -of the tunable section, is still valid, but all weights higher than -IOPRIO_BE_NR*10 are mapped to ioprio 0. - -Recall that, if low-latency is set, then BFQ automatically raises the -weight of the queues associated with interactive and soft real-time -applications. Unset this tunable if you need/want to control weights. - - -[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O - Scheduler", Proceedings of the First Workshop on Mobile System - Technologies (MST-2015), May 2015. - http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf - -[2] P. Valente and M. Andreolini, "Improving Application - Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of - the 5th Annual International Systems and Storage Conference - (SYSTOR '12), June 2012. - Slightly extended version: - http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- - results.pdf diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt deleted file mode 100644 index af618171e0eb..000000000000 --- a/Documentation/cgroup-v1/rdma.txt +++ /dev/null @@ -1,109 +0,0 @@ - RDMA Controller - ---------------- - -Contents --------- - -1. Overview - 1-1. What is RDMA controller? - 1-2. Why RDMA controller needed? - 1-3. How is RDMA controller implemented? -2. Usage Examples - -1.
Overview - -1-1. What is RDMA controller? ------------------------------ - -RDMA controller allows user to limit RDMA/IB specific resources that a given -set of processes can use. These processes are grouped using RDMA controller. - -RDMA controller defines two resources which can be limited for processes of a -cgroup. - -1-2. Why RDMA controller needed? --------------------------------- - -Currently user space applications can easily take away all the rdma verb -specific resources such as AH, CQ, QP, MR etc. Due to which other applications -in other cgroup or kernel space ULPs may not even get chance to allocate any -rdma resources. This can lead to service unavailability. - -Therefore RDMA controller is needed through which resource consumption -of processes can be limited. Through this controller different rdma -resources can be accounted. - -1-3. How is RDMA controller implemented? ----------------------------------------- - -RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains -resource accounting per cgroup, per device using resource pool structure. -Each such resource pool is limited up to 64 resources in given resource pool -by rdma cgroup, which can be extended later if required. - -This resource pool object is linked to the cgroup css. Typically there -are 0 to 4 resource pool instances per cgroup, per device in most use cases. -But nothing limits to have it more. At present hundreds of RDMA devices per -single cgroup may not be handled optimally, however there is no -known use case or requirement for such configuration either. - -Since RDMA resources can be allocated from any process and can be freed by any -of the child processes which shares the address space, rdma resources are -always owned by the creator cgroup css. This allows process migration from one -to other cgroup without major complexity of transferring resource ownership; -because such ownership is not really present due to shared nature of -rdma resources.
Linking resources around css also ensures that cgroups can be -deleted after processes migrated. This allow progress migration as well with -active resources, even though that is not a primary use case. - -Whenever RDMA resource charging occurs, owner rdma cgroup is returned to -the caller. Same rdma cgroup should be passed while uncharging the resource. -This also allows process migrated with active RDMA resource to charge -to new owner cgroup for new resource. It also allows to uncharge resource of -a process from previously charged cgroup which is migrated to new cgroup, -even though that is not a primary use case. - -Resource pool object is created in following situations. -(a) User sets the limit and no previous resource pool exist for the device -of interest for the cgroup. -(b) No resource limits were configured, but IB/RDMA stack tries to -charge the resource. So that it correctly uncharge them when applications are -running without limits and later on when limits are enforced during uncharging, -otherwise usage count will drop to negative. - -Resource pool is destroyed if all the resource limits are set to max and -it is the last resource getting deallocated. - -User should set all the limit to max value if it intents to remove/unconfigure -the resource pool for a particular device. - -IB stack honors limits enforced by the rdma controller. When application -query about maximum resource limits of IB device, it returns minimum of -what is configured by user for a given cgroup and what is supported by -IB device. - -Following resources can be accounted by rdma controller. - hca_handle Maximum number of HCA Handles - hca_object Maximum number of HCA Objects - -2. 
Usage Examples ------------------ - -(a) Configure resource limit: -echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max -echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max - -(b) Query resource limit: -cat /sys/fs/cgroup/rdma/2/rdma.max -#Output: -mlx4_0 hca_handle=2 hca_object=2000 -ocrdma1 hca_handle=3 hca_object=max - -(c) Query current usage: -cat /sys/fs/cgroup/rdma/2/rdma.current -#Output: -mlx4_0 hca_handle=1 hca_object=20 -ocrdma1 hca_handle=1 hca_object=23 - -(d) Delete resource limit: -echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e4b6bf4de837..73950fdea31a 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -16,9 +16,7 @@ CONTENTS 1-2. What is cgroup? 2. Basic Operations 2-1. Mounting - 2-2. Organizing Processes and Threads - 2-2-1. Processes - 2-2-2. Threads + 2-2. Organizing Processes 2-3. [Un]populated Notification 2-4. Controlling Controllers 2-4-1. Enabling and Disabling @@ -49,12 +47,6 @@ CONTENTS 5-3. IO 5-3-1. IO Interface Files 5-3-2. Writeback - 5-4. PID - 5-4-1. PID Interface Files - 5-5. Misc - 5-5-1. perf_event - 5-6. RDMA - 5-6-1. RDMA Interface Files 6. Namespace 6-1. Basics 6-2. The Root and Views @@ -151,20 +143,8 @@ during boot, before manual intervention is possible. To make testing and experimenting easier, the kernel parameter cgroup_no_v1= allows disabling controllers in v1 and make them always available in v2. -cgroup v2 currently supports the following mount options. - nsdelegate - - Consider cgroup namespaces as delegation boundaries. This - option is system wide and can only be set on mount or modified - through remount from the init namespace. The mount option is - ignored on non-init namespace mounts. Please refer to the - Delegation section for details. - - -2-2. Organizing Processes and Threads - -2-2-1. Processes +2-2. 
Organizing Processes Initially, only the root cgroup exists to which all processes belong. A child cgroup can be created by creating a sub-directory. @@ -215,104 +195,6 @@ is removed subsequently, " (deleted)" is appended to the path. 0::/test-cgroup/test-cgroup-nested (deleted) -2-2-2. Threads - -cgroup v2 supports thread granularity for a subset of controllers to -support use cases requiring hierarchical resource distribution across -the threads of a group of processes. By default, all threads of a -process belong to the same cgroup, which also serves as the resource -domain to host resource consumptions which are not specific to a -process or thread. The thread mode allows threads to be spread across -a subtree while still maintaining the common resource domain for them. - -Controllers which support thread mode are called threaded controllers. -The ones which don't are called domain controllers. - -Marking a cgroup threaded makes it join the resource domain of its -parent as a threaded cgroup. The parent may be another threaded -cgroup whose resource domain is further up in the hierarchy. The root -of a threaded subtree, that is, the nearest ancestor which is not -threaded, is called threaded domain or thread root interchangeably and -serves as the resource domain for the entire subtree. - -Inside a threaded subtree, threads of a process can be put in -different cgroups and are not subject to the no internal process -constraint - threaded controllers can be enabled on non-leaf cgroups -whether they have threads in them or not. - -As the threaded domain cgroup hosts all the domain resource -consumptions of the subtree, it is considered to have internal -resource consumptions whether there are processes in it or not and -can't have populated child cgroups which aren't threaded. Because the -root cgroup is not subject to no internal process constraint, it can -serve both as a threaded domain and a parent to domain cgroups. 
- -The current operation mode or type of the cgroup is shown in the -"cgroup.type" file which indicates whether the cgroup is a normal -domain, a domain which is serving as the domain of a threaded subtree, -or a threaded cgroup. - -On creation, a cgroup is always a domain cgroup and can be made -threaded by writing "threaded" to the "cgroup.type" file. The -operation is single direction:: - - # echo threaded > cgroup.type - -Once threaded, the cgroup can't be made a domain again. To enable the -thread mode, the following conditions must be met. - -- As the cgroup will join the parent's resource domain. The parent - must either be a valid (threaded) domain or a threaded cgroup. - -- When the parent is an unthreaded domain, it must not have any domain - controllers enabled or populated domain children. The root is - exempt from this requirement. - -Topology-wise, a cgroup can be in an invalid state. Please consider -the following toplogy:: - - A (threaded domain) - B (threaded) - C (domain, just created) - -C is created as a domain but isn't connected to a parent which can -host child domains. C can't be used until it is turned into a -threaded cgroup. "cgroup.type" file will report "domain (invalid)" in -these cases. Operations which fail due to invalid topology use -EOPNOTSUPP as the errno. - -A domain cgroup is turned into a threaded domain when one of its child -cgroup becomes threaded or threaded controllers are enabled in the -"cgroup.subtree_control" file while there are processes in the cgroup. -A threaded domain reverts to a normal domain when the conditions -clear. - -When read, "cgroup.threads" contains the list of the thread IDs of all -threads in the cgroup. Except that the operations are per-thread -instead of per-process, "cgroup.threads" has the same format and -behaves the same way as "cgroup.procs". 
While "cgroup.threads" can be -written to in any cgroup, as it can only move threads inside the same -threaded domain, its operations are confined inside each threaded -subtree. - -The threaded domain cgroup serves as the resource domain for the whole -subtree, and, while the threads can be scattered across the subtree, -all the processes are considered to be in the threaded domain cgroup. -"cgroup.procs" in a threaded domain cgroup contains the PIDs of all -processes in the subtree and is not readable in the subtree proper. -However, "cgroup.procs" can be written to from anywhere in the subtree -to migrate all threads of the matching process to the cgroup. - -Only threaded controllers can be enabled in a threaded subtree. When -a threaded controller is enabled inside a threaded subtree, it only -accounts for and controls resource consumptions associated with the -threads in the cgroup and its descendants. All consumptions which -aren't tied to a specific thread belong to the threaded domain cgroup. - -Because a threaded subtree is exempt from no internal process -constraint, a threaded controller must be able to handle competition -between threads in a non-leaf cgroup and its child cgroups. Each -threaded controller defines how such competitions are handled. - - 2-3. [Un]populated Notification Each non-root cgroup has a "cgroup.events" file which contains @@ -391,15 +273,15 @@ disabled if one or more children have it enabled. 2-4-3. No Internal Process Constraint -Non-root cgroups can distribute domain resources to their children -only when they don't have any processes of their own. In other words, -only domain cgroups which don't contain any processes can have domain -controllers enabled in their "cgroup.subtree_control" files. +Non-root cgroups can only distribute resources to their children when +they don't have any processes of their own. 
In other words, only +cgroups which don't contain any processes can have controllers enabled +in their "cgroup.subtree_control" files. -This guarantees that, when a domain controller is looking at the part -of the hierarchy which has it enabled, processes are always only on -the leaves. This rules out situations where child cgroups compete -against internal processes of the parent. +This guarantees that, when a controller is looking at the part of the +hierarchy which has it enabled, processes are always only on the +leaves. This rules out situations where child cgroups compete against +internal processes of the parent. The root cgroup is exempt from this restriction. Root contains processes and anonymous resource consumption which can't be associated @@ -420,27 +302,18 @@ file. 2-5-1. Model of Delegation -A cgroup can be delegated in two ways. First, to a less privileged -user by granting write access of the directory and its "cgroup.procs", -"cgroup.threads" and "cgroup.subtree_control" files to the user. -Second, if the "nsdelegate" mount option is set, automatically to a -cgroup namespace on namespace creation. - -Because the resource control interface files in a given directory -control the distribution of the parent's resources, the delegatee -shouldn't be allowed to write to them. For the first method, this is -achieved by not granting access to these files. For the second, the -kernel rejects writes to all files other than "cgroup.procs" and -"cgroup.subtree_control" on a namespace root from inside the -namespace. - -The end results are equivalent for both delegation types. Once -delegated, the user can build sub-hierarchy under the directory, -organize processes inside it as it sees fit and further distribute the -resources it received from the parent. 
The limits and other settings -of all resource controllers are hierarchical and regardless of what -happens in the delegated sub-hierarchy, nothing can escape the -resource restrictions imposed by the parent. +A cgroup can be delegated to a less privileged user by granting write +access of the directory and its "cgroup.procs" file to the user. Note +that resource control interface files in a given directory control the +distribution of the parent's resources and thus must not be delegated +along with the directory. + +Once delegated, the user can build sub-hierarchy under the directory, +organize processes as it sees fit and further distribute the resources +it received from the parent. The limits and other settings of all +resource controllers are hierarchical and regardless of what happens +in the delegated sub-hierarchy, nothing can escape the resource +restrictions imposed by the parent. Currently, cgroup doesn't impose any restrictions on the number of cgroups in or nesting depth of a delegated sub-hierarchy; however, @@ -450,19 +323,19 @@ this may be limited explicitly in the future. 2-5-2. Delegation Containment A delegated sub-hierarchy is contained in the sense that processes -can't be moved into or out of the sub-hierarchy by the delegatee. +can't be moved into or out of the sub-hierarchy by the delegatee. For +a process with a non-root euid to migrate a target process into a +cgroup by writing its PID to the "cgroup.procs" file, the following +conditions must be met. -For delegations to a less privileged user, this is achieved by -requiring the following conditions for a process with a non-root euid -to migrate a target process into a cgroup by writing its PID to the -"cgroup.procs" file. +- The writer's euid must match either uid or suid of the target process. - The writer must have write access to the "cgroup.procs" file. - The writer must have write access to the "cgroup.procs" file of the common ancestor of the source and destination cgroups. 
-The above two constraints ensure that while a delegatee may migrate +The above three constraints ensure that while a delegatee may migrate processes around freely in the delegated sub-hierarchy it can't pull in from or push out to outside the sub-hierarchy. @@ -477,15 +350,10 @@ all processes under C0 and C1 belong to U0. Let's also say U0 wants to write the PID of a process which is currently in C10 into "C00/cgroup.procs". U0 has write access to the -file; however, the common ancestor of the source cgroup C10 and the -destination cgroup C00 is above the points of delegation and U0 would -not have write access to its "cgroup.procs" files and thus the write -will be denied with -EACCES. - -For delegations to namespaces, containment is achieved by requiring -that both the source and destination cgroups are reachable from the -namespace of the process which is attempting the migration. If either -is not reachable, the migration is rejected with -ENOENT. +file and uid match on the process; however, the common ancestor of the +source cgroup C10 and the destination cgroup C00 is above the points +of delegation and U0 would not have write access to its "cgroup.procs" +files and thus the write will be denied with -EACCES. 2-6. Guidelines @@ -718,29 +586,6 @@ may be specified in any order and not all pairs have to be specified. All cgroup core files are prefixed with "cgroup." - cgroup.type - - A read-write single value file which exists on non-root - cgroups. - - When read, it indicates the current type of the cgroup, which - can be one of the following values. - - - "domain" : A normal valid domain cgroup. - - - "domain threaded" : A threaded domain cgroup which is - serving as the root of a threaded subtree. - - - "domain invalid" : A cgroup which is in an invalid state. - It can't be populated or have controllers enabled. It may - be allowed to become a threaded cgroup. - - - "threaded" : A threaded cgroup which is a member of a - threaded subtree. 
- - A cgroup can be turned into a threaded cgroup by writing - "threaded" to this file. - cgroup.procs A read-write new-line separated values file which exists on @@ -756,36 +601,10 @@ All cgroup core files are prefixed with "cgroup." the PID to the cgroup. The writer should match all of the following conditions. - - It must have write access to the "cgroup.procs" file. + - Its euid is either root or must match either uid or suid of + the target process. - - It must have write access to the "cgroup.procs" file of the - common ancestor of the source and destination cgroups. - - When delegating a sub-hierarchy, write access to this file - should be granted along with the containing directory. - - In a threaded cgroup, reading this file fails with EOPNOTSUPP - as all the processes belong to the thread root. Writing is - supported and moves every thread of the process to the cgroup. - - cgroup.threads - A read-write new-line separated values file which exists on - all cgroups. - - When read, it lists the TIDs of all threads which belong to - the cgroup one-per-line. The TIDs are not ordered and the - same TID may show up more than once if the thread got moved to - another cgroup and then back or the TID got recycled while - reading. - - A TID can be written to migrate the thread associated with the - TID to the cgroup. The writer should match all of the - following conditions. - - - It must have write access to the "cgroup.threads" file. - - - The cgroup that the thread is currently in must be in the - same resource domain as the destination cgroup. + - It must have write access to the "cgroup.procs" file. - It must have write access to the "cgroup.procs" file of the common ancestor of the source and destination cgroups. @@ -829,38 +648,6 @@ All cgroup core files are prefixed with "cgroup." 1 if the cgroup or its descendants contains any live processes; otherwise, 0. - cgroup.max.descendants - A read-write single value files. The default is "max". 
- - Maximum allowed number of descent cgroups. - If the actual number of descendants is equal or larger, - an attempt to create a new cgroup in the hierarchy will fail. - - cgroup.max.depth - A read-write single value files. The default is "max". - - Maximum allowed descent depth below the current cgroup. - If the actual descent depth is equal or larger, - an attempt to create a new child cgroup will fail. - - cgroup.stat - A read-only flat-keyed file with the following entries: - - nr_descendants - Total number of visible descendant cgroups. - - nr_dying_descendants - Total number of dying descendant cgroups. A cgroup becomes - dying after being deleted by a user. The cgroup will remain - in dying state for some time undefined time (which can depend - on system load) before being completely destroyed. - - A process can't enter a dying cgroup under any circumstances, - a dying cgroup can't revive. - - A dying cgroup can consume system resources not exceeding - limits, which were active at the moment of cgroup deletion. - 5. Controllers @@ -1350,92 +1137,6 @@ writeback as follows. vm.dirty[_background]_ratio. -5-4. PID - -The process number controller is used to allow a cgroup to stop any -new tasks from being fork()'d or clone()'d after a specified limit is -reached. - -The number of tasks in a cgroup can be exhausted in ways which other -controllers cannot prevent, thus warranting its own controller. For -example, a fork bomb is likely to exhaust the number of tasks before -hitting memory restrictions. - -Note that PIDs used in this controller refer to TIDs, process IDs as -used by the kernel. - - -5-4-1. PID Interface Files - - pids.max - - A read-write single value file which exists on non-root - cgroups. The default is "max". - - Hard limit of number of processes. - - pids.current - - A read-only single value file which exists on all cgroups. - - The number of processes currently in the cgroup and its - descendants. 
- -Organisational operations are not blocked by cgroup policies, so it is -possible to have pids.current > pids.max. This can be done by either -setting the limit to be smaller than pids.current, or attaching enough -processes to the cgroup such that pids.current is larger than -pids.max. However, it is not possible to violate a cgroup PID policy -through fork() or clone(). These will return -EAGAIN if the creation -of a new process would cause a cgroup policy to be violated. - - -5-5. Misc - -5-5-1. perf_event - -perf_event controller, if not mounted on a legacy hierarchy, is -automatically enabled on the v2 hierarchy so that perf events can -always be filtered by cgroup v2 path. The controller can still be -moved to a legacy hierarchy after v2 hierarchy is populated. - - -5-6. RDMA - -The "rdma" controller regulates the distribution and accounting of -of RDMA resources. - -5-6-1. RDMA Interface Files - - rdma.max - A readwrite nested-keyed file that exists for all the cgroups - except root that describes current configured resource limit - for a RDMA/IB device. - - Lines are keyed by device name and are not ordered. - Each line contains space separated resource name and its configured - limit that can be distributed. - - The following nested keys are defined. - - hca_handle Maximum number of HCA Handles - hca_object Maximum number of HCA Objects - - An example for mlx4 and ocrdma device follows. - - mlx4_0 hca_handle=2 hca_object=2000 - ocrdma1 hca_handle=3 hca_object=max - - rdma.current - A read-only file that describes current resource usage. - It exists for all the cgroup except root. - - An example for mlx4 and ocrdma device follows. - - mlx4_0 hca_handle=1 hca_object=20 - ocrdma1 hca_handle=1 hca_object=23 - - 6. Namespace 6-1. Basics @@ -1623,7 +1324,7 @@ D. Deprecated v1 Core Features - Multiple hierarchies including named ones are not supported. -- All v1 mount options are not supported. +- All mount options and remounting are not supported. 
- The "tasks" file is removed and "cgroup.procs" is not sorted. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9afba613a5c3..a66de7db0118 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -692,14 +692,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Specifying "pressure" disables per-cgroup pressure stall information accounting feature - cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1 - Format: { { controller | "all" | "named" } - [,{ controller | "all" | "named" }...] } + cgroup_no_v1= [KNL] Disable one, multiple, all cgroup controllers in v1 + Format: { controller[,controller...] | "all" } Like cgroup_disable, but only applies to cgroup v1; the blacklisted controllers remain available in cgroup2. - "all" blacklists all controllers and "named" disables - named mounts. Specifying both "all" and "named" disables - all v1 hierarchies. cgroup.memory= [KNL] Pass options to the cgroup memory controller. Format: diff --git a/Makefile b/Makefile index 64d8b31c7b5d..a2946c935d83 100644 --- a/Makefile +++ b/Makefile @@ -87,10 +87,16 @@ endif # If the user is running make -s (silent mode), suppress echoing of # commands +# make-4.0 (and later) keep single letter options in the 1st word of MAKEFLAGS. 
-ifneq ($(findstring s,$(filter-out --%,$(MAKEFLAGS))),) - quiet=silent_ - tools_silent=s +ifeq ($(filter 3.%,$(MAKE_VERSION)),) +silence:=$(findstring s,$(firstword -$(MAKEFLAGS))) +else +silence:=$(findstring s,$(filter-out --%,$(MAKEFLAGS))) +endif + +ifeq ($(silence),s) +quiet=silent_ endif export quiet Q KBUILD_VERBOSE diff --git a/README.md b/README.md deleted file mode 100644 index 4a7cd21638cd..000000000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -已停更,随缘更新 \ No newline at end of file diff --git a/arch/arm64/configs/lxc.config b/arch/arm64/configs/lxc.config new file mode 100644 index 000000000000..7c2967ce7218 --- /dev/null +++ b/arch/arm64/configs/lxc.config @@ -0,0 +1,44 @@ +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y + +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_PID_NS=y +CONFIG_USER_NS=y +CONFIG_NET_NS=y + +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_PIDS=y +CONFIG_MEMCG=y +CONFIG_CPUSETS=y + +CONFIG_VETH=y +CONFIG_MACVLAN=y +CONFIG_VLAN_8021Q=y +CONFIG_BRIDGE=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_NF_NAT_IPV4=y +CONFIG_NF_NAT_IPV6=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP6_NF_TARGET_MASQUERADE=y +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_FUSE_FS=y + +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_FHANDLE=y +CONFIG_EVENTFD=y +CONFIG_EPOLL=y +CONFIG_UNIX_DIAG=y +CONFIG_INET_DIAG=y +CONFIG_PACKET_DIAG=y +CONFIG_NETLINK_DIAG=y + +CONFIG_BINFMT_MISC=y + +CONFIG_ANDROID_PARANOID_NETWORK=n \ No newline at end of file diff --git a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig index 3768440716ca..1ff7a9286950 100644 --- a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig +++ b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig @@ -71,7 +71,7 @@ CONFIG_PCI_MSM=y CONFIG_SCHED_MC=y CONFIG_NR_CPUS=8 CONFIG_PREEMPT=y -CONFIG_HZ_100=y +CONFIG_HZ_300=y CONFIG_ANON_MIN_KBYTES=196608 
CONFIG_CLEAN_LOW_KBYTES=393216 CONFIG_CLEAN_MIN_KBYTES=196608 @@ -218,6 +218,7 @@ CONFIG_IP6_NF_IPTABLES_128=y CONFIG_IP6_NF_MATCH_RPFILTER=y CONFIG_IP6_NF_TARGET_HL=y CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_NAT=y CONFIG_IP6_NF_TARGET_REJECT=y CONFIG_IP6_NF_MANGLE=y CONFIG_IP6_NF_RAW=y @@ -634,5 +635,4 @@ CONFIG_SND_SOC_WCD_MBHC_ADC=y CONFIG_SND_SOC_WCD_SPI=y CONFIG_SOUNDWIRE=y CONFIG_WCD_SPI_AC=y -CONFIG_REKERNEL=y -CONFIG_REKERNEL_NETWORK=y +CONFIG_KSU=y diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 75ee7ba34ebb..421bef9c4c48 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -39,28 +39,9 @@ config CFQ_GROUP_IOSCHED ---help--- Enable group IO scheduling in CFQ. -config IOSCHED_BFQ - tristate "BFQ I/O scheduler" - default y - ---help--- - The BFQ I/O scheduler distributes bandwidth among all - processes according to their weights, regardless of the - device parameters and with any workload. It also guarantees - a low latency to interactive and soft real-time applications. - Details in Documentation/block/bfq-iosched.txt - -config BFQ_GROUP_IOSCHED - bool "BFQ hierarchical scheduling support" - depends on IOSCHED_BFQ && BLK_CGROUP - default n - ---help--- - - Enable hierarchical scheduling in BFQ, using the blkio - (cgroups-v1) or io (cgroups-v2) controller. - choice prompt "Default I/O scheduler" - default DEFAULT_BFQ + default DEFAULT_CFQ help Select the I/O scheduler which will be used by default for all block devices. @@ -74,16 +55,6 @@ choice config DEFAULT_NOOP bool "No-op" - config DEFAULT_BFQ - bool "BFQ" if IOSCHED_BFQ=y - help - Selects BFQ as the default I/O scheduler which will be - used by default for all block devices. - The BFQ I/O scheduler aims at distributing the bandwidth - as desired, independently of the disk parameters and with - any workload. It also tries to guarantee low latency to - interactive and soft real-time applications. 
- endchoice config DEFAULT_IOSCHED @@ -91,7 +62,6 @@ config DEFAULT_IOSCHED default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP - default "bfq" if DEFAULT_BFQ endmenu diff --git a/block/Makefile b/block/Makefile index 736e91a2ca1c..36acdd7545be 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c deleted file mode 100644 index 52484f10bb6f..000000000000 --- a/block/bfq-cgroup.c +++ /dev/null @@ -1,1191 +0,0 @@ -/* - * BFQ: CGROUPS support. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2015 Paolo Valente - * - * Copyright (C) 2016 Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ - * file. - */ - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - -/* bfqg stats flags */ -enum bfqg_stats_flags { - BFQG_stats_waiting = 0, - BFQG_stats_idling, - BFQG_stats_empty, -}; - -#define BFQG_FLAG_FNS(name) \ -static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -{ \ - stats->flags |= (1 << BFQG_stats_##name); \ -} \ -static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -{ \ - stats->flags &= ~(1 << BFQG_stats_##name); \ -} \ -static int bfqg_stats_##name(struct bfqg_stats *stats) \ -{ \ - return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -} \ - -BFQG_FLAG_FNS(waiting) -BFQG_FLAG_FNS(idling) -BFQG_FLAG_FNS(empty) -#undef BFQG_FLAG_FNS - -/* This should be called with the queue_lock held. 
*/ -static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -{ - unsigned long long now; - - if (!bfqg_stats_waiting(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) - blkg_stat_add(&stats->group_wait_time, - now - stats->start_group_wait_time); - bfqg_stats_clear_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (bfqg_stats_waiting(stats)) - return; - if (bfqg == curr_bfqg) - return; - stats->start_group_wait_time = sched_clock(); - bfqg_stats_mark_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -{ - unsigned long long now; - - if (!bfqg_stats_empty(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) - blkg_stat_add(&stats->empty_time, - now - stats->start_empty_time); - bfqg_stats_clear_empty(stats); -} - -static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -{ - blkg_stat_add(&bfqg->stats.dequeue, 1); -} - -static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (blkg_rwstat_total(&stats->queued)) - return; - - /* - * group is already marked empty. This can happen if bfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. 
- */ - if (bfqg_stats_empty(stats)) - return; - - stats->start_empty_time = sched_clock(); - bfqg_stats_mark_empty(stats); -} - -static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - if (bfqg_stats_idling(stats)) { - unsigned long long now = sched_clock(); - - if (time_after64(now, stats->start_idle_time)) - blkg_stat_add(&stats->idle_time, - now - stats->start_idle_time); - bfqg_stats_clear_idling(stats); - } -} - -static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - stats->start_idle_time = sched_clock(); - bfqg_stats_mark_idling(stats); -} - -static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -{ - struct bfqg_stats *stats = &bfqg->stats; - - blkg_stat_add(&stats->avg_queue_size_sum, - blkg_rwstat_total(&stats->queued)); - blkg_stat_add(&stats->avg_queue_size_samples, 1); - bfqg_stats_update_group_wait_time(stats); -} - -static struct blkcg_policy blkcg_policy_bfq; - -/* - * blk-cgroup policy-related handlers - * The following functions help in converting between blk-cgroup - * internal structures and BFQ-specific structures. - */ - -static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct bfq_group, pd) : NULL; -} - -static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) -{ - return pd_to_blkg(&bfqg->pd); -} - -static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) -{ - struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); - - return pd_to_bfqg(pd); -} - -/* - * bfq_group handlers - * The following functions help in navigating the bfq_group hierarchy - * by allowing to find the parent of a bfq_group or the bfq_group - * associated to a bfq_queue. - */ - -static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) -{ - struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; - - return pblkg ? 
blkg_to_bfqg(pblkg) : NULL; -} - -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - return group_entity ? container_of(group_entity, struct bfq_group, - entity) : - bfqq->bfqd->root_group; -} - -/* - * The following two functions handle get and put of a bfq_group by - * wrapping the related blk-cgroup hooks. - */ - -static void bfqg_get(struct bfq_group *bfqg) -{ - return blkg_get(bfqg_to_blkg(bfqg)); -} - -static void bfqg_put(struct bfq_group *bfqg) -{ - return blkg_put(bfqg_to_blkg(bfqg)); -} - -static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, - int op, int op_flags) -{ - blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, 1); - bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) - bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -} - -static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, - int op_flags) -{ - blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, -1); -} - -static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, - int op_flags) -{ - blkg_rwstat_add(&bfqg->stats.merged, op, op_flags, 1); -} - -static void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) -{ - struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, op_flags, - now - io_start_time); - if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, op_flags, - io_start_time - start_time); -} - -/* @stats = 0 */ -static void bfqg_stats_reset(struct bfqg_stats *stats) -{ - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); - blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); - 
blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); - blkg_stat_reset(&stats->dequeue); - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -} - -/* @to += @from */ -static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) -{ - if (!to || !from) - return; - - /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->merged, &from->merged); - blkg_rwstat_add_aux(&to->service_time, &from->service_time); - blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); - blkg_stat_add_aux(&from->time, &from->time); - blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_add_aux(&to->avg_queue_size_samples, - &from->avg_queue_size_samples); - blkg_stat_add_aux(&to->dequeue, &from->dequeue); - blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); - blkg_stat_add_aux(&to->idle_time, &from->idle_time); - blkg_stat_add_aux(&to->empty_time, &from->empty_time); -} - -/* - * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' - * recursive stats can still account for the amount used by this bfqg after - * it's gone. 
- */ -static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) -{ - struct bfq_group *parent; - - if (!bfqg) /* root_group */ - return; - - parent = bfqg_parent(bfqg); - - lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); - - if (unlikely(!parent)) - return; - - bfqg_stats_add_aux(&parent->stats, &bfqg->stats); - bfqg_stats_reset(&bfqg->stats); -} - -static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; - bfqg_get(bfqg); - } - entity->parent = bfqg->my_entity; /* NULL for root group */ - entity->sched_data = &bfqg->sched_data; -} - -static void bfqg_stats_exit(struct bfqg_stats *stats) -{ - blkg_rwstat_exit(&stats->merged); - blkg_rwstat_exit(&stats->service_time); - blkg_rwstat_exit(&stats->wait_time); - blkg_rwstat_exit(&stats->queued); - blkg_stat_exit(&stats->time); - blkg_stat_exit(&stats->avg_queue_size_sum); - blkg_stat_exit(&stats->avg_queue_size_samples); - blkg_stat_exit(&stats->dequeue); - blkg_stat_exit(&stats->group_wait_time); - blkg_stat_exit(&stats->idle_time); - blkg_stat_exit(&stats->empty_time); -} - -static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) -{ - if (blkg_rwstat_init(&stats->merged, gfp) || - blkg_rwstat_init(&stats->service_time, gfp) || - blkg_rwstat_init(&stats->wait_time, gfp) || - blkg_rwstat_init(&stats->queued, gfp) || - blkg_stat_init(&stats->time, gfp) || - blkg_stat_init(&stats->avg_queue_size_sum, gfp) || - blkg_stat_init(&stats->avg_queue_size_samples, gfp) || - blkg_stat_init(&stats->dequeue, gfp) || - blkg_stat_init(&stats->group_wait_time, gfp) || - blkg_stat_init(&stats->idle_time, gfp) || - blkg_stat_init(&stats->empty_time, gfp)) { - bfqg_stats_exit(stats); - return -ENOMEM; - } - - return 0; -} - -static struct bfq_group_data 
*cpd_to_bfqgd(struct blkcg_policy_data *cpd) -{ - return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; -} - -static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -{ - return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -} - -static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -{ - struct bfq_group_data *bgd; - - bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); - if (!bgd) - return NULL; - return &bgd->pd; -} - -static void bfq_cpd_init(struct blkcg_policy_data *cpd) -{ - struct bfq_group_data *d = cpd_to_bfqgd(cpd); - - d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? - CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; -} - -static void bfq_cpd_free(struct blkcg_policy_data *cpd) -{ - kfree(cpd_to_bfqgd(cpd)); -} - -static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -{ - struct bfq_group *bfqg; - - bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); - if (!bfqg) - return NULL; - - if (bfqg_stats_init(&bfqg->stats, gfp)) { - kfree(bfqg); - return NULL; - } - - return &bfqg->pd; -} - -static void bfq_pd_init(struct blkg_policy_data *pd) -{ - struct blkcg_gq *blkg; - struct bfq_group *bfqg; - struct bfq_data *bfqd; - struct bfq_entity *entity; - struct bfq_group_data *d; - - blkg = pd_to_blkg(pd); - BUG_ON(!blkg); - bfqg = blkg_to_bfqg(blkg); - bfqd = blkg->q->elevator->elevator_data; - entity = &bfqg->entity; - d = blkcg_to_bfqgd(blkg->blkcg); - - entity->orig_weight = entity->weight = entity->new_weight = d->weight; - entity->my_sched_data = &bfqg->sched_data; - bfqg->my_entity = entity; /* - * the root_group's will be set to NULL - * in bfq_init_queue() - */ - bfqg->bfqd = bfqd; - bfqg->active_entities = 0; - bfqg->rq_pos_tree = RB_ROOT; -} - -static void bfq_pd_free(struct blkg_policy_data *pd) -{ - struct bfq_group *bfqg = pd_to_bfqg(pd); - - bfqg_stats_exit(&bfqg->stats); - return kfree(bfqg); -} - -static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -{ - struct bfq_group *bfqg = pd_to_bfqg(pd); - - 
bfqg_stats_reset(&bfqg->stats); -} - -static void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) -{ - struct bfq_entity *entity; - - BUG_ON(!parent); - BUG_ON(!bfqg); - BUG_ON(bfqg == parent); - - entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; -} - -static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - struct blkcg_gq *blkg; - - blkg = blkg_lookup(blkcg, bfqd->queue); - if (likely(blkg)) - return blkg_to_bfqg(blkg); - return NULL; -} - -static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - struct bfq_group *bfqg, *parent; - struct bfq_entity *entity; - - assert_spin_locked(bfqd->queue->queue_lock); - - bfqg = bfq_lookup_bfqg(bfqd, blkcg); - - if (unlikely(!bfqg)) - return NULL; - - /* - * Update chain of bfq_groups as we might be handling a leaf group - * which, along with some of its relatives, has not been hooked yet - * to the private hierarchy of BFQ. - */ - entity = &bfqg->entity; - for_each_entity(entity) { - bfqg = container_of(entity, struct bfq_group, entity); - BUG_ON(!bfqg); - if (bfqg != bfqd->root_group) { - parent = bfqg_parent(bfqg); - if (!parent) - parent = bfqd->root_group; - BUG_ON(!parent); - bfq_group_set_parent(bfqg, parent); - } - } - - return bfqg; -} - -static void bfq_pos_tree_add_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq); - -static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - bool compensate, - enum bfqq_expiration reason); - -/** - * bfq_bfqq_move - migrate @bfqq to @bfqg. - * @bfqd: queue descriptor. - * @bfqq: the queue to move. - * @bfqg: the group to move to. - * - * Move @bfqq to @bfqg, deactivating it from its old group and reactivating - * it on the new one. Avoid putting the entity on the old group idle tree. 
- * - * Must be called under the queue lock; the cgroup owning @bfqg must - * not disappear (by now this just means that we are called under - * rcu_read_lock()). - */ -static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) -{ - struct bfq_entity *entity = &bfqq->entity; - - BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); - BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) - && entity->on_st && - bfqq != bfqd->in_service_queue); - BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); - - /* If bfqq is empty, then bfq_bfqq_expire also invokes - * bfq_del_bfqq_busy, thereby removing bfqq and its entity - * from data structures related to current group. Otherwise we - * need to remove bfqq explicitly with bfq_deactivate_bfqq, as - * we do below. - */ - if (bfqq == bfqd->in_service_queue) - bfq_bfqq_expire(bfqd, bfqd->in_service_queue, - false, BFQ_BFQQ_PREEMPTED); - - BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) - && &bfq_entity_service_tree(entity)->idle != - entity->tree); - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - - if (bfq_bfqq_busy(bfqq)) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); - else if (entity->on_st) { - BUG_ON(&bfq_entity_service_tree(entity)->idle != - entity->tree); - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } - bfqg_put(bfqq_group(bfqq)); - - /* - * Here we use a reference to bfqg. We don't need a refcounter - * as the cgroup reference will not be dropped, so that its - * destroy() callback will not be invoked. 
- */ - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; - bfqg_get(bfqg); - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - if (bfq_bfqq_busy(bfqq)) { - bfq_pos_tree_add_move(bfqd, bfqq); - bfq_activate_bfqq(bfqd, bfqq); - } - - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); - BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) - && &bfq_entity_service_tree(entity)->idle != - entity->tree); -} - -/** - * __bfq_bic_change_cgroup - move @bic to @cgroup. - * @bfqd: the queue descriptor. - * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. - * - * Move bic to blkcg, assuming that bfqd->queue is locked; the caller - * has to make sure that the reference to cgroup is valid across the call. - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup - * time here, at the price of slightly more complex code. - */ -static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct blkcg *blkcg) -{ - struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); - struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_group *bfqg; - struct bfq_entity *entity; - - lockdep_assert_held(bfqd->queue->queue_lock); - - bfqg = bfq_find_set_group(bfqd, blkcg); - - if (unlikely(!bfqg)) - bfqg = bfqd->root_group; - - if (async_bfqq) { - entity = &async_bfqq->entity; - - if (entity->sched_data != &bfqg->sched_data) { - bic_set_bfqq(bic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, - "bic_change_group: %p %d", - async_bfqq, - async_bfqq->ref); - bfq_put_queue(async_bfqq); - } - } - - if (sync_bfqq) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, bfqg); - } - - return bfqg; -} - -static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -{ - struct bfq_data *bfqd = bic_to_bfqd(bic); - struct bfq_group 
*bfqg = NULL; - uint64_t serial_nr; - - rcu_read_lock(); - serial_nr = bio_blkcg(bio)->css.serial_nr; - - /* - * Check whether blkcg has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. - */ - if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) - goto out; - - bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); - bic->blkcg_serial_nr = serial_nr; -out: - rcu_read_unlock(); -} - -/** - * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. - * @st: the service tree being flushed. - */ -static void bfq_flush_idle_tree(struct bfq_service_tree *st) -{ - struct bfq_entity *entity = st->first_idle; - - for (; entity ; entity = st->first_idle) - __bfq_deactivate_entity(entity, false); -} - -/** - * bfq_reparent_leaf_entity - move leaf entity to the root_group. - * @bfqd: the device data structure with the root group. - * @entity: the entity to move. - */ -static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(!bfqq); - bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); -} - -/** - * bfq_reparent_active_entities - move to the root group all active - * entities. - * @bfqd: the device data structure with the root group. - * @bfqg: the group to move from. - * @st: the service tree with the entities. - * - * Needs queue_lock to be taken and reference to be valid over the call. 
- */ -static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) -{ - struct rb_root *active = &st->active; - struct bfq_entity *entity = NULL; - - if (!RB_EMPTY_ROOT(&st->active)) - entity = bfq_entity_of(rb_first(active)); - - for (; entity ; entity = bfq_entity_of(rb_first(active))) - bfq_reparent_leaf_entity(bfqd, entity); - - if (bfqg->sched_data.in_service_entity) - bfq_reparent_leaf_entity(bfqd, - bfqg->sched_data.in_service_entity); -} - -/** - * bfq_pd_offline - deactivate the entity associated with @pd, - * and reparent its children entities. - * @pd: descriptor of the policy going offline. - * - * blkio already grabs the queue_lock for us, so no need to use - * RCU-based magic - */ -static void bfq_pd_offline(struct blkg_policy_data *pd) -{ - struct bfq_service_tree *st; - struct bfq_group *bfqg; - struct bfq_data *bfqd; - struct bfq_entity *entity; - int i; - - BUG_ON(!pd); - bfqg = pd_to_bfqg(pd); - BUG_ON(!bfqg); - bfqd = bfqg->bfqd; - BUG_ON(bfqd && !bfqd->root_group); - - entity = bfqg->my_entity; - - if (!entity) /* root group */ - return; - - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself. - */ - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { - st = bfqg->sched_data.service_tree + i; - /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different - * cgroup from the one being destroyed now. No one else - * can access them so it's safe to act without any lock. - */ - bfq_flush_idle_tree(st); - - /* - * It may happen that some queues are still active - * (busy) upon group destruction (if the corresponding - * processes have been forced to terminate). We move - * all the leaf entities corresponding to these queues - * to the root_group. - * Also, it may happen that the group has an entity - * in service, which is disconnected from the active - * tree: it must be moved, too. 
- * There is no need to put the sync queues, as the - * scheduler has taken no reference. - */ - bfq_reparent_active_entities(bfqd, bfqg, st); - BUG_ON(!RB_EMPTY_ROOT(&st->active)); - BUG_ON(!RB_EMPTY_ROOT(&st->idle)); - } - BUG_ON(bfqg->sched_data.next_in_service); - BUG_ON(bfqg->sched_data.in_service_entity); - - __bfq_deactivate_entity(entity, false); - bfq_put_async_queues(bfqd, bfqg); - - /* - * @blkg is going offline and will be ignored by - * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so - * that they don't get lost. If IOs complete after this point, the - * stats for them will be lost. Oh well... - */ - bfqg_stats_xfer_dead(bfqg); -} - -static void bfq_end_wr_async(struct bfq_data *bfqd) -{ - struct blkcg_gq *blkg; - - list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - BUG_ON(!bfqg); - - bfq_end_wr_async_queues(bfqd, bfqg); - } - bfq_end_wr_async_queues(bfqd, bfqd->root_group); -} - -static int bfq_io_show_weight(struct seq_file *sf, void *v) -{ - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - unsigned int val = 0; - - if (bfqgd) - val = bfqgd->weight; - - seq_printf(sf, "%u\n", val); - - return 0; -} - -static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, - struct cftype *cftype, - u64 val) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - struct blkcg_gq *blkg; - int ret = -ERANGE; - - if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) - return ret; - - ret = 0; - spin_lock_irq(&blkcg->lock); - bfqgd->weight = (unsigned short)val; - hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - - if (!bfqg) - continue; - /* - * Setting the prio_changed flag of the entity - * to 1 with new_weight == weight would re-set - * the value of the weight to its ioprio mapping. - * Set the flag only if necessary. 
- */ - if ((unsigned short)val != bfqg->entity.new_weight) { - bfqg->entity.new_weight = (unsigned short)val; - /* - * Make sure that the above new value has been - * stored in bfqg->entity.new_weight before - * setting the prio_changed flag. In fact, - * this flag may be read asynchronously (in - * critical sections protected by a different - * lock than that held here), and finding this - * flag set may cause the execution of the code - * for updating parameters whose value may - * depend also on bfqg->entity.new_weight (in - * __bfq_entity_update_weight_prio). - * This barrier makes sure that the new value - * of bfqg->entity.new_weight is correctly - * seen in that code. - */ - smp_wmb(); - bfqg->entity.prio_changed = 1; - } - } - spin_unlock_irq(&blkcg->lock); - - return ret; -} - -static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - u64 weight; - /* First unsigned long found in the file is used */ - int ret = kstrtoull(strim(buf), 0, &weight); - - if (ret) - return ret; - - return bfq_io_set_weight_legacy(of_css(of), NULL, weight); -} - -static int bfqg_print_stat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_bfq, seq_cft(sf)->private, false); - return 0; -} - -static int bfqg_print_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_bfq, seq_cft(sf)->private, true); - return 0; -} - -static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_bfq, off); - return __blkg_prfill_u64(sf, pd, sum); -} - -static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_bfq, - off); - return __blkg_prfill_rwstat(sf, pd, &sum); -} - 
-static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_stat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, false); - return 0; -} - -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, true); - return 0; -} - -static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, - int off) -{ - u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); - - return __blkg_prfill_u64(sf, pd, sum >> 9); -} - -static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); - return 0; -} - -static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes)); - u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + - atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); - - return __blkg_prfill_u64(sf, pd, sum >> 9); -} - -static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, - false); - return 0; -} - - -static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct bfq_group *bfqg = pd_to_bfqg(pd); - u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); - u64 v = 0; - - if (samples) { - v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); - v = div64_u64(v, samples); - } - __blkg_prfill_u64(sf, pd, v); - return 0; -} - -/* print avg_queue_size */ -static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_avg_queue_size, 
&blkcg_policy_bfq, - 0, false); - return 0; -} - -static struct bfq_group * -bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -{ - int ret; - - ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); - if (ret) - return NULL; - - return blkg_to_bfqg(bfqd->queue->root_blkg); -} - -static struct cftype bfq_blkcg_legacy_files[] = { - { - .name = "bfq.weight", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write_u64 = bfq_io_set_weight_legacy, - }, - - /* statistics, covers only the tasks in the bfqg */ - { - .name = "bfq.time", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.sectors", - .seq_show = bfqg_print_stat_sectors, - }, - { - .name = "bfq.io_service_bytes", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, - }, - { - .name = "bfq.io_serviced", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, - }, - { - .name = "bfq.io_service_time", - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_wait_time", - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_merged", - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat, - }, - { - .name = "bfq.io_queued", - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, - - /* the same statictics which cover the bfqg and its descendants */ - { - .name = "bfq.time_recursive", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = "bfq.sectors_recursive", - .seq_show = bfqg_print_stat_sectors_recursive, - }, - { - .name = "bfq.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, - }, - { - .name = "bfq.io_serviced_recursive", - .private = 
(unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, - }, - { - .name = "bfq.io_service_time_recursive", - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_wait_time_recursive", - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_merged_recursive", - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.io_queued_recursive", - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat_recursive, - }, - { - .name = "bfq.avg_queue_size", - .seq_show = bfqg_print_avg_queue_size, - }, - { - .name = "bfq.group_wait_time", - .private = offsetof(struct bfq_group, stats.group_wait_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.idle_time", - .private = offsetof(struct bfq_group, stats.idle_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.empty_time", - .private = offsetof(struct bfq_group, stats.empty_time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.dequeue", - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, - { } /* terminate */ -}; - -static struct cftype bfq_blkg_files[] = { - { - .name = "bfq.weight", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write = bfq_io_set_weight, - }, - {} /* terminate */ -}; - -#else /* CONFIG_BFQ_GROUP_IOSCHED */ - -static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, int op, int op_flags) { } -static inline void -bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, int op_flags) { } -static inline void -bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, int op_flags) { } -static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) 
{ } -static inline void -bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) { } -static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } - -static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) {} - -static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; - } - entity->sched_data = &bfqg->sched_data; -} - -static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} - -static void bfq_end_wr_async(struct bfq_data *bfqd) -{ - bfq_end_wr_async_queues(bfqd, bfqd->root_group); -} - -static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - return bfqd->root_group; -} - -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -{ - return bfqq->bfqd->root_group; -} - -static struct bfq_group * -bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -{ - struct bfq_group *bfqg; - int i; - - bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (!bfqg) - return NULL; - - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - - return bfqg; -} -#endif diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c deleted file mode 100644 index fb7bb8f08b75..000000000000 --- a/block/bfq-ioc.c +++ /dev/null @@ -1,36 +0,0 
@@ -/* - * BFQ: I/O context handling. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2010 Paolo Valente - */ - -/** - * icq_to_bic - convert iocontext queue structure to bfq_io_cq. - * @icq: the iocontext queue. - */ -static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -{ - /* bic->icq is the first member, %NULL will convert to %NULL */ - return container_of(icq, struct bfq_io_cq, icq); -} - -/** - * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. - * @bfqd: the lookup key. - * @ioc: the io_context of the process doing I/O. - * - * Queue lock must be held. - */ -static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc) -{ - if (ioc) - return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); - return NULL; -} diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c deleted file mode 100644 index 6e6025dacfc6..000000000000 --- a/block/bfq-iosched.c +++ /dev/null @@ -1,5403 +0,0 @@ -/* - * Budget Fair Queueing (BFQ) I/O scheduler. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2015 Paolo Valente - * - * Copyright (C) 2017 Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ - * file. - * - * BFQ is a proportional-share I/O scheduler, with some extra - * low-latency capabilities. BFQ also supports full hierarchical - * scheduling through cgroups. Next paragraphs provide an introduction - * on BFQ inner workings. Details on BFQ benefits and usage can be - * found in Documentation/block/bfq-iosched.txt. - * - * BFQ is a proportional-share storage-I/O scheduling algorithm based - * on the slice-by-slice service scheme of CFQ. But BFQ assigns - * budgets, measured in number of sectors, to processes instead of - * time slices. 
The device is not granted to the in-service process - * for a given time slice, but until it has exhausted its assigned - * budget. This change from the time to the service domain enables BFQ - * to distribute the device throughput among processes as desired, - * without any distortion due to throughput fluctuations, or to device - * internal queueing. BFQ uses an ad hoc internal scheduler, called - * B-WF2Q+, to schedule processes according to their budgets. More - * precisely, BFQ schedules queues associated with processes. Thanks to - * the accurate policy of B-WF2Q+, BFQ can afford to assign high - * budgets to I/O-bound processes issuing sequential requests (to - * boost the throughput), and yet guarantee a low latency to - * interactive and soft real-time applications. - * - * NOTE: if the main or only goal, with a given device, is to achieve - * the maximum-possible throughput at all times, then do switch off - * all low-latency heuristics for that device, by setting low_latency - * to 0. - * - * BFQ is described in [1], where also a reference to the initial, more - * theoretical paper on BFQ can be found. The interested reader can find - * in the latter paper full details on the main algorithm, as well as - * formulas of the guarantees and formal proofs of all the properties. - * With respect to the version of BFQ presented in these papers, this - * implementation adds a few more heuristics, such as the one that - * guarantees a low latency to soft real-time applications, and a - * hierarchical extension based on H-WF2Q+. - * - * B-WF2Q+ is based on WF2Q+, that is described in [2], together with - * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) - * complexity derives from the one introduced with EEVDF in [3]. - * - * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O - * Scheduler", Proceedings of the First Workshop on Mobile System - * Technologies (MST-2015), May 2015. 
- * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf - * - * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf - * - * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing - * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, - * Oct 1997. - * - * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz - * - * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline - * First: A Flexible and Accurate Mechanism for Proportional Share - * Resource Allocation,'' technical report. - * - * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "bfq.h" -#include "blk.h" - -/* Expiration time of sync (0) and async (1) requests, in ns. */ -static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; - -/* Maximum backwards seek, in KiB. */ -static const int bfq_back_max = (16 * 1024); - -/* Penalty of a backwards seek, in number of sectors. */ -static const int bfq_back_penalty = 2; - -/* Idling period duration, in ns. */ -static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); - -/* Minimum number of assigned budgets for which stats are safe to compute. */ -static const int bfq_stats_min_budgets = 194; - -/* Default maximum budget values, in sectors and number of requests. */ -static const int bfq_default_max_budget = (16 * 1024); - -/* - * Async to sync throughput distribution is controlled as follows: - * when an async request is served, the entity is charged the number - * of sectors of the request, multiplied by the factor below - */ -static const int bfq_async_charge_factor = 10; - -/* Default timeout values, in jiffies, approximating CFQ defaults. */ -static const int bfq_timeout = (HZ / 8); - -static struct kmem_cache *bfq_pool; - -/* Below this threshold (in ns), we consider thinktime immediate. 
*/ -#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) - -/* hw_tag detection: parallel requests threshold and min samples needed. */ -#define BFQ_HW_QUEUE_THRESHOLD 4 -#define BFQ_HW_QUEUE_SAMPLES 32 - -#define BFQQ_SEEK_THR (sector_t)(8 * 100) -#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) - -/* Min number of samples required to perform peak-rate update */ -#define BFQ_RATE_MIN_SAMPLES 32 -/* Min observation time interval required to perform a peak-rate update (ns) */ -#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -/* Target observation time interval for a peak-rate update (ns) */ -#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC - -/* Shift used for peak rate fixed precision calculations. */ -#define BFQ_RATE_SHIFT 16 - -/* - * By default, BFQ computes the duration of the weight raising for - * interactive applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and - * R and T are two reference parameters. - * In particular, R is the peak rate of the reference device (see below), - * and T is a reference time: given the systems that are likely to be - * installed on the reference device according to its speed class, T is - * about the maximum time needed, under BFQ and while reading two files in - * parallel, to load typical large applications on these systems. - * In practice, the slower/faster the device at hand is, the more/less it - * takes to load applications with respect to the reference device. - * Accordingly, the longer/shorter BFQ grants weight raising to interactive - * applications. - * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; - * . whether the device is slow, such as old or portable HDDs, as well as - * SD cards, or fast, such as newer HDDs and SSDs. 
- * - * The device's speed class is dynamically (re)detected in - * bfq_update_peak_rate() every time the estimated peak rate is updated. - * - * In the following definitions, R_slow[0]/R_fast[0] and - * T_slow[0]/T_fast[0] are the reference values for a slow/fast - * rotational device, whereas R_slow[1]/R_fast[1] and - * T_slow[1]/T_fast[1] are the reference values for a slow/fast - * non-rotational device. Finally, device_speed_thresh are the - * thresholds used to switch between speed classes. The reference - * rates are not the actual peak rates of the devices used as a - * reference, but slightly lower values. The reason for using these - * slightly lower values is that the peak-rate estimator tends to - * yield slightly lower values than the actual peak rate (it can yield - * the actual peak rate only if there is only one process doing I/O, - * and the process does sequential I/O). - * - * Both the reference peak rates and the thresholds are measured in - * sectors/usec, left-shifted by BFQ_RATE_SHIFT. - */ -static int R_slow[2] = {1000, 10700}; -static int R_fast[2] = {14000, 33000}; -/* - * To improve readability, a conversion function is used to initialize the - * following arrays, which entails that they can be initialized only in a - * function. 
- */ -static int T_slow[2]; -static int T_fast[2]; -static int device_speed_thresh[2]; - -#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - -static void bfq_schedule_dispatch(struct bfq_data *bfqd); - -#include "bfq-ioc.c" -#include "bfq-sched.c" -#include "bfq-cgroup.c" - -#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) - -#define bfq_sample_valid(samples) ((samples) > 80) - -/* - * We regard a request as SYNC, if either it's a read or has the SYNC bit - * set (in which case it could also be a direct WRITE). - */ -static int bfq_bio_sync(struct bio *bio) -{ - return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC); -} - -/* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. - */ -static void bfq_schedule_dispatch(struct bfq_data *bfqd) -{ - if (bfqd->queued != 0) { - bfq_log(bfqd, "schedule dispatch"); - kblockd_schedule_work(&bfqd->unplug_work); - } -} - -/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closesr to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. - */ -static struct request *bfq_choose_req(struct bfq_data *bfqd, - struct request *rq1, - struct request *rq2, - sector_t last) -{ - sector_t s1, s2, d1 = 0, d2 = 0; - unsigned long back_max; -#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ - unsigned int wrap = 0; /* bit mask: requests behind the disk head? 
*/ - - if (!rq1 || rq1 == rq2) - return rq2; - if (!rq2) - return rq1; - - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) - return rq1; - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) - return rq2; - if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) - return rq1; - else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) - return rq2; - - s1 = blk_rq_pos(rq1); - s2 = blk_rq_pos(rq2); - - /* - * By definition, 1KiB is 2 sectors. - */ - back_max = bfqd->bfq_back_max * 2; - - /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. - */ - if (s1 >= last) - d1 = s1 - last; - else if (s1 + back_max >= last) - d1 = (last - s1) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ1_WRAP; - - if (s2 >= last) - d2 = s2 - last; - else if (s2 + back_max >= last) - d2 = (last - s2) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ2_WRAP; - - /* Found required data */ - - /* - * By doing switch() on the bit mask "wrap" we avoid having to - * check two variables for all permutations: --> faster! - */ - switch (wrap) { - case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ - if (d1 < d2) - return rq1; - else if (d2 < d1) - return rq2; - - if (s1 >= s2) - return rq1; - else - return rq2; - - case BFQ_RQ2_WRAP: - return rq1; - case BFQ_RQ1_WRAP: - return rq2; - case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ - default: - /* - * Since both rqs are wrapped, - * start with the one that's further behind head - * (--> only *one* back seek required), - * since back seek takes more time than forward. 
- */ - if (s1 <= s2) - return rq1; - else - return rq2; - } -} - -static struct bfq_queue * -bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - sector_t sector, struct rb_node **ret_parent, - struct rb_node ***rb_link) -{ - struct rb_node **p, *parent; - struct bfq_queue *bfqq = NULL; - - parent = NULL; - p = &root->rb_node; - while (*p) { - struct rb_node **n; - - parent = *p; - bfqq = rb_entry(parent, struct bfq_queue, pos_node); - - /* - * Sort strictly based on sector. Smallest to the left, - * largest to the right. - */ - if (sector > blk_rq_pos(bfqq->next_rq)) - n = &(*p)->rb_right; - else if (sector < blk_rq_pos(bfqq->next_rq)) - n = &(*p)->rb_left; - else - break; - p = n; - bfqq = NULL; - } - - *ret_parent = parent; - if (rb_link) - *rb_link = p; - - bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", - (unsigned long long) sector, - bfqq ? bfqq->pid : 0); - - return bfqq; -} - -static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct rb_node **p, *parent; - struct bfq_queue *__bfqq; - - if (bfqq->pos_root) { - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } - - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) - return; - - bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; - __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, - blk_rq_pos(bfqq->next_rq), &parent, &p); - if (!__bfqq) { - rb_link_node(&bfqq->pos_node, parent, p); - rb_insert_color(&bfqq->pos_node, bfqq->pos_root); - } else - bfqq->pos_root = NULL; -} - -/* - * Tell whether there are active queues or groups with differentiated weights. - */ -static bool bfq_differentiated_weights(struct bfq_data *bfqd) -{ - /* - * For weights to differ, at least one of the trees must contain - * at least two nodes. 
- */ - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && - (bfqd->queue_weights_tree.rb_node->rb_left || - bfqd->queue_weights_tree.rb_node->rb_right) -#ifdef CONFIG_BFQ_GROUP_IOSCHED - ) || - (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && - (bfqd->group_weights_tree.rb_node->rb_left || - bfqd->group_weights_tree.rb_node->rb_right) -#endif - ); -} - -/* - * The following function returns true if every queue must receive the - * same share of the throughput (this condition is used when deciding - * whether idling may be disabled, see the comments in the function - * bfq_bfqq_may_idle()). - * - * Such a scenario occurs when: - * 1) all active queues have the same weight, - * 2) all active groups at the same level in the groups tree have the same - * weight, - * 3) all active groups at the same level in the groups tree have the same - * number of children. - * - * Unfortunately, keeping the necessary state for evaluating exactly the - * above symmetry conditions would be quite complex and time-consuming. - * Therefore this function evaluates, instead, the following stronger - * sub-conditions, for which it is much easier to maintain the needed - * state: - * 1) all active queues have the same weight, - * 2) all active groups have the same weight, - * 3) all active groups have at most one active child each. - * In particular, the last two conditions are always true if hierarchical - * support and the cgroups interface are not enabled, thus no state needs - * to be maintained in this case. - */ -static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -{ - return !bfq_differentiated_weights(bfqd); -} - -/* - * If the weight-counter tree passed as input contains no counter for - * the weight of the input entity, then add that counter; otherwise just - * increment the existing counter. - * - * Note that weight-counter trees contain few nodes in mostly symmetric - * scenarios. 
For example, if all queues have the same weight, then the - * weight-counter tree for the queues may contain at most one node. - * This holds even if low_latency is on, because weight-raised queues - * are not inserted in the tree. - * In most scenarios, the rate at which nodes are created/destroyed - * should be low too. - */ -static void bfq_weights_tree_add(struct bfq_data *bfqd, - struct bfq_entity *entity, - struct rb_root *root) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - - /* - * Do not insert if the entity is already associated with a - * counter, which happens if: - * 1) the entity is associated with a queue, - * 2) a request arrival has caused the queue to become both - * non-weight-raised, and hence change its weight, and - * backlogged; in this respect, each of the two events - * causes an invocation of this function, - * 3) this is the invocation of this function caused by the - * second event. This second invocation is actually useless, - * and we handle this fact by exiting immediately. More - * efficient or clearer solutions might possibly be adopted. - */ - if (entity->weight_counter) - return; - - while (*new) { - struct bfq_weight_counter *__counter = container_of(*new, - struct bfq_weight_counter, - weights_node); - parent = *new; - - if (entity->weight == __counter->weight) { - entity->weight_counter = __counter; - goto inc_counter; - } - if (entity->weight < __counter->weight) - new = &((*new)->rb_left); - else - new = &((*new)->rb_right); - } - - entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), - GFP_ATOMIC); - - /* - * In the unlucky event of an allocation failure, we just - * exit. This will cause the weight of entity to not be - * considered in bfq_differentiated_weights, which, in its - * turn, causes the scenario to be deemed wrongly symmetric in - * case entity's weight would have been the only weight making - * the scenario asymmetric. 
On the bright side, no unbalance - * will however occur when entity becomes inactive again (the - * invocation of this function is triggered by an activation - * of entity). In fact, bfq_weights_tree_remove does nothing - * if !entity->weight_counter. - */ - if (unlikely(!entity->weight_counter)) - return; - - entity->weight_counter->weight = entity->weight; - rb_link_node(&entity->weight_counter->weights_node, parent, new); - rb_insert_color(&entity->weight_counter->weights_node, root); - -inc_counter: - entity->weight_counter->num_active++; -} - -/* - * Decrement the weight counter associated with the entity, and, if the - * counter reaches 0, remove the counter from the tree. - * See the comments to the function bfq_weights_tree_add() for considerations - * about overhead. - */ -static void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_entity *entity, - struct rb_root *root) -{ - if (!entity->weight_counter) - return; - - BUG_ON(RB_EMPTY_ROOT(root)); - BUG_ON(entity->weight_counter->weight != entity->weight); - - BUG_ON(!entity->weight_counter->num_active); - entity->weight_counter->num_active--; - if (entity->weight_counter->num_active > 0) - goto reset_entity_pointer; - - rb_erase(&entity->weight_counter->weights_node, root); - kfree(entity->weight_counter); - -reset_entity_pointer: - entity->weight_counter = NULL; -} - -/* - * Return expired entry, or NULL to just start from scratch in rbtree. 
- */ -static struct request *bfq_check_fifo(struct bfq_queue *bfqq, - struct request *last) -{ - struct request *rq; - - if (bfq_bfqq_fifo_expire(bfqq)) - return NULL; - - bfq_mark_bfqq_fifo_expire(bfqq); - - rq = rq_entry_fifo(bfqq->fifo.next); - - if (rq == last || ktime_get_ns() < rq->fifo_time) - return NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - return rq; -} - -static struct request *bfq_find_next_rq(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *last) -{ - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next, *prev = NULL; - - BUG_ON(list_empty(&bfqq->fifo)); - - /* Follow expired path, else get first next available. */ - next = bfq_check_fifo(bfqq, last); - if (next) { - BUG_ON(next == last); - return next; - } - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev) - prev = rb_entry_rq(rbprev); - - if (rbnext) - next = rb_entry_rq(rbnext); - else { - rbnext = rb_first(&bfqq->sort_list); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } - - return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -} - -/* see the definition of bfq_async_charge_factor for details */ -static unsigned long bfq_serv_to_charge(struct request *rq, - struct bfq_queue *bfqq) -{ - if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) - return blk_rq_sectors(rq); - - /* - * If there are no weight-raised queues, then amplify service - * by just the async charge factor; otherwise amplify service - * by twice the async charge factor, to further reduce latency - * for weight-raised queues. - */ - if (bfqq->bfqd->wr_busy_queues == 0) - return blk_rq_sectors(rq) * bfq_async_charge_factor; - - return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; -} - -/** - * bfq_updated_next_req - update the queue after a new next_rq selection. - * @bfqd: the device data the queue belongs to. 
- * @bfqq: the queue to update. - * - * If the first request of a queue changes we make sure that the queue - * has enough budget to serve at least its first request (if the - * request has grown). We do this because if the queue has not enough - * budget for its first request, it has to go through two dispatch - * rounds to actually get it dispatched. - */ -static void bfq_updated_next_req(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - struct request *next_rq = bfqq->next_rq; - unsigned long new_budget; - - if (!next_rq) - return; - - if (bfqq == bfqd->in_service_queue) - /* - * In order not to break guarantees, budgets cannot be - * changed after an entity has been selected. - */ - return; - - BUG_ON(entity->tree != &st->active); - BUG_ON(entity == entity->sched_data->in_service_entity); - - new_budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - if (entity->budget != new_budget) { - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", - new_budget); - bfq_requeue_bfqq(bfqd, bfqq); - } -} - -static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -{ - u64 dur; - - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - - /* - * Limit duration between 3 and 13 seconds. Tests show that - * higher values than 13 seconds often yield the opposite of - * the desired result, i.e., worsen responsiveness by letting - * non-interactive and non-soft-real-time applications - * preserve weight raising for a too long time interval. - * - * On the other end, lower values than 3 seconds make it - * difficult for most interactive tasks to complete their jobs - * before weight-raising finishes. 
- */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); - - return dur; -} - -static void -bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - struct bfq_io_cq *bic, bool bfq_already_existing) -{ - unsigned int old_wr_coeff; - bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); - - if (bic->saved_has_short_ttime) - bfq_mark_bfqq_has_short_ttime(bfqq); - else - bfq_clear_bfqq_has_short_ttime(bfqq); - - if (bic->saved_IO_bound) - bfq_mark_bfqq_IO_bound(bfqq); - else - bfq_clear_bfqq_IO_bound(bfqq); - - if (unlikely(busy)) - old_wr_coeff = bfqq->wr_coeff; - - bfqq->wr_coeff = bic->saved_wr_coeff; - bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; - BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); - bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); - - bfqq->wr_coeff = 1; - } - - /* make sure weight will be updated, however we got here */ - bfqq->entity.prio_changed = 1; - - if (likely(!busy)) - return; - - if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { - bfqd->wr_busy_queues++; - BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); - } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { - bfqd->wr_busy_queues--; - BUG_ON(bfqd->wr_busy_queues < 0); - } -} - -static int bfqq_process_refs(struct bfq_queue *bfqq) -{ - int process_refs, io_refs; - - lockdep_assert_held(bfqq->bfqd->queue->queue_lock); - - io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; - process_refs = bfqq->ref - io_refs - 
bfqq->entity.on_st; - BUG_ON(process_refs < 0); - return process_refs; -} - -/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_queue *item; - struct hlist_node *n; - - hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) - hlist_del_init(&item->burst_list_node); - hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); - bfqd->burst_size = 1; - bfqd->burst_parent_entity = bfqq->entity.parent; -} - -/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - - bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); - - BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); - - if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { - struct bfq_queue *pos, *bfqq_item; - struct hlist_node *n; - - /* - * Enough queues have been activated shortly after each - * other to consider this burst as large. - */ - bfqd->large_burst = true; - bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); - - /* - * We can now mark all queues in the burst list as - * belonging to a large burst. - */ - hlist_for_each_entry(bfqq_item, &bfqd->burst_list, - burst_list_node) { - bfq_mark_bfqq_in_large_burst(bfqq_item); - bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); - } - bfq_mark_bfqq_in_large_burst(bfqq); - bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); - - /* - * From now on, and until the current burst finishes, any - * new queue being activated shortly after the last queue - * was inserted in the burst can be immediately marked as - * belonging to a large burst. So the burst list is not - * needed any more. Remove it. 
- */ - hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, - burst_list_node) - hlist_del_init(&pos->burst_list_node); - } else /* - * Burst not yet large: add bfqq to the burst list. Do - * not increment the ref counter for bfqq, because bfqq - * is removed from the burst list before freeing bfqq - * in put_queue. - */ - hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -} - -/* - * If many queues belonging to the same group happen to be created - * shortly after each other, then the processes associated with these - * queues have typically a common goal. In particular, bursts of queue - * creations are usually caused by services or applications that spawn - * many parallel threads/processes. Examples are systemd during boot, - * or git grep. To help these processes get their job done as soon as - * possible, it is usually better to not grant either weight-raising - * or device idling to their queues. - * - * In this comment we describe, firstly, the reasons why this fact - * holds, and, secondly, the next function, which implements the main - * steps needed to properly mark these queues so that they can then be - * treated in a different way. - * - * The above services or applications benefit mostly from a high - * throughput: the quicker the requests of the activated queues are - * cumulatively served, the sooner the target job of these queues gets - * completed. As a consequence, weight-raising any of these queues, - * which also implies idling the device for it, is almost always - * counterproductive. In most cases it just lowers throughput. - * - * On the other hand, a burst of queue creations may be caused also by - * the start of an application that does not consist of a lot of - * parallel I/O-bound threads. In fact, with a complex application, - * several short processes may need to be executed to start-up the - * application. 
In this respect, to start an application as quickly as - * possible, the best thing to do is in any case to privilege the I/O - * related to the application with respect to all other - * I/O. Therefore, the best strategy to start as quickly as possible - * an application that causes a burst of queue creations is to - * weight-raise all the queues created during the burst. This is the - * exact opposite of the best strategy for the other type of bursts. - * - * In the end, to take the best action for each of the two cases, the - * two types of bursts need to be distinguished. Fortunately, this - * seems relatively easy, by looking at the sizes of the bursts. In - * particular, we found a threshold such that only bursts with a - * larger size than that threshold are apparently caused by - * services or commands such as systemd or git grep. For brevity, - * hereafter we call just 'large' these bursts. BFQ *does not* - * weight-raise queues whose creation occurs in a large burst. In - * addition, for each of these queues BFQ performs or does not perform - * idling depending on which choice boosts the throughput more. The - * exact choice depends on the device and request pattern at - * hand. - * - * Unfortunately, false positives may occur while an interactive task - * is starting (e.g., an application is being started). The - * consequence is that the queues associated with the task do not - * enjoy weight raising as expected. Fortunately these false positives - * are very rare. They typically occur if some service happens to - * start doing I/O exactly when the interactive task starts. - * - * Turning back to the next function, it implements all the steps - * needed to detect the occurrence of a large burst and to properly - * mark all the queues belonging to it (so that they can then be - * treated in a different way). This goal is achieved by maintaining a - * "burst list" that holds, temporarily, the queues that belong to the - * burst in progress. 
The list is then used to mark these queues as - * belonging to a large burst if the burst does become large. The main - * steps are the following. - * - * . when the very first queue is created, the queue is inserted into the - * list (as it could be the first queue in a possible burst) - * - * . if the current burst has not yet become large, and a queue Q that does - * not yet belong to the burst is activated shortly after the last time - * at which a new queue entered the burst list, then the function appends - * Q to the burst list - * - * . if, as a consequence of the previous step, the burst size reaches - * the large-burst threshold, then - * - * . all the queues in the burst list are marked as belonging to a - * large burst - * - * . the burst list is deleted; in fact, the burst list already served - * its purpose (keeping temporarily track of the queues in a burst, - * so as to be able to mark them as belonging to a large burst in the - * previous sub-step), and now is not needed any more - * - * . the device enters a large-burst mode - * - * . if a queue Q that does not belong to the burst is created while - * the device is in large-burst mode and shortly after the last time - * at which a queue either entered the burst list or was marked as - * belonging to the current large burst, then Q is immediately marked - * as belonging to a large burst. - * - * . if a queue Q that does not belong to the burst is created a while - * later, i.e., not shortly after, than the last time at which a queue - * either entered the burst list or was marked as belonging to the - * current large burst, then the current burst is deemed as finished and: - * - * . the large-burst mode is reset if set - * - * . the burst list is emptied - * - * . Q is inserted in the burst list, as Q may be the first queue - * in a possible new burst (then the burst list contains just Q - * after this step). 
- */ -static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - /* - * If bfqq is already in the burst list or is part of a large - * burst, or finally has just been split, then there is - * nothing else to do. - */ - if (!hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq) || - time_is_after_eq_jiffies(bfqq->split_time + - msecs_to_jiffies(10))) - return; - - /* - * If bfqq's creation happens late enough, or bfqq belongs to - * a different group than the burst group, then the current - * burst is finished, and related data structures must be - * reset. - * - * In this respect, consider the special case where bfqq is - * the very first queue created after BFQ is selected for this - * device. In this case, last_ins_in_burst and - * burst_parent_entity are not yet significant when we get - * here. But it is easy to verify that, whether or not the - * following condition is true, bfqq will end up being - * inserted into the burst list. In particular the list will - * happen to contain only bfqq. And this is exactly what has - * to happen, as bfqq may be the first queue of the first - * burst. - */ - if (time_is_before_jiffies(bfqd->last_ins_in_burst + - bfqd->bfq_burst_interval) || - bfqq->entity.parent != bfqd->burst_parent_entity) { - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, - "handle_burst: late activation or different group"); - goto end; - } - - /* - * If we get here, then bfqq is being activated shortly after the - * last queue. So, if the current burst is also large, we can mark - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { - bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - goto end; - } - - /* - * If we get here, then a large-burst state has not yet been - * reached, but bfqq is being activated shortly after the last - * queue. Then we add bfqq to the burst. 
- */ - bfq_add_to_burst(bfqd, bfqq); -end: - /* - * At this point, bfqq either has been added to the current - * burst or has caused the current burst to terminate and a - * possible new burst to start. In particular, in the second - * case, bfqq has become the first queue in the possible new - * burst. In both cases last_ins_in_burst needs to be moved - * forward. - */ - bfqd->last_ins_in_burst = jiffies; - -} - -static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - return entity->budget - entity->service; -} - -/* - * If enough samples have been computed, return the current max budget - * stored in bfqd, which is dynamically updated according to the - * estimated disk peak rate; otherwise return the default max budget - */ -static int bfq_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget; -} - -/* - * Return min budget, which is a fraction of the current or default - * max budget (trying with 1/32) - */ -static int bfq_min_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget / 32; - else - return bfqd->bfq_max_budget / 32; -} - -static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - bool compensate, - enum bfqq_expiration reason); - -/* - * The next function, invoked after the input queue bfqq switches from - * idle to busy, updates the budget of bfqq. The function also tells - * whether the in-service queue should be expired, by returning - * true. The purpose of expiring the in-service queue is to give bfqq - * the chance to possibly preempt the in-service queue, and the reason - * for preempting the in-service queue is to achieve one of the two - * goals below. - * - * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has - * expired because it has remained idle. 
In particular, bfqq may have - * expired for one of the following two reasons: - * - * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and - * did not make it to issue a new request before its last request - * was served; - * - * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue - * a new request before the expiration of the idling-time. - * - * Even if bfqq has expired for one of the above reasons, the process - * associated with the queue may be however issuing requests greedily, - * and thus be sensitive to the bandwidth it receives (bfqq may have - * remained idle for other reasons: CPU high load, bfqq not enjoying - * idling, I/O throttling somewhere in the path from the process to - * the I/O scheduler, ...). But if, after every expiration for one of - * the above two reasons, bfqq has to wait for the service of at least - * one full budget of another queue before being served again, then - * bfqq is likely to get a much lower bandwidth or resource time than - * its reserved ones. To address this issue, two countermeasures need - * to be taken. - * - * First, the budget and the timestamps of bfqq need to be updated in - * a special way on bfqq reactivation: they need to be updated as if - * bfqq did not remain idle and did not expire. In fact, if they are - * computed as if bfqq expired and remained idle until reactivation, - * then the process associated with bfqq is treated as if, instead of - * being greedy, it stopped issuing requests when bfqq remained idle, - * and restarts issuing requests only on this reactivation. In other - * words, the scheduler does not help the process recover the "service - * hole" between bfqq expiration and reactivation. As a consequence, - * the process receives a lower bandwidth than its reserved one. 
In - * contrast, to recover this hole, the budget must be updated as if - * bfqq was not expired at all before this reactivation, i.e., it must - * be set to the value of the remaining budget when bfqq was - * expired. Along the same line, timestamps need to be assigned the - * value they had the last time bfqq was selected for service, i.e., - * before last expiration. Thus timestamps need to be back-shifted - * with respect to their normal computation (see [1] for more details - * on this tricky aspect). - * - * Secondly, to allow the process to recover the hole, the in-service - * queue must be expired too, to give bfqq the chance to preempt it - * immediately. In fact, if bfqq has to wait for a full budget of the - * in-service queue to be completed, then it may become impossible to - * let the process recover the hole, even if the back-shifted - * timestamps of bfqq are lower than those of the in-service queue. If - * this happens for most or all of the holes, then the process may not - * receive its reserved bandwidth. In this respect, it is worth noting - * that, being the service of outstanding requests unpreemptible, a - * little fraction of the holes may however be unrecoverable, thereby - * causing a little loss of bandwidth. - * - * The last important point is detecting whether bfqq does need this - * bandwidth recovery. In this respect, the next function deems the - * process associated with bfqq greedy, and thus allows it to recover - * the hole, if: 1) the process is waiting for the arrival of a new - * request (which implies that bfqq expired for one of the above two - * reasons), and 2) such a request has arrived soon. The first - * condition is controlled through the flag non_blocking_wait_rq, - * while the second through the flag arrived_in_time. If both - * conditions hold, then the function computes the budget in the - * above-described special way, and signals that the in-service queue - * should be expired. 
Timestamp back-shifting is done later in - * __bfq_activate_entity. - * - * 2. Reduce latency. Even if timestamps are not backshifted to let - * the process associated with bfqq recover a service hole, bfqq may - * however happen to have, after being (re)activated, a lower finish - * timestamp than the in-service queue. That is, the next budget of - * bfqq may have to be completed before the one of the in-service - * queue. If this is the case, then preempting the in-service queue - * allows this goal to be achieved, apart from the unpreemptible, - * outstanding requests mentioned above. - * - * Unfortunately, regardless of which of the above two goals one wants - * to achieve, service trees need first to be updated to know whether - * the in-service queue must be preempted. To have service trees - * correctly updated, the in-service queue must be expired and - * rescheduled, and bfqq must be scheduled too. This is one of the - * most costly operations (in future versions, the scheduling - * mechanism may be re-designed in such a way to make it possible to - * know whether preemption is needed without needing to update service - * trees). In addition, queue preemptions almost always cause random - * I/O, and thus loss of throughput. Because of these facts, the next - * function adopts the following simple scheme to avoid both costly - * operations and too frequent preemptions: it requests the expiration - * of the in-service queue (unconditionally) only for queues that need - * to recover a hole, or that either are weight-raised or deserve to - * be weight-raised. 
- */ -static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - bool arrived_in_time, - bool wr_or_deserves_wr) -{ - struct bfq_entity *entity = &bfqq->entity; - - if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { - /* - * We do not clear the flag non_blocking_wait_rq here, as - * the latter is used in bfq_activate_bfqq to signal - * that timestamps need to be back-shifted (and is - * cleared right after). - */ - - /* - * In next assignment we rely on that either - * entity->service or entity->budget are not updated - * on expiration if bfqq is empty (see - * __bfq_bfqq_recalc_budget). Thus both quantities - * remain unchanged after such an expiration, and the - * following statement therefore assigns to - * entity->budget the remaining budget on such an - * expiration. For clarity, entity->service is not - * updated on expiration in any case, and, in normal - * operation, is reset only when bfqq is selected for - * service (see bfq_get_next_queue). 
- */ - BUG_ON(bfqq->max_budget < 0); - entity->budget = min_t(unsigned long, - bfq_bfqq_budget_left(bfqq), - bfqq->max_budget); - - BUG_ON(entity->budget < 0); - return true; - } - - BUG_ON(bfqq->max_budget < 0); - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(bfqq->next_rq, bfqq)); - BUG_ON(entity->budget < 0); - - bfq_clear_bfqq_non_blocking_wait_rq(bfqq); - return wr_or_deserves_wr; -} - -static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - unsigned int old_wr_coeff, - bool wr_or_deserves_wr, - bool interactive, - bool in_burst, - bool soft_rt) -{ - if (old_wr_coeff == 1 && wr_or_deserves_wr) { - /* start a weight-raising period */ - if (interactive) { - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { - bfqq->wr_start_at_switch_to_srt = jiffies; - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - } - /* - * If needed, further reduce budget to make sure it is - * close to bfqq's backlog, so as to reduce the - * scheduling-error component due to a too large - * budget. Do not care about throughput consequences, - * but only about latency. Finally, do not assign a - * too small budget either, to avoid increasing - * latency by causing too frequent expirations. 
- */ - bfqq->entity.budget = min_t(unsigned long, - bfqq->entity.budget, - 2 * bfq_min_budget(bfqd)); - - bfq_log_bfqq(bfqd, bfqq, - "wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } else if (old_wr_coeff > 1) { - if (interactive) { /* update wr coeff and duration */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else if (in_burst) { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq-> - wr_cur_max_time)); - } else if (soft_rt) { - /* - * The application is now or still meeting the - * requirements for being deemed soft rt. We - * can then correctly and safely (re)charge - * the weight-raising duration for the - * application with the weight-raising - * duration for soft rt applications. - * - * In particular, doing this recharge now, i.e., - * before the weight-raising period for the - * application finishes, reduces the probability - * of the following negative scenario: - * 1) the weight of a soft rt application is - * raised at startup (as for any newly - * created application), - * 2) since the application is not interactive, - * at a certain time weight-raising is - * stopped for the application, - * 3) at that time the application happens to - * still have pending requests, and hence - * is destined to not have a chance to be - * deemed soft rt before these requests are - * completed (see the comments to the - * function bfq_bfqq_softrt_next_start() - * for details on soft rt detection), - * 4) these pending requests experience a high - * latency because the application is not - * weight-raised while they are pending. 
- */ - if (bfqq->wr_cur_max_time != - bfqd->bfq_wr_rt_max_time) { - bfqq->wr_start_at_switch_to_srt = - bfqq->last_wr_start_finish; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfq_log_bfqq(bfqd, bfqq, - "switching to soft_rt wr"); - } else - bfq_log_bfqq(bfqd, bfqq, - "moving forward soft_rt wr duration"); - bfqq->last_wr_start_finish = jiffies; - } - } -} - -static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - return bfqq->dispatched == 0 && - time_is_before_jiffies( - bfqq->budget_timeout + - bfqd->bfq_wr_min_idle_time); -} - -static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - int old_wr_coeff, - struct request *rq, - bool *interactive) -{ - bool soft_rt, in_burst, wr_or_deserves_wr, - bfqq_wants_to_preempt, - idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), - /* - * See the comments on - * bfq_bfqq_update_budg_for_activation for - * details on the usage of the next variable. 
- */ - arrived_in_time = ktime_get_ns() <= - RQ_BIC(rq)->ttime.last_end_request + - bfqd->bfq_slice_idle * 3; - - bfq_log_bfqq(bfqd, bfqq, - "bfq_add_request non-busy: " - "jiffies %lu, in_time %d, idle_long %d busyw %d " - "wr_coeff %u", - jiffies, arrived_in_time, - idle_for_long_time, - bfq_bfqq_non_blocking_wait_rq(bfqq), - old_wr_coeff); - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - BUG_ON(bfqq == bfqd->in_service_queue); - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, - req_op(rq), rq->cmd_flags); - - /* - * bfqq deserves to be weight-raised if: - * - it is sync, - * - it does not belong to a large burst, - * - it has been idle for enough time or is soft real-time, - * - is linked to a bfq_io_cq (it is not shared in any sense) - */ - in_burst = bfq_bfqq_in_large_burst(bfqq); - soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && - !in_burst && - time_is_before_jiffies(bfqq->soft_rt_next_start); - *interactive = - !in_burst && - idle_for_long_time; - wr_or_deserves_wr = bfqd->low_latency && - (bfqq->wr_coeff > 1 || - (bfq_bfqq_sync(bfqq) && - bfqq->bic && (*interactive || soft_rt))); - - bfq_log_bfqq(bfqd, bfqq, - "bfq_add_request: " - "in_burst %d, " - "soft_rt %d (next %lu), inter %d, bic %p", - bfq_bfqq_in_large_burst(bfqq), soft_rt, - bfqq->soft_rt_next_start, - *interactive, - bfqq->bic); - - /* - * Using the last flag, update budget and check whether bfqq - * may want to preempt the in-service queue. - */ - bfqq_wants_to_preempt = - bfq_bfqq_update_budg_for_activation(bfqd, bfqq, - arrived_in_time, - wr_or_deserves_wr); - - /* - * If bfqq happened to be activated in a burst, but has been - * idle for much more than an interactive queue, then we - * assume that, in the overall I/O initiated in the burst, the - * I/O associated with bfqq is finished. So bfqq does not need - * to be treated as a queue belonging to a burst - * anymore. 
Accordingly, we reset bfqq's in_large_burst flag - * if set, and remove bfqq from the burst list if it's - * there. We do not decrement burst_size, because the fact - * that bfqq does not need to belong to the burst list any - * more does not invalidate the fact that bfqq was created in - * a burst. - */ - if (likely(!bfq_bfqq_just_created(bfqq)) && - idle_for_long_time && - time_is_before_jiffies( - bfqq->budget_timeout + - msecs_to_jiffies(10000))) { - hlist_del_init(&bfqq->burst_list_node); - bfq_clear_bfqq_in_large_burst(bfqq); - } - - bfq_clear_bfqq_just_created(bfqq); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (arrived_in_time) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - bfq_log_bfqq(bfqd, bfqq, "requests in time %d", - bfqq->requests_within_timer); - } - - if (bfqd->low_latency) { - if (unlikely(time_is_after_jiffies(bfqq->split_time))) - /* wraparound */ - bfqq->split_time = - jiffies - bfqd->bfq_wr_min_idle_time - 1; - - if (time_is_before_jiffies(bfqq->split_time + - bfqd->bfq_wr_min_idle_time)) { - bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, - old_wr_coeff, - wr_or_deserves_wr, - *interactive, - in_burst, - soft_rt); - - if (old_wr_coeff != bfqq->wr_coeff) - bfqq->entity.prio_changed = 1; - } - } - - bfqq->last_idle_bklogged = jiffies; - bfqq->service_from_backlogged = 0; - bfq_clear_bfqq_softrt_update(bfqq); - - bfq_add_bfqq_busy(bfqd, bfqq); - - /* - * Expire in-service queue only if preemption may be needed - * for guarantees. In this respect, the function - * next_queue_may_preempt just checks a simple, necessary - * condition, and not a sufficient condition based on - * timestamps. In fact, for the latter condition to be - * evaluated, timestamps would need first to be updated, and - * this operation is quite costly (see the comments on the - * function bfq_bfqq_update_budg_for_activation). 
- */ - if (bfqd->in_service_queue && bfqq_wants_to_preempt && - bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && - next_queue_may_preempt(bfqd)) { - struct bfq_queue *in_serv = - bfqd->in_service_queue; - BUG_ON(in_serv == bfqq); - - bfq_bfqq_expire(bfqd, bfqd->in_service_queue, - false, BFQ_BFQQ_PREEMPTED); - } -} - -static void bfq_add_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - struct request *next_rq, *prev; - unsigned int old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - - bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", - blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); - - if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ - bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, old coeff %u, w %d(%d)", - jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), - jiffies_to_msecs(bfqq->wr_cur_max_time), - bfqq->wr_coeff, - bfqq->entity.weight, bfqq->entity.orig_weight); - - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - - elv_rb_add(&bfqq->sort_list, rq); - - /* - * Check if this request is a better next-to-serve candidate. - */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(!next_rq); - bfqq->next_rq = next_rq; - - /* - * Adjust priority tree position, if next_rq changes. - */ - if (prev != bfqq->next_rq) - bfq_pos_tree_add_move(bfqd, bfqq); - - if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ - bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, - rq, &interactive); - else { - if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && - time_is_before_jiffies( - bfqq->last_wr_start_finish + - bfqd->bfq_wr_min_inter_arr_async)) { - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - - bfqd->wr_busy_queues++; - BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqd, bfqq, - "non-idle wrais starting, " - "wr_max_time %u wr_busy %d", - jiffies_to_msecs(bfqq->wr_cur_max_time), - bfqd->wr_busy_queues); - } - if (prev != bfqq->next_rq) - bfq_updated_next_req(bfqd, bfqq); - } - - /* - * Assign jiffies to last_wr_start_finish in the following - * cases: - * - * . if bfqq is not going to be weight-raised, because, for - * non weight-raised queues, last_wr_start_finish stores the - * arrival time of the last request; as of now, this piece - * of information is used only for deciding whether to - * weight-raise async queues - * - * . if bfqq is not weight-raised, because, if bfqq is now - * switching to weight-raised, then last_wr_start_finish - * stores the time when weight-raising starts - * - * . if bfqq is interactive, because, regardless of whether - * bfqq is currently weight-raised, the weight-raising - * period must start or restart (this case is considered - * separately because it is not detected by the above - * conditions, if bfqq is already weight-raised) - * - * last_wr_start_finish has to be updated also if bfqq is soft - * real-time, because the weight-raising period is constantly - * restarted on idle-to-busy transitions for these queues, but - * this is already done in bfq_bfqq_handle_idle_busy_switch if - * needed. 
- */ - if (bfqd->low_latency && - (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) - bfqq->last_wr_start_finish = jiffies; -} - -static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - struct bio *bio) -{ - struct task_struct *tsk = current; - struct bfq_io_cq *bic; - struct bfq_queue *bfqq; - - bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (!bic) - return NULL; - - bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); - - return NULL; -} - -static sector_t get_sdist(sector_t last_pos, struct request *rq) -{ - sector_t sdist = 0; - - if (last_pos) { - if (last_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - last_pos; - else - sdist = last_pos - blk_rq_pos(rq); - } - - return sdist; -} - -static void bfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - bfqd->rq_in_driver++; -} - -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - BUG_ON(bfqd->rq_in_driver == 0); - bfqd->rq_in_driver--; -} - -static void bfq_remove_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - - BUG_ON(bfqq->entity.service > bfqq->entity.budget && - bfqq == bfqd->in_service_queue); - - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); - bfq_updated_next_req(bfqd, bfqq); - } - - if (rq->queuelist.prev != &rq->queuelist) - list_del_init(&rq->queuelist); - BUG_ON(bfqq->queued[sync] == 0); - bfqq->queued[sync]--; - bfqd->queued--; - elv_rb_del(&bfqq->sort_list, rq); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - bfqq->next_rq = NULL; - - BUG_ON(bfqq->entity.budget < 0); - - if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { - BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ - bfq_del_bfqq_busy(bfqd, bfqq, false); - 
/* - * bfqq emptied. In normal operation, when - * bfqq is empty, bfqq->entity.service and - * bfqq->entity.budget must contain, - * respectively, the service received and the - * budget used last time bfqq emptied. These - * facts do not hold in this case, as at least - * this last removal occurred while bfqq is - * not in service. To avoid inconsistencies, - * reset both bfqq->entity.service and - * bfqq->entity.budget, if bfqq has still a - * process that may issue I/O requests to it. - */ - bfqq->entity.budget = bfqq->entity.service = 0; - } - - /* - * Remove queue from request-position tree as it is empty. - */ - if (bfqq->pos_root) { - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } - } - - if (rq->cmd_flags & REQ_META) { - BUG_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } - bfqg_stats_update_io_remove(bfqq_group(bfqq), req_op(rq), - rq->cmd_flags); -} - -static int bfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *__rq; - - __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - - return ELEVATOR_NO_MERGE; -} - -static void bfq_merged_request(struct request_queue *q, struct request *req, - int type) -{ - if (type == ELEVATOR_FRONT_MERGE && - rb_prev(&req->rb_node) && - blk_rq_pos(req) < - blk_rq_pos(container_of(rb_prev(&req->rb_node), - struct request, rb_node))) { - struct bfq_queue *bfqq = RQ_BFQQ(req); - struct bfq_data *bfqd = bfqq->bfqd; - struct request *prev, *next_rq; - - /* Reposition request in its sort_list */ - elv_rb_del(&bfqq->sort_list, req); - elv_rb_add(&bfqq->sort_list, req); - /* Choose next request to be served for bfqq */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, - bfqd->last_position); - BUG_ON(!next_rq); - bfqq->next_rq = next_rq; - /* - * If next_rq changes, update both the queue's budget 
to - * fit the new request and the queue's position in its - * rq_pos_tree. - */ - if (prev != bfqq->next_rq) { - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } - } -} - -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) -{ - bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio_op(bio), - bio->bi_opf); -} -#endif - -static void bfq_merged_requests(struct request_queue *q, struct request *rq, - struct request *next) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); - - /* - * If next and rq belong to the same bfq_queue and next is older - * than rq, then reposition rq in the fifo (by substituting next - * with rq). Otherwise, if next and rq belong to different - * bfq_queues, never reposition rq: in fact, we would have to - * reposition it with respect to next's position in its own fifo, - * which would most certainly be too expensive with respect to - * the benefits. - */ - if (bfqq == next_bfqq && - !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - next->fifo_time < rq->fifo_time) { - list_del_init(&rq->queuelist); - list_replace_init(&next->queuelist, &rq->queuelist); - rq->fifo_time = next->fifo_time; - } - - if (bfqq->next_rq == next) - bfqq->next_rq = rq; - - bfq_remove_request(next); - bfqg_stats_update_io_merged(bfqq_group(bfqq), req_op(next), - next->cmd_flags); -} - -/* Must be called with bfqq != NULL */ -static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -{ - BUG_ON(!bfqq); - - if (bfq_bfqq_busy(bfqq)) { - bfqq->bfqd->wr_busy_queues--; - BUG_ON(bfqq->bfqd->wr_busy_queues < 0); - } - bfqq->wr_coeff = 1; - bfqq->wr_cur_max_time = 0; - bfqq->last_wr_start_finish = jiffies; - /* - * Trigger a weight change on the next invocation of - * __bfq_entity_update_weight_prio. 
- */ - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, - "end_wr: wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", - bfqq->bfqd->wr_busy_queues); -} - -static void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg) -{ - int i, j; - - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) - if (bfqg->async_bfqq[i][j]) - bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); - if (bfqg->async_idle_bfqq) - bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -} - -static void bfq_end_wr(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq; - - spin_lock_irq(bfqd->queue->queue_lock); - - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); - list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); - bfq_end_wr_async(bfqd); - - spin_unlock_irq(bfqd->queue->queue_lock); -} - -static sector_t bfq_io_struct_pos(void *io_struct, bool request) -{ - if (request) - return blk_rq_pos(io_struct); - else - return ((struct bio *)io_struct)->bi_iter.bi_sector; -} - -static int bfq_rq_close_to_sector(void *io_struct, bool request, - sector_t sector) -{ - return abs(bfq_io_struct_pos(io_struct, request) - sector) <= - BFQQ_CLOSE_THR; -} - -static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - sector_t sector) -{ - struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; - struct rb_node *parent, *node; - struct bfq_queue *__bfqq; - - if (RB_EMPTY_ROOT(root)) - return NULL; - - /* - * First, if we find a request starting at the end of the last - * request, choose it. - */ - __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); - if (__bfqq) - return __bfqq; - - /* - * If the exact sector wasn't found, the parent of the NULL leaf - * will contain the closest sector (rq_pos_tree sorted by - * next_request position). 
- */ - __bfqq = rb_entry(parent, struct bfq_queue, pos_node); - if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) - return __bfqq; - - if (blk_rq_pos(__bfqq->next_rq) < sector) - node = rb_next(&__bfqq->pos_node); - else - node = rb_prev(&__bfqq->pos_node); - if (!node) - return NULL; - - __bfqq = rb_entry(node, struct bfq_queue, pos_node); - if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) - return __bfqq; - - return NULL; -} - -static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, - struct bfq_queue *cur_bfqq, - sector_t sector) -{ - struct bfq_queue *bfqq; - - /* - * We shall notice if some of the queues are cooperating, - * e.g., working closely on the same area of the device. In - * that case, we can group them together and: 1) don't waste - * time idling, and 2) serve the union of their requests in - * the best possible order for throughput. - */ - bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); - if (!bfqq || bfqq == cur_bfqq) - return NULL; - - return bfqq; -} - -static struct bfq_queue * -bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -{ - int process_refs, new_process_refs; - struct bfq_queue *__bfqq; - - /* - * If there are no process references on the new_bfqq, then it is - * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain - * may have dropped their last reference (not just their last process - * reference). - */ - if (!bfqq_process_refs(new_bfqq)) - return NULL; - - /* Avoid a circular list and skip interim queue merges. */ - while ((__bfqq = new_bfqq->new_bfqq)) { - if (__bfqq == bfqq) - return NULL; - new_bfqq = __bfqq; - } - - process_refs = bfqq_process_refs(bfqq); - new_process_refs = bfqq_process_refs(new_bfqq); - /* - * If the process for the bfqq has gone away, there is no - * sense in merging the queues. 
- */ - if (process_refs == 0 || new_process_refs == 0) - return NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", - new_bfqq->pid); - - /* - * Merging is just a redirection: the requests of the process - * owning one of the two queues are redirected to the other queue. - * The latter queue, in its turn, is set as shared if this is the - * first time that the requests of some process are redirected to - * it. - * - * We redirect bfqq to new_bfqq and not the opposite, because we - * are in the context of the process owning bfqq, hence we have - * the io_cq of this process. So we can immediately configure this - * io_cq to redirect the requests of the process to new_bfqq. - * - * NOTE, even if new_bfqq coincides with the in-service queue, the - * io_cq of new_bfqq is not available, because, if the in-service - * queue is shared, bfqd->in_service_bic may not point to the - * io_cq of the in-service queue. - * Redirecting the requests of the process owning bfqq to the - * currently in-service queue is in any case the best option, as - * we feed the in-service queue with new requests close to the - * last request served and, by doing so, hopefully increase the - * throughput. - */ - bfqq->new_bfqq = new_bfqq; - new_bfqq->ref += process_refs; - return new_bfqq; -} - -static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) -{ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; - - /* - * If either of the queues has already been detected as seeky, - * then merging it with the other queue is unlikely to lead to - * sequential I/O. - */ - if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) - return false; - - /* - * Interleaved I/O is known to be done by (some) applications - * only for reads, so it does not make sense to merge async - * queues. 
- */ - if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) - return false; - - return true; -} - -/* - * If this function returns true, then bfqq cannot be merged. The idea - * is that true cooperation happens very early after processes start - * to do I/O. Usually, late cooperations are just accidental false - * positives. In case bfqq is weight-raised, such false positives - * would evidently degrade latency guarantees for bfqq. - */ -static bool wr_from_too_long(struct bfq_queue *bfqq) -{ - return bfqq->wr_coeff > 1 && - time_is_before_jiffies(bfqq->last_wr_start_finish + - msecs_to_jiffies(100)); -} - -/* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. Return - * NULL if no merge was scheduled, a pointer to the shared bfq_queue - * structure otherwise. - * - * The OOM queue is not allowed to participate to cooperation: in fact, since - * the requests temporarily redirected to the OOM queue could be redirected - * again to dedicated queues at any time, the state needed to correctly - * handle merging with the OOM queue would be quite complex and expensive - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. - * - * Weight-raised queues can be merged only if their weight-raising - * period has just started. In fact cooperating processes are usually - * started together. Thus, with this filter we avoid false positives - * that would jeopardize low-latency guarantees. - * - * WARNING: queue merging may impair fairness among non-weight raised - * queues, for at least two reasons: 1) the original weight of a - * merged queue may change during the merged state, 2) even being the - * weight the same, a merged queue may be bloated with many more - * requests than the ones produced by its originally-associated - * process. 
- */ -static struct bfq_queue * -bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - void *io_struct, bool request) -{ - struct bfq_queue *in_service_bfqq, *new_bfqq; - - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - - if (io_struct && wr_from_too_long(bfqq) && - likely(bfqq != &bfqd->oom_bfqq)) - bfq_log_bfqq(bfqd, bfqq, - "would have looked for coop, but bfq%d wr", - bfqq->pid); - - if (!io_struct || - wr_from_too_long(bfqq) || - unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; - - /* If there is only one backlogged queue, don't search. */ - if (bfqd->busy_queues == 1) - return NULL; - - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && - bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) - && likely(in_service_bfqq == &bfqd->oom_bfqq)) - bfq_log_bfqq(bfqd, bfqq, - "would have tried merge with in-service-queue, but wr"); - - if (!in_service_bfqq || in_service_bfqq == bfqq || - !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - - if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { - new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); - if (new_bfqq) - return new_bfqq; - } - /* - * Check whether there is a cooperator among currently scheduled - * queues. The only thing we need is that the bio/request is not - * NULL, as we need it to establish whether a cooperator exists. 
- */ -check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - - if (new_bfqq && wr_from_too_long(new_bfqq) && - likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - bfq_log_bfqq(bfqd, bfqq, - "would have merged with bfq%d, but wr", - new_bfqq->pid); - - if (new_bfqq && !wr_from_too_long(new_bfqq) && - likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - - return NULL; -} - -static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -{ - struct bfq_io_cq *bic = bfqq->bic; - - /* - * If !bfqq->bic, the queue is already shared or its requests - * have already been redirected to a shared queue; both idle window - * and weight raising state have already been saved. Do nothing. - */ - if (!bic) - return; - - bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -} - -static void bfq_get_bic_reference(struct bfq_queue *bfqq) -{ - /* - * If bfqq->bic has a non-NULL value, the bic to which it belongs - * is about to begin using a shared bfq_queue. 
- */ - if (bfqq->bic) - atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -} - -static void -bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -{ - bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", - (unsigned long) new_bfqq->pid); - /* Save weight raising and idle window of the merged queues */ - bfq_bfqq_save_state(bfqq); - bfq_bfqq_save_state(new_bfqq); - if (bfq_bfqq_IO_bound(bfqq)) - bfq_mark_bfqq_IO_bound(new_bfqq); - bfq_clear_bfqq_IO_bound(bfqq); - - /* - * If bfqq is weight-raised, then let new_bfqq inherit - * weight-raising. To reduce false positives, neglect the case - * where bfqq has just been created, but has not yet made it - * to be weight-raised (which may happen because EQM may merge - * bfqq even before bfq_add_request is executed for the first - * time for bfqq). Handling this case would however be very - * easy, thanks to the flag just_created. - */ - if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { - new_bfqq->wr_coeff = bfqq->wr_coeff; - new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; - new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; - new_bfqq->wr_start_at_switch_to_srt = - bfqq->wr_start_at_switch_to_srt; - if (bfq_bfqq_busy(new_bfqq)) { - bfqd->wr_busy_queues++; - BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); - } - - new_bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqd, new_bfqq, - "wr start after merge with %d, rais_max_time %u", - bfqq->pid, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - - if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ - bfqq->wr_coeff = 1; - bfqq->entity.prio_changed = 1; - if (bfq_bfqq_busy(bfqq)) { - bfqd->wr_busy_queues--; - BUG_ON(bfqd->wr_busy_queues < 0); - } - - } - - bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", - bfqd->wr_busy_queues); - - /* - * Grab a reference to the bic, to prevent it from being destroyed - * before being possibly touched by a bfq_split_bfqq(). 
- */ - bfq_get_bic_reference(bfqq); - bfq_get_bic_reference(new_bfqq); - /* - * Merge queues (that is, let bic redirect its requests to new_bfqq) - */ - bic_set_bfqq(bic, new_bfqq, 1); - bfq_mark_bfqq_coop(new_bfqq); - /* - * new_bfqq now belongs to at least two bics (it is a shared queue): - * set new_bfqq->bic to NULL. bfqq either: - * - does not belong to any bic any more, and hence bfqq->bic must - * be set to NULL, or - * - is a queue whose owning bics have already been redirected to a - * different queue, hence the queue is destined to not belong to - * any bic soon and bfqq->bic is already NULL (therefore the next - * assignment causes no harm). - */ - new_bfqq->bic = NULL; - bfqq->bic = NULL; - /* release process reference to bfqq */ - bfq_put_queue(bfqq); -} - -static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic; - struct bfq_queue *bfqq, *new_bfqq; - - /* - * Disallow merge of a sync bio into an async request. - */ - if (bfq_bio_sync(bio) && !rq_is_sync(rq)) - return false; - - /* - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. - * Queue lock is held here. - */ - bic = bfq_bic_lookup(bfqd, current->io_context); - if (!bic) - return false; - - bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); - /* - * We take advantage of this function to perform an early merge - * of the queues of possible cooperating processes. - */ - if (bfqq) { - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); - if (new_bfqq) { - bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); - /* - * If we get here, the bio will be queued in the - * shared queue, i.e., new_bfqq, so use new_bfqq - * to decide whether bio and rq can be merged. 
- */ - bfqq = new_bfqq; - } - } - - return bfqq == RQ_BFQQ(rq); -} - -static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, - struct request *next) -{ - return RQ_BFQQ(rq) == RQ_BFQQ(next); -} - -/* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the throughput. - * In practice, a time-slice service scheme is used with seeky - * processes. - */ -static void bfq_set_budget_timeout(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - unsigned int timeout_coeff; - - if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) - timeout_coeff = 1; - else - timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - - bfqd->last_budget_start = ktime_get(); - - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout * timeout_coeff; - - bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", - jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -} - -static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - if (bfqq) { - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); - bfq_mark_bfqq_must_alloc(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; - - BUG_ON(bfqq == bfqd->in_service_queue); - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - - if (time_is_before_jiffies(bfqq->last_wr_start_finish) && - bfqq->wr_coeff > 1 && - bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - time_is_before_jiffies(bfqq->budget_timeout)) { - /* - * For soft real-time queues, move the start - * of the weight-raising period forward by the - * time the queue has not received any - * service. Otherwise, a relatively long - * service delay is likely to cause the - * weight-raising period of the queue to end, - * because of the short duration of the - * weight-raising period of a soft real-time - * queue. 
It is worth noting that this move - * is not so dangerous for the other queues, - * because soft real-time queues are not - * greedy. - * - * To not add a further variable, we use the - * overloaded field budget_timeout to - * determine for how long the queue has not - * received service, i.e., how much time has - * elapsed since the queue expired. However, - * this is a little imprecise, because - * budget_timeout is set to jiffies if bfqq - * not only expires, but also remains with no - * request. - */ - if (time_after(bfqq->budget_timeout, - bfqq->last_wr_start_finish)) - bfqq->last_wr_start_finish += - jiffies - bfqq->budget_timeout; - else - bfqq->last_wr_start_finish = jiffies; - - if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { - pr_crit( - "BFQ WARNING:last %lu budget %lu jiffies %lu", - bfqq->last_wr_start_finish, - bfqq->budget_timeout, - jiffies); - pr_crit("diff %lu", jiffies - - max_t(unsigned long, - bfqq->last_wr_start_finish, - bfqq->budget_timeout)); - bfqq->last_wr_start_finish = jiffies; - } - } - - bfq_set_budget_timeout(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, - "set_in_service_queue, cur-budget = %d", - bfqq->entity.budget); - } else - bfq_log(bfqd, "set_in_service_queue: NULL"); - - bfqd->in_service_queue = bfqq; -} - -/* - * Get and set a new queue for service. - */ -static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); - - __bfq_set_in_service_queue(bfqd, bfqq); - return bfqq; -} - -static void bfq_arm_slice_timer(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->in_service_queue; - struct bfq_io_cq *bic; - u32 sl; - - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - /* Processes have exited, don't wait. 
*/ - bic = bfqd->in_service_bic; - if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) - return; - - bfq_mark_bfqq_wait_request(bfqq); - - /* - * We don't want to idle for seeks, but we do want to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. So allow a little bit of time for him to submit a new rq. - * - * To prevent processes with (partly) seeky workloads from - * being too ill-treated, grant them a small fraction of the - * assigned budget before reducing the waiting time to - * BFQ_MIN_TT. This happened to help reduce latency. - */ - sl = bfqd->bfq_slice_idle; - /* - * Unless the queue is being weight-raised or the scenario is - * asymmetric, grant only minimum idle time if the queue - * is seeky. A long idling is preserved for a weight-raised - * queue, or, more in general, in an asymemtric scenario, - * because a long idling is needed for guaranteeing to a queue - * its reserved share of the throughput (in particular, it is - * needed if the queue has a higher weight than some other - * queue). - */ - if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && - bfq_symmetric_scenario(bfqd)) - sl = min_t(u32, sl, BFQ_MIN_TT); - - bfqd->last_idling_start = ktime_get(); - hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), - HRTIMER_MODE_REL); - bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); - bfq_log(bfqd, "arm idle: %ld/%ld ms", - sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -} - -/* - * In autotuning mode, max_budget is dynamically recomputed as the - * amount of sectors transferred in timeout at the estimated peak - * rate. This enables BFQ to utilize a full timeslice with a full - * budget, even if the in-service queue is served at peak rate. And - * this maximises throughput with sequential workloads. 
- */ -static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -{ - return (u64)bfqd->peak_rate * USEC_PER_MSEC * - jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -} - -/* - * Update parameters related to throughput and responsiveness, as a - * function of the estimated peak rate. See comments on - * bfq_calc_max_budget(), and on T_slow and T_fast arrays. - */ -static void update_thr_responsiveness_params(struct bfq_data *bfqd) -{ - int dev_type = blk_queue_nonrot(bfqd->queue); - - if (bfqd->bfq_user_max_budget == 0) { - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd); - BUG_ON(bfqd->bfq_max_budget < 0); - bfq_log(bfqd, "new max_budget = %d", - bfqd->bfq_max_budget); - } - - if (bfqd->device_speed == BFQ_BFQD_FAST && - bfqd->peak_rate < device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_SLOW; - bfqd->RT_prod = R_slow[dev_type] * - T_slow[dev_type]; - } else if (bfqd->device_speed == BFQ_BFQD_SLOW && - bfqd->peak_rate > device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_FAST; - bfqd->RT_prod = R_fast[dev_type] * - T_fast[dev_type]; - } - - bfq_log(bfqd, -"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", - dev_type == 0 ? "ROT" : "NONROT", - bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", - bfqd->device_speed == BFQ_BFQD_FAST ? 
- (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : - (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, - (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> - BFQ_RATE_SHIFT); -} - -static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -{ - if (rq != NULL) { /* new rq dispatch now, reset accordingly */ - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; - bfqd->peak_rate_samples = 1; - bfqd->sequential_samples = 0; - bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = - blk_rq_sectors(rq); - } else /* no new rq dispatched, just reset the number of samples */ - bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ - - bfq_log(bfqd, - "reset_rate_computation at end, sample %u/%u tot_sects %llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched); -} - -static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -{ - u32 rate, weight, divisor; - - /* - * For the convergence property to hold (see comments on - * bfq_update_peak_rate()) and for the assessment to be - * reliable, a minimum number of samples must be present, and - * a minimum amount of time must have elapsed. If not so, do - * not compute new rate. Just reset parameters, to get ready - * for a new evaluation attempt. - */ - if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || - bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { - bfq_log(bfqd, - "update_rate_reset: only resetting, delta_first %lluus samples %d", - bfqd->delta_from_first>>10, bfqd->peak_rate_samples); - goto reset_computation; - } - - /* - * If a new request completion has occurred after last - * dispatch, then, to approximate the rate at which requests - * have been served by the device, it is more precise to - * extend the observation interval to the last completion. 
- */ - bfqd->delta_from_first = - max_t(u64, bfqd->delta_from_first, - bfqd->last_completion - bfqd->first_dispatch); - - BUG_ON(bfqd->delta_from_first == 0); - /* - * Rate computed in sects/usec, and not sects/nsec, for - * precision issues. - */ - rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC)); - - bfq_log(bfqd, -"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", - bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - rate > 20< 20M sectors/sec) - */ - if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && - rate <= bfqd->peak_rate) || - rate > 20<peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - goto reset_computation; - } else { - bfq_log(bfqd, - "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - } - - /* - * We have to update the peak rate, at last! To this purpose, - * we use a low-pass filter. We compute the smoothing constant - * of the filter as a function of the 'weight' of the new - * measured rate. - * - * As can be seen in next formulas, we define this weight as a - * quantity proportional to how sequential the workload is, - * and to how long the observation time interval is. - * - * The weight runs from 0 to 8. The maximum value of the - * weight, 8, yields the minimum value for the smoothing - * constant. At this minimum value for the smoothing constant, - * the measured rate contributes for half of the next value of - * the estimated peak rate. - * - * So, the first step is to compute the weight as a function - * of how sequential the workload is. 
Note that the weight - * cannot reach 9, because bfqd->sequential_samples cannot - * become equal to bfqd->peak_rate_samples, which, in its - * turn, holds true because bfqd->sequential_samples is not - * incremented for the first sample. - */ - weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; - - /* - * Second step: further refine the weight as a function of the - * duration of the observation interval. - */ - weight = min_t(u32, 8, - div_u64(weight * bfqd->delta_from_first, - BFQ_RATE_REF_INTERVAL)); - - /* - * Divisor ranging from 10, for minimum weight, to 2, for - * maximum weight. - */ - divisor = 10 - weight; - BUG_ON(divisor == 0); - - /* - * Finally, update peak rate: - * - * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor - */ - bfqd->peak_rate *= divisor-1; - bfqd->peak_rate /= divisor; - rate /= divisor; /* smoothing constant alpha = 1/divisor */ - - bfq_log(bfqd, - "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", - divisor, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), - (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); - - BUG_ON(bfqd->peak_rate == 0); - BUG_ON(bfqd->peak_rate > 20<peak_rate += rate; - update_thr_responsiveness_params(bfqd); - BUG_ON(bfqd->peak_rate > 20<peak_rate_samples == 0) { /* first dispatch */ - bfq_log(bfqd, - "update_peak_rate: goto reset, samples %d", - bfqd->peak_rate_samples) ; - bfq_reset_rate_computation(bfqd, rq); - goto update_last_values; /* will add one sample */ - } - - /* - * Device idle for very long: the observation interval lasting - * up to this dispatch cannot be a valid observation interval - * for computing a new peak rate (similarly to the late- - * completion event in bfq_completed_request()). 
Go to - * update_rate_and_reset to have the following three steps - * taken: - * - close the observation interval at the last (previous) - * request dispatch or completion - * - compute rate, if possible, for that observation interval - * - start a new observation interval with this dispatch - */ - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) { - bfq_log(bfqd, -"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", - (now_ns - bfqd->last_dispatch)>>10, - bfqd->peak_rate_samples) ; - goto update_rate_and_reset; - } - - /* Update sampling information */ - bfqd->peak_rate_samples++; - - if ((bfqd->rq_in_driver > 0 || - now_ns - bfqd->last_completion < BFQ_MIN_TT) - && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) - bfqd->sequential_samples++; - - bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); - - /* Reset max observed rq size every 32 dispatches */ - if (likely(bfqd->peak_rate_samples % 32)) - bfqd->last_rq_max_size = - max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); - else - bfqd->last_rq_max_size = blk_rq_sectors(rq); - - bfqd->delta_from_first = now_ns - bfqd->first_dispatch; - - bfq_log(bfqd, - "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched, - bfqd->delta_from_first>>10); - - /* Target observation interval not yet reached, go on sampling */ - if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) - goto update_last_values; - -update_rate_and_reset: - bfq_update_rate_reset(bfqd, rq); -update_last_values: - bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); - bfqd->last_dispatch = now_ns; - - bfq_log(bfqd, - "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", - (now_ns - bfqd->first_dispatch)>>10, - (unsigned long long) bfqd->last_position, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - bfq_log(bfqd, - "update_peak_rate: samples at end %d", 
bfqd->peak_rate_samples); -} - -/* - * Move request from internal lists to the dispatch list of the request queue - */ -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - /* - * For consistency, the next instruction should have been executed - * after removing the request from the queue and dispatching it. - * We execute instead this instruction before bfq_remove_request() - * (and hence introduce a temporary inconsistency), for efficiency. - * In fact, in a forced_dispatch, this prevents two counters related - * to bfqq->dispatched to risk to be uselessly decremented if bfqq - * is not in service, and then to be incremented again after - * incrementing bfqq->dispatched. - */ - bfqq->dispatched++; - bfq_update_peak_rate(q->elevator->elevator_data, rq); - - bfq_remove_request(rq); - elv_dispatch_sort(q, rq); -} - -static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfqq != bfqd->in_service_queue); - - /* - * If this bfqq is shared between multiple processes, check - * to make sure that those processes are still issuing I/Os - * within the mean seek distance. If not, it may be time to - * break the queues apart again. - */ - if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) - bfq_mark_bfqq_split_coop(bfqq); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - if (bfqq->dispatched == 0) - /* - * Overloading budget_timeout field to store - * the time at which the queue remains with no - * backlog and no outstanding request; used by - * the weight-raising mechanism. - */ - bfqq->budget_timeout = jiffies; - - bfq_del_bfqq_busy(bfqd, bfqq, true); - } else { - bfq_requeue_bfqq(bfqd, bfqq); - /* - * Resort priority tree of potential close cooperators. - */ - bfq_pos_tree_add_move(bfqd, bfqq); - } - - /* - * All in-service entities must have been properly deactivated - * or requeued before executing the next function, which - * resets all in-service entites as no more in service. 
- */ - __bfq_bfqd_reset_in_service(bfqd); -} - -/** - * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. - * @bfqd: device data. - * @bfqq: queue to update. - * @reason: reason for expiration. - * - * Handle the feedback on @bfqq budget at queue expiration. - * See the body for detailed comments. - */ -static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - enum bfqq_expiration reason) -{ - struct request *next_rq; - int budget, min_budget; - - BUG_ON(bfqq != bfqd->in_service_queue); - - min_budget = bfq_min_budget(bfqd); - - if (bfqq->wr_coeff == 1) - budget = bfqq->max_budget; - else /* - * Use a constant, low budget for weight-raised queues, - * to help achieve a low latency. Keep it slightly higher - * than the minimum possible budget, to cause a little - * bit fewer expirations. - */ - budget = 2 * min_budget; - - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", - budget, bfq_min_budget(bfqd)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - - if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { - switch (reason) { - /* - * Caveat: in all the following cases we trade latency - * for throughput. - */ - case BFQ_BFQQ_TOO_IDLE: - /* - * This is the only case where we may reduce - * the budget: if there is no request of the - * process still waiting for completion, then - * we assume (tentatively) that the timer has - * expired because the batch of requests of - * the process could have been served with a - * smaller budget. Hence, betting that - * process will behave in the same way when it - * becomes backlogged again, we reduce its - * next budget. As long as we guess right, - * this budget cut reduces the latency - * experienced by the process. 
- * - * However, if there are still outstanding - * requests, then the process may have not yet - * issued its next request just because it is - * still waiting for the completion of some of - * the still outstanding ones. So in this - * subcase we do not reduce its budget, on the - * contrary we increase it to possibly boost - * the throughput, as discussed in the - * comments to the BUDGET_TIMEOUT case. - */ - if (bfqq->dispatched > 0) /* still outstanding reqs */ - budget = min(budget * 2, bfqd->bfq_max_budget); - else { - if (budget > 5 * min_budget) - budget -= 4 * min_budget; - else - budget = min_budget; - } - break; - case BFQ_BFQQ_BUDGET_TIMEOUT: - /* - * We double the budget here because it gives - * the chance to boost the throughput if this - * is not a seeky process (and has bumped into - * this timeout because of, e.g., ZBR). - */ - budget = min(budget * 2, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_BUDGET_EXHAUSTED: - /* - * The process still has backlog, and did not - * let either the budget timeout or the disk - * idling timeout expire. Hence it is not - * seeky, has a short thinktime and may be - * happy with a higher budget too. So - * definitely increase the budget of this good - * candidate to boost the disk throughput. - */ - budget = min(budget * 4, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_NO_MORE_REQUESTS: - /* - * For queues that expire for this reason, it - * is particularly important to keep the - * budget close to the actual service they - * need. Doing so reduces the timestamp - * misalignment problem described in the - * comments in the body of - * __bfq_activate_entity. In fact, suppose - * that a queue systematically expires for - * BFQ_BFQQ_NO_MORE_REQUESTS and presents a - * new request in time to enjoy timestamp - * back-shifting. 
The larger the budget of the - * queue is with respect to the service the - * queue actually requests in each service - * slot, the more times the queue can be - * reactivated with the same virtual finish - * time. It follows that, even if this finish - * time is pushed to the system virtual time - * to reduce the consequent timestamp - * misalignment, the queue unjustly enjoys for - * many re-activations a lower finish time - * than all newly activated queues. - * - * The service needed by bfqq is measured - * quite precisely by bfqq->entity.service. - * Since bfqq does not enjoy device idling, - * bfqq->entity.service is equal to the number - * of sectors that the process associated with - * bfqq requested to read/write before waiting - * for request completions, or blocking for - * other reasons. - */ - budget = max_t(int, bfqq->entity.service, min_budget); - break; - default: - return; - } - } else if (!bfq_bfqq_sync(bfqq)) - /* - * Async queues get always the maximum possible - * budget, as for them we do not care about latency - * (in addition, their ability to dispatch is limited - * by the charging factor). - */ - budget = bfqd->bfq_max_budget; - - bfqq->max_budget = budget; - - if (bfqd->budgets_assigned >= bfq_stats_min_budgets && - !bfqd->bfq_user_max_budget) - bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); - - /* - * If there is still backlog, then assign a new budget, making - * sure that it is large enough for the next request. Since - * the finish time of bfqq must be kept in sync with the - * budget, be sure to call __bfq_bfqq_expire() *after* this - * update. - * - * If there is no backlog, then no need to update the budget; - * it will be updated on the arrival of a new request. 
- */ - next_rq = bfqq->next_rq; - if (next_rq) { - BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || - reason == BFQ_BFQQ_NO_MORE_REQUESTS); - bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - BUG_ON(!bfq_bfqq_busy(bfqq)); - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - } - - bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", - next_rq ? blk_rq_sectors(next_rq) : 0, - bfqq->entity.budget); -} - -/* - * Return true if the process associated with bfqq is "slow". The slow - * flag is used, in addition to the budget timeout, to reduce the - * amount of service provided to seeky processes, and thus reduce - * their chances to lower the throughput. More details in the comments - * on the function bfq_bfqq_expire(). - * - * An important observation is in order: as discussed in the comments - * on the function bfq_update_peak_rate(), with devices with internal - * queues, it is hard if ever possible to know when and for how long - * an I/O request is processed by the device (apart from the trivial - * I/O pattern where a new request is dispatched only after the - * previous one has been completed). This makes it hard to evaluate - * the real rate at which the I/O requests of each bfq_queue are - * served. In fact, for an I/O scheduler like BFQ, serving a - * bfq_queue means just dispatching its requests during its service - * slot (i.e., until the budget of the queue is exhausted, or the - * queue remains idle, or, finally, a timeout fires). But, during the - * service slot of a bfq_queue, around 100 ms at most, the device may - * be even still processing requests of bfq_queues served in previous - * service slots. On the opposite end, the requests of the in-service - * bfq_queue may be completed after the service slot of the queue - * finishes. 
- * - * Anyway, unless more sophisticated solutions are used - * (where possible), the sum of the sizes of the requests dispatched - * during the service slot of a bfq_queue is probably the only - * approximation available for the service received by the bfq_queue - * during its service slot. And this sum is the quantity used in this - * function to evaluate the I/O speed of a process. - */ -static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason, - unsigned long *delta_ms) -{ - ktime_t delta_ktime; - u32 delta_usecs; - bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ - - if (!bfq_bfqq_sync(bfqq)) - return false; - - if (compensate) - delta_ktime = bfqd->last_idling_start; - else - delta_ktime = ktime_get(); - delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); - delta_usecs = ktime_to_us(delta_ktime); - - /* don't use too short time intervals */ - if (delta_usecs < 1000) { - if (blk_queue_nonrot(bfqd->queue)) - /* - * give same worst-case guarantees as idling - * for seeky - */ - *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; - else /* charge at least one seek */ - *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; - - bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); - - return slow; - } - - *delta_ms = delta_usecs / USEC_PER_MSEC; - - /* - * Use only long (> 20ms) intervals to filter out excessive - * spikes in service rate estimation. - */ - if (delta_usecs > 20000) { - /* - * Caveat for rotational devices: processes doing I/O - * in the slower disk zones tend to be slow(er) even - * if not seeky. In this respect, the estimated peak - * rate is likely to be an average over the disk - * surface. Accordingly, to not be too harsh with - * unlucky processes, a process is deemed slow only if - * its rate has been lower than half of the estimated - * peak rate. 
- */ - slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; - bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", - bfqq->entity.service, bfqd->bfq_max_budget); - } - - bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); - - return slow; -} - -/* - * To be deemed as soft real-time, an application must meet two - * requirements. First, the application must not require an average - * bandwidth higher than the approximate bandwidth required to playback or - * record a compressed high-definition video. - * The next function is invoked on the completion of the last request of a - * batch, to compute the next-start time instant, soft_rt_next_start, such - * that, if the next request of the application does not arrive before - * soft_rt_next_start, then the above requirement on the bandwidth is met. - * - * The second requirement is that the request pattern of the application is - * isochronous, i.e., that, after issuing a request or a batch of requests, - * the application stops issuing new requests until all its pending requests - * have been completed. After that, the application may issue a new batch, - * and so on. - * For this reason the next function is invoked to compute - * soft_rt_next_start only for applications that meet this requirement, - * whereas soft_rt_next_start is set to infinity for applications that do - * not. - * - * Unfortunately, even a greedy application may happen to behave in an - * isochronous way if the CPU load is high. In fact, the application may - * stop issuing requests while the CPUs are busy serving other processes, - * then restart, then stop again for a while, and so on. In addition, if - * the disk achieves a low enough throughput with the request pattern - * issued by the application (e.g., because the request pattern is random - * and/or the device is slow), then the application may meet the above - * bandwidth requirement too. 
To prevent such a greedy application to be - * deemed as soft real-time, a further rule is used in the computation of - * soft_rt_next_start: soft_rt_next_start must be higher than the current - * time plus the maximum time for which the arrival of a request is waited - * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. - * This filters out greedy applications, as the latter issue instead their - * next request as soon as possible after the last one has been completed - * (in contrast, when a batch of requests is completed, a soft real-time - * application spends some time processing data). - * - * Unfortunately, the last filter may easily generate false positives if - * only bfqd->bfq_slice_idle is used as a reference time interval and one - * or both the following cases occur: - * 1) HZ is so low that the duration of a jiffy is comparable to or higher - * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with - * HZ=100. - * 2) jiffies, instead of increasing at a constant rate, may stop increasing - * for a while, then suddenly 'jump' by several units to recover the lost - * increments. This seems to happen, e.g., inside virtual machines. - * To address this issue, we do not use as a reference time interval just - * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In - * particular we add the minimum number of jiffies for which the filter - * seems to be quite precise also in embedded systems and KVM/QEMU virtual - * machines. 
- */ -static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqd, bfqq, -"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", - bfqq->service_from_backlogged, - bfqd->bfq_wr_max_softrt_rate, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate)); - - return max(bfqq->last_idle_bklogged + - HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -} - -/* - * Return the farthest future time instant according to jiffies - * macros. - */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; -} - -/* - * Return the farthest past time instant according to jiffies - * macros. - */ -static unsigned long bfq_smallest_from_now(void) -{ - return jiffies - MAX_JIFFY_OFFSET; -} - -/** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. - * @bfqq: the queue to expire. - * @compensate: if true, compensate for the time spent idling. - * @reason: the reason causing the expiration. - * - * If the process associated with bfqq does slow I/O (e.g., because it - * issues random requests), we charge bfqq with the time it has been - * in service instead of the service it has received (see - * bfq_bfqq_charge_time for details on how this goal is achieved). As - * a consequence, bfqq will typically get higher timestamps upon - * reactivation, and hence it will be rescheduled as if it had - * received more service than what it has actually received. In the - * end, bfqq receives less service in proportion to how slowly its - * associated process consumes its budgets (and hence how seriously it - * tends to lower the throughput). In addition, this time-charging - * strategy guarantees time fairness among slow processes. In - * contrast, if the process associated with bfqq is not slow, we - * charge bfqq exactly with the service it has received. 
- * - * Charging time to the first type of queues and the exact service to - * the other has the effect of using the WF2Q+ policy to schedule the - * former on a timeslice basis, without violating service domain - * guarantees among the latter. - */ -static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - bool compensate, - enum bfqq_expiration reason) -{ - bool slow; - unsigned long delta = 0; - struct bfq_entity *entity = &bfqq->entity; - int ref; - - BUG_ON(bfqq != bfqd->in_service_queue); - - /* - * Check whether the process is slow (see bfq_bfqq_is_slow). - */ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - - /* - * Increase service_from_backlogged before next statement, - * because the possible next invocation of - * bfq_bfqq_charge_time would likely inflate - * entity->service. In contrast, service_from_backlogged must - * contain real service, to enable the soft real-time - * heuristic to correctly compute the bandwidth consumed by - * bfqq. - */ - bfqq->service_from_backlogged += entity->service; - - /* - * As above explained, charge slow (typically seeky) and - * timed-out queues with the time and not the service - * received, to favor sequential workloads. - * - * Processes doing I/O in the slower disk zones will tend to - * be slow(er) even if not seeky. Therefore, since the - * estimated peak rate is actually an average over the disk - * surface, these processes may timeout just for bad luck. To - * avoid punishing them, do not charge time to processes that - * succeeded in consuming at least 2/3 of their budget. This - * allows BFQ to preserve enough elasticity to still perform - * bandwidth, and not time, distribution with little unlucky - * or quasi-sequential processes. 
- */ - if (bfqq->wr_coeff == 1 && - (slow || - (reason == BFQ_BFQQ_BUDGET_TIMEOUT && - bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) - bfq_bfqq_charge_time(bfqd, bfqq, delta); - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - if (reason == BFQ_BFQQ_TOO_IDLE && - entity->service <= 2 * entity->budget / 10) - bfq_clear_bfqq_IO_bound(bfqq); - - if (bfqd->low_latency && bfqq->wr_coeff == 1) - bfqq->last_wr_start_finish = jiffies; - - if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && - RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* - * If we get here, and there are no outstanding - * requests, then the request pattern is isochronous - * (see the comments on the function - * bfq_bfqq_softrt_next_start()). Thus we can compute - * soft_rt_next_start. If, instead, the queue still - * has outstanding requests, then we have to wait for - * the completion of all the outstanding requests to - * discover whether the request pattern is actually - * isochronous. - */ - BUG_ON(bfqd->busy_queues < 1); - if (bfqq->dispatched == 0) { - bfqq->soft_rt_next_start = - bfq_bfqq_softrt_next_start(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", - bfqq->soft_rt_next_start); - } else { - /* - * The application is still waiting for the - * completion of one or more requests: - * prevent it from possibly being incorrectly - * deemed as soft real-time by setting its - * soft_rt_next_start to infinity. In fact, - * without this assignment, the application - * would be incorrectly deemed as soft - * real-time if: - * 1) it issued a new request before the - * completion of all its in-flight - * requests, and - * 2) at that time, its soft_rt_next_start - * happened to be in the past. - */ - bfqq->soft_rt_next_start = - bfq_greatest_from_now(); - /* - * Schedule an update of soft_rt_next_start to when - * the task may be discovered to be isochronous. 
- */ - bfq_mark_bfqq_softrt_update(bfqq); - } - } - - bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", - reason, slow, bfqq->dispatched, - bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* - * Increase, decrease or leave budget unchanged according to - * reason. - */ - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); - ref = bfqq->ref; - __bfq_bfqq_expire(bfqd, bfqq); - - BUG_ON(ref > 1 && - !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && - !bfq_class_idle(bfqq)); - - /* mark bfqq as waiting a request only if a bic still points to it */ - if (ref > 1 && !bfq_bfqq_busy(bfqq) && - reason != BFQ_BFQQ_BUDGET_TIMEOUT && - reason != BFQ_BFQQ_BUDGET_EXHAUSTED) - bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -} - -/* - * Budget timeout is not implemented through a dedicated timer, but - * just checked on request arrivals and completions, as well as on - * idle timer expirations. - */ -static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -{ - return time_is_before_eq_jiffies(bfqq->budget_timeout); -} - -/* - * If we expire a queue that is actively waiting (i.e., with the - * device idled) for the arrival of a new request, then we may incur - * the timestamp misalignment problem described in the body of the - * function __bfq_activate_entity. Hence we return true only if this - * condition does not hold, or if the queue is slow enough to deserve - * only to be kicked off for preserving a high throughput. 
- */ -static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqq->bfqd, bfqq, - "may_budget_timeout: wait_request %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); - - return (!bfq_bfqq_wait_request(bfqq) || - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) - && - bfq_bfqq_budget_timeout(bfqq); -} - -/* - * For a queue that becomes empty, device idling is allowed only if - * this function returns true for that queue. As a consequence, since - * device idling plays a critical role for both throughput boosting - * and service guarantees, the return value of this function plays a - * critical role as well. - * - * In a nutshell, this function returns true only if idling is - * beneficial for throughput or, even if detrimental for throughput, - * idling is however necessary to preserve service guarantees (low - * latency, desired throughput distribution, ...). In particular, on - * NCQ-capable devices, this function tries to return false, so as to - * help keep the drives' internal queues full, whenever this helps the - * device boost the throughput without causing any service-guarantee - * issue. - * - * In more detail, the return value of this function is obtained by, - * first, computing a number of boolean variables that take into - * account throughput and service-guarantee issues, and, then, - * combining these variables in a logical expression. Most of the - * issues taken into account are not trivial. We discuss these issues - * while introducing the variables. 
- */ -static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -{ - struct bfq_data *bfqd = bfqq->bfqd; - bool rot_without_queueing = - !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, - bfqq_sequential_and_IO_bound, - idling_boosts_thr, idling_boosts_thr_without_issues, - idling_needed_for_service_guarantees, - asymmetric_scenario; - - if (bfqd->strict_guarantees) - return true; - - /* - * Idling is performed only if slice_idle > 0. In addition, we - * do not idle if - * (a) bfqq is async - * (b) bfqq is in the idle io prio class: in this case we do - * not idle because we want to minimize the bandwidth that - * queues in this class can steal to higher-priority queues - */ - if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || - bfq_class_idle(bfqq)) - return false; - - bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && - bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); - /* - * The next variable takes into account the cases where idling - * boosts the throughput. - * - * The value of the variable is computed considering, first, that - * idling is virtually always beneficial for the throughput if: - * (a) the device is not NCQ-capable and rotational, or - * (b) regardless of the presence of NCQ, the device is rotational and - * the request pattern for bfqq is I/O-bound and sequential, or - * (c) regardless of whether it is rotational, the device is - * not NCQ-capable and the request pattern for bfqq is - * I/O-bound and sequential. - * - * Secondly, and in contrast to the above item (b), idling an - * NCQ-capable flash-based device would not boost the - * throughput even with sequential I/O; rather it would lower - * the throughput in proportion to how fast the device - * is. Accordingly, the next variable is true if any of the - * above conditions (a), (b) or (c) is true, and, in - * particular, happens to be false if bfqd is an NCQ-capable - * flash-based device. 
- */ - idling_boosts_thr = rot_without_queueing || - ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && - bfqq_sequential_and_IO_bound); - - /* - * The value of the next variable, - * idling_boosts_thr_without_issues, is equal to that of - * idling_boosts_thr, unless a special case holds. In this - * special case, described below, idling may cause problems to - * weight-raised queues. - * - * When the request pool is saturated (e.g., in the presence - * of write hogs), if the processes associated with - * non-weight-raised queues ask for requests at a lower rate, - * then processes associated with weight-raised queues have a - * higher probability to get a request from the pool - * immediately (or at least soon) when they need one. Thus - * they have a higher probability to actually get a fraction - * of the device throughput proportional to their high - * weight. This is especially true with NCQ-capable drives, - * which enqueue several requests in advance, and further - * reorder internally-queued requests. - * - * For this reason, we force to false the value of - * idling_boosts_thr_without_issues if there are weight-raised - * busy queues. In this case, and if bfqq is not weight-raised, - * this guarantees that the device is not idled for bfqq (if, - * instead, bfqq is weight-raised, then idling will be - * guaranteed by another variable, see below). Combined with - * the timestamping rules of BFQ (see [1] for details), this - * behavior causes bfqq, and hence any sync non-weight-raised - * queue, to get a lower number of requests served, and thus - * to ask for a lower number of requests from the request - * pool, before the busy weight-raised queues get served - * again. This often mitigates starvation problems in the - * presence of heavy write workloads and NCQ, thereby - * guaranteeing a higher application and system responsiveness - * in these hostile scenarios. 
- */ - idling_boosts_thr_without_issues = idling_boosts_thr && - bfqd->wr_busy_queues == 0; - - /* - * There is then a case where idling must be performed not - * for throughput concerns, but to preserve service - * guarantees. - * - * To introduce this case, we can note that allowing the drive - * to enqueue more than one request at a time, and hence - * delegating de facto final scheduling decisions to the - * drive's internal scheduler, entails loss of control on the - * actual request service order. In particular, the critical - * situation is when requests from different processes happen - * to be present, at the same time, in the internal queue(s) - * of the drive. In such a situation, the drive, by deciding - * the service order of the internally-queued requests, does - * determine also the actual throughput distribution among - * these processes. But the drive typically has no notion or - * concern about per-process throughput distribution, and - * makes its decisions only on a per-request basis. Therefore, - * the service distribution enforced by the drive's internal - * scheduler is likely to coincide with the desired - * device-throughput distribution only in a completely - * symmetric scenario where: - * (i) each of these processes must get the same throughput as - * the others; - * (ii) all these processes have the same I/O pattern - * (either sequential or random). - * In fact, in such a scenario, the drive will tend to treat - * the requests of each of these processes in about the same - * way as the requests of the others, and thus to provide - * each of these processes with about the same throughput - * (which is exactly the desired throughput distribution). In - * contrast, in any asymmetric scenario, device idling is - * certainly needed to guarantee that bfqq receives its - * assigned fraction of the device throughput (see [1] for - * details). 
- * - * We address this issue by controlling, actually, only the - * symmetry sub-condition (i), i.e., provided that - * sub-condition (i) holds, idling is not performed, - * regardless of whether sub-condition (ii) holds. In other - * words, only if sub-condition (i) holds, then idling is - * allowed, and the device tends to be prevented from queueing - * many requests, possibly of several processes. The reason - * for not controlling also sub-condition (ii) is that we - * exploit preemption to preserve guarantees in case of - * symmetric scenarios, even if (ii) does not hold, as - * explained in the next two paragraphs. - * - * Even if a queue, say Q, is expired when it remains idle, Q - * can still preempt the new in-service queue if the next - * request of Q arrives soon (see the comments on - * bfq_bfqq_update_budg_for_activation). If all queues and - * groups have the same weight, this form of preemption, - * combined with the hole-recovery heuristic described in the - * comments on function bfq_bfqq_update_budg_for_activation, - * are enough to preserve a correct bandwidth distribution in - * the mid term, even without idling. In fact, even if not - * idling allows the internal queues of the device to contain - * many requests, and thus to reorder requests, we can rather - * safely assume that the internal scheduler still preserves a - * minimum of mid-term fairness. The motivation for using - * preemption instead of idling is that, by not idling, - * service guarantees are preserved without minimally - * sacrificing throughput. In other words, both a high - * throughput and its desired distribution are obtained. - * - * More precisely, this preemption-based, idleless approach - * provides fairness in terms of IOPS, and not sectors per - * second. This can be seen with a simple example. 
Suppose - * that there are two queues with the same weight, but that - * the first queue receives requests of 8 sectors, while the - * second queue receives requests of 1024 sectors. In - * addition, suppose that each of the two queues contains at - * most one request at a time, which implies that each queue - * always remains idle after it is served. Finally, after - * remaining idle, each queue receives very quickly a new - * request. It follows that the two queues are served - * alternatively, preempting each other if needed. This - * implies that, although both queues have the same weight, - * the queue with large requests receives a service that is - * 1024/8 times as high as the service received by the other - * queue. - * - * On the other hand, device idling is performed, and thus - * pure sector-domain guarantees are provided, for the - * following queues, which are likely to need stronger - * throughput guarantees: weight-raised queues, and queues - * with a higher weight than other queues. When such queues - * are active, sub-condition (i) is false, which triggers - * device idling. - * - * According to the above considerations, the next variable is - * true (only) if sub-condition (i) holds. To compute the - * value of this variable, we not only use the return value of - * the function bfq_symmetric_scenario(), but also check - * whether bfqq is being weight-raised, because - * bfq_symmetric_scenario() does not take into account also - * weight-raised queues (see comments on - * bfq_weights_tree_add()). 
- * - * As a side note, it is worth considering that the above - * device-idling countermeasures may however fail in the - * following unlucky scenario: if idling is (correctly) - * disabled in a time period during which all symmetry - * sub-conditions hold, and hence the device is allowed to - * enqueue many requests, but at some later point in time some - * sub-condition stops to hold, then it may become impossible - * to let requests be served in the desired order until all - * the requests already queued in the device have been served. - */ - asymmetric_scenario = bfqq->wr_coeff > 1 || - !bfq_symmetric_scenario(bfqd); - - /* - * Finally, there is a case where maximizing throughput is the - * best choice even if it may cause unfairness toward - * bfqq. Such a case is when bfqq became active in a burst of - * queue activations. Queues that became active during a large - * burst benefit only from throughput, as discussed in the - * comments on bfq_handle_burst. Thus, if bfqq became active - * in a burst and not idling the device maximizes throughput, - * then the device must no be idled, because not idling the - * device provides bfqq and all other queues in the burst with - * maximum benefit. Combining this and the above case, we can - * now establish when idling is actually needed to preserve - * service guarantees. - */ - idling_needed_for_service_guarantees = - asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); - - /* - * We have now all the components we need to compute the - * return value of the function, which is true only if idling - * either boosts the throughput (without issues), or is - * necessary to preserve service guarantees. 
- */ - bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); - - bfq_log_bfqq(bfqd, bfqq, - "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", - bfqd->wr_busy_queues, - idling_boosts_thr_without_issues, - bfq_bfqq_IO_bound(bfqq), - idling_needed_for_service_guarantees); - - return idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees; -} - -/* - * If the in-service queue is empty but the function bfq_bfqq_may_idle - * returns true, then: - * 1) the queue must remain in service and cannot be expired, and - * 2) the device must be idled to wait for the possible arrival of a new - * request for the queue. - * See the comments on the function bfq_bfqq_may_idle for the reasons - * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_may_idle itself - * returns true. - */ -static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -{ - return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); -} - -/* - * Select a queue for service. If we have a current queue in service, - * check whether to continue servicing it, or retrieve and set a new one. - */ -static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq; - struct request *next_rq; - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; - - bfqq = bfqd->in_service_queue; - if (!bfqq) - goto new_queue; - - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && - !hrtimer_active(&bfqd->idle_slice_timer) && - !bfq_bfqq_must_idle(bfqq)) - goto expire; - -check_queue: - /* - * This loop is rarely executed more than once. Even when it - * happens, it is much more convenient to re-execute this loop - * than to return NULL and trigger a new dispatch to get a - * request served. 
- */ - next_rq = bfqq->next_rq; - /* - * If bfqq has requests queued and it has enough budget left to - * serve them, keep the queue, otherwise expire it. - */ - if (next_rq) { - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - - if (bfq_serv_to_charge(next_rq, bfqq) > - bfq_bfqq_budget_left(bfqq)) { - /* - * Expire the queue for budget exhaustion, - * which makes sure that the next budget is - * enough to serve the next request, even if - * it comes from the fifo expired path. - */ - reason = BFQ_BFQQ_BUDGET_EXHAUSTED; - goto expire; - } else { - /* - * The idle timer may be pending because we may - * not disable disk idling even when a new request - * arrives. - */ - if (bfq_bfqq_wait_request(bfqq)) { - BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); - /* - * If we get here: 1) at least a new request - * has arrived but we have not disabled the - * timer because the request was too small, - * 2) then the block layer has unplugged - * the device, causing the dispatch to be - * invoked. - * - * Since the device is unplugged, now the - * requests are probably large enough to - * provide a reasonable throughput. - * So we disable idling. - */ - bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); - } - goto keep_queue; - } - } - - /* - * No requests pending. However, if the in-service queue is idling - * for a new request, or has requests waiting for a completion and - * may idle after their completion, then keep it anyway. 
- */ - if (hrtimer_active(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { - bfqq = NULL; - goto keep_queue; - } - - reason = BFQ_BFQQ_NO_MORE_REQUESTS; -expire: - bfq_bfqq_expire(bfqd, bfqq, false, reason); -new_queue: - bfqq = bfq_set_in_service_queue(bfqd); - if (bfqq) { - bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); - goto check_queue; - } -keep_queue: - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); - else - bfq_log(bfqd, "select_queue: no queue returned"); - - return bfqq; -} - -static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ - BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - time_is_after_jiffies(bfqq->last_wr_start_finish)); - - bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, old coeff %u, w %d(%d)", - jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), - jiffies_to_msecs(bfqq->wr_cur_max_time), - bfqq->wr_coeff, - bfqq->entity.weight, bfqq->entity.orig_weight); - - BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != - entity->orig_weight * bfqq->wr_coeff); - if (entity->prio_changed) - bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); - - /* - * If the queue was activated in a burst, or too much - * time has elapsed from the beginning of this - * weight-raising period, then end weight raising. 
- */ - if (bfq_bfqq_in_large_burst(bfqq)) - bfq_bfqq_end_wr(bfqq); - else if (time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time)) { - if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || - time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + - bfq_wr_duration(bfqd))) - bfq_bfqq_end_wr(bfqq); - else { - /* switch back to interactive wr */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqq->last_wr_start_finish = - bfqq->wr_start_at_switch_to_srt; - BUG_ON(time_is_after_jiffies( - bfqq->last_wr_start_finish)); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqd, bfqq, - "back to interactive wr"); - } - } - } - /* - * To improve latency (for this or other queues), immediately - * update weight both if it must be raised and if it must be - * lowered. Since, entity may be on some active tree here, and - * might have a pending change of its ioprio class, invoke - * next function with the last parameter unset (see the - * comments on the function). - */ - if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) - __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), - entity, false); -} - -/* - * Dispatch one request from bfqq, moving it to the request queue - * dispatch list. 
- */ -static int bfq_dispatch_request(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - int dispatched = 0; - struct request *rq = bfqq->next_rq; - unsigned long service_to_charge; - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - BUG_ON(!rq); - service_to_charge = bfq_serv_to_charge(rq, bfqq); - - BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - bfq_bfqq_served(bfqq, service_to_charge); - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - bfq_dispatch_insert(bfqd->queue, rq); - - /* - * If weight raising has to terminate for bfqq, then next - * function causes an immediate update of bfqq's weight, - * without waiting for next activation. As a consequence, on - * expiration, bfqq will be timestamped as if has never been - * weight-raised during this service slot, even if it has - * received part or even most of the service as a - * weight-raised queue. This inflates bfqq's timestamps, which - * is beneficial, as bfqq is then more willing to leave the - * device immediately to possible other weight-raised queues. - */ - bfq_update_wr_data(bfqd, bfqq); - - bfq_log_bfqq(bfqd, bfqq, - "dispatched %u sec req (%llu), budg left %d", - blk_rq_sectors(rq), - (unsigned long long) blk_rq_pos(rq), - bfq_bfqq_budget_left(bfqq)); - - dispatched++; - - if (!bfqd->in_service_bic) { - atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); - bfqd->in_service_bic = RQ_BIC(rq); - } - - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - goto expire; - - return dispatched; - -expire: - bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); - return dispatched; -} - -static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -{ - int dispatched = 0; - - while (bfqq->next_rq) { - bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); - dispatched++; - } - - BUG_ON(!list_empty(&bfqq->fifo)); - return dispatched; -} - -/* - * Drain our current requests. 
- * Used for barriers and when switching io schedulers on-the-fly. - */ -static int bfq_forced_dispatch(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq, *n; - struct bfq_service_tree *st; - int dispatched = 0; - - bfqq = bfqd->in_service_queue; - if (bfqq) - __bfq_bfqq_expire(bfqd, bfqq); - - /* - * Loop through classes, and be careful to leave the scheduler - * in a consistent state, as feedback mechanisms and vtime - * updates cannot be disabled during the process. - */ - list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { - st = bfq_entity_service_tree(&bfqq->entity); - - dispatched += __bfq_forced_dispatch_bfqq(bfqq); - - bfqq->max_budget = bfq_max_budget(bfqd); - bfq_forget_idle(st); - } - - BUG_ON(bfqd->busy_queues != 0); - - return dispatched; -} - -static int bfq_dispatch_requests(struct request_queue *q, int force) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) - return 0; - - if (unlikely(force)) - return bfq_forced_dispatch(bfqd); - - /* - * Force device to serve one request at a time if - * strict_guarantees is true. Forcing this service scheme is - * currently the ONLY way to guarantee that the request - * service order enforced by the scheduler is respected by a - * queueing device. Otherwise the device is free even to make - * some unlucky request wait for as long as the device - * wishes. - * - * Of course, serving one request at at time may cause loss of - * throughput. - */ - if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) - return 0; - - bfqq = bfq_select_queue(bfqd); - if (!bfqq) - return 0; - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - BUG_ON(bfq_bfqq_wait_request(bfqq)); - - if (!bfq_dispatch_request(bfqd, bfqq)) - return 0; - - bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", - bfq_bfqq_sync(bfqq) ? 
"sync" : "async"); - - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); - return 1; -} - -/* - * Task holds one reference to the queue, dropped when task exits. Each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * - * Queue lock must be held here. Recall not to use bfqq after calling - * this function on it. - */ -static void bfq_put_queue(struct bfq_queue *bfqq) -{ -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_group *bfqg = bfqq_group(bfqq); -#endif - - BUG_ON(bfqq->ref <= 0); - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); - bfqq->ref--; - if (bfqq->ref) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); - BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - - if (bfq_bfqq_sync(bfqq)) - /* - * The fact that this queue is being destroyed does not - * invalidate the fact that this queue may have been - * activated during the current burst. As a consequence, - * although the queue does not exist anymore, and hence - * needs to be removed from the burst list if there, - * the burst size has not to be decremented. - */ - hlist_del_init(&bfqq->burst_list_node); - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_put(bfqg); -#endif -} - -static void bfq_put_cooperator(struct bfq_queue *bfqq) -{ - struct bfq_queue *__bfqq, *next; - - /* - * If this queue was scheduled to merge with another queue, be - * sure to drop the reference taken on that queue (and others in - * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. 
- */ - __bfqq = bfqq->new_bfqq; - while (__bfqq) { - if (__bfqq == bfqq) - break; - next = __bfqq->new_bfqq; - bfq_put_queue(__bfqq); - __bfqq = next; - } -} - -static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - if (bfqq == bfqd->in_service_queue) { - __bfq_bfqq_expire(bfqd, bfqq); - bfq_schedule_dispatch(bfqd); - } - - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); - - bfq_put_cooperator(bfqq); - - bfq_put_queue(bfqq); /* release process reference */ -} - -static void bfq_init_icq(struct io_cq *icq) -{ - icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -} - -static void bfq_exit_icq(struct io_cq *icq) -{ - struct bfq_io_cq *bic = icq_to_bic(icq); - struct bfq_data *bfqd = bic_to_bfqd(bic); - - if (bic_to_bfqq(bic, false)) { - bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); - bic_set_bfqq(bic, NULL, false); - } - - if (bic_to_bfqq(bic, true)) { - /* - * If the bic is using a shared queue, put the reference - * taken on the io_context when the bic started using a - * shared bfq_queue. - */ - if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) - put_io_context(icq->ioc); - bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); - bic_set_bfqq(bic, NULL, true); - } -} - -/* - * Update the entity prio values; note that the new values will not - * be used until the next (re)activation. - */ -static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - struct bfq_io_cq *bic) -{ - struct task_struct *tsk = current; - int ioprio_class; - - ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - switch (ioprio_class) { - default: - dev_err(bfqq->bfqd->queue->backing_dev_info->dev, - "bfq: bad prio class %d\n", ioprio_class); - case IOPRIO_CLASS_NONE: - /* - * No prio set, inherit CPU scheduling settings. 
- */ - bfqq->new_ioprio = task_nice_ioprio(tsk); - bfqq->new_ioprio_class = task_nice_ioclass(tsk); - break; - case IOPRIO_CLASS_RT: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->new_ioprio_class = IOPRIO_CLASS_RT; - break; - case IOPRIO_CLASS_BE: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - bfqq->new_ioprio_class = IOPRIO_CLASS_BE; - break; - case IOPRIO_CLASS_IDLE: - bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; - break; - } - - if (bfqq->new_ioprio >= IOPRIO_BE_NR) { - pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", - bfqq->new_ioprio); - BUG(); - } - - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, - "set_next_ioprio_data: bic_class %d prio %d class %d", - ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -} - -static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -{ - struct bfq_data *bfqd = bic_to_bfqd(bic); - struct bfq_queue *bfqq; - unsigned long uninitialized_var(flags); - int ioprio = bic->icq.ioc->ioprio; - - /* - * This condition may trigger on a newly created bic, be sure to - * drop the lock before returning. 
- */ - if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) - return; - - bic->ioprio = ioprio; - - bfqq = bic_to_bfqq(bic, false); - if (bfqq) { - /* release process reference on this queue */ - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); - bic_set_bfqq(bic, bfqq, false); - bfq_log_bfqq(bfqd, bfqq, - "check_ioprio_change: bfqq %p %d", - bfqq, bfqq->ref); - } - - bfqq = bic_to_bfqq(bic, true); - if (bfqq) - bfq_set_next_ioprio_data(bfqq, bic); -} - -static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_io_cq *bic, pid_t pid, int is_sync) -{ - RB_CLEAR_NODE(&bfqq->entity.rb_node); - INIT_LIST_HEAD(&bfqq->fifo); - INIT_HLIST_NODE(&bfqq->burst_list_node); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - - bfqq->ref = 0; - bfqq->bfqd = bfqd; - - if (bic) - bfq_set_next_ioprio_data(bfqq, bic); - - if (is_sync) { - /* - * No need to mark as has_short_ttime if in - * idle_class, because no device idling is performed - * for queues in idle class - */ - if (!bfq_class_idle(bfqq)) - /* tentatively mark as has_short_ttime */ - bfq_mark_bfqq_has_short_ttime(bfqq); - bfq_mark_bfqq_sync(bfqq); - bfq_mark_bfqq_just_created(bfqq); - } else - bfq_clear_bfqq_sync(bfqq); - bfq_mark_bfqq_IO_bound(bfqq); - - /* Tentative initial value to trade off between thr and lat */ - bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; - bfqq->pid = pid; - - bfqq->wr_coeff = 1; - bfqq->last_wr_start_finish = jiffies; - bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); - bfqq->budget_timeout = bfq_smallest_from_now(); - bfqq->split_time = bfq_smallest_from_now(); - - /* - * Set to the value for which bfqq will not be deemed as - * soft rt when it becomes backlogged. 
- */ - bfqq->soft_rt_next_start = bfq_greatest_from_now(); - - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; -} - -static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, - struct bfq_group *bfqg, - int ioprio_class, int ioprio) -{ - switch (ioprio_class) { - case IOPRIO_CLASS_RT: - return &bfqg->async_bfqq[0][ioprio]; - case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; - /* fall through */ - case IOPRIO_CLASS_BE: - return &bfqg->async_bfqq[1][ioprio]; - case IOPRIO_CLASS_IDLE: - return &bfqg->async_idle_bfqq; - default: - BUG(); - } -} - -static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, bool is_sync, - struct bfq_io_cq *bic) -{ - const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - struct bfq_queue **async_bfqq = NULL; - struct bfq_queue *bfqq; - struct bfq_group *bfqg; - - rcu_read_lock(); - - bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); - if (!bfqg) { - bfqq = &bfqd->oom_bfqq; - goto out; - } - - if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); - bfqq = *async_bfqq; - if (bfqq) - goto out; - } - - bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO, - bfqd->queue->node); - - if (bfqq) { - bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); - bfq_init_entity(&bfqq->entity, bfqg); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { - bfqq = &bfqd->oom_bfqq; - bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); - goto out; - } - - /* - * Pin the queue now that it's allocated, scheduler exit will - * prune it. - */ - if (async_bfqq) { - bfqq->ref++; /* - * Extra group reference, w.r.t. sync - * queue. This extra reference is removed - * only if bfqq->bfqg disappears, to - * guarantee that this queue is not freed - * until its group goes away. 
- */ - bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", - bfqq, bfqq->ref); - *async_bfqq = bfqq; - } - -out: - bfqq->ref++; /* get a process reference to this queue */ - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); - rcu_read_unlock(); - return bfqq; -} - -static void bfq_update_io_thinktime(struct bfq_data *bfqd, - struct bfq_io_cq *bic) -{ - struct bfq_ttime *ttime = &bic->ttime; - u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; - - elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); - - ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; - ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); - ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, - ttime->ttime_samples); -} - -static void -bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct request *rq) -{ - bfqq->seek_history <<= 1; - bfqq->seek_history |= - get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && - (!blk_queue_nonrot(bfqd->queue) || - blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); -} - -static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_io_cq *bic) -{ - bool has_short_ttime = true; - - /* - * No need to update has_short_ttime if bfqq is async or in - * idle io prio class, or if bfq_slice_idle is zero, because - * no device idling is performed for bfqq in this case. - */ - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || - bfqd->bfq_slice_idle == 0) - return; - - /* Idle window just restored, statistics are meaningless. */ - if (time_is_after_eq_jiffies(bfqq->split_time + - bfqd->bfq_wr_min_idle_time)) - return; - - /* Think time is infinite if no process is linked to - * bfqq. 
Otherwise check average think time to - * decide whether to mark as has_short_ttime - */ - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || - (bfq_sample_valid(bic->ttime.ttime_samples) && - bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - - bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", - has_short_ttime); - - if (has_short_ttime) - bfq_mark_bfqq_has_short_ttime(bfqq); - else - bfq_clear_bfqq_has_short_ttime(bfqq); -} - -/* - * Called when a new fs request (rq) is added to bfqq. Check if there's - * something we should do about it. - */ -static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct request *rq) -{ - struct bfq_io_cq *bic = RQ_BIC(rq); - - if (rq->cmd_flags & REQ_META) - bfqq->meta_pending++; - - bfq_update_io_thinktime(bfqd, bic); - bfq_update_has_short_ttime(bfqd, bfqq, bic); - bfq_update_io_seektime(bfqd, bfqq, rq); - - bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: has_short_ttime=%d (seeky %d)", - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { - bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && - blk_rq_sectors(rq) < 32; - bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); - - /* - * There is just this request queued: if the request - * is small and the queue is not to be expired, then - * just exit. - * - * In this way, if the device is being idled to wait - * for a new request from the in-service queue, we - * avoid unplugging the device and committing the - * device to serve just a small request. On the - * contrary, we wait for the block layer to decide - * when to unplug the device: hopefully, new requests - * will be merged to this one quickly, then the device - * will be unplugged and larger requests will be - * dispatched. 
- */ - if (small_req && !budget_timeout) - return; - - /* - * A large enough request arrived, or the queue is to - * be expired: in both cases disk idling is to be - * stopped, so clear wait_request flag and reset - * timer. - */ - bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); - - /* - * The queue is not empty, because a new request just - * arrived. Hence we can safely expire the queue, in - * case of budget timeout, without risking that the - * timestamps of the queue are not updated correctly. - * See [1] for more details. - */ - if (budget_timeout) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); - - /* - * Let the request rip immediately, or let a new queue be - * selected if bfqq has just been expired. - */ - __blk_run_queue(bfqd->queue); - } -} - -static void bfq_insert_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - - assert_spin_locked(bfqd->queue->queue_lock); - - /* - * An unplug may trigger a requeue of a request from the device - * driver: make sure we are in process context while trying to - * merge two bfq_queues. - */ - if (!in_interrupt()) { - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); - if (new_bfqq) { - if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) - new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); - /* - * Release the request's reference to the old bfqq - * and make sure one is taken to the shared queue. 
- */ - new_bfqq->allocated[rq_data_dir(rq)]++; - bfqq->allocated[rq_data_dir(rq)]--; - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); - /* - * rq is about to be enqueued into new_bfqq, - * release rq reference on bfqq - */ - bfq_put_queue(bfqq); - rq->elv.priv[1] = new_bfqq; - bfqq = new_bfqq; - } - } - - bfq_add_request(rq); - - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -} - -static void bfq_update_hw_tag(struct bfq_data *bfqd) -{ - bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, - bfqd->rq_in_driver); - - if (bfqd->hw_tag == 1) - return; - - /* - * This sample is valid if the number of outstanding requests - * is large enough to allow a queueing behavior. Note that the - * sum is not exact, as it's not taking into account deactivated - * requests. - */ - if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) - return; - - if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) - return; - - bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; - bfqd->max_rq_in_driver = 0; - bfqd->hw_tag_samples = 0; -} - -static void bfq_completed_request(struct request_queue *q, struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - u64 now_ns; - u32 delta_us; - - bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", - blk_rq_sectors(rq)); - - assert_spin_locked(bfqd->queue->queue_lock); - bfq_update_hw_tag(bfqd); - - BUG_ON(!bfqd->rq_in_driver); - BUG_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; - bfqg_stats_update_completion(bfqq_group(bfqq), - rq_start_time_ns(rq), - rq_io_start_time_ns(rq), req_op(rq), - rq->cmd_flags); - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - /* - * Set budget_timeout (which we 
overload to store the - * time at which the queue remains with no backlog and - * no outstanding request; used by the weight-raising - * mechanism). - */ - bfqq->budget_timeout = jiffies; - - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); - } - - now_ns = ktime_get_ns(); - - RQ_BIC(rq)->ttime.last_end_request = now_ns; - - /* - * Using us instead of ns, to get a reasonable precision in - * computing rate in next check. - */ - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - - bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - (USEC_PER_SEC* - (u64)((bfqd->last_rq_max_size<>BFQ_RATE_SHIFT, - (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); - - /* - * If the request took rather long to complete, and, according - * to the maximum request size recorded, this completion latency - * implies that the request was certainly served at a very low - * rate (less than 1M sectors/sec), then the whole observation - * interval that lasts up to this time instant cannot be a - * valid time interval for computing a new peak rate. Invoke - * bfq_update_rate_reset to have the following three steps - * taken: - * - close the observation interval at the last (previous) - * request dispatch or completion - * - compute rate, if possible, for that observation interval - * - reset to zero samples, which will trigger a proper - * re-initialization of the observation interval on next - * dispatch - */ - if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && - (bfqd->last_rq_max_size<last_completion = now_ns; - - /* - * If we are waiting to discover whether the request pattern - * of the task associated with the queue is actually - * isochronous, and both requisites for this condition to hold - * are now satisfied, then compute soft_rt_next_start (see the - * comments on the function bfq_bfqq_softrt_next_start()). 
We - * schedule this delayed check when bfqq expires, if it still - * has in-flight requests. - */ - if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && - RB_EMPTY_ROOT(&bfqq->sort_list)) - bfqq->soft_rt_next_start = - bfq_bfqq_softrt_next_start(bfqd, bfqq); - - /* - * If this is the in-service queue, check if it needs to be expired, - * or if we want to idle in case it has no pending requests. - */ - if (bfqd->in_service_queue == bfqq) { - if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); - goto out; - } else if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); - else if (RB_EMPTY_ROOT(&bfqq->sort_list) && - (bfqq->dispatched == 0 || - !bfq_bfqq_may_idle(bfqq))) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_NO_MORE_REQUESTS); - } - - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); - -out: - return; -} - -static int __bfq_may_queue(struct bfq_queue *bfqq) -{ - if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { - bfq_clear_bfqq_must_alloc(bfqq); - return ELV_MQUEUE_MUST; - } - - return ELV_MQUEUE_MAY; -} - -static int bfq_may_queue(struct request_queue *q, int op, int op_flags) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; - struct bfq_io_cq *bic; - struct bfq_queue *bfqq; - - /* - * Don't force setup of a queue from here, as a call to may_queue - * does not necessarily imply that a request actually will be - * queued. So just lookup a possibly existing queue, or return - * 'may queue' if that fails. - */ - bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (!bic) - return ELV_MQUEUE_MAY; - - bfqq = bic_to_bfqq(bic, rw_is_sync(op, op_flags)); - if (bfqq) - return __bfq_may_queue(bfqq); - - return ELV_MQUEUE_MAY; -} - -/* - * Queue lock held here. 
- */ -static void bfq_put_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - if (bfqq) { - const int rw = rq_data_dir(rq); - - BUG_ON(!bfqq->allocated[rw]); - bfqq->allocated[rw]--; - - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } -} - -/* - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this - * was the last process referring to that bfqq. - */ -static struct bfq_queue * -bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - - put_io_context(bic->icq.ioc); - - if (bfqq_process_refs(bfqq) == 1) { - bfqq->pid = current->pid; - bfq_clear_bfqq_coop(bfqq); - bfq_clear_bfqq_split_coop(bfqq); - return bfqq; - } - - bic_set_bfqq(bic, NULL, 1); - - bfq_put_cooperator(bfqq); - - bfq_put_queue(bfqq); - return NULL; -} - -/* - * Allocate bfq data structures associated with this request. 
- */ -static int bfq_set_request(struct request_queue *q, struct request *rq, - struct bio *bio, gfp_t gfp_mask) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); - const int rw = rq_data_dir(rq); - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - unsigned long flags; - bool bfqq_already_existing = false, split = false; - - spin_lock_irqsave(q->queue_lock, flags); - - if (!bic) - goto queue_fail; - - bfq_check_ioprio_change(bic, bio); - - bfq_bic_update_cgroup(bic, bio); - -new_queue: - bfqq = bic_to_bfqq(bic, is_sync); - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - if (bfqq) - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { - bfq_log_bfqq(bfqd, bfqq, - "set_request: was_in_list %d " - "was_in_large_burst %d " - "large burst in progress %d", - bic->was_in_burst_list, - bic->saved_in_large_burst, - bfqd->large_burst); - - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) { - bfq_log_bfqq(bfqd, bfqq, - "set_request: marking in " - "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - } else { - bfq_log_bfqq(bfqd, bfqq, - "set_request: clearing in " - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } - bfqq->split_time = jiffies; - } - } else { - /* If the queue was seeky for too long, break it apart. 
*/ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); - - /* Update bic before losing reference to bfqq */ - if (bfq_bfqq_in_large_burst(bfqq)) - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); - split = true; - if (!bfqq) - goto new_queue; - else - bfqq_already_existing = true; - } - } - - bfqq->allocated[rw]++; - bfqq->ref++; - bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; - - /* - * If a bfq_queue has only one process reference, it is owned - * by only one bfq_io_cq: we can set the bic field of the - * bfq_queue to the address of that structure. Also, if the - * queue has just been split, mark a flag so that the - * information is available to the other scheduler hooks. - */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; - if (split) { - /* - * If the queue has just been split from a shared - * queue, restore the idle window and the possible - * weight raising period. - */ - bfq_bfqq_resume_state(bfqq, bfqd, bic, - bfqq_already_existing); - } - } - - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - - spin_unlock_irqrestore(q->queue_lock, flags); - - return 0; - -queue_fail: - bfq_schedule_dispatch(bfqd); - spin_unlock_irqrestore(q->queue_lock, flags); - - return 1; -} - -static void bfq_kick_queue(struct work_struct *work) -{ - struct bfq_data *bfqd = - container_of(work, struct bfq_data, unplug_work); - struct request_queue *q = bfqd->queue; - - spin_lock_irq(q->queue_lock); - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); -} - -/* - * Handler of the expiration of the timer running if the in-service queue - * is idling inside its time slice. 
- */ -static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -{ - struct bfq_data *bfqd = container_of(timer, struct bfq_data, - idle_slice_timer); - struct bfq_queue *bfqq; - unsigned long flags; - enum bfqq_expiration reason; - - spin_lock_irqsave(bfqd->queue->queue_lock, flags); - - bfqq = bfqd->in_service_queue; - /* - * Theoretical race here: the in-service queue can be NULL or - * different from the queue that was idling if the timer handler - * spins on the queue_lock and a new request arrives for the - * current queue and there is a full dispatch cycle that changes - * the in-service queue. This can hardly happen, but in the worst - * case we just expire a queue too early. - */ - if (bfqq) { - bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); - bfq_clear_bfqq_wait_request(bfqq); - - if (bfq_bfqq_budget_timeout(bfqq)) - /* - * Also here the queue can be safely expired - * for budget timeout without wasting - * guarantees - */ - reason = BFQ_BFQQ_BUDGET_TIMEOUT; - else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) - /* - * The queue may not be empty upon timer expiration, - * because we may not disable the timer when the - * first request of the in-service queue arrives - * during disk idling. 
- */ - reason = BFQ_BFQQ_TOO_IDLE; - else - goto schedule_dispatch; - - bfq_bfqq_expire(bfqd, bfqq, true, reason); - } - -schedule_dispatch: - bfq_schedule_dispatch(bfqd); - - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); - return HRTIMER_NORESTART; -} - -static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -{ - hrtimer_cancel(&bfqd->idle_slice_timer); - cancel_work_sync(&bfqd->unplug_work); -} - -static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_queue **bfqq_ptr) -{ - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - - bfq_log(bfqd, "put_async_bfqq: %p", bfqq); - if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, root_group); - bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; - } -} - -/* - * Release all the bfqg references to its async queues. If we are - * deallocating the group these queues may still contain requests, so - * we reparent them to the root cgroup (i.e., the only one that will - * exist for sure until all the requests on a device are gone). 
- */ -static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -{ - int i, j; - - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) - __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); - - __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -} - -static void bfq_exit_queue(struct elevator_queue *e) -{ - struct bfq_data *bfqd = e->elevator_data; - struct request_queue *q = bfqd->queue; - struct bfq_queue *bfqq, *n; - - bfq_shutdown_timer_wq(bfqd); - - spin_lock_irq(q->queue_lock); - - BUG_ON(bfqd->in_service_queue); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); - - spin_unlock_irq(q->queue_lock); - - bfq_shutdown_timer_wq(bfqd); - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); -#else - bfq_put_async_queues(bfqd, bfqd->root_group); - kfree(bfqd->root_group); -#endif - - kfree(bfqd); -} - -static void bfq_init_root_group(struct bfq_group *root_group, - struct bfq_data *bfqd) -{ - int i; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -#endif - root_group->rq_pos_tree = RB_ROOT; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - root_group->sched_data.bfq_class_idle_last_service = jiffies; -} - -static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -{ - struct bfq_data *bfqd; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - - bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); - if (!bfqd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } - eq->elevator_data = bfqd; - - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. - * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. 
- */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); - bfqd->oom_bfqq.ref++; - bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; - bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; - bfqd->oom_bfqq.entity.new_weight = - bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); - - /* oom_bfqq does not participate to bursts */ - bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); - /* - * Trigger weight initialization, according to ioprio, at the - * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio - * class won't be changed any more. - */ - bfqd->oom_bfqq.entity.prio_changed = 1; - - bfqd->queue = q; - - spin_lock_irq(q->queue_lock); - q->elevator = eq; - spin_unlock_irq(q->queue_lock); - - bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); - if (!bfqd->root_group) - goto out_free; - bfq_init_root_group(bfqd->root_group, bfqd); - bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; - - bfqd->queue_weights_tree = RB_ROOT; - bfqd->group_weights_tree = RB_ROOT; - - INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); - - INIT_LIST_HEAD(&bfqd->active_list); - INIT_LIST_HEAD(&bfqd->idle_list); - INIT_HLIST_HEAD(&bfqd->burst_list); - - bfqd->hw_tag = -1; - - bfqd->bfq_max_budget = bfq_default_max_budget; - - bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; - bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; - bfqd->bfq_back_max = bfq_back_max; - bfqd->bfq_back_penalty = bfq_back_penalty; - bfqd->bfq_slice_idle = bfq_slice_idle; - bfqd->bfq_timeout = bfq_timeout; - - bfqd->bfq_requests_within_timer = 120; - - bfqd->bfq_large_burst_thresh = 8; - bfqd->bfq_burst_interval = msecs_to_jiffies(180); - - bfqd->low_latency = true; - - /* - * Trade-off between responsiveness and fairness. 
- */ - bfqd->bfq_wr_coeff = 30; - bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_wr_max_time = 0; - bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); - bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); - bfqd->bfq_wr_max_softrt_rate = 7000; /* - * Approximate rate required - * to playback or record a - * high-definition compressed - * video. - */ - bfqd->wr_busy_queues = 0; - - /* - * Begin by assuming, optimistically, that the device is a - * high-speed one, and that its peak rate is equal to 2/3 of - * the highest reference rate. - */ - bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * - T_fast[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; - bfqd->device_speed = BFQ_BFQD_FAST; - - return 0; - -out_free: - kfree(bfqd); - kobject_put(&eq->kobj); - return -ENOMEM; -} - -static void bfq_slab_kill(void) -{ - kmem_cache_destroy(bfq_pool); -} - -static int __init bfq_slab_setup(void) -{ - bfq_pool = KMEM_CACHE(bfq_queue, 0); - if (!bfq_pool) - return -ENOMEM; - return 0; -} - -static ssize_t bfq_var_show(unsigned int var, char *page) -{ - return sprintf(page, "%u\n", var); -} - -static ssize_t bfq_var_store(unsigned long *var, const char *page, - size_t count) -{ - unsigned long new_val; - int ret = kstrtoul(page, 10, &new_val); - - if (ret == 0) - *var = new_val; - - return count; -} - -static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -{ - struct bfq_data *bfqd = e->elevator_data; - - return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
- jiffies_to_msecs(bfqd->bfq_wr_max_time) : - jiffies_to_msecs(bfq_wr_duration(bfqd))); -} - -static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -{ - struct bfq_queue *bfqq; - struct bfq_data *bfqd = e->elevator_data; - ssize_t num_char = 0; - - num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", - bfqd->queued); - - spin_lock_irq(bfqd->queue->queue_lock); - - num_char += sprintf(page + num_char, "Active:\n"); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { - num_char += sprintf(page + num_char, - "pid%d: weight %hu, nr_queued %d %d, ", - bfqq->pid, - bfqq->entity.weight, - bfqq->queued[0], - bfqq->queued[1]); - num_char += sprintf(page + num_char, - "dur %d/%u\n", - jiffies_to_msecs( - jiffies - - bfqq->last_wr_start_finish), - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - - num_char += sprintf(page + num_char, "Idle:\n"); - list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { - num_char += sprintf(page + num_char, - "pid%d: weight %hu, dur %d/%u\n", - bfqq->pid, - bfqq->entity.weight, - jiffies_to_msecs(jiffies - - bfqq->last_wr_start_finish), - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - - spin_unlock_irq(bfqd->queue->queue_lock); - - return num_char; -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - u64 __data = __VAR; \ - if (__CONV == 1) \ - __data = jiffies_to_msecs(__data); \ - else if (__CONV == 2) \ - __data = div_u64(__data, NSEC_PER_MSEC); \ - return bfq_var_show(__data, (page)); \ -} -SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); -SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); -SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); 
-SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, - 1); -SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -#undef SHOW_FUNCTION - -#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - u64 __data = __VAR; \ - __data = div_u64(__data, NSEC_PER_USEC); \ - return bfq_var_show(__data, (page)); \ -} -USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -#undef USEC_SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t \ -__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned long uninitialized_var(__data); \ - int ret = bfq_var_store(&__data, (page), count); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - if (__CONV == 1) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else if (__CONV == 2) \ - *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ - else \ - *(__PTR) = __data; \ - return ret; \ -} -STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, - INT_MAX, 2); -STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, - INT_MAX, 2); -STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, - INT_MAX, 0); -STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -STORE_FUNCTION(bfq_wr_coeff_store, 
&bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, - 1); -STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, - INT_MAX, 1); -STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, - &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, - INT_MAX, 0); -#undef STORE_FUNCTION - -#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned long uninitialized_var(__data); \ - int ret = bfq_var_store(&__data, (page), count); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - *(__PTR) = (u64)__data * NSEC_PER_USEC; \ - return ret; \ -} -USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, - UINT_MAX); -#undef USEC_STORE_FUNCTION - -/* do nothing for the moment */ -static ssize_t bfq_weights_store(struct elevator_queue *e, - const char *page, size_t count) -{ - return count; -} - -static ssize_t bfq_max_budget_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); - - if (__data == 0) - bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); - else { - if (__data > INT_MAX) - __data = INT_MAX; - bfqd->bfq_max_budget = __data; - } - - bfqd->bfq_user_max_budget = __data; - - return ret; -} - -/* - * Leaving this name to preserve name compatibility with cfq - * parameters, but this timeout is used for both sync and async. 
- */ -static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); - - if (__data < 1) - __data = 1; - else if (__data > INT_MAX) - __data = INT_MAX; - - bfqd->bfq_timeout = msecs_to_jiffies(__data); - if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); - - return ret; -} - -static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); - - if (__data > 1) - __data = 1; - if (!bfqd->strict_guarantees && __data == 1 - && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) - bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; - - bfqd->strict_guarantees = __data; - - return ret; -} - -static ssize_t bfq_low_latency_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); - - if (__data > 1) - __data = 1; - if (__data == 0 && bfqd->low_latency != 0) - bfq_end_wr(bfqd); - bfqd->low_latency = __data; - - return ret; -} - -#define BFQ_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) - -static struct elv_fs_entry bfq_attrs[] = { - BFQ_ATTR(fifo_expire_sync), - BFQ_ATTR(fifo_expire_async), - BFQ_ATTR(back_seek_max), - BFQ_ATTR(back_seek_penalty), - BFQ_ATTR(slice_idle), - BFQ_ATTR(slice_idle_us), - BFQ_ATTR(max_budget), - BFQ_ATTR(timeout_sync), - BFQ_ATTR(strict_guarantees), - BFQ_ATTR(low_latency), - BFQ_ATTR(wr_coeff), - BFQ_ATTR(wr_max_time), - BFQ_ATTR(wr_rt_max_time), - BFQ_ATTR(wr_min_idle_time), - BFQ_ATTR(wr_min_inter_arr_async), - BFQ_ATTR(wr_max_softrt_rate), - BFQ_ATTR(weights), - __ATTR_NULL -}; - -static 
struct elevator_type iosched_bfq = { - .ops = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, -#ifdef CONFIG_BFQ_GROUP_IOSCHED - .elevator_bio_merged_fn = bfq_bio_merged, -#endif - .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, - .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, - .elevator_dispatch_fn = bfq_dispatch_requests, - .elevator_add_req_fn = bfq_insert_request, - .elevator_activate_req_fn = bfq_activate_request, - .elevator_deactivate_req_fn = bfq_deactivate_request, - .elevator_completed_req_fn = bfq_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_init_icq_fn = bfq_init_icq, - .elevator_exit_icq_fn = bfq_exit_icq, - .elevator_set_req_fn = bfq_set_request, - .elevator_put_req_fn = bfq_put_request, - .elevator_may_queue_fn = bfq_may_queue, - .elevator_init_fn = bfq_init_queue, - .elevator_exit_fn = bfq_exit_queue, - }, - .icq_size = sizeof(struct bfq_io_cq), - .icq_align = __alignof__(struct bfq_io_cq), - .elevator_attrs = bfq_attrs, - .elevator_name = "bfq", - .elevator_owner = THIS_MODULE, -}; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfq_blkg_files, - .legacy_cftypes = bfq_blkcg_legacy_files, - - .cpd_alloc_fn = bfq_cpd_alloc, - .cpd_init_fn = bfq_cpd_init, - .cpd_bind_fn = bfq_cpd_init, - .cpd_free_fn = bfq_cpd_free, - - .pd_alloc_fn = bfq_pd_alloc, - .pd_init_fn = bfq_pd_init, - .pd_offline_fn = bfq_pd_offline, - .pd_free_fn = bfq_pd_free, - .pd_reset_stats_fn = bfq_pd_reset_stats, -}; -#endif - -static int __init bfq_init(void) -{ - int ret; - char msg[60] = "BFQ I/O-scheduler: v8r12"; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if (ret) - return ret; -#endif - - ret = -ENOMEM; - if (bfq_slab_setup()) - goto err_pol_unreg; - - /* - * Times to load large popular applications for the 
typical - * systems installed on the reference devices (see the - * comments before the definitions of the next two - * arrays). Actually, we use slightly slower values, as the - * estimated peak rate tends to be smaller than the actual - * peak rate. The reason for this last fact is that estimates - * are computed over much shorter time intervals than the long - * intervals typically used for benchmarking. Why? First, to - * adapt more quickly to variations. Second, because an I/O - * scheduler cannot rely on a peak-rate-evaluation workload to - * be run for a long time. - */ - T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ - T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ - T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ - T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ - - /* - * Thresholds that determine the switch between speed classes - * (see the comments before the definition of the array - * device_speed_thresh). These thresholds are biased towards - * transitions to the fast class. This is safer than the - * opposite bias. In fact, a wrong transition to the slow - * class results in short weight-raising periods, because the - * speed of the device then tends to be higher that the - * reference peak rate. On the opposite end, a wrong - * transition to the fast class tends to increase - * weight-raising periods, because of the opposite reason. 
- */ - device_speed_thresh[0] = (4 * R_slow[0]) / 3; - device_speed_thresh[1] = (4 * R_slow[1]) / 3; - - ret = elv_register(&iosched_bfq); - if (ret) - goto err_pol_unreg; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - strcat(msg, " (with cgroups support)"); -#endif - pr_info("%s", msg); - - return 0; - -err_pol_unreg: -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_bfq); -#endif - return ret; -} - -static void __exit bfq_exit(void) -{ - elv_unregister(&iosched_bfq); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_bfq); -#endif - bfq_slab_kill(); -} - -module_init(bfq_init); -module_exit(bfq_exit); - -MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -MODULE_LICENSE("GPL"); diff --git a/block/bfq-sched.c b/block/bfq-sched.c deleted file mode 100644 index be985d9d5f17..000000000000 --- a/block/bfq-sched.c +++ /dev/null @@ -1,2025 +0,0 @@ -/* - * BFQ: Hierarchical B-WF2Q+ scheduler. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2015 Paolo Valente - * - * Copyright (C) 2016 Paolo Valente - */ - -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - -/** - * bfq_gt - compare two timestamps. - * @a: first ts. - * @b: second ts. - * - * Return @a > @b, dealing with wrapping correctly. - */ -static int bfq_gt(u64 a, u64 b) -{ - return (s64)(a - b) > 0; -} - -static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) -{ - struct rb_node *node = tree->rb_node; - - return rb_entry(node, struct bfq_entity, rb_node); -} - -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); - -static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - -/** - * bfq_update_next_in_service - update sd->next_in_service - * @sd: sched_data for which to perform the update. 
- * @new_entity: if not NULL, pointer to the entity whose activation, - * requeueing or repositionig triggered the invocation of - * this function. - * - * This function is called to update sd->next_in_service, which, in - * its turn, may change as a consequence of the insertion or - * extraction of an entity into/from one of the active trees of - * sd. These insertions/extractions occur as a consequence of - * activations/deactivations of entities, with some activations being - * 'true' activations, and other activations being requeueings (i.e., - * implementing the second, requeueing phase of the mechanism used to - * reposition an entity in its active tree; see comments on - * __bfq_activate_entity and __bfq_requeue_entity for details). In - * both the last two activation sub-cases, new_entity points to the - * just activated or requeued entity. - * - * Returns true if sd->next_in_service changes in such a way that - * entity->parent may become the next_in_service for its parent - * entity. - */ -static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *new_entity) -{ - struct bfq_entity *next_in_service = sd->next_in_service; - struct bfq_queue *bfqq; - bool parent_sched_may_change = false; - - /* - * If this update is triggered by the activation, requeueing - * or repositiong of an entity that does not coincide with - * sd->next_in_service, then a full lookup in the active tree - * can be avoided. In fact, it is enough to check whether the - * just-modified entity has a higher priority than - * sd->next_in_service, or, even if it has the same priority - * as sd->next_in_service, is eligible and has a lower virtual - * finish time than sd->next_in_service. If this compound - * condition holds, then the new entity becomes the new - * next_in_service. Otherwise no change is needed. - */ - if (new_entity && new_entity != sd->next_in_service) { - /* - * Flag used to decide whether to replace - * sd->next_in_service with new_entity. 
Tentatively - * set to true, and left as true if - * sd->next_in_service is NULL. - */ - bool replace_next = true; - - /* - * If there is already a next_in_service candidate - * entity, then compare class priorities or timestamps - * to decide whether to replace sd->service_tree with - * new_entity. - */ - if (next_in_service) { - unsigned int new_entity_class_idx = - bfq_class_idx(new_entity); - struct bfq_service_tree *st = - sd->service_tree + new_entity_class_idx; - - /* - * For efficiency, evaluate the most likely - * sub-condition first. - */ - replace_next = - (new_entity_class_idx == - bfq_class_idx(next_in_service) - && - !bfq_gt(new_entity->start, st->vtime) - && - bfq_gt(next_in_service->finish, - new_entity->finish)) - || - new_entity_class_idx < - bfq_class_idx(next_in_service); - } - - if (replace_next) - next_in_service = new_entity; - } else /* invoked because of a deactivation: lookup needed */ - next_in_service = bfq_lookup_next_entity(sd); - - if (next_in_service) { - parent_sched_may_change = !sd->next_in_service || - bfq_update_parent_budget(next_in_service); - } - - sd->next_in_service = next_in_service; - - if (!next_in_service) - return parent_sched_may_change; - - bfqq = bfq_entity_to_bfqq(next_in_service); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_next_in_service: chosen this queue"); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(next_in_service, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "update_next_in_service: chosen this entity"); - } -#endif - return parent_sched_may_change; -} - -#ifdef CONFIG_BFQ_GROUP_IOSCHED -/* both next loops stop at one of the child entities of the root group */ -#define for_each_entity(entity) \ - for (; entity ; entity = entity->parent) - -/* - * For each iteration, compute parent in advance, so as to be safe if - * entity is deallocated during the iteration. 
Such a deallocation may - * happen as a consequence of a bfq_put_queue that frees the bfq_queue - * containing entity. - */ -#define for_each_entity_safe(entity, parent) \ - for (; entity && ({ parent = entity->parent; 1; }); entity = parent) - -/* - * Returns true if this budget changes may let next_in_service->parent - * become the next_in_service entity for its parent entity. - */ -static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -{ - struct bfq_entity *bfqg_entity; - struct bfq_group *bfqg; - struct bfq_sched_data *group_sd; - bool ret = false; - - BUG_ON(!next_in_service); - - group_sd = next_in_service->sched_data; - - bfqg = container_of(group_sd, struct bfq_group, sched_data); - /* - * bfq_group's my_entity field is not NULL only if the group - * is not the root group. We must not touch the root entity - * as it must never become an in-service entity. - */ - bfqg_entity = bfqg->my_entity; - if (bfqg_entity) { - if (bfqg_entity->budget > next_in_service->budget) - ret = true; - bfqg_entity->budget = next_in_service->budget; - } - - return ret; -} - -/* - * This function tells whether entity stops being a candidate for next - * service, according to the restrictive definition of the field - * next_in_service. In particular, this function is invoked for an - * entity that is about to be set in service. - * - * If entity is a queue, then the entity is no longer a candidate for - * next service according to the that definition, because entity is - * about to become the in-service queue. This function then returns - * true if entity is a queue. - * - * In contrast, entity could still be a candidate for next service if - * it is not a queue, and has more than one active child. In fact, - * even if one of its children is about to be set in service, other - * active children may still be the next to serve, for the parent - * entity, even according to the above definition. 
As a consequence, a - * non-queue entity is not a candidate for next-service only if it has - * only one active child. And only if this condition holds, then this - * function returns true for a non-queue entity. - */ -static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -{ - struct bfq_group *bfqg; - - if (bfq_entity_to_bfqq(entity)) - return true; - - bfqg = container_of(entity, struct bfq_group, entity); - - BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); - BUG_ON(bfqg->active_entities == 0); - /* - * The field active_entities does not always contain the - * actual number of active children entities: it happens to - * not account for the in-service entity in case the latter is - * removed from its active tree (which may get done after - * invoking the function bfq_no_longer_next_in_service in - * bfq_get_next_queue). Fortunately, here, i.e., while - * bfq_no_longer_next_in_service is not yet completed in - * bfq_get_next_queue, bfq_active_extract has not yet been - * invoked, and thus active_entities still coincides with the - * actual number of active entities. - */ - if (bfqg->active_entities == 1) - return true; - - return false; -} - -#else /* CONFIG_BFQ_GROUP_IOSCHED */ -#define for_each_entity(entity) \ - for (; entity ; entity = NULL) - -#define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity ; entity = parent) - -static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -{ - return false; -} - -static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -{ - return true; -} - -#endif /* CONFIG_BFQ_GROUP_IOSCHED */ - -/* - * Shift for timestamp calculations. This actually limits the maximum - * service allowed in one timestamp delta (small shift values increase it), - * the maximum total weight that can be used for the queues in the system - * (big shift values increase it), and the period of virtual time - * wraparounds. 
- */ -#define WFQ_SERVICE_SHIFT 22 - -static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = NULL; - - BUG_ON(!entity); - - if (!entity->my_sched_data) - bfqq = container_of(entity, struct bfq_queue, entity); - - return bfqq; -} - - -/** - * bfq_delta - map service into the virtual time domain. - * @service: amount of service. - * @weight: scale factor (weight of an entity or weight sum). - */ -static u64 bfq_delta(unsigned long service, unsigned long weight) -{ - u64 d = (u64)service << WFQ_SERVICE_SHIFT; - - do_div(d, weight); - return d; -} - -/** - * bfq_calc_finish - assign the finish time to an entity. - * @entity: the entity to act upon. - * @service: the service to be charged to the entity. - */ -static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned long long start, finish, delta; - - BUG_ON(entity->weight == 0); - - entity->finish = entity->start + - bfq_delta(service, entity->weight); - - start = ((entity->start>>10)*1000)>>12; - finish = ((entity->finish>>10)*1000)>>12; - delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: start %llu, finish %llu, delta %llu", - start, finish, delta); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "calc_finish group: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "calc_finish group: start %llu, finish %llu, delta %llu", - start, finish, delta); -#endif - } -} - -/** - * bfq_entity_of - get an entity from a node. - * @node: the node field of the entity. - * - * Convert a node pointer to the relative entity. 
This is used only - * to simplify the logic of some functions and not as the generic - * conversion mechanism because, e.g., in the tree walking functions, - * the check for a %NULL value would be redundant. - */ -static struct bfq_entity *bfq_entity_of(struct rb_node *node) -{ - struct bfq_entity *entity = NULL; - - if (node) - entity = rb_entry(node, struct bfq_entity, rb_node); - - return entity; -} - -/** - * bfq_extract - remove an entity from a tree. - * @root: the tree root. - * @entity: the entity to remove. - */ -static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) -{ - BUG_ON(entity->tree != root); - - entity->tree = NULL; - rb_erase(&entity->rb_node, root); -} - -/** - * bfq_idle_extract - extract an entity from the idle tree. - * @st: the service tree of the owning @entity. - * @entity: the entity being removed. - */ -static void bfq_idle_extract(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *next; - - BUG_ON(entity->tree != &st->idle); - - if (entity == st->first_idle) { - next = rb_next(&entity->rb_node); - st->first_idle = bfq_entity_of(next); - } - - if (entity == st->last_idle) { - next = rb_prev(&entity->rb_node); - st->last_idle = bfq_entity_of(next); - } - - bfq_extract(&st->idle, entity); - - if (bfqq) - list_del(&bfqq->bfqq_list); -} - -/** - * bfq_insert - generic tree insertion. - * @root: tree root. - * @entity: entity to insert. - * - * This is used for the idle and the active tree, since they are both - * ordered by finish time. 
- */ -static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -{ - struct bfq_entity *entry; - struct rb_node **node = &root->rb_node; - struct rb_node *parent = NULL; - - BUG_ON(entity->tree); - - while (*node) { - parent = *node; - entry = rb_entry(parent, struct bfq_entity, rb_node); - - if (bfq_gt(entry->finish, entity->finish)) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&entity->rb_node, parent, node); - rb_insert_color(&entity->rb_node, root); - - entity->tree = root; -} - -/** - * bfq_update_min - update the min_start field of a entity. - * @entity: the entity to update. - * @node: one of its children. - * - * This function is called when @entity may store an invalid value for - * min_start due to updates to the active tree. The function assumes - * that the subtree rooted at @node (which may be its left or its right - * child) has a valid min_start value. - */ -static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) -{ - struct bfq_entity *child; - - if (node) { - child = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entity->min_start, child->min_start)) - entity->min_start = child->min_start; - } -} - -/** - * bfq_update_active_node - recalculate min_start. - * @node: the node to update. - * - * @node may have changed position or one of its children may have moved, - * this function updates its min_start value. The left and right subtrees - * are assumed to hold a correct min_start value. 
- */ -static void bfq_update_active_node(struct rb_node *node) -{ - struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - entity->min_start = entity->start; - bfq_update_min(entity, node->rb_right); - bfq_update_min(entity, node->rb_left); - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_active_node: new min_start %llu", - ((entity->min_start>>10)*1000)>>12); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "update_active_node: new min_start %llu", - ((entity->min_start>>10)*1000)>>12); -#endif - } -} - -/** - * bfq_update_active_tree - update min_start for the whole active tree. - * @node: the starting node. - * - * @node must be the deepest modified node after an update. This function - * updates its min_start using the values held by its children, assuming - * that they did not change, and then updates all the nodes that may have - * changed in the path to the root. The only nodes that may have changed - * are the ones in the path or their siblings. - */ -static void bfq_update_active_tree(struct rb_node *node) -{ - struct rb_node *parent; - -up: - bfq_update_active_node(node); - - parent = rb_parent(node); - if (!parent) - return; - - if (node == parent->rb_left && parent->rb_right) - bfq_update_active_node(parent->rb_right); - else if (parent->rb_left) - bfq_update_active_node(parent->rb_left); - - node = parent; - goto up; -} - -static void bfq_weights_tree_add(struct bfq_data *bfqd, - struct bfq_entity *entity, - struct rb_root *root); - -static void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_entity *entity, - struct rb_root *root); - - -/** - * bfq_active_insert - insert an entity in the active tree of its - * group/device. - * @st: the service tree of the entity. - * @entity: the entity being inserted. 
- * - * The active tree is ordered by finish time, but an extra key is kept - * per each node, containing the minimum value for the start times of - * its children (and the node itself), so it's possible to search for - * the eligible node with the lowest finish time in logarithmic time. - */ -static void bfq_active_insert(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif - - bfq_insert(&st->active, entity); - - if (node->rb_left) - node = node->rb_left; - else if (node->rb_right) - node = node->rb_right; - - bfq_update_active_tree(node); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif - if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); - } - if (bfqg != bfqd->root_group) { - BUG_ON(!bfqg); - BUG_ON(!bfqd); - bfqg->active_entities++; - } -#endif -} - -/** - * bfq_ioprio_to_weight - calc a weight from an ioprio. - * @ioprio: the ioprio value to convert. - */ -static unsigned short bfq_ioprio_to_weight(int ioprio) -{ - BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; -} - -/** - * bfq_weight_to_ioprio - calc an ioprio from a weight. - * @weight: the weight value to convert. - * - * To preserve as much as possible the old only-ioprio user interface, - * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. 
- */ -static unsigned short bfq_weight_to_ioprio(int weight) -{ - BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? - 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; -} - -static void bfq_get_entity(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - if (bfqq) { - bfqq->ref++; - bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", - bfqq, bfqq->ref); - } -} - -/** - * bfq_find_deepest - find the deepest node that an extraction can modify. - * @node: the node being removed. - * - * Do the first step of an extraction in an rb tree, looking for the - * node that will replace @node, and returning the deepest node that - * the following modifications to the tree can touch. If @node is the - * last node in the tree return %NULL. - */ -static struct rb_node *bfq_find_deepest(struct rb_node *node) -{ - struct rb_node *deepest; - - if (!node->rb_right && !node->rb_left) - deepest = rb_parent(node); - else if (!node->rb_right) - deepest = node->rb_left; - else if (!node->rb_left) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; -} - -/** - * bfq_active_extract - remove an entity from the active tree. - * @st: the service_tree containing the tree. - * @entity: the entity being removed. 
- */ -static void bfq_active_extract(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif - - node = bfq_find_deepest(&entity->rb_node); - bfq_extract(&st->active, entity); - - if (node) - bfq_update_active_tree(node); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif - if (bfqq) - list_del(&bfqq->bfqq_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_remove(bfqd, entity, - &bfqd->group_weights_tree); - } - if (bfqg != bfqd->root_group) { - BUG_ON(!bfqg); - BUG_ON(!bfqd); - BUG_ON(!bfqg->active_entities); - bfqg->active_entities--; - } -#endif -} - -/** - * bfq_idle_insert - insert an entity into the idle tree. - * @st: the service tree containing the tree. - * @entity: the entity to insert. - */ -static void bfq_idle_insert(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct bfq_entity *first_idle = st->first_idle; - struct bfq_entity *last_idle = st->last_idle; - - if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) - st->first_idle = entity; - if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) - st->last_idle = entity; - - bfq_insert(&st->idle, entity); - - if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -} - -/** - * bfq_forget_entity - do not consider entity any longer for scheduling - * @st: the service tree. - * @entity: the entity being removed. - * @is_in_service: true if entity is currently the in-service entity. - * - * Forget everything about @entity. 
In addition, if entity represents - * a queue, and the latter is not in service, then release the service - * reference to the queue (the one taken through bfq_get_entity). In - * fact, in this case, there is really no more service reference to - * the queue, as the latter is also outside any service tree. If, - * instead, the queue is in service, then __bfq_bfqd_reset_in_service - * will take care of putting the reference when the queue finally - * stops being served. - */ -static void bfq_forget_entity(struct bfq_service_tree *st, - struct bfq_entity *entity, - bool is_in_service) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(!entity->on_st); - - entity->on_st = false; - st->wsum -= entity->weight; - if (bfqq && !is_in_service) { - bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } -} - -/** - * bfq_put_idle_entity - release the idle tree ref of an entity. - * @st: service tree for the entity. - * @entity: the entity being released. - */ -static void bfq_put_idle_entity(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - bfq_idle_extract(st, entity); - bfq_forget_entity(st, entity, - entity == entity->sched_data->in_service_entity); -} - -/** - * bfq_forget_idle - update the idle tree if necessary. - * @st: the service tree to act upon. - * - * To preserve the global O(log N) complexity we only remove one entry here; - * as the idle tree will not grow indefinitely this can be done safely. - */ -static void bfq_forget_idle(struct bfq_service_tree *st) -{ - struct bfq_entity *first_idle = st->first_idle; - struct bfq_entity *last_idle = st->last_idle; - - if (RB_EMPTY_ROOT(&st->active) && last_idle && - !bfq_gt(last_idle->finish, st->vtime)) { - /* - * Forget the whole idle tree, increasing the vtime past - * the last finish time of idle entities. 
- */ - st->vtime = last_idle->finish; - } - - if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) - bfq_put_idle_entity(st, first_idle); -} - -/* - * Update weight and priority of entity. If update_class_too is true, - * then update the ioprio_class of entity too. - * - * The reason why the update of ioprio_class is controlled through the - * last parameter is as follows. Changing the ioprio class of an - * entity implies changing the destination service trees for that - * entity. If such a change occurred when the entity is already on one - * of the service trees for its previous class, then the state of the - * entity would become more complex: none of the new possible service - * trees for the entity, according to bfq_entity_service_tree(), would - * match any of the possible service trees on which the entity - * is. Complex operations involving these trees, such as entity - * activations and deactivations, should take into account this - * additional complexity. To avoid this issue, this function is - * invoked with update_class_too unset in the points in the code where - * entity may happen to be on some tree. 
- */ -static struct bfq_service_tree * -__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - struct bfq_entity *entity, - bool update_class_too) -{ - struct bfq_service_tree *new_st = old_st; - - if (entity->prio_changed) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root *root; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd; - struct bfq_group *bfqg; -#endif - - if (bfqq) - bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - sd = entity->my_sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); - bfqd = (struct bfq_data *)bfqg->bfqd; - BUG_ON(!bfqd); - } -#endif - - BUG_ON(entity->tree && update_class_too); - BUG_ON(old_st->wsum < entity->weight); - old_st->wsum -= entity->weight; - - if (entity->new_weight != entity->orig_weight) { - if (entity->new_weight < BFQ_MIN_WEIGHT || - entity->new_weight > BFQ_MAX_WEIGHT) { - pr_crit("update_weight_prio: new_weight %d\n", - entity->new_weight); - if (entity->new_weight < BFQ_MIN_WEIGHT) - entity->new_weight = BFQ_MIN_WEIGHT; - else - entity->new_weight = BFQ_MAX_WEIGHT; - } - entity->orig_weight = entity->new_weight; - if (bfqq) - bfqq->ioprio = - bfq_weight_to_ioprio(entity->orig_weight); - } - - if (bfqq && update_class_too) - bfqq->ioprio_class = bfqq->new_ioprio_class; - - /* - * Reset prio_changed only if the ioprio_class change - * is not pending any longer. - */ - if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) - entity->prio_changed = 0; - - /* - * NOTE: here we may be changing the weight too early, - * this will cause unfairness. The correct approach - * would have required additional complexity to defer - * weight changes to the proper time instants (i.e., - * when entity->finish <= old_st->vtime). - */ - new_st = bfq_entity_service_tree(entity); - - prev_weight = entity->weight; - new_weight = entity->orig_weight * - (bfqq ? 
bfqq->wr_coeff : 1); - /* - * If the weight of the entity changes, remove the entity - * from its old weight counter (if there is a counter - * associated with the entity), and add it to the counter - * associated with its new weight. - */ - if (prev_weight != new_weight) { - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "weight changed %d %d(%d %d)", - prev_weight, new_weight, - entity->orig_weight, - bfqq->wr_coeff); - - root = bfqq ? &bfqd->queue_weights_tree : - &bfqd->group_weights_tree; - bfq_weights_tree_remove(bfqd, entity, root); - } - entity->weight = new_weight; - /* - * Add the entity to its weights tree only if it is - * not associated with a weight-raised queue. - */ - if (prev_weight != new_weight && - (bfqq ? bfqq->wr_coeff == 1 : 1)) - /* If we get here, root has been initialized. */ - bfq_weights_tree_add(bfqd, entity, root); - - new_st->wsum += entity->weight; - - if (new_st != old_st) { - BUG_ON(!update_class_too); - entity->start = new_st->vtime; - } - } - - return new_st; -} - -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -#endif - -/** - * bfq_bfqq_served - update the scheduler status after selection for - * service. - * @bfqq: the queue being served. - * @served: bytes to transfer. - * - * NOTE: this can be optimized, as the timestamps of upper level entities - * are synchronized every time a new bfqq is selected for service. By now, - * we keep it to better check consistency. 
- */ -static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st; - - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); - - entity->service += served; - - BUG_ON(st->wsum == 0); - - st->vtime += bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -#endif - st = bfq_entity_service_tree(&bfqq->entity); - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", - served, ((st->vtime>>10)*1000)>>12, st); -} - -/** - * bfq_bfqq_charge_time - charge an amount of service equivalent to the length - * of the time interval during which bfqq has been in - * service. - * @bfqd: the device - * @bfqq: the queue that needs a service update. - * @time_ms: the amount of time during which the queue has received service - * - * If a queue does not consume its budget fast enough, then providing - * the queue with service fairness may impair throughput, more or less - * severely. For this reason, queues that consume their budget slowly - * are provided with time fairness instead of service fairness. This - * goal is achieved through the BFQ scheduling engine, even if such an - * engine works in the service, and not in the time domain. The trick - * is charging these queues with an inflated amount of service, equal - * to the amount of service that they would have received during their - * service slot if they had been fast, i.e., if their requests had - * been dispatched at a rate equal to the estimated peak rate. - * - * It is worth noting that time fairness can cause important - * distortions in terms of bandwidth distribution, on devices with - * internal queueing. 
The reason is that I/O requests dispatched - * during the service slot of a queue may be served after that service - * slot is finished, and may have a total processing time loosely - * correlated with the duration of the service slot. This is - * especially true for short service slots. - */ -static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, - unsigned long time_ms) -{ - struct bfq_entity *entity = &bfqq->entity; - int tot_serv_to_charge = entity->service; - unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); - - if (time_ms > 0 && time_ms < timeout_ms) - tot_serv_to_charge = - (bfqd->bfq_max_budget * time_ms) / timeout_ms; - - if (tot_serv_to_charge < entity->service) - tot_serv_to_charge = entity->service; - - bfq_log_bfqq(bfqq->bfqd, bfqq, - "charge_time: %lu/%u ms, %d/%d/%d sectors", - time_ms, timeout_ms, entity->service, - tot_serv_to_charge, entity->budget); - - /* Increase budget to avoid inconsistencies */ - if (tot_serv_to_charge > entity->budget) - entity->budget = tot_serv_to_charge; - - bfq_bfqq_served(bfqq, - max_t(int, 0, tot_serv_to_charge - entity->service)); -} - -static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - struct bfq_service_tree *st, - bool backshifted) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct bfq_sched_data *sd = entity->sched_data; - - /* - * When this function is invoked, entity is not in any service - * tree, then it is safe to invoke next function with the last - * parameter set (see the comments on the function). - */ - BUG_ON(entity->tree); - st = __bfq_entity_update_weight_prio(st, entity, true); - bfq_calc_finish(entity, entity->budget); - - /* - * If some queues enjoy backshifting for a while, then their - * (virtual) finish timestamps may happen to become lower and - * lower than the system virtual time. 
In particular, if - * these queues often happen to be idle for short time - * periods, and during such time periods other queues with - * higher timestamps happen to be busy, then the backshifted - * timestamps of the former queues can become much lower than - * the system virtual time. In fact, to serve the queues with - * higher timestamps while the ones with lower timestamps are - * idle, the system virtual time may be pushed-up to much - * higher values than the finish timestamps of the idle - * queues. As a consequence, the finish timestamps of all new - * or newly activated queues may end up being much larger than - * those of lucky queues with backshifted timestamps. The - * latter queues may then monopolize the device for a lot of - * time. This would simply break service guarantees. - * - * To reduce this problem, push up a little bit the - * backshifted timestamps of the queue associated with this - * entity (only a queue can happen to have the backshifted - * flag set): just enough to let the finish timestamp of the - * queue be equal to the current value of the system virtual - * time. This may introduce a little unfairness among queues - * with backshifted timestamps, but it does not break - * worst-case fairness guarantees. - * - * As a special case, if bfqq is weight-raised, push up - * timestamps much less, to keep very low the probability that - * this push up causes the backshifted finish timestamps of - * weight-raised queues to become higher than the backshifted - * finish timestamps of non weight-raised queues. 
- */ - if (backshifted && bfq_gt(st->vtime, entity->finish)) { - unsigned long delta = st->vtime - entity->finish; - - if (bfqq) - delta /= bfqq->wr_coeff; - - entity->start += delta; - entity->finish += delta; - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: new queue finish %llu", - ((entity->finish>>10)*1000)>>12); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "__activate_entity: new group finish %llu", - ((entity->finish>>10)*1000)>>12); -#endif - } - } - - bfq_active_insert(st, entity); - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: queue %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "__activate_entity: group %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); -#endif - } - BUG_ON(RB_EMPTY_ROOT(&st->active)); - BUG_ON(&st->active != &sd->service_tree->active && - &st->active != &(sd->service_tree+1)->active && - &st->active != &(sd->service_tree+2)->active); -} - -/** - * __bfq_activate_entity - handle activation of entity. - * @entity: the entity being activated. - * @non_blocking_wait_rq: true if entity was waiting for a request - * - * Called for a 'true' activation, i.e., if entity is not active and - * one of its children receives a new request. - * - * Basically, this function updates the timestamps of entity and - * inserts entity into its active tree, ater possibly extracting it - * from its idle tree. 
- */ -static void __bfq_activate_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - bool backshifted = false; - unsigned long long min_vstart; - - BUG_ON(!sd); - BUG_ON(!st); - - /* See comments on bfq_fqq_update_budg_for_activation */ - if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { - backshifted = true; - min_vstart = entity->finish; - } else - min_vstart = st->vtime; - - if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); - BUG_ON(entity->tree); - entity->start = bfq_gt(min_vstart, entity->finish) ? - min_vstart : entity->finish; - } else { - BUG_ON(entity->tree); - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. - */ - entity->start = min_vstart; - st->wsum += entity->weight; - /* - * entity is about to be inserted into a service tree, - * and then set in service: get a reference to make - * sure entity does not disappear until it is no - * longer in service or scheduled for service. - */ - bfq_get_entity(entity); - - BUG_ON(entity->on_st && bfqq); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (entity->on_st && !bfqq) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, - entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, - bfqg, - "activate bug, class %d in_service %p", - bfq_class_idx(entity), sd->in_service_entity); - } -#endif - BUG_ON(entity->on_st && !bfqq); - entity->on_st = true; - } - - bfq_update_fin_time_enqueue(entity, st, backshifted); -} - -/** - * __bfq_requeue_entity - handle requeueing or repositioning of an entity. - * @entity: the entity being requeued or repositioned. 
- * - * Requeueing is needed if this entity stops being served, which - * happens if a leaf descendant entity has expired. On the other hand, - * repositioning is needed if the next_inservice_entity for the child - * entity has changed. See the comments inside the function for - * details. - * - * Basically, this function: 1) removes entity from its active tree if - * present there, 2) updates the timestamps of entity and 3) inserts - * entity back into its active tree (in the new, right position for - * the new values of the timestamps). - */ -static void __bfq_requeue_entity(struct bfq_entity *entity) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - - BUG_ON(!sd); - BUG_ON(!st); - - BUG_ON(entity != sd->in_service_entity && - entity->tree != &st->active); - - if (entity == sd->in_service_entity) { - /* - * We are requeueing the current in-service entity, - * which may have to be done for one of the following - * reasons: - * - entity represents the in-service queue, and the - * in-service queue is being requeued after an - * expiration; - * - entity represents a group, and its budget has - * changed because one of its child entities has - * just been either activated or requeued for some - * reason; the timestamps of the entity need then to - * be updated, and the entity needs to be enqueued - * or repositioned accordingly. - * - * In particular, before requeueing, the start time of - * the entity must be moved forward to account for the - * service that the entity has received while in - * service. This is done by the next instructions. The - * finish time will then be updated according to this - * new value of the start time, and to the budget of - * the entity. 
- */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; - BUG_ON(entity->tree && entity->tree == &st->idle); - BUG_ON(entity->tree && entity->tree != &st->active); - /* - * In addition, if the entity had more than one child - * when set in service, then it was not extracted from - * the active tree. This implies that the position of - * the entity in the active tree may need to be - * changed now, because we have just updated the start - * time of the entity, and we will update its finish - * time in a moment (the requeueing is then, more - * precisely, a repositioning in this case). To - * implement this repositioning, we: 1) dequeue the - * entity here, 2) update the finish time and requeue - * the entity according to the new timestamps below. - */ - if (entity->tree) - bfq_active_extract(st, entity); - } else { /* The entity is already active, and not in service */ - /* - * In this case, this function gets called only if the - * next_in_service entity below this entity has - * changed, and this change has caused the budget of - * this entity to change, which, finally implies that - * the finish time of this entity must be - * updated. Such an update may cause the scheduling, - * i.e., the position in the active tree, of this - * entity to change. We handle this change by: 1) - * dequeueing the entity here, 2) updating the finish - * time and requeueing the entity according to the new - * timestamps below. This is the same approach as the - * non-extracted-entity sub-case above. 
- */ - bfq_active_extract(st, entity); - } - - bfq_update_fin_time_enqueue(entity, st, false); -} - -static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - struct bfq_sched_data *sd, - bool non_blocking_wait_rq) -{ - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - - if (sd->in_service_entity == entity || entity->tree == &st->active) - /* - * in service or already queued on the active tree, - * requeue or reposition - */ - __bfq_requeue_entity(entity); - else - /* - * Not in service and not queued on its active tree: - * the activity is idle and this is a true activation. - */ - __bfq_activate_entity(entity, non_blocking_wait_rq); -} - - -/** - * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, - * and activate, requeue or reposition all ancestors - * for which such an update becomes necessary. - * @entity: the entity to activate. - * @non_blocking_wait_rq: true if this entity was waiting for a request - * @requeue: true if this is a requeue, which implies that bfqq is - * being expired; thus ALL its ancestors stop being served and must - * therefore be requeued - */ -static void bfq_activate_requeue_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq, - bool requeue) -{ - struct bfq_sched_data *sd; - - for_each_entity(entity) { - BUG_ON(!entity); - sd = entity->sched_data; - __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); - - BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && - RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && - RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); - - if (!bfq_update_next_in_service(sd, entity) && !requeue) { - BUG_ON(!sd->next_in_service); - break; - } - BUG_ON(!sd->next_in_service); - } -} - -/** - * __bfq_deactivate_entity - deactivate an entity from its service tree. - * @entity: the entity to deactivate. - * @ins_into_idle_tree: if false, the entity will not be put into the - * idle tree. 
- * - * Deactivates an entity, independently of its previous state. Must - * be invoked only if entity is on a service tree. Extracts the entity - * from that tree, and if necessary and allowed, puts it into the idle - * tree. - */ -static bool __bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st; - bool is_in_service; - - if (!entity->on_st) { /* entity never activated, or already inactive */ - BUG_ON(sd && entity == sd->in_service_entity); - return false; - } - - /* - * If we get here, then entity is active, which implies that - * bfq_group_set_parent has already been invoked for the group - * represented by entity. Therefore, the field - * entity->sched_data has been set, and we can safely use it. - */ - st = bfq_entity_service_tree(entity); - is_in_service = entity == sd->in_service_entity; - - BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); - - if (is_in_service) { - bfq_calc_finish(entity, entity->service); - sd->in_service_entity = NULL; - } - - if (entity->tree == &st->active) - bfq_active_extract(st, entity); - else if (!is_in_service && entity->tree == &st->idle) - bfq_idle_extract(st, entity); - else if (entity->tree) - BUG(); - - if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) - bfq_forget_entity(st, entity, is_in_service); - else - bfq_idle_insert(st, entity); - - return true; -} - -/** - * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. - * @entity: the entity to deactivate. - * @ins_into_idle_tree: true if the entity can be put into the idle tree - */ -static void bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree, - bool expiration) -{ - struct bfq_sched_data *sd; - struct bfq_entity *parent = NULL; - - for_each_entity_safe(entity, parent) { - sd = entity->sched_data; - - BUG_ON(sd == NULL); /* - * It would mean that this is the - * root group. 
- */ - - BUG_ON(expiration && entity != sd->in_service_entity); - - BUG_ON(entity != sd->in_service_entity && - entity->tree == - &bfq_entity_service_tree(entity)->active && - !sd->next_in_service); - - if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { - /* - * entity is not in any tree any more, so - * this deactivation is a no-op, and there is - * nothing to change for upper-level entities - * (in case of expiration, this can never - * happen). - */ - BUG_ON(expiration); /* - * entity cannot be already out of - * any tree - */ - return; - } - - if (sd->next_in_service == entity) - /* - * entity was the next_in_service entity, - * then, since entity has just been - * deactivated, a new one must be found. - */ - bfq_update_next_in_service(sd, NULL); - - if (sd->next_in_service || sd->in_service_entity) { - /* - * The parent entity is still active, because - * either next_in_service or in_service_entity - * is not NULL. So, no further upwards - * deactivation must be performed. Yet, - * next_in_service has changed. Then the - * schedule does need to be updated upwards. - * - * NOTE If in_service_entity is not NULL, then - * next_in_service may happen to be NULL, - * although the parent entity is evidently - * active. This happens if 1) the entity - * pointed by in_service_entity is the only - * active entity in the parent entity, and 2) - * according to the definition of - * next_in_service, the in_service_entity - * cannot be considered as - * next_in_service. See the comments on the - * definition of next_in_service for details. - */ - BUG_ON(sd->next_in_service == entity); - BUG_ON(sd->in_service_entity == entity); - break; - } - - /* - * If we get here, then the parent is no more - * backlogged and we need to propagate the - * deactivation upwards. Thus let the loop go on. 
- */ - - /* - * Also let parent be queued into the idle tree on - * deactivation, to preserve service guarantees, and - * assuming that who invoked this function does not - * need parent entities too to be removed completely. - */ - ins_into_idle_tree = true; - } - - /* - * If the deactivation loop is fully executed, then there are - * no more entities to touch and next loop is not executed at - * all. Otherwise, requeue remaining entities if they are - * about to stop receiving service, or reposition them if this - * is not the case. - */ - entity = parent; - for_each_entity(entity) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - /* - * Invoke __bfq_requeue_entity on entity, even if - * already active, to requeue/reposition it in the - * active tree (because sd->next_in_service has - * changed) - */ - __bfq_requeue_entity(entity); - - sd = entity->sched_data; - BUG_ON(expiration && sd->in_service_entity != entity); - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "invoking udpdate_next for this queue"); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "invoking udpdate_next for this entity"); - } -#endif - if (!bfq_update_next_in_service(sd, entity) && - !expiration) - /* - * next_in_service unchanged or not causing - * any change in entity->parent->sd, and no - * requeueing needed for expiration: stop - * here. - */ - break; - } -} - -/** - * bfq_calc_vtime_jump - compute the value to which the vtime should jump, - * if needed, to have at least one entity eligible. - * @st: the service tree to act upon. - * - * Assumes that st is not empty. 
- */ -static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) -{ - struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); - - if (bfq_gt(root_entity->min_start, st->vtime)) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_vtime_jump: new value %llu", - root_entity->min_start); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(root_entity, struct bfq_group, - entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "calc_vtime_jump: new value %llu", - root_entity->min_start); - } -#endif - return root_entity->min_start; - } - return st->vtime; -} - -static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) -{ - if (new_value > st->vtime) { - st->vtime = new_value; - bfq_forget_idle(st); - } -} - -/** - * bfq_first_active_entity - find the eligible entity with - * the smallest finish time - * @st: the service tree to select from. - * @vtime: the system virtual to use as a reference for eligibility - * - * This function searches the first schedulable entity, starting from the - * root of the tree and going on the left every time on this side there is - * a subtree with at least one eligible (start >= vtime) entity. The path on - * the right is followed only if a) the left subtree contains no eligible - * entities and b) no eligible entity has been found yet. 
- */ -static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, - u64 vtime) -{ - struct bfq_entity *entry, *first = NULL; - struct rb_node *node = st->active.rb_node; - - while (node) { - entry = rb_entry(node, struct bfq_entity, rb_node); -left: - if (!bfq_gt(entry->start, vtime)) - first = entry; - - BUG_ON(bfq_gt(entry->min_start, vtime)); - - if (node->rb_left) { - entry = rb_entry(node->rb_left, - struct bfq_entity, rb_node); - if (!bfq_gt(entry->min_start, vtime)) { - node = node->rb_left; - goto left; - } - } - if (first) - break; - node = node->rb_right; - } - - BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); - return first; -} - -/** - * __bfq_lookup_next_entity - return the first eligible entity in @st. - * @st: the service tree. - * - * If there is no in-service entity for the sched_data st belongs to, - * then return the entity that will be set in service if: - * 1) the parent entity this st belongs to is set in service; - * 2) no entity belonging to such parent entity undergoes a state change - * that would influence the timestamps of the entity (e.g., becomes idle, - * becomes backlogged, changes its budget, ...). - * - * In this first case, update the virtual time in @st too (see the - * comments on this update inside the function). - * - * In constrast, if there is an in-service entity, then return the - * entity that would be set in service if not only the above - * conditions, but also the next one held true: the currently - * in-service entity, on expiration, - * 1) gets a finish time equal to the current one, or - * 2) is not eligible any more, or - * 3) is idle. 
- */ -static struct bfq_entity * -__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service -#if 0 - , bool force -#endif - ) -{ - struct bfq_entity *entity -#if 0 - , *new_next_in_service = NULL -#endif - ; - u64 new_vtime; - struct bfq_queue *bfqq; - - if (RB_EMPTY_ROOT(&st->active)) - return NULL; - - /* - * Get the value of the system virtual time for which at - * least one entity is eligible. - */ - new_vtime = bfq_calc_vtime_jump(st); - - /* - * If there is no in-service entity for the sched_data this - * active tree belongs to, then push the system virtual time - * up to the value that guarantees that at least one entity is - * eligible. If, instead, there is an in-service entity, then - * do not make any such update, because there is already an - * eligible entity, namely the in-service one (even if the - * entity is not on st, because it was extracted when set in - * service). - */ - if (!in_service) - bfq_update_vtime(st, new_vtime); - - entity = bfq_first_active_entity(st, new_vtime); - BUG_ON(bfq_gt(entity->start, new_vtime)); - - /* Log some information */ - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__lookup_next: start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "__lookup_next: start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); - } -#endif - - BUG_ON(!entity); - - return entity; -} - -/** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. - * - * This function is invoked when there has been a change in the trees - * for sd, and we need know what is the new next entity after this - * change. 
- */ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) -{ - struct bfq_service_tree *st = sd->service_tree; - struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); - struct bfq_entity *entity = NULL; - struct bfq_queue *bfqq; - int class_idx = 0; - - BUG_ON(!sd); - BUG_ON(!st); - /* - * Choose from idle class, if needed to guarantee a minimum - * bandwidth to this class (and if there is some active entity - * in idle class). This should also mitigate - * priority-inversion problems in case a low priority task is - * holding file system resources. - */ - if (time_is_before_jiffies(sd->bfq_class_idle_last_service + - BFQ_CL_IDLE_TIMEOUT)) { - if (!RB_EMPTY_ROOT(&idle_class_st->active)) - class_idx = BFQ_IOPRIO_CLASSES - 1; - /* About to be served if backlogged, or not yet backlogged */ - sd->bfq_class_idle_last_service = jiffies; - } - - /* - * Find the next entity to serve for the highest-priority - * class, unless the idle class needs to be served. - */ - for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { - entity = __bfq_lookup_next_entity(st + class_idx, - sd->in_service_entity); - - if (entity) - break; - } - - BUG_ON(!entity && - (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || - !RB_EMPTY_ROOT(&(st+2)->active))); - - if (!entity) - return NULL; - - /* Log some information */ - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", - st + class_idx, class_idx); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "chosen from st %p %d", - st + class_idx, class_idx); - } -#endif - - return entity; -} - -static bool next_queue_may_preempt(struct bfq_data *bfqd) -{ - struct bfq_sched_data *sd = &bfqd->root_group->sched_data; - - return sd->next_in_service != sd->in_service_entity; -} - -/* - * Get next queue for service. 
- */ -static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -{ - struct bfq_entity *entity = NULL; - struct bfq_sched_data *sd; - struct bfq_queue *bfqq; - - BUG_ON(bfqd->in_service_queue); - - if (bfqd->busy_queues == 0) - return NULL; - - /* - * Traverse the path from the root to the leaf entity to - * serve. Set in service all the entities visited along the - * way. - */ - sd = &bfqd->root_group->sched_data; - for (; sd ; sd = entity->my_sched_data) { -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (entity) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, - "get_next_queue: lookup in this group"); - if (!sd->next_in_service) - pr_crit("get_next_queue: lookup in this group"); - } else { - bfq_log_bfqg(bfqd, bfqd->root_group, - "get_next_queue: lookup in root group"); - if (!sd->next_in_service) - pr_crit("get_next_queue: lookup in root group"); - } -#endif - - BUG_ON(!sd->next_in_service); - - /* - * WARNING. We are about to set the in-service entity - * to sd->next_in_service, i.e., to the (cached) value - * returned by bfq_lookup_next_entity(sd) the last - * time it was invoked, i.e., the last time when the - * service order in sd changed as a consequence of the - * activation or deactivation of an entity. In this - * respect, if we execute bfq_lookup_next_entity(sd) - * in this very moment, it may, although with low - * probability, yield a different entity than that - * pointed to by sd->next_in_service. This rare event - * happens in case there was no CLASS_IDLE entity to - * serve for sd when bfq_lookup_next_entity(sd) was - * invoked for the last time, while there is now one - * such entity. - * - * If the above event happens, then the scheduling of - * such entity in CLASS_IDLE is postponed until the - * service of the sd->next_in_service entity - * finishes. In fact, when the latter is expired, - * bfq_lookup_next_entity(sd) gets called again, - * exactly to update sd->next_in_service. 
- */ - - /* Make next_in_service entity become in_service_entity */ - entity = sd->next_in_service; - sd->in_service_entity = entity; - - /* - * Reset the accumulator of the amount of service that - * the entity is about to receive. - */ - entity->service = 0; - - /* - * If entity is no longer a candidate for next - * service, then it must be extracted from its active - * tree, so as to make sure that it won't be - * considered when computing next_in_service. See the - * comments on the function - * bfq_no_longer_next_in_service() for details. - */ - if (bfq_no_longer_next_in_service(entity)) - bfq_active_extract(bfq_entity_service_tree(entity), - entity); - - /* - * Even if entity is not to be extracted according to - * the above check, a descendant entity may get - * extracted in one of the next iterations of this - * loop. Such an event could cause a change in - * next_in_service for the level of the descendant - * entity, and thus possibly back to this level. - * - * However, we cannot perform the resulting needed - * update of next_in_service for this level before the - * end of the whole loop, because, to know which is - * the correct next-to-serve candidate entity for each - * level, we need first to find the leaf entity to set - * in service. In fact, only after we know which is - * the next-to-serve leaf entity, we can discover - * whether the parent entity of the leaf entity - * becomes the next-to-serve, and so on. 
- */ - - /* Log some information */ - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, - "get_next_queue: this queue, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, - "get_next_queue: this entity, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); - } -#endif - - } - - BUG_ON(!entity); - bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(!bfqq); - - /* - * We can finally update all next-to-serve entities along the - * path from the leaf entity just set in service to the root. - */ - for_each_entity(entity) { - struct bfq_sched_data *sd = entity->sched_data; - - if(!bfq_update_next_in_service(sd, NULL)) - break; - } - - return bfqq; -} - -static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) -{ - struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; - struct bfq_entity *entity = in_serv_entity; - - if (bfqd->in_service_bic) { - put_io_context(bfqd->in_service_bic->icq.ioc); - bfqd->in_service_bic = NULL; - } - - bfq_clear_bfqq_wait_request(in_serv_bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; - - /* - * When this function is called, all in-service entities have - * been properly deactivated or requeued, so we can safely - * execute the final step: reset in_service_entity along the - * path from entity to the root. - */ - for_each_entity(entity) - entity->sched_data->in_service_entity = NULL; - - /* - * in_serv_entity is no longer in service, so, if it is in no - * service tree either, then release the service reference to - * the queue it represents (taken with bfq_get_entity). 
- */ - if (!in_serv_entity->on_st) - bfq_put_queue(in_serv_bfqq); -} - -static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool ins_into_idle_tree, bool expiration) -{ - struct bfq_entity *entity = &bfqq->entity; - - bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); -} - -static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - - BUG_ON(bfqq == bfqd->in_service_queue); - BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && - entity->on_st); - - bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), - false); - bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -} - -static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - bfq_activate_requeue_entity(entity, false, - bfqq == bfqd->in_service_queue); -} - -static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); - -/* - * Called when the bfqq no longer has requests pending, remove it from - * the service tree. As a special case, it can be invoked during an - * expiration. - */ -static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration) -{ - BUG_ON(!bfq_bfqq_busy(bfqq)); - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - bfq_log_bfqq(bfqd, bfqq, "del from busy"); - - bfq_clear_bfqq_busy(bfqq); - - BUG_ON(bfqd->busy_queues == 0); - bfqd->busy_queues--; - - if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); - - if (bfqq->wr_coeff > 1) { - bfqd->wr_busy_queues--; - BUG_ON(bfqd->wr_busy_queues < 0); - } - - bfqg_stats_update_dequeue(bfqq_group(bfqq)); - - BUG_ON(bfqq->entity.budget < 0); - - bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); -} - -/* - * Called when an inactive queue receives a new request. 
- */ -static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqq == bfqd->in_service_queue); - - bfq_log_bfqq(bfqd, bfqq, "add to busy"); - - bfq_activate_bfqq(bfqd, bfqq); - - bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; - - if (!bfqq->dispatched) - if (bfqq->wr_coeff == 1) - bfq_weights_tree_add(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); - - if (bfqq->wr_coeff > 1) { - bfqd->wr_busy_queues++; - BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); - } - -} diff --git a/block/bfq.h b/block/bfq.h deleted file mode 100644 index e35bf89b09f3..000000000000 --- a/block/bfq.h +++ /dev/null @@ -1,946 +0,0 @@ -/* - * BFQ v8r12 for 4.9.0: data structures and common functions prototypes. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2015 Paolo Valente - * - * Copyright (C) 2017 Paolo Valente - */ - -#ifndef _BFQ_H -#define _BFQ_H - -#include -#include -#include -#include -#include - -#define BFQ_IOPRIO_CLASSES 3 -#define BFQ_CL_IDLE_TIMEOUT (HZ/5) - -#define BFQ_MIN_WEIGHT 1 -#define BFQ_MAX_WEIGHT 1000 -#define BFQ_WEIGHT_CONVERSION_COEFF 10 - -#define BFQ_DEFAULT_QUEUE_IOPRIO 4 - -#define BFQ_WEIGHT_LEGACY_DFL 100 -#define BFQ_DEFAULT_GRP_IOPRIO 0 -#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE - -/* - * Soft real-time applications are extremely more latency sensitive - * than interactive ones. Over-raise the weight of the former to - * privilege them against the latter. - */ -#define BFQ_SOFTRT_WEIGHT_FACTOR 100 - -struct bfq_entity; - -/** - * struct bfq_service_tree - per ioprio_class service tree. - * - * Each service tree represents a B-WF2Q+ scheduler on its own. Each - * ioprio_class has its own independent scheduler, and so its own - * bfq_service_tree. All the fields are protected by the queue lock - * of the containing bfqd. 
- */ -struct bfq_service_tree { - /* tree for active entities (i.e., those backlogged) */ - struct rb_root active; - /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ - struct rb_root idle; - - struct bfq_entity *first_idle; /* idle entity with minimum F_i */ - struct bfq_entity *last_idle; /* idle entity with maximum F_i */ - - u64 vtime; /* scheduler virtual time */ - /* scheduler weight sum; active and idle entities contribute to it */ - unsigned long wsum; -}; - -/** - * struct bfq_sched_data - multi-class scheduler. - * - * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as an - * intermediate queue in a hierarchical setup. - * - * The supported ioprio_classes are the same as in CFQ, in descending - * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. - * Requests from higher priority queues are served before all the - * requests from lower priority queues; among requests of the same - * queue requests are served according to B-WF2Q+. - * - * The schedule is implemented by the service trees, plus the field - * @next_in_service, which points to the entity on the active trees - * that will be served next, if 1) no changes in the schedule occurs - * before the current in-service entity is expired, 2) the in-service - * queue becomes idle when it expires, and 3) if the entity pointed by - * in_service_entity is not a queue, then the in-service child entity - * of the entity pointed by in_service_entity becomes idle on - * expiration. This peculiar definition allows for the following - * optimization, not yet exploited: while a given entity is still in - * service, we already know which is the best candidate for next - * service among the other active entitities in the same parent - * entity. We can then quickly compare the timestamps of the - * in-service entity with those of such best candidate. 
- * - * All the fields are protected by the queue lock of the containing - * bfqd. - */ -struct bfq_sched_data { - struct bfq_entity *in_service_entity; /* entity in service */ - /* head-of-the-line entity in the scheduler (see comments above) */ - struct bfq_entity *next_in_service; - /* array of service trees, one per ioprio_class */ - struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; - /* last time CLASS_IDLE was served */ - unsigned long bfq_class_idle_last_service; - -}; - -/** - * struct bfq_weight_counter - counter of the number of all active entities - * with a given weight. - */ -struct bfq_weight_counter { - unsigned int weight; /* weight of the entities this counter refers to */ - unsigned int num_active; /* nr of active entities with this weight */ - /* - * Weights tree member (see bfq_data's @queue_weights_tree and - * @group_weights_tree) - */ - struct rb_node weights_node; -}; - -/** - * struct bfq_entity - schedulable entity. - * - * A bfq_entity is used to represent either a bfq_queue (leaf node in the - * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each - * entity belongs to the sched_data of the parent group in the cgroup - * hierarchy. Non-leaf entities have also their own sched_data, stored - * in @my_sched_data. - * - * Each entity stores independently its priority values; this would - * allow different weights on different devices, but this - * functionality is not exported to userspace by now. Priorities and - * weights are updated lazily, first storing the new values into the - * new_* fields, then setting the @prio_changed flag. As soon as - * there is a transition in the entity state that allows the priority - * update to take place the effective and the requested priority - * values are synchronized. - * - * Unless cgroups are used, the weight value is calculated from the - * ioprio to export the same interface as CFQ. 
When dealing with - * ``well-behaved'' queues (i.e., queues that do not spend too much - * time to consume their budget and have true sequential behavior, and - * when there are no external factors breaking anticipation) the - * relative weights at each level of the cgroups hierarchy should be - * guaranteed. All the fields are protected by the queue lock of the - * containing bfqd. - */ -struct bfq_entity { - struct rb_node rb_node; /* service_tree member */ - /* pointer to the weight counter associated with this entity */ - struct bfq_weight_counter *weight_counter; - - /* - * Flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree) or is in service. - */ - bool on_st; - - u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ - u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ - - /* tree the entity is enqueued into; %NULL if not on a tree */ - struct rb_root *tree; - - /* - * minimum start time of the (active) subtree rooted at this - * entity; used for O(log N) lookups into active trees - */ - u64 min_start; - - /* amount of service received during the last service slot */ - int service; - - /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ - int budget; - - unsigned int weight; /* weight of the queue */ - unsigned int new_weight; /* next weight if a change is in progress */ - - /* original weight, used to implement weight boosting */ - unsigned int orig_weight; - - /* parent entity, for hierarchical scheduling */ - struct bfq_entity *parent; - - /* - * For non-leaf nodes in the hierarchy, the associated - * scheduler queue, %NULL on leaf nodes. - */ - struct bfq_sched_data *my_sched_data; - /* the scheduler queue this entity belongs to */ - struct bfq_sched_data *sched_data; - - /* flag, set to request a weight, ioprio or ioprio_class change */ - int prio_changed; -}; - -struct bfq_group; - -/** - * struct bfq_queue - leaf schedulable entity. 
- * - * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async or shared between cooperating - * processes. @cgroup holds a reference to the cgroup, to be sure that it - * does not disappear while a bfqq still references it (mostly to avoid - * races between request issuing and task migration followed by cgroup - * destruction). - * All the fields are protected by the queue lock of the containing bfqd. - */ -struct bfq_queue { - /* reference counter */ - int ref; - /* parent bfq_data */ - struct bfq_data *bfqd; - - /* current ioprio and ioprio class */ - unsigned short ioprio, ioprio_class; - /* next ioprio and ioprio class if a change is in progress */ - unsigned short new_ioprio, new_ioprio_class; - - /* - * Shared bfq_queue if queue is cooperating with one or more - * other queues. - */ - struct bfq_queue *new_bfqq; - /* request-position tree member (see bfq_group's @rq_pos_tree) */ - struct rb_node pos_node; - /* request-position tree root (see bfq_group's @rq_pos_tree) */ - struct rb_root *pos_root; - - /* sorted list of pending requests */ - struct rb_root sort_list; - /* if fifo isn't expired, next request to serve */ - struct request *next_rq; - /* number of sync and async requests queued */ - int queued[2]; - /* number of sync and async requests currently allocated */ - int allocated[2]; - /* number of pending metadata requests */ - int meta_pending; - /* fifo list of requests in sort_list */ - struct list_head fifo; - - /* entity representing this queue in the scheduler */ - struct bfq_entity entity; - - /* maximum budget allowed from the feedback mechanism */ - int max_budget; - /* budget expiration (in jiffies) */ - unsigned long budget_timeout; - - /* number of requests on the dispatch list or inside driver */ - int dispatched; - - unsigned int flags; /* status flags.*/ - - /* node for active/idle bfqq list inside parent bfqd */ - struct list_head bfqq_list; - - /* bit vector: a 1 for each seeky 
requests in history */ - u32 seek_history; - - /* node for the device's burst list */ - struct hlist_node burst_list_node; - - /* position of the last request enqueued */ - sector_t last_request_pos; - - /* Number of consecutive pairs of request completion and - * arrival, such that the queue becomes idle after the - * completion, but the next request arrives within an idle - * time slice; used only if the queue's IO_bound flag has been - * cleared. - */ - unsigned int requests_within_timer; - - /* pid of the process owning the queue, used for logging purposes */ - pid_t pid; - - /* - * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL - * if the queue is shared. - */ - struct bfq_io_cq *bic; - - /* current maximum weight-raising time for this queue */ - unsigned long wr_cur_max_time; - /* - * Minimum time instant such that, only if a new request is - * enqueued after this time instant in an idle @bfq_queue with - * no outstanding requests, then the task associated with the - * queue it is deemed as soft real-time (see the comments on - * the function bfq_bfqq_softrt_next_start()) - */ - unsigned long soft_rt_next_start; - /* - * Start time of the current weight-raising period if - * the @bfq-queue is being weight-raised, otherwise - * finish time of the last weight-raising period. - */ - unsigned long last_wr_start_finish; - /* factor by which the weight of this queue is multiplied */ - unsigned int wr_coeff; - /* - * Time of the last transition of the @bfq_queue from idle to - * backlogged. - */ - unsigned long last_idle_bklogged; - /* - * Cumulative service received from the @bfq_queue since the - * last transition from idle to backlogged. - */ - unsigned long service_from_backlogged; - /* - * Value of wr start time when switching to soft rt - */ - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -}; - -/** - * struct bfq_ttime - per process thinktime stats. 
- */ -struct bfq_ttime { - u64 last_end_request; /* completion time of last request */ - - u64 ttime_total; /* total process thinktime */ - unsigned long ttime_samples; /* number of thinktime samples */ - u64 ttime_mean; /* average process thinktime */ - -}; - -/** - * struct bfq_io_cq - per (request_queue, io_context) structure. - */ -struct bfq_io_cq { - /* associated io_cq structure */ - struct io_cq icq; /* must be the first member */ - /* array of two process queues, the sync and the async */ - struct bfq_queue *bfqq[2]; - /* associated @bfq_ttime struct */ - struct bfq_ttime ttime; - /* per (request_queue, blkcg) ioprio */ - int ioprio; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ -#endif - - /* - * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to - * be able to restore it in case of split. - */ - bool saved_has_short_ttime; - /* - * Same purpose as the previous two fields for the I/O bound - * classification of a queue. - */ - bool saved_IO_bound; - - /* - * Same purpose as the previous fields for the value of the - * field keeping the queue's belonging to a large burst - */ - bool saved_in_large_burst; - /* - * True if the queue belonged to a burst list before its merge - * with another cooperating queue. - */ - bool was_in_burst_list; - - /* - * Similar to previous fields: save wr information. - */ - unsigned long saved_wr_coeff; - unsigned long saved_last_wr_start_finish; - unsigned long saved_wr_start_at_switch_to_srt; - unsigned int saved_wr_cur_max_time; -}; - -enum bfq_device_speed { - BFQ_BFQD_FAST, - BFQ_BFQD_SLOW, -}; - -/** - * struct bfq_data - per-device data structure. - * - * All the fields are protected by the @queue lock. 
- */ -struct bfq_data { - /* request queue for the device */ - struct request_queue *queue; - - /* root bfq_group for the device */ - struct bfq_group *root_group; - - /* - * rbtree of weight counters of @bfq_queues, sorted by - * weight. Used to keep track of whether all @bfq_queues have - * the same weight. The tree contains one counter for each - * distinct weight associated to some active and not - * weight-raised @bfq_queue (see the comments to the functions - * bfq_weights_tree_[add|remove] for further details). - */ - struct rb_root queue_weights_tree; - /* - * rbtree of non-queue @bfq_entity weight counters, sorted by - * weight. Used to keep track of whether all @bfq_groups have - * the same weight. The tree contains one counter for each - * distinct weight associated to some active @bfq_group (see - * the comments to the functions bfq_weights_tree_[add|remove] - * for further details). - */ - struct rb_root group_weights_tree; - - /* - * Number of bfq_queues containing requests (including the - * queue in service, even if it is idling). - */ - int busy_queues; - /* number of weight-raised busy @bfq_queues */ - int wr_busy_queues; - /* number of queued requests */ - int queued; - /* number of requests dispatched and waiting for completion */ - int rq_in_driver; - - /* - * Maximum number of requests in driver in the last - * @hw_tag_samples completed requests. - */ - int max_rq_in_driver; - /* number of samples used to calculate hw_tag */ - int hw_tag_samples; - /* flag set to one if the driver is showing a queueing behavior */ - int hw_tag; - - /* number of budgets assigned */ - int budgets_assigned; - - /* - * Timer set when idling (waiting) for the next request from - * the queue in service. 
- */ - struct hrtimer idle_slice_timer; - /* delayed work to restart dispatching on the request queue */ - struct work_struct unplug_work; - - /* bfq_queue in service */ - struct bfq_queue *in_service_queue; - /* bfq_io_cq (bic) associated with the @in_service_queue */ - struct bfq_io_cq *in_service_bic; - - /* on-disk position of the last served request */ - sector_t last_position; - - /* time of last request completion (ns) */ - u64 last_completion; - - /* time of first rq dispatch in current observation interval (ns) */ - u64 first_dispatch; - /* time of last rq dispatch in current observation interval (ns) */ - u64 last_dispatch; - - /* beginning of the last budget */ - ktime_t last_budget_start; - /* beginning of the last idle slice */ - ktime_t last_idling_start; - - /* number of samples in current observation interval */ - int peak_rate_samples; - /* num of samples of seq dispatches in current observation interval */ - u32 sequential_samples; - /* total num of sectors transferred in current observation interval */ - u64 tot_sectors_dispatched; - /* max rq size seen during current observation interval (sectors) */ - u32 last_rq_max_size; - /* time elapsed from first dispatch in current observ. interval (us) */ - u64 delta_from_first; - /* current estimate of device peak rate */ - u32 peak_rate; - - /* maximum budget allotted to a bfq_queue before rescheduling */ - int bfq_max_budget; - - /* list of all the bfq_queues active on the device */ - struct list_head active_list; - /* list of all the bfq_queues idle on the device */ - struct list_head idle_list; - - /* - * Timeout for async/sync requests; when it fires, requests - * are served in fifo order. 
- */ - u64 bfq_fifo_expire[2]; - /* weight of backward seeks wrt forward ones */ - unsigned int bfq_back_penalty; - /* maximum allowed backward seek */ - unsigned int bfq_back_max; - /* maximum idling time */ - u32 bfq_slice_idle; - - /* user-configured max budget value (0 for auto-tuning) */ - int bfq_user_max_budget; - /* - * Timeout for bfq_queues to consume their budget; used to - * prevent seeky queues from imposing long latencies to - * sequential or quasi-sequential ones (this also implies that - * seeky queues cannot receive guarantees in the service - * domain; after a timeout they are charged for the time they - * have been in service, to preserve fairness among them, but - * without service-domain guarantees). - */ - unsigned int bfq_timeout; - - /* - * Number of consecutive requests that must be issued within - * the idle time slice to set again idling to a queue which - * was marked as non-I/O-bound (see the definition of the - * IO_bound flag for further details). - */ - unsigned int bfq_requests_within_timer; - - /* - * Force device idling whenever needed to provide accurate - * service guarantees, without caring about throughput - * issues. CAVEAT: this may even increase latencies, in case - * of useless idling for processes that did stop doing I/O. - */ - bool strict_guarantees; - - /* - * Last time at which a queue entered the current burst of - * queues being activated shortly after each other; for more - * details about this and the following parameters related to - * a burst of activations, see the comments on the function - * bfq_handle_burst. - */ - unsigned long last_ins_in_burst; - /* - * Reference time interval used to decide whether a queue has - * been activated shortly after @last_ins_in_burst. 
- */ - unsigned long bfq_burst_interval; - /* number of queues in the current burst of queue activations */ - int burst_size; - - /* common parent entity for the queues in the burst */ - struct bfq_entity *burst_parent_entity; - /* Maximum burst size above which the current queue-activation - * burst is deemed as 'large'. - */ - unsigned long bfq_large_burst_thresh; - /* true if a large queue-activation burst is in progress */ - bool large_burst; - /* - * Head of the burst list (as for the above fields, more - * details in the comments on the function bfq_handle_burst). - */ - struct hlist_head burst_list; - - /* if set to true, low-latency heuristics are enabled */ - bool low_latency; - /* - * Maximum factor by which the weight of a weight-raised queue - * is multiplied. - */ - unsigned int bfq_wr_coeff; - /* maximum duration of a weight-raising period (jiffies) */ - unsigned int bfq_wr_max_time; - - /* Maximum weight-raising duration for soft real-time processes */ - unsigned int bfq_wr_rt_max_time; - /* - * Minimum idle period after which weight-raising may be - * reactivated for a queue (in jiffies). - */ - unsigned int bfq_wr_min_idle_time; - /* - * Minimum period between request arrivals after which - * weight-raising may be reactivated for an already busy async - * queue (in jiffies). - */ - unsigned long bfq_wr_min_inter_arr_async; - - /* Max service-rate for a soft real-time queue, in sectors/sec */ - unsigned int bfq_wr_max_softrt_rate; - /* - * Cached value of the product R*T, used for computing the - * maximum duration of weight raising automatically. 
- */ - u64 RT_prod; - /* device-speed class for the low-latency heuristic */ - enum bfq_device_speed device_speed; - - /* fallback dummy bfqq for extreme OOM conditions */ - struct bfq_queue oom_bfqq; -}; - -enum bfqq_state_flags { - BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ - BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ - BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ - BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* - * waiting for a request - * without idling the device - */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ - BFQ_BFQQ_FLAG_IO_bound, /* - * bfqq has timed-out at least once - * having consumed at most 2/10 of - * its budget - */ - BFQ_BFQQ_FLAG_in_large_burst, /* - * bfqq activated in a large burst, - * see comments to bfq_handle_burst. - */ - BFQ_BFQQ_FLAG_softrt_update, /* - * may need softrt-next-start - * update - */ - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ - BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -}; - -#define BFQ_BFQQ_FNS(name) \ -static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -} \ -static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -} \ -static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -{ \ - return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -} - -BFQ_BFQQ_FNS(just_created); -BFQ_BFQQ_FNS(busy); -BFQ_BFQQ_FNS(wait_request); -BFQ_BFQQ_FNS(non_blocking_wait_rq); -BFQ_BFQQ_FNS(must_alloc); -BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(has_short_ttime); -BFQ_BFQQ_FNS(sync); -BFQ_BFQQ_FNS(IO_bound); -BFQ_BFQQ_FNS(in_large_burst); -BFQ_BFQQ_FNS(coop); -BFQ_BFQQ_FNS(split_coop); -BFQ_BFQQ_FNS(softrt_update); -#undef BFQ_BFQQ_FNS - -/* Logging facilities. 
*/ -#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - pr_crit("bfq%d%c %s " fmt "\n", \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - __pbuf, ##args); \ -} while (0) - -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s " fmt "\n", __pbuf, ##args); \ -} while (0) - -#else /* CONFIG_BFQ_GROUP_IOSCHED */ - -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - ##args) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - -#endif /* CONFIG_BFQ_GROUP_IOSCHED */ - -#define bfq_log(bfqd, fmt, args...) \ - pr_crit("bfq " fmt "\n", ##args) - -#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -#ifdef CONFIG_BFQ_GROUP_IOSCHED -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - __pbuf, ##args); \ -} while (0) - -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -} while (0) - -#else /* CONFIG_BFQ_GROUP_IOSCHED */ - -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
\ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - ##args) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - -#endif /* CONFIG_BFQ_GROUP_IOSCHED */ - -#define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - -/* Expiration reasons. */ -enum bfqq_expiration { - BFQ_BFQQ_TOO_IDLE = 0, /* - * queue has been idling for - * too long - */ - BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ - BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ - BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ - BFQ_BFQQ_PREEMPTED /* preemption in progress */ -}; - - -struct bfqg_stats { -#ifdef CONFIG_BFQ_GROUP_IOSCHED - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ - struct blkg_rwstat service_time; - /* total time spent waiting in scheduler queue in ns */ - struct blkg_rwstat wait_time; - /* number of IOs queued up */ - struct blkg_rwstat queued; - /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; - /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; - /* count of samples taken for average */ - struct blkg_stat avg_queue_size_samples; - /* how many times this group has been removed from service tree */ - struct blkg_stat dequeue; - /* total time spent waiting for it to be assigned a timeslice. 
*/ - struct blkg_stat group_wait_time; - /* time spent idling for this blkcg_gq */ - struct blkg_stat idle_time; - /* total time with empty current active q with other requests queued */ - struct blkg_stat empty_time; - /* fields after this shouldn't be cleared on stat reset */ - uint64_t start_group_wait_time; - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; -#endif -}; - -#ifdef CONFIG_BFQ_GROUP_IOSCHED -/* - * struct bfq_group_data - per-blkcg storage for the blkio subsystem. - * - * @ps: @blkcg_policy_storage that this structure inherits - * @weight: weight of the bfq_group - */ -struct bfq_group_data { - /* must be the first member */ - struct blkcg_policy_data pd; - - unsigned int weight; -}; - -/** - * struct bfq_group - per (device, cgroup) data structure. - * @entity: schedulable entity to insert into the parent group sched_data. - * @sched_data: own sched_data, to contain child entities (they may be - * both bfq_queues and bfq_groups). - * @bfqd: the bfq_data for the device this group acts upon. - * @async_bfqq: array of async queues for all the tasks belonging to - * the group, one queue per ioprio value per ioprio_class, - * except for the idle class that has only one queue. - * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). - * @my_entity: pointer to @entity, %NULL for the toplevel group; used - * to avoid too many special cases during group creation/ - * migration. - * @active_entities: number of active entities belonging to the group; - * unused for the root group. Used to know whether there - * are groups with more than one active @bfq_entity - * (see the comments to the function - * bfq_bfqq_may_idle()). - * @rq_pos_tree: rbtree sorted by next_request position, used when - * determining if two or more queues have interleaving - * requests (see bfq_find_close_cooperator()). 
- * - * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup - * there is a set of bfq_groups, each one collecting the lower-level - * entities belonging to the group that are acting on the same device. - * - * Locking works as follows: - * o @bfqd is protected by the queue lock, RCU is used to access it - * from the readers. - * o All the other fields are protected by the @bfqd queue lock. - */ -struct bfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - - struct bfq_entity entity; - struct bfq_sched_data sched_data; - - void *bfqd; - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; - - struct bfq_entity *my_entity; - - int active_entities; - - struct rb_root rq_pos_tree; - - struct bfqg_stats stats; -}; - -#else -struct bfq_group { - struct bfq_sched_data sched_data; - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; - - struct rb_root rq_pos_tree; -}; -#endif - -static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - -static unsigned int bfq_class_idx(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - return bfqq ? 
bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS - 1; -} - -static struct bfq_service_tree * -bfq_entity_service_tree(struct bfq_entity *entity) -{ - struct bfq_sched_data *sched_data = entity->sched_data; - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int idx = bfq_class_idx(entity); - - BUG_ON(idx >= BFQ_IOPRIO_CLASSES); - BUG_ON(sched_data == NULL); - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "entity_service_tree %p %d", - sched_data->service_tree + idx, idx); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "entity_service_tree %p %d", - sched_data->service_tree + idx, idx); - } -#endif - return sched_data->service_tree + idx; -} - -static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -{ - return bic->bfqq[is_sync]; -} - -static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, - bool is_sync) -{ - bic->bfqq[is_sync] = bfqq; -} - -static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -{ - return bic->icq.q->elevator->elevator_data; -} - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - -static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - if (!group_entity) - group_entity = &bfqq->bfqd->root_group->entity; - - return container_of(group_entity, struct bfq_group, entity); -} - -#else - -static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - return bfqq->bfqd->root_group; -} - -#endif - -static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -static void bfq_put_queue(struct bfq_queue *bfqq); -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, bool is_sync, - struct bfq_io_cq *bic); -static void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg); 
-#ifdef CONFIG_BFQ_GROUP_IOSCHED -static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -#endif -static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); - -#endif /* _BFQ_H */ diff --git a/drivers/Kconfig b/drivers/Kconfig index c89e0f383be6..981778f02e56 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -216,8 +216,6 @@ source "drivers/gps/Kconfig" source "drivers/halls/Kconfig" -source "drivers/rekernel/Kconfig" - source "drivers/kernelsu/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index f691364e80c8..8d445b4401be 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -183,5 +183,4 @@ obj-$(CONFIG_SENSORS_SSC) += sensors/ obj-$(CONFIG_TEE) += tee/ obj-$(CONFIG_BCM_GPS_SPI_DRIVER) += gps/ obj-$(CONFIG_HALLS) += halls/ -obj-$(CONFIG_REKERNEL) += rekernel/ obj-$(CONFIG_KSU) += kernelsu/ diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 491751ab0dbf..bb2a5b581622 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -1,9 +1,8 @@ -# SPDX-License-Identifier: GPL-2.0 menu "Android" config ANDROID bool "Android Drivers" - help + ---help--- Enable support for various drivers needed on the Android platform if ANDROID @@ -12,7 +11,7 @@ config ANDROID_BINDER_IPC bool "Android Binder IPC Driver" depends on MMU default n - help + ---help--- Binder is used in Android for both communication between processes, and remote method invocation. @@ -20,23 +19,11 @@ config ANDROID_BINDER_IPC Android process, using Binder to identify, invoke and pass arguments between said processes. -config ANDROID_BINDERFS - bool "Android Binderfs filesystem" - depends on ANDROID_BINDER_IPC - default n - help - Binderfs is a pseudo-filesystem for the Android Binder IPC driver - which can be mounted per-ipc namespace allowing to run multiple - instances of Android. - Each binderfs mount initially only contains a binder-control device. 
- It can be used to dynamically allocate new binder IPC devices via - ioctls. - config ANDROID_BINDER_DEVICES string "Android Binder devices" depends on ANDROID_BINDER_IPC default "binder,hwbinder,vndbinder" - help + ---help--- Default value for the binder.devices parameter. The binder.devices parameter is a comma-separated list of strings @@ -44,71 +31,29 @@ config ANDROID_BINDER_DEVICES created. Each binder device has its own context manager, and is therefore logically separated from the other devices. +config ANDROID_BINDER_IPC_32BIT + bool "Android Binder IPC 32BIT Driver" + depends on !64BIT && ANDROID_BINDER_IPC + default n + ---help--- + The Binder API has been changed to support both 32 and 64bit + applications in a mixed environment. + + Enable this to support an old 32-bit Android user-space (v4.4 and + earlier). + + Note that enabling this will break newer Android user-space. + config ANDROID_BINDER_IPC_SELFTEST bool "Android Binder IPC Driver Selftest" depends on ANDROID_BINDER_IPC - help + ---help--- This feature allows binder selftest to run. Binder selftest checks the allocation and free of binder buffers exhaustively with combinations of various buffer sizes and alignments. -config ANDROID_DEBUG_SYMBOLS - bool "Android Debug Symbols" - help - Enables export of debug symbols that are useful for offline debugging - of a kernel. These symbols would be used in vendor modules to find - addresses of the core kernel symbols for vendor extensions. - - This driver is statically compiled into kernel and maintains all the - required symbol addresses for vendor modules and provides necessary - interface vendor modules. - -config ANDROID_VENDOR_HOOKS - bool "Android Vendor Hooks" - depends on TRACEPOINTS - help - Enable vendor hooks implemented as tracepoints - - Allow vendor modules to attach to tracepoint "hooks" defined via - DECLARE_HOOK or DECLARE_RESTRICTED_HOOK. 
- -config ANDROID_KABI_RESERVE - bool "Android KABI reserve padding" - default y - help - This option enables the padding that the Android GKI kernel adds - to many different kernel structures to support an in-kernel stable ABI - over the lifespan of support for the kernel. - - Only disable this option if you have a system that needs the Android - kernel drivers, but is NOT an Android GKI kernel image. If disabled - it has the possibility to make the kernel static and runtime image - slightly smaller but will NOT be supported by the Google Android - kernel team. - - If even slightly unsure, say Y. - -config ANDROID_VENDOR_OEM_DATA - bool "Android vendor and OEM data padding" - default y - help - This option enables the padding that the Android GKI kernel adds - to many different kernel structures to support an in-kernel stable ABI - over the lifespan of support for the kernel as well as OEM additional - fields that are needed by some of the Android kernel tracepoints. The - macros enabled by this option are used to enable padding in vendor modules - used for the above specified purposes. - - Only disable this option if you have a system that needs the Android - kernel drivers, but is NOT an Android GKI kernel image and you do NOT - use the Android kernel tracepoints. If disabled it has the possibility - to make the kernel static and runtime image slightly smaller but will - NOT be supported by the Google Android kernel team. - - If even slightly unsure, say Y. 
- endif # if ANDROID endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile index f1ac44102987..a01254c43ee3 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -1,8 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0-only ccflags-y += -I$(src) # needed for trace events -obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o -obj-$(CONFIG_ANDROID_DEBUG_SYMBOLS) += android_debug_symbols.o -obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o diff --git a/drivers/android/android_debug_symbols.c b/drivers/android/android_debug_symbols.c deleted file mode 100644 index dd75ddac2085..000000000000 --- a/drivers/android/android_debug_symbols.c +++ /dev/null @@ -1,149 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -/* - * Copyright (c) 2021, The Linux Foundation. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include "../../mm/slab.h" -#include -#include -#include -#include -#include - -struct ads_entry { - char *name; - void *addr; -}; - -bool ads_page_owner; -bool ads_slub_debug; -unsigned long ads_vmalloc_nr_pages; -unsigned long ads_pcpu_nr_pages; - -#define _ADS_ENTRY(index, symbol) \ - [index] = { .name = #symbol, .addr = (void *)symbol } -#define ADS_ENTRY(index, symbol) _ADS_ENTRY(index, symbol) - -#define _ADS_PER_CPU_ENTRY(index, symbol) \ - [index] = { .name = #symbol, .addr = (void *)&symbol } -#define ADS_PER_CPU_ENTRY(index, symbol) _ADS_PER_CPU_ENTRY(index, symbol) - -/* - * This module maintains static array of symbol and address information. - * Add all required core kernel symbols and their addresses into ads_entries[] array, - * so that vendor modules can query and to find address of non-exported symbol. 
- */ -static const struct ads_entry ads_entries[ADS_END] = { - ADS_ENTRY(ADS_SDATA, _sdata), - ADS_ENTRY(ADS_BSS_END, __bss_stop), - ADS_ENTRY(ADS_PER_CPU_START, __per_cpu_start), - ADS_ENTRY(ADS_PER_CPU_END, __per_cpu_end), - ADS_ENTRY(ADS_START_RO_AFTER_INIT, __start_ro_after_init), - ADS_ENTRY(ADS_END_RO_AFTER_INIT, __end_ro_after_init), - ADS_ENTRY(ADS_LINUX_BANNER, linux_banner), -#ifdef CONFIG_CMA - ADS_ENTRY(ADS_TOTAL_CMA, &totalcma_pages), -#endif - ADS_ENTRY(ADS_SLAB_CACHES, &slab_caches), - ADS_ENTRY(ADS_SLAB_MUTEX, &slab_mutex), - ADS_ENTRY(ADS_MIN_LOW_PFN, &min_low_pfn), - ADS_ENTRY(ADS_MAX_PFN, &max_pfn), - ADS_ENTRY(ADS_VMALLOC_NR_PAGES, &ads_vmalloc_nr_pages), - ADS_ENTRY(ADS_PCPU_NR_PAGES, &ads_pcpu_nr_pages), -#ifdef CONFIG_PAGE_OWNER - ADS_ENTRY(ADS_PAGE_OWNER_ENABLED, &ads_page_owner), -#endif -#ifdef CONFIG_SLUB_DEBUG - ADS_ENTRY(ADS_SLUB_DEBUG, &ads_slub_debug), -#endif -#ifdef CONFIG_SWAP - ADS_ENTRY(ADS_NR_SWAP_PAGES, &nr_swap_pages), -#endif -#ifdef CONFIG_MMU - ADS_ENTRY(ADS_MMAP_MIN_ADDR, &mmap_min_addr), -#endif - ADS_ENTRY(ADS_STACK_GUARD_GAP, &stack_guard_gap), -#ifdef CONFIG_SYSCTL - ADS_ENTRY(ADS_SYSCTL_LEGACY_VA_LAYOUT, &sysctl_legacy_va_layout), -#endif - ADS_ENTRY(ADS_SHOW_MEM, show_mem), -#ifdef CONFIG_ARM64 - ADS_ENTRY(ADS_PUT_TASK_STACK, put_task_stack), -#endif -}; - -/* - * ads_per_cpu_entries array contains all the per_cpu variable address information. - */ -static const struct ads_entry ads_per_cpu_entries[ADS_DEBUG_PER_CPU_END] = { -#ifdef CONFIG_ARM64 - ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, irq_stack_ptr), -#endif -#ifdef CONFIG_X86 - ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, hardirq_stack_ptr), -#endif -}; - -/* - * android_debug_symbol - Provide address inforamtion of debug symbol. - * @symbol: Index of debug symbol array. - * - * Return address of core kernel symbol on success and a negative errno will be - * returned in error cases. 
- * - */ -void *android_debug_symbol(enum android_debug_symbol symbol) -{ - if (symbol >= ADS_END) - return ERR_PTR(-EINVAL); - - return ads_entries[symbol].addr; -} -EXPORT_SYMBOL_NS_GPL(android_debug_symbol, MINIDUMP); - -/* - * android_debug_per_cpu_symbol - Provide address inforamtion of per cpu debug symbol. - * @symbol: Index of per cpu debug symbol array. - * - * Return address of core kernel symbol on success and a negative errno will be - * returned in error cases. - * - */ -void *android_debug_per_cpu_symbol(enum android_debug_per_cpu_symbol symbol) -{ - if (symbol >= ADS_DEBUG_PER_CPU_END) - return ERR_PTR(-EINVAL); - - return ads_per_cpu_entries[symbol].addr; -} -EXPORT_SYMBOL_NS_GPL(android_debug_per_cpu_symbol, MINIDUMP); - -static int __init debug_symbol_init(void) -{ -#ifdef CONFIG_PAGE_OWNER - ads_page_owner = page_owner_ops.need(); -#endif -#ifdef CONFIG_SLUB_DEBUG - ads_slub_debug = __slub_debug_enabled(); -#endif - ads_vmalloc_nr_pages = vmalloc_nr_pages(); - ads_pcpu_nr_pages = pcpu_nr_pages(); - return 0; -} -module_init(debug_symbol_init); - -static void __exit debug_symbol_exit(void) -{ } -module_exit(debug_symbol_exit); - -MODULE_DESCRIPTION("Debug Symbol Driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 977cb783ea0b..20356105e4ba 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1,9 +1,18 @@ -// SPDX-License-Identifier: GPL-2.0-only /* binder.c * * Android IPC Subsystem * * Copyright (C) 2007-2008 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * */ /* @@ -42,6 +51,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -54,28 +64,13 @@ #include #include #include -#include -#include +#include #include -#include #include #include #include #include -#include -#include -#include -#include - -#include -#ifdef CONFIG_REKERNEL -#include <../rekernel/rekernel.h> -#endif /* CONFIG_REKERNEL */ -#include - -#include - -#include "binder_internal.h" +#include "binder_alloc.h" #include "binder_trace.h" static HLIST_HEAD(binder_deferred_list); @@ -92,11 +87,36 @@ static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; static atomic_t binder_last_id; -static int proc_show(struct seq_file *m, void *unused); -DEFINE_SHOW_ATTRIBUTE(proc); +#define BINDER_DEBUG_ENTRY(name) \ +static int binder_##name##_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, binder_##name##_show, inode->i_private); \ +} \ +\ +static const struct file_operations binder_##name##_fops = { \ + .owner = THIS_MODULE, \ + .open = binder_##name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +static int binder_proc_show(struct seq_file *m, void *unused); +BINDER_DEBUG_ENTRY(proc); + +/* This is only defined in include/asm-arm/sizes.h */ +#ifndef SZ_1K +#define SZ_1K 0x400 +#endif + +#ifndef SZ_4M +#define SZ_4M 0x400000 +#endif #define FORBIDDEN_MMAP_FLAGS (VM_WRITE) +#define BINDER_SMALL_BUF_SIZE (PAGE_SIZE * 64) + enum { BINDER_DEBUG_USER_ERROR = 1U << 0, BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1, @@ -118,8 +138,8 @@ static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; module_param_named(debug_mask, binder_debug_mask, uint, 0644); -char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; -module_param_named(devices, binder_devices_param, charp, 0444); +static char 
*binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; +module_param_named(devices, binder_devices_param, charp, S_IRUGO); static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; @@ -140,13 +160,13 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define binder_debug(mask, x...) \ do { \ if (binder_debug_mask & mask) \ - pr_info_ratelimited(x); \ + pr_info(x); \ } while (0) #define binder_user_error(x...) \ do { \ if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) \ - pr_info_ratelimited(x); \ + pr_info(x); \ if (binder_stop_on_user_error) \ binder_stop_on_user_error = 2; \ } while (0) @@ -162,6 +182,24 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) +enum binder_stat_types { + BINDER_STAT_PROC, + BINDER_STAT_THREAD, + BINDER_STAT_NODE, + BINDER_STAT_REF, + BINDER_STAT_DEATH, + BINDER_STAT_TRANSACTION, + BINDER_STAT_TRANSACTION_COMPLETE, + BINDER_STAT_COUNT +}; + +struct binder_stats { + atomic_t br[_IOC_NR(BR_FAILED_REPLY) + 1]; + atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1]; + atomic_t obj_created[BINDER_STAT_COUNT]; + atomic_t obj_deleted[BINDER_STAT_COUNT]; +}; + static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) @@ -189,26 +227,16 @@ struct binder_transaction_log_entry { int return_error_line; uint32_t return_error; uint32_t return_error_param; - char context_name[BINDERFS_MAX_NAME + 1]; + const char *context_name; }; - struct binder_transaction_log { atomic_t cur; bool full; struct binder_transaction_log_entry entry[32]; }; - static struct binder_transaction_log binder_transaction_log; static struct binder_transaction_log binder_transaction_log_failed; -static struct kmem_cache *binder_node_pool; -static struct kmem_cache *binder_proc_pool; -static struct kmem_cache *binder_ref_death_pool; -static struct kmem_cache *binder_ref_pool; 
-static struct kmem_cache *binder_thread_pool; -static struct kmem_cache *binder_transaction_pool; -static struct kmem_cache *binder_work_pool; - static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) { @@ -229,9 +257,320 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( return e; } +struct binder_context { + struct binder_node *binder_context_mgr_node; + struct mutex context_mgr_node_lock; + + kuid_t binder_context_mgr_uid; + const char *name; +}; + +struct binder_device { + struct hlist_node hlist; + struct miscdevice miscdev; + struct binder_context context; +}; + +/** + * struct binder_work - work enqueued on a worklist + * @entry: node enqueued on list + * @type: type of work to be performed + * + * There are separate work lists for proc, thread, and node (async). + */ +struct binder_work { + struct list_head entry; + + enum binder_work_type { + BINDER_WORK_TRANSACTION = 1, + BINDER_WORK_TRANSACTION_COMPLETE, + BINDER_WORK_RETURN_ERROR, + BINDER_WORK_NODE, + BINDER_WORK_DEAD_BINDER, + BINDER_WORK_DEAD_BINDER_AND_CLEAR, + BINDER_WORK_CLEAR_DEATH_NOTIFICATION, + } type; +}; + +struct binder_error { + struct binder_work work; + uint32_t cmd; +}; + +/** + * struct binder_node - binder node bookkeeping + * @debug_id: unique ID for debugging + * (invariant after initialized) + * @lock: lock for node fields + * @work: worklist element for node work + * (protected by @proc->inner_lock) + * @rb_node: element for proc->nodes tree + * (protected by @proc->inner_lock) + * @dead_node: element for binder_dead_nodes list + * (protected by binder_dead_nodes_lock) + * @proc: binder_proc that owns this node + * (invariant after initialized) + * @refs: list of references on this node + * (protected by @lock) + * @internal_strong_refs: used to take strong references when + * initiating a transaction + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @local_weak_refs: weak user refs from 
local process + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @local_strong_refs: strong user refs from local process + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @tmp_refs: temporary kernel refs + * (protected by @proc->inner_lock while @proc + * is valid, and by binder_dead_nodes_lock + * if @proc is NULL. During inc/dec and node release + * it is also protected by @lock to provide safety + * as the node dies and @proc becomes NULL) + * @ptr: userspace pointer for node + * (invariant, no lock needed) + * @cookie: userspace cookie for node + * (invariant, no lock needed) + * @has_strong_ref: userspace notified of strong ref + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @pending_strong_ref: userspace has acked notification of strong ref + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @has_weak_ref: userspace notified of weak ref + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @pending_weak_ref: userspace has acked notification of weak ref + * (protected by @proc->inner_lock if @proc + * and by @lock) + * @has_async_transaction: async transaction to node in progress + * (protected by @lock) + * @sched_policy: minimum scheduling policy for node + * (invariant after initialized) + * @accept_fds: file descriptor operations supported for node + * (invariant after initialized) + * @min_priority: minimum scheduling priority + * (invariant after initialized) + * @inherit_rt: inherit RT scheduling policy from caller + * @txn_security_ctx: require sender's security context + * (invariant after initialized) + * @async_todo: list of async work items + * (protected by @proc->inner_lock) + * + * Bookkeeping structure for binder nodes. 
+ */ +struct binder_node { + int debug_id; + spinlock_t lock; + struct binder_work work; + union { + struct rb_node rb_node; + struct hlist_node dead_node; + }; + struct binder_proc *proc; + struct hlist_head refs; + int internal_strong_refs; + int local_weak_refs; + int local_strong_refs; + int tmp_refs; + binder_uintptr_t ptr; + binder_uintptr_t cookie; + struct { + /* + * bitfield elements protected by + * proc inner_lock + */ + u8 has_strong_ref:1; + u8 pending_strong_ref:1; + u8 has_weak_ref:1; + u8 pending_weak_ref:1; + }; + struct { + /* + * invariant after initialization + */ + u8 sched_policy:2; + u8 inherit_rt:1; + u8 accept_fds:1; + u8 txn_security_ctx:1; + u8 min_priority; + }; + bool has_async_transaction; + struct list_head async_todo; +}; + +struct binder_ref_death { + /** + * @work: worklist element for death notifications + * (protected by inner_lock of the proc that + * this ref belongs to) + */ + struct binder_work work; + binder_uintptr_t cookie; +}; + +/** + * struct binder_ref_data - binder_ref counts and id + * @debug_id: unique ID for the ref + * @desc: unique userspace handle for ref + * @strong: strong ref count (debugging only if not locked) + * @weak: weak ref count (debugging only if not locked) + * + * Structure to hold ref count and ref id information. Since + * the actual ref can only be accessed with a lock, this structure + * is used to return information about the ref to callers of + * ref inc/dec functions. + */ +struct binder_ref_data { + int debug_id; + uint32_t desc; + int strong; + int weak; +}; + +/** + * struct binder_ref - struct to track references on nodes + * @data: binder_ref_data containing id, handle, and current refcounts + * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree + * @rb_node_node: node for lookup by @node in proc's rb_tree + * @node_entry: list entry for node->refs list in target node + * (protected by @node->lock) + * @proc: binder_proc containing ref + * @node: binder_node of target node. 
When cleaning up a + * ref for deletion in binder_cleanup_ref, a non-NULL + * @node indicates the node must be freed + * @death: pointer to death notification (ref_death) if requested + * (protected by @node->lock) + * + * Structure to track references from procA to target node (on procB). This + * structure is unsafe to access without holding @proc->outer_lock. + */ +struct binder_ref { + /* Lookups needed: */ + /* node + proc => ref (transaction) */ + /* desc + proc => ref (transaction, inc/dec ref) */ + /* node => refs + procs (proc exit) */ + struct binder_ref_data data; + struct rb_node rb_node_desc; + struct rb_node rb_node_node; + struct hlist_node node_entry; + struct binder_proc *proc; + struct binder_node *node; + struct binder_ref_death *death; +}; + enum binder_deferred_state { - BINDER_DEFERRED_FLUSH = 0x01, - BINDER_DEFERRED_RELEASE = 0x02, + BINDER_DEFERRED_PUT_FILES = 0x01, + BINDER_DEFERRED_FLUSH = 0x02, + BINDER_DEFERRED_RELEASE = 0x04, +}; + +/** + * struct binder_priority - scheduler policy and priority + * @sched_policy scheduler policy + * @prio [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT + * + * The binder driver supports inheriting the following scheduler policies: + * SCHED_NORMAL + * SCHED_BATCH + * SCHED_FIFO + * SCHED_RR + */ +struct binder_priority { + unsigned int sched_policy; + int prio; +}; + +/** + * struct binder_proc - binder process bookkeeping + * @proc_node: element for binder_procs list + * @threads: rbtree of binder_threads in this proc + * (protected by @inner_lock) + * @nodes: rbtree of binder nodes associated with + * this proc ordered by node->ptr + * (protected by @inner_lock) + * @refs_by_desc: rbtree of refs ordered by ref->desc + * (protected by @outer_lock) + * @refs_by_node: rbtree of refs ordered by ref->node + * (protected by @outer_lock) + * @waiting_threads: threads currently waiting for proc work + * (protected by @inner_lock) + * @pid PID of group_leader of process + * (invariant after initialized) + * 
@tsk task_struct for group_leader of process + * (invariant after initialized) + * @files files_struct for process + * (protected by @files_lock) + * @files_lock mutex to protect @files + * @cred struct cred associated with the `struct file` + * in binder_open() + * (invariant after initialized) + * @deferred_work_node: element for binder_deferred_list + * (protected by binder_deferred_lock) + * @deferred_work: bitmap of deferred work to perform + * (protected by binder_deferred_lock) + * @is_dead: process is dead and awaiting free + * when outstanding transactions are cleaned up + * (protected by @inner_lock) + * @todo: list of work for this process + * (protected by @inner_lock) + * @stats: per-process binder statistics + * (atomics, no lock needed) + * @delivered_death: list of delivered death notification + * (protected by @inner_lock) + * @max_threads: cap on number of binder threads + * (protected by @inner_lock) + * @requested_threads: number of binder threads requested but not + * yet started. In current implementation, can + * only be 0 or 1. 
+ * (protected by @inner_lock) + * @requested_threads_started: number binder threads started + * (protected by @inner_lock) + * @tmp_ref: temporary reference to indicate proc is in use + * (atomic since @proc->inner_lock cannot + * always be acquired) + * @default_priority: default scheduler priority + * (invariant after initialized) + * @debugfs_entry: debugfs node + * @alloc: binder allocator bookkeeping + * @context: binder_context for this proc + * (invariant after initialized) + * @inner_lock: can nest under outer_lock and/or node lock + * @outer_lock: no nesting under innor or node lock + * Lock order: 1) outer, 2) node, 3) inner + * + * Bookkeeping structure for binder processes + */ +struct binder_proc { + struct hlist_node proc_node; + struct rb_root threads; + struct rb_root nodes; + struct rb_root refs_by_desc; + struct rb_root refs_by_node; + struct list_head waiting_threads; + int pid; + struct task_struct *tsk; + struct files_struct *files; + struct mutex files_lock; + const struct cred *cred; + struct hlist_node deferred_work_node; + int deferred_work; + bool is_dead; + + struct list_head todo; + struct binder_stats stats; + struct list_head delivered_death; + int max_threads; + int requested_threads; + int requested_threads_started; + atomic_t tmp_ref; + struct binder_priority default_priority; + struct dentry *debugfs_entry; + struct binder_alloc alloc; + struct binder_context *context; + spinlock_t inner_lock; + spinlock_t outer_lock; }; enum { @@ -243,6 +582,110 @@ enum { BINDER_LOOPER_STATE_POLL = 0x20, }; +/** + * struct binder_thread - binder thread bookkeeping + * @proc: binder process for this thread + * (invariant after initialization) + * @rb_node: element for proc->threads rbtree + * (protected by @proc->inner_lock) + * @waiting_thread_node: element for @proc->waiting_threads list + * (protected by @proc->inner_lock) + * @pid: PID for this thread + * (invariant after initialization) + * @looper: bitmap of looping state + * (only accessed 
by this thread) + * @looper_needs_return: looping thread needs to exit driver + * (no lock needed) + * @transaction_stack: stack of in-progress transactions for this thread + * (protected by @proc->inner_lock) + * @todo: list of work to do for this thread + * (protected by @proc->inner_lock) + * @process_todo: whether work in @todo should be processed + * (protected by @proc->inner_lock) + * @return_error: transaction errors reported by this thread + * (only accessed by this thread) + * @reply_error: transaction errors reported by target thread + * (protected by @proc->inner_lock) + * @wait: wait queue for thread work + * @stats: per-thread statistics + * (atomics, no lock needed) + * @tmp_ref: temporary reference to indicate thread is in use + * (atomic since @proc->inner_lock cannot + * always be acquired) + * @is_dead: thread is dead and awaiting free + * when outstanding transactions are cleaned up + * (protected by @proc->inner_lock) + * @task: struct task_struct for this thread + * + * Bookkeeping structure for binder threads. 
+ */ +struct binder_thread { + struct binder_proc *proc; + struct rb_node rb_node; + struct list_head waiting_thread_node; + int pid; + int looper; /* only modified by this thread */ + bool looper_need_return; /* can be written by other thread */ + struct binder_transaction *transaction_stack; + struct list_head todo; + bool process_todo; + struct binder_error return_error; + struct binder_error reply_error; + wait_queue_head_t wait; + struct binder_stats stats; + atomic_t tmp_ref; + bool is_dead; + struct task_struct *task; +}; + +struct binder_transaction { + int debug_id; + struct binder_work work; + struct binder_thread *from; + struct binder_transaction *from_parent; + struct binder_proc *to_proc; + struct binder_thread *to_thread; + struct binder_transaction *to_parent; + unsigned need_reply:1; + /* unsigned is_dead:1; */ /* not used at the moment */ + + struct binder_buffer *buffer; + unsigned int code; + unsigned int flags; + struct binder_priority priority; + struct binder_priority saved_priority; + bool set_priority_called; + kuid_t sender_euid; + binder_uintptr_t security_ctx; + /** + * @lock: protects @from, @to_proc, and @to_thread + * + * @from, @to_proc, and @to_thread can be set to NULL + * during thread teardown + */ + spinlock_t lock; +}; + +/** + * struct binder_object - union of flat binder object types + * @hdr: generic object header + * @fbo: binder object (nodes and refs) + * @fdo: file descriptor object + * @bbo: binder buffer pointer + * @fdao: file descriptor array + * + * Used for type-independent object copies + */ +struct binder_object { + union { + struct binder_object_header hdr; + struct flat_binder_object fbo; + struct binder_fd_object fdo; + struct binder_buffer_object bbo; + struct binder_fd_array_object fdao; + }; +}; + /** * binder_proc_lock() - Acquire outer lock for given binder_proc * @proc: struct binder_proc to acquire @@ -253,7 +696,6 @@ enum { #define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__) static void 
_binder_proc_lock(struct binder_proc *proc, int line) - __acquires(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -269,7 +711,6 @@ _binder_proc_lock(struct binder_proc *proc, int line) #define binder_proc_unlock(_proc) _binder_proc_unlock(_proc, __LINE__) static void _binder_proc_unlock(struct binder_proc *proc, int line) - __releases(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -285,7 +726,6 @@ _binder_proc_unlock(struct binder_proc *proc, int line) #define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__) static void _binder_inner_proc_lock(struct binder_proc *proc, int line) - __acquires(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -301,7 +741,6 @@ _binder_inner_proc_lock(struct binder_proc *proc, int line) #define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__) static void _binder_inner_proc_unlock(struct binder_proc *proc, int line) - __releases(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -317,7 +756,6 @@ _binder_inner_proc_unlock(struct binder_proc *proc, int line) #define binder_node_lock(node) _binder_node_lock(node, __LINE__) static void _binder_node_lock(struct binder_node *node, int line) - __acquires(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -333,7 +771,6 @@ _binder_node_lock(struct binder_node *node, int line) #define binder_node_unlock(node) _binder_node_unlock(node, __LINE__) static void _binder_node_unlock(struct binder_node *node, int line) - __releases(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); @@ -350,16 +787,12 @@ _binder_node_unlock(struct binder_node *node, int line) #define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__) static void _binder_node_inner_lock(struct binder_node *node, int line) - 
__acquires(&node->lock) __acquires(&node->proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&node->lock); if (node->proc) binder_inner_proc_lock(node->proc); - else - /* annotation for sparse */ - __acquire(&node->proc->inner_lock); } /** @@ -371,7 +804,6 @@ _binder_node_inner_lock(struct binder_node *node, int line) #define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__) static void _binder_node_inner_unlock(struct binder_node *node, int line) - __releases(&node->lock) __releases(&node->proc->inner_lock) { struct binder_proc *proc = node->proc; @@ -379,9 +811,6 @@ _binder_node_inner_unlock(struct binder_node *node, int line) "%s: line=%d\n", __func__, line); if (proc) binder_inner_proc_unlock(proc); - else - /* annotation for sparse */ - __release(&node->proc->inner_lock); spin_unlock(&node->lock); } @@ -442,7 +871,6 @@ static void binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { - WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); } @@ -460,7 +888,6 @@ static void binder_enqueue_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { - WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); thread->process_todo = true; } @@ -521,13 +948,69 @@ static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); static void binder_inc_node_tmpref_ilocked(struct binder_node *node); +static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) +{ + unsigned long rlim_cur; + unsigned long irqs; + int ret; + + mutex_lock(&proc->files_lock); + if (proc->files == NULL) { + ret = -ESRCH; + goto err; + } + if (!lock_task_sighand(proc->tsk, &irqs)) { + ret = -EMFILE; + goto err; + } + rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); + unlock_task_sighand(proc->tsk, 
&irqs); + + ret = __alloc_fd(proc->files, 0, rlim_cur, flags); +err: + mutex_unlock(&proc->files_lock); + return ret; +} + +/* + * copied from fd_install + */ +static void task_fd_install( + struct binder_proc *proc, unsigned int fd, struct file *file) +{ + mutex_lock(&proc->files_lock); + if (proc->files) + __fd_install(proc->files, fd, file); + mutex_unlock(&proc->files_lock); +} + +/* + * copied from sys_close + */ +static long task_close_fd(struct binder_proc *proc, unsigned int fd) +{ + int retval; + + mutex_lock(&proc->files_lock); + if (proc->files == NULL) { + retval = -ESRCH; + goto err; + } + retval = __close_fd(proc->files, fd); + /* can't restart close syscall because file table entry was cleared */ + if (unlikely(retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK)) + retval = -EINTR; +err: + mutex_unlock(&proc->files_lock); + return retval; +} + static bool binder_has_work_ilocked(struct binder_thread *thread, bool do_proc_work) { - int ret = 0; - - if (ret) - return true; return thread->process_todo || thread->looper_need_return || (do_proc_work && @@ -672,7 +1155,7 @@ static int to_userspace_prio(int policy, int kernel_priority) if (is_fair_policy(policy)) return PRIO_TO_NICE(kernel_priority); else - return MAX_RT_PRIO - 1 - kernel_priority; + return MAX_USER_RT_PRIO - 1 - kernel_priority; } static int to_kernel_prio(int policy, int user_priority) @@ -680,29 +1163,23 @@ static int to_kernel_prio(int policy, int user_priority) if (is_fair_policy(policy)) return NICE_TO_PRIO(user_priority); else - return MAX_RT_PRIO - 1 - user_priority; + return MAX_USER_RT_PRIO - 1 - user_priority; } -static void binder_do_set_priority(struct binder_thread *thread, - const struct binder_priority *desired, +static void binder_do_set_priority(struct task_struct *task, + struct binder_priority desired, bool verify) { - struct task_struct *task = thread->task; int priority; /* user-space prio value */ 
bool has_cap_nice; - unsigned int policy = desired->sched_policy; + unsigned int policy = desired.sched_policy; - if (task->policy == policy && task->normal_prio == desired->prio) { - spin_lock(&thread->prio_lock); - if (thread->prio_state == BINDER_PRIO_PENDING) - thread->prio_state = BINDER_PRIO_SET; - spin_unlock(&thread->prio_lock); + if (task->policy == policy && task->normal_prio == desired.prio) return; - } has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE); - priority = to_userspace_prio(policy, desired->prio); + priority = to_userspace_prio(policy, desired.prio); if (verify && is_rt_policy(policy) && !has_cap_nice) { long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO); @@ -727,30 +1204,16 @@ static void binder_do_set_priority(struct binder_thread *thread, } } - if (policy != desired->sched_policy || - to_kernel_prio(policy, priority) != desired->prio) + if (policy != desired.sched_policy || + to_kernel_prio(policy, priority) != desired.prio) binder_debug(BINDER_DEBUG_PRIORITY_CAP, "%d: priority %d not allowed, using %d instead\n", - task->pid, desired->prio, + task->pid, desired.prio, to_kernel_prio(policy, priority)); trace_binder_set_priority(task->tgid, task->pid, task->normal_prio, to_kernel_prio(policy, priority), - desired->prio); - - spin_lock(&thread->prio_lock); - if (!verify && thread->prio_state == BINDER_PRIO_ABORT) { - /* - * A new priority has been set by an incoming nested - * transaction. Abort this priority restore and allow - * the transaction to run at the new desired priority. 
- */ - spin_unlock(&thread->prio_lock); - binder_debug(BINDER_DEBUG_PRIORITY_CAP, - "%d: %s: aborting priority restore\n", - thread->pid, __func__); - return; - } + desired.prio); /* Set the actual priority */ if (task->policy != policy || is_rt_policy(policy)) { @@ -764,46 +1227,37 @@ static void binder_do_set_priority(struct binder_thread *thread, } if (is_fair_policy(policy)) set_user_nice(task, priority); - - thread->prio_state = BINDER_PRIO_SET; - spin_unlock(&thread->prio_lock); } -static void binder_set_priority(struct binder_thread *thread, - const struct binder_priority *desired) +static void binder_set_priority(struct task_struct *task, + struct binder_priority desired) { - binder_do_set_priority(thread, desired, /* verify = */ true); + binder_do_set_priority(task, desired, /* verify = */ true); } -static void binder_restore_priority(struct binder_thread *thread, - const struct binder_priority *desired) +static void binder_restore_priority(struct task_struct *task, + struct binder_priority desired) { - binder_do_set_priority(thread, desired, /* verify = */ false); + binder_do_set_priority(task, desired, /* verify = */ false); } -static void binder_transaction_priority(struct binder_thread *thread, +static void binder_transaction_priority(struct task_struct *task, struct binder_transaction *t, - struct binder_node *node) + struct binder_priority node_prio, + bool inherit_rt) { - struct task_struct *task = thread->task; - struct binder_priority desired = t->priority; - const struct binder_priority node_prio = { - .sched_policy = node->sched_policy, - .prio = node->min_priority, - }; - bool skip = false; + struct binder_priority desired_prio = t->priority; if (t->set_priority_called) return; t->set_priority_called = true; + t->saved_priority.sched_policy = task->policy; + t->saved_priority.prio = task->normal_prio; - if (skip) - return; - - if (!node->inherit_rt && is_rt_policy(desired.sched_policy)) { - desired.prio = NICE_TO_PRIO(0); - desired.sched_policy 
= SCHED_NORMAL; + if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) { + desired_prio.prio = NICE_TO_PRIO(0); + desired_prio.sched_policy = SCHED_NORMAL; } if (node_prio.prio < t->priority.prio || @@ -816,29 +1270,10 @@ static void binder_transaction_priority(struct binder_thread *thread, * SCHED_FIFO, prefer SCHED_FIFO, since it can * run unbounded, unlike SCHED_RR. */ - desired = node_prio; - } - - spin_lock(&thread->prio_lock); - if (thread->prio_state == BINDER_PRIO_PENDING) { - /* - * Task is in the process of changing priorities - * saving its current values would be incorrect. - * Instead, save the pending priority and signal - * the task to abort the priority restore. - */ - t->saved_priority = thread->prio_next; - thread->prio_state = BINDER_PRIO_ABORT; - binder_debug(BINDER_DEBUG_PRIORITY_CAP, - "%d: saved pending priority %d\n", - current->pid, thread->prio_next.prio); - } else { - t->saved_priority.sched_policy = task->policy; - t->saved_priority.prio = task->normal_prio; + desired_prio = node_prio; } - spin_unlock(&thread->prio_lock); - binder_set_priority(thread, &desired); + binder_set_priority(task, desired_prio); } static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, @@ -945,9 +1380,9 @@ static struct binder_node *binder_init_node_ilocked( static struct binder_node *binder_new_node(struct binder_proc *proc, struct flat_binder_object *fp) { - struct binder_node *node, *new_node; + struct binder_node *node; + struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL); - new_node = kmem_cache_zalloc(binder_node_pool, GFP_KERNEL); if (!new_node) return NULL; binder_inner_proc_lock(proc); @@ -957,14 +1392,14 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, /* * The node was already added by another thread */ - kmem_cache_free(binder_node_pool, new_node); + kfree(new_node); return node; } static void binder_free_node(struct binder_node *node) { - kmem_cache_free(binder_node_pool, node); + 
kfree(node); binder_stats_deleted(BINDER_STAT_NODE); } @@ -982,7 +1417,8 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong, if (target_list == NULL && node->internal_strong_refs == 0 && !(node->proc && - node == node->proc->context->binder_context_mgr_node && + node == node->proc->context-> + binder_context_mgr_node && node->has_strong_ref)) { pr_err("invalid inc strong node for %d\n", node->debug_id); @@ -992,12 +1428,19 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong, } else node->local_strong_refs++; if (!node->has_strong_ref && target_list) { - struct binder_thread *thread = container_of(target_list, - struct binder_thread, todo); binder_dequeue_work_ilocked(&node->work); - BUG_ON(&thread->todo != target_list); - binder_enqueue_deferred_thread_work_ilocked(thread, - &node->work); + /* + * Note: this function is the only place where we queue + * directly to a thread->todo without using the + * corresponding binder_enqueue_thread_work() helper + * functions; in this case it's ok to not set the + * process_todo flag, since we know this node work will + * always be followed by other work that starts queue + * processing: in case of synchronous transactions, a + * BR_REPLY or BR_ERROR; in case of oneway + * transactions, a BR_TRANSACTION_COMPLETE. + */ + binder_enqueue_work_ilocked(&node->work, target_list); } } else { if (!internal) @@ -1151,14 +1594,10 @@ static void binder_dec_node_tmpref(struct binder_node *node) binder_node_inner_lock(node); if (!node->proc) spin_lock(&binder_dead_nodes_lock); - else - __acquire(&binder_dead_nodes_lock); node->tmp_refs--; BUG_ON(node->tmp_refs < 0); if (!node->proc) spin_unlock(&binder_dead_nodes_lock); - else - __release(&binder_dead_nodes_lock); /* * Call binder_dec_node() to check if all refcounts are 0 * and cleanup is needed. 
Calling with strong=0 and internal=1 @@ -1448,9 +1887,8 @@ static void binder_free_ref(struct binder_ref *ref) { if (ref->node) binder_free_node(ref->node); - if (ref->death) - kmem_cache_free(binder_ref_death_pool, ref->death); - kmem_cache_free(binder_ref_pool, ref); + kfree(ref->death); + kfree(ref); } /** @@ -1543,7 +1981,7 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, ref = binder_get_ref_for_node_olocked(proc, node, NULL); if (!ref) { binder_proc_unlock(proc); - new_ref = kmem_cache_zalloc(binder_ref_pool, GFP_KERNEL); + new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!new_ref) return -ENOMEM; binder_proc_lock(proc); @@ -1569,7 +2007,7 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, * Another thread created the ref first so * free the one we allocated */ - kmem_cache_free(binder_ref_pool, new_ref); + kfree(new_ref); return ret; } @@ -1628,9 +2066,9 @@ static void binder_thread_dec_tmpref(struct binder_thread *thread) static void binder_proc_dec_tmpref(struct binder_proc *proc) { binder_inner_proc_lock(proc); - proc->tmp_ref--; + atomic_dec(&proc->tmp_ref); if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && - !proc->tmp_ref) { + !atomic_read(&proc->tmp_ref)) { binder_inner_proc_unlock(proc); binder_free_proc(proc); return; @@ -1674,89 +2112,45 @@ static struct binder_thread *binder_get_txn_from( */ static struct binder_thread *binder_get_txn_from_and_acq_inner( struct binder_transaction *t) - __acquires(&t->from->proc->inner_lock) { struct binder_thread *from; from = binder_get_txn_from(t); - if (!from) { - __acquire(&from->proc->inner_lock); + if (!from) return NULL; - } binder_inner_proc_lock(from->proc); if (t->from) { BUG_ON(from != t->from); return from; } binder_inner_proc_unlock(from->proc); - __acquire(&from->proc->inner_lock); binder_thread_dec_tmpref(from); return NULL; } -/** - * binder_free_txn_fixups() - free unprocessed fd fixups - * @t: binder transaction for t->from - * - * If the transaction is being torn 
down prior to being - * processed by the target process, free all of the - * fd fixups and fput the file structs. It is safe to - * call this function after the fixups have been - * processed -- in that case, the list will be empty. - */ -static void binder_free_txn_fixups(struct binder_transaction *t) -{ - struct binder_txn_fd_fixup *fixup, *tmp; - - list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { - fput(fixup->file); - list_del(&fixup->fixup_entry); - kfree(fixup); - } -} - -static void binder_txn_latency_free(struct binder_transaction *t) -{ - int from_proc, from_thread, to_proc, to_thread; - - spin_lock(&t->lock); - from_proc = t->from ? t->from->proc->pid : 0; - from_thread = t->from ? t->from->pid : 0; - to_proc = t->to_proc ? t->to_proc->pid : 0; - to_thread = t->to_thread ? t->to_thread->pid : 0; - spin_unlock(&t->lock); - - trace_binder_txn_latency_free(t, from_proc, from_thread, to_proc, to_thread); -} - static void binder_free_transaction(struct binder_transaction *t) { - struct binder_proc *target_proc = t->to_proc; + struct binder_proc *target_proc; + spin_lock(&t->lock); + target_proc = t->to_proc; if (target_proc) { + atomic_inc(&target_proc->tmp_ref); + spin_unlock(&t->lock); + binder_inner_proc_lock(target_proc); - target_proc->outstanding_txns--; - if (target_proc->outstanding_txns < 0) - pr_warn("%s: Unexpected outstanding_txns %d\n", - __func__, target_proc->outstanding_txns); - if (!target_proc->outstanding_txns && target_proc->is_frozen) - wake_up_interruptible_all(&target_proc->freeze_wait); if (t->buffer) t->buffer->transaction = NULL; binder_inner_proc_unlock(target_proc); + binder_proc_dec_tmpref(target_proc); + } else { + /* + * If the transaction has no target_proc, then + * t->buffer->transaction * has already been cleared. 
+ */ + spin_unlock(&t->lock); } - if (trace_binder_txn_latency_free_enabled()) - binder_txn_latency_free(t); - /* - * If the transaction has no target_proc, then - * t->buffer->transaction has already been cleared. - */ - binder_free_txn_fixups(t); - /* - * If the transaction has no target_proc, then - * t->buffer->transaction has already been cleared. - */ - kmem_cache_free(binder_transaction_pool, t); + kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); } @@ -1798,7 +2192,6 @@ static void binder_send_failed_reply(struct binder_transaction *t, binder_free_transaction(t); return; } - __release(&target_thread->proc->inner_lock); next = t->from_parent; binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, @@ -1841,21 +2234,15 @@ static void binder_cleanup_transaction(struct binder_transaction *t, /** * binder_get_object() - gets object and checks for valid metadata * @proc: binder_proc owning the buffer - * @u: sender's user pointer to base of buffer * @buffer: binder_buffer that we're parsing. * @offset: offset in the @buffer at which to validate an object. * @object: struct binder_object to read into * - * Copy the binder object at the given offset into @object. If @u is - * provided then the copy is from the sender's buffer. If not, then - * it is copied from the target's @buffer. - * - * Return: If there's a valid metadata object at @offset, the + * Return: If there's a valid metadata object at @offset in @buffer, the * size of that object. Otherwise, it returns zero. The object * is read into the struct binder_object pointed to by @object. 
*/ static size_t binder_get_object(struct binder_proc *proc, - const void __user *u, struct binder_buffer *buffer, unsigned long offset, struct binder_object *object) @@ -1865,16 +2252,11 @@ static size_t binder_get_object(struct binder_proc *proc, size_t object_size = 0; read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); - if (offset > buffer->data_size || read_size < sizeof(*hdr)) + if (offset > buffer->data_size || read_size < sizeof(*hdr) || + !IS_ALIGNED(offset, sizeof(u32))) return 0; - if (u) { - if (copy_from_user(object, u + offset, read_size)) - return 0; - } else { - if (binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, - offset, read_size)) - return 0; - } + binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, + offset, read_size); /* Ok, now see if we read a complete object. */ hdr = &object->hdr; @@ -1943,11 +2325,9 @@ static struct binder_buffer_object *binder_validate_ptr( return NULL; buffer_offset = start_offset + sizeof(binder_size_t) * index; - if (binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, - b, buffer_offset, - sizeof(object_offset))) - return NULL; - object_size = binder_get_object(proc, NULL, b, object_offset, object); + binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, + b, buffer_offset, sizeof(object_offset)); + object_size = binder_get_object(proc, b, object_offset, object); if (!object_size || object->hdr.type != BINDER_TYPE_PTR) return NULL; if (object_offsetp) @@ -2012,8 +2392,7 @@ static bool binder_validate_fixup(struct binder_proc *proc, unsigned long buffer_offset; struct binder_object last_object; struct binder_buffer_object *last_bbo; - size_t object_size = binder_get_object(proc, NULL, b, - last_obj_offset, + size_t object_size = binder_get_object(proc, b, last_obj_offset, &last_object); if (object_size != sizeof(*last_bbo)) return false; @@ -2027,78 +2406,15 @@ static bool binder_validate_fixup(struct binder_proc *proc, return false; last_min_offset = 
last_bbo->parent_offset + sizeof(uintptr_t); buffer_offset = objects_start_offset + - sizeof(binder_size_t) * last_bbo->parent; - if (binder_alloc_copy_from_buffer(&proc->alloc, - &last_obj_offset, - b, buffer_offset, - sizeof(last_obj_offset))) - return false; + sizeof(binder_size_t) * last_bbo->parent, + binder_alloc_copy_from_buffer(&proc->alloc, &last_obj_offset, + b, buffer_offset, + sizeof(last_obj_offset)); } return (fixup_offset >= last_min_offset); } -/** - * struct binder_task_work_cb - for deferred close - * - * @twork: callback_head for task work - * @fd: fd to close - * - * Structure to pass task work to be handled after - * returning from binder_ioctl() via task_work_add(). - */ -struct binder_task_work_cb { - struct callback_head twork; - struct file *file; -}; - -/** - * binder_do_fd_close() - close list of file descriptors - * @twork: callback head for task work - * - * It is not safe to call ksys_close() during the binder_ioctl() - * function if there is a chance that binder's own file descriptor - * might be closed. This is to meet the requirements for using - * fdget() (see comments for __fget_light()). Therefore use - * task_work_add() to schedule the close operation once we have - * returned from binder_ioctl(). This function is a callback - * for that mechanism and does the actual ksys_close() on the - * given file descriptor. - */ -static void binder_do_fd_close(struct callback_head *twork) -{ - struct binder_task_work_cb *twcb = container_of(twork, - struct binder_task_work_cb, twork); - - fput(twcb->file); - kfree(twcb); -} - -/** - * binder_deferred_fd_close() - schedule a close for the given file-descriptor - * @fd: file-descriptor to close - * - * See comments in binder_do_fd_close(). This function is used to schedule - * a file-descriptor to be closed after returning from binder_ioctl(). 
- */ -static void binder_deferred_fd_close(int fd) -{ - struct binder_task_work_cb *twcb; - - twcb = kzalloc(sizeof(*twcb), GFP_KERNEL); - if (!twcb) - return; - init_task_work(&twcb->twork, binder_do_fd_close); - close_fd_get_file(fd, &twcb->file); - if (twcb->file) { - filp_close(twcb->file, current->files); - task_work_add(current, &twcb->twork, true); - } else { - kfree(twcb); - } -} - static void binder_transaction_buffer_release(struct binder_proc *proc, - struct binder_thread *thread, struct binder_buffer *buffer, binder_size_t failed_at, bool is_failure) @@ -2116,20 +2432,20 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); - off_end_offset = is_failure && failed_at ? failed_at : + off_end_offset = is_failure ? failed_at : off_start_offset + buffer->offsets_size; for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; - size_t object_size = 0; + size_t object_size; struct binder_object object; binder_size_t object_offset; - if (!binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, - buffer, buffer_offset, - sizeof(object_offset))) - object_size = binder_get_object(proc, NULL, buffer, - object_offset, &object); + binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, + buffer, buffer_offset, + sizeof(object_offset)); + object_size = binder_get_object(proc, buffer, + object_offset, &object); if (object_size == 0) { pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)object_offset, buffer->data_size); @@ -2177,15 +2493,12 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } break; case BINDER_TYPE_FD: { - /* - * No need to close the file here since user-space - * closes it for for successfully delivered - * transactions. 
For transactions that weren't - * delivered, the new fd was never allocated so - * there is no need to close and the fput on the - * file is done when the transaction is torn - * down. - */ + struct binder_fd_object *fp = to_binder_fd_object(hdr); + + binder_debug(BINDER_DEBUG_TRANSACTION, + " fd %d\n", fp->fd); + if (failed_at) + task_close_fd(proc, fp->fd); } break; case BINDER_TYPE_PTR: /* @@ -2202,14 +2515,6 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_size_t fd_buf_size; binder_size_t num_valid; - if (is_failure) { - /* - * The fd fixups have not been applied so no - * fds need to be closed. - */ - continue; - } - num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); fda = to_binder_fd_array_object(hdr); @@ -2219,7 +2524,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, NULL, num_valid); if (!parent) { - pr_err("transaction release %d bad parent offset\n", + pr_err("transaction release %d bad parent offset", debug_id); continue; } @@ -2249,24 +2554,15 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, for (fd_index = 0; fd_index < fda->num_fds; fd_index++) { u32 fd; - int err; binder_size_t offset = fda_offset + fd_index * sizeof(fd); - err = binder_alloc_copy_from_buffer( - &proc->alloc, &fd, buffer, - offset, sizeof(fd)); - WARN_ON(err); - if (!err) { - binder_deferred_fd_close(fd); - /* - * Need to make sure the thread goes - * back to userspace to complete the - * deferred close - */ - if (thread) - thread->looper_need_return = true; - } + binder_alloc_copy_from_buffer(&proc->alloc, + &fd, + buffer, + offset, + sizeof(fd)); + task_close_fd(proc, fd); } } break; default: @@ -2362,15 +2658,11 @@ static int binder_translate_handle(struct flat_binder_object *fp, fp->cookie = node->cookie; if (node->proc) binder_inner_proc_lock(node->proc); - else - __acquire(&node->proc->inner_lock); binder_inc_node_nilocked(node, fp->hdr.type == BINDER_TYPE_BINDER, 0, 
NULL); if (node->proc) binder_inner_proc_unlock(node->proc); - else - __release(&node->proc->inner_lock); trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", @@ -2403,16 +2695,16 @@ static int binder_translate_handle(struct flat_binder_object *fp, return ret; } -static int binder_translate_fd(u32 fd, binder_size_t fd_offset, +static int binder_translate_fd(int fd, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; - struct binder_txn_fd_fixup *fixup; + int target_fd; struct file *file; - int ret = 0; + int ret; bool target_allows_fd; if (in_reply_to) @@ -2441,24 +2733,19 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset, goto err_security; } - /* - * Add fixup record for this transaction. The allocation - * of the fd in the target needs to be done from a - * target thread. 
- */ - fixup = kzalloc(sizeof(*fixup), GFP_KERNEL); - if (!fixup) { + target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); + if (target_fd < 0) { ret = -ENOMEM; - goto err_alloc; + goto err_get_unused_fd; } - fixup->file = file; - fixup->offset = fd_offset; - trace_binder_transaction_fd_send(t, fd, fixup->offset); - list_add_tail(&fixup->fixup_entry, &t->fd_fixups); + task_fd_install(target_proc, target_fd, file); + trace_binder_transaction_fd(t, fd, target_fd); + binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n", + fd, target_fd); - return ret; + return target_fd; -err_alloc: +err_get_unused_fd: err_security: fput(file); err_fget: @@ -2466,266 +2753,17 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset, return ret; } -/** - * struct binder_ptr_fixup - data to be fixed-up in target buffer - * @offset offset in target buffer to fixup - * @skip_size bytes to skip in copy (fixup will be written later) - * @fixup_data data to write at fixup offset - * @node list node - * - * This is used for the pointer fixup list (pf) which is created and consumed - * during binder_transaction() and is only accessed locally. No - * locking is necessary. - * - * The list is ordered by @offset. - */ -struct binder_ptr_fixup { - binder_size_t offset; - size_t skip_size; - binder_uintptr_t fixup_data; - struct list_head node; -}; - -/** - * struct binder_sg_copy - scatter-gather data to be copied - * @offset offset in target buffer - * @sender_uaddr user address in source buffer - * @length bytes to copy - * @node list node - * - * This is used for the sg copy list (sgc) which is created and consumed - * during binder_transaction() and is only accessed locally. No - * locking is necessary. - * - * The list is ordered by @offset. 
- */ -struct binder_sg_copy { - binder_size_t offset; - const void __user *sender_uaddr; - size_t length; - struct list_head node; -}; - -/** - * binder_do_deferred_txn_copies() - copy and fixup scatter-gather data - * @alloc: binder_alloc associated with @buffer - * @buffer: binder buffer in target process - * @sgc_head: list_head of scatter-gather copy list - * @pf_head: list_head of pointer fixup list - * - * Processes all elements of @sgc_head, applying fixups from @pf_head - * and copying the scatter-gather data from the source process' user - * buffer to the target's buffer. It is expected that the list creation - * and processing all occurs during binder_transaction() so these lists - * are only accessed in local context. - * - * Return: 0=success, else -errno - */ -static int binder_do_deferred_txn_copies(struct binder_alloc *alloc, - struct binder_buffer *buffer, - struct list_head *sgc_head, - struct list_head *pf_head) -{ - int ret = 0; - struct binder_sg_copy *sgc, *tmpsgc; - struct binder_ptr_fixup *tmppf; - struct binder_ptr_fixup *pf = - list_first_entry_or_null(pf_head, struct binder_ptr_fixup, - node); - - list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { - size_t bytes_copied = 0; - - while (bytes_copied < sgc->length) { - size_t copy_size; - size_t bytes_left = sgc->length - bytes_copied; - size_t offset = sgc->offset + bytes_copied; - - /* - * We copy up to the fixup (pointed to by pf) - */ - copy_size = pf ? min(bytes_left, (size_t)pf->offset - offset) - : bytes_left; - if (!ret && copy_size) - ret = binder_alloc_copy_user_to_buffer( - alloc, buffer, - offset, - sgc->sender_uaddr + bytes_copied, - copy_size); - bytes_copied += copy_size; - if (copy_size != bytes_left) { - BUG_ON(!pf); - /* we stopped at a fixup offset */ - if (pf->skip_size) { - /* - * we are just skipping. This is for - * BINDER_TYPE_FDA where the translated - * fds will be fixed up when we get - * to target context. 
- */ - bytes_copied += pf->skip_size; - } else { - /* apply the fixup indicated by pf */ - if (!ret) - ret = binder_alloc_copy_to_buffer( - alloc, buffer, - pf->offset, - &pf->fixup_data, - sizeof(pf->fixup_data)); - bytes_copied += sizeof(pf->fixup_data); - } - list_del(&pf->node); - kfree(pf); - pf = list_first_entry_or_null(pf_head, - struct binder_ptr_fixup, node); - } - } - list_del(&sgc->node); - kfree(sgc); - } - list_for_each_entry_safe(pf, tmppf, pf_head, node) { - BUG_ON(pf->skip_size == 0); - list_del(&pf->node); - kfree(pf); - } - BUG_ON(!list_empty(sgc_head)); - - return ret > 0 ? -EINVAL : ret; -} - -/** - * binder_cleanup_deferred_txn_lists() - free specified lists - * @sgc_head: list_head of scatter-gather copy list - * @pf_head: list_head of pointer fixup list - * - * Called to clean up @sgc_head and @pf_head if there is an - * error. - */ -static void binder_cleanup_deferred_txn_lists(struct list_head *sgc_head, - struct list_head *pf_head) -{ - struct binder_sg_copy *sgc, *tmpsgc; - struct binder_ptr_fixup *pf, *tmppf; - - list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { - list_del(&sgc->node); - kfree(sgc); - } - list_for_each_entry_safe(pf, tmppf, pf_head, node) { - list_del(&pf->node); - kfree(pf); - } -} - -/** - * binder_defer_copy() - queue a scatter-gather buffer for copy - * @sgc_head: list_head of scatter-gather copy list - * @offset: binder buffer offset in target process - * @sender_uaddr: user address in source process - * @length: bytes to copy - * - * Specify a scatter-gather block to be copied. The actual copy must - * be deferred until all the needed fixups are identified and queued. - * Then the copy and fixups are done together so un-translated values - * from the source are never visible in the target buffer. - * - * We are guaranteed that repeated calls to this function will have - * monotonically increasing @offset values so the list will naturally - * be ordered. 
- * - * Return: 0=success, else -errno - */ -static int binder_defer_copy(struct list_head *sgc_head, binder_size_t offset, - const void __user *sender_uaddr, size_t length) -{ - struct binder_sg_copy *bc = kzalloc(sizeof(*bc), GFP_KERNEL); - - if (!bc) - return -ENOMEM; - - bc->offset = offset; - bc->sender_uaddr = sender_uaddr; - bc->length = length; - INIT_LIST_HEAD(&bc->node); - - /* - * We are guaranteed that the deferred copies are in-order - * so just add to the tail. - */ - list_add_tail(&bc->node, sgc_head); - - return 0; -} - -/** - * binder_add_fixup() - queue a fixup to be applied to sg copy - * @pf_head: list_head of binder ptr fixup list - * @offset: binder buffer offset in target process - * @fixup: bytes to be copied for fixup - * @skip_size: bytes to skip when copying (fixup will be applied later) - * - * Add the specified fixup to a list ordered by @offset. When copying - * the scatter-gather buffers, the fixup will be copied instead of - * data from the source buffer. For BINDER_TYPE_FDA fixups, the fixup - * will be applied later (in target process context), so we just skip - * the bytes specified by @skip_size. If @skip_size is 0, we copy the - * value in @fixup. - * - * This function is called *mostly* in @offset order, but there are - * exceptions. Since out-of-order inserts are relatively uncommon, - * we insert the new element by searching backward from the tail of - * the list. - * - * Return: 0=success, else -errno - */ -static int binder_add_fixup(struct list_head *pf_head, binder_size_t offset, - binder_uintptr_t fixup, size_t skip_size) -{ - struct binder_ptr_fixup *pf = kzalloc(sizeof(*pf), GFP_KERNEL); - struct binder_ptr_fixup *tmppf; - - if (!pf) - return -ENOMEM; - - pf->offset = offset; - pf->fixup_data = fixup; - pf->skip_size = skip_size; - INIT_LIST_HEAD(&pf->node); - - /* Fixups are *mostly* added in-order, but there are some - * exceptions. Look backwards through list for insertion point. 
- */ - list_for_each_entry_reverse(tmppf, pf_head, node) { - if (tmppf->offset < pf->offset) { - list_add(&pf->node, &tmppf->node); - return 0; - } - } - /* - * if we get here, then the new offset is the lowest so - * insert at the head - */ - list_add(&pf->node, pf_head); - return 0; -} - -static int binder_translate_fd_array(struct list_head *pf_head, - struct binder_fd_array_object *fda, - const void __user *sender_ubuffer, +static int binder_translate_fd_array(struct binder_fd_array_object *fda, struct binder_buffer_object *parent, - struct binder_buffer_object *sender_uparent, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { - binder_size_t fdi, fd_buf_size; + binder_size_t fdi, fd_buf_size, num_installed_fds; binder_size_t fda_offset; - const void __user *sender_ufda_base; + int target_fd; struct binder_proc *proc = thread->proc; - int ret; - - if (fda->num_fds == 0) - return 0; + struct binder_proc *target_proc = t->to_proc; fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { @@ -2749,36 +2787,46 @@ static int binder_translate_fd_array(struct list_head *pf_head, */ fda_offset = (parent->buffer - (uintptr_t)t->buffer->user_data) + fda->parent_offset; - sender_ufda_base = (void __user *)(uintptr_t)sender_uparent->buffer + - fda->parent_offset; - - if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32)) || - !IS_ALIGNED((unsigned long)sender_ufda_base, sizeof(u32))) { + if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); return -EINVAL; } - ret = binder_add_fixup(pf_head, fda_offset, 0, fda->num_fds * sizeof(u32)); - if (ret) - return ret; - for (fdi = 0; fdi < fda->num_fds; fdi++) { u32 fd; + binder_size_t offset = fda_offset + fdi * sizeof(fd); - binder_size_t sender_uoffset = fdi * sizeof(fd); - ret = copy_from_user(&fd, sender_ufda_base + sender_uoffset, sizeof(fd)); - if 
(!ret) - ret = binder_translate_fd(fd, offset, t, thread, - in_reply_to); - if (ret) - return ret > 0 ? -EINVAL : ret; + binder_alloc_copy_from_buffer(&target_proc->alloc, + &fd, t->buffer, + offset, sizeof(fd)); + target_fd = binder_translate_fd(fd, t, thread, in_reply_to); + if (target_fd < 0) + goto err_translate_fd_failed; + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, offset, + &target_fd, sizeof(fd)); } return 0; + +err_translate_fd_failed: + /* + * Failed to allocate fd or security error, free fds + * installed so far. + */ + num_installed_fds = fdi; + for (fdi = 0; fdi < num_installed_fds; fdi++) { + u32 fd; + binder_size_t offset = fda_offset + fdi * sizeof(fd); + binder_alloc_copy_from_buffer(&target_proc->alloc, + &fd, t->buffer, + offset, sizeof(fd)); + task_close_fd(target_proc, fd); + } + return target_fd; } -static int binder_fixup_parent(struct list_head *pf_head, - struct binder_transaction *t, +static int binder_fixup_parent(struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, binder_size_t off_start_offset, @@ -2824,88 +2872,11 @@ static int binder_fixup_parent(struct list_head *pf_head, } buffer_offset = bp->parent_offset + (uintptr_t)parent->buffer - (uintptr_t)b->user_data; - return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0); -} - -/** - * binder_can_update_transaction() - Can a txn be superseded by an updated one? 
- * @t1: the pending async txn in the frozen process - * @t2: the new async txn to supersede the outdated pending one - * - * Return: true if t2 can supersede t1 - * false if t2 can not supersede t1 - */ -static bool binder_can_update_transaction(struct binder_transaction *t1, - struct binder_transaction *t2) -{ -#ifdef CONFIG_REKERNEL - if ((t1->flags & t2->flags & TF_ONE_WAY) != TF_ONE_WAY || !t1->to_proc || !t2->to_proc) -#else - if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) != - (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc) -#endif /* CONFIG_REKERNEL */ - return false; - if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code && - t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid && - t1->buffer->target_node->ptr == t2->buffer->target_node->ptr && - t1->buffer->target_node->cookie == t2->buffer->target_node->cookie) - return true; - return false; -} - -/** - * binder_find_outdated_transaction_ilocked() - Find the outdated transaction - * @t: new async transaction - * @target_list: list to find outdated transaction - * - * Return: the outdated transaction if found - * NULL if no outdated transacton can be found - * - * Requires the proc->inner_lock to be held. 
- */ -static struct binder_transaction * -binder_find_outdated_transaction_ilocked(struct binder_transaction *t, - struct list_head *target_list) -{ - struct binder_work *w; + binder_alloc_copy_to_buffer(&target_proc->alloc, b, buffer_offset, + &bp->buffer, sizeof(bp->buffer)); - list_for_each_entry(w, target_list, entry) { - struct binder_transaction *t_queued; - - if (w->type != BINDER_WORK_TRANSACTION) - continue; - t_queued = container_of(w, struct binder_transaction, work); - if (binder_can_update_transaction(t_queued, t)) - return t_queued; - } - return NULL; -} - -#ifdef CONFIG_REKERNEL -void rekernel_binder_transaction(bool reply, struct binder_transaction *t, - struct binder_node *target_node, struct binder_transaction_data *tr) { - struct binder_proc *to_proc; - struct binder_alloc *target_alloc; - if (!t->to_proc) - return; - to_proc = t->to_proc; - - if (reply) { - binder_reply_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, false, tr); - } else if (t->from) { - if (t->from->proc) { - binder_trans_handler(t->from->proc->pid, t->from->proc->tsk, to_proc->pid, to_proc->tsk, false, tr); - } - } else { // oneway=1 - binder_trans_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, true, tr); - - target_alloc = &to_proc->alloc; - if (target_alloc->free_async_space < (target_alloc->buffer_size / 10 + 0x300)) { - binder_overflow_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, true, tr); - } - } + return 0; } -#endif /* CONFIG_REKERNEL */ /** * binder_proc_transaction() - sends a transaction to a process and wakes it up @@ -2921,95 +2892,60 @@ void rekernel_binder_transaction(bool reply, struct binder_transaction *t, * If the @thread parameter is not NULL, the transaction is always queued * to the waitlist of that specific thread. 
* - * Return: 0 if the transaction was successfully queued - * BR_DEAD_REPLY if the target process or thread is dead - * BR_FROZEN_REPLY if the target process or thread is frozen + * Return: true if the transactions was successfully queued + * false if the target process or thread is dead */ -static int binder_proc_transaction(struct binder_transaction *t, +static bool binder_proc_transaction(struct binder_transaction *t, struct binder_proc *proc, struct binder_thread *thread) { struct binder_node *node = t->buffer->target_node; + struct binder_priority node_prio; bool oneway = !!(t->flags & TF_ONE_WAY); bool pending_async = false; - bool skip = false; - struct binder_transaction *t_outdated = NULL; BUG_ON(!node); binder_node_lock(node); + node_prio.prio = node->min_priority; + node_prio.sched_policy = node->sched_policy; if (oneway) { BUG_ON(thread); - if (node->has_async_transaction) + if (node->has_async_transaction) { pending_async = true; - else + } else { node->has_async_transaction = true; + } } binder_inner_proc_lock(proc); - if (proc->is_frozen) { - proc->sync_recv |= !oneway; - proc->async_recv |= oneway; - } - if ((proc->is_frozen && !oneway) || proc->is_dead || - (thread && thread->is_dead)) { + if (proc->is_dead || (thread && thread->is_dead)) { binder_inner_proc_unlock(proc); binder_node_unlock(node); - return proc->is_frozen ? 
BR_FROZEN_REPLY : BR_DEAD_REPLY; + return false; } - if (!thread && !pending_async && !skip) + if (!thread && !pending_async) thread = binder_select_thread_ilocked(proc); if (thread) { - binder_transaction_priority(thread, t, node); + binder_transaction_priority(thread->task, t, node_prio, + node->inherit_rt); binder_enqueue_thread_work_ilocked(thread, &t->work); } else if (!pending_async) { binder_enqueue_work_ilocked(&t->work, &proc->todo); } else { -#ifdef CONFIG_REKERNEL - if (frozen_task_group(proc->tsk)) { -#else - if ((t->flags & TF_UPDATE_TXN) && proc->is_frozen) { -#endif /* CONFIG_REKERNEL */ - t_outdated = binder_find_outdated_transaction_ilocked(t, - &node->async_todo); - if (t_outdated) { - binder_debug(BINDER_DEBUG_TRANSACTION, - "txn %d supersedes %d\n", - t->debug_id, t_outdated->debug_id); - list_del_init(&t_outdated->work.entry); - proc->outstanding_txns--; - } - } binder_enqueue_work_ilocked(&t->work, &node->async_todo); } if (!pending_async) binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); - proc->outstanding_txns++; binder_inner_proc_unlock(proc); binder_node_unlock(node); - /* - * To reduce potential contention, free the outdated transaction and - * buffer after releasing the locks. 
- */ - if (t_outdated) { - struct binder_buffer *buffer = t_outdated->buffer; - - t_outdated->buffer = NULL; - buffer->transaction = NULL; - trace_binder_transaction_update_buffer_release(buffer); - binder_transaction_buffer_release(proc, NULL, buffer, 0, 0); - binder_alloc_free_buf(&proc->alloc, buffer); - kfree(t_outdated); - binder_stats_deleted(BINDER_STAT_TRANSACTION); - } - - return 0; + return true; } /** @@ -3045,7 +2981,7 @@ static struct binder_node *binder_get_node_refs_for_txn( target_node = node; binder_inc_node_nilocked(node, 1, 0, NULL); binder_inc_node_tmpref_ilocked(node); - node->proc->tmp_ref++; + atomic_inc(&node->proc->tmp_ref); *procp = node->proc; } else *error = BR_DEAD_REPLY; @@ -3061,13 +2997,11 @@ static void binder_transaction(struct binder_proc *proc, { int ret; struct binder_transaction *t; - struct binder_work *w; struct binder_work *tcomplete; binder_size_t buffer_offset = 0; binder_size_t off_start_offset, off_end_offset; binder_size_t off_min; binder_size_t sg_buf_offset, sg_buf_end_offset; - binder_size_t user_offset = 0; struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; @@ -3082,13 +3016,6 @@ static void binder_transaction(struct binder_proc *proc, int t_debug_id = atomic_inc_return(&binder_last_id); char *secctx = NULL; u32 secctx_sz = 0; - bool is_nested = false; - struct list_head sgc_head; - struct list_head pf_head; - const void __user *user_buffer = (const void __user *) - (uintptr_t)tr->data.ptr.buffer; - INIT_LIST_HEAD(&sgc_head); - INIT_LIST_HEAD(&pf_head); e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; @@ -3098,7 +3025,7 @@ static void binder_transaction(struct binder_proc *proc, e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; - strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME); + e->context_name = proc->context->name; if (reply) { 
binder_inner_proc_lock(proc); @@ -3132,8 +3059,6 @@ static void binder_transaction(struct binder_proc *proc, binder_inner_proc_unlock(proc); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { - /* annotation for sparse */ - __release(&target_thread->proc->inner_lock); return_error = BR_DEAD_REPLY; return_error_line = __LINE__; goto err_dead_binder; @@ -3153,7 +3078,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_dead_binder; } target_proc = target_thread->proc; - target_proc->tmp_ref++; + atomic_inc(&target_proc->tmp_ref); binder_inner_proc_unlock(target_thread->proc); } else { if (tr->target.handle) { @@ -3174,8 +3099,8 @@ static void binder_transaction(struct binder_proc *proc, ref->node, &target_proc, &return_error); } else { - binder_user_error("%d:%d got transaction to invalid handle, %u\n", - proc->pid, thread->pid, tr->target.handle); + binder_user_error("%d:%d got transaction to invalid handle\n", + proc->pid, thread->pid); return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); @@ -3189,7 +3114,7 @@ static void binder_transaction(struct binder_proc *proc, else return_error = BR_DEAD_REPLY; mutex_unlock(&context->context_mgr_node_lock); - if (target_node && target_proc->pid == proc->pid) { + if (target_node && target_proc == proc) { binder_user_error("%d:%d got transaction to context manager from process owning it\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; @@ -3221,29 +3146,6 @@ static void binder_transaction(struct binder_proc *proc, goto err_invalid_target_handle; } binder_inner_proc_lock(proc); - - w = list_first_entry_or_null(&thread->todo, - struct binder_work, entry); - if (!(tr->flags & TF_ONE_WAY) && w && - w->type == BINDER_WORK_TRANSACTION) { - /* - * Do not allow new outgoing transaction from a - * thread that has a transaction at the head of - * its todo list. 
Only need to check the head - * because binder_select_thread_ilocked picks a - * thread from proc->waiting_threads to enqueue - * the transaction, and nothing is queued to the - * todo list while the thread is on waiting_threads. - */ - binder_user_error("%d:%d new transaction not allowed when there is a transaction on thread todo\n", - proc->pid, thread->pid); - binder_inner_proc_unlock(proc); - return_error = BR_FAILED_REPLY; - return_error_param = -EPROTO; - return_error_line = __LINE__; - goto err_bad_todo_list; - } - if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { struct binder_transaction *tmp; @@ -3271,7 +3173,6 @@ static void binder_transaction(struct binder_proc *proc, atomic_inc(&from->tmp_ref); target_thread = from; spin_unlock(&tmp->lock); - is_nested = true; break; } spin_unlock(&tmp->lock); @@ -3285,18 +3186,17 @@ static void binder_transaction(struct binder_proc *proc, e->to_proc = target_proc->pid; /* TODO: reuse incoming transaction for reply */ - t = kmem_cache_zalloc(binder_transaction_pool, GFP_KERNEL); + t = kzalloc(sizeof(*t), GFP_KERNEL); if (t == NULL) { return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_t_failed; } - INIT_LIST_HEAD(&t->fd_fixups); binder_stats_created(BINDER_STAT_TRANSACTION); spin_lock_init(&t->lock); - tcomplete = kmem_cache_zalloc(binder_work_pool, GFP_KERNEL); + tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; @@ -3335,7 +3235,6 @@ static void binder_transaction(struct binder_proc *proc, t->to_thread = target_thread; t->code = tr->code; t->flags = tr->flags; - t->is_nested = is_nested; if (!(t->flags & TF_ONE_WAY) && binder_supported_policy(current->policy)) { /* Inherit supported policies for synchronous transactions */ @@ -3363,15 +3262,12 @@ static void binder_transaction(struct binder_proc *proc, if (extra_buffers_size < added_size) { /* integer overflow of 
extra_buffers_size */ return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; + return_error_param = EINVAL; return_error_line = __LINE__; goto err_bad_extra_size; } } -#ifdef CONFIG_REKERNEL - rekernel_binder_transaction(reply, t, target_node, tr); -#endif /* CONFIG_REKERNEL */ trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, @@ -3389,20 +3285,15 @@ static void binder_transaction(struct binder_proc *proc, goto err_binder_alloc_buf_failed; } if (secctx) { - int err; size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) + ALIGN(tr->offsets_size, sizeof(void *)) + ALIGN(extra_buffers_size, sizeof(void *)) - ALIGN(secctx_sz, sizeof(u64)); t->security_ctx = (uintptr_t)t->buffer->user_data + buf_offset; - err = binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, buf_offset, - secctx, secctx_sz); - if (err) { - t->security_ctx = 0; - WARN_ON(1); - } + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, buf_offset, + secctx, secctx_sz); security_release_secctx(secctx, secctx_sz); secctx = NULL; } @@ -3412,6 +3303,19 @@ static void binder_transaction(struct binder_proc *proc, t->buffer->clear_on_free = !!(t->flags & TF_CLEAR_BUF); trace_binder_transaction_alloc_buf(t->buffer); + if (binder_alloc_copy_user_to_buffer( + &target_proc->alloc, + t->buffer, 0, + (const void __user *) + (uintptr_t)tr->data.ptr.buffer, + tr->data_size)) { + binder_user_error("%d:%d got transaction with invalid data ptr\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + return_error_param = -EFAULT; + return_error_line = __LINE__; + goto err_copy_data_failed; + } if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, @@ -3456,39 +3360,14 @@ static void binder_transaction(struct binder_proc *proc, size_t object_size; struct binder_object object; binder_size_t object_offset; - binder_size_t copy_size; - if (binder_alloc_copy_from_buffer(&target_proc->alloc, - 
&object_offset, - t->buffer, - buffer_offset, - sizeof(object_offset))) { - return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; - return_error_line = __LINE__; - goto err_bad_offset; - } - - /* - * Copy the source user buffer up to the next object - * that will be processed. - */ - copy_size = object_offset - user_offset; - if (copy_size && (user_offset > object_offset || - binder_alloc_copy_user_to_buffer( - &target_proc->alloc, - t->buffer, user_offset, - user_buffer + user_offset, - copy_size))) { - binder_user_error("%d:%d got transaction with invalid data ptr\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = -EFAULT; - return_error_line = __LINE__; - goto err_copy_data_failed; - } - object_size = binder_get_object(target_proc, user_buffer, - t->buffer, object_offset, &object); + binder_alloc_copy_from_buffer(&target_proc->alloc, + &object_offset, + t->buffer, + buffer_offset, + sizeof(object_offset)); + object_size = binder_get_object(target_proc, t->buffer, + object_offset, &object); if (object_size == 0 || object_offset < off_min) { binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, @@ -3500,11 +3379,6 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_bad_offset; } - /* - * Set offset to the next buffer fragment to be - * copied - */ - user_offset = object_offset + object_size; hdr = &object.hdr; off_min = object_offset + object_size; @@ -3515,17 +3389,15 @@ static void binder_transaction(struct binder_proc *proc, fp = to_flat_binder_object(hdr); ret = binder_translate_binder(fp, t, thread); - - if (ret < 0 || - binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - fp, sizeof(*fp))) { + if (ret < 0) { return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } + 
binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + fp, sizeof(*fp)); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { @@ -3533,42 +3405,37 @@ static void binder_transaction(struct binder_proc *proc, fp = to_flat_binder_object(hdr); ret = binder_translate_handle(fp, t, thread); - if (ret < 0 || - binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - fp, sizeof(*fp))) { + if (ret < 0) { return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + fp, sizeof(*fp)); } break; case BINDER_TYPE_FD: { struct binder_fd_object *fp = to_binder_fd_object(hdr); - binder_size_t fd_offset = object_offset + - (uintptr_t)&fp->fd - (uintptr_t)fp; - int ret = binder_translate_fd(fp->fd, fd_offset, t, - thread, in_reply_to); + int target_fd = binder_translate_fd(fp->fd, t, thread, + in_reply_to); - fp->pad_binder = 0; - if (ret < 0 || - binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - fp, sizeof(*fp))) { + if (target_fd < 0) { return_error = BR_FAILED_REPLY; - return_error_param = ret; + return_error_param = target_fd; return_error_line = __LINE__; goto err_translate_failed; } + fp->pad_binder = 0; + fp->fd = target_fd; + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + fp, sizeof(*fp)); } break; case BINDER_TYPE_FDA: { struct binder_object ptr_object; binder_size_t parent_offset; - struct binder_object user_object; - size_t user_parent_size; struct binder_fd_array_object *fda = to_binder_fd_array_object(hdr); size_t num_valid = (buffer_offset - off_start_offset) / @@ -3600,35 +3467,11 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_bad_parent; } - /* - * We need to read the user version of the parent - * object to get the original user offset - */ - user_parent_size = 
- binder_get_object(proc, user_buffer, t->buffer, - parent_offset, &user_object); - if (user_parent_size != sizeof(user_object.bbo)) { - binder_user_error("%d:%d invalid ptr object size: %zd vs %zd\n", - proc->pid, thread->pid, - user_parent_size, - sizeof(user_object.bbo)); - return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; - return_error_line = __LINE__; - goto err_bad_parent; - } - ret = binder_translate_fd_array(&pf_head, fda, - user_buffer, parent, - &user_object.bbo, t, - thread, in_reply_to); - if (!ret) - ret = binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - fda, sizeof(*fda)); - if (ret) { + ret = binder_translate_fd_array(fda, parent, t, thread, + in_reply_to); + if (ret < 0) { return_error = BR_FAILED_REPLY; - return_error_param = ret > 0 ? -EINVAL : ret; + return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } @@ -3650,14 +3493,19 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_bad_offset; } - ret = binder_defer_copy(&sgc_head, sg_buf_offset, - (const void __user *)(uintptr_t)bp->buffer, - bp->length); - if (ret) { + if (binder_alloc_copy_user_to_buffer( + &target_proc->alloc, + t->buffer, + sg_buf_offset, + (const void __user *) + (uintptr_t)bp->buffer, + bp->length)) { + binder_user_error("%d:%d got transaction with invalid offsets ptr\n", + proc->pid, thread->pid); + return_error_param = -EFAULT; return_error = BR_FAILED_REPLY; - return_error_param = ret; return_error_line = __LINE__; - goto err_translate_failed; + goto err_copy_data_failed; } /* Fixup buffer pointer to target proc address space */ bp->buffer = (uintptr_t) @@ -3666,22 +3514,20 @@ static void binder_transaction(struct binder_proc *proc, num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); - ret = binder_fixup_parent(&pf_head, t, - thread, bp, + ret = binder_fixup_parent(t, thread, bp, off_start_offset, num_valid, last_fixup_obj_off, 
last_fixup_min_off); - if (ret < 0 || - binder_alloc_copy_to_buffer(&target_proc->alloc, - t->buffer, - object_offset, - bp, sizeof(*bp))) { + if (ret < 0) { return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + bp, sizeof(*bp)); last_fixup_obj_off = object_offset; last_fixup_min_off = 0; } break; @@ -3694,57 +3540,22 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_object_type; } } - /* Done processing objects, copy the rest of the buffer */ - if (binder_alloc_copy_user_to_buffer( - &target_proc->alloc, - t->buffer, user_offset, - user_buffer + user_offset, - tr->data_size - user_offset)) { - binder_user_error("%d:%d got transaction with invalid data ptr\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = -EFAULT; - return_error_line = __LINE__; - goto err_copy_data_failed; - } - - ret = binder_do_deferred_txn_copies(&target_proc->alloc, t->buffer, - &sgc_head, &pf_head); - if (ret) { - binder_user_error("%d:%d got transaction with invalid offsets ptr\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = ret; - return_error_line = __LINE__; - goto err_copy_data_failed; - } - if (t->buffer->oneway_spam_suspect) - tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT; - else - tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; + tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; t->work.type = BINDER_WORK_TRANSACTION; if (reply) { binder_enqueue_thread_work(thread, tcomplete); binder_inner_proc_lock(target_proc); if (target_thread->is_dead) { - return_error = BR_DEAD_REPLY; binder_inner_proc_unlock(target_proc); goto err_dead_proc_or_thread; } BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction_ilocked(target_thread, in_reply_to); binder_enqueue_thread_work_ilocked(target_thread, &t->work); - 
target_proc->outstanding_txns++; binder_inner_proc_unlock(target_proc); - if (in_reply_to->is_nested) { - spin_lock(&thread->prio_lock); - thread->prio_state = BINDER_PRIO_PENDING; - thread->prio_next = in_reply_to->saved_priority; - spin_unlock(&thread->prio_lock); - } wake_up_interruptible_sync(&target_thread->wait); - binder_restore_priority(thread, &in_reply_to->saved_priority); + binder_restore_priority(current, in_reply_to->saved_priority); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); @@ -3761,9 +3572,7 @@ static void binder_transaction(struct binder_proc *proc, t->from_parent = thread->transaction_stack; thread->transaction_stack = t; binder_inner_proc_unlock(proc); - return_error = binder_proc_transaction(t, - target_proc, target_thread); - if (return_error) { + if (!binder_proc_transaction(t, target_proc, target_thread)) { binder_inner_proc_lock(proc); binder_pop_transaction_ilocked(thread, t); binder_inner_proc_unlock(proc); @@ -3773,8 +3582,7 @@ static void binder_transaction(struct binder_proc *proc, BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); binder_enqueue_thread_work(thread, tcomplete); - return_error = binder_proc_transaction(t, target_proc, NULL); - if (return_error) + if (!binder_proc_transaction(t, target_proc, NULL)) goto err_dead_proc_or_thread; } if (target_thread) @@ -3791,6 +3599,7 @@ static void binder_transaction(struct binder_proc *proc, return; err_dead_proc_or_thread: + return_error = BR_DEAD_REPLY; return_error_line = __LINE__; binder_dequeue_work(proc, tcomplete); err_translate_failed: @@ -3798,10 +3607,8 @@ static void binder_transaction(struct binder_proc *proc, err_bad_offset: err_bad_parent: err_copy_data_failed: - binder_cleanup_deferred_txn_lists(&sgc_head, &pf_head); - binder_free_txn_fixups(t); trace_binder_transaction_failed_buffer_release(t->buffer); - binder_transaction_buffer_release(target_proc, NULL, t->buffer, + 
binder_transaction_buffer_release(target_proc, t->buffer, buffer_offset, true); if (target_node) binder_dec_node_tmpref(target_node); @@ -3813,15 +3620,12 @@ static void binder_transaction(struct binder_proc *proc, if (secctx) security_release_secctx(secctx, secctx_sz); err_get_secctx_failed: - kmem_cache_free(binder_work_pool, tcomplete); + kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); err_alloc_tcomplete_failed: - if (trace_binder_txn_latency_free_enabled()) - binder_txn_latency_free(t); - kmem_cache_free(binder_transaction_pool, t); + kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); err_alloc_t_failed: -err_bad_todo_list: err_bad_call_stack: err_empty_call_stack: err_dead_binder: @@ -3855,65 +3659,19 @@ static void binder_transaction(struct binder_proc *proc, */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); - WRITE_ONCE(fe->debug_id_done, t_debug_id); - } - - BUG_ON(thread->return_error.cmd != BR_OK); - if (in_reply_to) { - binder_restore_priority(thread, &in_reply_to->saved_priority); - thread->return_error.cmd = BR_TRANSACTION_COMPLETE; - binder_enqueue_thread_work(thread, &thread->return_error.work); - binder_send_failed_reply(in_reply_to, return_error); - } else { - thread->return_error.cmd = return_error; - binder_enqueue_thread_work(thread, &thread->return_error.work); - } -} - -/** - * binder_free_buf() - free the specified buffer - * @proc: binder proc that owns buffer - * @buffer: buffer to be freed - * @is_failure: failed to send transaction - * - * If buffer for an async transaction, enqueue the next async - * transaction from the node. - * - * Cleanup buffer and free it. 
- */ -static void -binder_free_buf(struct binder_proc *proc, - struct binder_thread *thread, - struct binder_buffer *buffer, bool is_failure) -{ - binder_inner_proc_lock(proc); - if (buffer->transaction) { - buffer->transaction->buffer = NULL; - buffer->transaction = NULL; - } - binder_inner_proc_unlock(proc); - if (buffer->async_transaction && buffer->target_node) { - struct binder_node *buf_node; - struct binder_work *w; - - buf_node = buffer->target_node; - binder_node_inner_lock(buf_node); - BUG_ON(!buf_node->has_async_transaction); - BUG_ON(buf_node->proc != proc); - w = binder_dequeue_work_head_ilocked( - &buf_node->async_todo); - if (!w) { - buf_node->has_async_transaction = false; - } else { - binder_enqueue_work_ilocked( - w, &proc->todo); - binder_wakeup_proc_ilocked(proc); - } - binder_node_inner_unlock(buf_node); + WRITE_ONCE(fe->debug_id_done, t_debug_id); + } + + BUG_ON(thread->return_error.cmd != BR_OK); + if (in_reply_to) { + binder_restore_priority(current, in_reply_to->saved_priority); + thread->return_error.cmd = BR_TRANSACTION_COMPLETE; + binder_enqueue_thread_work(thread, &thread->return_error.work); + binder_send_failed_reply(in_reply_to, return_error); + } else { + thread->return_error.cmd = return_error; + binder_enqueue_thread_work(thread, &thread->return_error.work); } - trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, thread, buffer, 0, is_failure); - binder_alloc_free_buf(&proc->alloc, buffer); } static int binder_thread_write(struct binder_proc *proc, @@ -3957,7 +3715,6 @@ static int binder_thread_write(struct binder_proc *proc, ret = -1; if (increment && !target) { struct binder_node *ctx_mgr_node; - mutex_lock(&context->context_mgr_node_lock); ctx_mgr_node = context->binder_context_mgr_node; if (ctx_mgr_node) { @@ -4114,7 +3871,35 @@ static int binder_thread_write(struct binder_proc *proc, proc->pid, thread->pid, (u64)data_ptr, buffer->debug_id, buffer->transaction ? 
"active" : "finished"); - binder_free_buf(proc, thread, buffer, false); + + binder_inner_proc_lock(proc); + if (buffer->transaction) { + buffer->transaction->buffer = NULL; + buffer->transaction = NULL; + } + binder_inner_proc_unlock(proc); + if (buffer->async_transaction && buffer->target_node) { + struct binder_node *buf_node; + struct binder_work *w; + + buf_node = buffer->target_node; + binder_node_inner_lock(buf_node); + BUG_ON(!buf_node->has_async_transaction); + BUG_ON(buf_node->proc != proc); + w = binder_dequeue_work_head_ilocked( + &buf_node->async_todo); + if (!w) { + buf_node->has_async_transaction = false; + } else { + binder_enqueue_work_ilocked( + w, &proc->todo); + binder_wakeup_proc_ilocked(proc); + } + binder_node_inner_unlock(buf_node); + } + trace_binder_transaction_buffer_release(buffer); + binder_transaction_buffer_release(proc, buffer, 0, false); + binder_alloc_free_buf(&proc->alloc, buffer); break; } @@ -4197,7 +3982,7 @@ static int binder_thread_write(struct binder_proc *proc, * Allocate memory for death notification * before taking lock */ - death = kmem_cache_zalloc(binder_ref_death_pool, GFP_KERNEL); + death = kzalloc(sizeof(*death), GFP_KERNEL); if (death == NULL) { WARN_ON(thread->return_error.cmd != BR_OK); @@ -4222,8 +4007,7 @@ static int binder_thread_write(struct binder_proc *proc, "BC_CLEAR_DEATH_NOTIFICATION", target); binder_proc_unlock(proc); - if (death) - kmem_cache_free(binder_ref_death_pool, death); + kfree(death); break; } @@ -4244,7 +4028,7 @@ static int binder_thread_write(struct binder_proc *proc, proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); - kmem_cache_free(binder_ref_death_pool, death); + kfree(death); break; } binder_stats_created(BINDER_STAT_DEATH); @@ -4427,7 +4211,7 @@ static int binder_wait_for_work(struct binder_thread *thread, binder_inner_proc_lock(proc); list_del_init(&thread->waiting_thread_node); if (signal_pending(current)) { - ret = -EINTR; + ret = -ERESTARTSYS; 
break; } } @@ -4438,71 +4222,6 @@ static int binder_wait_for_work(struct binder_thread *thread, return ret; } -/** - * binder_apply_fd_fixups() - finish fd translation - * @proc: binder_proc associated @t->buffer - * @t: binder transaction with list of fd fixups - * - * Now that we are in the context of the transaction target - * process, we can allocate and install fds. Process the - * list of fds to translate and fixup the buffer with the - * new fds. - * - * If we fail to allocate an fd, then free the resources by - * fput'ing files that have not been processed and ksys_close'ing - * any fds that have already been allocated. - */ -static int binder_apply_fd_fixups(struct binder_proc *proc, - struct binder_transaction *t) -{ - struct binder_txn_fd_fixup *fixup, *tmp; - int ret = 0; - - list_for_each_entry(fixup, &t->fd_fixups, fixup_entry) { - int fd = get_unused_fd_flags(O_CLOEXEC); - - if (fd < 0) { - binder_debug(BINDER_DEBUG_TRANSACTION, - "failed fd fixup txn %d fd %d\n", - t->debug_id, fd); - ret = -ENOMEM; - break; - } - binder_debug(BINDER_DEBUG_TRANSACTION, - "fd fixup txn %d fd %d\n", - t->debug_id, fd); - trace_binder_transaction_fd_recv(t, fd, fixup->offset); - fd_install(fd, fixup->file); - fixup->file = NULL; - if (binder_alloc_copy_to_buffer(&proc->alloc, t->buffer, - fixup->offset, &fd, - sizeof(u32))) { - ret = -EINVAL; - break; - } - } - list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { - if (fixup->file) { - fput(fixup->file); - } else if (ret) { - u32 fd; - int err; - - err = binder_alloc_copy_from_buffer(&proc->alloc, &fd, - t->buffer, - fixup->offset, - sizeof(fd)); - WARN_ON(err); - if (!err) - binder_deferred_fd_close(fd); - } - list_del(&fixup->fixup_entry); - kfree(fixup); - } - - return ret; -} - static int binder_thread_read(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, @@ -4539,7 +4258,7 @@ static int binder_thread_read(struct binder_proc *proc, 
wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } - binder_restore_priority(thread, &proc->default_priority); + binder_restore_priority(current, proc->default_priority); } if (non_block) { @@ -4565,8 +4284,6 @@ static int binder_thread_read(struct binder_proc *proc, size_t trsize = sizeof(*trd); binder_inner_proc_lock(proc); - if (list) - goto skip; if (!binder_worklist_empty_ilocked(&thread->todo)) list = &thread->todo; else if (!binder_worklist_empty_ilocked(&proc->todo) && @@ -4580,7 +4297,7 @@ static int binder_thread_read(struct binder_proc *proc, goto retry; break; } -skip: + if (end - ptr < sizeof(tr) + 4) { binder_inner_proc_unlock(proc); break; @@ -4606,18 +4323,11 @@ static int binder_thread_read(struct binder_proc *proc, e->cmd = BR_OK; ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, cmd); + binder_stat_br(proc, thread, e->cmd); } break; - case BINDER_WORK_TRANSACTION_COMPLETE: - case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: { - if (proc->oneway_spam_detection_enabled && - w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT) - cmd = BR_ONEWAY_SPAM_SUSPECT; - else - cmd = BR_TRANSACTION_COMPLETE; + case BINDER_WORK_TRANSACTION_COMPLETE: { binder_inner_proc_unlock(proc); - kmem_cache_free(binder_work_pool, w); - binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); + cmd = BR_TRANSACTION_COMPLETE; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); @@ -4626,6 +4336,8 @@ static int binder_thread_read(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE, "%d:%d BR_TRANSACTION_COMPLETE\n", proc->pid, thread->pid); + kfree(w); + binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); } break; case BINDER_WORK_NODE: { struct binder_node *node = container_of(w, struct binder_node, work); @@ -4737,7 +4449,7 @@ static int binder_thread_read(struct binder_proc *proc, (u64)cookie); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { 
binder_inner_proc_unlock(proc); - kmem_cache_free(binder_ref_death_pool, death); + kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } else { binder_enqueue_work_ilocked( @@ -4755,11 +4467,6 @@ static int binder_thread_read(struct binder_proc *proc, if (cmd == BR_DEAD_BINDER) goto done; /* DEAD_BINDER notifications can cause transactions */ } break; - default: - binder_inner_proc_unlock(proc); - pr_err("%d:%d: bad work type %d\n", - proc->pid, thread->pid, w->type); - break; } if (!t) @@ -4768,10 +4475,14 @@ static int binder_thread_read(struct binder_proc *proc, BUG_ON(t->buffer == NULL); if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; + struct binder_priority node_prio; trd->target.ptr = target_node->ptr; trd->cookie = target_node->cookie; - binder_transaction_priority(thread, t, target_node); + node_prio.sched_policy = target_node->sched_policy; + node_prio.prio = target_node->min_priority; + binder_transaction_priority(current, t, node_prio, + target_node->inherit_rt); cmd = BR_TRANSACTION; } else { trd->target.ptr = 0; @@ -4793,34 +4504,6 @@ static int binder_thread_read(struct binder_proc *proc, trd->sender_pid = 0; } - ret = binder_apply_fd_fixups(proc, t); - if (ret) { - struct binder_buffer *buffer = t->buffer; - bool oneway = !!(t->flags & TF_ONE_WAY); - int tid = t->debug_id; - - if (t_from) - binder_thread_dec_tmpref(t_from); - buffer->transaction = NULL; - binder_cleanup_transaction(t, "fd fixups failed", - BR_FAILED_REPLY); - binder_free_buf(proc, thread, buffer, true); - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n", - proc->pid, thread->pid, - oneway ? "async " : - (cmd == BR_REPLY ? 
"reply " : ""), - tid, BR_FAILED_REPLY, ret, __LINE__); - if (cmd == BR_REPLY) { - cmd = BR_FAILED_REPLY; - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, cmd); - break; - } - continue; - } trd->data_size = t->buffer->data_size; trd->offsets_size = t->buffer->offsets_size; trd->data.ptr.buffer = (uintptr_t)t->buffer->user_data; @@ -4940,7 +4623,7 @@ static void binder_release_work(struct binder_proc *proc, case BINDER_WORK_TRANSACTION_COMPLETE: { binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered TRANSACTION_COMPLETE\n"); - kmem_cache_free(binder_work_pool, w); + kfree(w); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); } break; case BINDER_WORK_DEAD_BINDER_AND_CLEAR: @@ -4951,7 +4634,7 @@ static void binder_release_work(struct binder_proc *proc, binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered death notification, %016llx\n", (u64)death->cookie); - kmem_cache_free(binder_ref_death_pool, death); + kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } break; case BINDER_WORK_NODE: @@ -5001,8 +4684,6 @@ static struct binder_thread *binder_get_thread_ilocked( thread->return_error.cmd = BR_OK; thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; thread->reply_error.cmd = BR_OK; - spin_lock_init(&thread->prio_lock); - thread->prio_state = BINDER_PRIO_SET; INIT_LIST_HEAD(&new_thread->waiting_thread_node); return thread; } @@ -5016,37 +4697,27 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) thread = binder_get_thread_ilocked(proc, NULL); binder_inner_proc_unlock(proc); if (!thread) { - new_thread = kmem_cache_zalloc(binder_thread_pool, GFP_KERNEL); + new_thread = kzalloc(sizeof(*thread), GFP_KERNEL); if (new_thread == NULL) return NULL; binder_inner_proc_lock(proc); thread = binder_get_thread_ilocked(proc, new_thread); binder_inner_proc_unlock(proc); if (thread != new_thread) - kmem_cache_free(binder_thread_pool, new_thread); + 
kfree(new_thread); } return thread; } static void binder_free_proc(struct binder_proc *proc) { - struct binder_device *device; - BUG_ON(!list_empty(&proc->todo)); BUG_ON(!list_empty(&proc->delivered_death)); - if (proc->outstanding_txns) - pr_warn("%s: Unexpected outstanding_txns %d\n", - __func__, proc->outstanding_txns); - device = container_of(proc->context, struct binder_device, context); - if (refcount_dec_and_test(&device->ref)) { - kfree(proc->context->name); - kfree(device); - } binder_alloc_deferred_release(&proc->alloc); put_task_struct(proc->tsk); put_cred(proc->cred); binder_stats_deleted(BINDER_STAT_PROC); - kmem_cache_free(binder_proc_pool, proc); + kfree(proc); } static void binder_free_thread(struct binder_thread *thread) @@ -5055,7 +4726,7 @@ static void binder_free_thread(struct binder_thread *thread) binder_stats_deleted(BINDER_STAT_THREAD); binder_proc_dec_tmpref(thread->proc); put_task_struct(thread->task); - kmem_cache_free(binder_thread_pool, thread); + kfree(thread); } static int binder_thread_release(struct binder_proc *proc, @@ -5073,7 +4744,7 @@ static int binder_thread_release(struct binder_proc *proc, * The corresponding dec is when we actually * free the thread in binder_free_thread() */ - proc->tmp_ref++; + atomic_inc(&proc->tmp_ref); /* * take a ref on this thread to ensure it * survives while we are releasing it @@ -5085,8 +4756,6 @@ static int binder_thread_release(struct binder_proc *proc, spin_lock(&t->lock); if (t->to_thread == thread) send_reply = t; - } else { - __acquire(&t->lock); } thread->is_dead = true; @@ -5100,7 +4769,6 @@ static int binder_thread_release(struct binder_proc *proc, (t->to_thread == thread) ? 
"in" : "out"); if (t->to_thread == thread) { - thread->proc->outstanding_txns--; t->to_proc = NULL; t->to_thread = NULL; if (t->buffer) { @@ -5116,11 +4784,7 @@ static int binder_thread_release(struct binder_proc *proc, spin_unlock(&last_t->lock); if (t) spin_lock(&t->lock); - else - __acquire(&t->lock); } - /* annotation for sparse, lock not acquired in last iteration above */ - __release(&t->lock); /* * If this thread used poll, make sure we remove the waitqueue from any @@ -5148,7 +4812,7 @@ static int binder_thread_release(struct binder_proc *proc, return active_transactions; } -static __poll_t binder_poll(struct file *filp, +static unsigned int binder_poll(struct file *filp, struct poll_table_struct *wait) { struct binder_proc *proc = filp->private_data; @@ -5168,7 +4832,7 @@ static __poll_t binder_poll(struct file *filp, poll_wait(filp, &thread->wait, wait); if (binder_has_work(thread, wait_for_proc_work)) - return EPOLLIN; + return POLLIN; return 0; } @@ -5324,8 +4988,7 @@ static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc, } static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, - struct binder_node_debug_info *info) -{ + struct binder_node_debug_info *info) { struct rb_node *n; binder_uintptr_t ptr = info->ptr; @@ -5348,100 +5011,6 @@ static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, return 0; } -static bool binder_txns_pending_ilocked(struct binder_proc *proc) -{ - struct rb_node *n; - struct binder_thread *thread; - - if (proc->outstanding_txns > 0) - return true; - - for (n = rb_first(&proc->threads); n; n = rb_next(n)) { - thread = rb_entry(n, struct binder_thread, rb_node); - if (thread->transaction_stack) - return true; - } - return false; -} - -static int binder_ioctl_freeze(struct binder_freeze_info *info, - struct binder_proc *target_proc) -{ - int ret = 0; - - if (!info->enable) { - binder_inner_proc_lock(target_proc); - target_proc->sync_recv = false; - target_proc->async_recv = false; - 
target_proc->is_frozen = false; - binder_inner_proc_unlock(target_proc); - return 0; - } - - /* - * Freezing the target. Prevent new transactions by - * setting frozen state. If timeout specified, wait - * for transactions to drain. - */ - binder_inner_proc_lock(target_proc); - target_proc->sync_recv = false; - target_proc->async_recv = false; - target_proc->is_frozen = true; - binder_inner_proc_unlock(target_proc); - - if (info->timeout_ms > 0) - ret = wait_event_interruptible_timeout( - target_proc->freeze_wait, - (!target_proc->outstanding_txns), - msecs_to_jiffies(info->timeout_ms)); - - /* Check pending transactions that wait for reply */ - if (ret >= 0) { - binder_inner_proc_lock(target_proc); - if (binder_txns_pending_ilocked(target_proc)) - ret = -EAGAIN; - binder_inner_proc_unlock(target_proc); - } - - if (ret < 0) { - binder_inner_proc_lock(target_proc); - target_proc->is_frozen = false; - binder_inner_proc_unlock(target_proc); - } - - return ret; -} - -static int binder_ioctl_get_freezer_info( - struct binder_frozen_status_info *info) -{ - struct binder_proc *target_proc; - bool found = false; - __u32 txns_pending; - - info->sync_recv = 0; - info->async_recv = 0; - - mutex_lock(&binder_procs_lock); - hlist_for_each_entry(target_proc, &binder_procs, proc_node) { - if (target_proc->pid == info->pid) { - found = true; - binder_inner_proc_lock(target_proc); - txns_pending = binder_txns_pending_ilocked(target_proc); - info->sync_recv |= target_proc->sync_recv | - (txns_pending << 1); - info->async_recv |= target_proc->async_recv; - binder_inner_proc_unlock(target_proc); - } - } - mutex_unlock(&binder_procs_lock); - - if (!found) - return -EINVAL; - - return 0; -} - static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@ -5560,96 +5129,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } - case BINDER_FREEZE: { - struct binder_freeze_info info; - struct binder_proc 
**target_procs = NULL, *target_proc; - int target_procs_count = 0, i = 0; - - ret = 0; - - if (copy_from_user(&info, ubuf, sizeof(info))) { - ret = -EFAULT; - goto err; - } - - mutex_lock(&binder_procs_lock); - hlist_for_each_entry(target_proc, &binder_procs, proc_node) { - if (target_proc->pid == info.pid) - target_procs_count++; - } - - if (target_procs_count == 0) { - mutex_unlock(&binder_procs_lock); - ret = -EINVAL; - goto err; - } - - target_procs = kcalloc(target_procs_count, - sizeof(struct binder_proc *), - GFP_KERNEL); - - if (!target_procs) { - mutex_unlock(&binder_procs_lock); - ret = -ENOMEM; - goto err; - } - - hlist_for_each_entry(target_proc, &binder_procs, proc_node) { - if (target_proc->pid != info.pid) - continue; - - binder_inner_proc_lock(target_proc); - target_proc->tmp_ref++; - binder_inner_proc_unlock(target_proc); - - target_procs[i++] = target_proc; - } - mutex_unlock(&binder_procs_lock); - - for (i = 0; i < target_procs_count; i++) { - if (ret >= 0) - ret = binder_ioctl_freeze(&info, - target_procs[i]); - - binder_proc_dec_tmpref(target_procs[i]); - } - - kfree(target_procs); - - if (ret < 0) - goto err; - break; - } - case BINDER_GET_FROZEN_INFO: { - struct binder_frozen_status_info info; - - if (copy_from_user(&info, ubuf, sizeof(info))) { - ret = -EFAULT; - goto err; - } - - ret = binder_ioctl_get_freezer_info(&info); - if (ret < 0) - goto err; - - if (copy_to_user(ubuf, &info, sizeof(info))) { - ret = -EFAULT; - goto err; - } - break; - } - case BINDER_ENABLE_ONEWAY_SPAM_DETECTION: { - uint32_t enable; - - if (copy_from_user(&enable, ubuf, sizeof(enable))) { - ret = -EFAULT; - goto err; - } - binder_inner_proc_lock(proc); - proc->oneway_spam_detection_enabled = (bool)enable; - binder_inner_proc_unlock(proc); - break; - } default: ret = -EINVAL; goto err; @@ -5659,7 +5138,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (thread) thread->looper_need_return = false; 
wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); - if (ret && ret != -EINTR) + if (ret && ret != -ERESTARTSYS) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); err_unlocked: trace_binder_ioctl_done(ret); @@ -5687,6 +5166,7 @@ static void binder_vma_close(struct vm_area_struct *vma) (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); binder_alloc_vma_close(&proc->alloc); + binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -5702,11 +5182,16 @@ static const struct vm_operations_struct binder_vm_ops = { static int binder_mmap(struct file *filp, struct vm_area_struct *vma) { + int ret; struct binder_proc *proc = filp->private_data; + const char *failure_string; if (proc->tsk != current->group_leader) return -EINVAL; + if ((vma->vm_end - vma->vm_start) > SZ_4M) + vma->vm_end = vma->vm_start + SZ_4M; + binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", __func__, proc->pid, vma->vm_start, vma->vm_end, @@ -5714,9 +5199,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) (unsigned long)pgprot_val(vma->vm_page_prot)); if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { - pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, - proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM); - return -EPERM; + ret = -EPERM; + failure_string = "bad vm_flags"; + goto err_bad_arg; } vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP; vma->vm_flags &= ~VM_MAYWRITE; @@ -5724,30 +5209,39 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; - return binder_alloc_mmap_handler(&proc->alloc, vma); + ret = binder_alloc_mmap_handler(&proc->alloc, vma); + if (ret) + return ret; + mutex_lock(&proc->files_lock); + proc->files = get_files_struct(current); + 
mutex_unlock(&proc->files_lock); + return 0; + +err_bad_arg: + pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, + proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); + return ret; } static int binder_open(struct inode *nodp, struct file *filp) { - struct binder_proc *proc, *itr; + struct binder_proc *proc; struct binder_device *binder_dev; - struct binderfs_info *info; - struct dentry *binder_binderfs_dir_entry_proc = NULL; - bool existing_pid = false; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__, current->group_leader->pid, current->pid); - proc = kmem_cache_zalloc(binder_proc_pool, GFP_KERNEL); + proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) return -ENOMEM; spin_lock_init(&proc->inner_lock); spin_lock_init(&proc->outer_lock); + atomic_set(&proc->tmp_ref, 0); get_task_struct(current->group_leader); proc->tsk = current->group_leader; + mutex_init(&proc->files_lock); proc->cred = get_cred(filp->f_cred); INIT_LIST_HEAD(&proc->todo); - init_waitqueue_head(&proc->freeze_wait); if (binder_supported_policy(current->policy)) { proc->default_priority.sched_policy = current->policy; proc->default_priority.prio = current->normal_prio; @@ -5756,16 +5250,8 @@ static int binder_open(struct inode *nodp, struct file *filp) proc->default_priority.prio = NICE_TO_PRIO(0); } - /* binderfs stashes devices in i_private */ - if (is_binderfs_device(nodp)) { - binder_dev = nodp->i_private; - info = nodp->i_sb->s_fs_info; - binder_binderfs_dir_entry_proc = info->proc_log_dir; - } else { - binder_dev = container_of(filp->private_data, - struct binder_device, miscdev); - } - refcount_inc(&binder_dev->ref); + binder_dev = container_of(filp->private_data, struct binder_device, + miscdev); proc->context = &binder_dev->context; binder_alloc_init(&proc->alloc); @@ -5776,52 +5262,24 @@ static int binder_open(struct inode *nodp, struct file *filp) filp->private_data = proc; mutex_lock(&binder_procs_lock); - hlist_for_each_entry(itr, &binder_procs, 
proc_node) { - if (itr->pid == proc->pid) { - existing_pid = true; - break; - } - } hlist_add_head(&proc->proc_node, &binder_procs); mutex_unlock(&binder_procs_lock); - if (binder_debugfs_dir_entry_proc && !existing_pid) { + + if (binder_debugfs_dir_entry_proc) { char strbuf[11]; snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); /* - * proc debug entries are shared between contexts. - * Only create for the first PID to avoid debugfs log spamming - * The printing code will anyway print all contexts for a given - * PID so this is not a problem. + * proc debug entries are shared between contexts, so + * this will fail if the process tries to open the driver + * again with a different context. The priting code will + * anyway print all contexts that a given PID has, so this + * is not a problem. */ proc->debugfs_entry = debugfs_create_file(strbuf, 0444, binder_debugfs_dir_entry_proc, (void *)(unsigned long)proc->pid, - &proc_fops); - } - - if (binder_binderfs_dir_entry_proc && !existing_pid) { - char strbuf[11]; - struct dentry *binderfs_entry; - - snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); - /* - * Similar to debugfs, the process specific log file is shared - * between contexts. Only create for the first PID. - * This is ok since same as debugfs, the log file will contain - * information on all contexts of a given PID. 
- */ - binderfs_entry = binderfs_create_file(binder_binderfs_dir_entry_proc, - strbuf, &proc_fops, (void *)(unsigned long)proc->pid); - if (!IS_ERR(binderfs_entry)) { - proc->binderfs_entry = binderfs_entry; - } else { - int error; - - error = PTR_ERR(binderfs_entry); - pr_warn("Unable to create file %s in binderfs (error %d)\n", - strbuf, error); - } + &binder_proc_fops); } return 0; @@ -5863,12 +5321,6 @@ static int binder_release(struct inode *nodp, struct file *filp) struct binder_proc *proc = filp->private_data; debugfs_remove(proc->debugfs_entry); - - if (proc->binderfs_entry) { - binderfs_remove_file(proc->binderfs_entry); - proc->binderfs_entry = NULL; - } - binder_defer_work(proc, BINDER_DEFERRED_RELEASE); return 0; @@ -5945,6 +5397,8 @@ static void binder_deferred_release(struct binder_proc *proc) struct rb_node *n; int threads, nodes, incoming_refs, outgoing_refs, active_transactions; + BUG_ON(proc->files); + mutex_lock(&binder_procs_lock); hlist_del(&proc->proc_node); mutex_unlock(&binder_procs_lock); @@ -5963,12 +5417,9 @@ static void binder_deferred_release(struct binder_proc *proc) * Make sure proc stays alive after we * remove all the threads */ - proc->tmp_ref++; + atomic_inc(&proc->tmp_ref); proc->is_dead = true; - proc->is_frozen = false; - proc->sync_recv = false; - proc->async_recv = false; threads = 0; active_transactions = 0; while ((n = rb_first(&proc->threads))) { @@ -6029,6 +5480,7 @@ static void binder_deferred_release(struct binder_proc *proc) static void binder_deferred_func(struct work_struct *work) { struct binder_proc *proc; + struct files_struct *files; int defer; @@ -6046,11 +5498,23 @@ static void binder_deferred_func(struct work_struct *work) } mutex_unlock(&binder_deferred_lock); + files = NULL; + if (defer & BINDER_DEFERRED_PUT_FILES) { + mutex_lock(&proc->files_lock); + files = proc->files; + if (files) + proc->files = NULL; + mutex_unlock(&proc->files_lock); + } + if (defer & BINDER_DEFERRED_FLUSH) 
binder_deferred_flush(proc); if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ + + if (files) + put_files_struct(files); } while (proc); } static DECLARE_WORK(binder_deferred_work, binder_deferred_func); @@ -6321,9 +5785,7 @@ static const char * const binder_return_strings[] = { "BR_FINISHED", "BR_DEAD_BINDER", "BR_CLEAR_DEATH_NOTIFICATION_DONE", - "BR_FAILED_REPLY", - "BR_FROZEN_REPLY", - "BR_ONEWAY_SPAM_SUSPECT", + "BR_FAILED_REPLY" }; static const char * const binder_command_strings[] = { @@ -6464,7 +5926,8 @@ static void print_binder_proc_stats(struct seq_file *m, print_binder_stats(m, " ", &proc->stats); } -static int state_show(struct seq_file *m, void *unused) + +static int binder_state_show(struct seq_file *m, void *unused) { struct binder_proc *proc; struct binder_node *node; @@ -6503,7 +5966,7 @@ static int state_show(struct seq_file *m, void *unused) return 0; } -static int stats_show(struct seq_file *m, void *unused) +static int binder_stats_show(struct seq_file *m, void *unused) { struct binder_proc *proc; @@ -6519,7 +5982,7 @@ static int stats_show(struct seq_file *m, void *unused) return 0; } -static int transactions_show(struct seq_file *m, void *unused) +static int binder_transactions_show(struct seq_file *m, void *unused) { struct binder_proc *proc; @@ -6532,7 +5995,7 @@ static int transactions_show(struct seq_file *m, void *unused) return 0; } -static int proc_show(struct seq_file *m, void *unused) +static int binder_proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; int pid = (unsigned long)m->private; @@ -6575,7 +6038,7 @@ static void print_binder_transaction_log_entry(struct seq_file *m, "\n" : " (incomplete)\n"); } -static int transaction_log_show(struct seq_file *m, void *unused) +static int binder_transaction_log_show(struct seq_file *m, void *unused) { struct binder_transaction_log *log = m->private; unsigned int log_cur = atomic_read(&log->cur); @@ -6596,7 +6059,7 @@ static int 
transaction_log_show(struct seq_file *m, void *unused) return 0; } -const struct file_operations binder_fops = { +static const struct file_operations binder_fops = { .owner = THIS_MODULE, .poll = binder_poll, .unlocked_ioctl = binder_ioctl, @@ -6607,44 +6070,10 @@ const struct file_operations binder_fops = { .release = binder_release, }; -DEFINE_SHOW_ATTRIBUTE(state); -DEFINE_SHOW_ATTRIBUTE(stats); -DEFINE_SHOW_ATTRIBUTE(transactions); -DEFINE_SHOW_ATTRIBUTE(transaction_log); - -const struct binder_debugfs_entry binder_debugfs_entries[] = { - { - .name = "state", - .mode = 0444, - .fops = &state_fops, - .data = NULL, - }, - { - .name = "stats", - .mode = 0444, - .fops = &stats_fops, - .data = NULL, - }, - { - .name = "transactions", - .mode = 0444, - .fops = &transactions_fops, - .data = NULL, - }, - { - .name = "transaction_log", - .mode = 0444, - .fops = &transaction_log_fops, - .data = &binder_transaction_log, - }, - { - .name = "failed_transaction_log", - .mode = 0444, - .fops = &transaction_log_fops, - .data = &binder_transaction_log_failed, - }, - {} /* terminator */ -}; +BINDER_DEBUG_ENTRY(state); +BINDER_DEBUG_ENTRY(stats); +BINDER_DEBUG_ENTRY(transactions); +BINDER_DEBUG_ENTRY(transaction_log); static int __init init_binder_device(const char *name) { @@ -6659,7 +6088,6 @@ static int __init init_binder_device(const char *name) binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; binder_device->miscdev.name = name; - refcount_set(&binder_device->ref, 1); binder_device->context.binder_context_mgr_uid = INVALID_UID; binder_device->context.name = name; mutex_init(&binder_device->context.context_mgr_node_lock); @@ -6675,130 +6103,70 @@ static int __init init_binder_device(const char *name) return ret; } -static int __init binder_create_pools(void) -{ - int ret; - - ret = binder_buffer_pool_create(); - if (ret) - return ret; - - binder_node_pool = KMEM_CACHE(binder_node, SLAB_HWCACHE_ALIGN); - if (!binder_node_pool) - goto err_node_pool; - - binder_proc_pool = 
KMEM_CACHE(binder_proc, SLAB_HWCACHE_ALIGN); - if (!binder_proc_pool) - goto err_proc_pool; - - binder_ref_death_pool = KMEM_CACHE(binder_ref_death, SLAB_HWCACHE_ALIGN); - if (!binder_ref_death_pool) - goto err_ref_death_pool; - - binder_ref_pool = KMEM_CACHE(binder_ref, SLAB_HWCACHE_ALIGN); - if (!binder_ref_pool) - goto err_ref_pool; - - binder_thread_pool = KMEM_CACHE(binder_thread, SLAB_HWCACHE_ALIGN); - if (!binder_thread_pool) - goto err_thread_pool; - - binder_transaction_pool = KMEM_CACHE(binder_transaction, SLAB_HWCACHE_ALIGN); - if (!binder_transaction_pool) - goto err_transaction_pool; - - binder_work_pool = KMEM_CACHE(binder_work, SLAB_HWCACHE_ALIGN); - if (!binder_work_pool) - goto err_work_pool; - - return 0; - -err_work_pool: - kmem_cache_destroy(binder_transaction_pool); -err_transaction_pool: - kmem_cache_destroy(binder_thread_pool); -err_thread_pool: - kmem_cache_destroy(binder_ref_pool); -err_ref_pool: - kmem_cache_destroy(binder_ref_death_pool); -err_ref_death_pool: - kmem_cache_destroy(binder_proc_pool); -err_proc_pool: - kmem_cache_destroy(binder_node_pool); -err_node_pool: - binder_buffer_pool_destroy(); - return -ENOMEM; -} - -static void __init binder_destroy_pools(void) -{ - binder_buffer_pool_destroy(); - kmem_cache_destroy(binder_node_pool); - kmem_cache_destroy(binder_proc_pool); - kmem_cache_destroy(binder_ref_death_pool); - kmem_cache_destroy(binder_ref_pool); - kmem_cache_destroy(binder_thread_pool); - kmem_cache_destroy(binder_transaction_pool); - kmem_cache_destroy(binder_work_pool); -} - static int __init binder_init(void) { int ret; - char *device_name, *device_tmp; + char *device_name, *device_names, *device_tmp; struct binder_device *device; struct hlist_node *tmp; - char *device_names = NULL; - - ret = binder_create_pools(); - if (ret) - return ret; ret = binder_alloc_shrinker_init(); if (ret) - goto err_alloc_shrinker_failed; + return ret; atomic_set(&binder_transaction_log.cur, ~0U); 
atomic_set(&binder_transaction_log_failed.cur, ~0U); binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); - if (binder_debugfs_dir_entry_root) { - const struct binder_debugfs_entry *db_entry; - - binder_for_each_debugfs_entry(db_entry) - debugfs_create_file(db_entry->name, - db_entry->mode, - binder_debugfs_dir_entry_root, - db_entry->data, - db_entry->fops); - + if (binder_debugfs_dir_entry_root) binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", binder_debugfs_dir_entry_root); - } - if (!IS_ENABLED(CONFIG_ANDROID_BINDERFS) && - strcmp(binder_devices_param, "") != 0) { - /* - * Copy the module_parameter string, because we don't want to - * tokenize it in-place. - */ - device_names = kstrdup(binder_devices_param, GFP_KERNEL); - if (!device_names) { - ret = -ENOMEM; - goto err_alloc_device_names_failed; - } + if (binder_debugfs_dir_entry_root) { + debugfs_create_file("state", + 0444, + binder_debugfs_dir_entry_root, + NULL, + &binder_state_fops); + debugfs_create_file("stats", + 0444, + binder_debugfs_dir_entry_root, + NULL, + &binder_stats_fops); + debugfs_create_file("transactions", + 0444, + binder_debugfs_dir_entry_root, + NULL, + &binder_transactions_fops); + debugfs_create_file("transaction_log", + 0444, + binder_debugfs_dir_entry_root, + &binder_transaction_log, + &binder_transaction_log_fops); + debugfs_create_file("failed_transaction_log", + 0444, + binder_debugfs_dir_entry_root, + &binder_transaction_log_failed, + &binder_transaction_log_fops); + } - device_tmp = device_names; - while ((device_name = strsep(&device_tmp, ","))) { - ret = init_binder_device(device_name); - if (ret) - goto err_init_binder_device_failed; - } + /* + * Copy the module_parameter string, because we don't want to + * tokenize it in-place. 
+ */ + device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); + if (!device_names) { + ret = -ENOMEM; + goto err_alloc_device_names_failed; } + strcpy(device_names, binder_devices_param); - ret = init_binderfs(); - if (ret) - goto err_init_binder_device_failed; + device_tmp = device_names; + while ((device_name = strsep(&device_tmp, ","))) { + ret = init_binder_device(device_name); + if (ret) + goto err_init_binder_device_failed; + } return ret; @@ -6814,9 +6182,6 @@ static int __init binder_init(void) err_alloc_device_names_failed: debugfs_remove_recursive(binder_debugfs_dir_entry_root); -err_alloc_shrinker_failed: - binder_destroy_pools(); - return ret; } @@ -6824,7 +6189,5 @@ device_initcall(binder_init); #define CREATE_TRACE_POINTS #include "binder_trace.h" -EXPORT_TRACEPOINT_SYMBOL_GPL(binder_transaction_received); -EXPORT_TRACEPOINT_SYMBOL_GPL(binder_txn_latency_free); MODULE_LICENSE("GPL v2"); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 9eb15d712567..5addcd56afb4 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1,13 +1,23 @@ -// SPDX-License-Identifier: GPL-2.0-only /* binder_alloc.c * * Android IPC Subsystem * * Copyright (C) 2007-2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -18,11 +28,8 @@ #include #include #include -#include -#include #include #include -#include #include "binder_alloc.h" #include "binder_trace.h" @@ -36,7 +43,7 @@ enum { BINDER_DEBUG_BUFFER_ALLOC = 1U << 2, BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3, }; -static uint32_t binder_alloc_debug_mask = BINDER_DEBUG_USER_ERROR; +static uint32_t binder_alloc_debug_mask; module_param_named(debug_mask, binder_alloc_debug_mask, uint, 0644); @@ -44,25 +51,9 @@ module_param_named(debug_mask, binder_alloc_debug_mask, #define binder_alloc_debug(mask, x...) \ do { \ if (binder_alloc_debug_mask & mask) \ - pr_info_ratelimited(x); \ + pr_info(x); \ } while (0) -static struct kmem_cache *binder_buffer_pool; - -int binder_buffer_pool_create(void) -{ - binder_buffer_pool = KMEM_CACHE(binder_buffer, SLAB_HWCACHE_ALIGN); - if (!binder_buffer_pool) - return -ENOMEM; - - return 0; -} - -void binder_buffer_pool_destroy(void) -{ - kmem_cache_destroy(binder_buffer_pool); -} - static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer) { return list_entry(buffer->entry.next, struct binder_buffer, entry); @@ -173,7 +164,7 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked( } /** - * binder_alloc_prepare_to_free() - get buffer given user ptr + * binder_alloc_buffer_lookup() - get buffer given user ptr * @alloc: binder_alloc for this proc * @user_ptr: User pointer to buffer data * @@ -228,14 +219,18 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, mm = alloc->vma_vm_mm; if (mm) { - down_read; + down_read(&mm->mmap_sem); + if (!mmget_still_valid(mm)) { + if (allocate == 0) + goto free_range; + goto err_no_vma; + } vma = alloc->vma; } if (!vma && need_mm) { - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "%d: binder_alloc_buf failed to map pages in userspace, no vma\n", - alloc->pid); + pr_err("%d: binder_alloc_buf failed to map pages in 
userspace, no vma\n", + alloc->pid); goto err_no_vma; } @@ -284,15 +279,17 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, alloc->pages_high = index + 1; trace_binder_alloc_page_end(alloc, index); + /* vm_insert_page does not seem to increment the refcount */ } if (mm) { - up_read; + up_read(&mm->mmap_sem); mmput(mm); } return 0; free_range: - for (page_addr = end - PAGE_SIZE; 1; page_addr -= PAGE_SIZE) { + for (page_addr = end - PAGE_SIZE; page_addr >= start; + page_addr -= PAGE_SIZE) { bool ret; size_t index; @@ -305,8 +302,6 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, WARN_ON(!ret); trace_binder_free_lru_end(alloc, index); - if (page_addr == start) - break; continue; err_vm_insert_page_failed: @@ -314,47 +309,17 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, page->page_ptr = NULL; err_alloc_page_failed: err_page_ptr_cleared: - if (page_addr == start) - break; + ; } err_no_vma: if (mm) { - up_read; + up_read(&mm->mmap_sem); mmput(mm); } return vma ? -ENOMEM : -ESRCH; } - -static inline void binder_alloc_set_vma(struct binder_alloc *alloc, - struct vm_area_struct *vma) -{ - if (vma) - alloc->vma_vm_mm = vma->vm_mm; - /* - * If we see alloc->vma is not NULL, buffer data structures set up - * completely. Look at smp_rmb side binder_alloc_get_vma. - * We also want to guarantee new alloc->vma_vm_mm is always visible - * if alloc->vma is set. 
- */ - smp_wmb(); - alloc->vma = vma; -} - -static inline struct vm_area_struct *binder_alloc_get_vma( - struct binder_alloc *alloc) -{ - struct vm_area_struct *vma = NULL; - - if (alloc->vma) { - /* Look at description in binder_alloc_set_vma */ - smp_rmb(); - vma = alloc->vma; - } - return vma; -} - -static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid) +static void debug_low_async_space_locked(struct binder_alloc *alloc, int pid) { /* * Find the amount and size of buffers allocated by the current caller; @@ -363,7 +328,7 @@ static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid) * and at some point we'll catch them in the act. This is more efficient * than keeping a map per pid. */ - struct rb_node *n; + struct rb_node *n = alloc->free_buffers.rb_node; struct binder_buffer *buffer; size_t total_alloc_size = 0; size_t num_buffers = 0; @@ -382,19 +347,13 @@ static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid) /* * Warn if this pid has more than 50 transactions, or more than 50% of - * async space (which is 25% of total buffer size). Oneway spam is only - * detected when the threshold is exceeded. + * async space (which is 25% of total buffer size). */ if (num_buffers > 50 || total_alloc_size > alloc->buffer_size / 4) { binder_alloc_debug(BINDER_DEBUG_USER_ERROR, "%d: pid %d spamming oneway? 
%zd buffers allocated for a total size of %zd\n", alloc->pid, pid, num_buffers, total_alloc_size); - if (!alloc->oneway_spam_detected) { - alloc->oneway_spam_detected = true; - return true; - } } - return false; } static struct binder_buffer *binder_alloc_new_buf_locked( @@ -414,15 +373,11 @@ static struct binder_buffer *binder_alloc_new_buf_locked( size_t size, data_offsets_size; int ret; - down_read; - if (!binder_alloc_get_vma(alloc)) { - up_read; - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "%d: binder_alloc_buf, no vma\n", - alloc->pid); + if (alloc->vma == NULL) { + pr_err("%d: binder_alloc_buf, no vma\n", + alloc->pid); return ERR_PTR(-ESRCH); } - up_read; data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); @@ -492,14 +447,11 @@ static struct binder_buffer *binder_alloc_new_buf_locked( if (buffer_size > largest_free_size) largest_free_size = buffer_size; } - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "%d: binder_alloc_buf size %zd failed, no address space\n", - alloc->pid, size); - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n", - total_alloc_size, allocated_buffers, - largest_alloc_size, total_free_size, - free_buffers, largest_free_size); + pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", + alloc->pid, size); + pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n", + total_alloc_size, allocated_buffers, largest_alloc_size, + total_free_size, free_buffers, largest_free_size); return ERR_PTR(-ENOSPC); } if (n == NULL) { @@ -526,7 +478,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked( if (buffer_size != size) { struct binder_buffer *new_buffer; - new_buffer = kmem_cache_zalloc(binder_buffer_pool, GFP_KERNEL); + new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!new_buffer) { pr_err("%s: %d failed to alloc new buffer struct\n", __func__, alloc->pid); @@ -550,7 +502,6 @@ 
static struct binder_buffer *binder_alloc_new_buf_locked( buffer->async_transaction = is_async; buffer->extra_buffers_size = extra_buffers_size; buffer->pid = pid; - buffer->oneway_spam_suspect = false; if (is_async) { alloc->free_async_space -= size + sizeof(struct binder_buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, @@ -562,9 +513,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked( * of async space left (which is less than 10% of total * buffer size). */ - buffer->oneway_spam_suspect = debug_low_async_space_locked(alloc, pid); - } else { - alloc->oneway_spam_detected = false; + debug_low_async_space_locked(alloc, pid); } } return buffer; @@ -624,7 +573,6 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, { struct binder_buffer *prev, *next = NULL; bool to_free = true; - BUG_ON(alloc->buffers.next == &buffer->entry); prev = binder_buffer_prev(buffer); BUG_ON(!prev->free); @@ -665,7 +613,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, buffer_start_page(buffer) + PAGE_SIZE); } list_del(&buffer->entry); - kmem_cache_free(binder_buffer_pool, buffer); + kfree(buffer); } static void binder_free_buf_locked(struct binder_alloc *alloc, @@ -690,7 +638,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, BUG_ON(buffer->user_data > alloc->buffer + alloc->buffer_size); if (buffer->async_transaction) { - alloc->free_async_space += buffer_size + sizeof(struct binder_buffer); + alloc->free_async_space += size + sizeof(struct binder_buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_free_buf size %zd async free %zd\n", @@ -731,7 +679,7 @@ static void binder_alloc_clear_buf(struct binder_alloc *alloc, * @alloc: binder_alloc for this proc * @buffer: kernel pointer to buffer * - * Free the buffer allocated via binder_alloc_new_buf() + * Free the buffer allocated via binder_alloc_new_buffer() */ void binder_alloc_free_buf(struct binder_alloc *alloc, struct binder_buffer *buffer) @@ 
-773,34 +721,27 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, const char *failure_string; struct binder_buffer *buffer; - if (unlikely(vma->vm_mm != alloc->vma_vm_mm)) { - ret = -EINVAL; - failure_string = "invalid vma->vm_mm"; - goto err_invalid_mm; - } - mutex_lock(&binder_alloc_mmap_lock); - if (alloc->buffer_size) { + if (alloc->buffer) { ret = -EBUSY; failure_string = "already mapped"; goto err_already_mapped; } - alloc->buffer_size = min_t(unsigned long, vma->vm_end - vma->vm_start, - SZ_4M); - mutex_unlock(&binder_alloc_mmap_lock); alloc->buffer = (void __user *)vma->vm_start; + mutex_unlock(&binder_alloc_mmap_lock); - alloc->pages = kcalloc(alloc->buffer_size / PAGE_SIZE, - sizeof(alloc->pages[0]), + alloc->pages = kzalloc(sizeof(alloc->pages[0]) * + ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL); if (alloc->pages == NULL) { ret = -ENOMEM; failure_string = "alloc page array"; goto err_alloc_pages_failed; } + alloc->buffer_size = vma->vm_end - vma->vm_start; - buffer = kmem_cache_zalloc(binder_buffer_pool, GFP_KERNEL); + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!buffer) { ret = -ENOMEM; failure_string = "alloc buffer struct"; @@ -812,7 +753,11 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, buffer->free = 1; binder_insert_free_buffer(alloc, buffer); alloc->free_async_space = alloc->buffer_size / 2; - binder_alloc_set_vma(alloc, vma); + barrier(); + alloc->vma = vma; + alloc->vma_vm_mm = vma->vm_mm; + /* Same as mmgrab() in later kernel versions */ + atomic_inc(&alloc->vma_vm_mm->mm_count); return 0; @@ -820,16 +765,12 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, kfree(alloc->pages); alloc->pages = NULL; err_alloc_pages_failed: - alloc->buffer = NULL; mutex_lock(&binder_alloc_mmap_lock); - alloc->buffer_size = 0; + alloc->buffer = NULL; err_already_mapped: mutex_unlock(&binder_alloc_mmap_lock); -err_invalid_mm: - binder_alloc_debug(BINDER_DEBUG_USER_ERROR, - "%s: %d %lx-%lx %s failed %d\n", 
__func__, - alloc->pid, vma->vm_start, vma->vm_end, - failure_string, ret); + pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, + alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret); return ret; } @@ -840,10 +781,10 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) int buffers, page_count; struct binder_buffer *buffer; - buffers = 0; - mutex_lock(&alloc->mutex); BUG_ON(alloc->vma); + buffers = 0; + mutex_lock(&alloc->mutex); while ((n = rb_first(&alloc->allocated_buffers))) { buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -865,7 +806,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) list_del(&buffer->entry); WARN_ON_ONCE(!list_empty(&alloc->buffers)); - kmem_cache_free(binder_buffer_pool, buffer); + kfree(buffer); } page_count = 0; @@ -945,18 +886,6 @@ void binder_alloc_print_pages(struct seq_file *m, int free = 0; mutex_lock(&alloc->mutex); - /* - * Make sure the binder_alloc is fully initialized, otherwise we might - * read inconsistent state. 
- */ - - down_read; - if (binder_alloc_get_vma(alloc) == NULL) { - up_read; - goto uninitialized; - } - - up_read; for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { page = &alloc->pages[i]; if (!page->page_ptr) @@ -966,8 +895,6 @@ void binder_alloc_print_pages(struct seq_file *m, else lru++; } - -uninitialized: mutex_unlock(&alloc->mutex); seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high); @@ -1002,7 +929,7 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) */ void binder_alloc_vma_close(struct binder_alloc *alloc) { - binder_alloc_set_vma(alloc, NULL); + WRITE_ONCE(alloc->vma, NULL); } /** @@ -1018,7 +945,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item, struct list_lru_one *lru, spinlock_t *lock, void *cb_arg) - __must_hold(lock) { struct mm_struct *mm = NULL; struct binder_lru_page *page = container_of(item, @@ -1042,9 +968,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item, mm = alloc->vma_vm_mm; if (!mmget_not_zero(mm)) goto err_mmget; - if (!*down_read_trylock) - goto err_down_read_mmap_sem_failed; - vma = binder_alloc_get_vma(alloc); + if (!down_write_trylock(&mm->mmap_sem)) + goto err_down_write_mmap_sem_failed; + vma = alloc->vma; list_lru_isolate(lru, item); spin_unlock(lock); @@ -1056,8 +982,8 @@ enum lru_status binder_alloc_free_page(struct list_head *item, trace_binder_unmap_user_end(alloc, index); } - up_read; - mmput_async(mm); + up_write(&mm->mmap_sem); + mmput(mm); trace_binder_unmap_kernel_start(alloc, index); @@ -1070,7 +996,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, mutex_unlock(&alloc->mutex); return LRU_REMOVED_RETRY; -err_down_read_mmap_sem_failed: +err_down_write_mmap_sem_failed: mmput_async(mm); err_mmget: err_page_already_freed: @@ -1112,8 +1038,6 @@ static struct shrinker binder_shrinker = { void binder_alloc_init(struct binder_alloc *alloc) { alloc->pid = current->group_leader->pid; - 
alloc->vma_vm_mm = current->mm; - mmgrab(alloc->vma_vm_mm); mutex_init(&alloc->mutex); INIT_LIST_HEAD(&alloc->buffers); } @@ -1271,16 +1195,15 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc, return 0; } -static int binder_alloc_do_buffer_copy(struct binder_alloc *alloc, - bool to_buffer, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - void *ptr, - size_t bytes) +static void binder_alloc_do_buffer_copy(struct binder_alloc *alloc, + bool to_buffer, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + void *ptr, + size_t bytes) { /* All copies must be 32-bit aligned and 32-bit size */ - if (!check_buffer(alloc, buffer, buffer_offset, bytes)) - return -EINVAL; + BUG_ON(!check_buffer(alloc, buffer, buffer_offset, bytes)); while (bytes) { unsigned long size; @@ -1308,25 +1231,25 @@ static int binder_alloc_do_buffer_copy(struct binder_alloc *alloc, ptr = ptr + size; buffer_offset += size; } - return 0; } -int binder_alloc_copy_to_buffer(struct binder_alloc *alloc, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - void *src, - size_t bytes) +void binder_alloc_copy_to_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + void *src, + size_t bytes) { - return binder_alloc_do_buffer_copy(alloc, true, buffer, buffer_offset, - src, bytes); + binder_alloc_do_buffer_copy(alloc, true, buffer, buffer_offset, + src, bytes); } -int binder_alloc_copy_from_buffer(struct binder_alloc *alloc, - void *dest, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - size_t bytes) +void binder_alloc_copy_from_buffer(struct binder_alloc *alloc, + void *dest, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + size_t bytes) { - return binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset, - dest, bytes); + binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset, + dest, bytes); } + diff --git a/drivers/android/binder_alloc.h 
b/drivers/android/binder_alloc.h index a30eb98d99f4..da025cc94cd9 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -1,6 +1,15 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * */ #ifndef _LINUX_BINDER_ALLOC_H @@ -13,6 +22,11 @@ #include #include #include + +#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT +#define BINDER_IPC_32BIT 1 +#endif + #include extern struct list_lru binder_alloc_lru; @@ -26,8 +40,6 @@ struct binder_transaction; * @clear_on_free: %true if buffer must be zeroed after use * @allow_user_free: %true if user is allowed to free buffer * @async_transaction: %true if buffer is in use for an async txn - * @oneway_spam_suspect: %true if total async allocate size just exceed - * spamming detect threshold * @debug_id: unique ID for debugging * @transaction: pointer to associated struct binder_transaction * @target_node: struct binder_node associated with this buffer @@ -47,8 +59,7 @@ struct binder_buffer { unsigned clear_on_free:1; unsigned allow_user_free:1; unsigned async_transaction:1; - unsigned oneway_spam_suspect:1; - unsigned debug_id:27; + unsigned debug_id:28; struct binder_transaction *transaction; @@ -90,8 +101,6 @@ struct binder_lru_page { * @buffer_size: size of address space specified via mmap * @pid: pid for associated binder_proc (invariant after init) * @pages_high: high watermark of offset in @pages - * @oneway_spam_detected: %true if oneway spam detection fired, clear that - * flag once the async buffer has 
returned to a healthy state * * Bookkeeping structure for per-proc address space management for binder * buffers. It is normally initialized during binder_init() and binder_mmap() @@ -112,7 +121,6 @@ struct binder_alloc { uint32_t buffer_free; int pid; size_t pages_high; - bool oneway_spam_detected; }; #ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST @@ -145,8 +153,6 @@ extern void binder_alloc_print_allocated(struct seq_file *m, struct binder_alloc *alloc); void binder_alloc_print_pages(struct seq_file *m, struct binder_alloc *alloc); -extern int binder_buffer_pool_create(void); -extern void binder_buffer_pool_destroy(void); /** * binder_alloc_get_free_async_space() - get free space available for async @@ -172,16 +178,17 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc, const void __user *from, size_t bytes); -int binder_alloc_copy_to_buffer(struct binder_alloc *alloc, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - void *src, - size_t bytes); +void binder_alloc_copy_to_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + void *src, + size_t bytes); -int binder_alloc_copy_from_buffer(struct binder_alloc *alloc, - void *dest, - struct binder_buffer *buffer, - binder_size_t buffer_offset, - size_t bytes); +void binder_alloc_copy_from_buffer(struct binder_alloc *alloc, + void *dest, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + size_t bytes); #endif /* _LINUX_BINDER_ALLOC_H */ + diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index c2b323bc3b3a..c839c490fde3 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -1,9 +1,18 @@ -// SPDX-License-Identifier: GPL-2.0-only /* binder_alloc_selftest.c * * Android IPC Subsystem * * Copyright (C) 2017 Google, Inc. 
+ * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h deleted file mode 100644 index 3b6918d8a977..000000000000 --- a/drivers/android/binder_internal.h +++ /dev/null @@ -1,603 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _LINUX_BINDER_INTERNAL_H -#define _LINUX_BINDER_INTERNAL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "binder_alloc.h" - -#define ida_alloc_max(a, b, c) ida_simple_get(a, 0, b + 1, c) -#define ida_free ida_remove - -typedef unsigned int __poll_t; -typedef __bitwise int vm_fault_t; - -struct binder_context { - struct binder_node *binder_context_mgr_node; - struct mutex context_mgr_node_lock; - kuid_t binder_context_mgr_uid; - const char *name; -}; - -/** - * struct binder_device - information about a binder device node - * @hlist: list of binder devices (only used for devices requested via - * CONFIG_ANDROID_BINDER_DEVICES) - * @miscdev: information about a binder character device node - * @context: binder context information - * @binderfs_inode: This is the inode of the root dentry of the super block - * belonging to a binderfs mount. 
- */ -struct binder_device { - struct hlist_node hlist; - struct miscdevice miscdev; - struct binder_context context; - struct inode *binderfs_inode; - refcount_t ref; -}; - -/** - * binderfs_mount_opts - mount options for binderfs - * @max: maximum number of allocatable binderfs binder devices - * @stats_mode: enable binder stats in binderfs. - */ -struct binderfs_mount_opts { - int max; - int stats_mode; -}; - -/** - * binderfs_info - information about a binderfs mount - * @ipc_ns: The ipc namespace the binderfs mount belongs to. - * @control_dentry: This records the dentry of this binderfs mount - * binder-control device. - * @root_uid: uid that needs to be used when a new binder device is - * created. - * @root_gid: gid that needs to be used when a new binder device is - * created. - * @mount_opts: The mount options in use. - * @device_count: The current number of allocated binder devices. - * @proc_log_dir: Pointer to the directory dentry containing process-specific - * logs. - */ -struct binderfs_info { - struct ipc_namespace *ipc_ns; - struct dentry *control_dentry; - kuid_t root_uid; - kgid_t root_gid; - struct binderfs_mount_opts mount_opts; - int device_count; - struct dentry *proc_log_dir; -}; - -extern const struct file_operations binder_fops; - -extern char *binder_devices_param; - -#ifdef CONFIG_ANDROID_BINDERFS -extern bool is_binderfs_device(const struct inode *inode); -extern struct dentry *binderfs_create_file(struct dentry *dir, const char *name, - const struct file_operations *fops, - void *data); -extern void binderfs_remove_file(struct dentry *dentry); -#else -static inline bool is_binderfs_device(const struct inode *inode) -{ - return false; -} -static inline struct dentry *binderfs_create_file(struct dentry *dir, - const char *name, - const struct file_operations *fops, - void *data) -{ - return NULL; -} -static inline void binderfs_remove_file(struct dentry *dentry) {} -#endif - -#ifdef CONFIG_ANDROID_BINDERFS -extern int __init 
init_binderfs(void); -#else -static inline int __init init_binderfs(void) -{ - return 0; -} -#endif - -struct binder_debugfs_entry { - const char *name; - umode_t mode; - const struct file_operations *fops; - void *data; -}; - -extern const struct binder_debugfs_entry binder_debugfs_entries[]; - -#define binder_for_each_debugfs_entry(entry) \ - for ((entry) = binder_debugfs_entries; \ - (entry)->name; \ - (entry)++) - -enum binder_stat_types { - BINDER_STAT_PROC, - BINDER_STAT_THREAD, - BINDER_STAT_NODE, - BINDER_STAT_REF, - BINDER_STAT_DEATH, - BINDER_STAT_TRANSACTION, - BINDER_STAT_TRANSACTION_COMPLETE, - BINDER_STAT_COUNT -}; - -struct binder_stats { - atomic_t br[_IOC_NR(BR_ONEWAY_SPAM_SUSPECT) + 1]; - atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1]; - atomic_t obj_created[BINDER_STAT_COUNT]; - atomic_t obj_deleted[BINDER_STAT_COUNT]; -}; - -/** - * struct binder_work - work enqueued on a worklist - * @entry: node enqueued on list - * @type: type of work to be performed - * - * There are separate work lists for proc, thread, and node (async). 
- */ -struct binder_work { - struct list_head entry; - - enum binder_work_type { - BINDER_WORK_TRANSACTION = 1, - BINDER_WORK_TRANSACTION_COMPLETE, - BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT, - BINDER_WORK_RETURN_ERROR, - BINDER_WORK_NODE, - BINDER_WORK_DEAD_BINDER, - BINDER_WORK_DEAD_BINDER_AND_CLEAR, - BINDER_WORK_CLEAR_DEATH_NOTIFICATION, - } type; -}; - -struct binder_error { - struct binder_work work; - uint32_t cmd; -}; - -/** - * struct binder_node - binder node bookkeeping - * @debug_id: unique ID for debugging - * (invariant after initialized) - * @lock: lock for node fields - * @work: worklist element for node work - * (protected by @proc->inner_lock) - * @rb_node: element for proc->nodes tree - * (protected by @proc->inner_lock) - * @dead_node: element for binder_dead_nodes list - * (protected by binder_dead_nodes_lock) - * @proc: binder_proc that owns this node - * (invariant after initialized) - * @refs: list of references on this node - * (protected by @lock) - * @internal_strong_refs: used to take strong references when - * initiating a transaction - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @local_weak_refs: weak user refs from local process - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @local_strong_refs: strong user refs from local process - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @tmp_refs: temporary kernel refs - * (protected by @proc->inner_lock while @proc - * is valid, and by binder_dead_nodes_lock - * if @proc is NULL. 
During inc/dec and node release - * it is also protected by @lock to provide safety - * as the node dies and @proc becomes NULL) - * @ptr: userspace pointer for node - * (invariant, no lock needed) - * @cookie: userspace cookie for node - * (invariant, no lock needed) - * @has_strong_ref: userspace notified of strong ref - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @pending_strong_ref: userspace has acked notification of strong ref - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @has_weak_ref: userspace notified of weak ref - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @pending_weak_ref: userspace has acked notification of weak ref - * (protected by @proc->inner_lock if @proc - * and by @lock) - * @has_async_transaction: async transaction to node in progress - * (protected by @lock) - * @sched_policy: minimum scheduling policy for node - * (invariant after initialized) - * @accept_fds: file descriptor operations supported for node - * (invariant after initialized) - * @min_priority: minimum scheduling priority - * (invariant after initialized) - * @inherit_rt: inherit RT scheduling policy from caller - * @txn_security_ctx: require sender's security context - * (invariant after initialized) - * @async_todo: list of async work items - * (protected by @proc->inner_lock) - * - * Bookkeeping structure for binder nodes. 
- */ -struct binder_node { - int debug_id; - spinlock_t lock; - struct binder_work work; - union { - struct rb_node rb_node; - struct hlist_node dead_node; - }; - struct binder_proc *proc; - struct hlist_head refs; - int internal_strong_refs; - int local_weak_refs; - int local_strong_refs; - int tmp_refs; - binder_uintptr_t ptr; - binder_uintptr_t cookie; - struct { - /* - * bitfield elements protected by - * proc inner_lock - */ - u8 has_strong_ref:1; - u8 pending_strong_ref:1; - u8 has_weak_ref:1; - u8 pending_weak_ref:1; - }; - struct { - /* - * invariant after initialization - */ - u8 sched_policy:2; - u8 inherit_rt:1; - u8 accept_fds:1; - u8 txn_security_ctx:1; - u8 min_priority; - }; - bool has_async_transaction; - struct list_head async_todo; -}; - -struct binder_ref_death { - /** - * @work: worklist element for death notifications - * (protected by inner_lock of the proc that - * this ref belongs to) - */ - struct binder_work work; - binder_uintptr_t cookie; -}; - -/** - * struct binder_ref_data - binder_ref counts and id - * @debug_id: unique ID for the ref - * @desc: unique userspace handle for ref - * @strong: strong ref count (debugging only if not locked) - * @weak: weak ref count (debugging only if not locked) - * - * Structure to hold ref count and ref id information. Since - * the actual ref can only be accessed with a lock, this structure - * is used to return information about the ref to callers of - * ref inc/dec functions. - */ -struct binder_ref_data { - int debug_id; - uint32_t desc; - int strong; - int weak; -}; - -/** - * struct binder_ref - struct to track references on nodes - * @data: binder_ref_data containing id, handle, and current refcounts - * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree - * @rb_node_node: node for lookup by @node in proc's rb_tree - * @node_entry: list entry for node->refs list in target node - * (protected by @node->lock) - * @proc: binder_proc containing ref - * @node: binder_node of target node. 
When cleaning up a - * ref for deletion in binder_cleanup_ref, a non-NULL - * @node indicates the node must be freed - * @death: pointer to death notification (ref_death) if requested - * (protected by @node->lock) - * - * Structure to track references from procA to target node (on procB). This - * structure is unsafe to access without holding @proc->outer_lock. - */ -struct binder_ref { - /* Lookups needed: */ - /* node + proc => ref (transaction) */ - /* desc + proc => ref (transaction, inc/dec ref) */ - /* node => refs + procs (proc exit) */ - struct binder_ref_data data; - struct rb_node rb_node_desc; - struct rb_node rb_node_node; - struct hlist_node node_entry; - struct binder_proc *proc; - struct binder_node *node; - struct binder_ref_death *death; -}; - -/** - * struct binder_priority - scheduler policy and priority - * @sched_policy scheduler policy - * @prio [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT - * - * The binder driver supports inheriting the following scheduler policies: - * SCHED_NORMAL - * SCHED_BATCH - * SCHED_FIFO - * SCHED_RR - */ -struct binder_priority { - unsigned int sched_policy; - int prio; -}; - -enum binder_prio_state { - BINDER_PRIO_SET, /* desired priority set */ - BINDER_PRIO_PENDING, /* initiated a saved priority restore */ - BINDER_PRIO_ABORT, /* abort the pending priority restore */ -}; - -/** - * struct binder_proc - binder process bookkeeping - * @proc_node: element for binder_procs list - * @threads: rbtree of binder_threads in this proc - * (protected by @inner_lock) - * @nodes: rbtree of binder nodes associated with - * this proc ordered by node->ptr - * (protected by @inner_lock) - * @refs_by_desc: rbtree of refs ordered by ref->desc - * (protected by @outer_lock) - * @refs_by_node: rbtree of refs ordered by ref->node - * (protected by @outer_lock) - * @waiting_threads: threads currently waiting for proc work - * (protected by @inner_lock) - * @pid PID of group_leader of process - * (invariant after initialized) - * 
@tsk task_struct for group_leader of process - * (invariant after initialized) - * @cred struct cred associated with the `struct file` - * in binder_open() - * (invariant after initialized) - * @deferred_work_node: element for binder_deferred_list - * (protected by binder_deferred_lock) - * @deferred_work: bitmap of deferred work to perform - * (protected by binder_deferred_lock) - * @outstanding_txns: number of transactions to be transmitted before - * processes in freeze_wait are woken up - * (protected by @inner_lock) - * @is_dead: process is dead and awaiting free - * when outstanding transactions are cleaned up - * (protected by @inner_lock) - * @is_frozen: process is frozen and unable to service - * binder transactions - * (protected by @inner_lock) - * @sync_recv: process received sync transactions since last frozen - * bit 0: received sync transaction after being frozen - * bit 1: new pending sync transaction during freezing - * (protected by @inner_lock) - * @async_recv: process received async transactions since last frozen - * (protected by @inner_lock) - * @freeze_wait: waitqueue of processes waiting for all outstanding - * transactions to be processed - * (protected by @inner_lock) - * @todo: list of work for this process - * (protected by @inner_lock) - * @stats: per-process binder statistics - * (atomics, no lock needed) - * @delivered_death: list of delivered death notification - * (protected by @inner_lock) - * @max_threads: cap on number of binder threads - * (protected by @inner_lock) - * @requested_threads: number of binder threads requested but not - * yet started. In current implementation, can - * only be 0 or 1. 
- * (protected by @inner_lock) - * @requested_threads_started: number binder threads started - * (protected by @inner_lock) - * @tmp_ref: temporary reference to indicate proc is in use - * (protected by @inner_lock) - * @default_priority: default scheduler priority - * (invariant after initialized) - * @debugfs_entry: debugfs node - * @alloc: binder allocator bookkeeping - * @context: binder_context for this proc - * (invariant after initialized) - * @inner_lock: can nest under outer_lock and/or node lock - * @outer_lock: no nesting under innor or node lock - * Lock order: 1) outer, 2) node, 3) inner - * @binderfs_entry: process-specific binderfs log file - * @oneway_spam_detection_enabled: process enabled oneway spam detection - * or not - * - * Bookkeeping structure for binder processes - */ -struct binder_proc { - struct hlist_node proc_node; - struct rb_root threads; - struct rb_root nodes; - struct rb_root refs_by_desc; - struct rb_root refs_by_node; - struct list_head waiting_threads; - int pid; - struct task_struct *tsk; - const struct cred *cred; - struct hlist_node deferred_work_node; - int deferred_work; - int outstanding_txns; - bool is_dead; - bool is_frozen; - bool sync_recv; - bool async_recv; - wait_queue_head_t freeze_wait; - - struct list_head todo; - struct binder_stats stats; - struct list_head delivered_death; - int max_threads; - int requested_threads; - int requested_threads_started; - int tmp_ref; - struct binder_priority default_priority; - struct dentry *debugfs_entry; - struct binder_alloc alloc; - struct binder_context *context; - spinlock_t inner_lock; - spinlock_t outer_lock; - struct dentry *binderfs_entry; - bool oneway_spam_detection_enabled; -}; - -/** - * struct binder_thread - binder thread bookkeeping - * @proc: binder process for this thread - * (invariant after initialization) - * @rb_node: element for proc->threads rbtree - * (protected by @proc->inner_lock) - * @waiting_thread_node: element for @proc->waiting_threads list - * 
(protected by @proc->inner_lock) - * @pid: PID for this thread - * (invariant after initialization) - * @looper: bitmap of looping state - * (only accessed by this thread) - * @looper_needs_return: looping thread needs to exit driver - * (no lock needed) - * @transaction_stack: stack of in-progress transactions for this thread - * (protected by @proc->inner_lock) - * @todo: list of work to do for this thread - * (protected by @proc->inner_lock) - * @process_todo: whether work in @todo should be processed - * (protected by @proc->inner_lock) - * @return_error: transaction errors reported by this thread - * (only accessed by this thread) - * @reply_error: transaction errors reported by target thread - * (protected by @proc->inner_lock) - * @wait: wait queue for thread work - * @stats: per-thread statistics - * (atomics, no lock needed) - * @tmp_ref: temporary reference to indicate thread is in use - * (atomic since @proc->inner_lock cannot - * always be acquired) - * @is_dead: thread is dead and awaiting free - * when outstanding transactions are cleaned up - * (protected by @proc->inner_lock) - * @task: struct task_struct for this thread - * @prio_lock: protects thread priority fields - * @prio_next: saved priority to be restored next - * (protected by @prio_lock) - * @prio_state: state of the priority restore process as - * defined by enum binder_prio_state - * (protected by @prio_lock) - * - * Bookkeeping structure for binder threads. 
- */ -struct binder_thread { - struct binder_proc *proc; - struct rb_node rb_node; - struct list_head waiting_thread_node; - int pid; - int looper; /* only modified by this thread */ - bool looper_need_return; /* can be written by other thread */ - struct binder_transaction *transaction_stack; - struct list_head todo; - bool process_todo; - struct binder_error return_error; - struct binder_error reply_error; - wait_queue_head_t wait; - struct binder_stats stats; - atomic_t tmp_ref; - bool is_dead; - struct task_struct *task; - spinlock_t prio_lock; - struct binder_priority prio_next; - enum binder_prio_state prio_state; -}; - -/** - * struct binder_txn_fd_fixup - transaction fd fixup list element - * @fixup_entry: list entry - * @file: struct file to be associated with new fd - * @offset: offset in buffer data to this fixup - * - * List element for fd fixups in a transaction. Since file - * descriptors need to be allocated in the context of the - * target process, we pass each fd to be processed in this - * struct. 
- */ -struct binder_txn_fd_fixup { - struct list_head fixup_entry; - struct file *file; - size_t offset; -}; - -struct binder_transaction { - int debug_id; - struct binder_work work; - struct binder_thread *from; - struct binder_transaction *from_parent; - struct binder_proc *to_proc; - struct binder_thread *to_thread; - struct binder_transaction *to_parent; - unsigned need_reply:1; - /* unsigned is_dead:1; */ /* not used at the moment */ - - struct binder_buffer *buffer; - unsigned int code; - unsigned int flags; - struct binder_priority priority; - struct binder_priority saved_priority; - bool set_priority_called; - bool is_nested; - kuid_t sender_euid; - struct list_head fd_fixups; - binder_uintptr_t security_ctx; - /** - * @lock: protects @from, @to_proc, and @to_thread - * - * @from, @to_proc, and @to_thread can be set to NULL - * during thread teardown - */ - spinlock_t lock; -}; - -/** - * struct binder_object - union of flat binder object types - * @hdr: generic object header - * @fbo: binder object (nodes and refs) - * @fdo: file descriptor object - * @bbo: binder buffer pointer - * @fdao: file descriptor array - * - * Used for type-independent object copies - */ -struct binder_object { - union { - struct binder_object_header hdr; - struct flat_binder_object fbo; - struct binder_fd_object fdo; - struct binder_buffer_object bbo; - struct binder_fd_array_object fdao; - }; -}; - -#endif /* _LINUX_BINDER_INTERNAL_H */ diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 5d82cf8af88b..7674231af8cb 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -1,6 +1,15 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * */ #undef TRACE_SYSTEM @@ -119,35 +128,6 @@ TRACE_EVENT(binder_wait_for_work, __entry->thread_todo) ); -TRACE_EVENT(binder_txn_latency_free, - TP_PROTO(struct binder_transaction *t, - int from_proc, int from_thread, - int to_proc, int to_thread), - TP_ARGS(t, from_proc, from_thread, to_proc, to_thread), - TP_STRUCT__entry( - __field(int, debug_id) - __field(int, from_proc) - __field(int, from_thread) - __field(int, to_proc) - __field(int, to_thread) - __field(unsigned int, code) - __field(unsigned int, flags) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - __entry->from_proc = from_proc; - __entry->from_thread = from_thread; - __entry->to_proc = to_proc; - __entry->to_thread = to_thread; - __entry->code = t->code; - __entry->flags = t->flags; - ), - TP_printk("transaction=%d from %d:%d to %d:%d flags=0x%x code=0x%x", - __entry->debug_id, __entry->from_proc, __entry->from_thread, - __entry->to_proc, __entry->to_thread, __entry->code, - __entry->flags) -); - TRACE_EVENT(binder_transaction, TP_PROTO(bool reply, struct binder_transaction *t, struct binder_node *target_node), @@ -267,40 +247,22 @@ TRACE_EVENT(binder_transaction_ref_to_ref, __entry->dest_ref_debug_id, __entry->dest_ref_desc) ); -TRACE_EVENT(binder_transaction_fd_send, - TP_PROTO(struct binder_transaction *t, int fd, size_t offset), - TP_ARGS(t, fd, offset), +TRACE_EVENT(binder_transaction_fd, + TP_PROTO(struct binder_transaction *t, int src_fd, int dest_fd), + TP_ARGS(t, src_fd, dest_fd), TP_STRUCT__entry( __field(int, debug_id) - __field(int, fd) - __field(size_t, offset) + __field(int, src_fd) + __field(int, dest_fd) ), TP_fast_assign( __entry->debug_id = t->debug_id; - __entry->fd = fd; - __entry->offset = offset; + __entry->src_fd = 
src_fd; + __entry->dest_fd = dest_fd; ), - TP_printk("transaction=%d src_fd=%d offset=%zu", - __entry->debug_id, __entry->fd, __entry->offset) -); - -TRACE_EVENT(binder_transaction_fd_recv, - TP_PROTO(struct binder_transaction *t, int fd, size_t offset), - TP_ARGS(t, fd, offset), - - TP_STRUCT__entry( - __field(int, debug_id) - __field(int, fd) - __field(size_t, offset) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - __entry->fd = fd; - __entry->offset = offset; - ), - TP_printk("transaction=%d dest_fd=%d offset=%zu", - __entry->debug_id, __entry->fd, __entry->offset) + TP_printk("transaction=%d src_fd=%d ==> dest_fd=%d", + __entry->debug_id, __entry->src_fd, __entry->dest_fd) ); DECLARE_EVENT_CLASS(binder_buffer_class, @@ -310,17 +272,14 @@ DECLARE_EVENT_CLASS(binder_buffer_class, __field(int, debug_id) __field(size_t, data_size) __field(size_t, offsets_size) - __field(size_t, extra_buffers_size) ), TP_fast_assign( __entry->debug_id = buf->debug_id; __entry->data_size = buf->data_size; __entry->offsets_size = buf->offsets_size; - __entry->extra_buffers_size = buf->extra_buffers_size; ), - TP_printk("transaction=%d data_size=%zd offsets_size=%zd extra_buffers_size=%zd", - __entry->debug_id, __entry->data_size, __entry->offsets_size, - __entry->extra_buffers_size) + TP_printk("transaction=%d data_size=%zd offsets_size=%zd", + __entry->debug_id, __entry->data_size, __entry->offsets_size) ); DEFINE_EVENT(binder_buffer_class, binder_transaction_alloc_buf, @@ -335,10 +294,6 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release, TP_PROTO(struct binder_buffer *buffer), TP_ARGS(buffer)); -DEFINE_EVENT(binder_buffer_class, binder_transaction_update_buffer_release, - TP_PROTO(struct binder_buffer *buffer), - TP_ARGS(buffer)); - TRACE_EVENT(binder_update_page_range, TP_PROTO(struct binder_alloc *alloc, bool allocate, void __user *start, void __user *end), diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c deleted file mode 
100644 index f80d1fb9d9b2..000000000000 --- a/drivers/android/binderfs.c +++ /dev/null @@ -1,819 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "binder_internal.h" - -#define ida_alloc_max(a, b, c) ida_simple_get(a, 0, b + 1, c) -#define ida_free ida_remove - -#define FIRST_INODE 1 -#define SECOND_INODE 2 -#define INODE_OFFSET 3 -#define INTSTRLEN 21 -#define BINDERFS_MAX_MINOR (1U << MINORBITS) -/* Ensure that the initial ipc namespace always has devices available. */ -#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4) - -static dev_t binderfs_dev; -static DEFINE_MUTEX(binderfs_minors_mutex); -static DEFINE_IDA(binderfs_minors); - -enum binderfs_param { - Opt_max, - Opt_stats_mode, -}; - -enum binderfs_stats_mode { - binderfs_stats_mode_unset, - binderfs_stats_mode_global, -}; - -struct binder_features { - bool oneway_spam_detection; -}; - -static const struct constant_table binderfs_param_stats[] = { - { "global", binderfs_stats_mode_global }, - {} -}; - -static const struct fs_parameter_spec binderfs_fs_parameters[] = { - fsparam_u32("max", Opt_max), - fsparam_enum("stats", Opt_stats_mode, binderfs_param_stats), - {} -}; - -static struct binder_features binder_features = { - .oneway_spam_detection = true, -}; - -static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb) -{ - return sb->s_fs_info; -} - -bool is_binderfs_device(const struct inode *inode) -{ - if (inode->i_sb->s_magic == BINDERFS_SUPER_MAGIC) - return true; - - return false; -} - -/** - * binderfs_binder_device_create - allocate inode from super block of a - * binderfs mount - * @ref_inode: inode from wich the super block will be taken - * 
@userp: buffer to copy information about new device for userspace to - * @req: struct binderfs_device as copied from userspace - * - * This function allocates a new binder_device and reserves a new minor - * number for it. - * Minor numbers are limited and tracked globally in binderfs_minors. The - * function will stash a struct binder_device for the specific binder - * device in i_private of the inode. - * It will go on to allocate a new inode from the super block of the - * filesystem mount, stash a struct binder_device in its i_private field - * and attach a dentry to that inode. - * - * Return: 0 on success, negative errno on failure - */ -static int binderfs_binder_device_create(struct inode *ref_inode, - struct binderfs_device __user *userp, - struct binderfs_device *req) -{ - int minor, ret; - struct dentry *dentry, *root; - struct binder_device *device; - char *name = NULL; - size_t name_len; - struct inode *inode = NULL; - struct super_block *sb = ref_inode->i_sb; - struct binderfs_info *info = sb->s_fs_info; -#if defined(CONFIG_IPC_NS) - bool use_reserve = (info->ipc_ns == &init_ipc_ns); -#else - bool use_reserve = true; -#endif - - /* Reserve new minor number for the new device. */ - mutex_lock(&binderfs_minors_mutex); - if (++info->device_count <= info->mount_opts.max) - minor = ida_alloc_max(&binderfs_minors, - use_reserve ? 
BINDERFS_MAX_MINOR : - BINDERFS_MAX_MINOR_CAPPED, - GFP_KERNEL); - else - minor = -ENOSPC; - if (minor < 0) { - --info->device_count; - mutex_unlock(&binderfs_minors_mutex); - return minor; - } - mutex_unlock(&binderfs_minors_mutex); - - ret = -ENOMEM; - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - goto err; - - inode = new_inode(sb); - if (!inode) - goto err; - - inode->i_ino = minor + INODE_OFFSET; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - init_special_inode(inode, S_IFCHR | 0600, - MKDEV(MAJOR(binderfs_dev), minor)); - inode->i_fop = &binder_fops; - inode->i_uid = info->root_uid; - inode->i_gid = info->root_gid; - - req->name[BINDERFS_MAX_NAME] = '\0'; /* NUL-terminate */ - name_len = strlen(req->name); - /* Make sure to include terminating NUL byte */ - name = kmemdup(req->name, name_len + 1, GFP_KERNEL); - if (!name) - goto err; - - refcount_set(&device->ref, 1); - device->binderfs_inode = inode; - device->context.binder_context_mgr_uid = INVALID_UID; - device->context.name = name; - device->miscdev.name = name; - device->miscdev.minor = minor; - mutex_init(&device->context.context_mgr_node_lock); - - req->major = MAJOR(binderfs_dev); - req->minor = minor; - - if (userp && copy_to_user(userp, req, sizeof(*req))) { - ret = -EFAULT; - goto err; - } - - root = sb->s_root; - inode_lock(d_inode(root)); - - /* look it up */ - dentry = lookup_one_len(name, root, name_len); - if (IS_ERR(dentry)) { - inode_unlock(d_inode(root)); - ret = PTR_ERR(dentry); - goto err; - } - - if (d_really_is_positive(dentry)) { - /* already exists */ - dput(dentry); - inode_unlock(d_inode(root)); - ret = -EEXIST; - goto err; - } - - inode->i_private = device; - d_instantiate(dentry, inode); - fsnotify_create(root->d_inode, dentry); - inode_unlock(d_inode(root)); - - return 0; - -err: - kfree(name); - kfree(device); - mutex_lock(&binderfs_minors_mutex); - --info->device_count; - ida_free(&binderfs_minors, minor); - 
mutex_unlock(&binderfs_minors_mutex); - iput(inode); - - return ret; -} - -/** - * binderfs_ctl_ioctl - handle binder device node allocation requests - * - * The request handler for the binder-control device. All requests operate on - * the binderfs mount the binder-control device resides in: - * - BINDER_CTL_ADD - * Allocate a new binder device. - * - * Return: 0 on success, negative errno on failure - */ -static long binder_ctl_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret = -EINVAL; - struct inode *inode = file_inode(file); - struct binderfs_device __user *device = (struct binderfs_device __user *)arg; - struct binderfs_device device_req; - - switch (cmd) { - case BINDER_CTL_ADD: - ret = copy_from_user(&device_req, device, sizeof(device_req)); - if (ret) { - ret = -EFAULT; - break; - } - - ret = binderfs_binder_device_create(inode, device, &device_req); - break; - default: - break; - } - - return ret; -} - -static void binderfs_evict_inode(struct inode *inode) -{ - struct binder_device *device = inode->i_private; - struct binderfs_info *info = BINDERFS_SB(inode->i_sb); - - clear_inode(inode); - - if (!S_ISCHR(inode->i_mode) || !device) - return; - - mutex_lock(&binderfs_minors_mutex); - --info->device_count; - ida_free(&binderfs_minors, device->miscdev.minor); - mutex_unlock(&binderfs_minors_mutex); - - if (refcount_dec_and_test(&device->ref)) { - kfree(device->context.name); - kfree(device); - } -} - -static int binderfs_fs_context_parse_param(struct fs_context *fc, - struct fs_parameter *param) -{ - int opt; - struct binderfs_mount_opts *ctx = fc->fs_private; - struct fs_parse_result result; - - opt = fs_parse(fc, binderfs_fs_parameters, param, &result); - if (opt < 0) - return opt; - - switch (opt) { - case Opt_max: - if (result.uint_32 > BINDERFS_MAX_MINOR) - return invalfc(fc, "Bad value for '%s'", param->key); - - ctx->max = result.uint_32; - break; - case Opt_stats_mode: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - 
ctx->stats_mode = result.uint_32; - break; - default: - return invalfc(fc, "Unsupported parameter '%s'", param->key); - } - - return 0; -} - -static int binderfs_fs_context_reconfigure(struct fs_context *fc) -{ - struct binderfs_mount_opts *ctx = fc->fs_private; - struct binderfs_info *info = BINDERFS_SB(fc->root->d_sb); - - if (info->mount_opts.stats_mode != ctx->stats_mode) - return invalfc(fc, "Binderfs stats mode cannot be changed during a remount"); - - info->mount_opts.stats_mode = ctx->stats_mode; - info->mount_opts.max = ctx->max; - return 0; -} - -static int binderfs_show_options(struct seq_file *seq, struct dentry *root) -{ - struct binderfs_info *info = BINDERFS_SB(root->d_sb); - - if (info->mount_opts.max <= BINDERFS_MAX_MINOR) - seq_printf(seq, ",max=%d", info->mount_opts.max); - - switch (info->mount_opts.stats_mode) { - case binderfs_stats_mode_unset: - break; - case binderfs_stats_mode_global: - seq_printf(seq, ",stats=global"); - break; - } - - return 0; -} - -static void binderfs_put_super(struct super_block *sb) -{ - struct binderfs_info *info = sb->s_fs_info; - - if (info && info->ipc_ns) - put_ipc_ns(info->ipc_ns); - - kfree(info); - sb->s_fs_info = NULL; -} - -static const struct super_operations binderfs_super_ops = { - .evict_inode = binderfs_evict_inode, - .show_options = binderfs_show_options, - .statfs = simple_statfs, - .put_super = binderfs_put_super, -}; - -static inline bool is_binderfs_control_device(const struct dentry *dentry) -{ - struct binderfs_info *info = dentry->d_sb->s_fs_info; - - return info->control_dentry == dentry; -} - -static int binderfs_rename(struct user_namespace *mnt_userns, - struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) -{ - if (is_binderfs_control_device(old_dentry) || - is_binderfs_control_device(new_dentry)) - return -EPERM; - - return simple_rename(&init_user_ns, old_dir, old_dentry, new_dir, - new_dentry, flags); -} - -static int 
binderfs_unlink(struct inode *dir, struct dentry *dentry) -{ - if (is_binderfs_control_device(dentry)) - return -EPERM; - - return simple_unlink(dir, dentry); -} - -static const struct file_operations binder_ctl_fops = { - .owner = THIS_MODULE, - .open = nonseekable_open, - .unlocked_ioctl = binder_ctl_ioctl, - .compat_ioctl = binder_ctl_ioctl, - .llseek = noop_llseek, -}; - -/** - * binderfs_binder_ctl_create - create a new binder-control device - * @sb: super block of the binderfs mount - * - * This function creates a new binder-control device node in the binderfs mount - * referred to by @sb. - * - * Return: 0 on success, negative errno on failure - */ -static int binderfs_binder_ctl_create(struct super_block *sb) -{ - int minor, ret; - struct dentry *dentry; - struct binder_device *device; - struct inode *inode = NULL; - struct dentry *root = sb->s_root; - struct binderfs_info *info = sb->s_fs_info; -#if defined(CONFIG_IPC_NS) - bool use_reserve = (info->ipc_ns == &init_ipc_ns); -#else - bool use_reserve = true; -#endif - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - return -ENOMEM; - - /* If we have already created a binder-control node, return. */ - if (info->control_dentry) { - ret = 0; - goto out; - } - - ret = -ENOMEM; - inode = new_inode(sb); - if (!inode) - goto out; - - /* Reserve a new minor number for the new device. */ - mutex_lock(&binderfs_minors_mutex); - minor = ida_alloc_max(&binderfs_minors, - use_reserve ? 
BINDERFS_MAX_MINOR : - BINDERFS_MAX_MINOR_CAPPED, - GFP_KERNEL); - mutex_unlock(&binderfs_minors_mutex); - if (minor < 0) { - ret = minor; - goto out; - } - - inode->i_ino = SECOND_INODE; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - init_special_inode(inode, S_IFCHR | 0600, - MKDEV(MAJOR(binderfs_dev), minor)); - inode->i_fop = &binder_ctl_fops; - inode->i_uid = info->root_uid; - inode->i_gid = info->root_gid; - - refcount_set(&device->ref, 1); - device->binderfs_inode = inode; - device->miscdev.minor = minor; - - dentry = d_alloc_name(root, "binder-control"); - if (!dentry) - goto out; - - inode->i_private = device; - info->control_dentry = dentry; - d_add(dentry, inode); - - return 0; - -out: - kfree(device); - iput(inode); - - return ret; -} - -static const struct inode_operations binderfs_dir_inode_operations = { - .lookup = simple_lookup, - .rename = binderfs_rename, - .unlink = binderfs_unlink, -}; - -static struct inode *binderfs_make_inode(struct super_block *sb, int mode) -{ - struct inode *ret; - - ret = new_inode(sb); - if (ret) { - ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET); - ret->i_mode = mode; - ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret); - } - return ret; -} - -static struct dentry *binderfs_create_dentry(struct dentry *parent, - const char *name) -{ - struct dentry *dentry; - - dentry = lookup_one_len(name, parent, strlen(name)); - if (IS_ERR(dentry)) - return dentry; - - /* Return error if the file/dir already exists. 
*/ - if (d_really_is_positive(dentry)) { - dput(dentry); - return ERR_PTR(-EEXIST); - } - - return dentry; -} - -void binderfs_remove_file(struct dentry *dentry) -{ - struct inode *parent_inode; - - parent_inode = d_inode(dentry->d_parent); - inode_lock(parent_inode); - if (simple_positive(dentry)) { - dget(dentry); - simple_unlink(parent_inode, dentry); - d_delete(dentry); - dput(dentry); - } - inode_unlock(parent_inode); -} - -struct dentry *binderfs_create_file(struct dentry *parent, const char *name, - const struct file_operations *fops, - void *data) -{ - struct dentry *dentry; - struct inode *new_inode, *parent_inode; - struct super_block *sb; - - parent_inode = d_inode(parent); - inode_lock(parent_inode); - - dentry = binderfs_create_dentry(parent, name); - if (IS_ERR(dentry)) - goto out; - - sb = parent_inode->i_sb; - new_inode = binderfs_make_inode(sb, S_IFREG | 0444); - if (!new_inode) { - dput(dentry); - dentry = ERR_PTR(-ENOMEM); - goto out; - } - - new_inode->i_fop = fops; - new_inode->i_private = data; - d_instantiate(dentry, new_inode); - fsnotify_create(parent_inode, dentry); - -out: - inode_unlock(parent_inode); - return dentry; -} - -static struct dentry *binderfs_create_dir(struct dentry *parent, - const char *name) -{ - struct dentry *dentry; - struct inode *new_inode, *parent_inode; - struct super_block *sb; - - parent_inode = d_inode(parent); - inode_lock(parent_inode); - - dentry = binderfs_create_dentry(parent, name); - if (IS_ERR(dentry)) - goto out; - - sb = parent_inode->i_sb; - new_inode = binderfs_make_inode(sb, S_IFDIR | 0755); - if (!new_inode) { - dput(dentry); - dentry = ERR_PTR(-ENOMEM); - goto out; - } - - new_inode->i_fop = &simple_dir_operations; - new_inode->i_op = &simple_dir_inode_operations; - - set_nlink(new_inode, 2); - d_instantiate(dentry, new_inode); - inc_nlink(parent_inode); - fsnotify_mkdir(parent_inode, dentry); - -out: - inode_unlock(parent_inode); - return dentry; -} - -static int binder_features_show(struct 
seq_file *m, void *unused) -{ - bool *feature = m->private; - - seq_printf(m, "%d\n", *feature); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(binder_features); - -static int init_binder_features(struct super_block *sb) -{ - struct dentry *dentry, *dir; - - dir = binderfs_create_dir(sb->s_root, "features"); - if (IS_ERR(dir)) - return PTR_ERR(dir); - - dentry = binderfs_create_file(dir, "oneway_spam_detection", - &binder_features_fops, - &binder_features.oneway_spam_detection); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - return 0; -} - -static int init_binder_logs(struct super_block *sb) -{ - struct dentry *binder_logs_root_dir, *dentry, *proc_log_dir; - const struct binder_debugfs_entry *db_entry; - struct binderfs_info *info; - int ret = 0; - - binder_logs_root_dir = binderfs_create_dir(sb->s_root, - "binder_logs"); - if (IS_ERR(binder_logs_root_dir)) { - ret = PTR_ERR(binder_logs_root_dir); - goto out; - } - - binder_for_each_debugfs_entry(db_entry) { - dentry = binderfs_create_file(binder_logs_root_dir, - db_entry->name, - db_entry->fops, - db_entry->data); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - goto out; - } - } - - proc_log_dir = binderfs_create_dir(binder_logs_root_dir, "proc"); - if (IS_ERR(proc_log_dir)) { - ret = PTR_ERR(proc_log_dir); - goto out; - } - info = sb->s_fs_info; - info->proc_log_dir = proc_log_dir; - -out: - return ret; -} - -static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc) -{ - int ret; - struct binderfs_info *info; - struct binderfs_mount_opts *ctx = fc->fs_private; - struct inode *inode = NULL; - struct binderfs_device device_info = {}; - const char *name; - size_t len; - - sb->s_blocksize = PAGE_SIZE; - sb->s_blocksize_bits = PAGE_SHIFT; - - /* - * The binderfs filesystem can be mounted by userns root in a - * non-initial userns. 
By default such mounts have the MS_NODEV flag - * set in s_iflags to prevent security issues where userns root can - * just create random device nodes via mknod() since it owns the - * filesystem mount. But binderfs does not allow to create any files - * including devices nodes. The only way to create binder devices nodes - * is through the binder-control device which userns root is explicitly - * allowed to do. So removing the MS_NODEV flag from s_iflags is both - * necessary and safe. - */ - sb->s_iflags &= ~MS_NODEV; - sb->s_iflags |= SB_I_NOEXEC; - sb->s_magic = BINDERFS_SUPER_MAGIC; - sb->s_op = &binderfs_super_ops; - sb->s_time_gran = 1; - - sb->s_fs_info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL); - if (!sb->s_fs_info) - return -ENOMEM; - info = sb->s_fs_info; - - info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); - - info->root_gid = make_kgid(sb->s_user_ns, 0); - if (!gid_valid(info->root_gid)) - info->root_gid = GLOBAL_ROOT_GID; - info->root_uid = make_kuid(sb->s_user_ns, 0); - if (!uid_valid(info->root_uid)) - info->root_uid = GLOBAL_ROOT_UID; - info->mount_opts.max = ctx->max; - info->mount_opts.stats_mode = ctx->stats_mode; - - inode = new_inode(sb); - if (!inode) - return -ENOMEM; - - inode->i_ino = FIRST_INODE; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | 0755; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - inode->i_op = &binderfs_dir_inode_operations; - set_nlink(inode, 2); - - sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; - - ret = binderfs_binder_ctl_create(sb); - if (ret) - return ret; - - name = binder_devices_param; - for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { - strscpy(device_info.name, name, len + 1); - ret = binderfs_binder_device_create(inode, NULL, &device_info); - if (ret) - return ret; - name += len; - if (*name == ',') - name++; - } - - ret = init_binder_features(sb); - if (ret) - return ret; - - if 
(info->mount_opts.stats_mode == binderfs_stats_mode_global) - return init_binder_logs(sb); - - return 0; -} - -static int binderfs_fs_context_get_tree(struct fs_context *fc) -{ - return get_tree_nodev(fc, binderfs_fill_super); -} - -static void binderfs_fs_context_free(struct fs_context *fc) -{ - struct binderfs_mount_opts *ctx = fc->fs_private; - - kfree(ctx); -} - -static const struct fs_context_operations binderfs_fs_context_ops = { - .free = binderfs_fs_context_free, - .get_tree = binderfs_fs_context_get_tree, - .parse_param = binderfs_fs_context_parse_param, - .reconfigure = binderfs_fs_context_reconfigure, -}; - -static int binderfs_init_fs_context(struct fs_context *fc) -{ - struct binderfs_mount_opts *ctx; - - ctx = kzalloc(sizeof(struct binderfs_mount_opts), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->max = BINDERFS_MAX_MINOR; - ctx->stats_mode = binderfs_stats_mode_unset; - - fc->fs_private = ctx; - fc->ops = &binderfs_fs_context_ops; - - return 0; -} - -static struct file_system_type binder_fs_type = { - .name = "binder", - .init_fs_context = binderfs_init_fs_context, - .parameters = binderfs_fs_parameters, - .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, -}; - -int __init init_binderfs(void) -{ - int ret; - const char *name; - size_t len; - - /* Verify that the default binderfs device names are valid. */ - name = binder_devices_param; - for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { - if (len > BINDERFS_MAX_NAME) - return -E2BIG; - name += len; - if (*name == ',') - name++; - } - - /* Allocate new major number for binderfs. 
*/ - ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR, - "binder"); - if (ret) - return ret; - - ret = register_filesystem(&binder_fs_type); - if (ret) { - unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR); - return ret; - } - - return ret; -} diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c deleted file mode 100644 index fd718ab02392..000000000000 --- a/drivers/android/vendor_hooks.c +++ /dev/null @@ -1,433 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* vendor_hook.c - * - * Android Vendor Hook Support - * - * Copyright 2020 Google LLC - */ - -#ifndef __GENKSYMS__ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include <../fs/mount.h> -#include <../kernel/audit.h> -#include <../kernel/locking/mutex.h> -#include <../net/can/af_can.h> -#include <../net/tipc/bearer.h> -#include <../kernel/printk/printk_ringbuffer.h> -#endif - -#define CREATE_TRACE_POINTS -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef __GENKSYMS__ -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef __GENKSYMS__ -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -/* - * Export tracepoints that act as a bare tracehook (ie: have no trace event - * associated with them) to allow external modules to probe them. 
- */ -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_refrigerator); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_alloc); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_free); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_alloc); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_free); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_arch_set_freq_scale); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_is_fpsimd_save); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_transaction_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_priority_skip); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_set_priority); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_restore_priority); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wakeup_ilocked); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_send_sig_info); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_futex_plist_add); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_sleep_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_futex); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wait_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wait_end); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_traverse_plist); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_this); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_up_q_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_process_killed); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_killed_process); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_can_spin_on_owner); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_start); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_opt_spin_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_opt_spin_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_can_spin_on_owner); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sched_show_task); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_enter); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_exit); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mpam_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_resume); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_suspend); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_wq_lockup_pool); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ipi_stop); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sysrq_crash); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_hotplug); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller_id); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_ext_header); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_gic_v3_set_affinity); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_set_affinity); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_v3_affinity_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_suspend_epoch_val); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_resume_epoch_val); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_table_limits); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_resolve_freq); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_fast_switch); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_target); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_offline); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_skip_swapcache_flags); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_gfp_zone_flags); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_readahead_gfp_mask); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_readahead_gfp_mask); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_rmqueue_bulk); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_disable); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_enable); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_disable); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_enable); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_attach); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_can_attach); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_online); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_enter); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_exit); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_size_check); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_format_check); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_dump_buffer); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_fill_prdt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_complete_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_reprogram_all_keys); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_prepare_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sysfs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_compl_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_set_task); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_syscall_prctl_finished); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_uic_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_tm_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_check_int_errors); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sdev); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_clock_scaling); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_use_mcq_hooks); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_max_tag); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_map_tag); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_set_sqid); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_handler); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_make_hba_operational); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_hba_capabilities); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_print_trs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_send_command); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_config); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_has_oustanding_reqs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_get_outstanding_reqs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_abort); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_clear_cmd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_clear_pending); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_retry_complete); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_attach); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_setup_dma_ops); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_alloc_iova); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_free_iova); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ptype_head); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kfree_skb); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_timer_calc_index); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_allow_domain_state); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_enter); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_exit); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cgroup_force_kthread_migration); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wait_for_work); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction_entry); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_select_worklist_ilocked); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sync_txn_recvd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_topology_flags_workfn); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpufreq_transition); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_add_request); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_update_request); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_remove_request); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_balance_anon_file_reclaim); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_show_max_freq); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_handle_failed_page_trylock); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_clear); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_get_result); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_page_trylock); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_referenced_check_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drain_all_pages_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_drain_all_pages_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pcplist_add_cma_pages_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_slab_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_insert); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_delete); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_replace); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_lookup); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_commit_creds); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_exit_creds); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_override_creds); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_revert_creds); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_mutex_lock_starttime); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_rtmutex_lock_starttime); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_rwsem_lock_starttime); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_pcpu_rwsem_starttime); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_nx); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_rw); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_before_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_after_init); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_oom_check_panic); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_mmap_file); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_file_open); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_bpf_syscall); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf_pr_cont); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks_dn); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_meminfo_proc_show); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_exit_mm); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_slowpath); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_mem); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_print_slabinfo_header); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_shrink_slab); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cache_show); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_report_bug); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_watchdog_timer_softlockup); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo_logging); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo_unfrozen); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_die_kernel_fault); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sea); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_mem_abort); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sp_pc_abort); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_undefinstr); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_ptrauth_fault); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_panic_unhandled); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_arm64_serror_panic); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_serror); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_vmpressure); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sha256); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_expandkey); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_encrypt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_decrypt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_request_freq); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_target_freq); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_register); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_unregister); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_disable_thermal_cooling_stats); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_enable_thermal_power_throttle); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_throttle_update); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_finished); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_rwsem_list_add); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_power_cap); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_tk_based_time_sync); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kswapd_per_node); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_vendor_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_ep_action); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_synctype); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_suspend); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_connect); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_audio_usb_offload_disconnect); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_atomic_remove_fb); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drm_atomic_check_modeset); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_get_thermal_zone_device); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_psci_tos_resident_on); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_psci_cpu_suspend); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_usb_new_device_added); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_regmap_update); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_mutex_list_add); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_unlock_slowpath); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake_finish); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dma_buf_release); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dmabuf_heap_flags_validation); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pass_input_event); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_check_status); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmap_region); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_unmap_one); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_node_memcgs); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sdio_pm_flag_set); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_scan_type); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_swappiness); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_partial_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_cache_card_properties); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_print_transaction_info); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_handle_tlb_conf); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_init); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_calc_decayed_watermark); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_watermark); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_reset); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_mq_rw_recovery); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sd_update_bus_speed_mode); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_attach_sd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sdhci_get_cd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_gpio_cd_irqt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_update_partition_status); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sd_update_cmdline_timing); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sd_update_dataline_timing); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_partition_status); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_sd_cmdline_timing); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_sd_dataline_timing); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cfg80211_set_context); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cfg80211_get_context); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_track_hash); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_id_remove); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_offline); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_online); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_free); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_alloc); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kmalloc_slab); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpuset_fork); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_looper_state_registered); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_thread_read); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_free_proc); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_thread_release); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_has_work_ilocked); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_read_done); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_v2_resume); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_exit_signal); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_alloc_new_buf_locked); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_reply); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_trans); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_preset); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_memcg_scan_type); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_thermal_stats); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_new_ref); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_del_ref); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_mapcount_pages); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_traversal_lruvec); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_page_mapcount); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_page_to_lrulist); 
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_del_page_from_lrulist); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_should_be_protected); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mark_page_accessed); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_ffu_update_cid); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_uid); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_free_user); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_cpu_get_power); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_cache_forced_ra); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reclaim_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_failure_bypass); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_acct_update_power); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rmqueue); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_inactive_ratio); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_hibernation_swap); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_cpu_resume); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_hib_resume_bdev); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_dma_buf_stats_teardown); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_madvise_cold_or_pageout); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_retry); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_encrypt_page); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_init_aes_encrypt); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_skip_swap_map_write); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_post_image_save); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dm_update_clone_bio); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ctl_dirty_rate); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_direct_io_update_bio); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_loop_prepare_cmd); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psi_event); -EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psi_group); -/* - * For type visibility - */ -const struct readahead_control *GKI_struct_readahead_control; -EXPORT_SYMBOL_GPL(GKI_struct_readahead_control); diff --git a/drivers/input/input.c b/drivers/input/input.c 
index 6cbdf2737004..378717d1b3b4 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -377,19 +377,11 @@ static int input_get_disposition(struct input_dev *dev, return disposition; } -#ifdef CONFIG_KSU_MANUAL_HOOK -extern bool ksu_input_hook __read_mostly; -extern int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code, int *value); -#endif static void input_handle_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { int disposition = input_get_disposition(dev, type, code, &value); -#ifdef CONFIG_KSU_MANUAL_HOOK - if (unlikely(ksu_input_hook)) - ksu_handle_input_handle_event(&type, &code, &value); -#endif if (disposition != INPUT_IGNORE_EVENT && type != EV_SYN) add_input_randomness(type, code, value); diff --git a/drivers/input/touchscreen/fts_521/fts.c b/drivers/input/touchscreen/fts_521/fts.c index 4722476a4c2d..e0750f28a609 100644 --- a/drivers/input/touchscreen/fts_521/fts.c +++ b/drivers/input/touchscreen/fts_521/fts.c @@ -2723,15 +2723,15 @@ static void fts_enter_pointer_event_handler(struct fts_ts_info *info, input_report_key(info->input_dev, BTN_TOOL_FINGER, 1); /*input_report_abs(info->input_dev, ABS_MT_TRACKING_ID, touchId); */ - input_report_abs(info->input_dev, ABS_MT_POSITION_X, x); - input_report_abs(info->input_dev, ABS_MT_POSITION_Y, y); - input_report_abs(info->input_dev, ABS_MT_TOUCH_MAJOR, z); - input_report_abs(info->input_dev, ABS_MT_TOUCH_MINOR, z); - input_report_abs(info->input_dev, ABS_MT_DISTANCE, distance); + input_report_abs(info->input_dev, ABS_MT_POSITION_X, x); + input_report_abs(info->input_dev, ABS_MT_POSITION_Y, y); + input_report_abs(info->input_dev, ABS_MT_TOUCH_MAJOR, z); + input_report_abs(info->input_dev, ABS_MT_TOUCH_MINOR, z); + input_report_abs(info->input_dev, ABS_MT_DISTANCE, distance); #ifdef CONFIG_INPUT_PRESS_NDT - input_report_abs(info->input_dev, ABS_MT_PRESSURE, z); + input_report_abs(info->input_dev, ABS_MT_PRESSURE, z); #endif - input_sync(info->input_dev); 
+ input_sync(info->input_dev); /* pr_info("%s: Event 0x%02x - ID[%d], (x, y, z) = (%3d, %3d, %3d) type = %d\n", __func__, *event, touchId, x, y, z, touchType); */ diff --git a/drivers/kernelsu/Kbuild b/drivers/kernelsu/Kbuild deleted file mode 100644 index 800da52d0892..000000000000 --- a/drivers/kernelsu/Kbuild +++ /dev/null @@ -1,26 +0,0 @@ -obj-y += ksuinit.o -obj-y += allowlist.o -obj-y += app_profile.o -obj-y += apk_sign.o -obj-y += sucompat.o -obj-y += throne_tracker.o -obj-y += setuid_hook.o -obj-y += kernel_compat.o -obj-y += kernel_umount.o -obj-y += supercalls.o -obj-y += feature.o -obj-y += ksud.o -obj-y += file_wrapper.o -obj-y += su_mount_ns.o -obj-y += shim.o -obj-y += selinux/selinux.o -obj-y += selinux/sepolicy.o -obj-y += selinux/rules.o - -ccflags-y += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include -ccflags-y += -I$(objtree)/security/selinux -include $(srctree)/include/uapi/asm-generic/errno.h - -ccflags-y += -Wno-strict-prototypes -Wno-int-conversion -Wno-gcc-compat -ccflags-y += -Wno-declaration-after-statement -Wno-unused-function -Wno-missing-prototypes - -# Keep a new line here !! Because someone may append config diff --git a/drivers/kernelsu/Kconfig b/drivers/kernelsu/Kconfig index 8464a6c4ca4b..10608831444f 100644 --- a/drivers/kernelsu/Kconfig +++ b/drivers/kernelsu/Kconfig @@ -1,48 +1,77 @@ menu "KernelSU" config KSU - tristate "KernelSU function support" - default y + bool "KernelSU function support" + depends on !CPU_BIG_ENDIAN + depends on SECURITY_SELINUX + select SECCOMP + default n help Enable kernel-level root privileges on Android System. - To compile as a module, choose M here: the - module will be called kernelsu. -config KSU_DEBUG - bool "KernelSU debug mode" - depends on KSU +config KSU_KPROBES_KSUD + bool "Enable dynamic kprobes for early boot hooks" + depends on KPROBES && KRETPROBES + default y + help + Use dynamic hooks via kprobes for functions only + on early boot. 
Hooks are unregistered at boot complete + to reduce overhead. + +config KSU_TAMPER_SYSCALL_TABLE + bool "EXPERIMENTAL: tamper sys_call_table for sucompat + sys_reboot" + depends on (ARM || ARM64) && !CFI_CLANG && !CFI default n help - Enable KernelSU debug mode. + EXPERIMENTAL: use syscall table hijacking method demonstrated on zx2c4's + kernel-assisted-superuser. Replaces sys_reboot, sys_execve, sys_newfstatat, + sys_faccessat, sys_newfstat_ret manual hooks. + Personally tested on Linux 3.10 ~ 4.14, aarch64. -config KSU_ALLOWLIST_WORKAROUND - bool "KernelSU allowlist workaround" +config KSU_FEATURE_SULOG + bool "KernelSU SU Logging feature" depends on KSU - default n + default y help - Enable workaround for broken allowlist save + Build KernelSU's SU Log. -choice - prompt "KernelSU hooks" - default KSU_MANUAL_HOOK if !KPROBES - default KSU_SYSCALL_HOOK if KPROBES && KRETPROBES && HAVE_SYSCALL_TRACEPOINTS +config KSU_FEATURE_ADBROOT + bool "KernelSU ADB Root feature" + depends on KSU + default y help - KernelSU core hooks. + Build KernelSU's adb root feature. -config KSU_MANUAL_HOOK - bool "KernelSU manual hook mode." - depends on KSU && KSU != m +config KSU_FEATURE_SELINUX_HIDE + bool "KernelSU SELinux hide feature" + depends on KSU + default y help - Enable manual hook support. + Build KernelSU's SELinux hide feature. + This is a dumber implementation, but it should be fine for most cases. -config KSU_SYSCALL_HOOK - bool "KernelSU syscall hook mode." +config KSU_DEBUG + bool "KernelSU debug mode" depends on KSU - depends on KPROBES && KRETPROBES && HAVE_SYSCALL_TRACEPOINTS + default n help - Enable KPROBES, KRETPROBES and TRACEPOINT hook for KernelSU core. - This should not be used on kernel below 5.10. + Enable KernelSU debug mode. 
-endchoice +config KSU_THRONE_TRACKER_ALWAYS_THREADED + bool "Always run throne tracker in a kthread" + default n + help + Enable this option to run throne tracker in a kthread for the first + run, which happens at boot time / decryption stage. This can decrease + boot time, but can cause crowning failure on some FDE/FBEv1 setups. + If unsure, say n. + +config KSU_LSM_SECURITY_HOOKS + bool "Use LSM security hooks" + depends on KSU + default y + help + Disabling this is mostly useful for kernel > 6.8. + Make sure to implement manual hooks on security/security.c. endmenu diff --git a/drivers/kernelsu/Makefile b/drivers/kernelsu/Makefile new file mode 100644 index 000000000000..7c2fcedc7eac --- /dev/null +++ b/drivers/kernelsu/Makefile @@ -0,0 +1,78 @@ +# NOTE: unity build. single unit. + +obj-$(CONFIG_KSU) := ksu.o + +CFLAGS_ksu.o += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include +CFLAGS_ksu.o += -I$(objtree)/security/selinux + +# uncommon, but wont hurt, check for 3-arg security_add_hooks +ifeq ($(shell grep -A1 "void security_add_hooks" $(srctree)/include/linux/lsm_hooks.h 2>/dev/null | grep -q lsm 2>/dev/null; echo $$?),0) +CFLAGS_ksu.o += -DKSU_COMPAT_SECURITY_ADD_HOOKS_V2 +endif + +ifeq ($(shell grep -q " current_sid(void)" $(srctree)/security/selinux/include/objsec.h; echo $$?),0) +CFLAGS_ksu.o += -DKSU_COMPAT_HAS_CURRENT_SID +endif + +ifeq ($(shell grep -q "struct selinux_state " $(srctree)/security/selinux/include/security.h; echo $$?),0) +CFLAGS_ksu.o += -DKSU_COMPAT_HAS_SELINUX_STATE +endif + +ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) +CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT +endif + +# half-assed-backport from 5.1 +ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct_array;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0) +CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT_ARRAY +endif + +ifeq ($(shell grep -q "^DEFINE_RWLOCK(policy_rwlock);" 
$(srctree)/security/selinux/ss/services.c; echo $$?),0) +CFLAGS_ksu.o += -DKSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK +endif + +ifeq ($(shell grep -q "cpus_ptr;" $(srctree)/include/linux/sched.h; echo $$?),0) +CFLAGS_ksu.o += -DKSU_COMPAT_HAS_BACKPORTED_CPUS_PTR +endif + +ifeq ($(shell grep -q "^struct security_operations selinux_ops" $(srctree)/security/selinux/hooks.c; echo $$?),0) +CFLAGS_ksu.o += -DKSU_HAS_EXPORTED_SELINUX_OPS +endif + +# UL, look for read_iter on f_op struct +ifeq ($(shell grep -q "read_iter" $(srctree)/include/linux/fs.h 2>/dev/null; echo $$?),0) +CFLAGS_ksu.o += -DKSU_HAS_FOP_READ_ITER +endif + +# UL, look for iterate_dir on ‎fs/readdir.c +ifeq ($(shell grep -q "^int iterate_dir" $(srctree)/fs/readdir.c 2>/dev/null; echo $$?),0) +CFLAGS_ksu.o += -DKSU_HAS_ITERATE_DIR +endif + +CFLAGS_ksu.o += -Wno-implicit-function-declaration -Wno-strict-prototypes -Wno-declaration-after-statement +CFLAGS_ksu.o += -Wno-int-conversion -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast +CFLAGS_ksu.o += -Wno-unused-variable -Wno-unused-function -Wno-format +CFLAGS_ksu.o += -Wno-macro-redefined + +# dont be too strict +CFLAGS_REMOVE_ksu.o += -Werror + +# so we can see stack use atleast, as we disable all stack safety here +CFLAGS_ksu.o += $(call cc-option, -Wframe-larger-than=1024) + +# to make sure we can use builtins +CFLAGS_REMOVE_ksu.o += -fno-builtin + +ifneq ($(CONFIG_KSU_DEBUG),y) +# strip, remove tracing / profiling +# comment out if proper backtrace is needed +CFLAGS_ksu.o += -g0 -fno-unwind-tables -fno-asynchronous-unwind-tables -fomit-frame-pointer +CFLAGS_REMOVE_ksu.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ksu.o += -pg + +# if cflags can be macro'd, this will be called 'TRUST_ME' +CFLAGS_ksu.o += -fno-stack-protector -fno-stack-check +CFLAGS_REMOVE_ksu.o += -fsanitize=shadow-call-stack +endif # CONFIG_KSU_DEBUG + +# Keep a new line here!! 
Because someone may append config diff --git a/drivers/kernelsu/allowlist.c b/drivers/kernelsu/allowlist.c deleted file mode 100644 index 9152b7174b6c..000000000000 --- a/drivers/kernelsu/allowlist.c +++ /dev/null @@ -1,576 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -#include -#else -#include -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) -#include -#endif - -#include "klog.h" // IWYU pragma: keep -#include "ksud.h" -#include "selinux/selinux.h" -#include "allowlist.h" -#include "manager.h" -#include "kernel_compat.h" -#include "su_mount_ns.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "syscall_handler.h" -#endif - -#define FILE_MAGIC 0x7f4b5355 // ' KSU', u32 -#define FILE_FORMAT_VERSION 3 // u32 - -#define KSU_APP_PROFILE_PRESERVE_UID 9999 // NOBODY_UID -#define KSU_DEFAULT_SELINUX_DOMAIN "u:r:" KERNEL_SU_DOMAIN ":s0" - -static DEFINE_MUTEX(allowlist_mutex); - -// default profiles, these may be used frequently, so we cache it -static struct root_profile default_root_profile; -static struct non_root_profile default_non_root_profile; - -static int allow_list_arr[PAGE_SIZE / sizeof(int)] __read_mostly - __aligned(PAGE_SIZE); -static int allow_list_pointer __read_mostly = 0; - -static void remove_uid_from_arr(uid_t uid) -{ - int *temp_arr; - int i, j; - - if (allow_list_pointer == 0) - return; - - temp_arr = kzalloc(sizeof(allow_list_arr), GFP_KERNEL); - if (temp_arr == NULL) { - pr_err("%s: unable to allocate memory\n", __func__); - return; - } - - for (i = j = 0; i < allow_list_pointer; i++) { - if (allow_list_arr[i] == uid) - continue; - temp_arr[j++] = allow_list_arr[i]; - } - - allow_list_pointer = j; - - for (; j < ARRAY_SIZE(allow_list_arr); j++) - temp_arr[j] = -1; - - memcpy(&allow_list_arr, temp_arr, PAGE_SIZE); - kfree(temp_arr); -} - -static void init_default_profiles(void) -{ - kernel_cap_t full_cap = 
CAP_FULL_SET; - - default_root_profile.uid = 0; - default_root_profile.gid = 0; - default_root_profile.groups_count = 1; - default_root_profile.groups[0] = 0; - memcpy(&default_root_profile.capabilities.effective, &full_cap, - sizeof(default_root_profile.capabilities.effective)); - default_root_profile.namespaces = KSU_NS_INHERITED; - strcpy(default_root_profile.selinux_domain, KSU_DEFAULT_SELINUX_DOMAIN); - - // This means that we will umount modules by default! - default_non_root_profile.umount_modules = true; -} - -struct perm_data { - struct list_head list; - struct app_profile profile; -}; - -static struct list_head allow_list; - -static uint8_t allow_list_bitmap[PAGE_SIZE] __read_mostly __aligned(PAGE_SIZE); -#define BITMAP_UID_MAX ((sizeof(allow_list_bitmap) * BITS_PER_BYTE) - 1) - -#define KERNEL_SU_ALLOWLIST "/data/adb/ksu/.allowlist" - -void persistent_allow_list(void); - -void ksu_show_allow_list(void) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - pr_info("ksu_show_allow_list\n"); - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - pr_info("uid :%d, allow: %d\n", p->profile.current_uid, - p->profile.allow_su); - } -} - -#ifdef CONFIG_KSU_DEBUG -static void ksu_grant_root_to_shell(void) -{ - struct app_profile profile = { - .version = KSU_APP_PROFILE_VER, - .allow_su = true, - .current_uid = 2000, - }; - strcpy(profile.key, "com.android.shell"); - strcpy(profile.rp_config.profile.selinux_domain, - KSU_DEFAULT_SELINUX_DOMAIN); - ksu_set_app_profile(&profile, false); -} -#endif - -bool ksu_get_app_profile(struct app_profile *profile) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - bool found = false; - - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - bool uid_match = profile->current_uid == p->profile.current_uid; - if (uid_match) { - // found it, override it with ours - memcpy(profile, &p->profile, sizeof(*profile)); - found = true; - goto exit; - 
} - } - -exit: - return found; -} - -static inline bool forbid_system_uid(uid_t uid) -{ -#define SHELL_UID 2000 -#define SYSTEM_UID 1000 - return uid < SHELL_UID && uid != SYSTEM_UID; -} - -static bool profile_valid(struct app_profile *profile) -{ - if (!profile) { - return false; - } - - if (profile->version < KSU_APP_PROFILE_VER) { - pr_info("Unsupported profile version: %d\n", profile->version); - return false; - } - - if (profile->allow_su) { - if (profile->rp_config.profile.groups_count > KSU_MAX_GROUPS) { - return false; - } - - if (strlen(profile->rp_config.profile.selinux_domain) == 0) { - return false; - } - } - - return true; -} - -bool ksu_set_app_profile(struct app_profile *profile, bool persist) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - bool result = false; - - if (!profile_valid(profile)) { - pr_err("Failed to set app profile: invalid profile!\n"); - return false; - } - - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - // both uid and package must match, otherwise it will break multiple package with different user id - if (profile->current_uid == p->profile.current_uid && - !strcmp(profile->key, p->profile.key)) { - // found it, just override it all! - memcpy(&p->profile, profile, sizeof(*profile)); - result = true; - goto out; - } - } - - // not found, alloc a new node! 
- p = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL); - if (!p) { - pr_err("ksu_set_app_profile alloc failed\n"); - return false; - } - - memcpy(&p->profile, profile, sizeof(*profile)); - if (profile->allow_su) { - pr_info("set root profile, key: %s, uid: %d, gid: %d, context: %s\n", - profile->key, profile->current_uid, - profile->rp_config.profile.gid, - profile->rp_config.profile.selinux_domain); - } else { - pr_info("set app profile, key: %s, uid: %d, umount modules: %d\n", - profile->key, profile->current_uid, - profile->nrp_config.profile.umount_modules); - } - list_add_tail(&p->list, &allow_list); - -out: - if (profile->current_uid <= BITMAP_UID_MAX) { - if (profile->allow_su) - allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] |= - 1 << (profile->current_uid % BITS_PER_BYTE); - else - allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] &= - ~(1 << (profile->current_uid % BITS_PER_BYTE)); - } else { - if (profile->allow_su) { - /* - * 1024 apps with uid higher than BITMAP_UID_MAX - * registered to request superuser? - */ - if (allow_list_pointer >= ARRAY_SIZE(allow_list_arr)) { - pr_err("too many apps registered\n"); - WARN_ON(1); - return false; - } - allow_list_arr[allow_list_pointer++] = - profile->current_uid; - } else { - remove_uid_from_arr(profile->current_uid); - } - } - result = true; - - // check if the default profiles is changed, cache it to a single struct to accelerate access. 
- if (unlikely(!strcmp(profile->key, "$"))) { - // set default non root profile - memcpy(&default_non_root_profile, &profile->nrp_config.profile, - sizeof(default_non_root_profile)); - } - - if (unlikely(!strcmp(profile->key, "#"))) { - // set default root profile - memcpy(&default_root_profile, &profile->rp_config.profile, - sizeof(default_root_profile)); - } - - if (persist) { - persistent_allow_list(); -#ifdef CONFIG_KSU_SYSCALL_HOOK - // FIXME: use a new flag - ksu_mark_running_process(); -#endif - } - - return result; -} - -bool __ksu_is_allow_uid(uid_t uid) -{ - int i; - - if (forbid_system_uid(uid)) { - // do not bother going through the list if it's system - return false; - } - - if (likely(ksu_is_manager_appid_valid()) && - unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { - // manager is always allowed! - return true; - } - - if (likely(uid <= BITMAP_UID_MAX)) { - return !!(allow_list_bitmap[uid / BITS_PER_BYTE] & - (1 << (uid % BITS_PER_BYTE))); - } else { - for (i = 0; i < allow_list_pointer; i++) { - if (allow_list_arr[i] == uid) - return true; - } - } - - return false; -} - -bool __ksu_is_allow_uid_for_current(uid_t uid) -{ - if (unlikely(uid == 0)) { - // already root, but only allow our domain. - return is_ksu_domain(); - } - return __ksu_is_allow_uid(uid); -} - -bool ksu_uid_should_umount(uid_t uid) -{ - struct app_profile profile = { .current_uid = uid }; - - if (likely(ksu_is_manager_appid_valid()) && - unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { - // we should not umount on manager! 
- return false; - } - - bool found = ksu_get_app_profile(&profile); - if (!found) { - // no app profile found, it must be non root app - return default_non_root_profile.umount_modules; - } - if (profile.allow_su) { - // if found and it is granted to su, we shouldn't umount for it - return false; - } else { - // found an app profile - if (profile.nrp_config.use_default) { - return default_non_root_profile.umount_modules; - } else { - return profile.nrp_config.profile.umount_modules; - } - } -} - -struct root_profile *ksu_get_root_profile(uid_t uid) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - if (uid == p->profile.current_uid && p->profile.allow_su) { - if (!p->profile.rp_config.use_default) { - return &p->profile.rp_config.profile; - } - } - } - - // use default profile - return &default_root_profile; -} - -bool ksu_get_allow_list(int *array, int *length, bool allow) -{ - struct perm_data *p = NULL; - struct list_head *pos = NULL; - int i = 0; - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - // pr_info("get_allow_list uid: %d allow: %d\n", p->uid, p->allow); - if (p->profile.allow_su == allow) { - array[i++] = p->profile.current_uid; - } - } - *length = i; - - return true; -} - -static void do_persistent_allow_list(struct callback_head *_cb) -{ - u32 magic = FILE_MAGIC; - u32 version = FILE_FORMAT_VERSION; - struct perm_data *p = NULL; - struct list_head *pos = NULL; - loff_t off = 0; - - mutex_lock(&allowlist_mutex); - struct file *fp = ksu_filp_open_compat( - KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644); - if (IS_ERR(fp)) { - pr_err("save_allow_list create file failed: %ld\n", - PTR_ERR(fp)); - goto unlock; - } - - // store magic and version - if (ksu_kernel_write_compat(fp, &magic, sizeof(magic), &off) != - sizeof(magic)) { - pr_err("save_allow_list write magic failed.\n"); - goto close_file; - } - - if 
(ksu_kernel_write_compat(fp, &version, sizeof(version), &off) != - sizeof(version)) { - pr_err("save_allow_list write version failed.\n"); - goto close_file; - } - - list_for_each (pos, &allow_list) { - p = list_entry(pos, struct perm_data, list); - pr_info("save allow list, name: %s uid :%d, allow: %d\n", - p->profile.key, p->profile.current_uid, - p->profile.allow_su); - - ksu_kernel_write_compat(fp, &p->profile, sizeof(p->profile), - &off); - } - -close_file: - filp_close(fp, 0); -unlock: - mutex_unlock(&allowlist_mutex); - kfree(_cb); -} - -void persistent_allow_list(void) -{ - struct task_struct *tsk; - - tsk = get_pid_task(find_vpid(1), PIDTYPE_PID); - if (!tsk) { - pr_err("save_allow_list find init task err\n"); - return; - } - - struct callback_head *cb = - kzalloc(sizeof(struct callback_head), GFP_KERNEL); - if (!cb) { - pr_err("save_allow_list alloc cb err\b"); - goto put_task; - } - cb->func = do_persistent_allow_list; - if (task_work_add(tsk, cb, TWA_RESUME)) { - kfree(cb); - pr_warn("save_allow_list add task_work failed\n"); - } - -put_task: - put_task_struct(tsk); -} - -void ksu_load_allow_list(void) -{ - loff_t off = 0; - ssize_t ret = 0; - struct file *fp = NULL; - u32 magic; - u32 version; - -#ifdef CONFIG_KSU_DEBUG - // always allow adb shell by default - ksu_grant_root_to_shell(); -#endif - - // load allowlist now! 
- fp = ksu_filp_open_compat(KERNEL_SU_ALLOWLIST, O_RDONLY, 0); - if (IS_ERR(fp)) { - pr_err("load_allow_list open file failed: %ld\n", PTR_ERR(fp)); - return; - } - - // verify magic - if (ksu_kernel_read_compat(fp, &magic, sizeof(magic), &off) != - sizeof(magic) || - magic != FILE_MAGIC) { - pr_err("allowlist file invalid: %d!\n", magic); - goto exit; - } - - if (ksu_kernel_read_compat(fp, &version, sizeof(version), &off) != - sizeof(version)) { - pr_err("allowlist read version: %d failed\n", version); - goto exit; - } - - pr_info("allowlist version: %d\n", version); - - while (true) { - struct app_profile profile; - - ret = ksu_kernel_read_compat(fp, &profile, sizeof(profile), - &off); - - if (ret <= 0) { - pr_info("load_allow_list read err: %zd\n", ret); - break; - } - - pr_info("load_allow_uid, name: %s, uid: %d, allow: %d\n", - profile.key, profile.current_uid, profile.allow_su); - ksu_set_app_profile(&profile, false); - } - -exit: - ksu_show_allow_list(); - filp_close(fp, 0); -} - -void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *), - void *data) -{ - struct perm_data *np, *n = NULL; - - if (!ksu_boot_completed) { - pr_info("boot not completed, skip prune\n"); - return; - } - - bool modified = false; - // TODO: use RCU! - mutex_lock(&allowlist_mutex); - list_for_each_entry_safe (np, n, &allow_list, list) { - uid_t uid = np->profile.current_uid; - char *package = np->profile.key; - // we use this uid for special cases, don't prune it! 
- bool is_preserved_uid = uid == KSU_APP_PROFILE_PRESERVE_UID; - if (!is_preserved_uid && !is_uid_valid(uid, package, data)) { - modified = true; - pr_info("prune uid: %d, package: %s\n", uid, package); - list_del(&np->list); - if (likely(uid <= BITMAP_UID_MAX)) { - allow_list_bitmap[uid / BITS_PER_BYTE] &= - ~(1 << (uid % BITS_PER_BYTE)); - } - remove_uid_from_arr(uid); - smp_mb(); - kfree(np); - } - } - mutex_unlock(&allowlist_mutex); - - if (modified) { - persistent_allow_list(); - } -} - -void ksu_allowlist_init(void) -{ - int i; - - BUILD_BUG_ON(sizeof(allow_list_bitmap) != PAGE_SIZE); - BUILD_BUG_ON(sizeof(allow_list_arr) != PAGE_SIZE); - - for (i = 0; i < ARRAY_SIZE(allow_list_arr); i++) - allow_list_arr[i] = -1; - - INIT_LIST_HEAD(&allow_list); - - init_default_profiles(); -} - -void ksu_allowlist_exit(void) -{ - struct perm_data *np, *n = NULL; - - // free allowlist - mutex_lock(&allowlist_mutex); - list_for_each_entry_safe (np, n, &allow_list, list) { - list_del(&np->list); - kfree(np); - } - mutex_unlock(&allowlist_mutex); -} diff --git a/drivers/kernelsu/app_profile.c b/drivers/kernelsu/app_profile.c deleted file mode 100644 index 4d2f333ebffd..000000000000 --- a/drivers/kernelsu/app_profile.c +++ /dev/null @@ -1,206 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include // signal_struct -#include -#endif -#include -#include -#include -#include -#include - -#include "allowlist.h" -#include "app_profile.h" -#include "arch.h" -#include "kernel_compat.h" -#include "klog.h" // IWYU pragma: keep -#include "selinux/selinux.h" -#include "su_mount_ns.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "syscall_handler.h" -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 7, 0) -static struct group_info root_groups = { - .usage = REFCOUNT_INIT(2), -}; -#else -static struct group_info root_groups = { .usage = ATOMIC_INIT(2) }; -#endif - -void setup_groups(struct root_profile 
*profile, struct cred *cred) -{ - if (profile->groups_count > KSU_MAX_GROUPS) { - pr_warn("Failed to setgroups, too large group: %d!\n", - profile->uid); - return; - } - - if (profile->groups_count == 1 && profile->groups[0] == 0) { - // setgroup to root and return early. - if (cred->group_info) - put_group_info(cred->group_info); - cred->group_info = get_group_info(&root_groups); - return; - } - - u32 ngroups = profile->groups_count; - struct group_info *group_info = groups_alloc(ngroups); - if (!group_info) { - pr_warn("Failed to setgroups, ENOMEM for: %d\n", profile->uid); - return; - } - - int i; - for (i = 0; i < ngroups; i++) { - gid_t gid = profile->groups[i]; - kgid_t kgid = make_kgid(current_user_ns(), gid); - if (!gid_valid(kgid)) { - pr_warn("Failed to setgroups, invalid gid: %d\n", gid); - put_group_info(group_info); - return; - } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) - group_info->gid[i] = kgid; -#else - GROUP_AT(group_info, i) = kgid; -#endif - } - - groups_sort(group_info); - set_groups(cred, group_info); - put_group_info(group_info); -} - -static void do_disable_seccomp(void) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) - struct task_struct *fake; - fake = kmalloc(sizeof(*fake), GFP_ATOMIC); - if (!fake) { - pr_err("%s: cannot allocate fake struct!\n", __func__); - return; - } -#endif - - // Refer to kernel/seccomp.c: seccomp_set_mode_strict - // When disabling Seccomp, ensure that current->sighand->siglock is held during the operation. 
- spin_lock_irq(¤t->sighand->siglock); - // disable seccomp -#if defined(CONFIG_GENERIC_ENTRY) && \ - LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - clear_syscall_work(SECCOMP); -#else - clear_thread_flag(TIF_SECCOMP); -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) - memcpy(fake, current, sizeof(*fake)); -#endif - current->seccomp.mode = 0; -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) - // put_seccomp_filter is allowed while we holding sighand - put_seccomp_filter(current); -#endif - current->seccomp.filter = NULL; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 7, 0) - atomic_set(¤t->seccomp.filter_count, 0); -#endif - spin_unlock_irq(¤t->sighand->siglock); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) - // https://github.com/torvalds/linux/commit/bfafe5efa9754ebc991750da0bcca2a6694f3ed3#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R576-R577 - fake->flags |= PF_EXITING; -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - // https://github.com/torvalds/linux/commit/0d8315dddd2899f519fe1ca3d4d5cdaf44ea421e#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R556-R558 - fake->sighand = NULL; -#endif - seccomp_filter_release(fake); - kfree(fake); -#endif -} - -void disable_seccomp(void) -{ - // https://github.com/backslashxx/KernelSU/tree/e28930645e764b9f0e5d0d1b0d5e236464939075/kernel/app_profile.c - if (!!!current->seccomp.mode) { - return; - } - - do_disable_seccomp(); -} - -void escape_with_root_profile(void) -{ - struct cred *cred; -#ifdef CONFIG_KSU_SYSCALL_HOOK - struct task_struct *t; -#endif - - if (current_euid().val == 0) { - pr_warn("Already root, don't escape!\n"); - return; - } - - cred = prepare_creds(); - if (!cred) { - pr_warn("prepare_creds failed!\n"); - return; - } - - struct root_profile *profile = ksu_get_root_profile(cred->uid.val); - - cred->uid.val = profile->uid; - cred->suid.val = profile->uid; - cred->euid.val = 
profile->uid; - cred->fsuid.val = profile->uid; - - cred->gid.val = profile->gid; - cred->fsgid.val = profile->gid; - cred->sgid.val = profile->gid; - cred->egid.val = profile->gid; - cred->securebits = 0; - - BUILD_BUG_ON(sizeof(profile->capabilities.effective) != - sizeof(kernel_cap_t)); - - // setup capabilities - // we need CAP_DAC_READ_SEARCH becuase `/data/adb/ksud` is not accessible for non root process - // we add it here but don't add it to cap_inhertiable, it would be dropped automaticly after exec! - u64 cap_for_ksud = - profile->capabilities.effective | CAP_DAC_READ_SEARCH; - memcpy(&cred->cap_effective, &cap_for_ksud, - sizeof(cred->cap_effective)); - memcpy(&cred->cap_permitted, &profile->capabilities.effective, - sizeof(cred->cap_permitted)); - memcpy(&cred->cap_bset, &profile->capabilities.effective, - sizeof(cred->cap_bset)); - - setup_groups(profile, cred); - - commit_creds(cred); - - disable_seccomp(); - - setup_selinux(profile->selinux_domain); - -#ifdef CONFIG_KSU_SYSCALL_HOOK - for_each_thread (current, t) { - ksu_set_task_tracepoint_flag(t); - } -#endif - - setup_mount_ns(profile->namespaces); -} - -void escape_to_root_for_init(void) -{ - setup_selinux(KERNEL_SU_CONTEXT); -} diff --git a/drivers/kernelsu/app_profile.h b/drivers/kernelsu/app_profile.h deleted file mode 100644 index 1263509c2f5e..000000000000 --- a/drivers/kernelsu/app_profile.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef __KSU_H_APP_PROFILE -#define __KSU_H_APP_PROFILE - -#include - -// Forward declarations -struct cred; - -#define KSU_APP_PROFILE_VER 2 -#define KSU_MAX_PACKAGE_NAME 256 -// NGROUPS_MAX for Linux is 65535 generally, but we only supports 32 groups. 
-#define KSU_MAX_GROUPS 32 -#define KSU_SELINUX_DOMAIN 64 - -struct root_profile { - int32_t uid; - int32_t gid; - - int32_t groups_count; - int32_t groups[KSU_MAX_GROUPS]; - - // kernel_cap_t is u32[2] for capabilities v3 - struct { - u64 effective; - u64 permitted; - u64 inheritable; - } capabilities; - - char selinux_domain[KSU_SELINUX_DOMAIN]; - - int32_t namespaces; -}; - -struct non_root_profile { - bool umount_modules; -}; - -struct app_profile { - // It may be utilized for backward compatibility, although we have never explicitly made any promises regarding this. - u32 version; - - // this is usually the package of the app, but can be other value for special apps - char key[KSU_MAX_PACKAGE_NAME]; - int32_t current_uid; - bool allow_su; - - union { - struct { - bool use_default; - char template_name[KSU_MAX_PACKAGE_NAME]; - - struct root_profile profile; - } rp_config; - - struct { - bool use_default; - - struct non_root_profile profile; - } nrp_config; - }; -}; - -// Escalate current process to root with the appropriate profile -void escape_with_root_profile(void); - -void escape_to_root_for_init(void); - -#endif diff --git a/drivers/kernelsu/arch.h b/drivers/kernelsu/arch.h deleted file mode 100644 index b1c79a8c9985..000000000000 --- a/drivers/kernelsu/arch.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef __KSU_H_ARCH -#define __KSU_H_ARCH - -#include - -#if defined(__aarch64__) - -#define __PT_PARM1_REG regs[0] -#define __PT_PARM2_REG regs[1] -#define __PT_PARM3_REG regs[2] -#define __PT_SYSCALL_PARM4_REG regs[3] -#define __PT_CCALL_PARM4_REG regs[3] -#define __PT_PARM5_REG regs[4] -#define __PT_PARM6_REG regs[5] -#define __PT_RET_REG regs[30] -#define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */ -#define __PT_RC_REG regs[0] -#define __PT_SP_REG sp -#define __PT_IP_REG pc - -#define REBOOT_SYMBOL "__arm64_sys_reboot" -#define SYS_READ_SYMBOL "__arm64_sys_read" -#define SYS_EXECVE_SYMBOL "__arm64_sys_execve" - -#elif defined(__x86_64__) - 
-#define __PT_PARM1_REG di -#define __PT_PARM2_REG si -#define __PT_PARM3_REG dx -/* syscall uses r10 for PARM4 */ -#define __PT_SYSCALL_PARM4_REG r10 -#define __PT_CCALL_PARM4_REG cx -#define __PT_PARM5_REG r8 -#define __PT_PARM6_REG r9 -#define __PT_RET_REG sp -#define __PT_FP_REG bp -#define __PT_RC_REG ax -#define __PT_SP_REG sp -#define __PT_IP_REG ip - -#define REBOOT_SYMBOL "__x64_sys_reboot" -#define SYS_READ_SYMBOL "__x64_sys_read" -#define SYS_EXECVE_SYMBOL "__x64_sys_execve" - -#else -#ifdef CONFIG_KSU_SYSCALL_HOOK -#error "Unsupported arch" -#endif -#endif - -/* allow some architecutres to override `struct pt_regs` */ -#ifndef __PT_REGS_CAST -#define __PT_REGS_CAST(x) (x) -#endif - -#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG) -#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG) -#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG) -#define PT_REGS_SYSCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_SYSCALL_PARM4_REG) -#define PT_REGS_CCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_CCALL_PARM4_REG) -#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG) -#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG) -#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG) -#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG) -#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG) -#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG) -#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG) - -#define PT_REAL_REGS(regs) ((struct pt_regs *)PT_REGS_PARM1(regs)) - -#endif diff --git a/drivers/kernelsu/feature/adb_root.c b/drivers/kernelsu/feature/adb_root.c new file mode 100644 index 000000000000..125d0470e75b --- /dev/null +++ b/drivers/kernelsu/feature/adb_root.c @@ -0,0 +1,289 @@ +#ifdef CONFIG_KSU_FEATURE_ADBROOT + +static bool ksu_adb_root __read_mostly = false; + +static long is_exec_adbd(const char __user **filename_user) +{ + // should be bigger than `/apex/com.android.adbd/bin/adbd` + char buf[40] = { 0 }; + size_t copysize = 
sizeof("/apex/com.android.adbd/bin/adbd"); + + if (!!copy_from_user(buf, *filename_user, copysize)) + return 0; + + if (!!endswith(buf, "/adbd")) + return 0; + + pr_info("%s: adbd: %s \n", __func__, buf); + + return 1; +} + +static long is_libadbroot_ok() +{ + static const char kLibAdbRoot[] = "/data/adb/ksu/lib/libadbroot.so"; + struct path path; + long ret = kern_path(kLibAdbRoot, 0, &path); + if (ret < 0) { + if (ret == -ENOENT) { + pr_err("libadbroot.so not exists, skip adb root. Please run `ksud install`\n"); + ret = 0; + } else { + pr_err("access libadbroot.so failed: %ld, skip adb root\n", ret); + } + return 0; /* every lookup failure reports "not ok": callers test !is_libadbroot_ok(), so a negative errno would otherwise be treated as success */ + } else { + ret = 1; + } + path_put(&path); + return ret; +} + +// NOTE: envp is (void ***), void * const char __user * const char __user * +static long setup_ld_preload(void ***envp_arg) +{ + static const char kLdPreload[] = "LD_PRELOAD=/data/adb/ksu/lib/libadbroot.so"; + static const char kLdLibraryPath[] = "LD_LIBRARY_PATH=/data/adb/ksu/lib"; + static const size_t kReadEnvBatch = 16; + static const size_t kPtrSize = sizeof(unsigned long); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + unsigned long stackp = current_user_stack_pointer(); +#else + volatile unsigned long stackp = current->mm->start_stack; // its just a stack smash in the end, it'll work. +#endif + unsigned long envp, ld_preload_p, ld_library_path_p; + unsigned long *envp_p = (uintptr_t)envp_arg; + unsigned long *tmp_env_p = NULL, *tmp_env_p2 = NULL; + size_t env_count = 0, total_size; + long ret; + + envp = (char __user **)untagged_addr((unsigned long)*envp_p); + + ld_preload_p = stackp = ALIGN_DOWN(stackp - sizeof(kLdPreload), 8); // 2 words on 32-bit, 32-on-64 its gonna be fine dw. 
+ ret = copy_to_user(ld_preload_p, kLdPreload, sizeof(kLdPreload)); + if (ret != 0) { + pr_warn("write ld_preload when adb_root_handle_execve failed: %ld\n", ret); + return -EFAULT; + } + + ld_library_path_p = stackp = ALIGN_DOWN(stackp - sizeof(kLdLibraryPath), 8); + ret = copy_to_user(ld_library_path_p, kLdLibraryPath, sizeof(kLdLibraryPath)); + if (ret != 0) { + pr_warn("write ld_library_path when adb_root_handle_execve failed: %ld\n", ret); + return -EFAULT; + } + + for (;;) { + tmp_env_p2 = krealloc(tmp_env_p, (env_count + kReadEnvBatch + 2) * kPtrSize, GFP_KERNEL); + if (tmp_env_p2 == NULL) { + pr_err("alloc tmp env failed\n"); + ret = -ENOMEM; + goto out_release_env_p; + } + tmp_env_p = tmp_env_p2; + ret = copy_from_user(&tmp_env_p[env_count], envp + env_count * kPtrSize, kReadEnvBatch * kPtrSize); + if (ret < 0) { + pr_warn("Access envp when adb_root_handle_execve failed: %ld\n", ret); + ret = -EFAULT; + goto out_release_env_p; + } + size_t read_count = kReadEnvBatch * kPtrSize - ret; + size_t max_new_env_count = read_count / kPtrSize, new_env_count = 0; + bool meet_zero = false; + for (; new_env_count < max_new_env_count; new_env_count++) { + if (!tmp_env_p[new_env_count + env_count]) { + meet_zero = true; + break; + } + } + if (!meet_zero) { + if (read_count % kPtrSize != 0) { + pr_err("unaligned envp array!\n"); + ret = -EFAULT; + goto out_release_env_p; + } else if (ret != 0) { + pr_err("truncated envp array!\n"); + ret = -EFAULT; + goto out_release_env_p; + } + } + env_count += new_env_count; + if (meet_zero) + break; + } + + // We should have allocated enough memory + // TODO: handle existing LD_PRELOAD + tmp_env_p[env_count++] = ld_preload_p; + tmp_env_p[env_count++] = ld_library_path_p; + tmp_env_p[env_count++] = 0; + total_size = env_count * kPtrSize; + + stackp -= total_size; + ret = copy_to_user(stackp, tmp_env_p, total_size); + if (ret != 0) { + pr_err("copy new env failed: %ld\n", ret); + ret = -EFAULT; + goto out_release_env_p; + } + + *envp_p 
= stackp; + ret = 0; + +out_release_env_p: + if (tmp_env_p) { + kfree(tmp_env_p); + } + + return ret; +} + +static noinline void do_ksu_adb_root_handle_execve(void *filename, void *envp_in) +{ + if (likely(test_thread_flag(TIF_SECCOMP))) + return; + + uid_t uid = current_euid().val; + if (uid != 0 && uid != 2000) + return; + + // filename is void * char __user * + const char __user **filename_user = (const char __user **)filename; + + if (likely(!is_exec_adbd(filename_user))) + return; + + if (unlikely(!is_libadbroot_ok())) + return; + + if (setup_ld_preload((void ***)envp_in)) + return; + + pr_info("escape to root for adb\n"); + escape_to_root_for_adb_root(); + escape_with_root_profile(); // why is this needed for 3.x? + return; +} + +static noinline void do_ksu_adb_root_handle_execveat(void *filename, void *envp_in) +{ + if (likely(test_thread_flag(TIF_SECCOMP))) + return; + + uid_t uid = current_euid().val; + if (uid != 0 && uid != 2000) + return; + + if (!filename) + return; + + // filename is char ** + if (!*(void **)filename) + return; + + if (!!endswith(*(char **)filename, "/adbd")) + return; + + if (unlikely(!is_libadbroot_ok())) + return; + + if (!envp_in) + return; + + struct user_arg_ptr *envp = (struct user_arg_ptr *)envp_in; + + void ***envp_addr = (void ***)&envp->ptr.native; +#ifdef CONFIG_COMPAT + if (unlikely(envp->is_compat)) + envp_addr = (void ***)&envp->ptr.compat; +#endif + + pr_info("%s: envp 0x%lx \n", __func__, (uintptr_t)*envp_addr ); + + if (setup_ld_preload(envp_addr)) + return; + + pr_info("escape to root for adb\n"); + escape_to_root_for_adb_root(); + escape_with_root_profile(); // why is this needed? 
+ return; +} + +#ifdef KSU_CAN_USE_JUMP_LABEL // see kernel_compat.h + +DEFINE_STATIC_KEY_FALSE(ksu_adb_root_key); + +static inline void ksu_adb_root_handle_execve(void *filename, void *envp_in) +{ + if (static_branch_unlikely(&ksu_adb_root_key)) + do_ksu_adb_root_handle_execve(filename, envp_in); +} +static inline void ksu_adb_root_handle_execveat(void *filename, void *envp_in) +{ + if (static_branch_unlikely(&ksu_adb_root_key)) + do_ksu_adb_root_handle_execveat(filename, envp_in); +} + +static inline void ksu_static_branch_enable() { static_branch_enable(&ksu_adb_root_key); smp_mb(); } +static inline void ksu_static_branch_disable() { static_branch_disable(&ksu_adb_root_key); smp_mb(); } +#else /* ! KSU_CAN_USE_JUMP_LABEL */ +static inline void ksu_adb_root_handle_execve(void *filename, void *envp_in) +{ + if (unlikely(ksu_adb_root)) + do_ksu_adb_root_handle_execve(filename, envp_in); +} +static inline void ksu_adb_root_handle_execveat(void *filename, void *envp_in) +{ + if (unlikely(ksu_adb_root)) + do_ksu_adb_root_handle_execveat(filename, envp_in); +} +static inline void ksu_static_branch_enable() { } // no-op +static inline void ksu_static_branch_disable() { } // no-op +#endif // KSU_CAN_USE_JUMP_LABEL + +static int kernel_adb_root_feature_get(u64 *value) +{ + *value = ksu_adb_root ? 
1 : 0; + return 0; +} + +static int kernel_adb_root_feature_set(u64 value) +{ + bool enable = value != 0; + + // prevent double enable / double disable + // as old api does ref inc / dec, its a 'lil risky + if (enable == ksu_adb_root) + return 0; + + if (enable) { + ksu_adb_root = true; + ksu_static_branch_enable(); + } else { + ksu_adb_root = false; + ksu_static_branch_disable(); + } + pr_info("adb_root: set to %d\n", enable); + return 0; +} + +static const struct ksu_feature_handler ksu_adb_root_handler = { + .feature_id = KSU_FEATURE_ADB_ROOT, + .name = "adb_root", + .get_handler = kernel_adb_root_feature_get, + .set_handler = kernel_adb_root_feature_set, +}; + +void __init ksu_adb_root_init(void) +{ + if (ksu_register_feature_handler(&ksu_adb_root_handler)) { + pr_err("Failed to register adb_root feature handler\n"); + } +} + +void __exit ksu_adb_root_exit(void) +{ + ksu_unregister_feature_handler(KSU_FEATURE_ADB_ROOT); +} + +#endif // CONFIG_KSU_FEATURE_ADBROOT diff --git a/drivers/kernelsu/feature/adb_root.h b/drivers/kernelsu/feature/adb_root.h new file mode 100644 index 000000000000..331148751ca5 --- /dev/null +++ b/drivers/kernelsu/feature/adb_root.h @@ -0,0 +1,9 @@ +#ifndef __KSU_H_ADB_ROOT +#define __KSU_H_ADB_ROOT + +#ifdef CONFIG_KSU_FEATURE_ADBROOT +void ksu_adb_root_init(void); +void ksu_adb_root_exit(void); +#endif + +#endif diff --git a/drivers/kernelsu/feature/kernel_umount.c b/drivers/kernelsu/feature/kernel_umount.c new file mode 100644 index 000000000000..f5d399657852 --- /dev/null +++ b/drivers/kernelsu/feature/kernel_umount.c @@ -0,0 +1,115 @@ +static bool ksu_kernel_umount_enabled __read_mostly = true; + +static int kernel_umount_feature_get(u64 *value) +{ + *value = ksu_kernel_umount_enabled ? 
1 : 0; + return 0; +} + +static int kernel_umount_feature_set(u64 value) +{ + bool enable = value != 0; + ksu_kernel_umount_enabled = enable; + pr_info("kernel_umount: set to %d\n", enable); + return 0; +} + +static const struct ksu_feature_handler kernel_umount_handler = { + .feature_id = KSU_FEATURE_KERNEL_UMOUNT, + .name = "kernel_umount", + .get_handler = kernel_umount_feature_get, + .set_handler = kernel_umount_feature_set, +}; + +extern int path_umount(struct path *path, int flags); + +static inline void ksu_umount_mnt(const char *mnt, struct path *path, int flags) +{ + int err = path_umount(path, flags); + if (err) + pr_info("umount %s failed: %d\n", mnt, err); +} + +static void try_umount(const char *mnt, int flags) +{ + struct path path; + int err = kern_path(mnt, 0, &path); + if (err) { + return; + } + + if (path.dentry != path.mnt->mnt_root) { + // it is not root mountpoint, maybe umounted by others already. + path_put(&path); + return; + } + + ksu_umount_mnt(mnt, &path, flags); +} + +static inline int ksu_handle_umount(struct cred *new, const struct cred *old) +{ + uid_t new_uid = ksu_get_uid_t(new->uid); + uid_t old_uid = ksu_get_uid_t(old->uid); + + if (!ksu_kernel_umount_enabled) + return 0; + + // if there isn't any module mounted, just ignore it! + if (!ksu_module_mounted) + return 0; + + if (!ksu_cred) + return 0; + + // There are 6 scenarios: + // 1. Normal app: zygote -> appuid + // 2. Isolated process forked from zygote: zygote -> isolated_process + // 3. App zygote forked from zygote: zygote -> appuid + // 4. Webview zygote forked from zygote: zygote -> WEBVIEW_ZYGOTE_UID (no need to handle, app cannot run custom code) + // 5. Isolated process forked from app zygote: appuid -> isolated_process (already handled by 3) + // 6. 
Isolated process forked from webview zygote (no need to handle, app cannot run custom code) + if (!is_appuid(new_uid) && !is_isolated_process(new_uid)) + return 0; + + if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid)) + return 0; + + // check old process's selinux context, if it is not zygote, ignore it! + // because some su apps may setuid to untrusted_app but they are in global mount namespace + // when we umount for such process, that is a disaster! + // also handle case 4 and 5 + bool is_zygote_child = is_zygote(old); + if (!is_zygote_child) { + pr_info("handle umount ignore non zygote child: %d\n", current->pid); + return 0; + } + // umount the target mnt + pr_info("handle umount for uid: %d, pid: %d\n", new_uid, current->pid); + + const struct cred *saved = override_creds(ksu_cred); + + struct mount_entry *entry; + down_read(&mount_list_lock); + list_for_each_entry (entry, &mount_list, list) { + pr_info("%s: unmounting: %s flags: 0x%x\n", __func__, entry->umountable, entry->flags); + try_umount(entry->umountable, entry->flags); + } + up_read(&mount_list_lock); + + revert_creds(saved); + + return 0; +} + +void __init ksu_kernel_umount_init(void) +{ + if (ksu_register_feature_handler(&kernel_umount_handler)) { + pr_err("Failed to register kernel_umount feature handler\n"); + } +} + +void __exit ksu_kernel_umount_exit(void) +{ + ksu_unregister_feature_handler(KSU_FEATURE_KERNEL_UMOUNT); +} diff --git a/drivers/kernelsu/feature/kernel_umount.h b/drivers/kernelsu/feature/kernel_umount.h new file mode 100644 index 000000000000..51af740d619c --- /dev/null +++ b/drivers/kernelsu/feature/kernel_umount.h @@ -0,0 +1,13 @@ +#ifndef __KSU_H_KERNEL_UMOUNT +#define __KSU_H_KERNEL_UMOUNT + +// for the umount list +struct mount_entry { + char *umountable; + unsigned int flags; + struct list_head list; +}; +extern struct list_head mount_list; +extern struct rw_semaphore mount_list_lock; + +#endif diff --git a/drivers/kernelsu/feature/selinux_hide.c 
b/drivers/kernelsu/feature/selinux_hide.c new file mode 100644 index 000000000000..962fadf7fa8c --- /dev/null +++ b/drivers/kernelsu/feature/selinux_hide.c @@ -0,0 +1,404 @@ +/** + * NOTE: this isnt the fullblown thing like upstream's where we straight up backport + * SELinux. This is just questionable to do when we want to support a plethora of + * non-standard kernels. + * + * While what we are doing here is kinda improper, for most cases + * this should be more than enough. + * + * this will include write_op / selinux_transaction_write spoofing and then avc spoofing. + * our goal for this one is to be self contained as much as possible + * with only one call from ksu's initcall. + * + */ + +// enabled by default +static bool ksu_selinux_hide_enabled __read_mostly = true; + +// sids for avc spoofing +static u32 su_sid __read_mostly = 0; +static u32 ksu_sid __read_mostly = 0; +static u32 priv_app_sid __read_mostly = 0; + +static inline int ksu_selinux_get_sids() +{ + // dont load at all if we cant get sids + int err = security_secctx_to_secid("u:r:su:s0", strlen("u:r:su:s0"), &su_sid); + if (!err) + pr_info("selinux_hide: su_sid: %u\n", su_sid); + + err = security_secctx_to_secid("u:r:ksu:s0", strlen("u:r:ksu:s0"), &ksu_sid); + if (!err) + pr_info("selinux_hide: ksu_sid: %u\n", ksu_sid); /* log the sid resolved just above, not su_sid */ + + err = security_secctx_to_secid("u:r:priv_app:s0:c512,c768", strlen("u:r:priv_app:s0:c512,c768"), &priv_app_sid); + if (!err) + pr_info("selinux_hide: priv_app_sid: %u\n", priv_app_sid); /* log the sid resolved just above, not su_sid */ + + if (!su_sid || !ksu_sid || !priv_app_sid) + return -1; + + return 0; +} + +// deprecate in a month +int ksu_handle_slow_avc_audit_new(u32 tsid, u16 *tclass) +{ + if (!ksu_selinux_hide_enabled) + return 0; + + if (tsid != su_sid && tsid != ksu_sid) + return 0; + + pr_info("selinux_hide: prevent log for sid: %u\n", tsid); + *tclass = 0; + + return 0; +} + +void ksu_slow_avc_audit(u32 *tsid) +{ + if (!ksu_selinux_hide_enabled) + return; + + // if tsid is su, we just replace it + // unsure if its 
enough, but this is how it is aye? + if (*tsid == su_sid || *tsid == ksu_sid) { + pr_info("selinux_hide: slow_avc_audit: replace tsid: %u with priv_app_sid: %u\n", *tsid, priv_app_sid); + *tsid = priv_app_sid; + } + + return; +} + +static inline bool ksu_should_destroy_context(char *str) +{ + if (!str) + return false; + + struct ksu_hidden_node *node; + + read_lock(&ksu_sepolicy_shitlist_lock); + list_for_each_entry(node, &ksu_sepolicy_rule_list, list) { + if (strstr(str, node->name)) { + read_unlock(&ksu_sepolicy_shitlist_lock); + return true; + } + } + read_unlock(&ksu_sepolicy_shitlist_lock); + + return false; +} + +/** + * security_setprocattr is a weird LSM on 5.4 and up, and this is normally backported + * down to 4.14 and 4.19. somehow this LSM is a one-shot. only the first to register + * is called. + * + * however this is not an issue for us on 3.x as we are hijacking selinux_ops on it + * + */ +int ksu_hide_setprocattr(const char *name, void *value, size_t size) +{ + if (!ksu_selinux_hide_enabled) + return 0; + + // only hook when seccomp is enabled + if (!test_thread_flag(TIF_SECCOMP)) + return 0; + + // only appuid + if (current_uid().val < 10000) + return 0; + + if (!size) + return 0; + + if (!name) + return 0; + + if (!!strcmp(name, "current")) + return 0; + + char *str = (char *)value; + + if (!str) + return 0; + + // to make sure its terminated + char buf[64] = { 0 }; + size_t len = (size < 63) ? size : 63; + + memcpy(buf, str, len); + + if (!ksu_should_destroy_context(buf)) + return 0; + + pr_info("block setprocattr for context: %s\n", buf); + str[1] = '1'; + + return 0; +} + +// for manual hook +void ksu_sel_write_context(struct file **file, char **buf, size_t *size) +{ + if (!ksu_selinux_hide_enabled) + return; + + // only hook when seccomp is enabled + if (!test_thread_flag(TIF_SECCOMP)) + return; + + // only appuid + if (current_uid().val < 10000) + return; + + // upstream doesnt do this, so we should also not. 
+ //if (!ksu_uid_should_umount(current_uid().val)) + // return; + + char *mbuf = *buf; + + if (!mbuf) + return; + + if (!ksu_should_destroy_context(mbuf)) + return; + + pr_info("selinux_hide: destroy: %s \n", mbuf); + mbuf[1] = '1'; + return; + +} + +#if defined(CONFIG_KPROBES) + +#include +static struct kprobe *slow_avc_audit_kp; +static struct kprobe *sel_write_context_kp; +static struct kprobe *sel_write_access_kp; + +static int slow_avc_audit_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + +#if defined(KSU_COMPAT_HAS_SELINUX_STATE) + u32 *tsid = (u32 *)&PT_REGS_PARM3(regs); +#else + u32 *tsid = (u32 *)&PT_REGS_PARM2(regs); +#endif + + ksu_slow_avc_audit(tsid); + + return 0; +} + +static int sel_write_context_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + char **buf = (char **)&PT_REGS_PARM2(regs); + + ksu_sel_write_context(NULL, buf, NULL); + return 0; +} + +// this deals with __user, this is here in case its really needed. +#if 0 +static int selinux_transaction_write_pre_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + + bool *should_destroy = (bool *)ri->data; + *should_destroy = false; + + if (!test_thread_flag(TIF_SECCOMP)) + return 0; + + if (current_uid().val < 10000) + return 0; + + if (!ksu_uid_should_umount(current_uid().val)) + return 0; + + const char __user **buf = (const char __user **)&PT_REGS_PARM2(regs); + char __user *uptr = *(char **)buf; + + char kbuf[128] = { 0 }; + + if (ksu_copy_from_user_retry(kbuf, uptr, 127)) + return 0; + + // move ptr to the next one after space + char *target = strchr(kbuf, ' '); + if (likely(target)) + target++; + else + target = kbuf; + + if (!ksu_should_destroy_context(target)) + return 0; + + pr_info("selinux_transaction_write: destroy: %s \n", kbuf); + *should_destroy = true; + + return 0; +} + +static int selinux_transaction_write_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + // if bool is true, mod PT_REGS_RC to ret EINVAL + bool *should_destroy = 
(bool *)ri->data; + + if (*should_destroy) + PT_REGS_RC(regs) = -EINVAL; + + return 0; +} + +static struct kretprobe selinux_transaction_write_rp = { + .kp.symbol_name = "selinux_transaction_write", + .handler = selinux_transaction_write_ret_handler, + .entry_handler = selinux_transaction_write_pre_handler, + .data_size = sizeof(bool), + .maxactive = 20, +}; +#endif + +// copied from upstream +static struct kprobe *init_kprobe(const char *name, kprobe_pre_handler_t handler) +{ + struct kprobe *kp = kzalloc(sizeof(struct kprobe), GFP_KERNEL); + if (!kp) + return NULL; + kp->symbol_name = name; + kp->pre_handler = handler; + + int ret = register_kprobe(kp); + pr_info("%s: register %s kprobe: %d\n", __func__, name, ret); + if (ret) { + kfree(kp); + return NULL; + } + + return kp; +} +static void destroy_kprobe(struct kprobe **kp_ptr) +{ + struct kprobe *kp = *kp_ptr; + if (!kp) + return; + unregister_kprobe(kp); + synchronize_rcu(); + kfree(kp); + *kp_ptr = NULL; +} +#endif // CONFIG_KPROBES + + +static void ksu_selinux_hide_enable() +{ + int ret = ksu_selinux_get_sids(); + if (ret) + pr_info("selinux_hide: sid grab fail!\n"); + +#if defined(CONFIG_KPROBES) + slow_avc_audit_kp = init_kprobe("slow_avc_audit", slow_avc_audit_pre_handler); + + sel_write_context_kp = init_kprobe("sel_write_context", sel_write_context_pre_handler); + sel_write_access_kp = init_kprobe("sel_write_access", sel_write_context_pre_handler); +#endif + + pr_info("selinux_hide: started! 
make sure manual hooks are in-place!\n"); + + ksu_selinux_hide_enabled = true; +} + +static void ksu_selinux_hide_disable() +{ +#if defined(CONFIG_KPROBES) + pr_info("selinux_hide: unregister slow_avc_audit kprobe!\n"); + destroy_kprobe(&slow_avc_audit_kp); + + pr_info("selinux_hide: unregister sel_write_context kprobe!\n"); + destroy_kprobe(&sel_write_context_kp); + + pr_info("selinux_hide: unregister sel_write_access kprobe!\n"); + destroy_kprobe(&sel_write_access_kp); +#endif + + pr_info("selinux_hide: closing down hooks!\n"); + + ksu_selinux_hide_enabled = false; +} + +// init kthread +static int ksu_hide_init_thread(void *data) +{ + unsigned int i = 0; + + set_user_nice(current, 19); // low prio + +start: + if (!!*(volatile bool *)&ksu_boot_completed) + goto bail; + + msleep(5000); + + i++; + + if (i < 12) + goto start; + +bail: + + ksu_add_shit_to_list(KERNEL_SU_DOMAIN); + ksu_add_shit_to_list(KERNEL_SU_FILE); + + ksu_selinux_hide_enable(); + return 0; +} + +static int selinux_hide_feature_get(u64 *value) +{ + *value = ksu_selinux_hide_enabled ? 
1 : 0; + return 0; +} + +static int selinux_hide_feature_set(u64 value) +{ + bool enable = value != 0; + int ret = 0; + + if (enable == ksu_selinux_hide_enabled) + return 0; + + pr_info("selinux_hide: set to %d\n", enable); + + if (enable) + ksu_selinux_hide_enable(); + else + ksu_selinux_hide_disable(); + + return ret; +} + +static const struct ksu_feature_handler selinux_hide_handler = { + .feature_id = KSU_FEATURE_SELINUX_HIDE, + .name = "selinux_hide", + .get_handler = selinux_hide_feature_get, + .set_handler = selinux_hide_feature_set, +}; + +void __init ksu_selinux_hide_init() +{ + // we init this on a kthread + kthread_run(ksu_hide_init_thread, NULL, "kthread"); + + if (ksu_register_feature_handler(&selinux_hide_handler)) { + pr_err("Failed to register selinux_hide feature handler\n"); + } +} + +void __exit ksu_selinux_hide_exit() +{ + ksu_unregister_feature_handler(KSU_FEATURE_SELINUX_HIDE); +} + diff --git a/drivers/kernelsu/feature/selinux_hide.h b/drivers/kernelsu/feature/selinux_hide.h new file mode 100644 index 000000000000..39c60206b9c6 --- /dev/null +++ b/drivers/kernelsu/feature/selinux_hide.h @@ -0,0 +1,65 @@ +#ifndef __KSU_H_SELINUX_HIDE +#define __KSU_H_SELINUX_HIDE + +void ksu_selinux_hide_init(); +void ksu_selinux_hide_exit(); + +// /selinux/rules.c, linked list +LIST_HEAD(ksu_sepolicy_rule_list); +DEFINE_RWLOCK(ksu_sepolicy_shitlist_lock); + +struct ksu_hidden_node { + struct list_head list; + char *name; +}; + +static void ksu_add_shit_to_list(const char *name) +{ + if (!name) + return; + + if (!strcmp(name, "zygote")) + return; + + if (!strcmp(name, "app_zygote")) + return; + + struct ksu_hidden_node *node; + size_t name_len = strlen(name); + + // check for dupes + write_lock(&ksu_sepolicy_shitlist_lock); + list_for_each_entry(node, &ksu_sepolicy_rule_list, list) { + // ":name:" + if (strlen(node->name) == (name_len + 2) && !memcmp(node->name + 1, name, name_len)) + goto unlock_list; + } + + node = kmalloc(sizeof(*node), GFP_ATOMIC); + if 
(!node) + goto unlock_list; + + // ':' + original + ':' + \0 + size_t len = strlen(name); + node->name = kmalloc(name_len + 3, GFP_ATOMIC); + if (!node->name) { + kfree(node); + goto unlock_list; + } + + node->name[0] = ':'; + memcpy(node->name + 1, name, name_len); + node->name[name_len + 1] = ':'; + node->name[name_len + 2] = '\0'; + + list_add(&node->list, &ksu_sepolicy_rule_list); + + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("%s: now tracking type: %s, padded: %s \n", __func__, name, node->name); + +unlock_list: + write_unlock(&ksu_sepolicy_shitlist_lock); + return; +} + +#endif diff --git a/drivers/kernelsu/feature/sucompat.c b/drivers/kernelsu/feature/sucompat.c new file mode 100644 index 000000000000..174e170cb146 --- /dev/null +++ b/drivers/kernelsu/feature/sucompat.c @@ -0,0 +1,419 @@ +#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE +#define SUCOMPAT_HOOK_TYPE static __always_inline int +#else +#define SUCOMPAT_HOOK_TYPE int +#endif + +#define SU_PATH "/system/bin/su" +#define SH_PATH "/system/bin/sh" + +static bool ksu_su_compat_enabled __read_mostly = true; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +static void __user *userspace_stack_buffer(const void *d, size_t len) +{ + /* To avoid having to mmap a page in userspace, just write below the stack + * pointer. */ + char __user *p = (void __user *)current_user_stack_pointer() - len; + + return copy_to_user(p, d, len) ? 
NULL : p; +} +#else +static void __user *userspace_stack_buffer(const void *d, size_t len) +{ + if (!current->mm) + return NULL; + + volatile unsigned long start_stack = current->mm->start_stack; + unsigned int step = 32; + +start_loop: + ; + char __user *p = (void __user *)(start_stack - step - len); + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("%s: start_stack: %lx p: %lx len: %zu\n", __func__, start_stack, (unsigned long)p, len ); + + if (!copy_to_user(p, d, len)) + return p; + + step = step + step; + + if (step <= 2048) + goto start_loop; + + return NULL; +} +#endif + +static char __user *sh_user_path(void) +{ + static const char sh_path[] = "/system/bin/sh"; + + return userspace_stack_buffer(sh_path, sizeof(sh_path)); +} + +static char __user *ksud_user_path(void) +{ + static const char ksud_path[] = KSUD_PATH; + + return userspace_stack_buffer(ksud_path, sizeof(ksud_path)); +} + +#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) && defined(KSU_CAN_USE_JUMP_LABEL) +DEFINE_STATIC_KEY_TRUE(ksud_sucompat_key); +static inline void ksu_sucompat_enable_branch() +{ + pr_info("su_compat: enable sucompat branches\n"); + static_branch_enable(&ksud_sucompat_key); + smp_mb(); +} +static inline void ksu_sucompat_disable_branch() +{ + pr_info("su_compat: remove sucompat branches\n"); + static_branch_disable(&ksud_sucompat_key); + smp_mb(); +} +#else +static inline void ksu_sucompat_enable_branch() { } // no-op +static inline void ksu_sucompat_disable_branch() { } // no-op +#endif + +__attribute__((hot)) +static __always_inline bool is_su_allowed(const void **ptr_to_check) +{ +#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE +#ifdef KSU_CAN_USE_JUMP_LABEL + // read as: if not 'likely' disabled + if (!!!static_branch_likely(&ksud_sucompat_key)) + return false; +#else + if (!ksu_su_compat_enabled) + return false; +#endif // KSU_CAN_USE_JUMP_LABEL +#endif + + if (likely(test_thread_flag(TIF_SECCOMP))) + return false; + + // see seccomp check above + // so if its root but not ksu domain, 
deny, see __ksu_is_allow_uid_for_current + // actually, we can likely skip this step? + uid_t uid = current_uid().val; + if (!!uid) + goto uid_check; + + if (!is_ksu_domain()) + return false; + goto check_ptr; + + // NOTE: shell has its seccomp disabled, so we only need to check for this thing + // short-circuit if not shell! as we allow apps on setuid lsm by disabling seccomp +uid_check: + if (likely(uid != 2000)) + goto check_ptr; + + // use internal function, not the macro + if (!__ksu_is_allow_uid(uid)) + return false; + +check_ptr: + // first check the pointer-to-pointer + if (unlikely(!ptr_to_check)) + return false; + + // now dereference pointer-to-pointer to check actual pointer + if (unlikely(!*ptr_to_check)) + return false; + + return true; +} + +static __always_inline void ksu_sucompat_user_common(const char __user **filename_user, + const char *syscall_name, + const bool escalate, + const uint8_t sym) +{ + uintptr_t buf; + const char su[] = SU_PATH; + + // sugar prep + uintptr_t *su_p = (uintptr_t *)su; + uintptr_t __user *fn_p = (uintptr_t *)*(char **)filename_user; + + // assert /system/bin/su\0 = 15 bytes. + BUILD_BUG_ON(sizeof(su) > 16); // compielr might to pad + BUILD_BUG_ON(sizeof(su) < 15); + + /* + * it seems this is actually the slowest part, we peek last word first to speed it up + * NOTE: get_user rets EFAULT on err, so if we are copying a pointer + * that goes to nothing, we also detect that and ret fast + * + * first read overreads, reading 8 bytes, "bin/su\0?" / 4 bytes, "su\0?" 
when we only need 7/3 + * but this is fine as we are guaranteed alignment, hardware provides trailing garbeg + * if it is specially crafted and hits a page guard, we just get EFAULT anyway + * + * on 64-bit we do this in 2 word compare, 4 on 32-bit + * + * we can do some bitmasking 0xFFFFFF blah blah to do that tail compare (7 or 3 bytes), + * but hot damn I hate that shit, lets just have __builtin_memcmp do it for us + * + */ + +#ifdef CONFIG_64BIT + if (get_user(buf, &fn_p[1])) + return; + + if (likely(!!__builtin_memcmp(&buf, su + sizeof(uintptr_t), sizeof(su) - sizeof(uintptr_t) ))) + return; +#else + if (get_user(buf, &fn_p[3])) + return; + + if (likely(!!__builtin_memcmp(&buf, su + (3 * sizeof(uintptr_t)), sizeof(su) - (3 * sizeof(uintptr_t)) ))) + return; + + if (unlikely(get_user(buf, &fn_p[2]))) + return; + + if (buf != su_p[2]) + return; + + if (unlikely(get_user(buf, &fn_p[1]))) + return; + + if (unlikely(buf != su_p[1])) + return; +#endif + // last word + if (unlikely(get_user(buf, &fn_p[0]))) + return; + + if (unlikely(buf != su_p[0])) + return; + + write_sulog(sym); + + if (!escalate) + goto no_escalate; + +#ifdef CONFIG_KSU_FEATURE_SULOG + ksu_sulog_emit(KSU_SULOG_EVENT_SUCOMPAT, NULL, NULL, GFP_KERNEL); +#endif + if (!!escape_with_root_profile()) + return; + + // NOTE: we only check file existence, not exec success! 
+ struct path kpath; + if (!!kern_path("/data/adb/ksud", 0, &kpath)) + goto no_ksud; + + path_put(&kpath); + pr_info("%s su->ksud!\n", syscall_name); + *filename_user = ksud_user_path(); + return; + +no_ksud: +no_escalate: + pr_info("%s su->sh!\n", syscall_name); + *filename_user = sh_user_path(); + return; + +} + +// sys_faccessat +SUCOMPAT_HOOK_TYPE ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, int *__unused_flags) +{ + if (!is_su_allowed((const void **)filename_user)) + return 0; + + ksu_sucompat_user_common(filename_user, "faccessat", false, 'a'); + return 0; +} + +// sys_newfstatat, sys_fstat64 +SUCOMPAT_HOOK_TYPE ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags) +{ + if (!is_su_allowed((const void **)filename_user)) + return 0; + + ksu_sucompat_user_common(filename_user, "newfstatat", false, 's'); + return 0; +} + +// sys_execve, compat_sys_execve +SUCOMPAT_HOOK_TYPE ksu_handle_execve(const char __user **filename_user, void *argv, void *envp) +{ + sys_execve_escape_ksud((void *)filename_user); + +#ifdef CONFIG_KSU_FEATURE_ADBROOT + ksu_adb_root_handle_execve((void *)filename_user, (void *)envp); +#endif + + if (!is_su_allowed((const void **)filename_user)) + return 0; + + ksu_sucompat_user_common(filename_user, "sys_execve", true, 'x'); + return 0; +} + +#ifndef CONFIG_KSU_TAMPER_SYSCALL_TABLE +static __always_inline void ksu_sucompat_kernel_common(void **filename_ptr, void *argv, void *envp, const char *function_name) +{ + kernel_execve_escape_ksud((void *)filename_ptr); + +#ifdef CONFIG_KSU_FEATURE_ADBROOT + ksu_adb_root_handle_execveat((void *)filename_ptr, (void *)envp); +#endif + + if (!is_su_allowed((const void **)filename_ptr)) + return; + + // it seems this is actually the slowest part, we peek last word first to speed it up + // sugar prep + const char su[] = SU_PATH; + uintptr_t *su_p = (uintptr_t *)su; + uintptr_t *fn_p = (uintptr_t *)*(char **)filename_ptr; + + // assert /system/bin/su\0 = 15 
bytes. + BUILD_BUG_ON(sizeof(su) > 16); // compielr might to pad + BUILD_BUG_ON(sizeof(su) < 15); + + // getname_flags pads this so nothing to worry about, dereference with confidence! +#ifdef CONFIG_64BIT + if (likely(!!__builtin_memcmp(&fn_p[1], &su_p[1], sizeof(su) - sizeof(uintptr_t) ))) + return; +#else + if (likely(!!__builtin_memcmp(&fn_p[3], &su_p[3], sizeof(su) - (3 * sizeof(uintptr_t)) ))) + return; + + if (fn_p[2] != su_p[2]) + return; + + if (fn_p[1] != su_p[1]) + return; +#endif + + if (unlikely(fn_p[0] != su_p[0])) + return; + + // we only handle execve here after removing vfs_statx hook for >= 6.1 + write_sulog('x'); + +#ifdef CONFIG_KSU_FEATURE_SULOG + ksu_sulog_emit(KSU_SULOG_EVENT_SUCOMPAT, NULL, NULL, GFP_KERNEL); +#endif + if (!!escape_with_root_profile()) + return; + + // NOTE: we only check file existence, not exec success! + struct path kpath; + if (!!kern_path("/data/adb/ksud", 0, &kpath)) + goto no_ksud; + + path_put(&kpath); + pr_info("%s su->ksud!\n", function_name); + memcpy(*filename_ptr, KSUD_PATH, sizeof(KSUD_PATH)); + return; + +no_ksud: + pr_info("%s su->sh!\n", function_name); + memcpy(*filename_ptr, SH_PATH, sizeof(SH_PATH)); + return; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0) +// take note: struct filename **filename, for do_execveat_common / do_execve_common on >= 3.14 +int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv, void *envp, int *flags) +{ + struct filename *filename = *filename_ptr; + if (IS_ERR(filename)) // see getname_flags + return 0; + + ksu_sucompat_kernel_common((void **)&filename->name, argv, envp, "do_execveat_common"); + return 0; +} +#else +// take note: char **filename, for do_execve_common on < 3.14 +int ksu_legacy_execve_sucompat(const char **filename_ptr, void *argv, void *envp) +{ + ksu_sucompat_kernel_common((void **)filename_ptr, argv, envp, "do_execve_common"); + return 0; +} +#endif +#endif // CONFIG_KSU_TAMPER_SYSCALL_TABLE + +#ifdef 
CONFIG_KSU_TAMPER_SYSCALL_TABLE
+static void syscall_table_sucompat_enable(void);
+static void syscall_table_sucompat_disable(void);
+#else
+static inline void syscall_table_sucompat_enable(void) { } // no-op
+static inline void syscall_table_sucompat_disable(void) { } // no-op
+#endif
+
+// Turns on both hook mechanisms (static-key branches and syscall-table
+// hooks; whichever is not configured is a no-op) and records the state.
+static void ksu_sucompat_enable(void)
+{
+
+	ksu_sucompat_enable_branch();
+	syscall_table_sucompat_enable();
+
+	ksu_su_compat_enabled = true;
+	pr_info("%s: hooks enabled: exec, faccessat, stat\n", __func__);
+}
+
+// Mirror of ksu_sucompat_enable(): turns both mechanisms off and records it.
+static void ksu_sucompat_disable(void)
+{
+
+	ksu_sucompat_disable_branch();
+	syscall_table_sucompat_disable();
+
+	ksu_su_compat_enabled = false;
+	pr_info("%s: hooks disabled: exec, faccessat, stat\n", __func__);
+}
+
+// Feature getter: reports 1 when sucompat is enabled, 0 otherwise.
+static int su_compat_feature_get(u64 *value)
+{
+	*value = ksu_su_compat_enabled ? 1 : 0;
+	return 0;
+}
+
+// Feature setter: any non-zero value enables, zero disables.
+// Does nothing when the requested state equals the current one.
+static int su_compat_feature_set(u64 value)
+{
+	bool enable = value != 0;
+
+	if (enable == ksu_su_compat_enabled) {
+		pr_info("su_compat: no need to change\n");
+		return 0;
+	}
+
+	// the enable/disable helpers update ksu_su_compat_enabled themselves
+	if (enable) {
+		ksu_sucompat_enable();
+	} else {
+		ksu_sucompat_disable();
+	}
+
+	pr_info("su_compat: set to %d\n", enable);
+
+	return 0;
+}
+
+static const struct ksu_feature_handler su_compat_handler = {
+	.feature_id = KSU_FEATURE_SU_COMPAT,
+	.name = "su_compat",
+	.get_handler = su_compat_feature_get,
+	.set_handler = su_compat_feature_set,
+};
+
+// sucompat: permitted process can execute 'su' to gain root access.
+void __init ksu_sucompat_init() +{ + if (ksu_register_feature_handler(&su_compat_handler)) { + pr_err("Failed to register su_compat feature handler\n"); + } +} + +void __exit ksu_sucompat_exit() +{ + ksu_unregister_feature_handler(KSU_FEATURE_SU_COMPAT); +} diff --git a/drivers/kernelsu/feature/sucompat.h b/drivers/kernelsu/feature/sucompat.h new file mode 100644 index 000000000000..580384ee9c6c --- /dev/null +++ b/drivers/kernelsu/feature/sucompat.h @@ -0,0 +1,7 @@ +#ifndef __KSU_H_SUCOMPAT +#define __KSU_H_SUCOMPAT + +void ksu_sucompat_init(void); +void ksu_sucompat_exit(void); + +#endif diff --git a/drivers/kernelsu/feature/sulog.c b/drivers/kernelsu/feature/sulog.c new file mode 100644 index 000000000000..9f76805ca4f6 --- /dev/null +++ b/drivers/kernelsu/feature/sulog.c @@ -0,0 +1,57 @@ +static bool ksu_sulog_enabled __read_mostly = false; + +static int sulog_feature_get(u64 *value) +{ + *value = ksu_sulog_enabled ? 1 : 0; + return 0; +} + +static int sulog_feature_set(u64 value) +{ + bool enable = value != 0; + + ksu_sulog_enabled = enable; + pr_info("sulog: set to %d\n", enable); + return 0; +} + +static const struct ksu_feature_handler sulog_handler = { + .feature_id = KSU_FEATURE_SULOG, + .name = "sulog", + .get_handler = sulog_feature_get, + .set_handler = sulog_feature_set, +}; + +bool ksu_sulog_is_enabled(void) +{ + return ksu_sulog_enabled; +} + +void __init ksu_sulog_init(void) +{ + int ret; + + ksu_sulog_enabled = false; + + ret = ksu_register_feature_handler(&sulog_handler); + if (ret) { + pr_err("Failed to register sulog feature handler\n"); + return; + } + + ret = ksu_sulog_events_init(); + if (ret) { + pr_err("Failed to initialize sulog events: %d\n", ret); + ksu_unregister_feature_handler(KSU_FEATURE_SULOG); + return; + } + + ksu_sulog_fd_init(); +} + +void __exit ksu_sulog_exit(void) +{ + ksu_sulog_fd_exit(); + ksu_sulog_events_exit(); + ksu_unregister_feature_handler(KSU_FEATURE_SULOG); +} diff --git a/drivers/kernelsu/feature/sulog.h 
b/drivers/kernelsu/feature/sulog.h new file mode 100644 index 000000000000..565f59113cd0 --- /dev/null +++ b/drivers/kernelsu/feature/sulog.h @@ -0,0 +1,8 @@ +#ifndef __KSU_H_SULOG +#define __KSU_H_SULOG + +bool ksu_sulog_is_enabled(void); +void ksu_sulog_init(void); +void ksu_sulog_exit(void); + +#endif diff --git a/drivers/kernelsu/hook/core_hook.c b/drivers/kernelsu/hook/core_hook.c new file mode 100644 index 000000000000..54572c2c611f --- /dev/null +++ b/drivers/kernelsu/hook/core_hook.c @@ -0,0 +1,440 @@ +#ifdef CONFIG_KSU_LSM_SECURITY_HOOKS +#define LSM_HANDLER_TYPE static int +#else +#define LSM_HANDLER_TYPE int +#endif + +LSM_HANDLER_TYPE ksu_inode_rename(struct inode *old_inode, struct dentry *old_dentry, + struct inode *new_inode, struct dentry *new_dentry) +{ + ksu_rename_observer(old_dentry, new_dentry); + return 0; +} + +LSM_HANDLER_TYPE ksu_task_fix_setuid(struct cred *new, const struct cred *old, int flags) +{ + // see sys_setresuid + if (flags == LSM_SETID_RES) + ksu_handle_setresuid_cred(new, old); + + return 0; +} + +LSM_HANDLER_TYPE ksu_bprm_check(struct linux_binprm *bprm) +{ + +#ifdef CONFIG_KSU_FEATURE_SULOG + ksu_sulog_emit_bprm((const char *)bprm->filename); +#endif + + return 0; +} + +LSM_HANDLER_TYPE ksu_file_permission(struct file *file, int mask) +{ +#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) +#ifdef KSU_CAN_USE_JUMP_LABEL + if (static_branch_likely(&ksud_vfs_read_key)) + ksu_install_rc_hook(file); +#else + if (unlikely(ksu_vfs_read_hook)) + ksu_install_rc_hook(file); +#endif +#endif + + return 0; +} + +#ifdef CONFIG_KSU_LSM_SECURITY_HOOKS +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) +static struct security_hook_list ksu_hooks[] __ro_after_init = { + LSM_HOOK_INIT(inode_rename, ksu_inode_rename), + LSM_HOOK_INIT(task_fix_setuid, ksu_task_fix_setuid), +#ifdef CONFIG_KSU_FEATURE_SULOG + LSM_HOOK_INIT(bprm_check_security, ksu_bprm_check), +#endif +#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) + LSM_HOOK_INIT(file_permission, 
ksu_file_permission), +#endif +}; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) || defined(KSU_COMPAT_SECURITY_ADD_HOOKS_V2) +#define ksu_security_add_hooks security_add_hooks +#else +#define ksu_security_add_hooks(a, b, c) security_add_hooks(a, b) +#endif + +static __init void ksu_lsm_hook_init(void) +{ + ksu_security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), "ksu"); + + pr_info("core_hook: initialized %d LSMs \n", ARRAY_SIZE(ksu_hooks)); +} + +#else /* < 4.2, LSM */ + +// selinux_ops (LSM), security_operations struct tampering for ultra legacy + +static uintptr_t selinux_ops_addr = NULL; + +#ifdef CONFIG_KSU_FEATURE_SELINUX_HIDE +static int (*orig_setprocattr) (struct task_struct *p, char *name, void *value, size_t size) = NULL; +static int hook_setprocattr(struct task_struct *p, char *name, void *value, size_t size) +{ + + ksu_hide_setprocattr(name, value, size); + return orig_setprocattr(p, name, value, size); +} +#endif + +static int (*orig_inode_rename) (struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) = NULL; +static int hook_inode_rename(struct inode *old_inode, struct dentry *old_dentry, + struct inode *new_inode, struct dentry *new_dentry) +{ + ksu_inode_rename(old_inode, old_dentry, new_inode, new_dentry); + return orig_inode_rename(old_inode, old_dentry, new_inode, new_dentry); +} + +static int (*orig_task_fix_setuid) (struct cred *new, const struct cred *old, int flags) = NULL; +static int hook_task_fix_setuid(struct cred *new, const struct cred *old, int flags) +{ + ksu_task_fix_setuid(new, old, flags); + return orig_task_fix_setuid(new, old, flags); +} + +static int (*orig_bprm_check_security)(struct linux_binprm *bprm) = NULL; +static int hook_bprm_check_security(struct linux_binprm *bprm) +{ + ksu_bprm_check(bprm); + return orig_bprm_check_security(bprm); +} + +static int (*orig_file_permission) (struct file *file, int mask) = NULL; +static int hook_file_permission(struct file *file, 
int mask) +{ + + ksu_file_permission(file, mask); + return orig_file_permission(file, mask); +} + +static inline bool verify_selinux_cred_free(void *fn_ptr) +{ + bool success = false; + + if (!fn_ptr) + return false; + + // ref: https://elixir.bootlin.com/linux/v3.18.140/source/security/selinux/hooks.c#L3474 + void (*selinux_cred_free_fn)(struct cred *) = fn_ptr; + + struct cred dummy_cred; + + // explicitly set it to NULL + // make sure this happens! + // #1. it wont trigger BUG_ON + // #2. this way it will kfree(NULL), which does nothing + *(volatile void **)&dummy_cred.security = NULL; + barrier(); + + selinux_cred_free_fn(&dummy_cred); + + // check if selinux_cred_free is successful + if ((unsigned long)*(volatile void **)&dummy_cred.security == 0x7UL) + success = true; + + pr_info("selinux_cred_free: 0x%lx cred->security: 0x%lx success: %d\n", (unsigned long)fn_ptr, (unsigned long)dummy_cred.security, success); + + return success; +} + +// we should see a lot of pointers that is inside stext && etext +// basically we check for "pointer density" +static inline bool is_selinux_ops_valid(uintptr_t addr) +{ + extern char _stext[], _etext[]; + int total_slots = sizeof(struct security_operations) / sizeof(void *); + int valid_ptr = 0; + int i = 0; + + uintptr_t member_ptr = 0; + uintptr_t current_slot_addr; + + // we will be off by one or off by two due to sizeof("selinux") + // thats 8 bytes, on 32 bit, this is two pointers worth, not a big deal + +density_verify_start: + current_slot_addr = addr + (i * sizeof(void *)); + + member_ptr = 0; + if (copy_from_kernel_nofault(&member_ptr, (void *)current_slot_addr, sizeof(uintptr_t) )) + goto next_iter; // if it fails, just try next slot + + // give up early + if (!valid_ptr && i >= 20) + return false; + + // pr_info("%s: member_ptr: 0x%lx \n", __func__, (long)member_ptr); + if (member_ptr >= (uintptr_t)_stext && member_ptr <= (uintptr_t)_etext) + valid_ptr++; + +next_iter: + i++; + if (i < total_slots) + goto 
density_verify_start; + + pr_info("%s: density: valid: %lu slots: %lu \n", __func__, valid_ptr, total_slots); + + // maybe increase to 75% or something? + return (valid_ptr > (total_slots / 2)); +} + +static inline bool check_candidate(uintptr_t addr) +{ + struct security_operations *candidate = (struct security_operations *)addr; + + char char_buf[sizeof("selinux")] = { 0 }; + + if (copy_from_kernel_nofault(char_buf, (void *)addr, sizeof("selinux") )) + return false; + + if (!!memcmp(char_buf, "selinux", sizeof("selinux"))) + return false; + + // candidate found! + pr_info("%s: candidate selinux_ops at 0x%lx\n", __func__, (long)addr); + + // check ptr density + if (!is_selinux_ops_valid(addr)) + return false; + + if (!candidate->cred_free) + return false; + +#ifdef CONFIG_KALLSYMS // not always available, can also fail, but it wont hurt to try. + uintptr_t ksym_ptr = (uintptr_t)kallsyms_lookup_name("selinux_cred_free"); + if (unlikely(ksym_ptr != (uintptr_t)candidate->cred_free)) + goto test_fn; + + pr_info("%s: selinux_cred_free found via ksym_lookup: 0x%lx probe_result: 0x%lx \n", __func__, (long)ksym_ptr, (long)candidate->cred_free); + return true; + +test_fn: +#endif + + pr_info("%s: candidate selinux_cred_free at 0x%lx\n", __func__, (long)candidate->cred_free); + return verify_selinux_cred_free((void *)candidate->cred_free); +} + +/** + * we do this in blocks of sequential 10k pointers. + * 10k pointers up, 10k pointers down + * this is predictable, more cache friendly, no trashing. + * + * one up, one down oscillating scan isn't as friendly to teh cahce. + * once ptrdiff of up vs down is larger than L1, it will be trashy. 
+ *
+ */
+// Scans pointer-aligned addresses around heuristic_ptr for selinux_ops:
+// first the anchor itself and the 9999 slots above it, then the 9999 slots
+// below it. Returns the first address check_candidate() accepts, or NULL.
+static noinline void *hunt_for_selinux_ops(void *heuristic_ptr)
+{
+	uintptr_t anchor = (uintptr_t)heuristic_ptr;
+	uintptr_t curr;
+	unsigned long iter_count = 0;
+	unsigned long max_index = 10000; // max number of pointers to test, one way
+	unsigned long i = 0;
+
+	// start/end are only used for the log line below
+	uintptr_t start = anchor - max_index * sizeof(void *);
+	uintptr_t end = anchor + max_index * sizeof(void *);
+	pr_info("%s: scan range: 0x%lx - 0x%lx anchor: 0x%lx\n", __func__, (long)start, (long)end, (long)anchor);
+
+scan_up:
+	if (i >= max_index) {
+		// index 0 (the anchor) was covered going up, so start at 1
+		i = 1;
+		goto scan_down;
+	}
+
+	curr = anchor + (i * sizeof(void *));
+	i++;
+	iter_count++;
+
+	if (check_candidate(curr))
+		goto found;
+
+	goto scan_up;
+
+scan_down:
+	if (i >= max_index)
+		goto not_found;
+
+	curr = anchor - (i * sizeof(void *));
+	i++;
+	iter_count++;
+
+	if (check_candidate(curr))
+		goto found;
+
+	goto scan_down;
+
+found:
+	pr_info("%s: found selinux_ops at 0x%lx iter_count: %lu \n", __func__, (long)curr, iter_count);
+	return (void *)curr;
+
+not_found:
+	pr_info("%s: selinux_ops not found in range! iter_count: %lu \n", __func__, iter_count);
+	return NULL;
+}
+
+// Locates selinux_ops and stores its address in selinux_ops_addr.
+// Sources are tried in order until one yields a non-NULL result: the
+// exported symbol, kallsyms, then memory scans anchored at root_key_user,
+// avc_cache_threshold, crypto_alg_list, selinux_enabled and secclass_map.
+// selinux_ops_addr is left untouched when everything fails.
+static inline void set_selinux_ops(void)
+{
+	extern int selinux_enabled;
+	extern struct security_class_mapping secclass_map[];
+	extern struct list_head crypto_alg_list;
+	extern unsigned int avc_cache_threshold;
+
+	struct security_operations *ops = NULL;
+
+// if user exports selinux_ops, we just go for it!
+#ifdef KSU_HAS_EXPORTED_SELINUX_OPS
+	extern struct security_operations selinux_ops;
+	if (!ops)
+		ops = (struct security_operations *)&selinux_ops;
+#endif
+
+// not always available, can also fail, but it wont hurt to try.
+#ifdef CONFIG_KALLSYMS
+	if (!ops)
+		ops = (struct security_operations *)kallsyms_lookup_name("selinux_ops");
+#endif
+
+#ifdef CONFIG_KEYS
+	extern struct key_user root_key_user;
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&root_key_user);
+#endif
+
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&avc_cache_threshold);
+
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&crypto_alg_list);
+
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&selinux_enabled);
+
+	if (!ops)
+		ops = (struct security_operations *)hunt_for_selinux_ops((void *)&secclass_map);
+
+	if (!ops)
+		return;
+
+	selinux_ops_addr = (uintptr_t)ops;
+}
+
+// stop_machine
+// Restores only the file_permission member, and only if it was hooked;
+// the other hooks installed by ksu_register_lsm_hook() stay in place.
+static int ksu_unregister_lsm_hook(void *data)
+{
+	struct security_operations *ops = (struct security_operations *)selinux_ops_addr;
+
+	if (orig_file_permission) {
+		pr_info("%s: restoring file_permission 0x%lx -> 0x%lx\n", __func__, (long)ops->file_permission, (long)orig_file_permission);
+		ops->file_permission = orig_file_permission;
+	}
+
+	return 0;
+}
+
+// kthread body: polls once per second until ksu_vfs_read_hook reads false,
+// then runs ksu_unregister_lsm_hook() under stop_machine(). Exits at once if
+// selinux_ops was not found or does not start with "selinux".
+static int ksu_lsm_hook_restore(void *data)
+{
+	struct security_operations *ops = (struct security_operations *)selinux_ops_addr;
+	if (!ops)
+		return 0;
+
+	if (!!strcmp((char *)ops, "selinux"))
+		return 0;
+
+loop_start:
+
+	msleep(1000);
+
+	if (*(volatile bool *)&ksu_vfs_read_hook)
+		goto loop_start;
+
+	pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops );
+
+	stop_machine(ksu_unregister_lsm_hook, NULL, NULL);
+
+	return 0;
+}
+
+// stop_machine
+// Saves each original selinux_ops member and swaps in the matching hook.
+// NOTE(review): orig_bprm_set_creds and hook_bprm_set_creds are not defined
+// anywhere in the visible part of this file; confirm they are provided
+// elsewhere, otherwise this branch cannot build.
+static int ksu_register_lsm_hook(void *data)
+{
+	struct security_operations *ops = (struct security_operations *)selinux_ops_addr;
+
+	orig_bprm_set_creds = ops->bprm_set_creds;
+	ops->bprm_set_creds = hook_bprm_set_creds;
+
+	orig_inode_rename = ops->inode_rename;
+	ops->inode_rename = hook_inode_rename;
+
+#ifdef CONFIG_KSU_FEATURE_SELINUX_HIDE
+	orig_setprocattr = ops->setprocattr;
+	ops->setprocattr = hook_setprocattr;
+#endif
+
+	orig_task_fix_setuid = ops->task_fix_setuid;
+	ops->task_fix_setuid = hook_task_fix_setuid;
+
+#ifdef CONFIG_KSU_FEATURE_SULOG
+	orig_bprm_check_security = ops->bprm_check_security;
+	ops->bprm_check_security = hook_bprm_check_security;
+#endif
+
+#if !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE)
+	orig_file_permission = ops->file_permission;
+	ops->file_permission = hook_file_permission;
+#endif
+
+	return 0;
+}
+
+// Legacy (< 4.2) init: locates selinux_ops, installs the hooks under
+// stop_machine(), then starts the kthread that later restores
+// file_permission. Bails out silently if selinux_ops cannot be confirmed.
+static void ksu_lsm_hook_init(void)
+{
+	set_selinux_ops();
+
+	struct security_operations *ops = (struct security_operations *)selinux_ops_addr;
+	if (!ops)
+		return;
+
+	if (!!strcmp((char *)ops, "selinux"))
+		return;
+
+	pr_info("%s: selinux_ops: 0x%lx .name = %s\n", __func__, (long)ops, (const char *)ops );
+
+	stop_machine(ksu_register_lsm_hook, NULL, NULL);
+
+	kthread_run(ksu_lsm_hook_restore, NULL, "unhook");
+	return;
+}
+
+#endif // < 4.2
+
+#else /* ! CONFIG_KSU_LSM_SECURITY_HOOKS */
+// TEMP hooks, remove this in a month.
+int ksu_handle_setuid(struct cred *new, const struct cred *old) +{ + ksu_handle_setresuid_cred(new, old); + return 0; +} +int ksu_handle_rename(struct dentry *old_dentry, struct dentry *new_dentry) +{ + ksu_rename_observer(old_dentry, new_dentry); + return 0; +} +static inline void ksu_lsm_hook_init(void) { } // nothing, no-op +#endif // CONFIG_KSU_LSM_SECURITY_HOOKS + +void __init ksu_core_init(void) +{ + ksu_lsm_hook_init(); +} diff --git a/drivers/kernelsu/hook/kp_ksud.c b/drivers/kernelsu/hook/kp_ksud.c new file mode 100644 index 000000000000..24ad5c3a14b4 --- /dev/null +++ b/drivers/kernelsu/hook/kp_ksud.c @@ -0,0 +1,143 @@ +#include + +// sys_newfstat rp +// upstream: https://github.com/tiann/KernelSU/commit/df640917d11dd0eff1b34ea53ec3c0dc49667002 + +static int sys_newfstat_handler_pre(struct kretprobe_instance *p, struct pt_regs *regs) +{ + struct pt_regs *real_regs = PT_REAL_REGS(regs); + + // grab ptr on entry + uintptr_t *arg = (uintptr_t *)p->data; + arg[0] = (uintptr_t)PT_REGS_PARM1(regs); + arg[1] = (uintptr_t)PT_REGS_PARM2(regs); + + return 0; +} + +static int sys_newfstat_handler_post(struct kretprobe_instance *p, struct pt_regs *regs) +{ + uintptr_t *arg = (uintptr_t *)p->data; + unsigned int fd = (unsigned int)arg[0]; + struct stat __user *statbuf = (struct stat __user *)arg[1]; + + ksu_handle_newfstat_ret(&fd, &statbuf); + + return 0; +} + +static struct kretprobe sys_newfstat_rp = { + .kp.symbol_name = SYS_NEWFSTAT_SYMBOL, + .entry_handler = sys_newfstat_handler_pre, + .handler = sys_newfstat_handler_post, + .data_size = sizeof(uintptr_t) * 2, // int + ptr, should fit +}; + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) +static int sys_fstat64_handler_pre(struct kretprobe_instance *p, struct pt_regs *regs) +{ + struct pt_regs *real_regs = PT_REAL_REGS(regs); + + // grab ptr on entry + uintptr_t *arg = (uintptr_t *)p->data; + arg[0] = (uintptr_t)PT_REGS_PARM1(regs); + arg[1] = (uintptr_t)PT_REGS_PARM2(regs); + + return 0; 
+} + +static int sys_fstat64_handler_post(struct kretprobe_instance *p, struct pt_regs *regs) +{ + uintptr_t *arg = (uintptr_t *)p->data; + unsigned long fd = (unsigned long)arg[0]; + struct stat64 __user *statbuf = (struct stat64 __user *)arg[1]; + + ksu_handle_fstat64_ret(&fd, &statbuf); + + return 0; +} + +static struct kretprobe sys_fstat64_rp = { + .kp.symbol_name = SYS_FSTAT64_SYMBOL, + .entry_handler = sys_fstat64_handler_pre, + .handler = sys_fstat64_handler_post, + .data_size = sizeof(uintptr_t) * 2, // long + ptr, should fit +}; +#endif + +// sys_reboot +static int sys_reboot_handler_pre(struct kprobe *p, struct pt_regs *regs) +{ + struct pt_regs *real_regs = PT_REAL_REGS(regs); + int *magic1 = (int *)&PT_REGS_PARM1(real_regs); // ptr so we can mutate this + int magic2 = (int)PT_REGS_PARM2(real_regs); + int cmd = (int)PT_REGS_PARM3(real_regs); + void __user **arg = (void __user **)&PT_REGS_SYSCALL_PARM4(real_regs); + + if (*magic1 != KSU_INSTALL_MAGIC1) + return 0; + + // HACK: flip preempt status inside kp + // checking not really needed but its cool + bool got_flipped = false; + if (likely(!preemptible())) { + preempt_enable(); + got_flipped = true; + } + + ksu_handle_sys_reboot(*magic1, magic2, cmd, arg); + + if (got_flipped) + preempt_disable(); + + // to prevent double hooking + *magic1 = 0; + + return 0; +} + +static struct kprobe sys_reboot_kp = { + .symbol_name = SYS_REBOOT_SYMBOL, + .pre_handler = sys_reboot_handler_pre, +}; + +static int unregister_kprobe_function(void *data) +{ + set_user_nice(current, 19); // low prio + +loop_start: + + msleep(1000); + + if (*(volatile bool *)&ksu_vfs_read_hook) + goto loop_start; + + pr_info("kp_ksud: unregistering kprobes...\n"); + + unregister_kretprobe(&sys_newfstat_rp); + pr_info("kp_ksud: unregister sys_newfstat_rp!\n"); + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) + unregister_kretprobe(&sys_fstat64_rp); + pr_info("kp_ksud: unregister sys_fstat64_rp!\n"); +#endif + + return 
0; +} + +static __init int kp_ksud_init() +{ + int ret = register_kprobe(&sys_reboot_kp); // dont unreg this one + pr_info("kp_ksud: sys_reboot_kp: %d\n", ret); + + int ret2 = register_kretprobe(&sys_newfstat_rp); + pr_info("kp_ksud: sys_newfstat_rp: %d\n", ret2); + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) + int ret3 = register_kretprobe(&sys_fstat64_rp); + pr_info("kp_ksud: sys_fstat64_rp: %d\n", ret3); +#endif + + kthread_run(unregister_kprobe_function, NULL, "kp_unreg"); + return 0; +} diff --git a/drivers/kernelsu/hook/setuid_hook.c b/drivers/kernelsu/hook/setuid_hook.c new file mode 100644 index 000000000000..2c0aeab247ae --- /dev/null +++ b/drivers/kernelsu/hook/setuid_hook.c @@ -0,0 +1,35 @@ +static __always_inline void ksu_handle_setresuid_cred(struct cred *new, const struct cred *old) +{ + if (!new || !old) + return; + + uid_t new_uid = ksu_get_uid_t(new->uid); + uid_t old_uid = ksu_get_uid_t(old->uid); + + // old process is not root, ignore it. + if (unlikely(!!old_uid)) + return; + + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("handle_setresuid from %d to %d\n", old_uid, new_uid); + + // we dont have those new fancy things upstream has + // lets just do the original thing where we disable seccomp + if (unlikely(is_uid_manager(new_uid))) + goto install_ksu_fd; + + if (ksu_is_allow_uid_for_current(new_uid)) + goto kill_seccomp; + + // Handle kernel umount + ksu_handle_umount(new, old); + return; + +install_ksu_fd: + pr_info("install fd for manager: %d\n", new_uid); + ksu_install_fd(); + +kill_seccomp: + disable_seccomp(); + return; +} diff --git a/drivers/kernelsu/hook/syscall_table_hook_arm.c b/drivers/kernelsu/hook/syscall_table_hook_arm.c new file mode 100644 index 000000000000..996b3da89a06 --- /dev/null +++ b/drivers/kernelsu/hook/syscall_table_hook_arm.c @@ -0,0 +1,398 @@ +#ifndef CONFIG_ARM +#error "only meant for ARM" +#endif + +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/include/uapi/asm-generic/unistd.h 
+// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd32.h +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd.h + +#define __ARMEABI_reboot 88 +#define __ARMEABI_execve 11 +#define __ARMEABI_faccessat 334 +#define __ARMEABI_fstatat64 327 +#define __ARMEABI_fstat64 197 +#define __ARMEABI_read 3 + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) + +// on 4.19+ its is no longer just a void *sys_call_table[] +// it becomes syscall_fn_t sys_call_table[]; + +static syscall_fn_t armeabi_reboot __read_mostly = NULL; +static long hook_armeabi_reboot(const struct pt_regs *regs) +{ + int magic1 = (int)regs->regs[0]; + int magic2 = (int)regs->regs[1]; + unsigned int cmd = (unsigned int)regs->regs[2]; + void __user **arg = (void __user **)®s->regs[3]; + + ksu_handle_sys_reboot(magic1, magic2, cmd, arg); + return armeabi_reboot(regs); +} + +static syscall_fn_t armeabi_execve __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_execve(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[0]; + void ***argv = (void ***)®s->regs[1]; + void ***envp = (void ***)®s->regs[2]; + + ksu_handle_execve(filename, argv, envp); + return armeabi_execve(regs); +} + +static syscall_fn_t armeabi_faccessat __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_faccessat(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[1]; + + ksu_handle_faccessat(NULL, filename, NULL, NULL); + return armeabi_faccessat(regs); +} + +static syscall_fn_t armeabi_fstatat64 __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_fstatat64(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[1]; + + ksu_handle_stat(NULL, filename, NULL); + return armeabi_fstatat64(regs); +} + +static syscall_fn_t armeabi_fstat64 __read_mostly = NULL; +__attribute__((cold)) +static long 
hook_armeabi_fstat64_ret(const struct pt_regs *regs) +{ + // we handle it like rp + unsigned long *fd = (unsigned long *)®s->regs[0]; + struct stat64 __user **statbuf = (struct stat64 __user **)®s->regs[1]; + + long ret = armeabi_fstat64(regs); + ksu_handle_fstat64_ret(fd, statbuf); + return ret; +} + +static syscall_fn_t armeabi_read __read_mostly = NULL; +__attribute__((cold)) +static long hook_armeabi_read(const struct pt_regs *regs) +{ + unsigned int fd = (unsigned int)regs->regs[0]; + + ksu_handle_sys_read_fd(fd); + return armeabi_read(regs); +} + +#else // END OF 4.19+ SYSCALL HANDLERS + +/** + * for legacy syscall abi, we straight up call the syscall symbol + * this is easier and maybe a little bit faster + * + */ + +extern void *sys_call_table[]; + +static uintptr_t armeabi_reboot __read_mostly = NULL; +static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) +{ + ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); + return sys_reboot(magic1, magic2, cmd, arg); +} + +static uintptr_t armeabi_execve __read_mostly = NULL; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +__attribute__((hot)) +static long hook_armeabi_execve(const char __user * filename, + const char __user *const __user * argv, + const char __user *const __user * envp) +{ + ksu_handle_execve(&filename, (void ***)&argv, (void ***)&envp); + return sys_execve(filename, argv, envp); +} + +#else /* sys_execve_oabi */ + +/** + * on 3.0 / 3.4 ARM, sys_execve sc entry accepts 3 args (r0, r1, r2) + * however, sys_execve on that version, needs 4. the kernel does this small wrapper + * where it puts sp + 8 on r3. without it, hook won't work. 
+ * + * // arch/arm/kernel/entry-common.S + * + * sys_execve_wrapper: + * add r3, sp, #S_OFF + * b sys_execve + * ENDPROC(sys_execve_wrapper) + * + */ +#include + +__attribute__((used, noipa)) +static long hook_sys_execve(const char __user *filenamei, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) +{ + ksu_handle_execve(&filenamei, (void ***)&argv, (void ***)&envp); + return sys_execve(filenamei, argv, envp, regs); +} + +#define S_OFF "8" +__attribute__((naked)) +static noinline void hook_armeabi_execve() +{ + asm volatile( + "add r3, sp, #" S_OFF "\n" + "b hook_sys_execve\n" + ); +} + +#endif /* sys_execve_oabi */ + + +static uintptr_t armeabi_faccessat __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode) +{ + ksu_handle_faccessat(&dfd, &filename, &mode, NULL); + return sys_faccessat(dfd, filename, mode); +} + +static uintptr_t armeabi_fstatat64 __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) +{ + ksu_handle_stat(&dfd, &filename, &flag); + return sys_fstatat64(dfd, filename, statbuf, flag); +} + +static uintptr_t armeabi_fstat64 __read_mostly = NULL; +__attribute__((cold)) +static long hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf) +{ + // we handle it like rp + long ret = sys_fstat64(fd, statbuf); + ksu_handle_fstat64_ret(&fd, &statbuf); + return ret; +} + +static uintptr_t armeabi_read __read_mostly = NULL; +__attribute__((cold)) +static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count) +{ + ksu_handle_sys_read_fd(fd); + return sys_read(fd, buf, count); +} + +#endif // SYSCALL HANDLERS + +// 'vmapping for writable' idea copied from upstream's LSM_HOOK_HACK, override_security_head +// no more "Unable to handle kernel write to read-only memory at virtual address 
ffffffuckyou" + +// WARNING!!! void * abuse ahead! (type-punning, pointer-hiding!) +// for 4.19+ old_ptr is actually syscall_fn_t *, which is just long * so we can consider this void ** +// for 4.19- old_ptr is actually void ** +// target_table is void *target_table[]; +static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) +{ + void **sctable = (void **)target_table; + void **syscall_slot_addr = &sctable[syscall_nr]; + + if (!*syscall_slot_addr) + return; + + pr_info("%s: hooking syscall #%d at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr); + + /* + * basically the trick is + * addr, say 0xffff1234, this is READ-ONLY + * align it, 0xffff0000 + * ptrdiff 0xffff1234 - 0xffff0000, 0x00001234 + * vmap 0xffff0000, say we get 0xcccc0000 , now WRITABLE + * write on 0xcccc0000 + 0x00001234 + * + */ + + // prep vmap alias + unsigned long addr = (unsigned long)syscall_slot_addr; + unsigned long base = addr & PAGE_MASK; + unsigned long offset = addr & ~PAGE_MASK; // offset_in_page + + // this is impossible for our case because the page alignment + // but be careful for other cases! + // BUG_ON(offset + len > PAGE_SIZE); + if (offset + sizeof(void *) > PAGE_SIZE) { + pr_info("%s: syscall slot crosses page boundary! 
aborting.\n", __func__); + return; + } + + // virtual mapping of a physical page + struct page *page = phys_to_page(__pa(base)); + if (!page) + return; + + // create a "writabel address" which is mapped to teh same address + void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); + if (!writable_addr) + return; + + // swap on the alias + void **target_slot = (void **)((unsigned long)writable_addr + offset); + + preempt_disable(); + local_irq_disable(); + + *(void **)old_ptr = *target_slot; + + *target_slot = new_ptr; + smp_mb(); // ^^ + + local_irq_enable(); + preempt_enable(); + + vunmap(writable_addr); + + smp_mb(); +} + +static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) +{ + void **sctable = (void **)target_table; + void **syscall_slot_addr = &sctable[syscall_nr]; + + if (!*syscall_slot_addr) + return; + + /* + * we do this to make sure that old_ptr is filled. + * we risk a dead syscall !!! + * if read_and_replace failed or we restore again, it wont be pointing to anything + * it just copies wordsize of whatever is in *old_ptr, it should fill up a wordzie atleast + * yeah it really just dummy copies machine instructions at this point. + * + * normally we use probe_kernel_address / get_kernel_nofault here but the API is + * so inconsistent across kernel versions, and since its just a dummied wrapper + * for copy_from_kernel_nofault we can do it ourselves + * + */ + + long dummy = 0; + if (copy_from_kernel_nofault((void *)&dummy, *(void **)old_ptr, sizeof(long))) + return; + + pr_info("%s: restore syscall #%d at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr); + + // prep vmap alias + unsigned long addr = (unsigned long)syscall_slot_addr; + unsigned long base = addr & PAGE_MASK; + unsigned long offset = addr & ~PAGE_MASK; // offset_in_page + + // this is impossible for our case because the page alignment + // but be careful for other cases! 
+ // BUG_ON(offset + len > PAGE_SIZE); + if (offset + sizeof(void *) > PAGE_SIZE) { + pr_info("%s: syscall slot crosses page boundary! aborting.\n", __func__); + return; + } + + // virtual mapping of a physical page + struct page *page = phys_to_page(__pa(base)); + if (!page) + return; + + // create a "writabel address" which is mapped to teh same address + void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); + if (!writable_addr) + return; + + // swap on the alias + void **target_slot = (void **)((unsigned long)writable_addr + offset); + + // check if its ours + if (*target_slot != new_ptr) { + pr_info("%s: syscall is not ours!\n", __func__); + goto out; + } + + pr_info("%s: syscall is ours! *target_slot: 0x%lx new_ptr: 0x%lx\n", __func__, (long)*target_slot, (long)new_ptr ); + + preempt_disable(); + local_irq_disable(); + + *target_slot = *(void **)old_ptr; + smp_mb(); // ^^ + + *(void **)old_ptr = NULL; // explicit reset + + local_irq_enable(); + preempt_enable(); + +out: + vunmap(writable_addr); + + smp_mb(); +} + +static int ksu_syscall_table_restore() +{ + set_user_nice(current, 19); // low prio + +loop_start: + + msleep(1000); + + if (*(volatile bool *)&ksu_vfs_read_hook) + goto loop_start; + + restore_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); + restore_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)sys_call_table); + + return 0; +} + +static DEFINE_MUTEX(sucompat_toggle_mutex); + +static void syscall_table_sucompat_enable() +{ + mutex_lock(&sucompat_toggle_mutex); + read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table); + read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table); + read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table); + 
mutex_unlock(&sucompat_toggle_mutex); +} + +static void syscall_table_sucompat_disable() +{ + mutex_lock(&sucompat_toggle_mutex); + restore_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)sys_call_table); + restore_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)sys_call_table); + restore_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)sys_call_table); + mutex_unlock(&sucompat_toggle_mutex); +} + +static __init int ksu_syscall_table_hook_init() +{ + // enable on init! + syscall_table_sucompat_enable(); + + read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)sys_call_table); + + // theres an issue on fstat64 on oabi, so lets not hook it + // this is not that much of a loss since 3.0 / 3.4 devices aren't really running A17 + // TODO: fix and handle this +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) + read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)sys_call_table); +#endif + + read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)sys_call_table); + + // start unreg kthread + kthread_run(ksu_syscall_table_restore, NULL, "unhook"); + return 0; +} +device_initcall_sync(ksu_syscall_table_hook_init); + +// EOF diff --git a/drivers/kernelsu/hook/syscall_table_hook_arm64.c b/drivers/kernelsu/hook/syscall_table_hook_arm64.c new file mode 100644 index 000000000000..ced382be024c --- /dev/null +++ b/drivers/kernelsu/hook/syscall_table_hook_arm64.c @@ -0,0 +1,513 @@ +#ifndef CONFIG_ARM64 +#error "only meant for ARM64" +#endif + +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/include/uapi/asm-generic/unistd.h +// ref: https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd32.h +// ref: 
https://elixir.bootlin.com/linux/v4.14.1/source/arch/arm64/include/asm/unistd.h + +#define __AARCH64_reboot 142 +#define __AARCH64_execve 221 +#define __AARCH64_faccessat 48 +#define __AARCH64_newfstatat 79 +#define __AARCH64_newfstat 80 +#define __AARCH64_read 63 + +// NOTE: CONFIG_COMPAT implies __ARCH_WANT_COMPAT_STAT64 (fstatat64, fstat64) +#define __ARMEABI_reboot 88 +#define __ARMEABI_execve 11 +#define __ARMEABI_faccessat 334 +#define __ARMEABI_fstatat64 327 +#define __ARMEABI_fstat64 197 +#define __ARMEABI_read 3 + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) + +// on 4.19+ its is no longer just a void *sys_call_table[] +// it becomes syscall_fn_t sys_call_table[]; + +static syscall_fn_t aarch64_reboot __read_mostly = NULL; +static long hook_aarch64_reboot(const struct pt_regs *regs) +{ + int magic1 = (int)regs->regs[0]; + int magic2 = (int)regs->regs[1]; + unsigned int cmd = (unsigned int)regs->regs[2]; + void __user **arg = (void __user **)®s->regs[3]; + + ksu_handle_sys_reboot(magic1, magic2, cmd, arg); + return aarch64_reboot(regs); +} + +static syscall_fn_t aarch64_execve __read_mostly = NULL; +__attribute__((hot)) +static long hook_aarch64_execve(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[0]; + void ***argv = (void ***)®s->regs[1]; + void ***envp = (void ***)®s->regs[2]; + + ksu_handle_execve(filename, argv, envp); + return aarch64_execve(regs); +} + +static syscall_fn_t aarch64_faccessat __read_mostly = NULL; +__attribute__((hot)) +static long hook_aarch64_faccessat(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[1]; + + ksu_handle_faccessat(NULL, filename, NULL, NULL); + return aarch64_faccessat(regs); +} + +static syscall_fn_t aarch64_newfstatat __read_mostly = NULL; +__attribute__((hot)) +static long hook_aarch64_newfstatat(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[1]; + + 
ksu_handle_stat(NULL, filename, NULL); + return aarch64_newfstatat(regs); +} + +static syscall_fn_t aarch64_newfstat __read_mostly = NULL; +__attribute__((cold)) +static long hook_aarch64_newfstat_ret(const struct pt_regs *regs) +{ + // we handle it like rp + unsigned int *fd = (unsigned int *)®s->regs[0]; + struct stat __user **statbuf = (struct stat __user **)®s->regs[1]; + + long ret = aarch64_newfstat(regs); + ksu_handle_newfstat_ret(fd, statbuf); + return ret; +} + +static syscall_fn_t aarch64_read __read_mostly = NULL; +__attribute__((cold)) +static long hook_aarch64_read(const struct pt_regs *regs) +{ + unsigned int fd = (unsigned int)regs->regs[0]; + + ksu_handle_sys_read_fd(fd); + return aarch64_read(regs); +} + +#ifdef CONFIG_COMPAT +static syscall_fn_t armeabi_reboot __read_mostly = NULL; +static long hook_armeabi_reboot(const struct pt_regs *regs) +{ + int magic1 = (int)regs->regs[0]; + int magic2 = (int)regs->regs[1]; + unsigned int cmd = (unsigned int)regs->regs[2]; + void __user **arg = (void __user **)®s->regs[3]; + + ksu_handle_sys_reboot(magic1, magic2, cmd, arg); + return armeabi_reboot(regs); +} + +static syscall_fn_t armeabi_execve __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_execve(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[0]; + void ***argv = (void ***)®s->regs[1]; + void ***envp = (void ***)®s->regs[2]; + + ksu_handle_execve(filename, argv, envp); + return armeabi_execve(regs); +} + +static syscall_fn_t armeabi_faccessat __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_faccessat(const struct pt_regs *regs) +{ + const char __user **filename = (const char __user **)®s->regs[1]; + + ksu_handle_faccessat(NULL, filename, NULL, NULL); + return armeabi_faccessat(regs); +} + +static syscall_fn_t armeabi_fstatat64 __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_fstatat64(const struct pt_regs *regs) +{ + const char __user **filename 
= (const char __user **)®s->regs[1]; + + ksu_handle_stat(NULL, filename, NULL); + return armeabi_fstatat64(regs); +} + +static syscall_fn_t armeabi_fstat64 __read_mostly = NULL; +__attribute__((cold)) +static long hook_armeabi_fstat64_ret(const struct pt_regs *regs) +{ + // we handle it like rp + unsigned long *fd = (unsigned long *)®s->regs[0]; + struct stat64 __user **statbuf = (struct stat64 __user **)®s->regs[1]; + + long ret = armeabi_fstat64(regs); + ksu_handle_fstat64_ret(fd, statbuf); + return ret; +} + +static syscall_fn_t armeabi_read __read_mostly = NULL; +__attribute__((cold)) +static long hook_armeabi_read(const struct pt_regs *regs) +{ + unsigned int fd = (unsigned int)regs->regs[0]; + + ksu_handle_sys_read_fd(fd); + return armeabi_read(regs); +} + +#endif // CONFIG_COMPAT + +#else // END OF 4.19+ SYSCALL HANDLERS + +/** + * for legacy syscall abi, we straight up call the syscall symbol + * this is easier and maybe a little bit faster + * + */ + +static uintptr_t aarch64_reboot __read_mostly = NULL; +static long hook_aarch64_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) +{ + ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); + return sys_reboot(magic1, magic2, cmd, arg); +} + +static uintptr_t aarch64_execve __read_mostly = NULL; +__attribute__((hot)) +static long hook_aarch64_execve(const char __user * filename, + const char __user *const __user * argv, + const char __user *const __user * envp) +{ + ksu_handle_execve(&filename, (void ***)&argv, (void ***)&envp); + return sys_execve(filename, argv, envp); +} + +static uintptr_t aarch64_faccessat __read_mostly = NULL; +__attribute__((hot)) +static long hook_aarch64_faccessat(int dfd, const char __user * filename, int mode) +{ + ksu_handle_faccessat(&dfd, &filename, &mode, NULL); + return sys_faccessat(dfd, filename, mode); +} + +static uintptr_t aarch64_newfstatat __read_mostly = NULL; +__attribute__((hot)) +static long hook_aarch64_newfstatat(int dfd, const char __user * filename, 
struct stat __user * statbuf, int flag) +{ + ksu_handle_stat(&dfd, &filename, &flag); + return sys_newfstatat(dfd, filename, statbuf, flag); +} + +static uintptr_t aarch64_newfstat __read_mostly = NULL; +__attribute__((cold)) +static long hook_aarch64_newfstat_ret(unsigned int fd, struct stat __user * statbuf) +{ + // we handle it like rp + long ret = sys_newfstat(fd, statbuf); + ksu_handle_newfstat_ret(&fd, &statbuf); + return ret; +} + +static uintptr_t aarch64_read __read_mostly = NULL; +__attribute__((cold)) +static long hook_aarch64_read(unsigned int fd, char __user *buf, size_t count) +{ + ksu_handle_sys_read_fd(fd); + return sys_read(fd, buf, count); +} + +#ifdef CONFIG_COMPAT +extern const void *compat_sys_call_table[]; + +static uintptr_t armeabi_reboot __read_mostly = NULL; +static long hook_armeabi_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg) +{ + ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); + return sys_reboot(magic1, magic2, cmd, arg); +} + +static uintptr_t armeabi_execve __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_execve(const char __user * filename, + const compat_uptr_t __user * argv, + const compat_uptr_t __user * envp) +{ + ksu_handle_execve(&filename, (void ***)&argv, (void ***)&envp); + return compat_sys_execve(filename, argv, envp); +} + +static uintptr_t armeabi_faccessat __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_faccessat(int dfd, const char __user * filename, int mode) +{ + ksu_handle_faccessat(&dfd, &filename, &mode, NULL); + return sys_faccessat(dfd, filename, mode); +} + +static uintptr_t armeabi_fstatat64 __read_mostly = NULL; +__attribute__((hot)) +static long hook_armeabi_fstatat64(int dfd, const char __user * filename, struct stat64 __user * statbuf, int flag) +{ + ksu_handle_stat(&dfd, &filename, &flag); + return sys_fstatat64(dfd, filename, statbuf, flag); +} + +static uintptr_t armeabi_fstat64 __read_mostly = NULL; +__attribute__((cold)) +static long 
hook_armeabi_fstat64_ret(unsigned long fd, struct stat64 __user * statbuf) +{ + // we handle it like rp + long ret = sys_fstat64(fd, statbuf); + ksu_handle_fstat64_ret(&fd, &statbuf); + return ret; +} + +static uintptr_t armeabi_read __read_mostly = NULL; +__attribute__((cold)) +static long hook_armeabi_read(unsigned int fd, char __user *buf, size_t count) +{ + ksu_handle_sys_read_fd(fd); + return sys_read(fd, buf, count); +} + +#endif // CONFIG_COMPAT + +#endif // SYSCALL HANDLERS + +// 'vmapping for writable' idea copied from upstream's LSM_HOOK_HACK, override_security_head +// no more "Unable to handle kernel write to read-only memory at virtual address ffffffuckyou" + +// WARNING!!! void * abuse ahead! (type-punning, pointer-hiding!) +// for 4.19+ old_ptr is actually syscall_fn_t *, which is just long * so we can consider this void ** +// for 4.19- old_ptr is actually void ** +// target_table is void *target_table[]; +static void read_and_replace_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) +{ + void **sctable = (void **)target_table; + void **syscall_slot_addr = &sctable[syscall_nr]; + + if (!*syscall_slot_addr) + return; + + pr_info("%s: hooking syscall #%d at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr); + + /* + * basically the trick is + * addr, say 0xffff1234, this is READ-ONLY + * align it, 0xffff0000 + * ptrdiff 0xffff1234 - 0xffff0000, 0x00001234 + * vmap 0xffff0000, say we get 0xcccc0000 , now WRITABLE + * write on 0xcccc0000 + 0x00001234 + * + */ + + // prep vmap alias + unsigned long addr = (unsigned long)syscall_slot_addr; + unsigned long base = addr & PAGE_MASK; + unsigned long offset = addr & ~PAGE_MASK; // offset_in_page + + // this is impossible for our case because the page alignment + // but be careful for other cases! + // BUG_ON(offset + len > PAGE_SIZE); + if (offset + sizeof(void *) > PAGE_SIZE) { + pr_info("%s: syscall slot crosses page boundary! 
aborting.\n", __func__); + return; + } + + // virtual mapping of a physical page + struct page *page = phys_to_page(__pa(base)); + if (!page) + return; + + // create a "writabel address" which is mapped to teh same address + void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); + if (!writable_addr) + return; + + // swap on the alias + void **target_slot = (void **)((unsigned long)writable_addr + offset); + + preempt_disable(); + local_irq_disable(); + + *(void **)old_ptr = *target_slot; + + *target_slot = new_ptr; + smp_mb(); // ^^ + + local_irq_enable(); + preempt_enable(); + + vunmap(writable_addr); + + smp_mb(); +} + +static void restore_syscall(void *old_ptr, unsigned long syscall_nr, void *new_ptr, void *target_table) +{ + void **sctable = (void **)target_table; + void **syscall_slot_addr = &sctable[syscall_nr]; + + if (!*syscall_slot_addr) + return; + + /* + * we do this to make sure that old_ptr is filled. + * we risk a dead syscall !!! + * if read_and_replace failed or we restore again, it wont be pointing to anything + * it just copies wordsize of whatever is in *old_ptr, it should fill up a wordzie atleast + * yeah it really just dummy copies machine instructions at this point. + * + * normally we use probe_kernel_address / get_kernel_nofault here but the API is + * so inconsistent across kernel versions, and since its just a dummied wrapper + * for copy_from_kernel_nofault we can do it ourselves + * + */ + + long dummy = 0; + if (copy_from_kernel_nofault((void *)&dummy, *(void **)old_ptr, sizeof(long))) + return; + + pr_info("%s: restore syscall #%d at 0x%lx\n", __func__, syscall_nr, (long)syscall_slot_addr); + + // prep vmap alias + unsigned long addr = (unsigned long)syscall_slot_addr; + unsigned long base = addr & PAGE_MASK; + unsigned long offset = addr & ~PAGE_MASK; // offset_in_page + + // this is impossible for our case because the page alignment + // but be careful for other cases! 
+ // BUG_ON(offset + len > PAGE_SIZE); + if (offset + sizeof(void *) > PAGE_SIZE) { + pr_info("%s: syscall slot crosses page boundary! aborting.\n", __func__); + return; + } + + // virtual mapping of a physical page + struct page *page = phys_to_page(__pa(base)); + if (!page) + return; + + // create a "writabel address" which is mapped to teh same address + void *writable_addr = vmap(&page, 1, VM_MAP, PAGE_KERNEL); + if (!writable_addr) + return; + + // swap on the alias + void **target_slot = (void **)((unsigned long)writable_addr + offset); + + // check if its ours + if (*target_slot != new_ptr) { + pr_info("%s: syscall is not ours!\n", __func__); + goto out; + } + + pr_info("%s: syscall is ours! *target_slot: 0x%lx new_ptr: 0x%lx\n", __func__, (long)*target_slot, (long)new_ptr ); + + preempt_disable(); + local_irq_disable(); + + *target_slot = *(void **)old_ptr; + smp_mb(); // ^^ + + *(void **)old_ptr = NULL; // explicit reset + + local_irq_enable(); + preempt_enable(); + +out: + vunmap(writable_addr); + + smp_mb(); +} + +static int ksu_syscall_table_restore() +{ + set_user_nice(current, 19); // low prio + +loop_start: + + msleep(1000); + + if (*(volatile bool *)&ksu_vfs_read_hook) + goto loop_start; + + restore_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table); + restore_syscall((void *)&aarch64_read, __AARCH64_read, (void *)hook_aarch64_read, (void *)sys_call_table); + +#if defined(CONFIG_COMPAT) + restore_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table); + restore_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table); +#endif + + return 0; +} + +static DEFINE_MUTEX(sucompat_toggle_mutex); + +static void syscall_table_sucompat_enable() +{ + mutex_lock(&sucompat_toggle_mutex); + + read_and_replace_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, 
(void *)sys_call_table); + read_and_replace_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table); + read_and_replace_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table); + +#if defined(CONFIG_COMPAT) + read_and_replace_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)compat_sys_call_table); + read_and_replace_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)compat_sys_call_table); + read_and_replace_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table); +#endif + + mutex_unlock(&sucompat_toggle_mutex); +} + +static void syscall_table_sucompat_disable() +{ + mutex_lock(&sucompat_toggle_mutex); + + restore_syscall((void *)&aarch64_execve, __AARCH64_execve, (void *)hook_aarch64_execve, (void *)sys_call_table); + restore_syscall((void *)&aarch64_faccessat, __AARCH64_faccessat, (void *)hook_aarch64_faccessat, (void *)sys_call_table); + restore_syscall((void *)&aarch64_newfstatat, __AARCH64_newfstatat, (void *)hook_aarch64_newfstatat, (void *)sys_call_table); + +#if defined(CONFIG_COMPAT) + restore_syscall((void *)&armeabi_execve, __ARMEABI_execve, (void *)hook_armeabi_execve, (void *)compat_sys_call_table); + restore_syscall((void *)&armeabi_faccessat, __ARMEABI_faccessat, (void *)hook_armeabi_faccessat, (void *)compat_sys_call_table); + restore_syscall((void *)&armeabi_fstatat64, __ARMEABI_fstatat64, (void *)hook_armeabi_fstatat64, (void *)compat_sys_call_table); +#endif + + mutex_unlock(&sucompat_toggle_mutex); +} + +static __init int ksu_syscall_table_hook_init() +{ + // enable on init! 
+ syscall_table_sucompat_enable(); + + read_and_replace_syscall((void *)&aarch64_reboot, __AARCH64_reboot, (void *)hook_aarch64_reboot, (void *)sys_call_table); + + // will be unregged + read_and_replace_syscall((void *)&aarch64_newfstat, __AARCH64_newfstat, (void *)hook_aarch64_newfstat_ret, (void *)sys_call_table); + read_and_replace_syscall((void *)&aarch64_read, __AARCH64_read, (void *)hook_aarch64_read, (void *)sys_call_table); + +#if defined(CONFIG_COMPAT) + read_and_replace_syscall((void *)&armeabi_reboot, __ARMEABI_reboot, (void *)hook_armeabi_reboot, (void *)compat_sys_call_table); + + // will be unregged + read_and_replace_syscall((void *)&armeabi_fstat64, __ARMEABI_fstat64, (void *)hook_armeabi_fstat64_ret, (void *)compat_sys_call_table); + read_and_replace_syscall((void *)&armeabi_read, __ARMEABI_read, (void *)hook_armeabi_read, (void *)compat_sys_call_table); +#endif // COMPAT + + // start unreg kthread + kthread_run(ksu_syscall_table_restore, NULL, "unhook"); + return 0; +} +late_initcall(ksu_syscall_table_hook_init); + +// EOF diff --git a/drivers/kernelsu/include/arch.h b/drivers/kernelsu/include/arch.h new file mode 100644 index 000000000000..c80db6632efa --- /dev/null +++ b/drivers/kernelsu/include/arch.h @@ -0,0 +1,134 @@ +#ifndef __KSU_H_ARCH +#define __KSU_H_ARCH + +#if defined(__aarch64__) + +#define __PT_PARM1_REG regs[0] +#define __PT_PARM2_REG regs[1] +#define __PT_PARM3_REG regs[2] +#define __PT_SYSCALL_PARM4_REG regs[3] +#define __PT_CCALL_PARM4_REG regs[3] +#define __PT_PARM5_REG regs[4] +#define __PT_PARM6_REG regs[5] +#define __PT_RET_REG regs[30] +#define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG regs[0] +#define __PT_SP_REG sp +#define __PT_IP_REG pc + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) +#define SYS_EXECVE_SYMBOL "__arm64_sys_execve" +#define SYS_REBOOT_SYMBOL "__arm64_sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "__arm64_sys_newfstat" +#define SYS_FSTAT64_SYMBOL 
"__arm64_sys_fstat64" +#define SYS_READ_SYMBOL "__arm64_sys_read" +#define SYS_NEWFSTATAT_SYMBOL "__arm64_sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "__arm64_sys_faccessat" +#else +#define SYS_EXECVE_SYMBOL "sys_execve" +#define SYS_REBOOT_SYMBOL "sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "sys_newfstat" +#define SYS_FSTAT64_SYMBOL "sys_fstat64" +#define SYS_READ_SYMBOL "sys_read" +#define SYS_NEWFSTATAT_SYMBOL "sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "sys_faccessat" +#endif + +#elif defined(__arm__) + +// https://elixir.bootlin.com/linux/v6.17-rc6/source/tools/lib/bpf/bpf_tracing.h +#define __PT_PARM1_REG uregs[0] +#define __PT_PARM2_REG uregs[1] +#define __PT_PARM3_REG uregs[2] +#define __PT_PARM4_REG uregs[3] + +// seems to work atleast on 3.0 on samsung galaxy s3 +// nfi what im doing +#define __PT_SYSCALL_PARM4_REG uregs[3] +#define __PT_CCALL_PARM4_REG uregs[3] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG uregs[4] +#define __PT_PARM6_SYSCALL_REG uregs[5] +#define __PT_PARM7_SYSCALL_REG uregs[6] + +#define __PT_RET_REG uregs[14] +#define __PT_FP_REG uregs[11] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG uregs[0] +#define __PT_SP_REG uregs[13] +#define __PT_IP_REG uregs[12] + +#define SYS_EXECVE_SYMBOL "sys_execve" +#define SYS_REBOOT_SYMBOL "sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "sys_newfstat" +#define SYS_FSTAT64_SYMBOL "sys_fstat64" +#define SYS_READ_SYMBOL "sys_read" +#define SYS_NEWFSTATAT_SYMBOL "sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "sys_faccessat" + +#elif defined(__x86_64__) + +#define __PT_PARM1_REG di +#define __PT_PARM2_REG si +#define __PT_PARM3_REG dx +/* syscall uses r10 for PARM4 */ +#define __PT_SYSCALL_PARM4_REG r10 +#define __PT_CCALL_PARM4_REG cx +#define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 +#define __PT_RET_REG sp 
+#define __PT_FP_REG bp +#define __PT_RC_REG ax +#define __PT_SP_REG sp +#define __PT_IP_REG ip + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) +#define SYS_EXECVE_SYMBOL "__x64_sys_execve" +#define SYS_REBOOT_SYMBOL "__x64_sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "__x64_sys_newfstat" +#define SYS_FSTAT64_SYMBOL "__ia32_compat_sys_x86_fstat64" +#define SYS_READ_SYMBOL "__x64_sys_read" +#define SYS_NEWFSTATAT_SYMBOL "__x64_sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "__x64_sys_faccessat" +#else +#define SYS_EXECVE_SYMBOL "sys_execve" +#define SYS_REBOOT_SYMBOL "sys_reboot" +#define SYS_NEWFSTAT_SYMBOL "sys_newfstat" +#define SYS_FSTAT64_SYMBOL "sys_fstat64" +#define SYS_READ_SYMBOL "sys_read" +#define SYS_NEWFSTATAT_SYMBOL "sys_newfstatat" +#define SYS_FACCESSAT_SYMBOL "sys_faccessat" +#endif + +#else +#error "Unsupported arch" +#endif + +/* allow some architectures to override `struct pt_regs` */ +#ifndef __PT_REGS_CAST +#define __PT_REGS_CAST(x) (x) +#endif + +#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG) +#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG) +#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG) +#define PT_REGS_SYSCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_SYSCALL_PARM4_REG) +#define PT_REGS_CCALL_PARM4(x) (__PT_REGS_CAST(x)->__PT_CCALL_PARM4_REG) +#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG) +#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG) +#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG) +#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG) +#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG) +#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG) +#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG) + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) +#define PT_REAL_REGS(regs) ((struct pt_regs *)PT_REGS_PARM1(regs)) +#else +#define PT_REAL_REGS(regs) ((regs)) +#endif + +#endif diff --git a/drivers/kernelsu/klog.h b/drivers/kernelsu/include/klog.h
similarity index 82% rename from drivers/kernelsu/klog.h rename to drivers/kernelsu/include/klog.h index a934027fbeeb..6de40a66680e 100644 --- a/drivers/kernelsu/klog.h +++ b/drivers/kernelsu/include/klog.h @@ -1,8 +1,6 @@ #ifndef __KSU_H_KLOG #define __KSU_H_KLOG -#include - #ifdef pr_fmt #undef pr_fmt #define pr_fmt(fmt) "KernelSU: " fmt diff --git a/drivers/kernelsu/ksu.h b/drivers/kernelsu/include/ksu.h similarity index 75% rename from drivers/kernelsu/ksu.h rename to drivers/kernelsu/include/ksu.h index 32e81d967fff..2f5841290b1d 100644 --- a/drivers/kernelsu/ksu.h +++ b/drivers/kernelsu/include/ksu.h @@ -1,11 +1,7 @@ #ifndef __KSU_H_KSU #define __KSU_H_KSU -#include -#include -#include - -#define KERNEL_SU_VERSION 32430 +#define KERNEL_SU_VERSION 32485 #define EVENT_POST_FS_DATA 1 #define EVENT_BOOT_COMPLETED 2 @@ -25,6 +21,6 @@ static inline int endswith(const char *s, const char *t) return strcmp(s + slen - tlen, t); } -extern struct cred *ksu_cred; +extern struct cred* ksu_cred; #endif diff --git a/drivers/kernelsu/include/uapi/app_profile.h b/drivers/kernelsu/include/uapi/app_profile.h new file mode 100644 index 000000000000..7aa29e0f6293 --- /dev/null +++ b/drivers/kernelsu/include/uapi/app_profile.h @@ -0,0 +1,61 @@ +#ifndef __KSU_UAPI_APP_PROFILE_H +#define __KSU_UAPI_APP_PROFILE_H + +#define KSU_APP_PROFILE_VER 3 +#define KSU_MAX_PACKAGE_NAME 256 +/* NGROUPS_MAX for Linux is 65535 generally, but we only supports 32 groups. 
*/ +#define KSU_MAX_GROUPS 32 +#define KSU_SELINUX_DOMAIN 64 + +struct root_profile { + __s32 uid; + __s32 gid; + + __u32 groups_count; + __s32 groups[KSU_MAX_GROUPS]; + + /* kernel_cap_t is u32[2] for capabilities v3 */ + struct { + __u64 effective; + __u64 permitted; + __u64 inheritable; + } capabilities; + + char selinux_domain[KSU_SELINUX_DOMAIN]; + + __s32 namespaces; +}; + +struct non_root_profile { + bool umount_modules; +}; + +struct app_profile { + /* + * It may be utilized for backward compatibility, although we have never + * explicitly made any promises regarding this. + */ + __u32 version; + + /* this is usually the package of the app, but can be other value for special apps */ + char key[KSU_MAX_PACKAGE_NAME]; + __s32 curr_uid; + bool allow_su; + + union { + struct { + bool use_default; + char template_name[KSU_MAX_PACKAGE_NAME]; + + struct root_profile profile; + } rp_config; + + struct { + bool use_default; + + struct non_root_profile profile; + } nrp_config; + }; +}; + +#endif diff --git a/drivers/kernelsu/include/uapi/feature.h b/drivers/kernelsu/include/uapi/feature.h new file mode 100644 index 000000000000..b1b92f2fdc48 --- /dev/null +++ b/drivers/kernelsu/include/uapi/feature.h @@ -0,0 +1,14 @@ +#ifndef __KSU_UAPI_FEATURE_H +#define __KSU_UAPI_FEATURE_H + +enum ksu_feature_id { + KSU_FEATURE_SU_COMPAT = 0, + KSU_FEATURE_KERNEL_UMOUNT = 1, + KSU_FEATURE_SULOG = 2, + KSU_FEATURE_ADB_ROOT = 3, + KSU_FEATURE_SELINUX_HIDE = 4, + + KSU_FEATURE_MAX +}; + +#endif diff --git a/drivers/kernelsu/include/uapi/selinux.h b/drivers/kernelsu/include/uapi/selinux.h new file mode 100644 index 000000000000..960454f7f46a --- /dev/null +++ b/drivers/kernelsu/include/uapi/selinux.h @@ -0,0 +1,29 @@ +#ifndef __KSU_UAPI_SELINUX_H +#define __KSU_UAPI_SELINUX_H + +#define KSU_SEPOLICY_CMD_NORMAL_PERM 1 +#define KSU_SEPOLICY_CMD_XPERM 2 +#define KSU_SEPOLICY_CMD_TYPE_STATE 3 +#define KSU_SEPOLICY_CMD_TYPE 4 +#define KSU_SEPOLICY_CMD_TYPE_ATTR 5 +#define 
KSU_SEPOLICY_CMD_ATTR 6 +#define KSU_SEPOLICY_CMD_TYPE_TRANSITION 7 +#define KSU_SEPOLICY_CMD_TYPE_CHANGE 8 +#define KSU_SEPOLICY_CMD_GENFSCON 9 + +#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_ALLOW 1 +#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DENY 2 +#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_AUDITALLOW 3 +#define KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DONTAUDIT 4 + +#define KSU_SEPOLICY_SUBCMD_XPERM_ALLOW 1 +#define KSU_SEPOLICY_SUBCMD_XPERM_AUDITALLOW 2 +#define KSU_SEPOLICY_SUBCMD_XPERM_DONTAUDIT 3 + +#define KSU_SEPOLICY_SUBCMD_TYPE_STATE_PERMISSIVE 1 +#define KSU_SEPOLICY_SUBCMD_TYPE_STATE_ENFORCE 2 + +#define KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_CHANGE 1 +#define KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_MEMBER 2 + +#endif diff --git a/drivers/kernelsu/include/uapi/sulog.h b/drivers/kernelsu/include/uapi/sulog.h new file mode 100644 index 000000000000..9453a4bd0c16 --- /dev/null +++ b/drivers/kernelsu/include/uapi/sulog.h @@ -0,0 +1,32 @@ +#ifndef __KSU_UAPI_SULOG_H +#define __KSU_UAPI_SULOG_H + +#include +#include + +#define KSU_SULOG_EVENT_VERSION 1 +#ifndef TASK_COMM_LEN +#define TASK_COMM_LEN 16 +#endif + +enum ksu_sulog_event_type { + KSU_SULOG_EVENT_ROOT_EXECVE = 1, + KSU_SULOG_EVENT_SUCOMPAT = 2, + KSU_SULOG_EVENT_IOCTL_GRANT_ROOT = 3, +}; + +struct ksu_sulog_event { + __u16 version; + __u16 event_type; + __s32 retval; + __u32 pid; + __u32 tgid; + __u32 ppid; + __u32 uid; + __u32 euid; + char comm[TASK_COMM_LEN]; + __u32 filename_len; + __u32 argv_len; +} __packed; + +#endif diff --git a/drivers/kernelsu/include/uapi/supercall.h b/drivers/kernelsu/include/uapi/supercall.h new file mode 100644 index 000000000000..dbfc5f1158bd --- /dev/null +++ b/drivers/kernelsu/include/uapi/supercall.h @@ -0,0 +1,162 @@ +#ifndef __KSU_UAPI_SUPERCALL_H +#define __KSU_UAPI_SUPERCALL_H + +/* Magic numbers for reboot hook to install fd */ +#define KSU_INSTALL_MAGIC1 0xDEADBEEF +#define KSU_INSTALL_MAGIC2 0xCAFEBABE + +struct ksu_become_daemon_cmd { + __u8 token[65]; /* Input: daemon token 
(null-terminated) */ +}; + +#define EVENT_POST_FS_DATA 1 +#define EVENT_BOOT_COMPLETED 2 +#define EVENT_MODULE_MOUNTED 3 + +#define KSU_GET_INFO_FLAG_LKM (1U << 0) +#define KSU_GET_INFO_FLAG_MANAGER (1U << 1) +#define KSU_GET_INFO_FLAG_LATE_LOAD (1U << 2) +#define KSU_GET_INFO_FLAG_PR_BUILD (1U << 3) + +struct ksu_get_info_cmd { + __u32 version; /* Output: KERNEL_SU_VERSION */ + __u32 flags; /* Output: KSU_GET_INFO_FLAG_* bits */ + __u32 features; /* Output: max feature ID supported */ +}; + +struct ksu_report_event_cmd { + __u32 event; /* Input: EVENT_POST_FS_DATA, EVENT_BOOT_COMPLETED, etc. */ +}; + +struct ksu_set_sepolicy_cmd { + __u64 data_len; /* Input: bytes of serialized command payload */ + __aligned_u64 data; /* Input: pointer to serialized payload */ +}; + +struct ksu_sepolicy_cmd_hdr { + __u32 cmd; /* Input: command type, CMD_* */ + __u32 subcmd; /* Input: command subtype */ +}; +/* + * After each ksu_sepolicy_cmd_hdr, command arguments are encoded sequentially as: + * [u32 len][len bytes][\0], where len excludes the trailing '\0'. + * len == 0 represents ALL. + * Argument count is derived from cmd: + * KSU_SEPOLICY_CMD_NORMAL_PERM=4, KSU_SEPOLICY_CMD_XPERM=5, + * KSU_SEPOLICY_CMD_TYPE_STATE=1, KSU_SEPOLICY_CMD_TYPE=2, + * KSU_SEPOLICY_CMD_TYPE_ATTR=2, KSU_SEPOLICY_CMD_ATTR=1, + * KSU_SEPOLICY_CMD_TYPE_TRANSITION=5, KSU_SEPOLICY_CMD_TYPE_CHANGE=4, + * KSU_SEPOLICY_CMD_GENFSCON=3. 
+ */ + +struct ksu_check_safemode_cmd { + __u8 in_safe_mode; /* Output: true if in safe mode, false otherwise */ +}; + +/* deprecated */ +struct ksu_get_allow_list_cmd { + __u32 uids[128]; /* Output: array of allowed/denied UIDs */ + __u32 count; /* Output: number of UIDs in array */ + __u8 allow; /* Input: true for allow list, false for deny list */ +}; + +struct ksu_new_get_allow_list_cmd { + __u16 count; /* Input / Output: number of UIDs in array */ + __u16 total_count; /* Output: total number of UIDs in requested list */ + __u32 uids[0]; /* Output: array of allowed/denied UIDs */ +}; + +struct ksu_uid_granted_root_cmd { + __u32 uid; /* Input: target UID to check */ + __u8 granted; /* Output: true if granted, false otherwise */ +}; + +struct ksu_uid_should_umount_cmd { + __u32 uid; /* Input: target UID to check */ + __u8 should_umount; /* Output: true if should umount, false otherwise */ +}; + +struct ksu_get_manager_appid_cmd { + __u32 appid; /* Output: manager app id */ +}; + +struct ksu_get_app_profile_cmd { + struct app_profile profile; /* Input/Output: app profile structure */ +}; + +struct ksu_set_app_profile_cmd { + struct app_profile profile; /* Input: app profile structure */ +}; + +struct ksu_get_feature_cmd { + __u32 feature_id; /* Input: feature ID (enum ksu_feature_id) */ + __u64 value; /* Output: feature value/state */ + __u8 supported; /* Output: true if feature is supported, false otherwise */ +}; + +struct ksu_set_feature_cmd { + __u32 feature_id; /* Input: feature ID (enum ksu_feature_id) */ + __u64 value; /* Input: feature value/state to set */ +}; + +struct ksu_get_wrapper_fd_cmd { + __u32 fd; /* Input: userspace fd */ + __u32 flags; /* Input: flags of userspace fd */ +}; + +struct ksu_manage_mark_cmd { + __u32 operation; /* Input: KSU_MARK_* */ + __s32 pid; /* Input: target pid (0 for all processes) */ + __u32 result; /* Output: for get operation - mark status or reg_count */ +}; + +#define KSU_MARK_GET 1 +#define KSU_MARK_MARK 2 +#define 
KSU_MARK_UNMARK 3 +#define KSU_MARK_REFRESH 4 + +struct ksu_nuke_ext4_sysfs_cmd { + __aligned_u64 arg; /* Input: mnt pointer */ +}; + +struct ksu_add_try_umount_cmd { + __aligned_u64 arg; /* char ptr, this is the mountpoint */ + __u32 flags; /* this is the flag we use for it */ + __u8 mode; /* denotes what to do with it 0:wipe_list 1:add_to_list 2:delete_entry */ +}; + +struct ksu_get_sulog_fd_cmd { + __u32 flags; /* Input: reserved for future use, must be 0 */ +}; + +#define KSU_UMOUNT_WIPE 0 // ignore everything and wipe list +#define KSU_UMOUNT_ADD 1 // add entry (path + flags) +#define KSU_UMOUNT_DEL 2 // delete entry, strcmp + +// IOCTL command definitions +#define KSU_IOCTL_GRANT_ROOT _IOC(_IOC_NONE, 'K', 1, 0) +#define KSU_IOCTL_GET_INFO _IOC(_IOC_READ, 'K', 2, 0) +#define KSU_IOCTL_REPORT_EVENT _IOC(_IOC_WRITE, 'K', 3, 0) +#define KSU_IOCTL_SET_SEPOLICY _IOC(_IOC_READ|_IOC_WRITE, 'K', 4, 0) +#define KSU_IOCTL_CHECK_SAFEMODE _IOC(_IOC_READ, 'K', 5, 0) +// deprecated +#define KSU_IOCTL_GET_ALLOW_LIST _IOC(_IOC_READ|_IOC_WRITE, 'K', 6, 0) +// deprecated +#define KSU_IOCTL_GET_DENY_LIST _IOC(_IOC_READ|_IOC_WRITE, 'K', 7, 0) +#define KSU_IOCTL_NEW_GET_ALLOW_LIST _IOWR('K', 6, struct ksu_new_get_allow_list_cmd) +#define KSU_IOCTL_NEW_GET_DENY_LIST _IOWR('K', 7, struct ksu_new_get_allow_list_cmd) +#define KSU_IOCTL_UID_GRANTED_ROOT _IOC(_IOC_READ|_IOC_WRITE, 'K', 8, 0) +#define KSU_IOCTL_UID_SHOULD_UMOUNT _IOC(_IOC_READ|_IOC_WRITE, 'K', 9, 0) +#define KSU_IOCTL_GET_MANAGER_APPID _IOC(_IOC_READ, 'K', 10, 0) +#define KSU_IOCTL_GET_APP_PROFILE _IOC(_IOC_READ|_IOC_WRITE, 'K', 11, 0) +#define KSU_IOCTL_SET_APP_PROFILE _IOC(_IOC_WRITE, 'K', 12, 0) +#define KSU_IOCTL_GET_FEATURE _IOC(_IOC_READ|_IOC_WRITE, 'K', 13, 0) +#define KSU_IOCTL_SET_FEATURE _IOC(_IOC_WRITE, 'K', 14, 0) +#define KSU_IOCTL_GET_WRAPPER_FD _IOC(_IOC_WRITE, 'K', 15, 0) +#define KSU_IOCTL_MANAGE_MARK _IOC(_IOC_READ|_IOC_WRITE, 'K', 16, 0) +#define KSU_IOCTL_NUKE_EXT4_SYSFS _IOC(_IOC_WRITE, 'K', 17, 0) 
+#define KSU_IOCTL_ADD_TRY_UMOUNT _IOC(_IOC_WRITE, 'K', 18, 0) +#define KSU_IOCTL_SET_INIT_PGRP _IO('K', 19) +#define KSU_IOCTL_GET_SULOG_FD _IOW('K', 20, struct ksu_get_sulog_fd_cmd) + +#endif diff --git a/drivers/kernelsu/infra/event_queue.c b/drivers/kernelsu/infra/event_queue.c new file mode 100644 index 000000000000..333a10c0c523 --- /dev/null +++ b/drivers/kernelsu/infra/event_queue.c @@ -0,0 +1,393 @@ +struct ksu_event_queue_node { + struct list_head list; + struct ksu_event_record_hdr hdr; + __u8 payload[]; +}; + +static size_t ksu_event_queue_record_size(__u32 payload_len) +{ + return sizeof(struct ksu_event_record_hdr) + payload_len; +} + +static void ksu_event_queue_note_drop_locked(struct ksu_event_queue *queue, __u64 seq) +{ + queue->dropped_total++; + if (!queue->dropped_pending) { + queue->dropped_first_seq = seq; + } + queue->dropped_pending++; + queue->dropped_last_seq = seq; +} + +static bool ksu_event_queue_has_data_locked(const struct ksu_event_queue *queue) +{ + return queue->dropped_pending || queue->dropped_inflight || !list_empty(&queue->pending); +} + +static void ksu_event_queue_mark_closed(struct ksu_event_queue *queue) +{ + unsigned long irq_flags; + + spin_lock_irqsave(&queue->lock, irq_flags); + queue->closed = true; + spin_unlock_irqrestore(&queue->lock, irq_flags); +} + +void ksu_event_queue_init(struct ksu_event_queue *queue, __u32 max_queued, __u32 max_payload_len) +{ + spin_lock_init(&queue->lock); + mutex_init(&queue->read_lock); + INIT_LIST_HEAD(&queue->pending); + init_waitqueue_head(&queue->read_wait); + queue->queued = 0; + queue->max_queued = max_queued; + queue->max_payload_len = max_payload_len; + queue->next_seq = 1; + queue->dropped_total = 0; + queue->dropped_pending = 0; + queue->dropped_first_seq = 0; + queue->dropped_last_seq = 0; + queue->dropped_inflight = 0; + queue->dropped_inflight_first_seq = 0; + queue->dropped_inflight_last_seq = 0; + queue->closed = false; +} + +void ksu_event_queue_destroy(struct 
ksu_event_queue *queue) +{ + struct ksu_event_queue_node *node, *tmp; + unsigned long irq_flags; + + ksu_event_queue_mark_closed(queue); + wake_up_interruptible_poll(&queue->read_wait, EPOLLHUP | POLLHUP); + + mutex_lock(&queue->read_lock); + spin_lock_irqsave(&queue->lock, irq_flags); + list_for_each_entry_safe (node, tmp, &queue->pending, list) { + list_del(&node->list); + kfree(node); + } + queue->queued = 0; + queue->dropped_pending = 0; + queue->dropped_first_seq = 0; + queue->dropped_last_seq = 0; + queue->dropped_inflight = 0; + queue->dropped_inflight_first_seq = 0; + queue->dropped_inflight_last_seq = 0; + spin_unlock_irqrestore(&queue->lock, irq_flags); + mutex_unlock(&queue->read_lock); + + wake_up_interruptible_poll(&queue->read_wait, EPOLLHUP | POLLHUP); +} + +int ksu_event_queue_push(struct ksu_event_queue *queue, __u16 type, __u16 flags, const void *payload, __u32 len, gfp_t gfp) +{ + struct ksu_event_queue_node *node = NULL; + unsigned long irq_flags; + __u64 seq; + bool wake = false; + int ret = 0; + + if (len > queue->max_payload_len) { + return -EMSGSIZE; + } + + if (len && !payload) { + return -EINVAL; + } + + node = kmalloc(struct_size(node, payload, len), gfp); + + if (node) { + INIT_LIST_HEAD(&node->list); + node->hdr.type = type; + node->hdr.flags = flags; + node->hdr.len = len; + node->hdr.ts_ns = 0; + node->hdr.seq = 0; + + if (len) { + memcpy(node->payload, payload, len); + } + } + + spin_lock_irqsave(&queue->lock, irq_flags); + if (queue->closed) { + ret = -EPIPE; + goto out_unlock; + } + + seq = queue->next_seq++; + if (!node || (queue->max_queued && queue->queued >= queue->max_queued)) { + ksu_event_queue_note_drop_locked(queue, seq); + wake = true; + ret = node ? 
-ENOSPC : -ENOMEM; + goto out_unlock; + } + + node->hdr.seq = seq; + node->hdr.ts_ns = ktime_get_ns(); + list_add_tail(&node->list, &queue->pending); + queue->queued++; + wake = true; + +out_unlock: + spin_unlock_irqrestore(&queue->lock, irq_flags); + + if (ret && node) { + kfree(node); + } + + if (wake) { + wake_up_interruptible_poll(&queue->read_wait, EPOLLIN | EPOLLRDNORM); + } + + return ret; +} + +void ksu_event_queue_drop(struct ksu_event_queue *queue) +{ + unsigned long irq_flags; + __u64 seq; + + spin_lock_irqsave(&queue->lock, irq_flags); + if (queue->closed) { + spin_unlock_irqrestore(&queue->lock, irq_flags); + return; + } + + seq = queue->next_seq++; + ksu_event_queue_note_drop_locked(queue, seq); + spin_unlock_irqrestore(&queue->lock, irq_flags); + + wake_up_interruptible_poll(&queue->read_wait, EPOLLIN | EPOLLRDNORM); +} + +static int ksu_event_queue_wait_ready(struct ksu_event_queue *queue, int file_flags) +{ + int ret; + + for (;;) { + if (ksu_event_queue_has_data(queue)) { + return 0; + } + + if (READ_ONCE(queue->closed)) { + return 0; + } + + if (file_flags & O_NONBLOCK) { + return -EAGAIN; + } + + ret = wait_event_interruptible(queue->read_wait, queue->closed || ksu_event_queue_has_data(queue)); + if (ret) { + return ret; + } + } +} + +static ssize_t ksu_event_queue_read_drop(struct ksu_event_queue *queue, char __user *buf, size_t count) +{ + struct ksu_event_record_hdr hdr; + struct ksu_event_queue_dropped_info info; + size_t record_size = ksu_event_queue_record_size(sizeof(info)); + unsigned long irq_flags; + + spin_lock_irqsave(&queue->lock, irq_flags); + if (!queue->dropped_pending) { + spin_unlock_irqrestore(&queue->lock, irq_flags); + return 0; + } + if (count < record_size) { + spin_unlock_irqrestore(&queue->lock, irq_flags); + return -EMSGSIZE; + } + + hdr.type = KSU_EVENT_QUEUE_TYPE_DROPPED; + hdr.flags = KSU_EVENT_RECORD_FLAG_INTERNAL; + hdr.len = sizeof(info); + hdr.seq = queue->dropped_first_seq; + hdr.ts_ns = ktime_get_ns(); + + 
info.dropped = queue->dropped_pending; + info.first_seq = queue->dropped_first_seq; + info.last_seq = queue->dropped_last_seq; + + queue->dropped_inflight = queue->dropped_pending; + queue->dropped_inflight_first_seq = queue->dropped_first_seq; + queue->dropped_inflight_last_seq = queue->dropped_last_seq; + queue->dropped_pending = 0; + queue->dropped_first_seq = 0; + queue->dropped_last_seq = 0; + spin_unlock_irqrestore(&queue->lock, irq_flags); + + if (copy_to_user(buf, &hdr, sizeof(hdr))) { + goto out_restore; + } + + if (copy_to_user(buf + sizeof(hdr), &info, sizeof(info))) { + goto out_restore; + } + + spin_lock_irqsave(&queue->lock, irq_flags); + queue->dropped_inflight = 0; + queue->dropped_inflight_first_seq = 0; + queue->dropped_inflight_last_seq = 0; + spin_unlock_irqrestore(&queue->lock, irq_flags); + + return record_size; + +out_restore: + spin_lock_irqsave(&queue->lock, irq_flags); + if (!queue->dropped_pending) { + queue->dropped_pending = queue->dropped_inflight; + queue->dropped_first_seq = queue->dropped_inflight_first_seq; + queue->dropped_last_seq = queue->dropped_inflight_last_seq; + } else { + queue->dropped_pending += queue->dropped_inflight; + queue->dropped_first_seq = queue->dropped_inflight_first_seq; + } + queue->dropped_inflight = 0; + queue->dropped_inflight_first_seq = 0; + queue->dropped_inflight_last_seq = 0; + spin_unlock_irqrestore(&queue->lock, irq_flags); + + return -EFAULT; +} + +static ssize_t ksu_event_queue_read_node(struct ksu_event_queue *queue, char __user *buf, size_t count) +{ + struct ksu_event_queue_node *node; + struct list_head *first; + size_t record_size; + unsigned long irq_flags; + + spin_lock_irqsave(&queue->lock, irq_flags); + if (list_empty(&queue->pending)) { + spin_unlock_irqrestore(&queue->lock, irq_flags); + return 0; + } + + first = queue->pending.next; + node = list_entry(first, struct ksu_event_queue_node, list); + record_size = ksu_event_queue_record_size(node->hdr.len); + if (count < record_size) { + 
spin_unlock_irqrestore(&queue->lock, irq_flags); + return -EMSGSIZE; + } + spin_unlock_irqrestore(&queue->lock, irq_flags); + + if (copy_to_user(buf, &node->hdr, sizeof(node->hdr))) { + return -EFAULT; + } + + if (node->hdr.len && copy_to_user(buf + sizeof(node->hdr), node->payload, node->hdr.len)) { + return -EFAULT; + } + + spin_lock_irqsave(&queue->lock, irq_flags); + list_del(first); + queue->queued--; + spin_unlock_irqrestore(&queue->lock, irq_flags); + + kfree(node); + return record_size; +} + +ssize_t ksu_event_queue_read(struct ksu_event_queue *queue, char __user *buf, size_t count, int file_flags) +{ + ssize_t ret; + ssize_t copied = 0; + + if (!count) { + return 0; + } + + ret = mutex_lock_interruptible(&queue->read_lock); + if (ret) { + return ret; + } + + ret = ksu_event_queue_wait_ready(queue, file_flags); + if (ret) { + copied = ret; + goto out_unlock; + } + + while (count > 0) { + ret = ksu_event_queue_read_drop(queue, buf, count); + if (ret < 0) { + if (!copied) { + copied = ret; + } + break; + } + if (ret > 0) { + copied += ret; + buf += ret; + count -= ret; + continue; + } + + ret = ksu_event_queue_read_node(queue, buf, count); + if (ret < 0) { + if (!copied) { + copied = ret; + } + break; + } + if (ret == 0) { + break; + } + + copied += ret; + buf += ret; + count -= ret; + } + + if (!copied && READ_ONCE(queue->closed)) { + copied = 0; + } + +out_unlock: + mutex_unlock(&queue->read_lock); + return copied; +} + +unsigned __bitwise ksu_event_queue_poll(struct ksu_event_queue *queue, struct file *file, poll_table *wait) +{ + unsigned __bitwise mask = 0; + unsigned long irq_flags; + + poll_wait(file, &queue->read_wait, wait); + + spin_lock_irqsave(&queue->lock, irq_flags); + if (ksu_event_queue_has_data_locked(queue)) { + mask |= POLLIN | POLLRDNORM; + } + if (queue->closed) { + mask |= POLLHUP; + } + spin_unlock_irqrestore(&queue->lock, irq_flags); + + return mask; +} + +void ksu_event_queue_close(struct ksu_event_queue *queue) +{ + 
ksu_event_queue_mark_closed(queue); + wake_up_interruptible_poll(&queue->read_wait, EPOLLHUP | POLLHUP); +} + +bool ksu_event_queue_has_data(struct ksu_event_queue *queue) +{ + bool has_data; + unsigned long irq_flags; + + spin_lock_irqsave(&queue->lock, irq_flags); + has_data = ksu_event_queue_has_data_locked(queue); + spin_unlock_irqrestore(&queue->lock, irq_flags); + + return has_data; +} diff --git a/drivers/kernelsu/infra/event_queue.h b/drivers/kernelsu/infra/event_queue.h new file mode 100644 index 000000000000..2170f64fd8c8 --- /dev/null +++ b/drivers/kernelsu/infra/event_queue.h @@ -0,0 +1,54 @@ +#ifndef KSU_EVENT_QUEUE_H +#define KSU_EVENT_QUEUE_H + +#define KSU_EVENT_RECORD_FLAG_INTERNAL (1U << 0) +#define KSU_EVENT_QUEUE_TYPE_DROPPED ((__u16)0xFFFF) + +struct ksu_event_record_hdr { + __u16 type; + __u16 flags; + __u32 len; + __u64 seq; + __u64 ts_ns; +}; + +struct ksu_event_queue_dropped_info { + __u64 dropped; + __u64 first_seq; + __u64 last_seq; +}; + +struct ksu_event_queue { + spinlock_t lock; + /* The first implementation supports a single reader. 
*/ + struct mutex read_lock; + struct list_head pending; + wait_queue_head_t read_wait; + __u32 queued; + __u32 max_queued; + __u32 max_payload_len; + __u64 next_seq; + __u64 dropped_total; + __u64 dropped_pending; + __u64 dropped_first_seq; + __u64 dropped_last_seq; + __u64 dropped_inflight; + __u64 dropped_inflight_first_seq; + __u64 dropped_inflight_last_seq; + bool closed; +}; + +void ksu_event_queue_init(struct ksu_event_queue *queue, __u32 max_queued, __u32 max_payload_len); +void ksu_event_queue_destroy(struct ksu_event_queue *queue); + +int ksu_event_queue_push(struct ksu_event_queue *queue, __u16 type, __u16 flags, const void *payload, __u32 len, + gfp_t gfp); +void ksu_event_queue_drop(struct ksu_event_queue *queue); + +ssize_t ksu_event_queue_read(struct ksu_event_queue *queue, char __user *buf, size_t count, int file_flags); +unsigned __bitwise ksu_event_queue_poll(struct ksu_event_queue *queue, struct file *file, poll_table *wait); + +void ksu_event_queue_close(struct ksu_event_queue *queue); +bool ksu_event_queue_has_data(struct ksu_event_queue *queue); + +#endif // KSU_EVENT_QUEUE_H diff --git a/drivers/kernelsu/file_wrapper.c b/drivers/kernelsu/infra/file_wrapper.c similarity index 54% rename from drivers/kernelsu/file_wrapper.c rename to drivers/kernelsu/infra/file_wrapper.c index f2b252334645..98bb2539073a 100644 --- a/drivers/kernelsu/file_wrapper.c +++ b/drivers/kernelsu/infra/file_wrapper.c @@ -1,23 +1,3 @@ -#include -#include -#include -#include -#include // kernel 3.18 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "objsec.h" -#include "ksud.h" - struct ksu_file_wrapper { struct file *orig; struct file_operations ops; @@ -28,8 +8,7 @@ static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp); static int ksu_wrapper_open(struct inode *ino, struct file *fp) { struct path *orig_path = fp->f_path.dentry->d_fsdata; - struct file *orig_file = - 
dentry_open(orig_path, fp->f_flags, current_cred()); + struct file *orig_file = dentry_open(orig_path, fp->f_flags, current_cred()); if (IS_ERR(orig_file)) { return PTR_ERR(orig_file); } @@ -49,151 +28,136 @@ static const struct file_operations ksu_file_wrapper_inode_fops = { .open = ksu_wrapper_open }; -static loff_t ksu_wrapper_llseek(struct file *fp, loff_t off, int flags) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static loff_t ksu_wrapper_llseek(struct file *fp, loff_t off, int flags) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->llseek(data->orig, off, flags); } -static ssize_t ksu_wrapper_read(struct file *fp, char __user *ptr, size_t sz, - loff_t *off) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static ssize_t ksu_wrapper_read(struct file *fp, char __user *ptr, size_t sz, loff_t *off) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->read(orig, ptr, sz, off); } -static ssize_t ksu_wrapper_write(struct file *fp, const char __user *ptr, - size_t sz, loff_t *off) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static ssize_t ksu_wrapper_write(struct file *fp, const char __user *ptr, size_t sz, loff_t *off) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->write(orig, ptr, sz, off); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) -static ssize_t ksu_wrapper_read_iter(struct kiocb *iocb, struct iov_iter *iovi) -{ - struct ksu_file_wrapper *data = iocb->ki_filp->private_data; - struct file *orig = data->orig; +static ssize_t ksu_wrapper_read_iter(struct kiocb *iocb, struct iov_iter *iovi) { + struct ksu_file_wrapper* data = iocb->ki_filp->private_data; + struct file* orig = data->orig; iocb->ki_filp = orig; return orig->f_op->read_iter(iocb, iovi); } 
-static ssize_t ksu_wrapper_write_iter(struct kiocb *iocb, struct iov_iter *iovi) -{ - struct ksu_file_wrapper *data = iocb->ki_filp->private_data; - struct file *orig = data->orig; +static ssize_t ksu_wrapper_write_iter(struct kiocb *iocb, struct iov_iter *iovi) { + struct ksu_file_wrapper* data = iocb->ki_filp->private_data; + struct file* orig = data->orig; iocb->ki_filp = orig; return orig->f_op->write_iter(iocb, iovi); } #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) -static int ksu_wrapper_iopoll(struct kiocb *kiocb, struct io_comp_batch *icb, - unsigned int v) -{ - struct ksu_file_wrapper *data = kiocb->ki_filp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_iopoll(struct kiocb *kiocb, struct io_comp_batch* icb, unsigned int v) { + struct ksu_file_wrapper* data = kiocb->ki_filp->private_data; + struct file* orig = data->orig; kiocb->ki_filp = orig; return orig->f_op->iopoll(kiocb, icb, v); } #elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) -static int ksu_wrapper_iopoll(struct kiocb *kiocb, bool spin) -{ - struct ksu_file_wrapper *data = kiocb->ki_filp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_iopoll(struct kiocb *kiocb, bool spin) { + struct ksu_file_wrapper* data = kiocb->ki_filp->private_data; + struct file* orig = data->orig; kiocb->ki_filp = orig; return orig->f_op->iopoll(kiocb, spin); } #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) -static int ksu_wrapper_iterate(struct file *fp, struct dir_context *dc) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) && (LINUX_VERSION_CODE > KERNEL_VERSION(3, 11, 0) || defined(KSU_HAS_ITERATE_DIR)) +static int ksu_wrapper_iterate (struct file *fp, struct dir_context *dc) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->iterate(orig, dc); } +#endif + +// int (*readdir) (struct file *, void *, 
filldir_t); +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR) +static int ksu_wrapper_readdir(struct file *fp, void *ptr, filldir_t filler) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; + return orig->f_op->readdir(orig, ptr, filler); +} #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) -static int ksu_wrapper_iterate_shared(struct file *fp, struct dir_context *dc) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_iterate_shared(struct file *fp, struct dir_context *dc) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->iterate_shared(orig, dc); } #endif // typedef unsigned __bitwise __poll_t; -static unsigned __bitwise ksu_wrapper_poll(struct file *fp, - struct poll_table_struct *pts) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static unsigned __bitwise ksu_wrapper_poll(struct file *fp, struct poll_table_struct *pts) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->poll(orig, pts); } -static long ksu_wrapper_unlocked_ioctl(struct file *fp, unsigned int cmd, - unsigned long arg) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static long ksu_wrapper_unlocked_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->unlocked_ioctl(orig, cmd, arg); } -static long ksu_wrapper_compat_ioctl(struct file *fp, unsigned int cmd, - unsigned long arg) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static long ksu_wrapper_compat_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return 
orig->f_op->compat_ioctl(orig, cmd, arg); } -static int ksu_wrapper_mmap(struct file *fp, struct vm_area_struct *vma) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_mmap(struct file *fp, struct vm_area_struct * vma) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->mmap(orig, vma); } -static int ksu_wrapper_flush(struct file *fp, fl_owner_t id) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_flush(struct file *fp, fl_owner_t id) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->flush(orig, id); } -static int ksu_wrapper_fsync(struct file *fp, loff_t off1, loff_t off2, - int datasync) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; + +static int ksu_wrapper_fsync(struct file *fp, loff_t off1, loff_t off2, int datasync) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->fsync(orig, off1, off2, datasync); } -static int ksu_wrapper_fasync(int arg, struct file *fp, int arg2) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_fasync(int arg, struct file *fp, int arg2) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->fasync(arg, orig, arg2); } -static int ksu_wrapper_lock(struct file *fp, int arg1, struct file_lock *fl) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_lock(struct file *fp, int arg1, struct file_lock *fl) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; return orig->f_op->lock(orig, arg1, fl); } + #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) -static ssize_t ksu_wrapper_sendpage(struct file *fp, 
struct page *pg, int arg1, - size_t sz, loff_t *off, int arg2) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static ssize_t ksu_wrapper_sendpage(struct file *fp, struct page *pg, int arg1, size_t sz, loff_t *off, int arg2) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->sendpage) { return orig->f_op->sendpage(orig, pg, arg1, sz, off, arg2); } @@ -201,51 +165,38 @@ static ssize_t ksu_wrapper_sendpage(struct file *fp, struct page *pg, int arg1, } #endif -static unsigned long ksu_wrapper_get_unmapped_area(struct file *fp, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static unsigned long ksu_wrapper_get_unmapped_area(struct file *fp, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->get_unmapped_area) { - return orig->f_op->get_unmapped_area(orig, arg1, arg2, arg3, - arg4); + return orig->f_op->get_unmapped_area(orig, arg1, arg2, arg3, arg4); } return -EINVAL; } // static int ksu_wrapper_check_flags(int arg) {} -static int ksu_wrapper_flock(struct file *fp, int arg1, struct file_lock *fl) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_flock(struct file *fp, int arg1, struct file_lock *fl) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->flock) { return orig->f_op->flock(orig, arg1, fl); } return -EINVAL; } -static ssize_t ksu_wrapper_splice_write(struct pipe_inode_info *pii, - struct file *fp, loff_t *off, size_t sz, - unsigned int arg1) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static ssize_t ksu_wrapper_splice_write(struct pipe_inode_info * 
pii, struct file *fp, loff_t *off, size_t sz, unsigned int arg1) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->splice_write) { return orig->f_op->splice_write(pii, orig, off, sz, arg1); } return -EINVAL; } -static ssize_t ksu_wrapper_splice_read(struct file *fp, loff_t *off, - struct pipe_inode_info *pii, size_t sz, - unsigned int arg1) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static ssize_t ksu_wrapper_splice_read(struct file *fp, loff_t *off, struct pipe_inode_info *pii, size_t sz, unsigned int arg1) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->splice_read) { return orig->f_op->splice_read(orig, off, pii, sz, arg1); } @@ -253,10 +204,9 @@ static ssize_t ksu_wrapper_splice_read(struct file *fp, loff_t *off, } #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) -void ksu_wrapper_splice_eof(struct file *fp) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +void ksu_wrapper_splice_eof(struct file *fp) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->splice_eof) { return orig->f_op->splice_eof(orig); } @@ -264,46 +214,36 @@ void ksu_wrapper_splice_eof(struct file *fp) #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) -static int ksu_wrapper_setlease(struct file *fp, int arg1, - struct file_lease **fl, void **p) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_setlease(struct file *fp, int arg1, struct file_lease **fl, void **p) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->setlease) { return orig->f_op->setlease(orig, arg1, fl, p); } return -EINVAL; } #elif LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) -static int ksu_wrapper_setlease(struct file *fp, int arg1, - struct file_lock **fl, 
void **p) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_setlease(struct file *fp, int arg1, struct file_lock **fl, void **p) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->setlease) { return orig->f_op->setlease(orig, arg1, fl, p); } return -EINVAL; } -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) -// int (*setlease)(struct file *, long, struct file_lock **, void **); -static int ksu_wrapper_setlease(struct file *fp, long arg1, - struct file_lock **fl, void **p) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) // int (*setlease)(struct file *, long, struct file_lock **, void **); +static int ksu_wrapper_setlease(struct file *fp, long arg1, struct file_lock **fl, void **p) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->setlease) { return orig->f_op->setlease(orig, arg1, fl, p); } return -EINVAL; } -#else -// int (*setlease)(struct file *, long, struct file_lock **); -static int ksu_wrapper_setlease(struct file *fp, long arg1, - struct file_lock **fl) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +#else // int (*setlease)(struct file *, long, struct file_lock **); +static int ksu_wrapper_setlease(struct file *fp, long arg1, struct file_lock **fl) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->setlease) { return orig->f_op->setlease(orig, arg1, fl); } @@ -311,11 +251,9 @@ static int ksu_wrapper_setlease(struct file *fp, long arg1, } #endif -static long ksu_wrapper_fallocate(struct file *fp, int mode, loff_t offset, - loff_t len) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static long ksu_wrapper_fallocate(struct file *fp, int mode, loff_t offset, 
loff_t len) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->fallocate) { return orig->f_op->fallocate(orig, mode, offset, len); } @@ -323,19 +261,17 @@ static long ksu_wrapper_fallocate(struct file *fp, int mode, loff_t offset, } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) -static void ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct ksu_file_wrapper *data = f->private_data; - struct file *orig = data->orig; +static void ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) { + struct ksu_file_wrapper* data = f->private_data; + struct file* orig = data->orig; if (orig->f_op->show_fdinfo) { orig->f_op->show_fdinfo(m, orig); } } -#else -static int ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct ksu_file_wrapper *data = f->private_data; - struct file *orig = data->orig; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +static int ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) { + struct ksu_file_wrapper* data = f->private_data; + struct file* orig = data->orig; if (orig->f_op->show_fdinfo) { orig->f_op->show_fdinfo(m, orig); } @@ -345,15 +281,11 @@ static int ksu_wrapper_show_fdinfo(struct seq_file *m, struct file *f) #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) // https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/read_write.c;l=1593-1606;drc=398da7defe218d3e51b0f3bdff75147e28125b60 -static ssize_t ksu_wrapper_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, - loff_t pos_out, size_t len, - unsigned int flags) -{ - struct ksu_file_wrapper *data = file_out->private_data; - struct file *orig = data->orig; - return orig->f_op->copy_file_range(file_in, pos_in, orig, pos_out, len, - flags); +static ssize_t ksu_wrapper_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, + loff_t pos_out, size_t len, unsigned int flags) { + struct ksu_file_wrapper* 
data = file_out->private_data; + struct file* orig = data->orig; + return orig->f_op->copy_file_range(file_in, pos_in, orig, pos_out, len, flags); } #endif @@ -364,30 +296,24 @@ static ssize_t ksu_wrapper_copy_file_range(struct file *file_in, loff_t pos_in, // REMAP_FILE_DEDUP: use file_out // https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/remap_range.c;l=483-484;drc=398da7defe218d3e51b0f3bdff75147e28125b60 static loff_t ksu_wrapper_remap_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, - loff_t pos_out, loff_t len, - unsigned int remap_flags) -{ + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { if (remap_flags & REMAP_FILE_DEDUP) { - struct ksu_file_wrapper *data = file_out->private_data; - struct file *orig = data->orig; - return orig->f_op->remap_file_range(file_in, pos_in, orig, - pos_out, len, remap_flags); + struct ksu_file_wrapper* data = file_out->private_data; + struct file* orig = data->orig; + return orig->f_op->remap_file_range(file_in, pos_in, orig, pos_out, len, remap_flags); } else { - struct ksu_file_wrapper *data = file_in->private_data; - struct file *orig = data->orig; - return orig->f_op->remap_file_range(orig, pos_in, file_out, - pos_out, len, remap_flags); + struct ksu_file_wrapper* data = file_in->private_data; + struct file* orig = data->orig; + return orig->f_op->remap_file_range(orig, pos_in, file_out, pos_out, len, remap_flags); } } #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) -static int ksu_wrapper_fadvise(struct file *fp, loff_t off1, loff_t off2, - int flags) -{ - struct ksu_file_wrapper *data = fp->private_data; - struct file *orig = data->orig; +static int ksu_wrapper_fadvise(struct file *fp, loff_t off1, loff_t off2, int flags) { + struct ksu_file_wrapper* data = fp->private_data; + struct file* orig = data->orig; if (orig->f_op->fadvise) { return orig->f_op->fadvise(orig, off1, off2, flags); } @@ -397,8 +323,7 @@ static 
int ksu_wrapper_fadvise(struct file *fp, loff_t off1, loff_t off2, static void ksu_release_file_wrapper(struct ksu_file_wrapper *data); -static int ksu_wrapper_release(struct inode *inode, struct file *filp) -{ +static int ksu_wrapper_release(struct inode *inode, struct file *filp) { // https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/fs/file_table.c;l=467-473;drc=3be0b283b562eabbc2b1f3bb534dc8903079bbaa // f_op->release is called before fops_put(f_op), so we put it manually. fops_put(filp->f_op); @@ -408,10 +333,8 @@ static int ksu_wrapper_release(struct inode *inode, struct file *filp) return 0; } -static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp) -{ - struct ksu_file_wrapper *p = - kcalloc(1, sizeof(struct ksu_file_wrapper), GFP_KERNEL); +static struct ksu_file_wrapper* ksu_create_file_wrapper(struct file* fp) { + struct ksu_file_wrapper* p = kcalloc(1, sizeof(struct ksu_file_wrapper), GFP_KERNEL); if (!p) { return ERR_PTR(-ENOMEM); } @@ -425,24 +348,23 @@ static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp) p->ops.write = fp->f_op->write ? ksu_wrapper_write : NULL; #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) p->ops.read_iter = fp->f_op->read_iter ? ksu_wrapper_read_iter : NULL; - p->ops.write_iter = - fp->f_op->write_iter ? ksu_wrapper_write_iter : NULL; + p->ops.write_iter = fp->f_op->write_iter ? ksu_wrapper_write_iter : NULL; #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) p->ops.iopoll = fp->f_op->iopoll ? ksu_wrapper_iopoll : NULL; #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) && (LINUX_VERSION_CODE > KERNEL_VERSION(3, 11, 0) || defined(KSU_HAS_ITERATE_DIR)) p->ops.iterate = fp->f_op->iterate ? ksu_wrapper_iterate : NULL; #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR) + p->ops.readdir = fp->f_op->readdir ? 
ksu_wrapper_readdir : NULL; +#endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) - p->ops.iterate_shared = - fp->f_op->iterate_shared ? ksu_wrapper_iterate_shared : NULL; + p->ops.iterate_shared = fp->f_op->iterate_shared ? ksu_wrapper_iterate_shared : NULL; #endif p->ops.poll = fp->f_op->poll ? ksu_wrapper_poll : NULL; - p->ops.unlocked_ioctl = - fp->f_op->unlocked_ioctl ? ksu_wrapper_unlocked_ioctl : NULL; - p->ops.compat_ioctl = - fp->f_op->compat_ioctl ? ksu_wrapper_compat_ioctl : NULL; + p->ops.unlocked_ioctl = fp->f_op->unlocked_ioctl ? ksu_wrapper_unlocked_ioctl : NULL; + p->ops.compat_ioctl = fp->f_op->compat_ioctl ? ksu_wrapper_compat_ioctl : NULL; p->ops.mmap = fp->f_op->mmap ? ksu_wrapper_mmap : NULL; #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) p->ops.fop_flags = fp->f_op->fop_flags; @@ -457,34 +379,27 @@ static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp) #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) p->ops.sendpage = fp->f_op->sendpage ? ksu_wrapper_sendpage : NULL; #endif - p->ops.get_unmapped_area = fp->f_op->get_unmapped_area ? - ksu_wrapper_get_unmapped_area : - NULL; + p->ops.get_unmapped_area = fp->f_op->get_unmapped_area ? ksu_wrapper_get_unmapped_area : NULL; p->ops.check_flags = fp->f_op->check_flags; p->ops.flock = fp->f_op->flock ? ksu_wrapper_flock : NULL; - p->ops.splice_write = - fp->f_op->splice_write ? ksu_wrapper_splice_write : NULL; - p->ops.splice_read = - fp->f_op->splice_read ? ksu_wrapper_splice_read : NULL; + p->ops.splice_write = fp->f_op->splice_write ? ksu_wrapper_splice_write : NULL; + p->ops.splice_read = fp->f_op->splice_read ? ksu_wrapper_splice_read : NULL; p->ops.setlease = fp->f_op->setlease ? ksu_wrapper_setlease : NULL; p->ops.fallocate = fp->f_op->fallocate ? ksu_wrapper_fallocate : NULL; - p->ops.show_fdinfo = - fp->f_op->show_fdinfo ? ksu_wrapper_show_fdinfo : NULL; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + p->ops.show_fdinfo = fp->f_op->show_fdinfo ? 
ksu_wrapper_show_fdinfo : NULL; +#endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) - p->ops.copy_file_range = - fp->f_op->copy_file_range ? ksu_wrapper_copy_file_range : NULL; + p->ops.copy_file_range = fp->f_op->copy_file_range ? ksu_wrapper_copy_file_range : NULL; #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) - p->ops.remap_file_range = fp->f_op->remap_file_range ? - ksu_wrapper_remap_file_range : - NULL; + p->ops.remap_file_range = fp->f_op->remap_file_range ? ksu_wrapper_remap_file_range : NULL; #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) p->ops.fadvise = fp->f_op->fadvise ? ksu_wrapper_fadvise : NULL; #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) - p->ops.splice_eof = - fp->f_op->splice_eof ? ksu_wrapper_splice_eof : NULL; + p->ops.splice_eof = fp->f_op->splice_eof ? ksu_wrapper_splice_eof : NULL; #endif return p; @@ -492,12 +407,12 @@ static struct ksu_file_wrapper *ksu_create_file_wrapper(struct file *fp) static void ksu_release_file_wrapper(struct ksu_file_wrapper *data) { - fput((struct file *)data->orig); + fput((struct file*) data->orig); kfree(data); } static char *ksu_wrapper_d_dname(struct dentry *dentry, char *buffer, - int buflen) + int buflen) { struct path *orig_path = dentry->d_fsdata; return d_path(orig_path, buffer, buflen); @@ -519,71 +434,8 @@ static const struct dentry_operations ksu_file_wrapper_d_ops = { #define ksu_anon_inode_create_getfile_compat anon_inode_create_getfile #elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) #define ksu_anon_inode_create_getfile_compat anon_inode_getfile_secure -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) -// There is no anon_inode_create_getfile in 4.19, but it's not difficult to implement it. -// https://cs.android.com/android/kernel/superproject/+/common-android12-5.10:common/fs/anon_inodes.c;l=58-125;drc=0d34ce8aa78e38affbb501690bcabec4df88620e - -// Borrow kernel's anon_inode_mnt, so that we don't need to mount one by ourselves. 
-static struct vfsmount *anon_inode_mnt __read_mostly; - -static struct inode * -ksu_anon_inode_make_secure_inode(const char *name, - const struct inode *context_inode) -{ - struct inode *inode; - - if (unlikely(!anon_inode_mnt)) { - return ERR_PTR(-ENODEV); - } - - inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); - if (IS_ERR(inode)) - return inode; - inode->i_flags &= ~S_PRIVATE; - - return inode; -} - -static struct file *ksu_anon_inode_create_getfile_compat( - const char *name, const struct file_operations *fops, void *priv, - int flags, const struct inode *context_inode) -{ - struct inode *inode; - struct file *file; - - if (fops->owner && !try_module_get(fops->owner)) - return ERR_PTR(-ENOENT); - - inode = ksu_anon_inode_make_secure_inode(name, context_inode); - if (IS_ERR(inode)) { - file = ERR_CAST(inode); - goto err; - } - - file = alloc_file_pseudo(inode, anon_inode_mnt, name, - flags & (O_ACCMODE | O_NONBLOCK), fops); - if (IS_ERR(file)) - goto err_iput; - - file->f_mapping = inode->i_mapping; - - file->private_data = priv; - - return file; - -err_iput: - iput(inode); -err: - module_put(fops->owner); - return file; -} -#else // KERNEL_VERSION < 4.19 -struct file *ksu_anon_inode_create_getfile_compat( - const char *name, const struct file_operations *fops, void *priv, - int flags, const struct inode *context_inode) -{ - return anon_inode_getfile(name, fops, priv, flags); -} +#else +#define ksu_anon_inode_create_getfile_compat(a, b, c, d, e) anon_inode_getfile(a, b, c, d) #endif int ksu_install_file_wrapper(int fd) @@ -611,8 +463,7 @@ int ksu_install_file_wrapper(int fd) "[ksu_fdwrapper]", &file_wrapper_data->ops, file_wrapper_data, orig_file->f_flags, NULL); if (IS_ERR(wrapper_file)) { - pr_err("ksu_fdwrapper: getfile failed: %ld\n", - PTR_ERR(wrapper_file)); + pr_err("ksu_fdwrapper: getfile failed: %ld\n", PTR_ERR(wrapper_file)); ret = PTR_ERR(wrapper_file); goto out_release_wrapper; } @@ -623,15 +474,7 @@ int ksu_install_file_wrapper(int fd) struct 
inode *wrapper_inode = file_inode(wrapper_file); // libc's stdio relies on the fstat() result of the fd to determine its buffer type. wrapper_inode->i_mode = file_inode(orig_file)->i_mode; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) - struct inode_security_struct *wrapper_sec = - selinux_inode(wrapper_inode); -#else - struct inode_security_struct *wrapper_sec = - (struct inode_security_struct *)wrapper_inode->i_security; -#endif - + struct inode_security_struct *wrapper_sec = selinux_inode(wrapper_inode); // Use ksu_file_sid to bypass SELinux check. // When we call `su` from terminal app, this is useful. if (wrapper_sec) { @@ -670,21 +513,4 @@ int ksu_install_file_wrapper(int fd) return ret; } -void ksu_file_wrapper_init(void) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) && \ - LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) - static const struct file_operations tmp = { .owner = THIS_MODULE }; - struct file *dummy = anon_inode_getfile("dummy", &tmp, NULL, 0); - if (IS_ERR(dummy)) { - pr_err("file_wrapper: initialize anon_inode_mnt failed, can't get file: %ld\n", - PTR_ERR(dummy)); - return; - } - anon_inode_mnt = dummy->f_path.mnt; - if (unlikely(!anon_inode_mnt)) { - pr_err("file_wrapper: initialize anon_inode_mnt failed, got NULL\n"); - } - fput(dummy); -#endif -} +void __init ksu_file_wrapper_init(void) { } diff --git a/drivers/kernelsu/file_wrapper.h b/drivers/kernelsu/infra/file_wrapper.h similarity index 76% rename from drivers/kernelsu/file_wrapper.h rename to drivers/kernelsu/infra/file_wrapper.h index faae4dded301..ee672312b7aa 100644 --- a/drivers/kernelsu/file_wrapper.h +++ b/drivers/kernelsu/infra/file_wrapper.h @@ -1,9 +1,6 @@ #ifndef KSU_FILE_WRAPPER_H #define KSU_FILE_WRAPPER_H -#include -#include - int ksu_install_file_wrapper(int fd); void ksu_file_wrapper_init(void); diff --git a/drivers/kernelsu/su_mount_ns.c b/drivers/kernelsu/infra/su_mount_ns.c similarity index 52% rename from drivers/kernelsu/su_mount_ns.c rename to 
drivers/kernelsu/infra/su_mount_ns.c index 4a0e4a29b103..7f5651d5de73 100644 --- a/drivers/kernelsu/su_mount_ns.c +++ b/drivers/kernelsu/infra/su_mount_ns.c @@ -1,88 +1,38 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include -#else -#include -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) -#include -#else -#include -#endif - -#include "klog.h" // IWYU pragma: keep -#include "ksu.h" -#include "kernel_compat.h" -#include "su_mount_ns.h" - extern int path_mount(const char *dev_name, struct path *path, - const char *type_page, unsigned long flags, - void *data_page); + const char *type_page, unsigned long flags, + void *data_page); #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) - -// RKSU: tiny arch.h, avoid depending on real arch.h -#ifndef __PT_REGS_CAST -#define __PT_REGS_CAST(x) (x) -#endif - #if defined(__aarch64__) -#define PT_PARM1(x) (__PT_REGS_CAST(x)->regs[0]) -#define PT_PARM2(x) (__PT_REGS_CAST(x)->regs[1]) extern long __arm64_sys_setns(const struct pt_regs *regs); -#define do_sys_setns(regs) (__arm64_sys_setns(regs)) #elif defined(__x86_64__) -#define PT_PARM1(x) (__PT_REGS_CAST(x)->di) -#define PT_PARM2(x) (__PT_REGS_CAST(x)->si) extern long __x64_sys_setns(const struct pt_regs *regs); -#define do_sys_setns(regs) (__x64_sys_setns(regs)) #elif defined(__arm__) // https://syscalls.mebeim.net/?table=arm/32/eabi/latest -// taken from: -// https://github.com/backslashxx/KernelSU/blob/8b71e8bce199e8ac44538648e298092a9b3ef42b/kernel/arch.h#L29 -#define PT_PARM1(x) (__PT_REGS_CAST(x)->uregs[0]) -#define PT_PARM2(x) (__PT_REGS_CAST(x)->uregs[1]) extern long sys_setns(const struct pt_regs *regs); -#define do_sys_setns(regs) (sys_setns(regs)) #endif static long ksu_sys_setns(int fd, int flags) { -#ifdef PT_PARM1 struct pt_regs regs; memset(®s, 0, sizeof(regs)); - PT_PARM1(®s) = fd; - PT_PARM2(®s) = 
flags; + PT_REGS_PARM1(®s) = fd; + PT_REGS_PARM2(®s) = flags; - return do_sys_setns(®s); +#if defined(__aarch64__) + return __arm64_sys_setns(®s); +#elif defined(__x86_64__) + return __x64_sys_setns(®s); +#elif defined(__arm__) + return sys_setns(®s); #else return -ENOSYS; #endif } #else -static long ksu_sys_setns(int fd, int flags) -{ - return sys_setns(fd, flags); -} - -int ksys_unshare(unsigned long unshare_flags) -{ - return sys_unshare(unshare_flags); -} -#endif +#define ksu_sys_setns sys_setns +#define ksys_unshare sys_unshare +#endif // > 4.17 // global mode , need CAP_SYS_ADMIN and CAP_SYS_CHROOT to perform setns static void ksu_mnt_ns_global(void) @@ -103,14 +53,14 @@ static void ksu_mnt_ns_global(void) if (IS_ERR(pwd_path)) { if (PTR_ERR(pwd_path) == -ENAMETOOLONG) { pr_warn("absolute pwd longer than: %d, skip restore pwd!!\n", - PATH_MAX); + PATH_MAX); } else { - pr_warn("get absolute pwd failed: %ld\n", - PTR_ERR(pwd_path)); + pr_warn("get absolute pwd failed: %ld\n", PTR_ERR(pwd_path)); } pwd_path = NULL; } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) try_setns: rcu_read_lock(); @@ -130,18 +80,37 @@ static void ksu_mnt_ns_global(void) goto out; } struct path ns_path; - long ret = ns_get_path(&ns_path, pid1_task, &mntns_operations); + long ret = (long)ns_get_path(&ns_path, pid1_task, &mntns_operations); put_task_struct(pid1_task); if (ret) { pr_warn("failed get path for init mount namespace: %ld\n", ret); goto out; } +#else +try_setns: + ; + // on UL kernels we can try to just feed it with struct path of /proc/1/ns/mnt + // we do NOT have ns_get_path. if it works, GOOD. if it doesn't I don't care. + struct path ns_path; + const struct cred *saved = override_creds(ksu_cred); + + // make sure to LOOKUP_FOLLOW + // /proc/1/ns/mnt -> 'mnt:[4026531840]' + long ret = kern_path("/proc/1/ns/mnt", LOOKUP_FOLLOW, &ns_path); + if (ret) { + revert_creds(saved); + pr_warn("kern_path /proc/1/ns/mnt fail! 
ret: %d\n", ret); + goto out; + } + revert_creds(saved); +#endif + struct file *ns_file = dentry_open(&ns_path, O_RDONLY, ksu_cred); path_put(&ns_path); if (IS_ERR(ns_file)) { pr_warn("failed open file for init mount namespace: %ld\n", - PTR_ERR(ns_file)); + PTR_ERR(ns_file)); goto out; } @@ -155,7 +124,7 @@ static void ksu_mnt_ns_global(void) fd_install(fd, ns_file); ret = ksu_sys_setns(fd, CLONE_NEWNS); - do_close_fd(fd); + close_fd(fd); if (ret) { pr_warn("call setns failed: %ld\n", ret); @@ -169,8 +138,7 @@ static void ksu_mnt_ns_global(void) set_fs_pwd(current->fs, &new_pwd); path_put(&new_pwd); } else { - pr_warn("restore pwd failed: %d, path: %s\n", err, - pwd_path); + pr_warn("restore pwd failed: %d, path: %s\n", err, pwd_path); } } out: @@ -189,8 +157,7 @@ static void ksu_mnt_ns_individual(void) // make root mount private struct path root_path; get_fs_root(current->fs, &root_path); - int pm_ret = - path_mount(NULL, &root_path, NULL, MS_PRIVATE | MS_REC, NULL); + int pm_ret = path_mount(NULL, &root_path, NULL, MS_PRIVATE | MS_REC, NULL); path_put(&root_path); if (pm_ret < 0) { @@ -198,54 +165,6 @@ static void ksu_mnt_ns_individual(void) } } -#ifdef CONFIG_KSU_SYSCALL_HOOK -struct ksu_mns_tw { - struct callback_head cb; - int32_t ns_mode; -}; - -static void ksu_setup_mount_ns_tw_func(struct callback_head *cb) -{ - struct ksu_mns_tw *tw = container_of(cb, struct ksu_mns_tw, cb); - const struct cred *old_cred = override_creds(ksu_cred); - if (tw->ns_mode == KSU_NS_GLOBAL) { - ksu_mnt_ns_global(); - } else { - ksu_mnt_ns_individual(); - } - revert_creds(old_cred); - kfree(tw); -} - -static void ksu_handle_setup_mount_ns(int32_t ns_mode) -{ - struct ksu_mns_tw *tw = kzalloc(sizeof(*tw), GFP_ATOMIC); - if (!tw) { - pr_err("no mem for tw! 
skip mnt_ns magic for pid: %d.\n", - current->pid); - return; - } - tw->cb.func = ksu_setup_mount_ns_tw_func; - tw->ns_mode = ns_mode; - if (task_work_add(current, &tw->cb, TWA_RESUME)) { - kfree(tw); - pr_err("add task work failed! skip mnt_ns magic for pid: %d.\n", - current->pid); - } -} -#else -static void ksu_handle_setup_mount_ns(int32_t ns_mode) -{ - const struct cred *old_cred = override_creds(ksu_cred); - if (ns_mode == KSU_NS_GLOBAL) { - ksu_mnt_ns_global(); - } else { - ksu_mnt_ns_individual(); - } - revert_creds(old_cred); -} -#endif - void setup_mount_ns(int32_t ns_mode) { // inherit mode @@ -255,16 +174,21 @@ void setup_mount_ns(int32_t ns_mode) } if (ns_mode != KSU_NS_GLOBAL && ns_mode != KSU_NS_INDIVIDUAL) { - pr_warn("pid: %d ,unknown mount namespace mode: %d\n", - current->pid, ns_mode); + pr_warn("pid: %d ,unknown mount namespace mode: %d\n", current->pid, + ns_mode); return; } if (!ksu_cred) { - pr_err("no ksu cred! skip mnt_ns magic for pid: %d.\n", - current->pid); + pr_err("no ksu cred! 
skip mnt_ns magic for pid: %d.\n", current->pid); return; } - ksu_handle_setup_mount_ns(ns_mode); + const struct cred *old_cred = override_creds(ksu_cred); + if (ns_mode == KSU_NS_GLOBAL) { + ksu_mnt_ns_global(); + } else { + ksu_mnt_ns_individual(); + } + revert_creds(old_cred); } diff --git a/drivers/kernelsu/su_mount_ns.h b/drivers/kernelsu/infra/su_mount_ns.h similarity index 100% rename from drivers/kernelsu/su_mount_ns.h rename to drivers/kernelsu/infra/su_mount_ns.h diff --git a/drivers/kernelsu/kernel_compat.c b/drivers/kernelsu/kernel_compat.c index 38f0251f08a4..26f4a9471de5 100644 --- a/drivers/kernelsu/kernel_compat.c +++ b/drivers/kernelsu/kernel_compat.c @@ -1,199 +1,102 @@ -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include -#else -#include -#endif -#include -#include -#include - -#include "klog.h" // IWYU pragma: keep -#include "kernel_compat.h" - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) -#include -#include -#include - -extern int install_session_keyring_to_cred(struct cred *, struct key *); -struct key *init_session_keyring = NULL; - -static int install_session_keyring(struct key *keyring) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) +__weak int path_mount(const char *dev_name, struct path *path, + const char *type_page, unsigned long flags, void *data_page) { - struct cred *new; - int ret; - - new = prepare_creds(); - if (!new) - return -ENOMEM; + // 384 is enough + char buf[384] = {0}; - ret = install_session_keyring_to_cred(new, keyring); - if (ret < 0) { - abort_creds(new); - return ret; - } + // -1 on the size as implicit null termination + // as we zero init the thing + char *realpath = d_path(path, buf, sizeof(buf) - 1); + if (!(realpath && realpath != buf)) + return -ENOENT; - return commit_creds(new); + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + long ret = 
do_mount(dev_name, (const char __user *)realpath, type_page, flags, data_page); + set_fs(old_fs); + return ret; } #endif -struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode) -{ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) - if (init_session_keyring != NULL && !current_cred()->session_keyring && - (current->flags & PF_WQ_WORKER)) { - pr_info("installing init session keyring for older kernel\n"); - install_session_keyring(init_session_keyring); - } -#endif - return filp_open(filename, flags, mode); -} - -ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, - loff_t *pos) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) +__weak int path_umount(struct path *path, int flags) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - return kernel_read(p, buf, count, pos); -#else - loff_t offset = pos ? *pos : 0; - ssize_t result = kernel_read(p, offset, (char *)buf, count); - if (pos && result > 0) { - *pos = offset + result; - } - return result; -#endif -} + char buf[256] = {0}; + int ret; -ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count, - loff_t *pos) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) - return kernel_write(p, buf, count, pos); -#else - loff_t offset = pos ? 
*pos : 0; - ssize_t result = kernel_write(p, buf, count, offset); - if (pos && result > 0) { - *pos = offset + result; + // -1 on the size as implicit null termination + // as we zero init the thing + char *usermnt = d_path(path, buf, sizeof(buf) - 1); + if (!(usermnt && usermnt != buf)) { + ret = -ENOENT; + goto out; } - return result; -#endif -} -static inline long -do_strncpy_user_nofault(char *dst, const void __user *unsafe_addr, long count) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) - return strncpy_from_user_nofault(dst, unsafe_addr, count); -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0) - return strncpy_from_unsafe_user(dst, unsafe_addr, count); -#else mm_segment_t old_fs = get_fs(); - long ret; + set_fs(KERNEL_DS); - if (unlikely(count <= 0)) - return 0; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) + ret = ksys_umount((char __user *)usermnt, flags); +#else + ret = (int)sys_umount((char __user *)usermnt, flags); +#endif - set_fs(USER_DS); - pagefault_disable(); - ret = strncpy_from_user(dst, unsafe_addr, count); - pagefault_enable(); set_fs(old_fs); - if (ret >= count) { - ret = count; - dst[ret - 1] = '\0'; - } else if (ret > 0) { - ret++; - } - + // release ref here! user_path_at increases it + // then only cleans for itself +out: + path_put(path); return ret; -#endif } +#endif -long ksu_strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, - long count) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0) +__weak long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { -#ifdef CONFIG_KSU_MANUAL_HOOK + // https://elixir.bootlin.com/linux/v5.2.21/source/mm/maccess.c#L27 long ret; + mm_segment_t old_fs = get_fs(); - ret = do_strncpy_user_nofault(dst, unsafe_addr, count); - if (likely(ret >= 0)) - return ret; - - // we faulted! 
fallback to slow path - if (unlikely(!ksu_access_ok(unsafe_addr, count))) - return -EFAULT; - - ret = strncpy_from_user(dst, unsafe_addr, count); - if (ret >= count) { - ret = count; - dst[ret - 1] = '\0'; - } else if (ret >= 0) { - ret++; - } + set_fs(KERNEL_DS); + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, + (__force const void __user *)src, size); + pagefault_enable(); + set_fs(old_fs); - return ret; -#else - return do_strncpy_user_nofault(dst, unsafe_addr, count); -#endif + return ret ? -EFAULT : 0; } +#endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 9, 0) -int path_mount(const char *dev_name, struct path *path, const char *type_page, - unsigned long flags, void *data_page) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0) +__weak long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { - // 384 is enough - char buf[384] = { 0 }; - mm_segment_t old_fs; - long ret; + // https://elixir.bootlin.com/linux/v5.8/source/mm/maccess.c#L205 + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); - // -1 on the size as implicit null termination - // as we zero init the thing - char *realpath = d_path(path, buf, sizeof(buf) - 1); - if (!(realpath && realpath != buf)) - return -ENOENT; + set_fs(USER_DS); + + // normally theres an access_ok check here + // but for what we use it, it will always be true. 
+ // so we skip it + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = do_mount(dev_name, (const char __user *)realpath, type_page, - flags, data_page); set_fs(old_fs); - return ret; -} -#endif -int do_close_fd(unsigned int fd) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - return close_fd(fd); -#else - return __close_fd(current->files, fd); -#endif + if (ret) + return -EFAULT; + return 0; } +#endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -// https://elixir.bootlin.com/linux/v5.10.247/source/mm/util.c#L664 -void *ksu_compat_kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags) +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) || !defined(CONFIG_EXT4_FS) +__weak void ext4_unregister_sysfs(struct super_block *sb) { - void *newp; - - if (oldsize >= newsize) - return (void *)p; - newp = kvmalloc(newsize, flags); - if (!newp) - return NULL; - memcpy(newp, p, oldsize); - kvfree(p); - return newp; + pr_info("%s: feature not implemented!\n", __func__); } #endif diff --git a/drivers/kernelsu/kernel_compat.h b/drivers/kernelsu/kernel_compat.h index b8fe8874d17d..147efae61ccf 100644 --- a/drivers/kernelsu/kernel_compat.h +++ b/drivers/kernelsu/kernel_compat.h @@ -1,62 +1,375 @@ #ifndef __KSU_H_KERNEL_COMPAT #define __KSU_H_KERNEL_COMPAT -#include -#include -#include -#include - -/* - * Adapt to Huawei HISI kernel without affecting other kernels , - * Huawei Hisi Kernel EBITMAP Enable or Disable Flag , - * From ss/ebitmap.h +#if defined(CONFIG_KEYS) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) +extern int install_session_keyring_to_cred(struct cred *cred, struct key *keyring); +static struct key *init_session_keyring = NULL; + +bool is_init(const struct cred* cred); + +static inline int install_session_keyring(struct key *keyring) +{ + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = 
install_session_keyring_to_cred(new, keyring); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); +} + +// up to 5.1, struct key __rcu *session_keyring; /* keyring inherited over fork */ +// so we need to grab this using rcu_dereference +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +static inline struct key *ksu_get_current_session_keyring() { return rcu_dereference(current->cred->session_keyring); } +#else +static inline struct key *ksu_get_current_session_keyring() { return rcu_dereference(current->cred->tgcred->session_keyring); } +#endif + +__attribute__((cold)) +static noinline void ksu_grab_init_session_keyring() +{ + if (init_session_keyring) + return; + + if (!!strcmp(current->comm, "init")) + return; + + if (!!!is_init(current_cred())) + return; + + // now we are sure that this is the key we want + struct key *keyring = ksu_get_current_session_keyring(); + if (!keyring) + return; + + init_session_keyring = key_get(keyring); + + pr_info("%s: init_session_keyring: 0x%lx \n", __func__, (uintptr_t)init_session_keyring); +} + +static noinline struct file *ksu_filp_open_compat(const char *filename, int flags, umode_t mode) +{ + // it used to be that we put this on (current->flags & PF_WQ_WORKER) + // but since things actually needing this has been offloaded to kthread + // like allowlist write, we check for that instead. + if (!(current->flags & PF_KTHREAD)) + goto filp_open; + + if (!!ksu_get_current_session_keyring()) + goto filp_open; + + if (!!!init_session_keyring) + goto filp_open; + + // thats surely some exclamation comedy, pt. 
2 + // now we are sure that we need to install init keyring to current + install_session_keyring(init_session_keyring); + +filp_open: + return filp_open(filename, flags, mode); +} +#define filp_open ksu_filp_open_compat +#else +static inline void ksu_grab_init_session_keyring() {} // no-op +#endif // KEYS && < 5.2 + +#ifndef __ro_after_init +#define __ro_after_init +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) +#define d_inode(dentry) ((dentry)->d_inode) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0) && defined(CONFIG_ARM64) +#ifndef TIF_SECCOMP +#define TIF_SECCOMP 11 +#endif +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) +static inline void *ksu_kvmalloc(size_t size, gfp_t flags) +{ + void *buf = kmalloc(size, flags); + if (!buf) + buf = vmalloc(size); + + return buf; +} + +static inline void ksu_kvfree(void *buf) +{ + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); +} +#define kvmalloc ksu_kvmalloc +#define kvfree ksu_kvfree +#endif + +// for supercalls.c fd install tw +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) && !defined(TWA_RESUME) +#define TWA_RESUME 1 +#endif + +// this is ksys_close, however that is spotty to use +// as 5.10 backported close_fd and rekt ksys_close +// so we use what it does internally, __close_fd +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#define close_fd(fd) __close_fd(current->files, fd) +#elif LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0) +#define close_fd sys_close +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 0) +static inline struct file *ksu_dentry_open(const struct path *path, int flags, const struct cred *cred) +{ + return dentry_open((*path).dentry, (*path).mnt, flags, cred); +} +#define dentry_open ksu_dentry_open +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0) +#ifndef replace_fops +#define replace_fops(f, fops) \ + do { \ + struct file *__file = (f); \ + fops_put(__file->f_op); \ + 
BUG_ON(!(__file->f_op = (fops))); \ + } while(0) +#endif +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0) && defined(CONFIG_JUMP_LABEL) +#define KSU_CAN_USE_JUMP_LABEL + +// https://elixir.bootlin.com/linux/v3.10.108/source/include/linux/jump_label.h#L211 +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) +static inline void ksu_static_key_enable(struct static_key *key) +{ + int count = atomic_read(&key->enabled); + if (!count) + static_key_slow_inc(key); +} + +static inline void ksu_static_key_disable(struct static_key *key) +{ + int count = atomic_read(&key->enabled); + if (count) + static_key_slow_dec(key); +} + +#define static_branch_enable(k) ksu_static_key_enable(k) +#define static_branch_disable(k) ksu_static_key_disable(k) + +#define static_branch_unlikely(k) static_key_false(k) +#define static_branch_likely(k) static_key_true(k) + +#ifndef DEFINE_STATIC_KEY_FALSE +#define DEFINE_STATIC_KEY_FALSE(k) struct static_key k = STATIC_KEY_INIT_FALSE +#endif + +#ifndef DEFINE_STATIC_KEY_TRUE +#define DEFINE_STATIC_KEY_TRUE(k) struct static_key k = STATIC_KEY_INIT_TRUE +#endif + +#endif // < 4.3 +#endif // >= 3.4 && CONFIG_JUMP_LABEL + +struct user_arg_ptr { +#ifdef CONFIG_COMPAT + bool is_compat; +#endif + union { + const char __user *const __user *native; +#ifdef CONFIG_COMPAT + const compat_uptr_t __user *compat; +#endif + } ptr; +}; + +extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size); + +/** + * ksu_copy_from_user_retry + * try nofault copy first, if it fails, try with plain + * parameters are the same as copy_from_user + * 0 = success */ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)) && \ - (LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)) && \ - (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) -#ifdef HISI_SELINUX_EBITMAP_RO -#define CONFIG_IS_HW_HISI +extern long copy_from_user_nofault(void *dst, const void __user *src, size_t size); +static __always_inline long
ksu_copy_from_user_retry(void *to, const void __user *from, unsigned long count) +{ + long ret = copy_from_user_nofault(to, from, count); + if (likely(!ret)) + return ret; + + // we faulted! fallback to slow path + return copy_from_user(to, from, count); +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) // caller is responsible for sanity! +static inline void ksu_zeroed_strncpy(char *dest, const char *src, size_t count) +{ + // this is actually faster due to dead store elimination + // count - 1 as implicit null termination + __builtin_memset(dest, 0, count); + __builtin_strncpy(dest, src, count - 1); +} +#define strscpy_pad ksu_zeroed_strncpy #endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) +#define strscpy ksu_zeroed_strncpy #endif -extern long ksu_strncpy_from_user_nofault(char *dst, - const void __user *unsafe_addr, - long count); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) +#define d_is_reg(dentry) S_ISREG((dentry)->d_inode->i_mode) +#endif -extern struct file *ksu_filp_open_compat(const char *filename, int flags, - umode_t mode); -extern ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, - loff_t *pos); -extern ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, - size_t count, loff_t *pos); +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 5, 0) +struct user_struct *ksu_alloc_uid(kuid_t uid) { return alloc_uid(current_user_ns(), uid); } +#define alloc_uid ksu_alloc_uid +#endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) -extern struct key *init_session_keyring; +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(KSU_HAS_ITERATE_DIR) +struct dir_context { const filldir_t actor; loff_t pos; }; +#define iterate_dir(file, ctx) vfs_readdir(file, (ctx)->actor, ctx) #endif -extern int do_close_fd(unsigned int fd); +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +__weak char *bin2hex(char *dst, const void *src, size_t
count) +{ + const unsigned char *_src = src; + while (count--) + dst = pack_hex_byte(dst, *_src++); + return dst; +} +#endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -extern void *ksu_compat_kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags); +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) +#define file_inode(f) ((f)->f_path.dentry->d_inode) #endif -#ifndef VERIFY_READ -#define ksu_access_ok(addr, size) access_ok(addr, size) -#else -#define ksu_access_ok(addr, size) access_ok(VERIFY_READ, addr, size) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) && !defined(CONFIG_LSM) +#define selinux_inode(inode) ((inode)->i_security) +#define selinux_cred(cred) ((cred)->security) #endif -// Linux >= 5.7 -// task_work_add (struct, struct, enum) -// Linux pre-5.7 -// task_work_add (struct, struct, bool) -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) -#ifndef TWA_RESUME -#define TWA_RESUME true +#if LINUX_VERSION_CODE < KERNEL_VERSION (4, 15, 0) +__weak void groups_sort(struct group_info *group_info) { } // no-op #endif + +#ifndef U16_MAX +#define U16_MAX ((u16)(~0U)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION (4, 12, 0) && !defined(EPOLLIN) +#define EPOLLIN 0x00000001 +#define EPOLLPRI 0x00000002 +#define EPOLLOUT 0x00000004 +#define EPOLLERR 0x00000008 +#define EPOLLHUP 0x00000010 +#define EPOLLRDNORM 0x00000040 +#define EPOLLRDBAND 0x00000080 +#define EPOLLWRNORM 0x00000100 +#define EPOLLWRBAND 0x00000200 +#define EPOLLMSG 0x00000400 +#define EPOLLRDHUP 0x00002000 +#endif // < 4.12 && !EPOLLIN + +#ifndef READ_ONCE +#define READ_ONCE(x) (*(const volatile typeof(x) *)&(x)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 15, 0) +#define task_ppid_nr(a) (pid_t)sys_getppid() +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 17, 0) +static inline u64 ksu_ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } +#define ktime_get_ns ksu_ktime_get_ns +#endif + +// WARNING: no overflow safety! 
+#ifndef struct_size +#define struct_size(p, member, n) (sizeof(*(p)) + (n) * sizeof(*(p)->member)) #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION (4, 12, 0) +#ifndef ALIGN_DOWN +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) #endif +#endif + +#ifndef untagged_addr +#define untagged_addr(addr) (addr) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) +// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L418 +static noinline ssize_t ksu_kernel_read_compat(struct file *p, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + old_fs = get_fs(); + set_fs(get_ds()); + ssize_t result = vfs_read(p, (void __user *)buf, count, pos); + set_fs(old_fs); + return result; +} +// https://elixir.bootlin.com/linux/v4.14.336/source/fs/read_write.c#L512 +static noinline ssize_t ksu_kernel_write_compat(struct file *p, const void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + old_fs = get_fs(); + set_fs(get_ds()); + ssize_t res = vfs_write(p, (__force const char __user *)buf, count, pos); + set_fs(old_fs); + return res; +} +#define kernel_read ksu_kernel_read_compat +#define kernel_write ksu_kernel_write_compat +#endif // < 4.14 + +static inline void ksu_kfree_byref(void *buf) { kfree(*(void **)buf); } + +#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 9, 0) +// hashtable.h, list.h, rculist.h +// ref: https://github.com/torvalds/linux/commit/b67bfe0d42cac56c512dd5da4b1b347a23f4b70a +#include "linux_hashtable.h" +static inline int __must_check ksu_kref_get_unless_zero(struct kref *kref) +{ + return atomic_add_unless(&kref->refcount, 1, 0); +} +#define kref_get_unless_zero ksu_kref_get_unless_zero +#endif // < 3.9 + +/** + * kver agnostic workaround for < 3.14's CONFIG_UIDGID_STRICT_TYPE_CHECKS=n + * + * - force dereferences an unsigned int (uid_t) + * - redefines current_uid / current_euid macros + * + * ref + * - https://elixir.bootlin.com/linux/v3.13/source/include/linux/uidgid.h + * - 
https://elixir.bootlin.com/linux/v3.13/source/include/linux/cred.h#L331 + */ +#define ksu_get_uid_t(x) *(unsigned int *)&(x) + +#if LINUX_VERSION_CODE < KERNEL_VERSION (3, 14, 0) +#undef current_uid +#undef current_euid +typedef struct { uid_t val; } ksu_kuid_t; +static inline ksu_kuid_t current_uid() { return *(ksu_kuid_t *)(¤t_cred()->uid); } +static inline ksu_kuid_t current_euid() { return *(ksu_kuid_t *)(¤t_cred()->euid); } +#endif // < 3.14 + +#endif // __KSU_H_KERNEL_COMPAT diff --git a/drivers/kernelsu/kernel_includes.h b/drivers/kernelsu/kernel_includes.h new file mode 100644 index 000000000000..c3ea6cb0db09 --- /dev/null +++ b/drivers/kernelsu/kernel_includes.h @@ -0,0 +1,179 @@ +#ifndef __KSU_H_KERNEL_INCLUDES +#define __KSU_H_KERNEL_INCLUDES + +// common +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// versioned / conditional + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) +#include +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0) +#include +#else +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) +#include +#else +#include +#endif +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0) +#include +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#include +#endif + +#if 
LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#include +#else +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) +#include +#else +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) +#include +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) +#include +#include +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +#include +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) +#include +#endif + +/** + * replace common mem/str functions with builtins + * so legacy kernels get better inlining and optimized routines (with newer compilers) + * a lot of people rice their flags (mcpu/march), this'll be a good reward for them. + * minimum that people use is gcc 4.9 for 3.x kernels, so these are fineee + * https://github.com/gcc-mirror/gcc/blob/releases/gcc-4.9/gcc/builtins.def#L562 + * + */ +#if !defined(CONFIG_KSU_DEBUG) + +#define memchr __builtin_memchr +#define memcmp __builtin_memcmp +#define memcpy __builtin_memcpy +#define memmove __builtin_memmove +#define memset __builtin_memset +#define strcasecmp __builtin_strcasecmp +#define strcat __builtin_strcat +#define strchr __builtin_strchr +#define strcmp __builtin_strcmp +#define strcpy __builtin_strcpy +#define strcspn __builtin_strcspn +#define strlen __builtin_strlen +#define strncasecmp __builtin_strncasecmp +#define strncat __builtin_strncat +#define strncmp __builtin_strncmp +#define strncpy __builtin_strncpy +#define strpbrk __builtin_strpbrk +#define strrchr __builtin_strrchr +#define strspn __builtin_strspn +//#define strstr __builtin_strstr + +#endif // !CONFIG_KSU_DEBUG + +#endif // __KSU_H_KERNEL_INCLUDES diff --git a/drivers/kernelsu/kernel_umount.c b/drivers/kernelsu/kernel_umount.c deleted file mode
100644 index cd9889ea7f72..000000000000 --- a/drivers/kernelsu/kernel_umount.c +++ /dev/null @@ -1,190 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "kernel_umount.h" -#include "klog.h" // IWYU pragma: keep -#include "allowlist.h" -#include "kernel_compat.h" -#include "selinux/selinux.h" -#include "feature.h" -#include "ksud.h" -#include "ksu.h" - -bool __read_mostly ksu_kernel_umount_enabled = true; - -static int kernel_umount_feature_get(u64 *value) -{ - *value = ksu_kernel_umount_enabled ? 1 : 0; - return 0; -} - -static int kernel_umount_feature_set(u64 value) -{ - bool enable = value != 0; - ksu_kernel_umount_enabled = enable; - pr_info("kernel_umount: set to %d\n", enable); - return 0; -} - -static const struct ksu_feature_handler kernel_umount_handler = { - .feature_id = KSU_FEATURE_KERNEL_UMOUNT, - .name = "kernel_umount", - .get_handler = kernel_umount_feature_get, - .set_handler = kernel_umount_feature_set, -}; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) -extern int path_umount(struct path *path, int flags); -static int ksu_umount_mnt(const char *__never_use_mnt, struct path *path, - int flags) -{ - return path_umount(path, flags); -} -#else -static int ksu_sys_umount(const char *mnt, int flags) -{ - char __user *usermnt = (char __user *)mnt; - mm_segment_t old_fs; - int ret = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) - ret = ksys_umount(usermnt, flags); -#else - // Perhaps its not necessary to cast it - ret = (int)sys_umount(usermnt, flags); // cuz asmlinkage long sys##name -#endif - set_fs(old_fs); - return ret; -} -#define ksu_umount_mnt(mnt, __unused, flags) \ - ({ \ - path_put(__unused); \ - ksu_sys_umount(mnt, flags); \ - }) - -#endif - -static void try_umount(const char *mnt, int flags) -{ - struct path path; - int ret = 0; - if (kern_path(mnt, 0, &path)) { - return; - } - - if (path.dentry 
!= path.mnt->mnt_root) { - // it is not root mountpoint, maybe umounted by others already. - path_put(&path); - return; - } - - ret = ksu_umount_mnt(mnt, &path, flags); - if (ret) { - pr_info("%s: umounting %s (flags=0x%x) failed, err: %d\n", - __func__, mnt, flags, ret); - } -} - -struct umount_tw { - struct callback_head cb; -}; - -static void umount_tw_func(struct callback_head *cb) -{ - struct umount_tw *tw = container_of(cb, struct umount_tw, cb); - const struct cred *saved = override_creds(ksu_cred); - - down_read(&mount_list_lock); - struct mount_entry *entry; - list_for_each_entry (entry, &mount_list, list) { - pr_info("%s: unmounting: %s flags 0x%x\n", __func__, - entry->umountable, entry->flags); - try_umount(entry->umountable, entry->flags); - } - up_read(&mount_list_lock); - - revert_creds(saved); - kfree(tw); -} - -int ksu_handle_umount(uid_t old_uid, uid_t new_uid) -{ - // if there isn't any module mounted, just ignore it! - if (!ksu_module_mounted) { - return 0; - } - - if (!ksu_kernel_umount_enabled) { - return 0; - } - - if (!ksu_cred) { - return 0; - } - - // There are 5 scenarios: - // 1. Normal app: zygote -> appuid - // 2. Isolated process forked from zygote: zygote -> isolated_process - // 3. App zygote forked from zygote: zygote -> appuid - // 4. Isolated process froked from app zygote: appuid -> isolated_process (already handled by 3) - // 5. Isolated process froked from webview zygote (no need to handle, app cannot run custom code) - if (!is_appuid(new_uid) && !is_isolated_process(new_uid)) { - return 0; - } - - if (!ksu_uid_should_umount(new_uid) && !is_isolated_process(new_uid)) { - return 0; - } - - // check old process's selinux context, if it is not zygote, ignore it! - // because some su apps may setuid to untrusted_app but they are in global mount namespace - // when we umount for such process, that is a disaster! 
- // also handle case 4 and 5 - bool is_zygote_child = is_zygote(get_current_cred()); - if (!is_zygote_child) { - pr_info("handle umount ignore non zygote child: %d\n", - current->pid); - return 0; - } - // umount the target mnt - pr_info("handle umount for uid: %d, pid: %d\n", new_uid, current->pid); - - struct umount_tw *tw; - tw = kzalloc(sizeof(*tw), GFP_ATOMIC); - if (!tw) - return 0; - - tw->cb.func = umount_tw_func; - - int err = task_work_add(current, &tw->cb, TWA_RESUME); - if (err) { - kfree(tw); - pr_warn("unmount add task_work failed\n"); - } - - return 0; -} - -void ksu_kernel_umount_init(void) -{ - if (ksu_register_feature_handler(&kernel_umount_handler)) { - pr_err("Failed to register kernel_umount feature handler\n"); - } -} - -void ksu_kernel_umount_exit(void) -{ - ksu_unregister_feature_handler(KSU_FEATURE_KERNEL_UMOUNT); -} diff --git a/drivers/kernelsu/kernel_umount.h b/drivers/kernelsu/kernel_umount.h deleted file mode 100644 index 96a23fba5bcd..000000000000 --- a/drivers/kernelsu/kernel_umount.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __KSU_H_KERNEL_UMOUNT -#define __KSU_H_KERNEL_UMOUNT - -#include -#include -#include - -void ksu_kernel_umount_init(void); -void ksu_kernel_umount_exit(void); - -// Handler function to be called from setresuid hook -int ksu_handle_umount(uid_t old_uid, uid_t new_uid); - -// for the umount list -struct mount_entry { - char *umountable; - unsigned int flags; - struct list_head list; -}; -extern struct list_head mount_list; -extern struct rw_semaphore mount_list_lock; - -extern bool __read_mostly ksu_kernel_umount_enabled; - -#endif diff --git a/drivers/kernelsu/kp_hook.c b/drivers/kernelsu/kp_hook.c deleted file mode 100644 index 23ef72fb14ba..000000000000 --- a/drivers/kernelsu/kp_hook.c +++ /dev/null @@ -1,167 +0,0 @@ -#include -#include -#include - -#define DECL_KP(name, sym, pre) \ - struct kprobe name = { \ - .symbol_name = sym, \ - .pre_handler = pre, \ - } - -// ksud.c - -static struct work_struct 
stop_vfs_read_work, stop_execve_hook_work, - stop_input_hook_work; - -static int sys_execve_handler_pre(struct kprobe *p, struct pt_regs *regs) -{ - struct pt_regs *real_regs = PT_REAL_REGS(regs); - const char __user **filename_user = - (const char **)&PT_REGS_PARM1(real_regs); - const char __user *const __user *__argv = - (const char __user *const __user *)PT_REGS_PARM2(real_regs); - struct user_arg_ptr argv = { .ptr.native = __argv }; - struct filename filename_in, *filename_p; - char path[32]; - - if (!filename_user) - return 0; - if (!ksu_retry_filename_access(filename_user, path, 32, false)) - return 0; - - filename_in.name = path; - filename_p = &filename_in; - return ksu_handle_execveat_ksud((int *)AT_FDCWD, &filename_p, &argv, - NULL, NULL); -} - -static int sys_read_handler_pre(struct kprobe *p, struct pt_regs *regs) -{ - struct pt_regs *real_regs = PT_REAL_REGS(regs); - unsigned int fd = PT_REGS_PARM1(real_regs); - char __user **buf_ptr = (char __user **)&PT_REGS_PARM2(real_regs); - size_t *count_ptr = (size_t *)&PT_REGS_PARM3(real_regs); - - return ksu_handle_sys_read(fd, buf_ptr, count_ptr); -} - -static int input_handle_event_handler_pre(struct kprobe *p, - struct pt_regs *regs) -{ - unsigned int *type = (unsigned int *)&PT_REGS_PARM2(regs); - unsigned int *code = (unsigned int *)&PT_REGS_PARM3(regs); - int *value = (int *)&PT_REGS_CCALL_PARM4(regs); - return ksu_handle_input_handle_event(type, code, value); -} - -static DECL_KP(execve_kp, SYS_EXECVE_SYMBOL, sys_execve_handler_pre); -static DECL_KP(vfs_read_kp, SYS_READ_SYMBOL, sys_read_handler_pre); -static DECL_KP(input_event_kp, "input_event", input_handle_event_handler_pre); - -static void do_stop_vfs_read_hook(struct work_struct *work) -{ - unregister_kprobe(&vfs_read_kp); -} - -static void do_stop_execve_hook(struct work_struct *work) -{ - unregister_kprobe(&execve_kp); -} - -static void do_stop_input_hook(struct work_struct *work) -{ - unregister_kprobe(&input_event_kp); -} - -void 
kp_handle_ksud_stop(enum ksud_stop_code stop_code) -{ - bool ret; - switch (stop_code) { - case VFS_READ_HOOK_KP: { - ret = schedule_work(&stop_vfs_read_work); - pr_info("unregister vfs_read kprobe: %d!\n", ret); - break; - } - case EXECVE_HOOK_KP: { - ret = schedule_work(&stop_execve_hook_work); - pr_info("unregister execve kprobe: %d!\n", ret); - break; - } - case INPUT_EVENT_HOOK_KP: { - static bool input_hook_stopped = false; - if (input_hook_stopped) { - return; - } - input_hook_stopped = true; - ret = schedule_work(&stop_input_hook_work); - pr_info("unregister input kprobe: %d!\n", ret); - break; - } - default: - return; - } - return; -} - -void kp_handle_ksud_init(void) -{ - int ret; - - ret = register_kprobe(&execve_kp); - pr_info("ksud: execve_kp: %d\n", ret); - - ret = register_kprobe(&vfs_read_kp); - pr_info("ksud: vfs_read_kp: %d\n", ret); - - ret = register_kprobe(&input_event_kp); - pr_info("ksud: input_event_kp: %d\n", ret); - - INIT_WORK(&stop_vfs_read_work, do_stop_vfs_read_hook); - INIT_WORK(&stop_execve_hook_work, do_stop_execve_hook); - INIT_WORK(&stop_input_hook_work, do_stop_input_hook); -} - -void kp_handle_ksud_exit(void) -{ - unregister_kprobe(&execve_kp); - // this should be done before unregister vfs_read_kp - // unregister_kprobe(&vfs_read_kp); - unregister_kprobe(&input_event_kp); -} - -// supercalls.c - -extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, - void __user **arg); - -static int reboot_handler_pre(struct kprobe *p, struct pt_regs *regs) -{ - struct pt_regs *real_regs = PT_REAL_REGS(regs); - int magic1 = (int)PT_REGS_PARM1(real_regs); - int magic2 = (int)PT_REGS_PARM2(real_regs); - void __user **arg = (void __user **)&PT_REGS_SYSCALL_PARM4(real_regs); - - // cmd is not really used here, so we NULL! 
- if (ksu_handle_sys_reboot(magic1, magic2, NULL, arg)) { - pr_err("kp_hook: sys_reboot failure\n"); - } - - return 0; -} - -static DECL_KP(reboot_kp, REBOOT_SYMBOL, reboot_handler_pre); - -void kp_handle_supercalls_init(void) -{ - int rc = register_kprobe(&reboot_kp); - if (rc) { - pr_err("reboot kprobe failed: %d\n", rc); - return; - } - pr_info("reboot kprobe registered successfully\n"); -} - -void kp_handle_supercalls_exit(void) -{ - unregister_kprobe(&reboot_kp); -} diff --git a/drivers/kernelsu/kp_hook.h b/drivers/kernelsu/kp_hook.h deleted file mode 100644 index 708e78665ba8..000000000000 --- a/drivers/kernelsu/kp_hook.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __KSU_H_KP_HOOK -#define __KSU_H_KP_HOOK - -// ksud.c -enum ksud_stop_code { - VFS_READ_HOOK_KP = 0, - EXECVE_HOOK_KP, - INPUT_EVENT_HOOK_KP, -}; - -int ksu_handle_sys_read(unsigned int fd, char __user **buf_ptr, - size_t *count_ptr); - -int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code, - int *value); - -void kp_handle_ksud_stop(enum ksud_stop_code); -void kp_handle_ksud_init(void); -void kp_handle_ksud_exit(void); - -// supercalls.c -void kp_handle_supercalls_init(void); -void kp_handle_supercalls_exit(void); - -#endif diff --git a/drivers/kernelsu/kp_util.c b/drivers/kernelsu/kp_util.c deleted file mode 100644 index 05e6715672c8..000000000000 --- a/drivers/kernelsu/kp_util.c +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include -#include -#include - -static bool try_set_access_flag(unsigned long addr) -{ -#ifdef CONFIG_ARM64 - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - bool ret = false; - - if (!mm) - return false; - - if (!mmap_read_trylock(mm)) - return false; - - vma = find_vma(mm, addr); - if (!vma || addr < vma->vm_start) - goto out_unlock; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out_unlock; - - p4d = p4d_offset(pgd, 
addr); - if (!p4d_present(*p4d)) - goto out_unlock; - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) - goto out_unlock; - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out_unlock; - - if (pmd_trans_huge(*pmd)) - goto out_unlock; - - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!ptep) - goto out_unlock; - - pte = *ptep; - - if (!pte_present(pte)) - goto out_pte_unlock; - - if (pte_young(pte)) { - ret = true; - goto out_pte_unlock; - } - - ptep_set_access_flags(vma, addr, ptep, pte_mkyoung(pte), 0); - pr_info("set AF for addr %lx\n", addr); - ret = true; - -out_pte_unlock: - pte_unmap_unlock(ptep, ptl); -out_unlock: - mmap_read_unlock(mm); - return ret; -#else - return false; -#endif -} - -bool ksu_retry_filename_access(const char __user **char_usr_ptr, char *dest, - size_t dest_len, bool exit_atomic_ctx) -{ - unsigned long addr; - const char __user *fn; - long ret; - - if (!char_usr_ptr) - return false; - - addr = untagged_addr((unsigned long)*char_usr_ptr); -#ifdef CONFIG_KSU_DEBUG - pr_info("got addr: %lu\n", addr); -#endif - fn = (const char __user *)addr; - memset(dest, 0, dest_len); - ret = ksu_strncpy_from_user_nofault(dest, fn, dest_len); - - if (ret < 0 && try_set_access_flag(addr)) { - ret = ksu_strncpy_from_user_nofault(dest, fn, dest_len); - } - - /* - * This is crazy, but we know what we are doing: - * Temporarily exit atomic context to handle page faults, then restore it. - */ - if (exit_atomic_ctx) { - if (ret < 0 && preempt_count()) { -#ifdef CONFIG_KSU_DEBUG - pr_info("access to pointer failed, attempting to rescue..\n"); -#endif - preempt_enable_no_resched_notrace(); - ret = strncpy_from_user(dest, fn, dest_len); - preempt_disable_notrace(); - } - } - - if (ret < 0) { - pr_err("all fallback were tried. 
err: %lu\n", ret); - return false; - } - - return true; -} diff --git a/drivers/kernelsu/kp_util.h b/drivers/kernelsu/kp_util.h deleted file mode 100644 index b9128964d6a8..000000000000 --- a/drivers/kernelsu/kp_util.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __KSU_H_KP_UTIL -#define __KSU_H_KP_UTIL -#include - -#ifndef preempt_enable_no_resched_notrace -#define preempt_enable_no_resched_notrace() \ - do { \ - barrier(); \ - __preempt_count_dec(); \ - } while (0) -#endif - -#ifndef preempt_disable_notrace -#define preempt_disable_notrace() \ - do { \ - __preempt_count_inc(); \ - barrier(); \ - } while (0) -#endif - -bool ksu_retry_filename_access(const char __user **char_usr_ptr, char *dest, - size_t dest_len, bool exit_atomic_ctx); - -#endif diff --git a/drivers/kernelsu/ksu.c b/drivers/kernelsu/ksu.c new file mode 100644 index 000000000000..79b98fd73e21 --- /dev/null +++ b/drivers/kernelsu/ksu.c @@ -0,0 +1,164 @@ +#include "kernel_includes.h" + +// uapi +#include "include/uapi/app_profile.h" +#include "include/uapi/feature.h" +#include "include/uapi/selinux.h" +#include "include/uapi/supercall.h" +#include "include/uapi/sulog.h" + +// includes +#include "include/klog.h" +#include "include/arch.h" +#include "include/ksu.h" + +// selinux includes +#include "avc_ss.h" +#include "objsec.h" +#include "ss/services.h" +#include "ss/symtab.h" +#include "xfrm.h" +#ifndef KSU_COMPAT_USE_SELINUX_STATE +#include "avc.h" +#endif + +// kernel compat, lite ones +#include "kernel_compat.h" + +#include "policy/app_profile.h" +#include "policy/allowlist.h" +#include "policy/feature.h" +#include "manager/apk_sign.h" +#include "manager/manager_identity.h" +#include "manager/throne_tracker.h" +#include "supercall/internal.h" +#include "supercall/supercall.h" +#include "infra/su_mount_ns.h" +#include "infra/file_wrapper.h" +#include "infra/event_queue.h" +#include "feature/adb_root.h" +#include "feature/kernel_umount.h" +#include "feature/selinux_hide.h" +#include "feature/sucompat.h" 
+#include "feature/sulog.h" +#include "runtime/ksud.h" +#include "runtime/ksud_escape.h" +#include "sulog/event.h" +#include "sulog/fd.h" + +#include "selinux/selinux.h" +#include "selinux/sepolicy.h" + +// unity build +#include "tiny_sulog.c" +#include "policy/allowlist.c" +#include "policy/app_profile.c" +#include "policy/feature.c" +#include "manager/apk_sign.c" +#include "manager/pkg_observer.c" +#include "manager/throne_tracker.c" + +#include "supercall/perm.c" +#include "supercall/dispatch.c" +#include "supercall/supercall.c" + +#include "infra/su_mount_ns.c" +#include "infra/file_wrapper.c" +#include "infra/event_queue.c" + +#include "feature/adb_root.c" +#include "feature/kernel_umount.c" +#include "feature/selinux_hide.c" +#include "feature/sucompat.c" +#include "feature/sulog.c" +#include "runtime/ksud.c" +#include "runtime/ksud_escape.c" + +#include "sulog/event.c" +#include "sulog/fd.c" + +#include "hook/setuid_hook.c" +#include "hook/core_hook.c" // lsm + +#include "selinux/selinux.c" +#include "selinux/sepolicy.c" +#include "selinux/rules.c" + +#ifdef CONFIG_KSU_TAMPER_SYSCALL_TABLE +#ifdef CONFIG_ARM64 + #include "hook/syscall_table_hook_arm64.c" +#elif defined(CONFIG_ARM) + #include "hook/syscall_table_hook_arm.c" +#endif +#endif /* CONFIG_KSU_TAMPER_SYSCALL_TABLE */ + +#if defined(CONFIG_KSU_KPROBES_KSUD) && !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) +#include "hook/kp_ksud.c" +#endif + +// __weak fn's +#include "kernel_compat.c" + +struct cred* ksu_cred; + +extern void ksu_supercalls_init(); + +int __init kernelsu_init(void) +{ +#ifdef CONFIG_KSU_DEBUG + pr_alert("*************************************************************"); + pr_alert("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **"); + pr_alert("** **"); + pr_alert("** You are running KernelSU in DEBUG mode **"); + pr_alert("** **"); + pr_alert("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **"); + pr_alert("*************************************************************"); +#endif 
+ + ksu_cred = prepare_creds(); + if (!ksu_cred) { + pr_err("prepare cred failed!\n"); + } + + ksu_feature_init(); + + ksu_supercalls_init(); + + ksu_sucompat_init(); // so the feature is registered + + ksu_kernel_umount_init(); // so the feature is registered + +#ifdef CONFIG_KSU_FEATURE_SULOG + ksu_sulog_init(); // so the feature is registered +#endif + +#ifdef CONFIG_KSU_FEATURE_ADBROOT + ksu_adb_root_init(); // so the feature is registered +#endif + +#ifdef CONFIG_KSU_FEATURE_SELINUX_HIDE + ksu_selinux_hide_init(); +#endif + + ksu_core_init(); + +#if defined(CONFIG_KSU_KPROBES_KSUD) && !defined(CONFIG_KSU_TAMPER_SYSCALL_TABLE) + kp_ksud_init(); +#endif + + ksu_allowlist_init(); + + ksu_throne_tracker_init(); + + ksu_ksud_init(); + + ksu_file_wrapper_init(); + + return 0; +} + +device_initcall(kernelsu_init); + +// MODULE_LICENSE("GPL"); +// MODULE_AUTHOR("weishu"); +// MODULE_DESCRIPTION("Android KernelSU"); diff --git a/drivers/kernelsu/ksud.c b/drivers/kernelsu/ksud.c deleted file mode 100644 index c880d2270c3a..000000000000 --- a/drivers/kernelsu/ksud.c +++ /dev/null @@ -1,644 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0) -#include -#else -#include -#endif -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -#include -#else -#include -#endif - -#include "manager.h" -#include "allowlist.h" -#include "arch.h" -#include "kernel_compat.h" -#include "klog.h" // IWYU pragma: keep -#include "ksud.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "kp_hook.h" -#endif -#include "selinux/selinux.h" -#include "throne_tracker.h" - -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) -extern int ksu_observer_init(void); -#endif - -bool ksu_module_mounted __read_mostly = false; -bool ksu_boot_completed __read_mostly = false; - 
-static const char KERNEL_SU_RC[] = - "\n" - - "on post-fs-data\n" - " start logd\n" - // We should wait for the post-fs-data finish - " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH - " post-fs-data\n" - "\n" - - "on nonencrypted\n" - " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n" - "\n" - - "on property:vold.decrypt=trigger_restart_framework\n" - " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n" - "\n" - - "on property:sys.boot_completed=1\n" - " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH - " boot-completed\n" - "\n" - - "\n"; - -static void stop_vfs_read_hook(void); -static void stop_execve_hook(void); -static void stop_input_hook(void); - -#ifdef CONFIG_KSU_MANUAL_HOOK -bool ksu_vfs_read_hook __read_mostly = true; -bool ksu_execveat_hook __read_mostly = true; -bool ksu_input_hook __read_mostly = true; -#endif - -void on_post_fs_data(void) -{ - static bool already_post_fs_data = false; - if (already_post_fs_data) { - pr_info("on_post_fs_data already done\n"); - return; - } - already_post_fs_data = true; - pr_info("on_post_fs_data!\n"); - ksu_load_allow_list(); -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) - ksu_observer_init(); -#endif - stop_input_hook(); -} - -extern void ext4_unregister_sysfs(struct super_block *sb); -int nuke_ext4_sysfs(const char *mnt) -{ - struct path path; - int err = kern_path(mnt, 0, &path); - if (err) { - pr_err("nuke path err: %d\n", err); - return err; - } - - struct super_block *sb = path.dentry->d_inode->i_sb; - const char *name = sb->s_type->name; - if (strcmp(name, "ext4") != 0) { - pr_info("nuke but module aren't mounted\n"); - path_put(&path); - return -EINVAL; - } - - ext4_unregister_sysfs(sb); - path_put(&path); - return 0; -} - -void on_module_mounted(void) -{ - pr_info("on_module_mounted!\n"); - ksu_module_mounted = true; -} - -void on_boot_completed(void) -{ - ksu_boot_completed = 
true; - pr_info("on_boot_completed!\n"); -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) - track_throne(true); -#endif -} - -#define MAX_ARG_STRINGS 0x7FFFFFFF - -static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) -{ - const char __user *native; - -#ifdef CONFIG_COMPAT - if (unlikely(argv.is_compat)) { - compat_uptr_t compat; - - if (get_user(compat, argv.ptr.compat + nr)) - return ERR_PTR(-EFAULT); - - return compat_ptr(compat); - } -#endif - - if (get_user(native, argv.ptr.native + nr)) - return ERR_PTR(-EFAULT); - - return native; -} - -/* - * count() counts the number of strings in array ARGV. - */ - -/* - * Make sure old GCC compiler can use __maybe_unused, - * Test passed in 4.4.x ~ 4.9.x when use GCC. - */ - -static int __maybe_unused count(struct user_arg_ptr argv, int max) -{ - int i = 0; - - if (argv.ptr.native != NULL) { - for (;;) { - const char __user *p = get_user_arg_ptr(argv, i); - - if (!p) - break; - - if (IS_ERR(p)) - return -EFAULT; - - if (i >= max) - return -E2BIG; - ++i; - - if (fatal_signal_pending(current)) - return -ERESTARTNOHAND; -#ifdef CONFIG_KSU_MANUAL_HOOK - cond_resched(); -#endif - } - } - return i; -} - -static void on_post_fs_data_cbfun(struct callback_head *cb) -{ - on_post_fs_data(); -} - -static struct callback_head on_post_fs_data_cb = { - .func = on_post_fs_data_cbfun -}; - -static inline void handle_second_stage(void) -{ - apply_kernelsu_rules(); - cache_sid(); - setup_ksu_cred(); -} - -static bool check_argv(struct user_arg_ptr argv, int index, - const char *expected, char *buf, size_t buf_len) -{ - const char __user *p; - int argc; - long ret; - - argc = count(argv, MAX_ARG_STRINGS); - if (argc <= index) { - return false; - } - - p = get_user_arg_ptr(argv, index); - if (IS_ERR_OR_NULL(p)) { - if (PTR_ERR(p)) { - pr_err("check_argv: invalid user pointer, err: %ld\n", - PTR_ERR(p)); - } - return false; - } - - ret = 
ksu_strncpy_from_user_nofault(buf, p, buf_len); - if (ret <= 0) { - pr_err("check_argv: failed to copy pointer, err: %ld\n", ret); - return false; - } - - buf[buf_len - 1] = '\0'; - - return !strcmp(buf, expected); -} - -// IMPORTANT NOTE: the call from execve_handler_pre WON'T provided correct value for envp and flags in GKI version -int ksu_handle_execveat_ksud(int *fd, struct filename **filename_ptr, - struct user_arg_ptr *argv, - struct user_arg_ptr *envp, int *flags) -{ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (!ksu_execveat_hook) { - return 0; - } -#endif - struct filename *filename; - - static const char app_process[] = "/system/bin/app_process"; - static bool first_zygote = true; - - /* This applies to versions Android 10+ */ - static const char system_bin_init[] = "/system/bin/init"; - /* This applies to versions between Android 6 ~ 9 */ - static const char old_system_init[] = "/init"; - static bool init_second_stage_executed = false; - - if (!filename_ptr) - return 0; - - filename = *filename_ptr; - if (IS_ERR(filename)) { - return 0; - } - -#ifdef CONFIG_KSU_MANUAL_HOOK - if (current->pid != 1 && is_init(get_current_cred())) { - if (unlikely(strcmp(filename->name, KSUD_PATH) == 0)) { - pr_info("escape to root for init executing ksud: %d\n", - current->pid); - escape_to_root_for_init(); - } - } -#endif - - if (unlikely(!memcmp(filename->name, system_bin_init, - sizeof(system_bin_init) - 1) && - argv)) { - char buf[16]; - if (!init_second_stage_executed && - check_argv(*argv, 1, "second_stage", buf, sizeof(buf))) { - pr_info("/system/bin/init second_stage executed\n"); - handle_second_stage(); - init_second_stage_executed = true; - } - } else if (unlikely(!memcmp(filename->name, old_system_init, - sizeof(old_system_init) - 1) && - argv)) { - char buf[16]; - if (!init_second_stage_executed && - check_argv(*argv, 1, "--second-stage", buf, sizeof(buf))) { - /* This applies to versions between Android 6 ~ 7 */ - pr_info("/init second_stage executed\n"); - 
handle_second_stage(); - init_second_stage_executed = true; - } else if (count(*argv, MAX_ARG_STRINGS) == 1 && - !init_second_stage_executed && envp) { - /* This applies to versions between Android 8 ~ 9 */ - int envc = count(*envp, MAX_ARG_STRINGS); - if (envc > 0) { - int n; - for (n = 1; n <= envc; n++) { - const char __user *p = - get_user_arg_ptr(*envp, n); - if (!p || IS_ERR(p)) { - continue; - } - char env[256]; - // Reading environment variable strings from user space - if (ksu_strncpy_from_user_nofault( - env, p, sizeof(env)) < 0) - continue; - // Parsing environment variable names and values - char *env_name = env; - char *env_value = strchr(env, '='); - if (env_value == NULL) - continue; - // Replace equal sign with string terminator - *env_value = '\0'; - env_value++; - // Check if the environment variable name and value are matching - if (!strcmp(env_name, - "INIT_SECOND_STAGE") && - (!strcmp(env_value, "1") || - !strcmp(env_value, "true"))) { - pr_info("/init second_stage executed\n"); - handle_second_stage(); - init_second_stage_executed = - true; - } - } - } - } - } - - if (unlikely(first_zygote && - !memcmp(filename->name, app_process, - sizeof(app_process) - 1) && - argv)) { - char buf[16]; - if (check_argv(*argv, 1, "-Xzygote", buf, sizeof(buf))) { - pr_info("exec zygote, /data prepared, second_stage: %d\n", - init_second_stage_executed); - rcu_read_lock(); - struct task_struct *init_task = - rcu_dereference(current->real_parent); - if (init_task) - task_work_add(init_task, &on_post_fs_data_cb, - TWA_RESUME); - rcu_read_unlock(); - first_zygote = false; - stop_execve_hook(); - } - } - - return 0; -} - -static ssize_t (*orig_read)(struct file *, char __user *, size_t, loff_t *); -static ssize_t (*orig_read_iter)(struct kiocb *, struct iov_iter *); -static struct file_operations fops_proxy; -static ssize_t ksu_rc_pos = 0; -const size_t ksu_rc_len = sizeof(KERNEL_SU_RC) - 1; - -// 
https://cs.android.com/android/platform/superproject/main/+/main:system/core/init/parser.cpp;l=144;drc=61197364367c9e404c7da6900658f1b16c42d0da -// https://cs.android.com/android/platform/superproject/main/+/main:system/libbase/file.cpp;l=241-243;drc=61197364367c9e404c7da6900658f1b16c42d0da -// The system will read init.rc file until EOF, whenever read() returns 0, -// so we begin append ksu rc when we meet EOF. - -static ssize_t read_proxy(struct file *file, char __user *buf, size_t count, - loff_t *pos) -{ - ssize_t ret = 0; - size_t append_count; - if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len) - goto append_ksu_rc; - - ret = orig_read(file, buf, count, pos); - if (ret != 0 || ksu_rc_pos >= ksu_rc_len) { - return ret; - } else { - pr_info("read_proxy: orig read finished, start append rc\n"); - } -append_ksu_rc: - append_count = ksu_rc_len - ksu_rc_pos; - if (append_count > count - ret) - append_count = count - ret; - // copy_to_user returns the number of not copied - if (copy_to_user(buf + ret, KERNEL_SU_RC + ksu_rc_pos, append_count)) { - pr_info("read_proxy: append error, totally appended %zd\n", - ksu_rc_pos); - } else { - pr_info("read_proxy: append %zu\n", append_count); - - ksu_rc_pos += append_count; - if (ksu_rc_pos == ksu_rc_len) { - pr_info("read_proxy: append done\n"); - } - ret += append_count; - } - - return ret; -} - -static ssize_t read_iter_proxy(struct kiocb *iocb, struct iov_iter *to) -{ - ssize_t ret = 0; - size_t append_count; - if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len) - goto append_ksu_rc; - - ret = orig_read_iter(iocb, to); - if (ret != 0 || ksu_rc_pos >= ksu_rc_len) { - return ret; - } else { - pr_info("read_iter_proxy: orig read finished, start append rc\n"); - } -append_ksu_rc: - // copy_to_iter returns the number of copied bytes - append_count = copy_to_iter(KERNEL_SU_RC + ksu_rc_pos, - ksu_rc_len - ksu_rc_pos, to); - if (!append_count) { - pr_info("read_iter_proxy: append error, totally appended %zd\n", - ksu_rc_pos); - } else { - 
pr_info("read_iter_proxy: append %zu\n", append_count); - - ksu_rc_pos += append_count; - if (ksu_rc_pos == ksu_rc_len) { - pr_info("read_iter_proxy: append done\n"); - } - ret += append_count; - } - return ret; -} - -static bool check_init_path(char *dpath) -{ - const char *valid_paths[] = { "/system/etc/init/hw/init.rc", - "/init.rc" }; - bool path_match = false; - int i; - - for (i = 0; i < ARRAY_SIZE(valid_paths); i++) { - if (strcmp(dpath, valid_paths[i]) == 0) { - path_match = true; - break; - } - } - - if (!path_match) { - pr_err("vfs_read: couldn't determine init.rc path for %s\n", - dpath); - return false; - } - - pr_info("vfs_read: got init.rc path: %s\n", dpath); - return true; -} - -int ksu_handle_vfs_read(struct file **file_ptr, char __user **buf_ptr, - size_t *count_ptr, loff_t **pos) -{ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (!ksu_vfs_read_hook) { - return 0; - } -#endif - - struct file *file; - size_t count; - - if (strcmp(current->comm, "init")) { - // we are only interest in `init` process - return 0; - } - - file = *file_ptr; - if (IS_ERR(file)) { - return 0; - } - - if (!d_is_reg(file->f_path.dentry)) { - return 0; - } - - const char *short_name = file->f_path.dentry->d_name.name; - if (strcmp(short_name, "init.rc")) { - // we are only interest `init.rc` file name file - return 0; - } - char path[256]; - char *dpath = d_path(&file->f_path, path, sizeof(path)); - - if (IS_ERR(dpath)) { - return 0; - } - - if (!check_init_path(dpath)) { - return 0; - } - - // we only process the first read - static bool rc_hooked = false; - if (rc_hooked) { - // we don't need this kprobe, unregister it! - stop_vfs_read_hook(); - return 0; - } - rc_hooked = true; - - // now we can sure that the init process is reading - // `/system/etc/init/hw/init.rc` or `/init.rc` - count = *count_ptr; - - pr_info("vfs_read: %s, comm: %s, count: %zu, rc_count: %zu\n", dpath, - current->comm, count, ksu_rc_len); - - // Now we need to proxy the read and modify the result! 
- // But, we can not modify the file_operations directly, because it's in read-only memory. - // We just replace the whole file_operations with a proxy one. - memcpy(&fops_proxy, file->f_op, sizeof(struct file_operations)); - orig_read = file->f_op->read; - if (orig_read) { - fops_proxy.read = read_proxy; - } - orig_read_iter = file->f_op->read_iter; - if (orig_read_iter) { - fops_proxy.read_iter = read_iter_proxy; - } - // replace the file_operations - file->f_op = &fops_proxy; - - return 0; -} - -int ksu_handle_sys_read(unsigned int fd, char __user **buf_ptr, - size_t *count_ptr) -{ - struct file *file = fget(fd); - if (!file) { - return 0; - } - int result = ksu_handle_vfs_read(&file, buf_ptr, count_ptr, NULL); - fput(file); - return result; -} - -static unsigned int volumedown_pressed_count = 0; - -static bool is_volumedown_enough(unsigned int count) -{ - return count >= 3; -} - -int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code, - int *value) -{ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (!ksu_input_hook) { - return 0; - } -#endif - - if (*type == EV_KEY && *code == KEY_VOLUMEDOWN && *value) { - // key pressed, count it - volumedown_pressed_count++; - pr_info("input_handle_event: vol_down pressed count: %u\n", - volumedown_pressed_count); - if (is_volumedown_enough(volumedown_pressed_count)) { - pr_info("input_handle_event: vol_down pressed MAX! 
safe mode is active!\n"); - stop_input_hook(); - } - } - - return 0; -} - -bool ksu_is_safe_mode(void) -{ - return is_volumedown_enough(volumedown_pressed_count); -} - -static void stop_vfs_read_hook(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_stop(VFS_READ_HOOK_KP); -#else - ksu_vfs_read_hook = false; - pr_info("stop vfs_read_hook\n"); -#endif -} - -static void stop_execve_hook(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_stop(EXECVE_HOOK_KP); -#else - ksu_execveat_hook = false; - pr_info("stop execve_hook\n"); -#endif -} - -static void stop_input_hook(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_stop(INPUT_EVENT_HOOK_KP); -#else - // No need to stop when its already stopped. - if (!ksu_input_hook) { - return; - } - ksu_input_hook = false; - pr_info("stop input_hook\n"); -#endif -} - -// ksud: module support -void ksu_ksud_init(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_init(); -#endif -} - -void ksu_ksud_exit(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_ksud_exit(); -#endif -} diff --git a/drivers/kernelsu/ksud.h b/drivers/kernelsu/ksud.h deleted file mode 100644 index 68c545714c24..000000000000 --- a/drivers/kernelsu/ksud.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __KSU_H_KSUD -#define __KSU_H_KSUD - -#include - -#define KSUD_PATH "/data/adb/ksud" - -void ksu_ksud_init(void); -void ksu_ksud_exit(void); - -void on_post_fs_data(void); -void on_module_mounted(void); -void on_boot_completed(void); - -bool ksu_is_safe_mode(void); - -int nuke_ext4_sysfs(const char *mnt); - -extern u32 ksu_file_sid; -extern bool ksu_module_mounted; -extern bool ksu_boot_completed; - -struct user_arg_ptr { -#ifdef CONFIG_COMPAT - bool is_compat; -#endif - union { - const char __user *const __user *native; -#ifdef CONFIG_COMPAT - const compat_uptr_t __user *compat; -#endif - } ptr; -}; - -int ksu_handle_execveat_ksud(int *fd, struct filename **filename_ptr, - struct user_arg_ptr *argv, - struct user_arg_ptr *envp, int *flags); 
- -#endif diff --git a/drivers/kernelsu/ksuinit.c b/drivers/kernelsu/ksuinit.c deleted file mode 100644 index 75cfced0268d..000000000000 --- a/drivers/kernelsu/ksuinit.c +++ /dev/null @@ -1,140 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include /* LINUX_VERSION_CODE, KERNEL_VERSION macros */ - -#include "allowlist.h" -#include "arch.h" -#include "feature.h" -#include "klog.h" // IWYU pragma: keep -#include "ksu.h" -#include "throne_tracker.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "syscall_handler.h" -#endif -#ifdef CONFIG_KSU_MANUAL_HOOK -#include "setuid_hook.h" -#include "sucompat.h" -#endif -#include "ksud.h" -#include "supercalls.h" -#include "ksu.h" -#include "file_wrapper.h" - -struct cred *ksu_cred; - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) -extern void __init ksu_lsm_hook_init(void); -#endif - -int __init kernelsu_init(void) -{ -#ifdef CONFIG_KSU_DEBUG - pr_alert( - "*************************************************************"); - pr_alert( - "** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **"); - pr_alert( - "** **"); - pr_alert( - "** You are running KernelSU in DEBUG mode **"); - pr_alert( - "** **"); - pr_alert( - "** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **"); - pr_alert( - "*************************************************************"); -#endif - - ksu_cred = prepare_creds(); - if (!ksu_cred) { - pr_err("prepare cred failed!\n"); - } - - ksu_feature_init(); - - ksu_supercalls_init(); - -#ifdef CONFIG_KSU_SYSCALL_HOOK - ksu_syscall_hook_manager_init(); -#endif -#ifdef CONFIG_KSU_MANUAL_HOOK -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 8, 0) - ksu_lsm_hook_init(); -#endif - ksu_setuid_hook_init(); - ksu_sucompat_init(); -#endif - - ksu_allowlist_init(); - - ksu_throne_tracker_init(); - - ksu_ksud_init(); - - ksu_file_wrapper_init(); - -#ifdef MODULE -#ifndef CONFIG_KSU_DEBUG - kobject_del(&THIS_MODULE->mkobj.kobj); -#endif -#endif - return 0; -} 
- -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) -extern void ksu_observer_exit(void); -#endif - -void kernelsu_exit(void) -{ - ksu_allowlist_exit(); - - ksu_throne_tracker_exit(); - -#if defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) - ksu_observer_exit(); -#endif - - ksu_ksud_exit(); - -#ifdef CONFIG_KSU_SYSCALL_HOOK - ksu_syscall_hook_manager_exit(); -#endif -#ifdef CONFIG_KSU_MANUAL_HOOK - ksu_sucompat_exit(); - ksu_setuid_hook_exit(); -#endif - - ksu_supercalls_exit(); - - ksu_feature_exit(); - - if (ksu_cred) { - put_cred(ksu_cred); - } -} - -module_init(kernelsu_init); -module_exit(kernelsu_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("weishu"); -MODULE_DESCRIPTION("Android KernelSU"); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 13, 0) -MODULE_IMPORT_NS("VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver"); -#else -MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver); -#endif -#endif diff --git a/drivers/kernelsu/linux_hashtable.h b/drivers/kernelsu/linux_hashtable.h new file mode 100644 index 000000000000..3d4516102bee --- /dev/null +++ b/drivers/kernelsu/linux_hashtable.h @@ -0,0 +1,243 @@ +/* + * Statically sized hash table implementation + * (C) 2012 Sasha Levin + */ + +#ifndef _LINUX_HASHTABLE_H +#define _LINUX_HASHTABLE_H + +#include +#include +#include +#include +#include + +#define DEFINE_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] = \ + { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } + +#define DECLARE_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] + +#define HASH_SIZE(name) (ARRAY_SIZE(name)) +#define HASH_BITS(name) ilog2(HASH_SIZE(name)) + +/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. 
*/ +#define hash_min(val, bits) \ + (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) + +static inline void __hash_init(struct hlist_head *ht, unsigned int sz) +{ + unsigned int i; + + for (i = 0; i < sz; i++) + INIT_HLIST_HEAD(&ht[i]); +} + +/** + * hash_init - initialize a hash table + * @hashtable: hashtable to be initialized + * + * Calculates the size of the hashtable from the given parameter, otherwise + * same as hash_init_size. + * + * This has to be a macro since HASH_BITS() will not work on pointers since + * it calculates the size during preprocessing. + */ +#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) + +/** + * hash_add - add an object to a hashtable + * @hashtable: hashtable to add to + * @node: the &struct hlist_node of the object to be added + * @key: the key of the object to be added + */ +#define hash_add(hashtable, node, key) \ + hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) + +/** + * hash_add_rcu - add an object to a rcu enabled hashtable + * @hashtable: hashtable to add to + * @node: the &struct hlist_node of the object to be added + * @key: the key of the object to be added + */ +#define hash_add_rcu(hashtable, node, key) \ + hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) + +/** + * hash_hashed - check whether an object is in any hashtable + * @node: the &struct hlist_node of the object to be checked + */ +static inline bool hash_hashed(struct hlist_node *node) +{ + return !hlist_unhashed(node); +} + +static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) +{ + unsigned int i; + + for (i = 0; i < sz; i++) + if (!hlist_empty(&ht[i])) + return false; + + return true; +} + +/** + * hash_empty - check whether a hashtable is empty + * @hashtable: hashtable to check + * + * This has to be a macro since HASH_BITS() will not work on pointers since + * it calculates the size during preprocessing. 
+ */ +#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) + +/** + * hash_del - remove an object from a hashtable + * @node: &struct hlist_node of the object to remove + */ +static inline void hash_del(struct hlist_node *node) +{ + hlist_del_init(node); +} + +/** + * hash_del_rcu - remove an object from a rcu enabled hashtable + * @node: &struct hlist_node of the object to remove + */ +static inline void hash_del_rcu(struct hlist_node *node) +{ + hlist_del_init_rcu(node); +} + +#undef hlist_entry_safe +#undef hlist_for_each_entry_rcu +#undef hlist_for_each_entry +#undef hlist_for_each_entry_safe + +#define hlist_entry_safe(ptr, type, member) \ + (ptr) ? hlist_entry(ptr, type, member) : NULL + +/** + * hlist_for_each_entry_rcu - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define hlist_for_each_entry_rcu(pos, head, member) \ + for (pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\ + typeof(*(pos)), member); \ + pos; \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ + &(pos)->member)), typeof(*(pos)), member)) +/** + * hlist_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(pos, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +/** + * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. 
+ * @n: another &struct hlist_node to use as temporary storage + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ + pos && ({ n = pos->member.next; 1; }); \ + pos = hlist_entry_safe(n, typeof(*pos), member)) + +#undef hash_for_each +#undef hash_for_each_rcu +#undef hash_for_each_safe +#undef hash_for_each_possible +#undef hash_for_each_possible_rcu + +/** + * hash_for_each - iterate over a hashtable + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each(name, bkt, obj, member) \ + for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ + (bkt)++)\ + hlist_for_each_entry(obj, &name[bkt], member) + +/** + * hash_for_each_rcu - iterate over a rcu enabled hashtable + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each_rcu(name, bkt, obj, member) \ + for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ + (bkt)++)\ + hlist_for_each_entry_rcu(obj, &name[bkt], member) + +/** + * hash_for_each_safe - iterate over a hashtable safe against removal of + * hash entry + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @tmp: a &struct used for temporary storage + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each_safe(name, bkt, tmp, obj, member) \ + for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ + (bkt)++)\ + hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) + +/** + * hash_for_each_possible - 
iterate over all possible objects hashing to the + * same bucket + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible(name, obj, member, key) \ + hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member) + +/** + * hash_for_each_possible_rcu - iterate over all possible objects hashing to the + * same bucket in an rcu enabled hashtable + * in a rcu enabled hashtable + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible_rcu(name, obj, member, key) \ + hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ + member) + +/** + * hash_for_each_possible_safe - iterate over all possible objects hashing to the + * same bucket safe against removals + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @tmp: a &struct used for temporary storage + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible_safe(name, obj, tmp, member, key) \ + hlist_for_each_entry_safe(obj, tmp,\ + &name[hash_min(key, HASH_BITS(name))], member) + + +#endif diff --git a/drivers/kernelsu/lsm_hook.c b/drivers/kernelsu/lsm_hook.c deleted file mode 100644 index e1c0a76ec5ba..000000000000 --- a/drivers/kernelsu/lsm_hook.c +++ /dev/null @@ -1,117 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) -static int ksu_key_permission(key_ref_t key_ref, const struct cred *cred, - unsigned perm) -{ - if (init_session_keyring != NULL) { - return 
0; - } - if (strcmp(current->comm, "init")) { - // we are only interested in `init` process - return 0; - } - init_session_keyring = cred->session_keyring; - pr_info("kernel_compat: got init_session_keyring\n"); - return 0; -} -#endif - -static int ksu_inode_rename(struct inode *old_inode, struct dentry *old_dentry, - struct inode *new_inode, struct dentry *new_dentry) -{ - // skip kernel threads - if (!current->mm) { - return 0; - } - - // skip non system uid - if (current_uid().val != 1000) { - return 0; - } - - if (!old_dentry || !new_dentry) { - return 0; - } - - // /data/system/packages.list.tmp -> /data/system/packages.list - if (strcmp(new_dentry->d_iname, "packages.list")) { - return 0; - } - - char path[128]; - char *buf = dentry_path_raw(new_dentry, path, sizeof(path)); - if (IS_ERR(buf)) { - pr_err("dentry_path_raw failed.\n"); - return 0; - } - - if (!strstr(buf, "/system/packages.list")) { - return 0; - } - - pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname, - new_dentry->d_iname, buf); - - /* - * RKSU note: - * track_throne(true) only occurs on on_boot_completed event. - * When using this LSM, we must handle it here, else it returns - * ENOENT (-2). 
- */ - static bool did = false; - if (ksu_boot_completed && !did) { - did = true; - track_throne(true); - return 0; - } - - track_throne(false); - - return 0; -} - -static int ksu_task_fix_setuid(struct cred *new, const struct cred *old, - int flags) -{ - if (!new || !old) - return 0; - - return ksu_handle_setuid_common(new->uid.val, old->uid.val, new->euid.val); -} - -static struct security_hook_list ksu_hooks[] = { -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ - defined(CONFIG_IS_HW_HISI) || defined(CONFIG_KSU_ALLOWLIST_WORKAROUND) - LSM_HOOK_INIT(key_permission, ksu_key_permission), -#endif - LSM_HOOK_INIT(inode_rename, ksu_inode_rename), - LSM_HOOK_INIT(task_fix_setuid, ksu_task_fix_setuid) -}; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) -static const struct lsm_id ksu_lsmid = { - .name = "ksu", - .id = 912, -}; -#endif - -void __init ksu_lsm_hook_init(void) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) - security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), &ksu_lsmid); -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks), "ksu"); -#else - // https://elixir.bootlin.com/linux/v4.10.17/source/include/linux/lsm_hooks.h#L1892 - security_add_hooks(ksu_hooks, ARRAY_SIZE(ksu_hooks)); -#endif - pr_info("LSM hooks initialized.\n"); -} diff --git a/drivers/kernelsu/manager.h b/drivers/kernelsu/manager.h deleted file mode 100644 index a22ac52ec1f2..000000000000 --- a/drivers/kernelsu/manager.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef __KSU_H_KSU_MANAGER -#define __KSU_H_KSU_MANAGER - -#include -#include -#include "allowlist.h" - -#define KSU_INVALID_APPID -1 - -extern uid_t ksu_manager_appid; // DO NOT DIRECT USE - -static inline bool ksu_is_manager_appid_valid(void) -{ - return ksu_manager_appid != KSU_INVALID_APPID; -} - -static inline bool is_manager(void) -{ - return unlikely(ksu_manager_appid == - current_uid().val % PER_USER_RANGE); -} - -static inline uid_t ksu_get_manager_appid(void) 
-{ - return ksu_manager_appid; -} - -static inline void ksu_set_manager_appid(uid_t appid) -{ - ksu_manager_appid = appid; -} - -static inline void ksu_invalidate_manager_uid(void) -{ - ksu_manager_appid = KSU_INVALID_APPID; -} - -#endif diff --git a/drivers/kernelsu/apk_sign.c b/drivers/kernelsu/manager/apk_sign.c similarity index 73% rename from drivers/kernelsu/apk_sign.c rename to drivers/kernelsu/manager/apk_sign.c index 4c6c63d0d886..b5965842b5e2 100644 --- a/drivers/kernelsu/apk_sign.c +++ b/drivers/kernelsu/manager/apk_sign.c @@ -1,34 +1,8 @@ -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_KSU_DEBUG -#include -#endif -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) -#include -#else -#include -#endif - -#include "apk_sign.h" -#include "app_profile.h" -#include "klog.h" // IWYU pragma: keep -#include "kernel_compat.h" -#include "manager_sign.h" - struct sdesc { struct shash_desc shash; char ctx[]; }; -static apk_sign_key_t apk_sign_keys[] = { - { EXPECTED_SIZE_RSUNTK, EXPECTED_HASH_RSUNTK }, // RKSU -}; - static struct sdesc *init_sdesc(struct crypto_shash *alg) { struct sdesc *sdesc; @@ -76,42 +50,39 @@ static int ksu_sha256(const unsigned char *data, unsigned int datalen, return ret; } -static bool check_block(struct file *fp, u32 *size4, loff_t *pos, u32 *offset) +static bool check_block(struct file *fp, u32 *size4, loff_t *pos, u32 *offset, + unsigned expected_size, const char *expected_sha256) { - int i; - apk_sign_key_t sign_key; - - ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer-sequence length - ksu_kernel_read_compat(fp, size4, 0x4, pos); // signer length - ksu_kernel_read_compat(fp, size4, 0x4, pos); // signed data length + kernel_read(fp, size4, 0x4, pos); // signer-sequence length + kernel_read(fp, size4, 0x4, pos); // signer length + kernel_read(fp, size4, 0x4, pos); // signed data length *offset += 0x4 * 3; - ksu_kernel_read_compat(fp, size4, 0x4, pos); // digests-sequence length + kernel_read(fp, 
size4, 0x4, pos); // digests-sequence length *pos += *size4; *offset += 0x4 + *size4; - ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificates length - ksu_kernel_read_compat(fp, size4, 0x4, pos); // certificate length + kernel_read(fp, size4, 0x4, pos); // certificates length + kernel_read(fp, size4, 0x4, pos); // certificate length *offset += 0x4 * 2; - for (i = 0; i < ARRAY_SIZE(apk_sign_keys); i++) { - sign_key = apk_sign_keys[i]; - - if (*size4 != sign_key.size) - continue; + if (*size4 == expected_size) { *offset += *size4; #define CERT_MAX_LENGTH 1024 - char cert[CERT_MAX_LENGTH]; + char *cert __attribute__((__cleanup__(ksu_kfree_byref))) = kzalloc(CERT_MAX_LENGTH, GFP_KERNEL); + if (!cert) + return false; + if (*size4 > CERT_MAX_LENGTH) { pr_info("cert length overlimit\n"); return false; } - ksu_kernel_read_compat(fp, cert, *size4, pos); + kernel_read(fp, cert, *size4, pos); unsigned char digest[SHA256_DIGEST_SIZE]; - if (ksu_sha256(cert, *size4, digest) < 0) { + if (ksu_sha256(cert, *size4, digest) < 0 ) { pr_info("sha256 error\n"); return false; } @@ -121,8 +92,8 @@ static bool check_block(struct file *fp, u32 *size4, loff_t *pos, u32 *offset) bin2hex(hash_str, digest, SHA256_DIGEST_SIZE); pr_info("sha256: %s, expected: %s\n", hash_str, - sign_key.sha256); - if (strcmp(sign_key.sha256, hash_str) == 0) { + expected_sha256); + if (strcmp(expected_sha256, hash_str) == 0) { return true; } } @@ -151,7 +122,7 @@ static bool has_v1_signature_file(struct file *fp) loff_t pos = 0; - while (ksu_kernel_read_compat(fp, &header, + while (kernel_read(fp, &header, sizeof(struct zip_entry_header), &pos) == sizeof(struct zip_entry_header)) { if (header.signature != 0x04034b50) { @@ -161,7 +132,7 @@ static bool has_v1_signature_file(struct file *fp) // Read the entry file name if (header.file_name_length == sizeof(MANIFEST) - 1) { char fileName[sizeof(MANIFEST)]; - ksu_kernel_read_compat(fp, fileName, + kernel_read(fp, fileName, header.file_name_length, &pos); 
fileName[header.file_name_length] = '\0'; @@ -182,7 +153,9 @@ static bool has_v1_signature_file(struct file *fp) return false; } -static __always_inline bool check_v2_signature(char *path) +static __always_inline bool check_v2_signature(char *path, + unsigned expected_size, + const char *expected_sha256) { unsigned char buffer[0x11] = { 0 }; u32 size4; @@ -196,9 +169,27 @@ static __always_inline bool check_v2_signature(char *path) bool v3_1_signing_exist = false; int i; - struct file *fp = ksu_filp_open_compat(path, O_RDONLY, 0); + + struct path kpath; + if (kern_path(path, 0, &kpath)) + return false; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) + if (inode_is_locked(kpath.dentry->d_inode)) +#else + if (mutex_is_locked(&kpath.dentry->d_inode->i_mutex)) +#endif + { + pr_info("%s: inode is locked for %s\n", __func__, path); + path_put(&kpath); + return false; + } + + path_put(&kpath); + + struct file *fp = filp_open(path, O_RDONLY, 0); if (IS_ERR(fp)) { - pr_err("open %s error.\n", path); + // pr_err("open %s error.\n", path); return false; } @@ -208,11 +199,11 @@ static __always_inline bool check_v2_signature(char *path) // https://en.wikipedia.org/wiki/Zip_(file_format)#End_of_central_directory_record_(EOCD) for (i = 0;; ++i) { unsigned short n; - pos = generic_file_llseek(fp, -i - 2, SEEK_END); - ksu_kernel_read_compat(fp, &n, 2, &pos); + pos = vfs_llseek(fp, -i - 2, SEEK_END); + kernel_read(fp, &n, 2, &pos); if (n == i) { pos -= 22; - ksu_kernel_read_compat(fp, &size4, 4, &pos); + kernel_read(fp, &size4, 4, &pos); if ((size4 ^ 0xcafebabeu) == 0xccfbf1eeu) { break; } @@ -225,17 +216,17 @@ static __always_inline bool check_v2_signature(char *path) pos += 12; // offset - ksu_kernel_read_compat(fp, &size4, 0x4, &pos); + kernel_read(fp, &size4, 0x4, &pos); pos = size4 - 0x18; - ksu_kernel_read_compat(fp, &size8, 0x8, &pos); - ksu_kernel_read_compat(fp, buffer, 0x10, &pos); - if (strcmp((char *)buffer, "APK Sig Block 42")) { + kernel_read(fp, &size8, 0x8, 
&pos); + kernel_read(fp, buffer, 0x10, &pos); + if (memcmp(buffer, "APK Sig Block 42", 16)) { goto clean; } pos = size4 - (size8 + 0x8); - ksu_kernel_read_compat(fp, &size_of_block, 0x8, &pos); + kernel_read(fp, &size_of_block, 0x8, &pos); if (size_of_block != size8) { goto clean; } @@ -244,17 +235,17 @@ static __always_inline bool check_v2_signature(char *path) while (loop_count++ < 10) { uint32_t id; uint32_t offset; - ksu_kernel_read_compat(fp, &size8, 0x8, - &pos); // sequence length + kernel_read(fp, &size8, 0x8, &pos); // sequence length if (size8 == size_of_block) { break; } - ksu_kernel_read_compat(fp, &id, 0x4, &pos); // id + kernel_read(fp, &id, 0x4, &pos); // id offset = 4; if (id == 0x7109871au) { v2_signing_blocks++; v2_signing_valid = - check_block(fp, &size4, &pos, &offset); + check_block(fp, &size4, &pos, &offset, + expected_size, expected_sha256); } else if (id == 0xf05368c0u) { // http://aospxref.com/android-14.0.0_r2/xref/frameworks/base/core/java/android/util/apk/ApkSignatureSchemeV3Verifier.java#73 v3_signing_exist = true; @@ -302,8 +293,6 @@ static __always_inline bool check_v2_signature(char *path) int ksu_debug_manager_appid = -1; -#include "manager.h" - static int set_expected_size(const char *val, const struct kernel_param *kp) { int rv = param_set_uint(val, kp); @@ -318,7 +307,7 @@ static struct kernel_param_ops expected_size_ops = { }; module_param_cb(ksu_debug_manager_appid, &expected_size_ops, - &ksu_debug_manager_appid, S_IRUSR | S_IWUSR); + &ksu_debug_manager_appid, S_IRUSR | S_IWUSR); #endif @@ -363,5 +352,7 @@ int get_pkg_from_apk_path(char *pkg, const char *path) bool is_manager_apk(char *path) { - return check_v2_signature(path); + return (check_v2_signature(path, 0x363, "4359c171f32543394cbc23ef908c4bb94cad7c8087002ba164c8230948c21549") // dummy.keystore + || check_v2_signature(path, 0x033b, "c371061b19d8c7d7d6133c6a9bafe198fa944e50c1b31c9d8daa8d7f1fc2d2d6") // kernelsu official + ); } diff --git a/drivers/kernelsu/apk_sign.h 
b/drivers/kernelsu/manager/apk_sign.h similarity index 85% rename from drivers/kernelsu/apk_sign.h rename to drivers/kernelsu/manager/apk_sign.h index b4d4ce3756c4..65b3a1e51cdd 100644 --- a/drivers/kernelsu/apk_sign.h +++ b/drivers/kernelsu/manager/apk_sign.h @@ -1,10 +1,7 @@ #ifndef __KSU_H_APK_V2_SIGN #define __KSU_H_APK_V2_SIGN -#include - bool is_manager_apk(char *path); - int get_pkg_from_apk_path(char *pkg, const char *path); #endif diff --git a/drivers/kernelsu/manager/manager_identity.h b/drivers/kernelsu/manager/manager_identity.h new file mode 100644 index 000000000000..0891a6a6f571 --- /dev/null +++ b/drivers/kernelsu/manager/manager_identity.h @@ -0,0 +1,41 @@ +#ifndef __KSU_H_MANAGER_IDENTITY +#define __KSU_H_MANAGER_IDENTITY + +// #include "allowlist.h" + +#define KSU_INVALID_APPID -1 +#define KSU_PER_USER_RANGE 100000 + +extern uid_t ksu_manager_appid; // DO NOT DIRECT USE + +static inline bool ksu_is_manager_appid_valid() +{ + return ksu_manager_appid != KSU_INVALID_APPID; +} + +static inline bool is_manager() +{ + return unlikely(ksu_manager_appid == current_uid().val % KSU_PER_USER_RANGE); +} + +static inline bool is_uid_manager(uid_t uid) +{ + return unlikely(ksu_manager_appid == uid % KSU_PER_USER_RANGE); +} + +static inline uid_t ksu_get_manager_appid() +{ + return ksu_manager_appid; +} + +static inline void ksu_set_manager_appid(uid_t appid) +{ + ksu_manager_appid = appid; +} + +static inline void ksu_invalidate_manager_uid() +{ + ksu_manager_appid = KSU_INVALID_APPID; +} + +#endif diff --git a/drivers/kernelsu/manager/pkg_observer.c b/drivers/kernelsu/manager/pkg_observer.c new file mode 100644 index 000000000000..3a913a6b5ed0 --- /dev/null +++ b/drivers/kernelsu/manager/pkg_observer.c @@ -0,0 +1,96 @@ +/** + * ! this is on inode_rename, NOT fsnotify + * we have access to LSM and overhead is way lower. + * we watch one file, check ifs on the same parent inode. + * a few int compare and a ptr compare. thats it. 
+ * as for throne tracker, we just async it by hand + * by offloading it to a kthread. + */ + +static uintptr_t system_dir_inode_ptr = NULL; + +__attribute__((cold)) +static noinline void ksu_grab_data_system_inode() +{ + struct path path; + int ret = kern_path("/data/system", LOOKUP_FOLLOW, &path); + if (ret) { + pr_info("renameat: /data/system not ready? ret: (%d)\n", ret); + return; + } + + system_dir_inode_ptr = (uintptr_t)d_inode(path.dentry); + pr_info("renameat: cached /data/system d_inode: 0x%lx\n", system_dir_inode_ptr); + path_put(&path); +} + +__attribute__((cold)) +static noinline void ksu_rename_observer_slow(struct dentry *old_dentry, struct dentry *new_dentry) +{ + system_dir_inode_ptr = NULL; // reset cached inode + + char path[128] = { 0 }; + char *buf = dentry_path_raw(new_dentry, path, sizeof(path) - 1); + if (IS_ERR(buf)) { + pr_err("dentry_path_raw failed.\n"); + return; + } + + if (!strstr(buf, "/system/packages.list")) + return; + + pr_info("renameat: %s -> %s, new path: %s\n", old_dentry->d_iname, new_dentry->d_iname, buf); + track_throne(false); + return; +} + +static inline void ksu_rename_observer(struct dentry *old_dentry, struct dentry *new_dentry) +{ + // skip kernel threads + if (!current->mm) + return; + + if (!old_dentry || !new_dentry) + return; + + // skip non system uid + if (likely(current_uid().val != 1000)) + return; + + // HASH_LEN_DECLARE see dcache.h + if (likely(new_dentry->d_name.len != sizeof("packages.list") - 1 )) + return; + + // /data/system/packages.list.tmp -> /data/system/packages.list + if (likely(!!__builtin_memcmp(new_dentry->d_iname, "packages.list", sizeof("packages.list") - 1 ))) + return; + + // cache dir inode, we try to go for fast path, lockless + if (unlikely(!system_dir_inode_ptr)) + ksu_grab_data_system_inode(); + + if (unlikely(!system_dir_inode_ptr)) + goto slow_path; + + if (unlikely(!new_dentry->d_parent || !new_dentry->d_parent->d_inode)) + goto slow_path; + + /* + * fallback to slow path, but 
this should NOT change unless someone overlays /data/system + * but then again maybe https://github.com/tiann/KernelSU/pull/2633#discussion_r2141740346 + * but /data is casefolded, overlaying is really really unlikely + * we self heal this thing, so on enxt run, it will try to grab d inode again + * alternatively we can use packages.list inode change as trigger too, however, + * we need to save last state. more writes. + */ + if (unlikely((uintptr_t)new_dentry->d_parent->d_inode != system_dir_inode_ptr)) + goto slow_path; + + pr_info("renameat: %s -> %s, /data/system d_inode: 0x%lx \n", old_dentry->d_iname, new_dentry->d_iname, system_dir_inode_ptr); + track_throne(false); + return; + +slow_path: + ksu_rename_observer_slow(old_dentry, new_dentry); + return; +} diff --git a/drivers/kernelsu/throne_tracker.c b/drivers/kernelsu/manager/throne_tracker.c similarity index 50% rename from drivers/kernelsu/throne_tracker.c rename to drivers/kernelsu/manager/throne_tracker.c index a129fa9f4935..f61bdf3a36b1 100644 --- a/drivers/kernelsu/throne_tracker.c +++ b/drivers/kernelsu/manager/throne_tracker.c @@ -1,27 +1,6 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "allowlist.h" -#include "apk_sign.h" -#include "klog.h" // IWYU pragma: keep -#include "manager.h" -#include "kernel_compat.h" -#include "throne_tracker.h" - uid_t ksu_manager_appid = KSU_INVALID_APPID; -#if defined(CONFIG_KSU_MANUAL_HOOK) -#define SYSTEM_PACKAGES_LIST_PATH "/data/system/packages.list.tmp" -#elif defined(CONFIG_KSU_SYSCALL_HOOK) || \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) #define SYSTEM_PACKAGES_LIST_PATH "/data/system/packages.list" -#endif struct uid_data { struct list_head list; @@ -29,7 +8,7 @@ struct uid_data { char package[KSU_MAX_PACKAGE_NAME]; }; -static void crown_manager(const char *apk, struct list_head *uid_data) +static __always_inline void crown_manager(const char *apk, struct list_head *uid_data) { 
char pkg[KSU_MAX_PACKAGE_NAME]; if (get_pkg_from_apk_path(pkg, apk) < 0) { @@ -65,8 +44,6 @@ struct apk_path_hash { struct list_head list; }; -static struct list_head apk_path_hash_list; - struct my_dir_context { struct dir_context ctx; struct list_head *data_path_list; @@ -76,9 +53,7 @@ struct my_dir_context { int *stop; }; // https://docs.kernel.org/filesystems/porting.html -// filldir_t (readdir callbacks) calling conventions have changed. -// Instead of returning 0 or -E... it returns bool now. false means "no more" (as -E... used to) and true - "keep going" (as 0 in old calling conventions). -// Rationale: callers never looked at specific -E... values anyway. -> iterate_shared() instances require no changes at all, all filldir_t ones in the tree converted. +// filldir_t (readdir callbacks) calling conventions have changed. Instead of returning 0 or -E... it returns bool now. false means "no more" (as -E... used to) and true - "keep going" (as 0 in old calling conventions). Rationale: callers never looked at specific -E... values anyway. -> iterate_shared() instances require no changes at all, all filldir_t ones in the tree converted. 
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) #define FILLDIR_RETURN_TYPE bool #define FILLDIR_ACTOR_CONTINUE true @@ -88,25 +63,28 @@ struct my_dir_context { #define FILLDIR_ACTOR_CONTINUE 0 #define FILLDIR_ACTOR_STOP -EINVAL #endif -extern bool is_manager_apk(char *path); -static inline void print_iter(bool is_manager, char *path) -{ -#ifdef CONFIG_KSU_DEBUG - pr_info("Found new base.apk at path: %s, is_manager: %d\n", path, - is_manager); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) +#define MY_ACTOR_CTX_ARG struct dir_context *ctx #else - if (is_manager) - pr_info("Found KernelSU base.apk at %s\n", path); +#define MY_ACTOR_CTX_ARG void *ctx_void #endif -} -FILLDIR_RETURN_TYPE my_actor(struct dir_context *ctx, const char *name, +extern bool is_manager_apk(char *path); +FILLDIR_RETURN_TYPE my_actor(MY_ACTOR_CTX_ARG, const char *name, int namelen, loff_t off, u64 ino, unsigned int d_type) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) + // then pull it out of the void + struct dir_context *ctx = (struct dir_context *)ctx_void; +#endif struct my_dir_context *my_ctx = container_of(ctx, struct my_dir_context, ctx); + + // we put the apk path we collected here + char *candidate_path = (char *)my_ctx->private_data; + char dirpath[DATA_PATH_LEN]; if (!my_ctx) { @@ -136,8 +114,7 @@ FILLDIR_RETURN_TYPE my_actor(struct dir_context *ctx, const char *name, if (d_type == DT_DIR && my_ctx->depth > 0 && (my_ctx->stop && !*my_ctx->stop)) { - struct data_path *data = - kzalloc(sizeof(struct data_path), GFP_ATOMIC); + struct data_path *data = kzalloc(sizeof(struct data_path), GFP_KERNEL); if (!data) { pr_err("Failed to allocate memory for %s\n", dirpath); @@ -147,121 +124,110 @@ FILLDIR_RETURN_TYPE my_actor(struct dir_context *ctx, const char *name, strscpy(data->dirpath, dirpath, DATA_PATH_LEN); data->depth = my_ctx->depth - 1; list_add_tail(&data->list, my_ctx->data_path_list); - } else { - if ((namelen == 8) && - (strncmp(name, "base.apk", namelen) == 0)) { - 
struct apk_path_hash *pos; -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) - unsigned int hash = - full_name_hash(dirpath, strlen(dirpath)); -#else - unsigned int hash = - full_name_hash(NULL, dirpath, strlen(dirpath)); -#endif - list_for_each_entry (pos, &apk_path_hash_list, list) { - if (hash == pos->hash) { - pos->exists = true; - return FILLDIR_ACTOR_CONTINUE; - } - } + + return FILLDIR_ACTOR_CONTINUE; + } - bool is_manager = is_manager_apk(dirpath); - print_iter(is_manager, dirpath); - if (is_manager) { - crown_manager(dirpath, my_ctx->private_data); - *my_ctx->stop = 1; - } - } + // now put this on candidate_path + if (d_type == DT_REG && namelen == 8 && !memcmp(name, "base.apk", 8)) { + snprintf(candidate_path, DATA_PATH_LEN, "%s/%.*s", my_ctx->parent_dir, namelen, name); } return FILLDIR_ACTOR_CONTINUE; } -static void search_manager(const char *path, int depth, - struct list_head *uid_data) +// compat: https://elixir.bootlin.com/linux/v3.9/source/include/linux/fs.h#L771 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) +#define ksu_get_magic(x) ((x)->f_inode->i_sb->s_magic) +#else +#define ksu_get_magic(x) ((x)->f_path.dentry->d_inode->i_sb->s_magic) +#endif + +static noinline void search_manager(const char *path, int depth, struct list_head *uid_data) { int i, stop = 0; struct list_head data_path_list; INIT_LIST_HEAD(&data_path_list); - INIT_LIST_HEAD(&apk_path_hash_list); unsigned long data_app_magic = 0; - // Initialize APK cache list - struct apk_path_hash *pos, *n; - list_for_each_entry (pos, &apk_path_hash_list, list) { - pos->exists = false; - } - // First depth - struct data_path data; - strscpy(data.dirpath, path, DATA_PATH_LEN); - data.depth = depth; - list_add_tail(&data.list, &data_path_list); + struct data_path *data __attribute__((__cleanup__(ksu_kfree_byref))) = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + + strscpy(data->dirpath, path, DATA_PATH_LEN); + data->depth = depth; + list_add_tail(&data->list, &data_path_list); + + 
// we put the apk path we collected here + char candidate_path[DATA_PATH_LEN]; for (i = depth; i >= 0; i--) { struct data_path *pos, *n; - list_for_each_entry_safe (pos, n, &data_path_list, list) { + list_for_each_entry_safe(pos, n, &data_path_list, list) { struct my_dir_context ctx = { .ctx.actor = my_actor, - .data_path_list = - &data_path_list, - .parent_dir = - pos->dirpath, - .private_data = uid_data, + .data_path_list = &data_path_list, + .parent_dir = pos->dirpath, + .private_data = candidate_path, .depth = pos->depth, .stop = &stop }; - struct file *file; - - if (!stop) { - file = ksu_filp_open_compat( - pos->dirpath, O_RDONLY | O_NOFOLLOW, 0); - if (IS_ERR(file)) { - pr_err("Failed to open directory: %s, err: %ld\n", - pos->dirpath, PTR_ERR(file)); - goto skip_iterate; - } - // grab magic on first folder, which is /data/app - if (!data_app_magic) { - if (file->f_inode->i_sb->s_magic) { - data_app_magic = - file->f_inode->i_sb - ->s_magic; - pr_info("%s: dir: %s got magic! 0x%lx\n", - __func__, pos->dirpath, - data_app_magic); - } else { - filp_close(file, NULL); - goto skip_iterate; - } - } + // make sure to clean buffer on every iteration + memset(candidate_path, 0, DATA_PATH_LEN); + + if (stop) + goto skip_iterate; - if (file->f_inode->i_sb->s_magic != - data_app_magic) { - pr_info("%s: skip: %s magic: 0x%lx expected: 0x%lx\n", - __func__, pos->dirpath, - file->f_inode->i_sb->s_magic, - data_app_magic); + struct file *file = filp_open(pos->dirpath, O_RDONLY | O_NOFOLLOW | O_DIRECTORY, 0); + if (IS_ERR(file)) { + pr_err("Failed to open directory: %s, err: %ld\n", pos->dirpath, PTR_ERR(file)); + goto skip_iterate; + } + + // grab magic on first folder, which is /data/app + if (!data_app_magic) { + if (ksu_get_magic(file)) { + data_app_magic = ksu_get_magic(file); + pr_info("%s: dir: %s got magic! 
0x%lx\n", __func__, pos->dirpath, data_app_magic); + } else { filp_close(file, NULL); goto skip_iterate; } - - iterate_dir(file, &ctx.ctx); + } + + if (ksu_get_magic(file) != data_app_magic) { + pr_info("%s: skip: %s magic: 0x%lx expected: 0x%lx\n", __func__, pos->dirpath, ksu_get_magic(file), data_app_magic); filp_close(file, NULL); + goto skip_iterate; } - skip_iterate: + + iterate_dir(file, &ctx.ctx); + filp_close(file, NULL); + + // ^ oh so thats the issue! + // we were calling is_manager_apk inside iterate_dir + // now we defer file opens after iterate_dir + // this way we dont open apks while inside that + if (!strstarts(candidate_path, "/data/ap") ) + goto skip_iterate; + + bool is_manager = is_manager_apk(candidate_path); + pr_info("Found new base.apk at path: %s, is_manager: %d\n", candidate_path, is_manager); + + if (likely(!is_manager)) + goto skip_iterate; + + crown_manager(candidate_path, uid_data); + stop = 1; + +skip_iterate: list_del(&pos->list); - if (pos != &data) + if (pos != data) kfree(pos); } } - // clear apk_path_hash_list unconditionally - pr_info("Search manager: cleanup!\n"); - list_for_each_entry_safe (pos, n, &apk_path_hash_list, list) { - list_del(&pos->list); - kfree(pos); - } } static bool is_uid_exist(uid_t uid, char *package, void *data) @@ -280,13 +246,11 @@ static bool is_uid_exist(uid_t uid, char *package, void *data) return exist; } -void track_throne(bool prune_only) +static void throne_tracker_fn(bool prune_only) { - struct file *fp = - ksu_filp_open_compat(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); + struct file *fp = filp_open(SYSTEM_PACKAGES_LIST_PATH, O_RDONLY, 0); if (IS_ERR(fp)) { - pr_err("%s: open " SYSTEM_PACKAGES_LIST_PATH " failed: %ld\n", - __func__, PTR_ERR(fp)); + pr_err("%s: open " SYSTEM_PACKAGES_LIST_PATH " failed: %ld\n", __func__, PTR_ERR(fp)); return; } @@ -298,18 +262,19 @@ void track_throne(bool prune_only) loff_t line_start = 0; char buf[KSU_MAX_PACKAGE_NAME]; for (;;) { - ssize_t count = - 
ksu_kernel_read_compat(fp, &chr, sizeof(chr), &pos); + ssize_t count = kernel_read(fp, &chr, sizeof(chr), &pos); if (count != sizeof(chr)) break; if (chr != '\n') continue; - count = ksu_kernel_read_compat(fp, buf, sizeof(buf), - &line_start); + count = kernel_read(fp, buf, sizeof(buf) - 1, &line_start); + if (count <= 0) { + break; + } + buf[count] = '\0'; - struct uid_data *data = - kzalloc(sizeof(struct uid_data), GFP_ATOMIC); + struct uid_data *data = kzalloc(sizeof(struct uid_data), GFP_KERNEL); if (!data) { filp_close(fp, 0); goto out; @@ -339,13 +304,12 @@ void track_throne(bool prune_only) } filp_close(fp, 0); - if (prune_only) { - pr_info("throne_tracker: prune allowlist only!\n"); - goto prune; - } - // now update uid list - struct uid_data *np, *n; + struct uid_data *np; + struct uid_data *n; + + if (prune_only) + goto prune; // first, check if manager_uid exist! bool manager_exist = false; @@ -364,7 +328,7 @@ void track_throne(bool prune_only) } pr_info("Searching manager...\n"); search_manager("/data/app", 2, &uid_list); - pr_info("Search manager finished.\n"); + pr_info("Search manager finished\n"); } prune: @@ -378,12 +342,73 @@ void track_throne(bool prune_only) } } -void ksu_throne_tracker_init(void) +static DEFINE_MUTEX(throne_tracker_mutex); + +static int throne_tracker_thread(void *data) +{ + // now de-void it here + bool prune_only = (bool)data; + + pr_info("throne_tracker: pid: %d started\n", current->pid); + + mutex_lock(&throne_tracker_mutex); + +test_tmp: + if (!is_file_existing("/data/system/packages.list.tmp")) + goto test_list; + + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("throne_tracker: rename not finished! retry!\n"); + + msleep(20); // yield + goto test_tmp; + +test_list: + if (is_file_stable(SYSTEM_PACKAGES_LIST_PATH)) + goto start_tt; + + if (IS_ENABLED(CONFIG_KSU_DEBUG)) + pr_info("throne_tracker: rename not finished! 
retry!\n"); + + msleep(20); // yield + goto test_list; + +start_tt: + // lessen that window where user opens manager right away, yet its not crowned + set_user_nice(current, -10); + + escape_to_root_forced(); + throne_tracker_fn(prune_only); + + mutex_unlock(&throne_tracker_mutex); + + pr_info("throne_tracker: pid: %d exit!\n", current->pid); + return 0; +} + +void track_throne(bool prune_only) +{ +#ifndef CONFIG_KSU_THRONE_TRACKER_ALWAYS_THREADED + static bool throne_tracker_first_run __read_mostly = true; + if (unlikely(throne_tracker_first_run)) { + mutex_lock(&throne_tracker_mutex); + throne_tracker_fn(prune_only); + mutex_unlock(&throne_tracker_mutex); + throne_tracker_first_run = false; + return; + } +#endif + + // HACK: force cast prune_only to be a void * + kthread_run(throne_tracker_thread, (void *)prune_only, "ksu_throne"); +} + +void ksu_throne_tracker_init() { // nothing to do } -void ksu_throne_tracker_exit(void) +void ksu_throne_tracker_exit() { // nothing to do } diff --git a/drivers/kernelsu/manager/throne_tracker.h b/drivers/kernelsu/manager/throne_tracker.h new file mode 100644 index 000000000000..48beebcf8fd9 --- /dev/null +++ b/drivers/kernelsu/manager/throne_tracker.h @@ -0,0 +1,60 @@ +#ifndef __KSU_H_UID_OBSERVER +#define __KSU_H_UID_OBSERVER + +void ksu_throne_tracker_init(); + +void ksu_throne_tracker_exit(); + +void track_throne(bool prune_only); + +/* + * small helper to check if file exists + * true - file exists + * false - file does NOT exist + * + */ +static inline bool is_file_existing(const char *path) +{ + struct path kpath; + + if (!!kern_path(path, 0, &kpath)) + return false; + + path_put(&kpath); + return true; +} + +/* + * small helper to check if file is stable + * note: if we can hold d_lock ourselves, file is stable + * true - file is stable + * false - file is deleted / being deleted/renamed + * + */ +static bool is_file_stable(const char *path) +{ + struct path kpath; + + // kern_path returns 0 on success + if 
(kern_path(path, 0, &kpath)) + return false; + + // just being defensive + if (!kpath.dentry) { + path_put(&kpath); + return false; + } + + if (!spin_trylock(&kpath.dentry->d_lock)) { + pr_info("%s: lock held for %s, bail out!\n", __func__, path); + path_put(&kpath); + return false; + } + // we hold it ourselves here! + + spin_unlock(&kpath.dentry->d_lock); + path_put(&kpath); + return true; +} + +#endif diff --git a/drivers/kernelsu/manager_sign.h b/drivers/kernelsu/manager_sign.h deleted file mode 100644 index 2766b261e311..000000000000 --- a/drivers/kernelsu/manager_sign.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __KSU_H_MANAGER_SIGN -#define __KSU_H_MANAGER_SIGN - -#include - -// rsuntk/KernelSU -#define EXPECTED_SIZE_RSUNTK 0x396 -#define EXPECTED_HASH_RSUNTK \ - "f415f4ed9435427e1fdf7f1fccd4dbc07b3d6b8751e4dbcec6f19671f427870b" - -typedef struct { - u32 size; - const char *sha256; -} apk_sign_key_t; - -#endif /* MANAGER_SIGN_H */ diff --git a/drivers/kernelsu/pkg_observer.c b/drivers/kernelsu/pkg_observer.c deleted file mode 100644 index 049c58e38caf..000000000000 --- a/drivers/kernelsu/pkg_observer.c +++ /dev/null @@ -1,126 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include - -#define MASK_SYSTEM (FS_CREATE | FS_MOVE | FS_EVENT_ON_CHILD) - -struct watch_dir { - const char *path; - u32 mask; - struct path kpath; - struct inode *inode; - struct fsnotify_mark *mark; -}; - -static struct fsnotify_group *g; - -static int ksu_handle_inode_event(struct fsnotify_mark *mark, u32 mask, - struct inode *inode, struct inode *dir, - const struct qstr *file_name, u32 cookie) -{ - if (!file_name) - return 0; - if (mask & FS_ISDIR) - return 0; - if (file_name->len == 13 && - !memcmp(file_name->name, "packages.list", 13)) { - pr_info("packages.list detected: %d\n", mask); - track_throne(false); - } - return 0; -} - -static const struct fsnotify_ops ksu_ops = { - .handle_inode_event = ksu_handle_inode_event, -}; - 
-static int add_mark_on_inode(struct inode *inode, u32 mask, - struct fsnotify_mark **out) -{ - struct fsnotify_mark *m; - - m = kzalloc(sizeof(*m), GFP_KERNEL); - if (!m) - return -ENOMEM; - - fsnotify_init_mark(m, g); - m->mask = mask; - - if (fsnotify_add_inode_mark(m, inode, 0)) { - fsnotify_put_mark(m); - return -EINVAL; - } - *out = m; - return 0; -} - -static int watch_one_dir(struct watch_dir *wd) -{ - int ret = kern_path(wd->path, LOOKUP_FOLLOW, &wd->kpath); - if (ret) { - pr_info("path not ready: %s (%d)\n", wd->path, ret); - return ret; - } - wd->inode = d_inode(wd->kpath.dentry); - ihold(wd->inode); - - ret = add_mark_on_inode(wd->inode, wd->mask, &wd->mark); - if (ret) { - pr_err("Add mark failed for %s (%d)\n", wd->path, ret); - path_put(&wd->kpath); - iput(wd->inode); - wd->inode = NULL; - return ret; - } - pr_info("watching %s\n", wd->path); - return 0; -} - -static void unwatch_one_dir(struct watch_dir *wd) -{ - if (wd->mark) { - fsnotify_destroy_mark(wd->mark, g); - fsnotify_put_mark(wd->mark); - wd->mark = NULL; - } - if (wd->inode) { - iput(wd->inode); - wd->inode = NULL; - } - if (wd->kpath.dentry) { - path_put(&wd->kpath); - memset(&wd->kpath, 0, sizeof(wd->kpath)); - } -} - -static struct watch_dir g_watch = { .path = "/data/system", - .mask = MASK_SYSTEM }; - -int ksu_observer_init(void) -{ - int ret = 0; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 0, 0) - g = fsnotify_alloc_group(&ksu_ops, 0); -#else - g = fsnotify_alloc_group(&ksu_ops); -#endif - if (IS_ERR(g)) - return PTR_ERR(g); - - ret = watch_one_dir(&g_watch); - pr_info("observer init done\n"); - return 0; -} - -void ksu_observer_exit(void) -{ - unwatch_one_dir(&g_watch); - fsnotify_put_group(g); - pr_info("observer exit done\n"); -} diff --git a/drivers/kernelsu/policy/allowlist.c b/drivers/kernelsu/policy/allowlist.c new file mode 100644 index 000000000000..f793935f955b --- /dev/null +++ b/drivers/kernelsu/policy/allowlist.c @@ -0,0 +1,543 @@ +#define FILE_MAGIC 0x7f4b5355 // ' 
KSU', u32 +#define FILE_FORMAT_VERSION 3 // u32 + +#define KSU_APP_PROFILE_PRESERVE_UID 9999 // NOBODY_UID +#define KSU_DEFAULT_SELINUX_DOMAIN "u:r:" KERNEL_SU_DOMAIN ":s0" + +static DEFINE_MUTEX(allowlist_mutex); + +// default profiles, these may be used frequently, so we cache it +static struct root_profile default_root_profile; +static struct non_root_profile default_non_root_profile; + +static void __init init_default_profiles() +{ + kernel_cap_t full_cap = CAP_FULL_SET; + + default_root_profile.uid = 0; + default_root_profile.gid = 0; + default_root_profile.groups_count = 1; + default_root_profile.groups[0] = 0; + memcpy(&default_root_profile.capabilities.effective, &full_cap, + sizeof(default_root_profile.capabilities.effective)); + default_root_profile.namespaces = KSU_NS_INHERITED; + strcpy(default_root_profile.selinux_domain, KSU_DEFAULT_SELINUX_DOMAIN); + + // This means that we will umount modules by default! + default_non_root_profile.umount_modules = true; +} + +struct perm_data { + struct hlist_node list; + struct rcu_head rcu; + struct kref ref; + struct app_profile profile; +}; + +// protected by rcu +#define ALLOW_LIST_BITS 8 +static DEFINE_HASHTABLE(allow_list, ALLOW_LIST_BITS); +static u16 allow_list_count = 0; + +#define KERNEL_SU_ALLOWLIST "/data/adb/ksu/.allowlist" + +void ksu_persistent_allow_list(void); + +void ksu_show_allow_list(void) +{ + int i; + struct perm_data *p = NULL; + pr_info("ksu_show_allow_list\n"); + rcu_read_lock(); + hash_for_each_rcu (allow_list, i, p, list) { + pr_info("uid :%d, allow: %d\n", p->profile.curr_uid, p->profile.allow_su); + } + rcu_read_unlock(); +} + +struct app_profile *ksu_get_app_profile(uid_t uid) +{ + struct perm_data *p = NULL; + bool found; + +retry: + found = false; + hash_for_each_possible_rcu (allow_list, p, list, uid) { + if (uid == p->profile.curr_uid) { + // found it, override it with ours + found = true; + break; + } + } + + if (!found) + return NULL; + + if (!kref_get_unless_zero(&p->ref)) { + 
goto retry; + } + + return &p->profile; +} + +static inline bool forbid_system_uid(uid_t uid) +{ +#define SHELL_UID 2000 +#define SYSTEM_UID 1000 + return uid < SHELL_UID && uid != SYSTEM_UID; +} + +static bool profile_valid(struct app_profile *profile) +{ + if (!profile) { + return false; + } + + bool need_migrate_su_domain = false; + + if (unlikely(profile->version == 2)) { + profile->version = KSU_APP_PROFILE_VER; + need_migrate_su_domain = true; + } + + if (strnlen(profile->key, sizeof(profile->key)) >= sizeof(profile->key)) { + pr_err("invalid app_profile key\n"); + return false; + } + + if (profile->version < KSU_APP_PROFILE_VER) { + pr_info("Unsupported profile version: %d\n", profile->version); + return false; + } + + if (profile->allow_su) { + if (profile->rp_config.profile.groups_count > KSU_MAX_GROUPS) { + pr_err("invalid groups_count in app_profile: %s\n", profile->key); + return false; + } + + char *domain = profile->rp_config.profile.selinux_domain; + static const size_t domain_len = sizeof(profile->rp_config.profile.selinux_domain); + if (unlikely(need_migrate_su_domain)) { + if (strncmp(domain, "u:r:su:s0", domain_len) == 0) { + strscpy_pad(domain, KSU_DEFAULT_SELINUX_DOMAIN, domain_len); + pr_info("migrated profile domain: %s\n", profile->key); + } + } + size_t len = strnlen(domain, domain_len); + + if (len == 0 || len >= domain_len) { + pr_err("invalid selinux_domain in app_profile: %s\n", profile->key); + return false; + } + } + + return true; +} + +static void release_perm_data(struct kref *ref) +{ + struct perm_data *p = container_of(ref, struct perm_data, ref); + kfree_rcu(p, rcu); +} + +static void put_perm_data(struct perm_data *data) +{ + kref_put(&data->ref, release_perm_data); +} + +int ksu_set_app_profile(struct app_profile *profile) +{ + struct perm_data *p, *np; + int result = 0; + + if (!profile_valid(profile)) { + pr_err("Failed to set app profile: invalid profile!\n"); + return -EINVAL; + } + + // only allow default non root profile 
+ if (unlikely(profile->curr_uid == KSU_APP_PROFILE_PRESERVE_UID && strcmp(profile->key, "$") != 0)) { + return -EINVAL; + } + + mutex_lock(&allowlist_mutex); + + hash_for_each_possible (allow_list, p, list, profile->curr_uid) { + if (profile->curr_uid == p->profile.curr_uid) { + if (strcmp(profile->key, p->profile.key) != 0) { + pr_warn("ksu_set_app_profile: key changed: uid=%d orig=%s new=%s\n", profile->curr_uid, p->profile.key, + profile->key); + } + // found it, just override it all! + np = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL); + if (!np) { + result = -ENOMEM; + goto out_unlock; + } + kref_init(&np->ref); + memcpy(&np->profile, profile, sizeof(*profile)); + hlist_replace_rcu(&p->list, &np->list); + put_perm_data(p); + goto out; + } + } + + if (unlikely(allow_list_count == U16_MAX)) { + pr_err("too many app profile\n"); + result = -E2BIG; + goto out_unlock; + } + + // not found, alloc a new node! + np = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL); + if (!np) { + pr_err("ksu_set_app_profile alloc failed\n"); + result = -ENOMEM; + goto out_unlock; + } + + kref_init(&np->ref); + memcpy(&np->profile, profile, sizeof(*profile)); + if (profile->allow_su) { + pr_info("set root profile, key: %s, uid: %d, gid: %d, context: %s\n", profile->key, profile->curr_uid, + profile->rp_config.profile.gid, profile->rp_config.profile.selinux_domain); + } else { + pr_info("set app profile, key: %s, uid: %d, umount modules: %d\n", profile->key, profile->curr_uid, + profile->nrp_config.profile.umount_modules); + } + + hash_add_rcu(allow_list, &np->list, np->profile.curr_uid); + ++allow_list_count; + +out: + result = 0; + + if (unlikely(profile->curr_uid == KSU_APP_PROFILE_PRESERVE_UID)) { + // set default non root profile + default_non_root_profile.umount_modules = profile->nrp_config.profile.umount_modules; + } + +out_unlock: + mutex_unlock(&allowlist_mutex); + return result; +} + +bool __ksu_is_allow_uid(uid_t uid) +{ + struct 
perm_data *p; + + if (forbid_system_uid(uid)) { + // do not bother going through the list if it's system + return false; + } + + if (unlikely(is_uid_manager(uid))) { + // manager is always allowed! + return true; + } + + if (IS_ENABLED(CONFIG_KSU_DEBUG) && unlikely(uid == SHELL_UID)) + return true; + + rcu_read_lock(); + hash_for_each_possible_rcu (allow_list, p, list, uid) { + if (uid == p->profile.curr_uid && p->profile.allow_su) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + return false; +} + +bool __ksu_is_allow_uid_for_current(uid_t uid) +{ + if (unlikely(uid == 0)) { + // already root, but only allow our domain. + return is_ksu_domain(); + } + return __ksu_is_allow_uid(uid); +} + +bool ksu_uid_should_umount(uid_t uid) +{ + struct app_profile *profile; + bool res; + if (likely(ksu_is_manager_appid_valid()) && unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) { + // we should not umount on manager! + return false; + } + if (unlikely(uid == WEBVIEW_ZYGOTE_UID)) { + // we should not umount for webview zygote + return false; + } + + rcu_read_lock(); + profile = ksu_get_app_profile(uid); + if (!profile) { + // no app profile found, it must be non root app + res = default_non_root_profile.umount_modules; + } else if (profile->allow_su) { + // if found and it is granted to su, we shouldn't umount for it + res = false; + } else { + // found an app profile + if (profile->nrp_config.use_default) { + res = default_non_root_profile.umount_modules; + } else { + res = profile->nrp_config.profile.umount_modules; + } + } + rcu_read_unlock(); + + if (profile) + ksu_put_app_profile(profile); + return res; +} + +void ksu_put_app_profile(struct app_profile *profile) +{ + struct perm_data *p = container_of(profile, struct perm_data, profile); + put_perm_data(p); +} + +struct root_profile *ksu_get_root_profile(uid_t uid) +{ + struct perm_data *p = NULL; + struct root_profile *res; + + rcu_read_lock(); + if (is_uid_manager(uid)) { + goto 
use_default; + } + + if (IS_ENABLED(CONFIG_KSU_DEBUG) && unlikely(uid == SHELL_UID)) + goto use_default; + +retry: + res = NULL; + hash_for_each_possible_rcu (allow_list, p, list, uid) { + if (uid == p->profile.curr_uid && p->profile.allow_su) { + if (!p->profile.rp_config.use_default) { + if (!kref_get_unless_zero(&p->ref)) { + goto retry; + } + res = &p->profile.rp_config.profile; + } + break; + } + } + + if (unlikely(!res)) { + use_default: + res = &default_root_profile; + } + + rcu_read_unlock(); + return res; +} + +void ksu_put_root_profile(struct root_profile *profile) +{ + if (likely(profile == &default_root_profile)) + return; + struct perm_data *p = container_of(profile, struct perm_data, profile.rp_config.profile); + put_perm_data(p); +} + +bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, bool allow) +{ + struct perm_data *p = NULL; + u16 i = 0, j = 0; + int iter; + rcu_read_lock(); + hash_for_each_rcu (allow_list, iter, p, list) { + // pr_info("get_allow_list uid: %d allow: %d\n", p->uid, p->allow); + if (p->profile.allow_su == allow && !is_uid_manager(p->profile.curr_uid)) { + if (j < length) { + array[j++] = p->profile.curr_uid; + } + ++i; + } + } + rcu_read_unlock(); + if (out_length) { + *out_length = j; + } + if (out_total) { + *out_total = i; + } + + return true; +} + +static void do_persistent_allow_list() +{ + u32 magic = FILE_MAGIC; + u32 version = FILE_FORMAT_VERSION; + struct perm_data *p = NULL; + loff_t off = 0; + int i; + + struct file *fp = filp_open(KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (IS_ERR(fp)) { + pr_err("save_allow_list create file failed: %ld\n", PTR_ERR(fp)); + goto out; + } + + // store magic and version + if (kernel_write(fp, &magic, sizeof(magic), &off) != sizeof(magic)) { + pr_err("save_allow_list write magic failed.\n"); + goto close_file; + } + + if (kernel_write(fp, &version, sizeof(version), &off) != sizeof(version)) { + pr_err("save_allow_list write version 
failed.\n"); + goto close_file; + } + + hash_for_each (allow_list, i, p, list) { + pr_info("save allow list, name: %s uid :%d, allow: %d\n", p->profile.key, p->profile.curr_uid, + p->profile.allow_su); + + kernel_write(fp, &p->profile, sizeof(p->profile), &off); + } + +close_file: + filp_close(fp, 0); +out: + return; +} + +// this is a bit heavier than task work / workqueue but this allows +// us to have our own context. we give it a full escaped-to-root one. +static int persistent_allow_list_pre(void *data) +{ + pr_info("do_persistent_allow_list: pid: %d started\n", current->pid); + + /** + * repurpose the mutex they were holding on ksu_persistent_allow_list_fn + * since all this does eventually is to call kernel_write + * we hit two birds in one stone. exclusive io + exclusive kthread + * there wont be a single instance lock, but for what we need, its finee + * we just let other threads stall. + * 'mutex-trylock-fail-then-return' is detrimental here + */ + mutex_lock(&allowlist_mutex); + + escape_to_root_forced(); // give permissions for everything + do_persistent_allow_list(); + + mutex_unlock(&allowlist_mutex); + + pr_info("do_persistent_allow_list: pid: %d exit\n", current->pid); + return 0; +} + +void ksu_persistent_allow_list() +{ + kthread_run(persistent_allow_list_pre, NULL, "allowlist"); +} + +void ksu_load_allow_list() +{ + loff_t off = 0; + ssize_t ret = 0; + struct file *fp = NULL; + u32 magic; + u32 version; + + // load allowlist now! 
+ fp = filp_open(KERNEL_SU_ALLOWLIST, O_RDONLY, 0); + if (IS_ERR(fp)) { + pr_err("load_allow_list open file failed: %ld\n", PTR_ERR(fp)); + return; + } + + // verify magic + if (kernel_read(fp, &magic, sizeof(magic), &off) != sizeof(magic) || magic != FILE_MAGIC) { + pr_err("allowlist file invalid: %d!\n", magic); + goto exit; + } + + if (kernel_read(fp, &version, sizeof(version), &off) != sizeof(version)) { + pr_err("allowlist read version: %d failed\n", version); + goto exit; + } + + pr_info("allowlist version: %d\n", version); + + while (true) { + struct app_profile profile; + + ret = kernel_read(fp, &profile, sizeof(profile), &off); + + if (ret <= 0) { + pr_info("load_allow_list read err: %zd\n", ret); + break; + } + + pr_info("load_allow_uid, name: %s, uid: %d, allow: %d\n", profile.key, profile.curr_uid, profile.allow_su); + ksu_set_app_profile(&profile); + } + +exit: + ksu_show_allow_list(); + filp_close(fp, 0); +} + +void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *), void *data) +{ + struct perm_data *np = NULL; + struct hlist_node *tmp; + int i; + + if (!ksu_boot_completed) { + pr_info("boot not completed, skip prune\n"); + return; + } + + bool modified = false; + mutex_lock(&allowlist_mutex); + hash_for_each_safe (allow_list, i, tmp, np, list) { + uid_t uid = np->profile.curr_uid; + char *package = np->profile.key; + // we use this uid for special cases, don't prune it! 
+ bool is_preserved_uid = uid == KSU_APP_PROFILE_PRESERVE_UID; + if (!is_preserved_uid && !is_uid_valid(uid, package, data)) { + modified = true; + pr_info("prune uid: %d, package: %s\n", uid, package); + hlist_del_rcu(&np->list); + put_perm_data(np); + --allow_list_count; + } + } + mutex_unlock(&allowlist_mutex); + + if (modified) { + smp_mb(); + ksu_persistent_allow_list(); + } +} + +void __init ksu_allowlist_init(void) +{ + init_default_profiles(); +} + +void __exit ksu_allowlist_exit(void) +{ + struct perm_data *np = NULL; + struct hlist_node *tmp; + int i; + + // free allowlist + mutex_lock(&allowlist_mutex); + hash_for_each_safe (allow_list, i, tmp, np, list) { + hlist_del(&np->list); + put_perm_data(np); + } + mutex_unlock(&allowlist_mutex); +} diff --git a/drivers/kernelsu/allowlist.h b/drivers/kernelsu/policy/allowlist.h similarity index 63% rename from drivers/kernelsu/allowlist.h rename to drivers/kernelsu/policy/allowlist.h index d52795afe866..59809cc7ccd3 100644 --- a/drivers/kernelsu/allowlist.h +++ b/drivers/kernelsu/policy/allowlist.h @@ -1,11 +1,10 @@ #ifndef __KSU_H_ALLOWLIST #define __KSU_H_ALLOWLIST -#include -#include #include "app_profile.h" #define PER_USER_RANGE 100000 +#define WEBVIEW_ZYGOTE_UID 1053 #define FIRST_APPLICATION_UID 10000 #define LAST_APPLICATION_UID 19999 #define FIRST_ISOLATED_UID 99000 @@ -25,19 +24,23 @@ bool __ksu_is_allow_uid(uid_t uid); // Check if the uid is in allow list, or current is ksu domain root bool __ksu_is_allow_uid_for_current(uid_t uid); -#define ksu_is_allow_uid_for_current(uid) \ - unlikely(__ksu_is_allow_uid_for_current(uid)) +#define ksu_is_allow_uid_for_current(uid) unlikely(__ksu_is_allow_uid_for_current(uid)) -bool ksu_get_allow_list(int *array, int *length, bool allow); +bool ksu_get_allow_list(int *array, u16 length, u16 *out_length, u16 *out_total, bool allow); -void ksu_prune_allowlist(bool (*is_uid_exist)(uid_t, char *, void *), - void *data); +void ksu_prune_allowlist(bool 
(*is_uid_exist)(uid_t, char *, void *), void *data); +void ksu_persistent_allow_list(); -bool ksu_get_app_profile(struct app_profile *); -bool ksu_set_app_profile(struct app_profile *, bool persist); +// should be called with rcu read lock +struct app_profile *ksu_get_app_profile(uid_t uid); +// only used to put the app_profile returned by ksu_get_app_profile +void ksu_put_app_profile(struct app_profile *); +int ksu_set_app_profile(struct app_profile *); bool ksu_uid_should_umount(uid_t uid); struct root_profile *ksu_get_root_profile(uid_t uid); +// only used to put the root_profile returned by ksu_get_root_profile +void ksu_put_root_profile(struct root_profile *); static inline bool is_appuid(uid_t uid) { diff --git a/drivers/kernelsu/policy/app_profile.c b/drivers/kernelsu/policy/app_profile.c new file mode 100644 index 000000000000..7abdbcdf8f40 --- /dev/null +++ b/drivers/kernelsu/policy/app_profile.c @@ -0,0 +1,209 @@ +#if LINUX_VERSION_CODE >= KERNEL_VERSION (6, 7, 0) +static struct group_info root_groups = { .usage = REFCOUNT_INIT(2) }; +#else +static struct group_info root_groups = { .usage = ATOMIC_INIT(2) }; +#endif + +static void setup_groups(struct root_profile *profile, struct cred *cred) +{ + if (profile->groups_count > KSU_MAX_GROUPS) { + pr_warn("Failed to setgroups, too large group: %d!\n", + profile->uid); + return; + } + + if (profile->groups_count == 1 && profile->groups[0] == 0) { + // setgroup to root and return early. 
+ if (cred->group_info) + put_group_info(cred->group_info); + cred->group_info = get_group_info(&root_groups); + return; + } + + u32 ngroups = profile->groups_count; + struct group_info *group_info = groups_alloc(ngroups); + if (!group_info) { + pr_warn("Failed to setgroups, ENOMEM for: %d\n", profile->uid); + return; + } + + int i; + for (i = 0; i < ngroups; i++) { + gid_t gid = profile->groups[i]; + kgid_t kgid = make_kgid(current_user_ns(), gid); + if (!gid_valid(kgid)) { + pr_warn("Failed to setgroups, invalid gid: %d\n", gid); + put_group_info(group_info); + return; + } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) + group_info->gid[i] = kgid; +#else + GROUP_AT(group_info, i) = kgid; +#endif + } + + groups_sort(group_info); + set_groups(cred, group_info); + put_group_info(group_info); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) +static void disable_seccomp(void) +{ + struct task_struct *fake; + + fake = kmalloc(sizeof(*fake), GFP_KERNEL); + if (!fake) { + pr_warn("failed to alloc fake task_struct\n"); + return; + } + + // Refer to kernel/seccomp.c: seccomp_set_mode_strict + // When disabling Seccomp, ensure that current->sighand->siglock is held during the operation. 
+ spin_lock_irq(¤t->sighand->siglock); + // disable seccomp +#if defined(CONFIG_GENERIC_ENTRY) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) + clear_syscall_work(SECCOMP); +#else + clear_thread_flag(TIF_SECCOMP); +#endif + + memcpy(fake, current, sizeof(*fake)); + + current->seccomp.mode = 0; + current->seccomp.filter = NULL; + atomic_set(¤t->seccomp.filter_count, 0); + spin_unlock_irq(¤t->sighand->siglock); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 11, 0) + // https://github.com/torvalds/linux/commit/bfafe5efa9754ebc991750da0bcca2a6694f3ed3#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R576-R577 + fake->flags |= PF_EXITING; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) + // https://github.com/torvalds/linux/commit/0d8315dddd2899f519fe1ca3d4d5cdaf44ea421e#diff-45eb79a57536d8eccfc1436932f093eb5c0b60d9361c39edb46581ad313e8987R556-R558 + fake->sighand = NULL; +#endif + + seccomp_filter_release(fake); + kfree(fake); +} +#else /* ! LINUX_VERSION_CODE < 5.9 */ +/* + * for < 5.9 lets have free_task do it for us (put_seccomp_filter) + * we risk a double free / double decrement which isn't safe on old kernels + * I'm not even sure if this thing is needed on newer kernels + * + */ +static void disable_seccomp(void) +{ + spin_lock_irq(¤t->sighand->siglock); + + clear_thread_flag(TIF_SECCOMP); + current->seccomp.mode = 0; + current->seccomp.filter = NULL; + + spin_unlock_irq(¤t->sighand->siglock); +} +#endif // 5.9 + +static int escape_to_root(bool is_forced) +{ + int ret = 0; + struct cred *cred; + struct root_profile *profile = NULL; + struct user_struct *new_user; + + cred = prepare_creds(); + if (!cred) { + pr_warn("prepare_creds failed!\n"); + return -ENOMEM; + } + + if (!is_forced && ksu_get_uid_t(cred->euid) == 0) { + pr_warn("Already root, don't escape!\n"); + goto out_abort_creds; + } + + profile = ksu_get_root_profile(ksu_get_uid_t(cred->uid)); + + ksu_get_uid_t(cred->uid) = profile->uid; + ksu_get_uid_t(cred->suid) = 
profile->uid; + ksu_get_uid_t(cred->euid) = profile->uid; + ksu_get_uid_t(cred->fsuid) = profile->uid; + + ksu_get_uid_t(cred->gid) = profile->gid; + ksu_get_uid_t(cred->fsgid) = profile->gid; + ksu_get_uid_t(cred->sgid) = profile->gid; + ksu_get_uid_t(cred->egid) = profile->gid; + cred->securebits = 0; + + BUILD_BUG_ON(sizeof(profile->capabilities.effective) != sizeof(kernel_cap_t)); + + /* + * Mirror the kernel set*uid path: update cred->user first, then + * cred->ucounts, before commit_creds(). commit_creds() moves + * RLIMIT_NPROC accounting based on cred->user; if uid changes while + * user/ucounts stay stale, the old charge can remain pinned to the + * previous UID. + * See kernel/sys.c:set_user() and kernel/cred.c:set_cred_ucounts() / + * commit_creds(): + * https://github.com/torvalds/linux/blob/v5.14/kernel/sys.c + * https://github.com/torvalds/linux/blob/v5.14/kernel/cred.c + */ + new_user = alloc_uid(cred->uid); + if (!new_user) { + ret = -ENOMEM; + goto out_abort_creds; + } + + free_uid(cred->user); + cred->user = new_user; + + // v5.14+ added cred->ucounts, so we must refresh it after changing uid/user: + // https://github.com/torvalds/linux/commit/905ae01c4ae2ae3df05bb141801b1db4b7d83c61#diff-ff6060da281bd9ef3f24e17b77a9b0b5b2ed2d7208bb69b29107bee69732bd31 + // on older kernels, per-UID process accounting lives in user_struct. +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0) + if (set_cred_ucounts(cred)) { + goto out_abort_creds; + } +#endif + + // setup capabilities + // we need CAP_DAC_READ_SEARCH becuase `/data/adb/ksud` is not accessible for non root process + // we add it here but don't add it to cap_inhertiable, it would be dropped automaticly after exec! 
+ u64 cap_for_ksud = profile->capabilities.effective | CAP_DAC_READ_SEARCH; + memcpy(&cred->cap_effective, &cap_for_ksud, sizeof(cred->cap_effective)); + memcpy(&cred->cap_permitted, &profile->capabilities.effective, sizeof(cred->cap_permitted)); + memcpy(&cred->cap_bset, &profile->capabilities.effective, sizeof(cred->cap_bset)); + + setup_groups(profile, cred); + setup_selinux(profile->selinux_domain, cred); + + commit_creds(cred); + + if (test_thread_flag(TIF_SECCOMP)) + disable_seccomp(); + + setup_mount_ns(profile->namespaces); + ksu_put_root_profile(profile); + return 0; + +out_abort_creds: + if (profile) + ksu_put_root_profile(profile); + abort_creds(cred); + return ret; +} + +int escape_with_root_profile(void) +{ + return escape_to_root(false); +} + +void escape_to_root_forced(void) +{ + // I'm not really sure which permissions are needed + // its just escape to root but bypasses cred check + // which we likely already have on contexts where this will be used. + escape_to_root(true); +} diff --git a/drivers/kernelsu/policy/app_profile.h b/drivers/kernelsu/policy/app_profile.h new file mode 100644 index 000000000000..747f550236d7 --- /dev/null +++ b/drivers/kernelsu/policy/app_profile.h @@ -0,0 +1,9 @@ +#ifndef __KSU_H_APP_PROFILE +#define __KSU_H_APP_PROFILE + +// Escalate current process to root with the appropriate profile +int escape_with_root_profile(void); + +void escape_to_root_forced(void); + +#endif diff --git a/drivers/kernelsu/feature.c b/drivers/kernelsu/policy/feature.c similarity index 96% rename from drivers/kernelsu/feature.c rename to drivers/kernelsu/policy/feature.c index a1017aafbb8e..cf9ee4d5e0eb 100644 --- a/drivers/kernelsu/feature.c +++ b/drivers/kernelsu/policy/feature.c @@ -1,8 +1,3 @@ -#include "feature.h" -#include "klog.h" // IWYU pragma: keep - -#include - static const struct ksu_feature_handler *feature_handlers[KSU_FEATURE_MAX]; static DEFINE_MUTEX(feature_mutex); @@ -149,7 +144,7 @@ int ksu_set_feature(u32 feature_id, u64 
value) return ret; } -void ksu_feature_init(void) +void __init ksu_feature_init(void) { int i; @@ -160,7 +155,7 @@ void ksu_feature_init(void) pr_info("feature: feature management initialized\n"); } -void ksu_feature_exit(void) +void __exit ksu_feature_exit(void) { int i; diff --git a/drivers/kernelsu/feature.h b/drivers/kernelsu/policy/feature.h similarity index 82% rename from drivers/kernelsu/feature.h rename to drivers/kernelsu/policy/feature.h index a5de137a5cfb..1eb12392e617 100644 --- a/drivers/kernelsu/feature.h +++ b/drivers/kernelsu/policy/feature.h @@ -1,15 +1,6 @@ #ifndef __KSU_H_FEATURE #define __KSU_H_FEATURE -#include - -enum ksu_feature_id { - KSU_FEATURE_SU_COMPAT = 0, - KSU_FEATURE_KERNEL_UMOUNT = 1, - - KSU_FEATURE_MAX -}; - typedef int (*ksu_feature_get_t)(u64 *value); typedef int (*ksu_feature_set_t)(u64 value); diff --git a/drivers/kernelsu/runtime/ksud.c b/drivers/kernelsu/runtime/ksud.c new file mode 100644 index 000000000000..44b3c25d2618 --- /dev/null +++ b/drivers/kernelsu/runtime/ksud.c @@ -0,0 +1,563 @@ +static const char KERNEL_SU_RC[] = + "\n" + + "on post-fs-data\n" + " start logd\n" + // We should wait for the post-fs-data finish + " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " post-fs-data\n" + "\n" + + "on nonencrypted\n" + " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n" + "\n" + + "on property:vold.decrypt=trigger_restart_framework\n" + " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " services\n" + "\n" + + "on property:sys.boot_completed=1\n" + " exec u:r:" KERNEL_SU_DOMAIN ":s0 root -- " KSUD_PATH " boot-completed\n" + "\n" + + "\n"; + +static void stop_vfs_read_hook(); +static void stop_input_hook(); + +static bool ksu_module_mounted __read_mostly = false; +static bool ksu_boot_completed __read_mostly = false; +static bool ksu_vfs_read_hook __read_mostly = true; +static bool ksu_input_hook __read_mostly = true; + +#ifdef KSU_CAN_USE_JUMP_LABEL +DEFINE_STATIC_KEY_TRUE(ksud_vfs_read_key); 
+static inline void ksu_disable_vfs_read_branch() +{ + pr_info("vfs_read_hook: remove vfs_read branches\n"); + static_branch_disable(&ksud_vfs_read_key); + smp_mb(); +} +#else +static inline void ksu_disable_vfs_read_branch() { } // no-op +#endif + +void on_post_fs_data(void) +{ + static bool done = false; + if (done) { + pr_info("on_post_fs_data already done\n"); + return; + } + done = true; + pr_info("on_post_fs_data!\n"); + + ksu_load_allow_list(); + // sanity check, this may influence the performance + stop_input_hook(); +} + +extern void ext4_unregister_sysfs(struct super_block *sb); +int nuke_ext4_sysfs(const char *mnt) +{ + struct path path; + int err = kern_path(mnt, 0, &path); + if (err) { + pr_err("nuke path err: %d\n", err); + return err; + } + + struct super_block *sb = path.dentry->d_inode->i_sb; + const char *name = sb->s_type->name; + if (strcmp(name, "ext4") != 0) { + pr_info("nuke but module aren't mounted\n"); + path_put(&path); + return -EINVAL; + } + + ext4_unregister_sysfs(sb); + path_put(&path); + return 0; +} + +void on_module_mounted(void) +{ + pr_info("on_module_mounted!\n"); + ksu_module_mounted = true; +} + +void on_boot_completed(void) +{ + ksud_escape_exit(); + + ksu_boot_completed = true; + pr_info("on_boot_completed!\n"); + track_throne(true); +} + +static ssize_t (*orig_read)(struct file *, char __user *, size_t, loff_t *); +static ssize_t (*orig_read_iter)(struct kiocb *, struct iov_iter *); +static struct file_operations fops_proxy; +static ssize_t ksu_rc_pos = 0; +const size_t ksu_rc_len = sizeof(KERNEL_SU_RC) - 1; + +// https://cs.android.com/android/platform/superproject/main/+/main:system/core/init/parser.cpp;l=144;drc=61197364367c9e404c7da6900658f1b16c42d0da +// https://cs.android.com/android/platform/superproject/main/+/main:system/libbase/file.cpp;l=241-243;drc=61197364367c9e404c7da6900658f1b16c42d0da +// The system will read init.rc file until EOF, whenever read() returns 0, +// so we begin append ksu rc when we meet EOF. 
+ +static ssize_t read_proxy(struct file *file, char __user *buf, size_t count, loff_t *pos) +{ + ssize_t ret = 0; + size_t append_count; + if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len) + goto append_ksu_rc; + + ret = orig_read(file, buf, count, pos); + if (ret != 0 || ksu_rc_pos >= ksu_rc_len) { + return ret; + } else { + pr_info("read_proxy: orig read finished, start append rc\n"); + } +append_ksu_rc: + append_count = ksu_rc_len - ksu_rc_pos; + if (append_count > count - ret) + append_count = count - ret; + // copy_to_user returns the number of not copied + if (copy_to_user(buf + ret, KERNEL_SU_RC + ksu_rc_pos, append_count)) { + pr_info("read_proxy: append error, totally appended %ld\n", ksu_rc_pos); + } else { + pr_info("read_proxy: append %ld\n", append_count); + + ksu_rc_pos += append_count; + if (ksu_rc_pos == ksu_rc_len) { + pr_info("read_proxy: append done\n"); + } + ret += append_count; + } + + return ret; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) || defined(KSU_HAS_FOP_READ_ITER) +static ssize_t read_iter_proxy(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + size_t append_count; + if (ksu_rc_pos && ksu_rc_pos < ksu_rc_len) + goto append_ksu_rc; + + ret = orig_read_iter(iocb, to); + if (ret != 0 || ksu_rc_pos >= ksu_rc_len) { + return ret; + } else { + pr_info("read_iter_proxy: orig read finished, start append rc\n"); + } +append_ksu_rc: + // copy_to_iter returns the number of copied bytes + append_count = copy_to_iter((void *)KERNEL_SU_RC + ksu_rc_pos, ksu_rc_len - ksu_rc_pos, to); + if (!append_count) { + pr_info("read_iter_proxy: append error, totally appended %ld\n", ksu_rc_pos); + } else { + pr_info("read_iter_proxy: append %ld\n", append_count); + + ksu_rc_pos += append_count; + if (ksu_rc_pos == ksu_rc_len) { + pr_info("read_iter_proxy: append done\n"); + } + ret += append_count; + } + return ret; +} +#endif + +static bool is_init_rc(struct file *fp) +{ + if (strcmp(current->comm, "init")) { + // we are only interest in 
`init` process + return false; + } + + if (!d_is_reg(fp->f_path.dentry)) { + return false; + } + + const char *short_name = fp->f_path.dentry->d_name.name; + if (strcmp(short_name, "init.rc")) { + // we are only interest `init.rc` file name file + return false; + } + char path[256] = {0}; + char *dpath = d_path(&fp->f_path, path, sizeof(path)); + + if (IS_ERR(dpath)) { + return false; + } + + if (!!strcmp(dpath, "/init.rc") && !!strcmp(dpath, "/system/etc/init/hw/init.rc")) { + return false; + } + + pr_info("%s: %s \n", __func__, dpath); + + return true; +} + +__attribute__((cold)) +static noinline void ksu_install_rc_hook(struct file *file) +{ + if (!is_init(current_cred())) + return; + + if (!is_init_rc(file)) { + return; + } + + // we only process the first read + static bool rc_hooked = false; + if (rc_hooked) { + // we don't need this kprobe, unregister it! + stop_vfs_read_hook(); + return; + } + rc_hooked = true; + + // since we already have domains, selinux is initialized, we can apply rules and shit + // https://github.com/LineageOS/android_system_core_old/blob/ecbcdafc3/init/init.cpp#L669 + pr_info("%s: init.rc second stage, fp: 0x%lx \n", __func__, (uintptr_t)file); + apply_kernelsu_rules(); + cache_sid(); + setup_ksu_cred(); + ksu_grab_init_session_keyring(); + + // now we can sure that the init process is reading + // `/system/etc/init/init.rc` + + pr_info("read init.rc, comm: %s, rc_count: %zu\n", current->comm, ksu_rc_len); + + // Now we need to proxy the read and modify the result! + // But, we can not modify the file_operations directly, because it's in read-only memory. + // We just replace the whole file_operations with a proxy one. 
+ memcpy(&fops_proxy, file->f_op, sizeof(struct file_operations)); + orig_read = file->f_op->read; + if (orig_read) { + fops_proxy.read = read_proxy; + } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) || defined(KSU_HAS_FOP_READ_ITER) + orig_read_iter = file->f_op->read_iter; + if (orig_read_iter) { + fops_proxy.read_iter = read_iter_proxy; + } +#endif + // replace the file_operations + file->f_op = &fops_proxy; + + return; +} + +// for sys_read kp / syscall table +__attribute__((cold)) +static noinline void ksu_handle_sys_read_fd(unsigned int fd) +{ + if (likely(!ksu_vfs_read_hook)) + return; + + if (!is_init(current_cred())) + return; + + struct file *file = fget(fd); + if (!file) { + return; + } + ksu_install_rc_hook(file); + fput(file); +} + +#define STAT_NATIVE 0 +#define STAT_STAT64 1 + +__attribute__((cold)) +static noinline void ksu_common_newfstat_ret(unsigned int fd_int, void **statbuf_ptr, + const int type, const char *syscall_name) +{ + if (!is_init(current_cred())) + return; + + struct file *file = fget(fd_int); + if (!file) + return; + + if (!is_init_rc(file)) { + fput(file); + return; + } + fput(file); + + pr_info("%s: stat init.rc \n", syscall_name); + + uintptr_t statbuf_ptr_local = (uintptr_t)*(void **)statbuf_ptr; + void __user *statbuf = (void __user *)statbuf_ptr_local; + if (!statbuf) + return; + + void __user *st_size_ptr; + long size, new_size; + size_t len; + + st_size_ptr = statbuf + offsetof(struct stat, st_size); + len = sizeof(long); + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) + if (type) { + st_size_ptr = statbuf + offsetof(struct stat64, st_size); + len = sizeof(long long); + } +#endif + + // we do this for kretprobe's reusability + // this is pretty short, so nbd + bool got_flipped = false; + if (!preemptible()) { + preempt_enable(); + got_flipped = true; + } + + if (ksu_copy_from_user_retry(&size, st_size_ptr, len)) { + pr_info("%s: read statbuf 0x%lx failed \n", syscall_name, (unsigned 
long)st_size_ptr); + goto out; + } + + new_size = size + ksu_rc_len; + pr_info("%s: adding ksu_rc_len: %ld -> %ld \n", syscall_name, size, new_size); + + if (!copy_to_user(st_size_ptr, &new_size, len)) + pr_info("%s: added ksu_rc_len \n", syscall_name); + else + pr_info("%s: add ksu_rc_len failed: statbuf 0x%lx \n", syscall_name, (unsigned long)st_size_ptr); + +out: + if (got_flipped) + preempt_disable(); + + return; +} + +void ksu_handle_newfstat_ret(unsigned int *fd, struct stat __user **statbuf_ptr) +{ +#ifdef KSU_CAN_USE_JUMP_LABEL + if (static_branch_likely(&ksud_vfs_read_key)) + ksu_common_newfstat_ret(*fd, (void **)statbuf_ptr, STAT_NATIVE, "sys_newfstat"); +#else + if (unlikely(ksu_vfs_read_hook)) + ksu_common_newfstat_ret(*fd, (void **)statbuf_ptr, STAT_NATIVE, "sys_newfstat"); +#endif +} + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) +void ksu_handle_fstat64_ret(unsigned long *fd, struct stat64 __user **statbuf_ptr) +{ +#ifdef KSU_CAN_USE_JUMP_LABEL + if (static_branch_likely(&ksud_vfs_read_key)) + ksu_common_newfstat_ret(*(unsigned int *)fd, (void **)statbuf_ptr, STAT_STAT64, "sys_fstat64"); // WARNING: LE-only!!! +#else + if (unlikely(ksu_vfs_read_hook)) + ksu_common_newfstat_ret(*(unsigned int *)fd, (void **)statbuf_ptr, STAT_STAT64, "sys_fstat64"); // WARNING: LE-only!!! +#endif +} +#endif + +static bool safe_mode_flag = false; +#define VOLUME_PRESS_THRESHOLD_COUNT 3 + +bool ksu_is_safe_mode() +{ + // don't need to check again, userspace may call multiple times + static bool already_checked = false; + if (already_checked) + return true; + + // stop hook first! 
+ stop_input_hook(); + + if (!safe_mode_flag) + return false; + + pr_info("volume keys pressed max times, safe mode detected!\n"); + already_checked = true; + return true; +} + +static void vol_detector_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) +{ + static int vol_up_cnt = 0; + static int vol_down_cnt = 0; + + if (!value) + return; + + if (type != EV_KEY) + return; + + if (code == KEY_VOLUMEDOWN) { + vol_down_cnt++; + pr_info("KEY_VOLUMEDOWN press detected!\n"); + } + + if (code == KEY_VOLUMEUP) { + vol_up_cnt++; + pr_info("KEY_VOLUMEUP press detected!\n"); + } + + pr_info("volume_pressed_count: vol_up: %d vol_down: %d\n", vol_up_cnt, vol_down_cnt); + + /* + * on upstream we call stop_input_hook() here but this is causing issues + * #1. unregistering an input handler inside the input handler is a bad meme + * #2. when I tried to defer unreg to a kthread, it also causes issues on some users? nfi. + * since unregging is done anyway on ksu_is_safe_mode() or on_post_fs_data() we just dont bother. 
+ * + */ + if (vol_up_cnt >= VOLUME_PRESS_THRESHOLD_COUNT || vol_down_cnt >= VOLUME_PRESS_THRESHOLD_COUNT) { + pr_info("volume keys pressed max times, safe mode detected!\n"); + safe_mode_flag = true; + } +} + +static int vol_detector_connect(struct input_handler *handler, struct input_dev *dev, + const struct input_device_id *id) +{ + struct input_handle *handle; + int error; + + handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->dev = dev; + handle->handler = handler; + handle->name = "ksu_handle_input"; + + error = input_register_handle(handle); + if (error) + goto err_free_handle; + + error = input_open_device(handle); + if (error) + goto err_unregister_handle; + + return 0; + +err_unregister_handle: + input_unregister_handle(handle); +err_free_handle: + kfree(handle); + return error; +} + +static const struct input_device_id vol_detector_ids[] = { + // we add key volume up so that + // 1. if you have broken volume down you get shit + // 2. we can make sure to trigger only ksu safemode, not android's safemode. 
+ { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_VOLUMEUP)] = BIT_MASK(KEY_VOLUMEUP) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_VOLUMEDOWN)] = BIT_MASK(KEY_VOLUMEDOWN) }, + }, + { } +}; + +static void vol_detector_disconnect(struct input_handle *handle) +{ + input_close_device(handle); + input_unregister_handle(handle); + kfree(handle); +} + +MODULE_DEVICE_TABLE(input, vol_detector_ids); + +static struct input_handler vol_detector_handler = { + .event = vol_detector_event, + .connect = vol_detector_connect, + .disconnect = vol_detector_disconnect, + .name = "ksu", + .id_table = vol_detector_ids, +}; + +static int vol_detector_init() +{ + pr_info("vol_detector: init\n"); + return input_register_handler(&vol_detector_handler); +} + +static int vol_detector_exit() +{ + pr_info("vol_detector: exit\n"); + input_unregister_handler(&vol_detector_handler); + return 0; +} + +// we do this so that if theres no ksud to call on_post_fs_data/ksu_is_safe_mode/on_boot_completed +// there will be no input handler / extra execve branch that stays around +// 60s is more than enough time from second_stage to decrypt/post_fs_data +// if theres no ksud that does that, we trigger the closing of hooks ourselves +static int ksu_hook_watchdog(void *data) +{ + unsigned int i = 0; + + set_user_nice(current, 19); // low prio + pr_info("%s: kthread init!\n", __func__); + +start: + if (!*(volatile bool *)&ksu_input_hook) + goto bail; + + msleep(5000); + + i++; + + if (i < 12) + goto start; + + // if this path gets triggerred, it means theres no ksud + pr_info("%s: ksud probably absent, closing hooks!\n", __func__); + + // close down input hook + stop_input_hook(); + + // close down ksud escape + ksud_escape_exit(); + ksu_boot_completed = true; + +bail: + pr_info("%s: kthread exit!\n", __func__); + return 0; +} 
+
+static void stop_vfs_read_hook(void) // turn the vfs_read hook off, then start the hook watchdog
+{
+	ksu_vfs_read_hook = false;
+	pr_info("stop vfs_read_hook\n");
+	ksu_disable_vfs_read_branch();
+
+	kthread_run(ksu_hook_watchdog, NULL, "watchdog"); // best effort: a failed spawn is not detected here
+}
+
+static void stop_input_hook(void) // unregister the volume-key input handler at most once
+{
+	if (!ksu_input_hook) { return; } // flag already clear: nothing to unregister
+	ksu_input_hook = false;
+	pr_info("stop input_hook\n");
+
+	vol_detector_exit();
+}
+
+void __init ksu_ksud_init(void) // boot-time setup: escape hook first, then the volume-key detector
+{
+	ksud_escape_init();
+	vol_detector_init(); // return value (handler registration status) is ignored
+}
+
diff --git a/drivers/kernelsu/runtime/ksud.h b/drivers/kernelsu/runtime/ksud.h
new file mode 100644
index 000000000000..4461843407c3
--- /dev/null
+++ b/drivers/kernelsu/runtime/ksud.h
@@ -0,0 +1,26 @@
+#ifndef __KSU_H_KSUD
+#define __KSU_H_KSUD
+
+#define KSUD_PATH "/data/adb/ksud" // path the execve escape hooks compare init's target against
+
+void ksu_ksud_init(void); // (void), not (): a real prototype, matching the declarations below
+void ksu_ksud_exit(void);
+
+void on_post_fs_data(void);
+void on_module_mounted(void);
+void on_boot_completed(void);
+
+bool ksu_is_safe_mode(void);
+
+int nuke_ext4_sysfs(const char* mnt);
+
+static noinline void ksu_install_rc_hook(struct file *file);
+
+extern u32 ksu_file_sid;
+
+static bool ksu_module_mounted __read_mostly;
+static bool ksu_boot_completed __read_mostly; // also set by the hook watchdog when it gives up on ksud
+static bool ksu_vfs_read_hook __read_mostly; // cleared by stop_vfs_read_hook()
+static bool ksu_input_hook __read_mostly; // cleared by stop_input_hook()
+
+#endif
diff --git a/drivers/kernelsu/runtime/ksud_escape.c b/drivers/kernelsu/runtime/ksud_escape.c
new file mode 100644
index 000000000000..974d5859eece
--- /dev/null
+++ b/drivers/kernelsu/runtime/ksud_escape.c
@@ -0,0 +1,213 @@
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
+#if defined(CONFIG_KRETPROBES)
+#include <linux/kprobes.h> // kretprobe API used below; NOTE(review): include target was missing, confirm against the original commit
+static u32 cached_su_sid __read_mostly;
+static u32 cached_init_sid __read_mostly;
+
+// probed function: int security_bounded_transition(u32 old_sid, u32 new_sid)
+static int bounded_transition_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	// stash both arguments in the per-instance data area (data_size is 2 * u32) for the return handler
+	u32 *sid = (u32 *)ri->data;
+	sid[0] = PT_REGS_PARM1(regs); // old_sid
+	sid[1] = PT_REGS_PARM2(regs); // new_sid
+
+	return 0;
+}
+
+static int bounded_transition_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + u32 *sid = (u32 *)ri->data; + u32 old_sid = sid[0]; + u32 new_sid = sid[1]; + + if (!cached_su_sid) + return 0; + + // so if old sid is 'init' and trying to transition to a new sid of 'ksu' + // force the function to return 0 + if (old_sid == cached_init_sid && new_sid == cached_su_sid) { + pr_info("security_bounded_transition: allowing init (%d) -> ksu (%d) \n", old_sid, new_sid); + PT_REGS_RC(regs) = 0; // make the original func return 0 + } + + return 0; +} + +static struct kretprobe bounded_transition_rp = { + .kp.symbol_name = "security_bounded_transition", + .handler = bounded_transition_ret_handler, + .entry_handler = bounded_transition_entry_handler, + .data_size = sizeof(u32) * 2, // need to keep 2x u32's, one per sid + .maxactive = 20, +}; + +static int kp_ksud_transition_unregister(void *data) +{ + msleep(1000); + + unregister_kretprobe(&bounded_transition_rp); + pr_info("kp_ksud: unregister rp: security_bounded_transition\n"); + return 0; +} + +static void kp_ksud_transition_routine_start() +{ + static bool already_ran = false; + if (already_ran) + return; + + int ret = register_kretprobe(&bounded_transition_rp); + pr_info("kp_ksud: register rp: security_bounded_transition ret: %d\n", ret); + + already_ran = true; +} +#else +__attribute__((cold)) static noinline void sys_execve_escape_ksud_internal(void *filename) +{ +#ifdef KSU_CAN_USE_JUMP_LABEL + if (ksu_boot_completed) { + pr_info("sys_execve: boot completed, remove escape branch\n"); + static_branch_disable(&ksud_escape_key); + smp_mb(); + return; + } +#endif + + // see if its init + if (!is_init(current_cred())) + return; + + const char ksud_path[] = KSUD_PATH; + char path[sizeof(ksud_path)]; + + // filename is void * char __user * + const char __user **filename_user = (const char __user **)filename; + + // see if its trying to execute ksud + if (ksu_copy_from_user_retry(path, *filename_user, 
sizeof(path))) + return; + + if (likely(!!memcmp(ksud_path, path, sizeof(path)))) + return; + + pr_info("sys_execve: escape init executing %s with pid: %d\n", path, current->pid); + escape_to_root_forced(); // give this context all permissions + return; +} + +__attribute__((cold)) static noinline void kernel_execve_escape_ksud_internal(void *filename) +{ +#ifdef KSU_CAN_USE_JUMP_LABEL + if (ksu_boot_completed) { + pr_info("kernel_execve: boot completed, remove escape branch\n"); + static_branch_disable(&ksud_escape_key); + smp_mb(); + return; + } +#endif + // filename is void ** + void **filename_ptr = (void **)filename; + + // see if its init + if (!is_init(current_cred())) + return; + + if (!*filename_ptr) + return; + + if (likely(!!memcmp(*filename_ptr, KSUD_PATH, sizeof(KSUD_PATH)))) + return; + + pr_info("kernel_execve: escape init executing %s with pid: %d\n", *(const char **)filename_ptr, current->pid); + escape_to_root_forced(); // give this context all permissions + return; +} +#endif // KRETPROBES +#endif // < 4.14 && >= 4.2 + +// UL bprm_set_creds handling +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0) +static uintptr_t selinux_ops_addr; +static int (*orig_bprm_set_creds)(struct linux_binprm *bprm) = NULL; + +static int ksu_unregister_bprm_set_creds(void *data) +{ + struct security_operations *ops = (struct security_operations *)selinux_ops_addr; + if (orig_bprm_set_creds) { + pr_info("%s: restoring: bprm_set_creds 0x%lx -> 0x%lx\n", __func__, (long)ops->bprm_set_creds, (long)orig_bprm_set_creds); + ops->bprm_set_creds = orig_bprm_set_creds; + } + + return 0; +} + +static int hook_bprm_set_creds(struct linux_binprm *bprm) +{ + if (ksu_boot_completed) + goto unreg_bprm_set_creds; + + if (!is_init(current_cred())) + goto bprm_set_creds; + + if (!bprm->filename) + goto bprm_set_creds; + + if (!!strcmp(bprm->filename, "/data/adb/ksud")) + goto bprm_set_creds; + + struct task_security_struct *old_tsec = current_security(); + struct task_security_struct 
*new_tsec = bprm->cred->security; + + if (!(old_tsec->exec_sid)) + goto bprm_set_creds; + + // we copy what selinux was doing + // ref: https://elixir.bootlin.com/linux/v3.0.101/source/security/selinux/hooks.c#L1971 + + /* Default to the current task SID. */ + new_tsec->sid = old_tsec->sid; + new_tsec->osid = old_tsec->sid; + + /* Reset fs, key, and sock SIDs on execve. */ + new_tsec->create_sid = 0; + new_tsec->keycreate_sid = 0; + new_tsec->sockcreate_sid = 0; + + new_tsec->sid = old_tsec->exec_sid; + /* Reset exec SID on execve. */ + new_tsec->exec_sid = 0; + + pr_info("bprm_set_creds: allow init executing %s with pid: %d\n", bprm->filename, current->pid); + return 0; + +unreg_bprm_set_creds: + stop_machine(ksu_unregister_bprm_set_creds, NULL, NULL); + +bprm_set_creds: + return orig_bprm_set_creds(bprm); + + +} +#endif + +static void ksud_escape_init() +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) && defined(CONFIG_KRETPROBES) + kp_ksud_transition_routine_start(); +#endif +} + +static void ksud_escape_exit() +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) && defined(CONFIG_KRETPROBES) + static bool already_ran = false; + if (already_ran) + return; + + already_ran = true; + + kthread_run(kp_ksud_transition_unregister, NULL, "rp_unhook"); +#endif + +} diff --git a/drivers/kernelsu/runtime/ksud_escape.h b/drivers/kernelsu/runtime/ksud_escape.h new file mode 100644 index 000000000000..13ba5b9a5145 --- /dev/null +++ b/drivers/kernelsu/runtime/ksud_escape.h @@ -0,0 +1,41 @@ +#ifndef __KSU_H_KSUD_ESCAPE +#define __KSU_H_KSUD_ESCAPE + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) && !defined(CONFIG_KRETPROBES) +__attribute__((cold)) static noinline void sys_execve_escape_ksud_internal(void *filename); +__attribute__((cold)) static noinline void kernel_execve_escape_ksud_internal(void *filename); + +#ifdef 
KSU_CAN_USE_JUMP_LABEL +DEFINE_STATIC_KEY_TRUE(ksud_escape_key); +static inline void sys_execve_escape_ksud(void *filename) +{ + if (static_branch_likely(&ksud_escape_key)) + sys_execve_escape_ksud_internal(filename); +} +static inline void kernel_execve_escape_ksud(void *filename) +{ + if (static_branch_likely(&ksud_escape_key)) + kernel_execve_escape_ksud_internal(filename); +} +#else +static inline void sys_execve_escape_ksud(void *filename) +{ + if (unlikely(!ksu_boot_completed)) + sys_execve_escape_ksud_internal(filename); +} +static inline void kernel_execve_escape_ksud(void *filename) +{ + if (unlikely(!ksu_boot_completed)) + kernel_execve_escape_ksud_internal(filename); +} +#endif + +#else +static inline void sys_execve_escape_ksud(void *filename) { } // no-op +static inline void kernel_execve_escape_ksud(void *filename) { } // no-op +#endif // < 4.14 && >= 4.2 && !KRETPROBES + +static void ksud_escape_init(); +static void ksud_escape_exit(); + +#endif // __KSU_H_KSUD_ESCAPE diff --git a/drivers/kernelsu/selinux/rules.c b/drivers/kernelsu/selinux/rules.c index a2b9a7dde728..c51990b6b060 100644 --- a/drivers/kernelsu/selinux/rules.c +++ b/drivers/kernelsu/selinux/rules.c @@ -1,51 +1,62 @@ -#include -#include -#include - -#include "../klog.h" // IWYU pragma: keep -#include "selinux.h" -#include "sepolicy.h" -#include "ss/services.h" -#include "linux/lsm_audit.h" // IWYU pragma: keep -#include "xfrm.h" - #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) #define SELINUX_POLICY_INSTEAD_SELINUX_SS #endif #define ALL NULL -static struct policydb *get_policydb(void) -{ - struct policydb *db; -// selinux_state does not exists before 4.19 -#ifdef KSU_COMPAT_USE_SELINUX_STATE -#ifdef SELINUX_POLICY_INSTEAD_SELINUX_SS - struct selinux_policy *policy = selinux_state.policy; - db = &policy->policydb; +#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)) +extern int avc_ss_reset(u32 seqno); #else - struct selinux_ss *ss = 
selinux_state.ss; - db = &ss->policydb; +extern int avc_ss_reset(struct selinux_avc *avc, u32 seqno); #endif +// reset avc cache table, otherwise the new rules will not take effect if already denied +static void reset_avc_cache() +{ +#if ((!defined(KSU_COMPAT_USE_SELINUX_STATE)) || LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)) + avc_ss_reset(0); + selnl_notify_policyload(0); + selinux_status_update_policyload(0); #else - db = &policydb; + struct selinux_avc *avc = selinux_state.avc; + avc_ss_reset(avc, 0); + selnl_notify_policyload(0); + selinux_status_update_policyload(&selinux_state, 0); #endif - return db; + selinux_xfrm_notify_policyload(); } -static DEFINE_MUTEX(ksu_rules); -void apply_kernelsu_rules(void) -{ - struct policydb *db; +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0) - if (!getenforce()) { - pr_info("SELinux permissive or disabled, apply rules!\n"); - } +#if defined(KSU_COMPAT_USE_SELINUX_STATE) +static struct policydb *get_policydb(void) { return &selinux_state.ss->policydb; } +#else +static struct policydb *get_policydb(void) { return &policydb; } +#endif - mutex_lock(&ksu_rules); +// rwlock +#if defined(KSU_COMPAT_USE_SELINUX_STATE) +static inline rwlock_t *ksu_get_policy_rwlock() { return &selinux_state.ss->policy_rwlock; } +#elif defined(KSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK) +static inline rwlock_t *ksu_get_policy_rwlock() { extern rwlock_t policy_rwlock; return &policy_rwlock; } +#elif defined(CONFIG_KALLSYMS) +static noinline rwlock_t *ksu_get_policy_rwlock() { return (rwlock_t *)kallsyms_lookup_name("policy_rwlock"); } +#else +static inline rwlock_t *ksu_get_policy_rwlock() { return NULL; } +#endif - db = get_policydb(); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) || defined(KSU_COMPAT_HAS_BACKPORTED_CPUS_PTR) +static inline const cpumask_t *ksu_get_current_cpumask_t() { return current->cpus_ptr; } +#else +static inline cpumask_t *ksu_get_current_cpumask_t() { return ¤t->cpus_allowed; } +#endif + +#endif // < 5.10 + +static int 
apply_kernelsu_rules_fn(void *ptr) +{ + struct policydb *db = (struct policydb *)ptr; + ksu_type(db, KERNEL_SU_DOMAIN, "domain"); ksu_permissive(db, KERNEL_SU_DOMAIN); ksu_typeattribute(db, KERNEL_SU_DOMAIN, "mlstrustedsubject"); ksu_typeattribute(db, KERNEL_SU_DOMAIN, "netdomain"); @@ -54,7 +65,7 @@ void apply_kernelsu_rules(void) // Create unconstrained file type ksu_type(db, KERNEL_SU_FILE, "file_type"); ksu_typeattribute(db, KERNEL_SU_FILE, "mlstrustedobject"); - ksu_allow(db, ALL, KERNEL_SU_FILE, ALL, ALL); + ksu_allow(db, "domain", KERNEL_SU_FILE, ALL, ALL); // allow all! ksu_allow(db, KERNEL_SU_DOMAIN, ALL, ALL, ALL); @@ -69,10 +80,10 @@ void apply_kernelsu_rules(void) // our ksud triggered by init ksu_allow(db, "init", KERNEL_SU_DOMAIN, ALL, ALL); -#ifdef CONFIG_KSU_MANUAL_HOOK + + // restored from https://github.com/tiann/KernelSU/pull/3031 ksu_allow(db, "init", "adb_data_file", "file", ALL); ksu_allow(db, "init", "adb_data_file", "dir", ALL); // #1289 -#endif // copied from Magisk rules // suRights @@ -81,7 +92,7 @@ void apply_kernelsu_rules(void) ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "open"); ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "file", "read"); ksu_allow(db, "servicemanager", KERNEL_SU_DOMAIN, "process", "getattr"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "process", "sigchld"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "process", "sigchld"); // allowLog ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "dir", "search"); @@ -89,407 +100,606 @@ void apply_kernelsu_rules(void) ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "open"); ksu_allow(db, "logd", KERNEL_SU_DOMAIN, "file", "getattr"); - // dumpsys - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fd", "use"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "write"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "read"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "open"); - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "fifo_file", "getattr"); + // dumpsys, send fd + 
ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fd", "use"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "write"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "read"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "open"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "fifo_file", "getattr"); // bootctl ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "dir", "search"); ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "file", "read"); ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "file", "open"); - ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "process", - "getattr"); + ksu_allow(db, "hwservicemanager", KERNEL_SU_DOMAIN, "process", "getattr"); // Allow all binder transactions - ksu_allow(db, ALL, KERNEL_SU_DOMAIN, "binder", ALL); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "binder", ALL); // Allow system server kill su process ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "getpgid"); ksu_allow(db, "system_server", KERNEL_SU_DOMAIN, "process", "sigkill"); - mutex_unlock(&ksu_rules); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "read"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "write"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "connectto"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "getopt"); + ksu_allow(db, "domain", KERNEL_SU_DOMAIN, "unix_stream_socket", "getattr"); + + return 0; } -#define MAX_SEPOL_LEN 128 +void apply_kernelsu_rules() +{ + struct policydb *db; + + if (!getenforce()) { + pr_info("SELinux permissive or disabled, apply rules!\n"); + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) + struct selinux_policy *pol, *old_pol = selinux_state.policy; + mutex_lock(&selinux_state.policy_mutex); + pol = ksu_dup_sepolicy(rcu_dereference_protected(old_pol, lockdep_is_held(&selinux_state.policy_mutex))); + if (IS_ERR(pol)) { + pr_err("failed to dup selinux_policy: %ld\n", PTR_ERR(pol)); + goto 
out_unlock; + } + db = &pol->policydb; + + apply_kernelsu_rules_fn((void *)db); + + rcu_assign_pointer(selinux_state.policy, pol); + synchronize_rcu(); + ksu_destroy_sepolicy(old_pol); + + reset_avc_cache(); +out_unlock: + mutex_unlock(&selinux_state.policy_mutex); +#else + + db = get_policydb(); -#define CMD_NORMAL_PERM 1 -#define CMD_XPERM 2 -#define CMD_TYPE_STATE 3 -#define CMD_TYPE 4 -#define CMD_TYPE_ATTR 5 -#define CMD_ATTR 6 -#define CMD_TYPE_TRANSITION 7 -#define CMD_TYPE_CHANGE 8 -#define CMD_GENFSCON 9 + rwlock_t *lock = ksu_get_policy_rwlock(); + if (!lock) + goto do_stop_machine; + + /* + * HACK: write_lock() is held with preempt enabled. DO NOT let the + * task be migrated to any other CPU than the current CPU. And since + * set_cpus_allowed_ptr() can sleep, use raw_smp_processor_id() to get + * current CPU and bypass preemption checks. + */ + cpumask_t old_mask; + cpumask_copy(&old_mask, ksu_get_current_cpumask_t()); + set_cpus_allowed_ptr(current, cpumask_of(raw_smp_processor_id())); + + pr_info("%s: type: policy_rwlock \n", __func__); + write_lock(lock); + preempt_enable(); + + apply_kernelsu_rules_fn((void *)db); + + preempt_disable(); + write_unlock(lock); + set_cpus_allowed_ptr(current, &old_mask); + goto out_flush; + +do_stop_machine: + pr_info("%s: type: stop_machine()\n", __func__); + stop_machine(apply_kernelsu_rules_fn, (void *)db, NULL); + +out_flush: + smp_mb(); + reset_avc_cache(); +#endif +} + +#define KSU_SEPOLICY_MAX_BATCH_SIZE (8U * 1024U * 1024U) +#define KSU_SEPOLICY_MAX_ARGS 5 struct sepol_data { u32 cmd; u32 subcmd; - u64 sepol1; - u64 sepol2; - u64 sepol3; - u64 sepol4; - u64 sepol5; - u64 sepol6; - u64 sepol7; }; -static int get_object(char *buf, char __user *user_object, size_t buf_sz, - char **object) +struct sepol_batch_cursor { + const u8 *cur; + const u8 *end; +}; + +static size_t sepol_remaining(const struct sepol_batch_cursor *cursor) { - if (!user_object) { - *object = ALL; - return 0; - } + return (size_t)(cursor->end 
- cursor->cur); +} - if (strncpy_from_user(buf, user_object, buf_sz) < 0) { +static int sepol_read_cmd_header(struct sepol_batch_cursor *cursor, struct sepol_data *header) +{ + if (sepol_remaining(cursor) < sizeof(*header)) { return -EINVAL; } - *object = buf; + memcpy(header, cursor->cur, sizeof(*header)); + cursor->cur += sizeof(*header); return 0; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) || \ - !defined(KSU_COMPAT_USE_SELINUX_STATE) -extern int avc_ss_reset(u32 seqno); -#else -extern int avc_ss_reset(struct selinux_avc *avc, u32 seqno); -#endif -// reset avc cache table, otherwise the new rules will not take effect if already denied -static void reset_avc_cache(void) +static int sepol_read_string(struct sepol_batch_cursor *cursor, const char **out) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0) || \ - !defined(KSU_COMPAT_USE_SELINUX_STATE) - avc_ss_reset(0); - selnl_notify_policyload(0); - selinux_status_update_policyload(0); -#else - struct selinux_avc *avc = selinux_state.avc; - avc_ss_reset(avc, 0); - selnl_notify_policyload(0); - selinux_status_update_policyload(&selinux_state, 0); -#endif - selinux_xfrm_notify_policyload(); -} - -int handle_sepolicy(unsigned long arg3, void __user *arg4) -{ - struct policydb *db; + u32 len; + const char *str; - if (!arg4) { + if (sepol_remaining(cursor) < sizeof(len)) { return -EINVAL; } - if (!getenforce()) { - pr_info("SELinux permissive or disabled when handle policy!\n"); + memcpy(&len, cursor->cur, sizeof(len)); + cursor->cur += sizeof(len); + + if (len >= sepol_remaining(cursor)) { + return -EINVAL; } - struct sepol_data data; - if (copy_from_user(&data, arg4, sizeof(struct sepol_data))) { - pr_err("sepol: copy sepol_data failed.\n"); + str = (const char *)cursor->cur; + if (memchr(str, '\0', len) != NULL || str[len] != '\0') { return -EINVAL; } - u32 cmd = data.cmd; - u32 subcmd = data.subcmd; + cursor->cur += len + 1; + if (len == 0) { + *out = ALL; + return 0; + } + + *out = str; + return 0; +} - 
mutex_lock(&ksu_rules); +static int sepol_require_not_all(const char *value, const char *name) +{ + if (value != ALL) { + return 0; + } - db = get_policydb(); + pr_err("sepol: %s cannot be ALL.\n", name); + return -EINVAL; +} - int ret = -EINVAL; +static int sepol_expected_argc(u32 cmd) +{ switch (cmd) { - case CMD_NORMAL_PERM: { - char src_buf[MAX_SEPOL_LEN]; - char tgt_buf[MAX_SEPOL_LEN]; - char cls_buf[MAX_SEPOL_LEN]; - char perm_buf[MAX_SEPOL_LEN]; + case KSU_SEPOLICY_CMD_NORMAL_PERM: + return 4; + case KSU_SEPOLICY_CMD_XPERM: + return 5; + case KSU_SEPOLICY_CMD_TYPE_STATE: + return 1; + case KSU_SEPOLICY_CMD_TYPE: + case KSU_SEPOLICY_CMD_TYPE_ATTR: + return 2; + case KSU_SEPOLICY_CMD_ATTR: + return 1; + case KSU_SEPOLICY_CMD_TYPE_TRANSITION: + return 5; + case KSU_SEPOLICY_CMD_TYPE_CHANGE: + return 4; + case KSU_SEPOLICY_CMD_GENFSCON: + return 3; + default: + return -EINVAL; + } +} - char *s, *t, *c, *p; - if (get_object(src_buf, (void __user *)data.sepol1, - sizeof(src_buf), &s) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; +static int apply_one_sepolicy_cmd(struct policydb *db, const struct sepol_data *header, const char **args) +{ + bool success = false; + int ret; + + switch (header->cmd) { + case KSU_SEPOLICY_CMD_NORMAL_PERM: + if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_ALLOW) { + success = ksu_allow(db, args[0], args[1], args[2], args[3]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DENY) { + success = ksu_deny(db, args[0], args[1], args[2], args[3]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_AUDITALLOW) { + success = ksu_auditallow(db, args[0], args[1], args[2], args[3]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_NORMAL_PERM_DONTAUDIT) { + success = ksu_dontaudit(db, args[0], args[1], args[2], args[3]); + } else { + pr_err("sepol: unknown subcmd: %d\n", header->subcmd); } + return success ? 
0 : -EINVAL; - if (get_object(tgt_buf, (void __user *)data.sepol2, - sizeof(tgt_buf), &t) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; + case KSU_SEPOLICY_CMD_XPERM: + ret = sepol_require_not_all(args[3], "operation"); + if (ret < 0) { + return ret; } - - if (get_object(cls_buf, (void __user *)data.sepol3, - sizeof(cls_buf), &c) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; + ret = sepol_require_not_all(args[4], "perm_set"); + if (ret < 0) { + return ret; } - if (get_object(perm_buf, (void __user *)data.sepol4, - sizeof(perm_buf), &p) < 0) { - pr_err("sepol: copy perm failed.\n"); - goto exit; + if (header->subcmd == KSU_SEPOLICY_SUBCMD_XPERM_ALLOW) { + success = ksu_allowxperm(db, args[0], args[1], args[2], args[4]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_XPERM_AUDITALLOW) { + success = ksu_auditallowxperm(db, args[0], args[1], args[2], args[4]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_XPERM_DONTAUDIT) { + success = ksu_dontauditxperm(db, args[0], args[1], args[2], args[4]); + } else { + pr_err("sepol: unknown subcmd: %d\n", header->subcmd); } + return success ? 0 : -EINVAL; - bool success = false; + case KSU_SEPOLICY_CMD_TYPE_STATE: + ret = sepol_require_not_all(args[0], "type"); + if (ret < 0) { + return ret; + } - if (subcmd == 1) { - success = ksu_allow(db, s, t, c, p); - } else if (subcmd == 2) { - success = ksu_deny(db, s, t, c, p); - } else if (subcmd == 3) { - success = ksu_auditallow(db, s, t, c, p); - } else if (subcmd == 4) { - success = ksu_dontaudit(db, s, t, c, p); + if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_STATE_PERMISSIVE) { + success = ksu_permissive(db, args[0]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_STATE_ENFORCE) { + success = ksu_enforce(db, args[0]); } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); + pr_err("sepol: unknown subcmd: %d\n", header->subcmd); } - ret = success ? 
0 : -EINVAL; - break; - } - case CMD_XPERM: { - char src_buf[MAX_SEPOL_LEN]; - char tgt_buf[MAX_SEPOL_LEN]; - char cls_buf[MAX_SEPOL_LEN]; - - char __maybe_unused - operation[MAX_SEPOL_LEN]; // it is always ioctl now! - char perm_set[MAX_SEPOL_LEN]; - - char *s, *t, *c; - if (get_object(src_buf, (void __user *)data.sepol1, - sizeof(src_buf), &s) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - if (get_object(tgt_buf, (void __user *)data.sepol2, - sizeof(tgt_buf), &t) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; - } - if (get_object(cls_buf, (void __user *)data.sepol3, - sizeof(cls_buf), &c) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; - } - if (strncpy_from_user(operation, (void __user *)data.sepol4, - sizeof(operation)) < 0) { - pr_err("sepol: copy operation failed.\n"); - goto exit; - } - if (strncpy_from_user(perm_set, (void __user *)data.sepol5, - sizeof(perm_set)) < 0) { - pr_err("sepol: copy perm_set failed.\n"); - goto exit; - } - - bool success = false; - if (subcmd == 1) { - success = ksu_allowxperm(db, s, t, c, perm_set); - } else if (subcmd == 2) { - success = ksu_auditallowxperm(db, s, t, c, perm_set); - } else if (subcmd == 3) { - success = ksu_dontauditxperm(db, s, t, c, perm_set); + return success ? 0 : -EINVAL; + + case KSU_SEPOLICY_CMD_TYPE: + case KSU_SEPOLICY_CMD_TYPE_ATTR: + ret = sepol_require_not_all(args[0], "type"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[1], "attribute"); + if (ret < 0) { + return ret; + } + + if (header->cmd == KSU_SEPOLICY_CMD_TYPE) { + success = ksu_type(db, args[0], args[1]); } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); + success = ksu_typeattribute(db, args[0], args[1]); } - ret = success ? 
0 : -EINVAL; - break; - } - case CMD_TYPE_STATE: { - char src[MAX_SEPOL_LEN]; + if (!success) { + pr_err("sepol: %d failed.\n", header->cmd); + return -EINVAL; + } + return 0; - if (strncpy_from_user(src, (void __user *)data.sepol1, - sizeof(src)) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; + case KSU_SEPOLICY_CMD_ATTR: + ret = sepol_require_not_all(args[0], "attribute"); + if (ret < 0) { + return ret; } - bool success = false; - if (subcmd == 1) { - success = ksu_permissive(db, src); - } else if (subcmd == 2) { - success = ksu_enforce(db, src); - } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); + if (!ksu_attribute(db, args[0])) { + pr_err("sepol: %d failed.\n", header->cmd); + return -EINVAL; + } + return 0; + + case KSU_SEPOLICY_CMD_TYPE_TRANSITION: { + const char *object = ALL; + + ret = sepol_require_not_all(args[0], "src"); + if (ret < 0) { + return ret; } - if (success) - ret = 0; - break; + ret = sepol_require_not_all(args[1], "tgt"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[2], "cls"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[3], "default_type"); + if (ret < 0) { + return ret; + } + + object = args[4]; + + success = ksu_type_transition(db, args[0], args[1], args[2], args[3], object); + return success ? 
0 : -EINVAL; } - case CMD_TYPE: - case CMD_TYPE_ATTR: { - char type[MAX_SEPOL_LEN]; - char attr[MAX_SEPOL_LEN]; - if (strncpy_from_user(type, (void __user *)data.sepol1, - sizeof(type)) < 0) { - pr_err("sepol: copy type failed.\n"); - goto exit; + case KSU_SEPOLICY_CMD_TYPE_CHANGE: + ret = sepol_require_not_all(args[0], "src"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[1], "tgt"); + if (ret < 0) { + return ret; } - if (strncpy_from_user(attr, (void __user *)data.sepol2, - sizeof(attr)) < 0) { - pr_err("sepol: copy attr failed.\n"); - goto exit; + ret = sepol_require_not_all(args[2], "cls"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[3], "default_type"); + if (ret < 0) { + return ret; } - bool success = false; - if (cmd == CMD_TYPE) { - success = ksu_type(db, type, attr); + if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_CHANGE) { + success = ksu_type_change(db, args[0], args[1], args[2], args[3]); + } else if (header->subcmd == KSU_SEPOLICY_SUBCMD_TYPE_CHANGE_MEMBER) { + success = ksu_type_member(db, args[0], args[1], args[2], args[3]); } else { - success = ksu_typeattribute(db, type, attr); - } - if (!success) { - pr_err("sepol: %d failed.\n", cmd); - goto exit; + pr_err("sepol: unknown subcmd: %d\n", header->subcmd); } - ret = 0; - break; - } - case CMD_ATTR: { - char attr[MAX_SEPOL_LEN]; + return success ? 
0 : -EINVAL; - if (strncpy_from_user(attr, (void __user *)data.sepol1, - sizeof(attr)) < 0) { - pr_err("sepol: copy attr failed.\n"); - goto exit; + case KSU_SEPOLICY_CMD_GENFSCON: + ret = sepol_require_not_all(args[0], "name"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[1], "path"); + if (ret < 0) { + return ret; + } + ret = sepol_require_not_all(args[2], "context"); + if (ret < 0) { + return ret; } - if (!ksu_attribute(db, attr)) { - pr_err("sepol: %d failed.\n", cmd); - goto exit; + + if (!ksu_genfscon(db, args[0], args[1], args[2])) { + pr_err("sepol: %d failed.\n", header->cmd); + return -EINVAL; } - ret = 0; - break; + return 0; + + default: + pr_err("sepol: unknown cmd: %d\n", header->cmd); + return -EINVAL; } - case CMD_TYPE_TRANSITION: { - char src[MAX_SEPOL_LEN]; - char tgt[MAX_SEPOL_LEN]; - char cls[MAX_SEPOL_LEN]; - char default_type[MAX_SEPOL_LEN]; - char object[MAX_SEPOL_LEN]; - - if (strncpy_from_user(src, (void __user *)data.sepol1, - sizeof(src)) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - if (strncpy_from_user(tgt, (void __user *)data.sepol2, - sizeof(tgt)) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; - } - if (strncpy_from_user(cls, (void __user *)data.sepol3, - sizeof(cls)) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; - } - if (strncpy_from_user(default_type, (void __user *)data.sepol4, - sizeof(default_type)) < 0) { - pr_err("sepol: copy default_type failed.\n"); - goto exit; - } - char *real_object; - if ((void __user *)data.sepol5 == NULL) { - real_object = NULL; - } else { - if (strncpy_from_user(object, - (void __user *)data.sepol5, - sizeof(object)) < 0) { - pr_err("sepol: copy object failed.\n"); - goto exit; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +int handle_sepolicy(void __user *user_data, u64 data_len) +{ + struct selinux_policy *pol, *old_pol; + struct policydb *db; + struct sepol_batch_cursor cursor; + u8 *payload; + int ret; + int 
success_cmd_count; + u32 cmd_index; + + if (!user_data || !data_len) { + return -EINVAL; + } + + if (data_len > KSU_SEPOLICY_MAX_BATCH_SIZE) { + return -E2BIG; + } + + payload = kvmalloc((size_t)data_len, GFP_KERNEL); + if (!payload) { + return -ENOMEM; + } + + if (copy_from_user(payload, user_data, (size_t)data_len)) { + ret = -EFAULT; + goto out_free; + } + + if (!getenforce()) { + pr_info("SELinux permissive or disabled when handle policy!\n"); + } + + mutex_lock(&selinux_state.policy_mutex); + + old_pol = selinux_state.policy; + pol = ksu_dup_sepolicy(rcu_dereference_protected(old_pol, lockdep_is_held(&selinux_state.policy_mutex))); + if (IS_ERR(pol)) { + ret = PTR_ERR(pol); + pr_err("ksu_dup_sepolicy err: %d\n", ret); + goto out_unlock; + } + db = &pol->policydb; + + cursor.cur = payload; + cursor.end = payload + (size_t)data_len; + + ret = 0; + success_cmd_count = 0; + cmd_index = 0; + while (cursor.cur < cursor.end) { + struct sepol_data header; + const char *args[KSU_SEPOLICY_MAX_ARGS] = { 0 }; + int expected_argc; + u32 arg_index; + + ret = sepol_read_cmd_header(&cursor, &header); + if (ret < 0) { + pr_err("sepol: failed to read cmd header #%u.\n", cmd_index); + goto out_drop_new_policy; + } + + expected_argc = sepol_expected_argc(header.cmd); + if (expected_argc < 0 || expected_argc > KSU_SEPOLICY_MAX_ARGS) { + ret = -EINVAL; + pr_err("sepol: invalid cmd header #%u.\n", cmd_index); + goto out_drop_new_policy; + } + + for (arg_index = 0; arg_index < (u32)expected_argc; arg_index++) { + ret = sepol_read_string(&cursor, &args[arg_index]); + if (ret < 0) { + pr_err("sepol: failed to read cmd #%u arg #%u.\n", cmd_index, arg_index); + goto out_drop_new_policy; } - real_object = object; } - bool success = ksu_type_transition(db, src, tgt, cls, - default_type, real_object); - if (success) - ret = 0; - break; - } - case CMD_TYPE_CHANGE: { - char src[MAX_SEPOL_LEN]; - char tgt[MAX_SEPOL_LEN]; - char cls[MAX_SEPOL_LEN]; - char default_type[MAX_SEPOL_LEN]; - - if 
(strncpy_from_user(src, (void __user *)data.sepol1, - sizeof(src)) < 0) { - pr_err("sepol: copy src failed.\n"); - goto exit; - } - if (strncpy_from_user(tgt, (void __user *)data.sepol2, - sizeof(tgt)) < 0) { - pr_err("sepol: copy tgt failed.\n"); - goto exit; - } - if (strncpy_from_user(cls, (void __user *)data.sepol3, - sizeof(cls)) < 0) { - pr_err("sepol: copy cls failed.\n"); - goto exit; - } - if (strncpy_from_user(default_type, (void __user *)data.sepol4, - sizeof(default_type)) < 0) { - pr_err("sepol: copy default_type failed.\n"); - goto exit; - } - bool success = false; - if (subcmd == 1) { - success = ksu_type_change(db, src, tgt, cls, - default_type); - } else if (subcmd == 2) { - success = ksu_type_member(db, src, tgt, cls, - default_type); + ret = apply_one_sepolicy_cmd(db, &header, args); + if (ret < 0) { + pr_err("sepol: cmd #%u failed, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd); } else { - pr_err("sepol: unknown subcmd: %d\n", subcmd); + success_cmd_count++; + int argc = sepol_expected_argc(header.cmd); + int i; + for (i = 0; i < argc; i++) + ksu_add_shit_to_list(args[i]); } - if (success) - ret = 0; - break; + cmd_index++; } - case CMD_GENFSCON: { - char name[MAX_SEPOL_LEN]; - char path[MAX_SEPOL_LEN]; - char context[MAX_SEPOL_LEN]; - if (strncpy_from_user(name, (void __user *)data.sepol1, - sizeof(name)) < 0) { - pr_err("sepol: copy name failed.\n"); - goto exit; - } - if (strncpy_from_user(path, (void __user *)data.sepol2, - sizeof(path)) < 0) { - pr_err("sepol: copy path failed.\n"); - goto exit; - } - if (strncpy_from_user(context, (void __user *)data.sepol3, - sizeof(context)) < 0) { - pr_err("sepol: copy context failed.\n"); - goto exit; - } - - if (!ksu_genfscon(db, name, path, context)) { - pr_err("sepol: %d failed.\n", cmd); - goto exit; - } - ret = 0; - break; + + rcu_assign_pointer(selinux_state.policy, pol); + synchronize_rcu(); + ksu_destroy_sepolicy(old_pol); + + reset_avc_cache(); + ret = success_cmd_count; + goto 
out_unlock; + +out_drop_new_policy: + ksu_destroy_sepolicy(pol); +out_unlock: + mutex_unlock(&selinux_state.policy_mutex); +out_free: + kvfree(payload); + + return ret; +} +#else + +struct handle_sepolicy_args { + void *ctx_success_cmd_count; + void *ctx_payload; + u64 ctx_data_len; +}; + +static int handle_sepolicy_fn(void *data) +{ + struct sepol_batch_cursor cursor; + int ret = 0; + u32 cmd_index = 0; + int success_cmd_count = 0; + + struct policydb *db = get_policydb(); + struct handle_sepolicy_args *ctx = (struct handle_sepolicy_args *)data; + u8 *payload = (u8 *)ctx->ctx_payload; + u64 data_len = ctx->ctx_data_len; + + cursor.cur = payload; + cursor.end = payload + (size_t)data_len; + + while (cursor.cur < cursor.end) { + struct sepol_data header; + const char *args[KSU_SEPOLICY_MAX_ARGS] = { 0 }; + int expected_argc; + u32 arg_index; + + ret = sepol_read_cmd_header(&cursor, &header); + if (ret < 0) { + pr_err("sepol: failed to read cmd header #%u.\n", cmd_index); + goto out; + } + + expected_argc = sepol_expected_argc(header.cmd); + if (expected_argc < 0 || expected_argc > KSU_SEPOLICY_MAX_ARGS) { + ret = -EINVAL; + pr_err("sepol: invalid cmd header #%u.\n", cmd_index); + goto out; + } + + for (arg_index = 0; arg_index < (u32)expected_argc; arg_index++) { + ret = sepol_read_string(&cursor, &args[arg_index]); + if (ret < 0) { + pr_err("sepol: failed to read cmd #%u arg #%u.\n", cmd_index, arg_index); + goto out; + } + } + + ret = apply_one_sepolicy_cmd(db, &header, args); + if (ret < 0) + pr_err("sepol: cmd #%u failed, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd); + else { + pr_info("sepol: cmd #%u success, cmd=%u subcmd=%u.\n", cmd_index, header.cmd, header.subcmd); + success_cmd_count++; + int argc = sepol_expected_argc(header.cmd); + int i; + for (i = 0; i < argc; i++) + ksu_add_shit_to_list(args[i]); + + } + + cmd_index++; } - default: { - pr_err("sepol: unknown cmd: %d\n", cmd); - break; + +out: + *(int *)(ctx->ctx_success_cmd_count) = 
success_cmd_count; + return ret; +} + +int handle_sepolicy(void __user *user_data, u64 data_len) +{ + u8 *payload; + int ret = 0; + int success_cmd_count = 0; + + if (!user_data || !data_len) + return -EINVAL; + + if (data_len > KSU_SEPOLICY_MAX_BATCH_SIZE) + return -E2BIG; + + payload = kvmalloc((size_t)data_len, GFP_KERNEL); + if (!payload) + return -ENOMEM; + + if (copy_from_user(payload, user_data, (size_t)data_len)) { + ret = -EFAULT; + goto out_free; } + + if (!getenforce()) { + pr_info("SELinux permissive or disabled when handle policy!\n"); } -exit: - mutex_unlock(&ksu_rules); + struct handle_sepolicy_args ctx = { 0 }; + ctx.ctx_success_cmd_count = (void *)&success_cmd_count; + ctx.ctx_payload = (void *)payload; + ctx.ctx_data_len = (u64)data_len; + + rwlock_t *lock = ksu_get_policy_rwlock(); + if (!lock) + goto do_stop_machine; + + cpumask_t old_mask; + cpumask_copy(&old_mask, ksu_get_current_cpumask_t()); + set_cpus_allowed_ptr(current, cpumask_of(raw_smp_processor_id())); + + write_lock(lock); + preempt_enable(); - // only allow and xallow needs to reset avc cache, but we cannot do that because - // we are in atomic context. so we just reset it every time. 
+ ret = handle_sepolicy_fn((void *)&ctx); + + preempt_disable(); + write_unlock(lock); + set_cpus_allowed_ptr(current, &old_mask); + goto out_done; + +do_stop_machine: + ret = stop_machine(handle_sepolicy_fn, (void *)&ctx, NULL); + +out_done: + if (ret) + goto out_free; + + smp_mb(); reset_avc_cache(); + ret = success_cmd_count; + +out_free: + kvfree(payload); return ret; } +#endif diff --git a/drivers/kernelsu/selinux/selinux.c b/drivers/kernelsu/selinux/selinux.c index 010732dffd9b..d7c6a71d20c6 100644 --- a/drivers/kernelsu/selinux/selinux.c +++ b/drivers/kernelsu/selinux/selinux.c @@ -1,11 +1,3 @@ -#include "linux/cred.h" -#include "linux/sched.h" -#include "linux/security.h" -#include "linux/version.h" -#include "selinux_defs.h" -#include "../klog.h" // IWYU pragma: keep -#include "../ksu.h" - /* * Cached SID values for frequently checked contexts. * These are resolved once at init and used for fast u32 comparison @@ -23,60 +15,40 @@ static u32 cached_zygote_sid __read_mostly = 0; static u32 cached_init_sid __read_mostly = 0; u32 ksu_file_sid __read_mostly = 0; -static int transive_to_domain(const char *domain, struct cred *cred) +static int transive_to_domain(const char *domain, struct cred *cred, bool clear_exec_sid) { - taskcred_sec_t *tsec; u32 sid; int error; - - tsec = (taskcred_sec_t *)selinux_cred(cred); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 18, 0) + struct task_security_struct *tsec; +#else + struct cred_security_struct *tsec; +#endif + tsec = selinux_cred(cred); if (!tsec) { pr_err("tsec == NULL!\n"); return -1; } error = security_secctx_to_secid(domain, strlen(domain), &sid); if (error) { - pr_info("security_secctx_to_secid %s -> sid: %d, error: %d\n", - domain, sid, error); + pr_info("security_secctx_to_secid %s -> sid: %d, error: %d\n", domain, + sid, error); } if (!error) { tsec->sid = sid; tsec->create_sid = 0; tsec->keycreate_sid = 0; tsec->sockcreate_sid = 0; + if (clear_exec_sid) { + tsec->exec_sid = 0; + } } return error; } -#if 
LINUX_VERSION_CODE <= KERNEL_VERSION(4, 19, 0) -bool __maybe_unused -is_ksu_transition(const struct task_security_struct *old_tsec, - const struct task_security_struct *new_tsec) +void setup_selinux(const char *domain, struct cred *cred) { - static u32 ksu_sid; - char *secdata; - int err; - u32 seclen; - bool allowed = false; - - if (!ksu_sid) { - err = security_secctx_to_secid( - KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &ksu_sid); - pr_err("failed to get ksu_sid: %d\n", err); - } - - if (security_secid_to_secctx(old_tsec->sid, &secdata, &seclen)) - return false; - - allowed = (!strcmp("u:r:init:s0", secdata) && new_tsec->sid == ksu_sid); - security_release_secctx(secdata, seclen); - return allowed; -} -#endif - -void setup_selinux(const char *domain) -{ - if (transive_to_domain(domain, (struct cred *)__task_cred(current))) { + if (transive_to_domain(domain, cred, false)) { pr_err("transive domain failed.\n"); return; } @@ -84,24 +56,65 @@ void setup_selinux(const char *domain) void setup_ksu_cred(void) { - if (ksu_cred && transive_to_domain(KERNEL_SU_CONTEXT, ksu_cred)) { + if (ksu_cred && transive_to_domain(KERNEL_SU_CONTEXT, ksu_cred, false)) { pr_err("setup ksu cred failed.\n"); } } void setenforce(bool enforce) { - do_setenforce(enforce); +#ifdef CONFIG_SECURITY_SELINUX_DEVELOP +#ifdef KSU_COMPAT_USE_SELINUX_STATE + selinux_state.enforcing = enforce; +#else + selinux_enforcing = enforce; +#endif +#endif } bool getenforce(void) { - if (is_selinux_disabled()) { +#ifdef CONFIG_SECURITY_SELINUX_DISABLE +#ifdef KSU_COMPAT_USE_SELINUX_STATE + if (selinux_state.disabled) { return false; } +#else + if (selinux_disabled) { + return false; + } +#endif // KSU_COMPAT_USE_SELINUX_STATE +#endif // CONFIG_SECURITY_SELINUX_DISABLE + +#ifdef CONFIG_SECURITY_SELINUX_DEVELOP +#ifdef KSU_COMPAT_USE_SELINUX_STATE + return selinux_state.enforcing; +#else + return selinux_enforcing; +#endif +#else + return true; +#endif +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 14, 0) 
+struct lsm_context { + char *context; + u32 len; +}; - return is_selinux_enforcing(); +static int __security_secid_to_secctx(u32 secid, struct lsm_context *cp) +{ + return security_secid_to_secctx(secid, &cp->context, &cp->len); +} +static void __security_release_secctx(struct lsm_context *cp) +{ + security_release_secctx(cp->context, cp->len); } +#else +#define __security_secid_to_secctx security_secid_to_secctx +#define __security_release_secctx security_release_secctx +#endif /* * Initialize cached SID values for frequently checked SELinux contexts. @@ -112,8 +125,7 @@ void cache_sid(void) { int err; - err = security_secctx_to_secid( - KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &cached_su_sid); + err = security_secctx_to_secid(KERNEL_SU_CONTEXT, strlen(KERNEL_SU_CONTEXT), &cached_su_sid); if (err) { pr_warn("Failed to cache kernel su domain SID: %d\n", err); cached_su_sid = 0; @@ -121,8 +133,7 @@ void cache_sid(void) pr_info("Cached su SID: %u\n", cached_su_sid); } - err = security_secctx_to_secid(ZYGOTE_CONTEXT, strlen(ZYGOTE_CONTEXT), - &cached_zygote_sid); + err = security_secctx_to_secid(ZYGOTE_CONTEXT, strlen(ZYGOTE_CONTEXT), &cached_zygote_sid); if (err) { pr_warn("Failed to cache zygote SID: %d\n", err); cached_zygote_sid = 0; @@ -130,8 +141,7 @@ void cache_sid(void) pr_info("Cached zygote SID: %u\n", cached_zygote_sid); } - err = security_secctx_to_secid(INIT_CONTEXT, strlen(INIT_CONTEXT), - &cached_init_sid); + err = security_secctx_to_secid(INIT_CONTEXT, strlen(INIT_CONTEXT), &cached_init_sid); if (err) { pr_warn("Failed to cache init SID: %d\n", err); cached_init_sid = 0; @@ -139,8 +149,7 @@ void cache_sid(void) pr_info("Cached init SID: %u\n", cached_init_sid); } - err = security_secctx_to_secid(KSU_FILE_CONTEXT, - strlen(KSU_FILE_CONTEXT), &ksu_file_sid); + err = security_secctx_to_secid(KSU_FILE_CONTEXT, strlen(KSU_FILE_CONTEXT), &ksu_file_sid); if (err) { pr_warn("Failed to cache ksu_file SID: %d\n", err); ksu_file_sid = 0; @@ -153,15 +162,16 
@@ void cache_sid(void) * Fast path: compare task's SID directly against cached value. * Falls back to string comparison if cache is not initialized. */ -static bool is_sid_match(const struct cred *cred, u32 cached_sid, - const char *fallback_context) +static bool is_sid_match(const struct cred *cred, u32 cached_sid, const char *fallback_context) { - const taskcred_sec_t *tsec; if (!cred) { return false; } - - tsec = (const taskcred_sec_t *)selinux_cred(cred); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 18, 0) + const struct task_security_struct *tsec = selinux_cred(cred); +#else + const struct cred_security_struct *tsec = selinux_cred(cred); +#endif if (!tsec) { return false; } @@ -172,10 +182,9 @@ static bool is_sid_match(const struct cred *cred, u32 cached_sid, } // Slow path fallback: string comparison (only before cache is initialized) - struct lsm_context ctx = { 0 }; + struct lsm_context ctx; bool result; - int err = __security_secid_to_secctx(tsec->sid, &ctx); - if (err) { + if (__security_secid_to_secctx(tsec->sid, &ctx)) { return false; } result = strncmp(fallback_context, ctx.context, ctx.len) == 0; @@ -202,3 +211,19 @@ bool is_init(const struct cred *cred) { return is_sid_match(cred, cached_init_sid, INIT_CONTEXT); } + +void escape_to_root_for_adb_root(void) +{ + struct cred *cred = prepare_creds(); + if (!cred) { + pr_err("Failed to prepare adbd's creds!\n"); + return; + } + + if (transive_to_domain(KERNEL_SU_CONTEXT, cred, true)) { + pr_err("transive domain failed.\n"); + abort_creds(cred); + return; + } + commit_creds(cred); +} diff --git a/drivers/kernelsu/selinux/selinux.h b/drivers/kernelsu/selinux/selinux.h index cf8c414ee0ea..cbeac553d20a 100644 --- a/drivers/kernelsu/selinux/selinux.h +++ b/drivers/kernelsu/selinux/selinux.h @@ -1,12 +1,11 @@ #ifndef __KSU_H_SELINUX #define __KSU_H_SELINUX -#include "linux/types.h" -#include "linux/version.h" -#include "linux/cred.h" +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)) || 
defined(KSU_COMPAT_HAS_SELINUX_STATE) +#define KSU_COMPAT_USE_SELINUX_STATE +#endif -// TODO: rename to "ksu" -#define KERNEL_SU_DOMAIN "su" +#define KERNEL_SU_DOMAIN "ksu" #define KERNEL_SU_FILE "ksu_file" #define KERNEL_SU_CONTEXT "u:r:" KERNEL_SU_DOMAIN ":s0" @@ -14,30 +13,28 @@ #define ZYGOTE_CONTEXT "u:r:zygote:s0" #define INIT_CONTEXT "u:r:init:s0" -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) -#define KSU_COMPAT_USE_SELINUX_STATE -#endif - -void setup_selinux(const char *); +void setup_selinux(const char *, struct cred *); void setenforce(bool); -bool getenforce(void); +bool getenforce(); void cache_sid(void); -bool is_task_ksu_domain(const struct cred *cred); +bool is_task_ksu_domain(const struct cred* cred); + +bool is_ksu_domain(); -bool is_ksu_domain(void); +bool is_zygote(const struct cred* cred); -bool is_zygote(const struct cred *cred); +bool is_init(const struct cred* cred); -bool is_init(const struct cred *cred); +void apply_kernelsu_rules(); -void apply_kernelsu_rules(void); +int handle_sepolicy(void __user *user_data, u64 data_len); -int handle_sepolicy(unsigned long arg3, void __user *arg4); +void setup_ksu_cred(); -void setup_ksu_cred(void); +void escape_to_root_for_adb_root(); #endif diff --git a/drivers/kernelsu/selinux/selinux_defs.h b/drivers/kernelsu/selinux/selinux_defs.h deleted file mode 100644 index b8e47e7d77f1..000000000000 --- a/drivers/kernelsu/selinux/selinux_defs.h +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef __KSU_H_SELINUX_DEFS -#define __KSU_H_SELINUX_DEFS - -#include "selinux.h" -#include "objsec.h" -#ifndef KSU_COMPAT_USE_SELINUX_STATE -#include "avc.h" -#endif - -static inline bool is_selinux_disabled(void) -{ -#ifdef CONFIG_SECURITY_SELINUX_DISABLE -#ifdef KSU_COMPAT_USE_SELINUX_STATE - return selinux_state.disabled; -#else - return selinux_disabled; -#endif -#else - return false; -#endif -} - -static inline bool is_selinux_enforcing(void) -{ -#ifdef CONFIG_SECURITY_SELINUX_DEVELOP -#ifdef KSU_COMPAT_USE_SELINUX_STATE - 
return selinux_state.enforcing; -#else - return selinux_enforcing; -#endif -#else - return true; -#endif -} - -static inline void do_setenforce(bool val) -{ -#ifdef CONFIG_SECURITY_SELINUX_DEVELOP -#ifdef KSU_COMPAT_USE_SELINUX_STATE - selinux_state.enforcing = val; -#else - selinux_enforcing = val; -#endif -#else - /* do nothing */ -#endif -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 18, 0) -typedef struct task_security_struct taskcred_sec_t; -#else -typedef struct cred_security_struct taskcred_sec_t; -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0) -static inline taskcred_sec_t *selinux_cred(const struct cred *cred) -{ - return (taskcred_sec_t *)cred->security; -} -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 14, 0) -struct lsm_context { - char *context; - u32 len; -}; - -static inline int __security_secid_to_secctx(u32 secid, struct lsm_context *cp) -{ - return security_secid_to_secctx(secid, &cp->context, &cp->len); -} -static inline void __security_release_secctx(struct lsm_context *cp) -{ - security_release_secctx(cp->context, cp->len); -} -#else -#define __security_secid_to_secctx security_secid_to_secctx -#define __security_release_secctx security_release_secctx -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 10, 0) -/* - * get the subjective security ID of the current task - */ -static inline u32 current_sid(void) -{ - const taskcred_sec_t *sec = current_security(); - - return sec->sid; -} -#endif - -#endif diff --git a/drivers/kernelsu/selinux/sepolicy.c b/drivers/kernelsu/selinux/sepolicy.c index 1d3ec397030f..a97c7430efcf 100644 --- a/drivers/kernelsu/selinux/sepolicy.c +++ b/drivers/kernelsu/selinux/sepolicy.c @@ -1,58 +1,55 @@ -#include -#include -#include -#include - -#include "sepolicy.h" -#include "../klog.h" // IWYU pragma: keep -#include "ss/symtab.h" -#include "../kernel_compat.h" // Add check Huawei Device - #define KSU_SUPPORT_ADD_TYPE +/* + * Adapt to Huawei HISI kernel without affecting other kernels , + * Huawei 
Hisi Kernel EBITMAP Enable or Disable Flag , + * From ss/ebitmap.h + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) || \ + LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0) +#ifdef HISI_SELINUX_EBITMAP_RO +#define CONFIG_IS_HW_HISI +#endif +#endif + ////////////////////////////////////////////////////// // Declaration ////////////////////////////////////////////////////// -static struct avtab_node *get_avtab_node(struct policydb *db, - struct avtab_key *key, - struct avtab_extended_perms *xperms); +static struct avtab_node *get_avtab_node(struct policydb *db, struct avtab_key *key, + struct avtab_extended_perms *xperms); -static bool add_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *p, int effect, bool invert); +static bool is_redundant_avtab_node(struct avtab_node *node); -static void add_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - struct perm_datum *perm, int effect, bool invert); +static bool remove_avtab_node(struct policydb *db, struct avtab_node *node); -static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - uint16_t low, uint16_t high, int effect, - bool invert); -static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *range, int effect, - bool invert); +static bool add_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *p, int effect, + bool invert); -static bool add_type_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *d, int effect); +static bool add_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, struct class_datum *cls, + struct perm_datum *perm, int effect, bool invert); -static bool add_filename_trans(struct policydb *db, 
const char *s, - const char *t, const char *c, const char *d, - const char *o); +static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, + struct class_datum *cls, uint16_t low, uint16_t high, int effect, bool invert); +static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *range, + int effect, bool invert); + +static bool add_type_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *d, int effect); -static bool add_genfscon(struct policydb *db, const char *fs_name, - const char *path, const char *context); +static bool add_filename_trans(struct policydb *db, const char *s, const char *t, const char *c, const char *d, + const char *o); + +static bool add_genfscon(struct policydb *db, const char *fs_name, const char *path, const char *context); static bool add_type(struct policydb *db, const char *type_name, bool attr); -static bool set_type_state(struct policydb *db, const char *type_name, - bool permissive); +static bool set_type_state(struct policydb *db, const char *type_name, bool permissive); -static void add_typeattribute_raw(struct policydb *db, struct type_datum *type, - struct type_datum *attr); +static void add_typeattribute_raw(struct policydb *db, struct type_datum *type, struct type_datum *attr); -static bool add_typeattribute(struct policydb *db, const char *type, - const char *attr); +static bool add_typeattribute(struct policydb *db, const char *type, const char *attr); ////////////////////////////////////////////////////// // Implementation @@ -70,11 +67,9 @@ static bool add_typeattribute(struct policydb *db, const char *type, // htable is a struct instead of pointer above 5.8.0: // https://elixir.bootlin.com/linux/v5.8-rc1/source/security/selinux/ss/symtab.h #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) -#define ksu_hashtab_for_each(htab, cur) \ - ksu_hash_for_each(htab.htable, htab.size, cur) +#define 
ksu_hashtab_for_each(htab, cur) ksu_hash_for_each(htab.htable, htab.size, cur) #else -#define ksu_hashtab_for_each(htab, cur) \ - ksu_hash_for_each(htab->htable, htab->size, cur) +#define ksu_hashtab_for_each(htab, cur) ksu_hash_for_each(htab->htable, htab->size, cur) #endif // symtab_search is introduced on 5.9.0: @@ -84,8 +79,7 @@ static bool add_typeattribute(struct policydb *db, const char *type, #define symtab_insert(s, name, datum) hashtab_insert((s)->table, name, datum) #endif -#define avtab_for_each(avtab, cur) \ - ksu_hash_for_each(avtab.htable, avtab.nslot, cur); +#define avtab_for_each(avtab, cur) ksu_hash_for_each(avtab.htable, avtab.nslot, cur); static struct avtab_node *get_avtab_node(struct policydb *db, struct avtab_key *key, @@ -126,6 +120,8 @@ static struct avtab_node *get_avtab_node(struct policydb *db, } /* this is used to get the node - insertion is actually unique */ node = avtab_insert_nonunique(&db->te_avtab, key, &avdatum); + if (!node) + return NULL; int grow_size = sizeof(struct avtab_key); grow_size += sizeof(struct avtab_datum); @@ -141,8 +137,93 @@ static struct avtab_node *get_avtab_node(struct policydb *db, return node; } -static bool add_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *p, int effect, bool invert) +static bool is_redundant_avtab_node(struct avtab_node *node) +{ + if (node->key.specified & AVTAB_XPERMS) + return node->datum.u.xperms == NULL; + if (!(node->key.specified & AVTAB_AV)) + return false; + if (node->key.specified & AVTAB_AUDITDENY) + return node->datum.u.data == ~0U; + return node->datum.u.data == 0U; +} + +// 4.1, https://github.com/torvalds/linux/commit/ba39db6e0519aa8362dbda6523ceb69349a18dc3 +// 5.1, https://github.com/torvalds/linux/commit/acdf52d97f824019888422842757013b37441dd1 +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) || LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) || defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY) +static inline 
struct avtab_node *avtab_get_slot(struct avtab *ab, int i) +{ + // htable is ** + // struct avtab_node **htable; + return ab->htable[i]; +} +static inline void avtab_set_slot(struct avtab *ab, int i, struct avtab_node *node) +{ + ab->htable[i] = node; +} +#else +static inline struct avtab_node *avtab_get_slot(struct avtab *ab, int i) +{ + // htable is ** + // this can ret NULL! + struct avtab_node **p = flex_array_get(ab->htable, i); + if (!p) + return NULL; + + return *p; +} +static inline void avtab_set_slot(struct avtab *ab, int i, struct avtab_node *node) +{ + flex_array_put_ptr(ab->htable, i, node, GFP_KERNEL | __GFP_ZERO); +} +#endif + +static bool remove_avtab_node(struct policydb *db, struct avtab_node *node) +{ + int i; + int ret; + int shrink_size = sizeof(struct avtab_key) + sizeof(struct avtab_datum); + struct avtab removed = {}; + struct avtab_node *n; + struct avtab_node *prev; + + ret = avtab_alloc(&removed, 1); + if (ret < 0) + return false; + + for (i = 0; i < db->te_avtab.nslot; i++) { + prev = NULL; + for (n = avtab_get_slot(&db->te_avtab, i); n; prev = n, n = n->next) { + if (n != node) + continue; + + if (prev) + prev->next = n->next; + else + avtab_set_slot(&db->te_avtab, i, n->next); + + if (db->te_avtab.nel > 0) + db->te_avtab.nel--; + + if ((n->key.specified & AVTAB_XPERMS) && n->datum.u.xperms) { + shrink_size += sizeof(u8) + sizeof(u8) + sizeof(u32) * ARRAY_SIZE(n->datum.u.xperms->perms.p); + } + n->next = NULL; + avtab_set_slot(&removed, 0, n); + removed.nel = 1; + avtab_destroy(&removed); + if (db->len >= shrink_size) + db->len -= shrink_size; + return true; + } + } + + avtab_destroy(&removed); + return false; +} + +static bool add_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *p, int effect, + bool invert) { struct type_datum *src = NULL, *tgt = NULL; struct class_datum *cls = NULL; @@ -188,31 +269,27 @@ static bool add_rule(struct policydb *db, const char *s, const char *t, return false; } } - 
add_rule_raw(db, src, tgt, cls, perm, effect, invert); - return true; + return add_rule_raw(db, src, tgt, cls, perm, effect, invert); } -static void add_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - struct perm_datum *perm, int effect, bool invert) +static bool add_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, struct class_datum *cls, + struct perm_datum *perm, int effect, bool invert) { + bool success = true; + if (src == NULL) { struct hashtab_node *node; if (strip_av(effect, invert)) { ksu_hashtab_for_each(db->p_types.table, node) { - add_rule_raw(db, - (struct type_datum *)node->datum, - tgt, cls, perm, effect, invert); + success &= add_rule_raw(db, (struct type_datum *)node->datum, tgt, cls, perm, effect, invert); }; } else { ksu_hashtab_for_each(db->p_types.table, node) { - struct type_datum *type = - (struct type_datum *)(node->datum); + struct type_datum *type = (struct type_datum *)(node->datum); if (type->attribute) { - add_rule_raw(db, type, tgt, cls, perm, - effect, invert); + success &= add_rule_raw(db, type, tgt, cls, perm, effect, invert); } }; } @@ -221,18 +298,14 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src, if (strip_av(effect, invert)) { ksu_hashtab_for_each(db->p_types.table, node) { - add_rule_raw(db, src, - (struct type_datum *)node->datum, - cls, perm, effect, invert); + success &= add_rule_raw(db, src, (struct type_datum *)node->datum, cls, perm, effect, invert); }; } else { ksu_hashtab_for_each(db->p_types.table, node) { - struct type_datum *type = - (struct type_datum *)(node->datum); + struct type_datum *type = (struct type_datum *)(node->datum); if (type->attribute) { - add_rule_raw(db, src, type, cls, perm, - effect, invert); + success &= add_rule_raw(db, src, type, cls, perm, effect, invert); } }; } @@ -240,22 +313,30 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src, struct hashtab_node *node; 
ksu_hashtab_for_each(db->p_classes.table, node) { - add_rule_raw(db, src, tgt, - (struct class_datum *)node->datum, perm, - effect, invert); + success &= add_rule_raw(db, src, tgt, (struct class_datum *)node->datum, perm, effect, invert); } } else { struct avtab_key key; + struct avtab_node *node; + key.source_type = src->value; key.target_type = tgt->value; key.target_class = cls->value; key.specified = effect; - struct avtab_node *node = get_avtab_node(db, &key, NULL); + if (invert && effect != AVTAB_AUDITDENY) { + node = avtab_search_node(&db->te_avtab, &key); + if (!node) + return true; + } else { + node = get_avtab_node(db, &key, NULL); + if (!node) + return false; + } + if (invert) { if (perm) - node->datum.u.data &= - ~(1U << (perm->value - 1)); + node->datum.u.data &= ~(1U << (perm->value - 1)); else node->datum.u.data = 0U; } else { @@ -264,7 +345,11 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src, else node->datum.u.data = ~0U; } + if (is_redundant_avtab_node(node)) + return remove_avtab_node(db, node); } + + return success; } #define ioctl_driver(x) (x >> 8 & 0xFF) @@ -274,40 +359,32 @@ static void add_rule_raw(struct policydb *db, struct type_datum *src, #define xperm_set(x, p) (p[x >> 5] |= (1 << (x & 0x1f))) #define xperm_clear(x, p) (p[x >> 5] &= ~(1 << (x & 0x1f))) -static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, - struct type_datum *tgt, struct class_datum *cls, - uint16_t low, uint16_t high, int effect, - bool invert) +static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, struct type_datum *tgt, + struct class_datum *cls, uint16_t low, uint16_t high, int effect, bool invert) { if (src == NULL) { struct hashtab_node *node; ksu_hashtab_for_each(db->p_types.table, node) { - struct type_datum *type = - (struct type_datum *)(node->datum); + struct type_datum *type = (struct type_datum *)(node->datum); if (type->attribute) { - add_xperm_rule_raw(db, type, tgt, cls, low, - high, 
effect, invert); + add_xperm_rule_raw(db, type, tgt, cls, low, high, effect, invert); } }; } else if (tgt == NULL) { struct hashtab_node *node; ksu_hashtab_for_each(db->p_types.table, node) { - struct type_datum *type = - (struct type_datum *)(node->datum); + struct type_datum *type = (struct type_datum *)(node->datum); if (type->attribute) { - add_xperm_rule_raw(db, src, type, cls, low, - high, effect, invert); + add_xperm_rule_raw(db, src, type, cls, low, high, effect, invert); } }; } else if (cls == NULL) { struct hashtab_node *node; ksu_hashtab_for_each(db->p_classes.table, node) { - add_xperm_rule_raw(db, src, tgt, - (struct class_datum *)(node->datum), - low, high, effect, invert); + add_xperm_rule_raw(db, src, tgt, (struct class_datum *)(node->datum), low, high, effect, invert); }; } else { struct avtab_key key; @@ -330,8 +407,7 @@ static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, } int i; if (xperms.specified == AVTAB_XPERMS_IOCTLDRIVER) { - for (i = ioctl_driver(low); i <= ioctl_driver(high); - ++i) { + for (i = ioctl_driver(low); i <= ioctl_driver(high); ++i) { if (invert) xperm_clear(i, xperms.perms.p); else @@ -354,9 +430,7 @@ static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, datum = &node->datum; if (datum->u.xperms == NULL) { - datum->u.xperms = - (struct avtab_extended_perms *)(kzalloc( - sizeof(xperms), GFP_ATOMIC)); + datum->u.xperms = (struct avtab_extended_perms *)(kzalloc(sizeof(xperms), GFP_ATOMIC)); if (!datum->u.xperms) { pr_err("alloc xperms failed\n"); return; @@ -366,9 +440,8 @@ static void add_xperm_rule_raw(struct policydb *db, struct type_datum *src, } } -static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *range, int effect, - bool invert) +static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *range, + int effect, bool invert) { struct type_datum *src = NULL, *tgt = NULL; struct 
class_datum *cls = NULL; @@ -415,8 +488,7 @@ static bool add_xperm_rule(struct policydb *db, const char *s, const char *t, return true; } -static bool add_type_rule(struct policydb *db, const char *s, const char *t, - const char *c, const char *d, int effect) +static bool add_type_rule(struct policydb *db, const char *s, const char *t, const char *c, const char *d, int effect) { struct type_datum *src, *tgt, *def; struct class_datum *cls; @@ -449,6 +521,8 @@ static bool add_type_rule(struct policydb *db, const char *s, const char *t, key.specified = effect; struct avtab_node *node = get_avtab_node(db, &key, NULL); + if (!node) + return false; node->datum.u.data = def->value; return true; @@ -533,11 +607,9 @@ static bool add_filename_trans(struct policydb *db, const char *s, struct filename_trans_datum *last = NULL; #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) - struct filename_trans_datum *trans = - policydb_filenametr_search(db, &key); + struct filename_trans_datum *trans = policydb_filenametr_search(db, &key); #else - struct filename_trans_datum *trans = - hashtab_search(&db->filename_trans, &key); + struct filename_trans_datum *trans = hashtab_search(&db->filename_trans, &key); #endif while (trans) { if (ebitmap_get_bit(&trans->stypes, src->value - 1)) { @@ -552,17 +624,13 @@ static bool add_filename_trans(struct policydb *db, const char *s, } if (trans == NULL) { - trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans), - GFP_ATOMIC); - struct filename_trans_key *new_key = - (struct filename_trans_key *)kzalloc(sizeof(*new_key), - GFP_ATOMIC); + trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans), GFP_KERNEL); + struct filename_trans_key *new_key = (struct filename_trans_key *)kzalloc(sizeof(*new_key), GFP_KERNEL); *new_key = key; - new_key->name = kstrdup(key.name, GFP_ATOMIC); + new_key->name = kstrdup(key.name, GFP_KERNEL); trans->next = last; trans->otype = def->value; - hashtab_insert(&db->filename_trans, new_key, trans, - 
filenametr_key_params); + hashtab_insert(&db->filename_trans, new_key, trans, filenametr_key_params); } db->compat_filename_trans_count++; @@ -578,42 +646,52 @@ static bool add_filename_trans(struct policydb *db, const char *s, hashtab_search(db->filename_trans, &key); if (trans == NULL) { - trans = (struct filename_trans_datum *)kcalloc(1, sizeof(*trans), - GFP_ATOMIC); + trans = (struct filename_trans_datum *)kcalloc(sizeof(*trans), 1, GFP_KERNEL); if (!trans) { pr_err("add_filename_trans: Failed to alloc datum\n"); return false; } struct filename_trans *new_key = - (struct filename_trans *)kzalloc(sizeof(*new_key), - GFP_ATOMIC); + (struct filename_trans *)kmalloc(sizeof(*new_key), GFP_KERNEL); if (!new_key) { pr_err("add_filename_trans: Failed to alloc new_key\n"); return false; } *new_key = key; - new_key->name = kstrdup(key.name, GFP_ATOMIC); + new_key->name = kstrdup(key.name, GFP_KERNEL); trans->otype = def->value; hashtab_insert(db->filename_trans, new_key, trans); } - return ebitmap_set_bit(&db->filename_trans_ttypes, src->value - 1, 1) == - 0; + return ebitmap_set_bit(&db->filename_trans_ttypes, src->value - 1, 1) == 0; #endif } -static bool add_genfscon(struct policydb *db, const char *fs_name, - const char *path, const char *context) +static bool add_genfscon(struct policydb *db, const char *fs_name, const char *path, const char *context) { return false; } // https://github.com/torvalds/linux/commit/590b9d576caec6b4c46bba49ed36223a399c3fc5#diff-cc9aa90e094e6e0f47bd7300db4f33cf4366b98b55d8753744f31eb69c691016R844-R845 #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) -#define ksu_kvrealloc(p, new_size, _old_size) kvrealloc(p, new_size, GFP_ATOMIC) -#else -#define ksu_kvrealloc(p, new_size, old_size) \ - ksu_compat_kvrealloc(p, old_size, new_size, GFP_ATOMIC) +#define ksu_kvrealloc(p, new_size, _old_size) kvrealloc(p, new_size, GFP_KERNEL) +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) || 
defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY) +// https://cs.android.com/android/_/android/kernel/common/+/f5f3e54f811679761c33526e695bd296190faade +// Some 5.10 kernel don't have this backport, so copy one. +static void *ksu_kvrealloc_compat(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +{ + void *newp; + + if (oldsize >= newsize) + return (void *)p; + newp = kvmalloc(newsize, flags); + if (!newp) + return NULL; + __builtin_memcpy(newp, p, oldsize); // bypass fortify_source, kasan + kvfree(p); + return newp; +} +#define ksu_kvrealloc(p, new_size, old_size) ksu_kvrealloc_compat(p, old_size, new_size, GFP_KERNEL) #endif static bool add_type(struct policydb *db, const char *type_name, bool attr) @@ -627,7 +705,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) u32 value = ++db->p_types.nprim; type = (struct type_datum *)kzalloc(sizeof(struct type_datum), - GFP_ATOMIC); + GFP_KERNEL); if (!type) { pr_err("add_type: alloc type_datum failed.\n"); return false; @@ -637,7 +715,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) type->value = value; type->attribute = attr; - char *key = kstrdup(type_name, GFP_ATOMIC); + char *key = kstrdup(type_name, GFP_KERNEL); if (!key) { pr_err("add_type: alloc key failed.\n"); return false; @@ -648,11 +726,11 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) return false; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) struct ebitmap *new_type_attr_map_array = ksu_kvrealloc(db->type_attr_map_array, - value * sizeof(struct ebitmap), - (value - 1) * sizeof(struct ebitmap)); + value * sizeof(struct ebitmap), + (value - 1) * sizeof(struct ebitmap)); if (!new_type_attr_map_array) { pr_err("add_type: alloc type_attr_map_array failed\n"); @@ -661,8 +739,8 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) struct type_datum 
**new_type_val_to_struct = ksu_kvrealloc(db->type_val_to_struct, - sizeof(*db->type_val_to_struct) * value, - sizeof(*db->type_val_to_struct) * (value - 1)); + sizeof(*db->type_val_to_struct) * value, + sizeof(*db->type_val_to_struct) * (value - 1)); if (!new_type_val_to_struct) { pr_err("add_type: alloc type_val_to_struct failed\n"); @@ -671,8 +749,8 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) char **new_val_to_name_types = ksu_kvrealloc(db->sym_val_to_name[SYM_TYPES], - sizeof(char *) * value, - sizeof(char *) * (value - 1)); + sizeof(char *) * value, + sizeof(char *) * (value - 1)); if (!new_val_to_name_types) { pr_err("add_type: alloc val_to_name failed\n"); return false; @@ -695,6 +773,55 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) } return true; + +#elif defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY) + struct ebitmap *new_type_attr_map_array = + ksu_kvrealloc(db->type_attr_map_array, + value * sizeof(struct ebitmap), + (value - 1) * sizeof(struct ebitmap)); + + if (!new_type_attr_map_array) { + pr_err("add_type: alloc type_attr_map_array failed\n"); + return false; + } + + struct type_datum **new_type_val_to_struct = + ksu_kvrealloc(db->type_val_to_struct_array, + sizeof(*db->type_val_to_struct_array) * value, + sizeof(*db->type_val_to_struct_array) * (value - 1)); + + if (!new_type_val_to_struct) { + pr_err("add_type: alloc type_val_to_struct failed\n"); + return false; + } + + char **new_val_to_name_types = + ksu_kvrealloc(db->sym_val_to_name[SYM_TYPES], + sizeof(char *) * value, + sizeof(char *) * (value - 1)); + if (!new_val_to_name_types) { + pr_err("add_type: alloc val_to_name failed\n"); + return false; + } + + db->type_attr_map_array = new_type_attr_map_array; + ebitmap_init(&db->type_attr_map_array[value - 1]); + ebitmap_set_bit(&db->type_attr_map_array[value - 1], value - 1, 1); + + db->type_val_to_struct_array = new_type_val_to_struct; + db->type_val_to_struct_array[value - 1] = type; + + 
db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types; + db->sym_val_to_name[SYM_TYPES][value - 1] = key; + + int i; + for (i = 0; i < db->p_roles.nprim; ++i) { + ebitmap_set_bit(&db->role_val_to_struct[i]->types, value - 1, + 1); + } + + return true; + #elif defined(CONFIG_IS_HW_HISI) /* * Huawei use type_attr_map and type_val_to_struct. @@ -702,12 +829,12 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) */ size_t new_size = sizeof(struct ebitmap) * db->p_types.nprim; struct ebitmap *new_type_attr_map = - (krealloc(db->type_attr_map, new_size, GFP_ATOMIC)); + (krealloc(db->type_attr_map, new_size, GFP_KERNEL)); struct type_datum **new_type_val_to_struct = krealloc(db->type_val_to_struct, sizeof(*db->type_val_to_struct) * db->p_types.nprim, - GFP_ATOMIC); + GFP_KERNEL); if (!new_type_attr_map) { pr_err("add_type: alloc type_attr_map failed\n"); @@ -749,15 +876,15 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) // flex_array is not extensible, we need to create a new bigger one instead struct flex_array *new_type_attr_map_array = flex_array_alloc(sizeof(struct ebitmap), db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); struct flex_array *new_type_val_to_struct = flex_array_alloc(sizeof(struct type_datum *), db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); struct flex_array *new_val_to_name_types = flex_array_alloc(sizeof(char *), db->symtab[SYM_TYPES].nprim, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); if (!new_type_attr_map_array) { pr_err("add_type: alloc type_attr_map_array failed\n"); @@ -776,20 +903,20 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) // preallocate so we don't have to worry about the put ever failing if (flex_array_prealloc(new_type_attr_map_array, 0, db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO)) { + GFP_KERNEL | __GFP_ZERO)) { pr_err("add_type: prealloc type_attr_map_array failed\n"); return 
false; } if (flex_array_prealloc(new_type_val_to_struct, 0, db->p_types.nprim, - GFP_ATOMIC | __GFP_ZERO)) { + GFP_KERNEL | __GFP_ZERO)) { pr_err("add_type: prealloc type_val_to_struct_array failed\n"); return false; } if (flex_array_prealloc(new_val_to_name_types, 0, db->symtab[SYM_TYPES].nprim, - GFP_ATOMIC | __GFP_ZERO)) { + GFP_KERNEL | __GFP_ZERO)) { pr_err("add_type: prealloc val_to_name_types failed\n"); return false; } @@ -801,14 +928,14 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) old_elem = flex_array_get(db->type_attr_map_array, j); if (old_elem) flex_array_put(new_type_attr_map_array, j, old_elem, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } for (j = 0; j < db->type_val_to_struct_array->total_nr_elements; j++) { old_elem = flex_array_get_ptr(db->type_val_to_struct_array, j); if (old_elem) flex_array_put_ptr(new_type_val_to_struct, j, old_elem, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } for (j = 0; j < db->symtab[SYM_TYPES].nprim; j++) { @@ -816,7 +943,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) flex_array_get_ptr(db->sym_val_to_name[SYM_TYPES], j); if (old_elem) flex_array_put_ptr(new_val_to_name_types, j, old_elem, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } // store the pointer of old flex arrays first, when assigning new ones we @@ -839,7 +966,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) flex_array_free(old_fa); } flex_array_put_ptr(db->type_val_to_struct_array, value - 1, type, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); old_fa = db->sym_val_to_name[SYM_TYPES]; db->sym_val_to_name[SYM_TYPES] = new_val_to_name_types; @@ -847,7 +974,7 @@ static bool add_type(struct policydb *db, const char *type_name, bool attr) flex_array_free(old_fa); } flex_array_put_ptr(db->sym_val_to_name[SYM_TYPES], value - 1, key, - GFP_ATOMIC | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); int i; for (i = 0; i < 
db->p_roles.nprim; ++i) { @@ -894,7 +1021,7 @@ static bool set_type_state(struct policydb *db, const char *type_name, static void add_typeattribute_raw(struct policydb *db, struct type_datum *type, struct type_datum *attr) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) || defined(KSU_TYPE_VAL_TO_STRUCT) || defined(KSU_TYPE_VAL_TO_STRUCT_ARRAY) struct ebitmap *sattr = &db->type_attr_map_array[type->value - 1]; #elif defined(CONFIG_IS_HW_HISI) /* @@ -1060,3 +1187,92 @@ bool ksu_genfscon(struct policydb *db, const char *fs_name, const char *path, { return add_genfscon(db, fs_name, path, ctx); } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#include "ss/avtab.h" +#include "ss/constraint.h" +#include "ss/ebitmap.h" +#include "ss/hashtab.h" +#include "ss/policydb.h" +#include "ss/services.h" + +void ksu_destroy_sepolicy(struct selinux_policy *pol) +{ + policydb_destroy(&pol->policydb); + kfree(pol); +} + +struct selinux_policy *ksu_dup_sepolicy(struct selinux_policy *old_pol) +{ + int ret; + size_t len; + struct selinux_policy *new_pol; + void *data; + struct policy_file fp; + + len = old_pol->policydb.len; + data = vmalloc(len); + if (!data) { + pr_err("alloc policy len %ld\n", len); + ret = -ENOMEM; + goto out_free_data; + } + + fp.data = data; + fp.len = len; + + ret = policydb_write(&old_pol->policydb, &fp); + if (ret) { + pr_err("sepolicy: policydb_write: %d\n", ret); + goto out_free_data; + } + + // https://android-review.googlesource.com/c/kernel/common/+/3009995/11/security/selinux/ss/policydb.c + // fixup config + // 4*2+8+4 + static const size_t kConfigOff = 20; + if (len >= kConfigOff + sizeof(u32)) { + u32 *config_ptr = (u32 *)((unsigned long)data + kConfigOff); + pr_info("old config: %u\n", *config_ptr); + if (old_pol->policydb.android_netlink_route) { + pr_info("adding POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE\n"); + *config_ptr |= POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE; + } + if 
(old_pol->policydb.android_netlink_getneigh) { + pr_info("adding POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH\n"); + *config_ptr |= POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH; + } + pr_info("new config: %u\n", *config_ptr); + } + + new_pol = kmemdup(old_pol, sizeof(*old_pol), GFP_KERNEL); + if (!new_pol) { + ret = -ENOMEM; + pr_err("sepolicy: dup old pol\n"); + goto out_free_data; + } + memset(&new_pol->policydb, 0, sizeof(new_pol->policydb)); + + // rewind fp + fp.data = data; + fp.len = len; + + ret = policydb_read(&new_pol->policydb, &fp); + if (ret) { + pr_err("sepolicy: policydb_read: %d\n", ret); + goto out_free_policydb; + } + new_pol->policydb.len = old_pol->policydb.len; + kvfree(data); + + return new_pol; + +out_free_policydb: + kfree(new_pol); + +out_free_data: + kvfree(data); + + return ERR_PTR(ret); +} +#endif diff --git a/drivers/kernelsu/selinux/sepolicy.h b/drivers/kernelsu/selinux/sepolicy.h index 675d1499e46d..8ae79e3dc3b3 100644 --- a/drivers/kernelsu/selinux/sepolicy.h +++ b/drivers/kernelsu/selinux/sepolicy.h @@ -1,10 +1,14 @@ #ifndef __KSU_H_SEPOLICY #define __KSU_H_SEPOLICY -#include - #include "ss/policydb.h" +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +struct selinux_policy *ksu_dup_sepolicy(struct selinux_policy *old_pol); + +void ksu_destroy_sepolicy(struct selinux_policy *orig); +#endif + // Operation on types bool ksu_type(struct policydb *db, const char *name, const char *attr); bool ksu_attribute(struct policydb *db, const char *name); diff --git a/drivers/kernelsu/setuid_hook.c b/drivers/kernelsu/setuid_hook.c deleted file mode 100644 index c15123101c45..000000000000 --- a/drivers/kernelsu/setuid_hook.c +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include -#endif -#include -#include -#include -#include - -#include "allowlist.h" -#include "setuid_hook.h" -#include "klog.h" // IWYU pragma: keep -#include 
"manager.h" -#include "selinux/selinux.h" -#include "supercalls.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "syscall_handler.h" -#endif -#include "kernel_umount.h" -#include "kernel_compat.h" - -static void ksu_install_manager_fd_tw_func(struct callback_head *cb) -{ - ksu_install_fd(); - kfree(cb); -} - -static void do_install_manager_fd(void) -{ - struct callback_head *cb = kzalloc(sizeof(*cb), GFP_ATOMIC); - if (!cb) - return; - - cb->func = ksu_install_manager_fd_tw_func; - if (task_work_add(current, cb, TWA_RESUME)) { - kfree(cb); - pr_warn("install manager fd add task_work failed\n"); - } -} - -// force_sig kcompat, TODO: move it out of core_hook.c -// https://elixir.bootlin.com/linux/v5.3-rc1/source/kernel/signal.c#L1613 -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0) -#define send_sig(sig) force_sig(sig) -#else -#define send_sig(sig) force_sig(sig, current) -#endif - -extern void disable_seccomp(void); -int ksu_handle_setuid_common(uid_t new_uid, uid_t old_uid, uid_t new_euid) -{ -#ifdef CONFIG_KSU_DEBUG - pr_info("handle_setuid from %d to %d\n", old_uid, new_uid); -#endif - - if (likely(ksu_is_manager_appid_valid()) && - unlikely(ksu_get_manager_appid() == new_uid % PER_USER_RANGE)) { - disable_seccomp(); -#ifdef CONFIG_KSU_SYSCALL_HOOK - ksu_set_task_tracepoint_flag(current); -#endif - pr_info("install fd for manager (uid=%d)\n", new_uid); - do_install_manager_fd(); - return 0; - } - - if (ksu_is_allow_uid_for_current(new_uid)) { - disable_seccomp(); -#ifdef CONFIG_KSU_SYSCALL_HOOK - ksu_set_task_tracepoint_flag(current); - } else { - ksu_clear_task_tracepoint_flag_if_needed(current); -#endif - } - - // Handle kernel umount - ksu_handle_umount(old_uid, new_uid); - - return 0; -} - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0) && \ - defined(CONFIG_KSU_MANUAL_HOOK)) -int ksu_handle_setresuid(uid_t ruid, uid_t euid, uid_t suid) -{ - if (!is_zygote(current_cred())) { -#ifdef CONFIG_KSU_DEBUG - pr_info("setresuid: disallow non zygote sid!\n"); 
-#endif - return 0; - } - return ksu_handle_setuid_common(ruid, current_uid().val, euid); -} -#endif - -void ksu_setuid_hook_init(void) -{ - ksu_kernel_umount_init(); -} - -void ksu_setuid_hook_exit(void) -{ - pr_info("ksu setuid exit\n"); - ksu_kernel_umount_exit(); -} diff --git a/drivers/kernelsu/setuid_hook.h b/drivers/kernelsu/setuid_hook.h deleted file mode 100644 index 7c4eda71c1c0..000000000000 --- a/drivers/kernelsu/setuid_hook.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __KSU_H_KSU_SETUID_HOOK -#define __KSU_H_KSU_SETUID_HOOK - -#include -#include - -void ksu_setuid_hook_init(void); -void ksu_setuid_hook_exit(void); - -int ksu_handle_setuid_common(uid_t new_uid, uid_t old_uid, uid_t new_euid); - -#endif diff --git a/drivers/kernelsu/shim.c b/drivers/kernelsu/shim.c deleted file mode 100644 index 75d5542a87aa..000000000000 --- a/drivers/kernelsu/shim.c +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include -#include - -// unity build idea from backslashxx, not full, we only use it for shim ksu hooks - -#include "allowlist.h" -#include "arch.h" -#include "kp_hook.h" -#include "ksu.h" -#include "klog.h" // IWYU pragma: keep -#include "ksud.h" -#include "kernel_compat.h" -#include "kp_util.h" -#include "supercalls.h" -#include "sucompat.h" -#include "setuid_hook.h" -#include "syscall_handler.h" -#include "selinux/selinux.h" -#include "throne_tracker.h" - -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "pkg_observer.c" -#include "kp_hook.c" -#include "kp_util.c" -#include "syscall_handler.c" -#endif - -#if (defined(CONFIG_KSU_MANUAL_HOOK) && \ - LINUX_VERSION_CODE < KERNEL_VERSION(6, 8, 0)) -#include "lsm_hook.c" -#elif (defined(CONFIG_KSU_MANUAL_HOOK) && \ - LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0)) -// + ksu_handle_setresuid hook for 6.8+ -#include "pkg_observer.c" -#endif diff --git a/drivers/kernelsu/sucompat.c b/drivers/kernelsu/sucompat.c deleted file mode 100644 index 2bb1a9fba702..000000000000 --- a/drivers/kernelsu/sucompat.c +++ /dev/null @@ -1,217 +0,0 @@ 
-#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) -#include -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -#include -#else -#include -#endif -#include - -#include "allowlist.h" -#include "feature.h" -#include "klog.h" // IWYU pragma: keep -#include "ksud.h" -#include "kernel_compat.h" -#include "sucompat.h" -#include "app_profile.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "kp_util.h" -#endif - -#define SU_PATH "/system/bin/su" -#define SH_PATH "/system/bin/sh" - -bool ksu_su_compat_enabled __read_mostly = true; - -static const char su_path[] = SU_PATH; -static const char ksud_path[] = KSUD_PATH; -static const char sh_path[] = SH_PATH; - -static int su_compat_feature_get(u64 *value) -{ - *value = ksu_su_compat_enabled ? 1 : 0; - return 0; -} - -static int su_compat_feature_set(u64 value) -{ - bool enable = value != 0; - ksu_su_compat_enabled = enable; - pr_info("su_compat: set to %d\n", enable); - return 0; -} - -static const struct ksu_feature_handler su_compat_handler = { - .feature_id = KSU_FEATURE_SU_COMPAT, - .name = "su_compat", - .get_handler = su_compat_feature_get, - .set_handler = su_compat_feature_set, -}; - -static void __user *userspace_stack_buffer(const void *d, size_t len) -{ - // To avoid having to mmap a page in userspace, just write below the stack - // pointer. - char __user *p = (void __user *)current_user_stack_pointer() - len; - - return copy_to_user(p, d, len) ? 
NULL : p; -} - -static char __user *sh_user_path(void) -{ - return userspace_stack_buffer(sh_path, sizeof(sh_path)); -} - -static char __user *ksud_user_path(void) -{ - return userspace_stack_buffer(ksud_path, sizeof(ksud_path)); -} - -static inline bool is_su_allowed(void) -{ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (!ksu_su_compat_enabled) - return false; -#endif -#ifdef CONFIG_SECCOMP - if (likely(!!current->seccomp.mode)) - return false; -#endif - if (!ksu_is_allow_uid_for_current(current_uid().val)) - return false; - - return true; -} - -static int ksu_sucompat_user_common(const char __user **filename_user, - const char *syscall_name, - const bool escalate) -{ - char path[sizeof(su_path) + 1]; - - if (unlikely(!filename_user)) - return 0; - if (!is_su_allowed()) - return 0; - - memset(path, 0, sizeof(path)); - ksu_strncpy_from_user_nofault(path, *filename_user, sizeof(path)); - - if (memcmp(path, su_path, sizeof(su_path))) - return 0; - - if (escalate) { - pr_info("%s su found\n", syscall_name); - *filename_user = ksud_user_path(); - escape_with_root_profile(); // escalate !! 
- } else { - pr_info("%s su->sh!\n", syscall_name); - *filename_user = sh_user_path(); - } - - return 0; -} - -#ifdef CONFIG_KSU_SYSCALL_HOOK -static int do_execve_sucompat_for_kp(const char __user **filename_user) -{ - char path[sizeof(su_path) + 1]; - - if (unlikely(!filename_user)) - return 0; - if (!is_su_allowed()) - return 0; - if (!ksu_retry_filename_access(filename_user, path, sizeof(path), true)) - return 0; - if (likely(memcmp(path, su_path, sizeof(su_path)))) - return 0; - - pr_info("sys_execve su found\n"); - *filename_user = ksud_user_path(); - - escape_with_root_profile(); - - return 0; -} -#define handle_execve_sucompat(filename_ptr) \ - (do_execve_sucompat_for_kp(filename_ptr)) -#else -#define handle_execve_sucompat(filename_ptr) \ - (ksu_sucompat_user_common(filename_ptr, "sys_execve", true)) -#endif - -int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, - int *__unused_flags) -{ - return ksu_sucompat_user_common(filename_user, "faccessat", false); -} - -int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags) -{ - return ksu_sucompat_user_common(filename_user, "newfstatat", false); -} - -int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, - void *__never_use_argv, void *__never_use_envp, - int *__never_use_flags) -{ - return handle_execve_sucompat(filename_user); -} - -int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr, - void *__never_use_argv, void *__never_use_envp, - int *__never_use_flags) -{ - struct filename *filename; - - if (unlikely(!filename_ptr)) - return 0; - if (!is_su_allowed()) - return 0; - - filename = *filename_ptr; - if (IS_ERR(filename)) - return 0; - if (likely(memcmp(filename->name, su_path, sizeof(su_path)))) - return 0; - - pr_info("do_execveat_common su found\n"); - memcpy((void *)filename->name, ksud_path, sizeof(ksud_path)); - - escape_with_root_profile(); - - return 0; -} - -int ksu_handle_execveat(int *fd, struct filename 
**filename_ptr, void *argv, - void *envp, int *flags) -{ - return ksu_handle_execveat_ksud(fd, filename_ptr, argv, envp, flags); -} - -// dead code: devpts handling -int __maybe_unused ksu_handle_devpts(struct inode *inode) -{ - return 0; -} - -// sucompat: permitted process can execute 'su' to gain root access. -void ksu_sucompat_init(void) -{ - if (ksu_register_feature_handler(&su_compat_handler)) { - pr_err("Failed to register su_compat feature handler\n"); - } -} - -void ksu_sucompat_exit(void) -{ - ksu_unregister_feature_handler(KSU_FEATURE_SU_COMPAT); -} diff --git a/drivers/kernelsu/sucompat.h b/drivers/kernelsu/sucompat.h deleted file mode 100644 index de4bcfe037fa..000000000000 --- a/drivers/kernelsu/sucompat.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __KSU_H_SUCOMPAT -#define __KSU_H_SUCOMPAT -#include - -extern bool ksu_su_compat_enabled; - -void ksu_sucompat_init(void); -void ksu_sucompat_exit(void); - -// Handler functions exported for hook_manager -int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, - int *__unused_flags); -int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags); -int ksu_handle_execve_sucompat(int *fd, const char __user **filename_user, - void *__never_use_argv, void *__never_use_envp, - int *__never_use_flags); -#endif diff --git a/drivers/kernelsu/sulog/event.c b/drivers/kernelsu/sulog/event.c new file mode 100644 index 000000000000..95f532979df5 --- /dev/null +++ b/drivers/kernelsu/sulog/event.c @@ -0,0 +1,267 @@ +#define KSU_SULOG_MAX_QUEUED 256U +#define KSU_SULOG_MAX_PAYLOAD_LEN 2048U +#define KSU_SULOG_MAX_ARG_STRINGS 0x7FFFFFFF +#define KSU_SULOG_MAX_ARG_CHUNK 256U +#define KSU_SULOG_MAX_FILENAME_LEN 256U + +static struct ksu_event_queue sulog_queue; + +struct ksu_sulog_pending_event { + __u16 event_type; + void *payload; + __u32 payload_len; +}; + +struct ksu_sulog_identity { + __u32 uid; + __u32 euid; +}; + +static void ksu_sulog_fill_task_info(struct ksu_sulog_event *event, 
__u16 event_type, int retval) +{ + event->version = KSU_SULOG_EVENT_VERSION; + event->event_type = event_type; + event->retval = retval; + event->pid = task_pid_nr(current); + event->tgid = task_tgid_nr(current); + event->ppid = task_ppid_nr(current); + event->uid = current_uid().val; + event->euid = current_euid().val; + get_task_comm(event->comm, current); +} + +static void ksu_sulog_set_identity(struct ksu_sulog_event *event, const struct ksu_sulog_identity *identity) +{ + if (!identity) + return; + + event->uid = identity->uid; + event->euid = identity->euid; +} + +static struct ksu_sulog_pending_event *ksu_sulog_capture(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp) +{ + struct ksu_sulog_pending_event *pending = NULL; + struct ksu_sulog_event *event; + void *payload = NULL; + __u32 payload_len; + __u32 filename_len; + __u32 argv_len; + __u32 remaining; + char *filename_buf; + bool should_skip_copy = false; + + if (!ksu_sulog_is_enabled()) + return NULL; + + if (event_type == KSU_SULOG_EVENT_IOCTL_GRANT_ROOT || event_type == KSU_SULOG_EVENT_SUCOMPAT) { + filename_len = 0; + argv_len = 0; + should_skip_copy = true; + goto alloc; + } + + if (!bprm_argv) + return NULL; + + if (!bprm_argv_len) + return NULL; + + if (bprm_argv_len <= 0) + return NULL; + +alloc: + pending = kzalloc(sizeof(*pending), gfp); + if (!pending) + goto out_drop; + + payload = kzalloc(KSU_SULOG_MAX_PAYLOAD_LEN, gfp); + if (!payload) + goto out_free_pending; + + event = payload; + ksu_sulog_fill_task_info(event, event_type, 0); + + if (should_skip_copy) + goto skip_copy; + + remaining = KSU_SULOG_MAX_PAYLOAD_LEN - sizeof(*event); + filename_buf = (char *)payload + sizeof(*event); + + size_t actual_copy_len = bprm_argv_len; + + if (bprm_argv_len > remaining - 1) + actual_copy_len = remaining - 1 ; + + memcpy(filename_buf, bprm_argv, actual_copy_len); + filename_buf[actual_copy_len] = '\0'; + + filename_len = strlen(filename_buf) + 1 ; // argv0 + null terminator + + if 
(actual_copy_len > filename_len) + argv_len = actual_copy_len - (filename_len); + else + argv_len = 0; + +skip_copy: + event->filename_len = filename_len; + event->argv_len = argv_len; + + payload_len = (__u32)sizeof(*event) + filename_len + argv_len; + + // unlikely + if (payload_len > KSU_SULOG_MAX_PAYLOAD_LEN || (__u32)sizeof(*event) > payload_len) + goto out_free_payload; + + pending->event_type = event_type; + pending->payload = payload; + pending->payload_len = payload_len; + return pending; + +out_free_payload: + kfree(payload); +out_free_pending: + kfree(pending); +out_drop: + ksu_event_queue_drop(&sulog_queue); + return NULL; +} + +static struct ksu_sulog_pending_event *ksu_sulog_capture_grant_root(const struct ksu_sulog_identity *identity, gfp_t gfp) +{ + struct ksu_sulog_pending_event *pending; + struct ksu_sulog_event *event; + + pending = ksu_sulog_capture(KSU_SULOG_EVENT_IOCTL_GRANT_ROOT, NULL, NULL, gfp); + if (!pending) + return NULL; + + event = pending->payload; + ksu_sulog_set_identity(event, identity); + return pending; +} + +int ksu_sulog_events_init(void) +{ + ksu_event_queue_init(&sulog_queue, KSU_SULOG_MAX_QUEUED, KSU_SULOG_MAX_PAYLOAD_LEN); + return 0; +} + +void ksu_sulog_events_exit(void) +{ + ksu_event_queue_destroy(&sulog_queue); +} + +static void ksu_sulog_free_pending(struct ksu_sulog_pending_event *pending) +{ + if (!pending) + return; + kfree(pending->payload); + kfree(pending); +} + +void ksu_sulog_emit_pending(struct ksu_sulog_pending_event *pending, int retval, gfp_t gfp) +{ + struct ksu_sulog_event *event; + + if (!pending) + return; + + event = pending->payload; + event->retval = retval; + ksu_event_queue_push(&sulog_queue, pending->event_type, 0, pending->payload, pending->payload_len, gfp); + ksu_sulog_free_pending(pending); +} + +static int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp) +{ + if (!ksu_sulog_is_enabled()) + return 0; + + struct ksu_sulog_pending_event *pending; + struct 
ksu_sulog_identity identity = { + .uid = uid, + .euid = euid, + }; + + pending = ksu_sulog_capture_grant_root(&identity, gfp); + if (!pending) + return 0; + + ksu_sulog_emit_pending(pending, retval, gfp); + return 0; +} + +static int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp) +{ + if (!ksu_sulog_is_enabled()) + return 0; + + struct ksu_sulog_pending_event *pending; + + pending = ksu_sulog_capture(event_type, bprm_argv, bprm_argv_len, gfp); + if (!pending) + return 0; + + ksu_sulog_emit_pending(pending, 0, gfp); + return 0; +} + +static void ksu_sulog_emit_bprm(const char *filename) +{ + if (!ksu_sulog_is_enabled()) + return; + + // maybe tag the process instead? + if (!is_ksu_domain()) + return; + + if (!current->mm) + return; + + unsigned long arg_start = current->mm->arg_start; + unsigned long arg_end = current->mm->arg_end; + size_t arg_len = arg_end - arg_start; + + if (arg_len <= 0) + return; + +#define ARGV_MAX_BPRM 128 + char args[ARGV_MAX_BPRM] = {0}; + + size_t argv_copy_len = (arg_len > ARGV_MAX_BPRM) ? 
ARGV_MAX_BPRM : arg_len; + + // we cant use strncpy on here, else it will truncate once it sees \0 + if (ksu_copy_from_user_retry(args, (void __user *)arg_start, argv_copy_len)) + return; + + args[argv_copy_len - 1] = '\0'; + + // we grab strlen of argv0 as that needs to be kept as \0, basically to skip it + size_t argv0_len = strnlen(args, argv_copy_len); + char *buf = args + argv0_len + 1; + +flatten: + if (buf >= args + argv_copy_len - 1) + goto flatten_done; + + int len = strlen(buf); + if (!len) + goto flatten_done; + + *(buf + len) = ' '; + buf = buf + len + 1; + + if (buf - args < argv_copy_len - argv0_len - 1) + goto flatten; + +flatten_done: + // this should look like + // /system/bin/sh\0-c sh -c id + ksu_sulog_emit(KSU_SULOG_EVENT_ROOT_EXECVE, args, argv_copy_len, GFP_KERNEL); +} + +struct ksu_event_queue *ksu_sulog_get_queue(void) +{ + return &sulog_queue; +} diff --git a/drivers/kernelsu/sulog/event.h b/drivers/kernelsu/sulog/event.h new file mode 100644 index 000000000000..92563ded6d10 --- /dev/null +++ b/drivers/kernelsu/sulog/event.h @@ -0,0 +1,18 @@ +#ifndef __KSU_H_SULOG_EVENT +#define __KSU_H_SULOG_EVENT + +struct ksu_event_queue; +struct ksu_sulog_pending_event; + +int ksu_sulog_events_init(void); +void ksu_sulog_events_exit(void); + +void ksu_sulog_emit_pending(struct ksu_sulog_pending_event *pending, int retval, gfp_t gfp); + +static int ksu_sulog_emit_grant_root(int retval, __u32 uid, __u32 euid, gfp_t gfp); +static int ksu_sulog_emit(__u16 event_type, const char *bprm_argv, size_t bprm_argv_len, gfp_t gfp); +static void ksu_sulog_emit_bprm(const char *filename); + +struct ksu_event_queue *ksu_sulog_get_queue(void); + +#endif diff --git a/drivers/kernelsu/sulog/fd.c b/drivers/kernelsu/sulog/fd.c new file mode 100644 index 000000000000..70da685e73ea --- /dev/null +++ b/drivers/kernelsu/sulog/fd.c @@ -0,0 +1,83 @@ +static DEFINE_MUTEX(ksu_sulog_fd_lock); +static bool ksu_sulog_fd_active; + +static ssize_t ksu_sulog_read(struct file *file, char 
__user *buf, size_t count, loff_t *ppos) +{ + return ksu_event_queue_read(ksu_sulog_get_queue(), buf, count, file->f_flags); +} + +static unsigned __bitwise ksu_sulog_poll(struct file *file, poll_table *wait) +{ + return ksu_event_queue_poll(ksu_sulog_get_queue(), file, wait); +} + +static int ksu_sulog_release(struct inode *inode, struct file *file) +{ + mutex_lock(&ksu_sulog_fd_lock); + ksu_sulog_fd_active = false; + mutex_unlock(&ksu_sulog_fd_lock); + + pr_info("sulog: fd released\n"); + return 0; +} + +static const struct file_operations ksu_sulog_fops = { + .owner = THIS_MODULE, + .read = ksu_sulog_read, + .poll = ksu_sulog_poll, + .release = ksu_sulog_release, + .llseek = noop_llseek, +}; + +int ksu_install_sulog_fd(void) +{ + struct file *filp; + int fd; + + mutex_lock(&ksu_sulog_fd_lock); + + if (ksu_sulog_fd_active) { + fd = -EBUSY; + goto out_unlock; + } + + if (READ_ONCE(ksu_sulog_get_queue()->closed)) { + fd = -EPIPE; + goto out_unlock; + } + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + goto out_unlock; + + filp = anon_inode_getfile("[ksu_sulog]", &ksu_sulog_fops, NULL, O_RDONLY | O_CLOEXEC); + if (IS_ERR(filp)) { + put_unused_fd(fd); + fd = PTR_ERR(filp); + goto out_unlock; + } + + ksu_sulog_fd_active = true; + fd_install(fd, filp); + pr_info("sulog: fd installed %d for pid %d\n", fd, current->pid); + +out_unlock: + mutex_unlock(&ksu_sulog_fd_lock); + return fd; +} + +void ksu_sulog_fd_init(void) +{ + mutex_lock(&ksu_sulog_fd_lock); + ksu_sulog_fd_active = false; + mutex_unlock(&ksu_sulog_fd_lock); +} + +void ksu_sulog_fd_exit(void) +{ + mutex_lock(&ksu_sulog_fd_lock); + ksu_sulog_fd_active = false; + mutex_unlock(&ksu_sulog_fd_lock); + + ksu_event_queue_close(ksu_sulog_get_queue()); +} diff --git a/drivers/kernelsu/sulog/fd.h b/drivers/kernelsu/sulog/fd.h new file mode 100644 index 000000000000..6a117fedc0a9 --- /dev/null +++ b/drivers/kernelsu/sulog/fd.h @@ -0,0 +1,8 @@ +#ifndef __KSU_H_SULOG_FD +#define __KSU_H_SULOG_FD + +int 
ksu_install_sulog_fd(void); +void ksu_sulog_fd_init(void); +void ksu_sulog_fd_exit(void); + +#endif diff --git a/drivers/kernelsu/supercall/dispatch.c b/drivers/kernelsu/supercall/dispatch.c new file mode 100644 index 000000000000..2ea7d8b4cbff --- /dev/null +++ b/drivers/kernelsu/supercall/dispatch.c @@ -0,0 +1,729 @@ +static int do_grant_root(void __user *arg) +{ + int ret; + __u32 audit_uid = current_uid().val; + __u32 audit_euid = current_euid().val; + + // we already check uid above on allowed_for_su() + + write_sulog('i'); // log ioctl escalation + + pr_info("allow root for: %d\n", audit_uid); + ret = escape_with_root_profile(); + +#ifdef CONFIG_KSU_FEATURE_SULOG + ksu_sulog_emit_grant_root(ret, audit_uid, audit_euid, GFP_KERNEL); +#endif + return ret; +} + +static uint32_t ksuver_override = 0; +static uint32_t ksuflags_override = 0; + +static int do_get_info(void __user *arg) +{ + struct ksu_get_info_cmd cmd = {.version = KERNEL_SU_VERSION, .flags = 0}; + + // NOTE: we do not have LKM support so we don't bother with its flags or late-load + if (is_manager()) { + cmd.flags |= KSU_GET_INFO_FLAG_MANAGER; + } + cmd.features = KSU_FEATURE_MAX; + + if (ksuver_override) + cmd.version = ksuver_override; + + if (ksuflags_override) + cmd.flags = ksuflags_override; + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("get_version: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_report_event(void __user *arg) +{ + struct ksu_report_event_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + switch (cmd.event) { + case EVENT_POST_FS_DATA: { + static bool post_fs_data_lock = false; + if (!post_fs_data_lock) { + post_fs_data_lock = true; + pr_info("post-fs-data triggered\n"); + on_post_fs_data(); + } + break; + } + case EVENT_BOOT_COMPLETED: { + static bool boot_complete_lock = false; + if (!boot_complete_lock) { + boot_complete_lock = true; + pr_info("boot_complete triggered\n"); + on_boot_completed(); + 
} + break; + } + case EVENT_MODULE_MOUNTED: { + ksu_module_mounted = true; + pr_info("module mounted!\n"); + on_module_mounted(); + break; + } + default: + break; + } + + return 0; +} + +static int do_set_sepolicy(void __user *arg) +{ + struct ksu_set_sepolicy_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + return handle_sepolicy((void __user *)cmd.data, cmd.data_len); +} + +static int do_check_safemode(void __user *arg) +{ + struct ksu_check_safemode_cmd cmd; + + cmd.in_safe_mode = ksu_is_safe_mode(); + + if (cmd.in_safe_mode) { + pr_warn("safemode enabled!\n"); + } + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("check_safemode: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_new_get_allow_list_common(void __user *arg, bool allow) +{ + struct ksu_new_get_allow_list_cmd cmd; + int *arr = NULL; + int err = 0; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + if (cmd.count) { + arr = kmalloc(sizeof(int) * cmd.count, GFP_KERNEL); + if (!arr) { + return -ENOMEM; + } + } + + bool success = ksu_get_allow_list(arr, cmd.count, &cmd.count, &cmd.total_count, allow); + + if (!success) { + err = -EFAULT; + goto out; + } + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("new_get_allow_list: copy_to_user count failed\n"); + err = -EFAULT; + goto out; + } + + if (cmd.count && copy_to_user(&((struct ksu_new_get_allow_list_cmd *)arg)->uids, arr, sizeof(int) * cmd.count)) { + pr_err("new_get_allow_list: copy_to_user uids failed\n"); + err = -EFAULT; + } + +out: + if (arr) { + kfree(arr); + } + return err; +} + +static int do_new_get_deny_list(void __user *arg) +{ + return do_new_get_allow_list_common(arg, false); +} + +static int do_new_get_allow_list(void __user *arg) +{ + return do_new_get_allow_list_common(arg, true); +} + +static int do_get_allow_list_common(void __user *arg, bool allow) +{ + int *arr = NULL; + int err = 0; + u16 count; + u32 out_count; + static const 
u16 kSize = 128; + + arr = kmalloc(sizeof(int) * kSize, GFP_KERNEL); + if (!arr) { + return -ENOMEM; + } + + bool success = ksu_get_allow_list(arr, kSize, &count, NULL, allow); + + if (!success) { + err = -EFAULT; + goto out; + } + + out_count = count; + + if (copy_to_user(arg + offsetof(struct ksu_get_allow_list_cmd, count), + &out_count, sizeof(u32))) { + pr_err("get_allow_list: copy_to_user count failed\n"); + err = -EFAULT; + goto out; + } + + if (copy_to_user(arg, arr, sizeof(u32) * count)) { + pr_err("get_allow_list: copy_to_user uids failed\n"); + err = -EFAULT; + } + +out: + if (arr) { + kfree(arr); + } + return err; +} + +static int do_get_deny_list(void __user *arg) +{ + return do_get_allow_list_common(arg, false); +} + +static int do_get_allow_list(void __user *arg) +{ + return do_get_allow_list_common(arg, true); +} + +static int do_uid_granted_root(void __user *arg) +{ + struct ksu_uid_granted_root_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + cmd.granted = ksu_is_allow_uid_for_current(cmd.uid); + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("uid_granted_root: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_uid_should_umount(void __user *arg) +{ + struct ksu_uid_should_umount_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + return -EFAULT; + } + + cmd.should_umount = ksu_uid_should_umount(cmd.uid); + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("uid_should_umount: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_get_manager_appid(void __user *arg) +{ + struct ksu_get_manager_appid_cmd cmd; + + cmd.appid = ksu_get_manager_appid(); + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("get_manager_appid: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_get_app_profile(void __user *arg) +{ + uid_t uid; + struct app_profile *profile; + int ret = 0; + + if (copy_from_user(&uid, (char 
__user *)arg + offsetof(struct ksu_get_app_profile_cmd, profile.curr_uid), sizeof(uid_t))) { + pr_err("get_app_profile: copy_from_user failed\n"); + return -EFAULT; + } + + rcu_read_lock(); + profile = ksu_get_app_profile(uid); + rcu_read_unlock(); + if (!profile) { + ret = -ENOENT; + } else { + if (copy_to_user((char __user *)arg + offsetof(struct ksu_get_app_profile_cmd, profile), profile, sizeof(struct app_profile))) { + pr_err("get_app_profile: copy_to_user failed\n"); + ret = -EFAULT; + } + ksu_put_app_profile(profile); + } + return ret; +} + +static int do_set_app_profile(void __user *arg) +{ + struct ksu_set_app_profile_cmd cmd; + int ret; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("set_app_profile: copy_from_user failed\n"); + return -EFAULT; + } + + ret = ksu_set_app_profile(&cmd.profile); + if (!ret) { + ksu_persistent_allow_list(); + } + + return ret; +} + +static int do_get_feature(void __user *arg) +{ + struct ksu_get_feature_cmd cmd; + bool supported; + int ret; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("get_feature: copy_from_user failed\n"); + return -EFAULT; + } + + ret = ksu_get_feature(cmd.feature_id, &cmd.value, &supported); + cmd.supported = supported ? 
1 : 0; + + if (ret && supported) { + pr_err("get_feature: failed for feature %u: %d\n", cmd.feature_id, ret); + return ret; + } + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("get_feature: copy_to_user failed\n"); + return -EFAULT; + } + + return 0; +} + +static int do_set_feature(void __user *arg) +{ + struct ksu_set_feature_cmd cmd; + int ret; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("set_feature: copy_from_user failed\n"); + return -EFAULT; + } + + ret = ksu_set_feature(cmd.feature_id, cmd.value); + if (ret) { + pr_err("set_feature: failed for feature %u: %d\n", cmd.feature_id, ret); + return ret; + } + + return 0; +} + +static int do_get_wrapper_fd(void __user *arg) { + if (!ksu_file_sid) { + return -EINVAL; + } + + struct ksu_get_wrapper_fd_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("get_wrapper_fd: copy_from_user failed\n"); + return -EFAULT; + } + + return ksu_install_file_wrapper(cmd.fd); +} + +// Get task mark status +// Returns: 1 if marked, 0 if not marked, -ESRCH if task not found +/* BRICKPORT: on this one we return 1 if seccomp is disabled and 0 if enabled */ +static int ksu_get_task_mark(pid_t pid) +{ + struct task_struct *task; + int ret = -ESRCH; + + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return ret; + } + + ret = !task->seccomp.mode; + rcu_read_unlock(); + + return ret; +} + +static int do_manage_mark(void __user *arg) +{ + struct ksu_manage_mark_cmd cmd; + int ret = 0; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("manage_mark: copy_from_user failed\n"); + return -EFAULT; + } + + switch (cmd.operation) { + case KSU_MARK_GET: { + // on this one, we return seccomp status of a pid instead + // at the very least we have partial featureset + ret = ksu_get_task_mark(cmd.pid); + if (ret < 0) { + pr_err("manage_mark: get failed for pid %d: %d\n", cmd.pid, ret); + return ret; + } + cmd.result = (u32)ret; + break; + } +#if 0 // TODO: revisit 
this sometime + case KSU_MARK_MARK: { break; } + case KSU_MARK_UNMARK: { break; } + case KSU_MARK_REFRESH: { break; } +#endif + default: { + pr_err("manage_mark: invalid operation %u\n", cmd.operation); + return -EINVAL; + } + } + + if (copy_to_user(arg, &cmd, sizeof(cmd))) { + pr_err("manage_mark: copy_to_user failed\n"); + return -EFAULT; + } + + + return 0; +} + +static int do_nuke_ext4_sysfs(void __user *arg) +{ + struct ksu_nuke_ext4_sysfs_cmd cmd; + char mnt[256]; + long ret; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) + return -EFAULT; + + if (!cmd.arg) + return -EINVAL; + + memset(mnt, 0, sizeof(mnt)); + + ret = strncpy_from_user(mnt, (void __user *)cmd.arg, sizeof(mnt)); + if (ret < 0) { + pr_err("nuke ext4 copy mnt failed: %ld\\n", ret); + return -EFAULT; // 或者 return ret; + } + + if (ret == sizeof(mnt)) { + pr_err("nuke ext4 mnt path too long\\n"); + return -ENAMETOOLONG; + } + + pr_info("do_nuke_ext4_sysfs: %s\n", mnt); + + return nuke_ext4_sysfs(mnt); +} + +struct list_head mount_list = LIST_HEAD_INIT(mount_list); +DECLARE_RWSEM(mount_list_lock); + +static int add_try_umount(void __user *arg) +{ + struct mount_entry *new_entry, *entry, *tmp; + struct ksu_add_try_umount_cmd cmd; + char buf[256] = {0}; + + if (copy_from_user(&cmd, arg, sizeof cmd)) + return -EFAULT; + + switch (cmd.mode) { + case KSU_UMOUNT_WIPE: { + struct mount_entry *entry, *tmp; + down_write(&mount_list_lock); + list_for_each_entry_safe(entry, tmp, &mount_list, list) { + pr_info("wipe_umount_list: removing entry: %s\n", entry->umountable); + list_del(&entry->list); + kfree(entry->umountable); + kfree(entry); + } + up_write(&mount_list_lock); + + return 0; + } + + case KSU_UMOUNT_ADD: { + long len = strncpy_from_user(buf, (const char __user *)cmd.arg, 256); + if (len <= 0) + return -EFAULT; + + buf[sizeof(buf) - 1] = '\0'; + + new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + new_entry->umountable = kstrdup(buf, GFP_KERNEL); + if 
(!new_entry->umountable) { + kfree(new_entry); + return -ENOMEM; + } + + down_write(&mount_list_lock); + + // disallow dupes + // if this gets too many, we can consider moving this whole task to a kthread + list_for_each_entry(entry, &mount_list, list) { + if (!strcmp(entry->umountable, buf)) { + pr_info("cmd_add_try_umount: %s is already here!\n", buf); + up_write(&mount_list_lock); + kfree(new_entry->umountable); + kfree(new_entry); + return -EEXIST; + } + } + + // now check flags and add + // this also serves as a null check + if (cmd.flags) + new_entry->flags = cmd.flags; + else + new_entry->flags = 0; + + // debug + list_add(&new_entry->list, &mount_list); + up_write(&mount_list_lock); + pr_info("cmd_add_try_umount: %s added!\n", buf); + + return 0; + } + + // this is just strcmp'd wipe anyway + case KSU_UMOUNT_DEL: { + long len = strncpy_from_user(buf, (const char __user *)cmd.arg, sizeof(buf) - 1); + if (len <= 0) + return -EFAULT; + + buf[sizeof(buf) - 1] = '\0'; + + down_write(&mount_list_lock); + list_for_each_entry_safe(entry, tmp, &mount_list, list) { + if (!strcmp(entry->umountable, buf)) { + pr_info("cmd_add_try_umount: entry removed: %s\n", entry->umountable); + list_del(&entry->list); + kfree(entry->umountable); + kfree(entry); + } + } + up_write(&mount_list_lock); + + return 0; + } + + // this way userspace can deduce the memory it has to prepare. + case KSU_UMOUNT_GETSIZE: { + // check for pointer first + if (!cmd.arg) + return -EFAULT; + + size_t total_size = 0; // size of list in bytes + + down_read(&mount_list_lock); + list_for_each_entry(entry, &mount_list, list) { + total_size = total_size + strlen(entry->umountable) + 1; // + 1 for \0 + } + up_read(&mount_list_lock); + + pr_info("cmd_add_try_umount: total_size: %zu\n", total_size); + + if (copy_to_user((size_t __user *)cmd.arg, &total_size, sizeof(total_size))) + return -EFAULT; + + return 0; + } + + // WARNING! this is straight up pointerwalking. 
+ // this way we dont need to redefine the ioctl defs. + // this also avoids us needing to kmalloc + // userspace have to send pointer to memory (malloc/alloca) or pointer to a VLA. + case KSU_UMOUNT_GETLIST: { + if (!cmd.arg) + return -EFAULT; + + char *user_buf = (char *)cmd.arg; + + down_read(&mount_list_lock); + list_for_each_entry(entry, &mount_list, list) { + pr_info("cmd_add_try_umount: entry: %s\n", entry->umountable); + + if (copy_to_user((char __user *)user_buf, entry->umountable, strlen(entry->umountable) + 1 )) { + up_read(&mount_list_lock); + return -EFAULT; + } + + // walk it! +1 for null terminator + user_buf = user_buf + strlen(entry->umountable) + 1; + } + up_read(&mount_list_lock); + + return 0; + } + + default: { + pr_err("cmd_add_try_umount: invalid operation %u\n", cmd.mode); + return -EINVAL; + } + + } // switch(cmd.mode) + + return 0; +} + +static int do_set_init_pgrp(void __user *arg) +{ + int err; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0) + struct pid *pids[PIDTYPE_MAX] = { 0 }; +#endif + write_lock_irq(&tasklist_lock); + struct task_struct *p = current->group_leader; + struct pid *init_group = task_pgrp(&init_task); + + err = -EPERM; + if (task_session(p) != task_session(&init_task)) + goto out; + + err = 0; + if (task_pgrp(p) != init_group) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0) + change_pid(pids, p, PIDTYPE_PGID, init_group); +#else + change_pid(p, PIDTYPE_PGID, init_group); +#endif + } +out: + write_unlock_irq(&tasklist_lock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 15, 0) + free_pids(pids); +#endif + return err; +} + +static int do_get_sulog_fd(void __user *arg) +{ + struct ksu_get_sulog_fd_cmd cmd; + + if (copy_from_user(&cmd, arg, sizeof(cmd))) { + pr_err("get_sulog_fd: copy_from_user failed\n"); + return -EFAULT; + } + + if (cmd.flags) { + pr_err("get_sulog_fd: unsupported flags 0x%x\n", cmd.flags); + return -EINVAL; + } + + return ksu_install_sulog_fd(); +} + +// IOCTL handlers mapping table +static 
const struct ksu_ioctl_cmd_map ksu_ioctl_handlers[] = { + { .cmd = KSU_IOCTL_GRANT_ROOT, .name = "GRANT_ROOT", .handler = do_grant_root, .perm_check = allowed_for_su }, + { .cmd = KSU_IOCTL_GET_INFO, .name = "GET_INFO", .handler = do_get_info, .perm_check = always_allow }, + { .cmd = KSU_IOCTL_REPORT_EVENT, .name = "REPORT_EVENT", .handler = do_report_event, .perm_check = only_root }, + { .cmd = KSU_IOCTL_SET_SEPOLICY, .name = "SET_SEPOLICY", .handler = do_set_sepolicy, .perm_check = only_root }, + { .cmd = KSU_IOCTL_CHECK_SAFEMODE, .name = "CHECK_SAFEMODE", .handler = do_check_safemode, .perm_check = always_allow }, + { .cmd = KSU_IOCTL_GET_ALLOW_LIST, .name = "GET_ALLOW_LIST", .handler = do_get_allow_list, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_GET_DENY_LIST, .name = "GET_DENY_LIST", .handler = do_get_deny_list, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_NEW_GET_ALLOW_LIST, .name = "NEW_GET_ALLOW_LIST", .handler = do_new_get_allow_list, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_NEW_GET_DENY_LIST, .name = "NEW_GET_DENY_LIST", .handler = do_new_get_deny_list, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_UID_GRANTED_ROOT, .name = "UID_GRANTED_ROOT", .handler = do_uid_granted_root, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_UID_SHOULD_UMOUNT, .name = "UID_SHOULD_UMOUNT", .handler = do_uid_should_umount, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_GET_MANAGER_APPID, .name = "GET_MANAGER_APPID", .handler = do_get_manager_appid, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_GET_APP_PROFILE, .name = "GET_APP_PROFILE", .handler = do_get_app_profile, .perm_check = only_manager }, + { .cmd = KSU_IOCTL_SET_APP_PROFILE, .name = "SET_APP_PROFILE", .handler = do_set_app_profile, .perm_check = only_manager }, + { .cmd = KSU_IOCTL_GET_FEATURE, .name = "GET_FEATURE", .handler = do_get_feature, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_SET_FEATURE, .name = "SET_FEATURE", .handler = 
do_set_feature, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_GET_WRAPPER_FD, .name = "GET_WRAPPER_FD", .handler = do_get_wrapper_fd, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_MANAGE_MARK, .name = "MANAGE_MARK", .handler = do_manage_mark, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_NUKE_EXT4_SYSFS, .name = "NUKE_EXT4_SYSFS", .handler = do_nuke_ext4_sysfs, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_ADD_TRY_UMOUNT, .name = "ADD_TRY_UMOUNT", .handler = add_try_umount, .perm_check = manager_or_root }, + { .cmd = KSU_IOCTL_SET_INIT_PGRP, .name = "SET_INIT_PGRP", .handler = do_set_init_pgrp, .perm_check = only_root }, + { .cmd = KSU_IOCTL_GET_SULOG_FD, .name = "GET_SULOG_FD", .handler = do_get_sulog_fd, .perm_check = only_root }, + { .cmd = 0, .name = NULL, .handler = NULL, .perm_check = NULL } // Sentinel +}; + +long ksu_supercall_handle_ioctl(unsigned int cmd, void __user *argp) +{ + int i; + +#ifdef CONFIG_KSU_DEBUG + pr_info("ksu ioctl: cmd=0x%x from uid=%d\n", cmd, current_uid().val); +#endif + + for (i = 0; ksu_ioctl_handlers[i].handler; i++) { + if (cmd == ksu_ioctl_handlers[i].cmd) { + // Check permission first + if (ksu_ioctl_handlers[i].perm_check && !ksu_ioctl_handlers[i].perm_check()) { + pr_warn("ksu ioctl: permission denied for cmd=0x%x uid=%d\n", cmd, current_uid().val); + return -EPERM; + } + // Execute handler + return ksu_ioctl_handlers[i].handler(argp); + } + } + + pr_warn("ksu ioctl: unsupported command 0x%x\n", cmd); + return -ENOTTY; +} + +void __init ksu_supercall_dump_commands(void) +{ + int i; + + pr_info("KernelSU IOCTL Commands:\n"); + for (i = 0; ksu_ioctl_handlers[i].handler; i++) { + pr_info(" %-18s = 0x%08x\n", ksu_ioctl_handlers[i].name, ksu_ioctl_handlers[i].cmd); + } +} + +void ksu_supercall_cleanup_state(void) {} diff --git a/drivers/kernelsu/supercall/internal.h b/drivers/kernelsu/supercall/internal.h new file mode 100644 index 000000000000..5287f2e5affe --- /dev/null +++ 
b/drivers/kernelsu/supercall/internal.h @@ -0,0 +1,14 @@ +#ifndef __KSU_H_SUPERCALL_INTERNAL +#define __KSU_H_SUPERCALL_INTERNAL + +bool only_manager(void); +bool only_root(void); +bool manager_or_root(void); +bool always_allow(void); +bool allowed_for_su(void); + +long ksu_supercall_handle_ioctl(unsigned int cmd, void __user *argp); +void ksu_supercall_dump_commands(void); +void ksu_supercall_cleanup_state(void); + +#endif // __KSU_H_SUPERCALL_INTERNAL diff --git a/drivers/kernelsu/supercall/perm.c b/drivers/kernelsu/supercall/perm.c new file mode 100644 index 000000000000..89b674885072 --- /dev/null +++ b/drivers/kernelsu/supercall/perm.c @@ -0,0 +1,30 @@ +/* Permission check: passes only when is_manager() returns true. */ +bool only_manager(void) +{ + return is_manager(); +} + +/* Permission check: passes only when the current uid is 0. */ +bool only_root(void) +{ + return current_uid().val == 0; +} + +/* Permission check: passes when the current uid is 0 or is_manager() returns true. */ +bool manager_or_root(void) +{ + return current_uid().val == 0 || is_manager(); +} + +/* Permission check that never denies. */ +bool always_allow(void) +{ + return true; +} + +/* Permission check: passes when is_manager() returns true or ksu_is_allow_uid_for_current() accepts the current uid. */ +bool allowed_for_su(void) +{ + return is_manager() || ksu_is_allow_uid_for_current(current_uid().val); + +} diff --git a/drivers/kernelsu/supercall/supercall.c b/drivers/kernelsu/supercall/supercall.c new file mode 100644 index 000000000000..9bfd347d3d2c --- /dev/null +++ b/drivers/kernelsu/supercall/supercall.c @@ -0,0 +1,231 @@ +/* .release hook of the KSU anon file: only logs that the fd was released. */ +static int anon_ksu_release(struct inode *inode, struct file *filp) +{ + pr_info("ksu fd released\n"); + return 0; +} + +/* ioctl entry point (native and compat): forwards cmd and the user pointer to ksu_supercall_handle_ioctl(). */ +static long anon_ksu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + return ksu_supercall_handle_ioctl(cmd, (void __user *)arg); +} + +// File operations structure +static const struct file_operations anon_ksu_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = anon_ksu_ioctl, + .compat_ioctl = anon_ksu_ioctl, + .release = anon_ksu_release, +}; + +// Install KSU fd to current process +/* Returns the installed fd, or a negative errno when no fd or anon file could be obtained. */ +int ksu_install_fd(void) +{ + struct file *filp; + int fd; + + // Get unused fd + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + pr_err("ksu_install_fd: failed to get unused fd\n"); + return fd; + } + + // 
Create anonymous inode file + filp = anon_inode_getfile("[ksu_driver]", &anon_ksu_fops, NULL, O_RDWR | O_CLOEXEC); + if (IS_ERR(filp)) { + pr_err("ksu_install_fd: failed to create anon inode file\n"); + put_unused_fd(fd); + return PTR_ERR(filp); + } + + // Install fd + fd_install(fd, filp); + + pr_info("ksu fd installed: %d for pid %d\n", fd, current->pid); + + return fd; +} + +/* + * Installs the KSU fd and copies its number (or the negative errno from ksu_install_fd()) to user memory at arg4. + * The fd is closed again if that copy fails; the return value is always 0. + */ +static inline int ksu_handle_fd_request(void __user *arg4) +{ + int fd = ksu_install_fd(); + pr_info("[%d] install ksu fd: %d\n", current->pid, fd); + + if (copy_to_user(arg4, &fd, sizeof(fd))) { + pr_err("install ksu fd reply err\n"); + close_fd(fd); + } + + return 0; +} + +/* + * Dispatches the magic-number requests logged as "sys_reboot" (the call site is not visible in this hunk). + * Ignored (returns 0) unless magic1 is KSU_INSTALL_MAGIC1 and arg is non-NULL. + * magic2 selects the request: KSU_INSTALL_MAGIC2 installs the KSU fd for any caller, every other request needs uid 0. + * Always returns 0; replies are written to user memory through *arg. + */ +// downstream: make sure to pass arg as reference, this can allow us to extend things. +int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg) +{ + if (magic1 != KSU_INSTALL_MAGIC1) + return 0; + + // arg is dereferenced below, so a missing pointer is treated like a non-KSU call + if (!arg) + return 0; + + // two pr_info calls rather than a ternary on the format string: + // this is a cold syscall, so the duplication does not matter + if (magic2 == KSU_INSTALL_MAGIC2) + pr_info("sys_reboot: magic: 0x%x id: 0x%x pid: %d comm: %s \n", magic1, magic2, current->pid, current->comm); + else + pr_info("sys_reboot: magic: 0x%x id: %d pid: %d comm: %s \n", magic1, magic2, current->pid, current->comm); + + // arg4 = (unsigned long)PT_REGS_SYSCALL_PARM4(real_regs); + // downstream: dereference arg as arg4 so we can be inline to upstream + void __user *arg4 = (void __user *)*arg; + + // Check if this is a request to install KSU fd + if (magic2 == KSU_INSTALL_MAGIC2) { + return ksu_handle_fd_request(arg4); + } + + // only root is allowed for these commands + if (current_uid().val != 0) + return 0; + + // extensions + u64 reply = (u64)*arg; + + if (magic2 == CHANGE_MANAGER_UID) { + pr_info("sys_reboot: ksu_set_manager_appid to: %d\n", cmd); + ksu_set_manager_appid(cmd); + + if (cmd == ksu_get_manager_appid()) { + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply))) + pr_info("sys_reboot: reply fail\n"); + } + + return 0; + } + + if (magic2 == GET_SULOG_DUMP_V2) { + + 
int ret = send_sulog_dump(*arg); + if (ret) + return 0; + + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) + return 0; + } + + if (magic2 == CHANGE_KSUVER) { + pr_info("sys_reboot: ksu_change_ksuver to: %d\n", cmd); + ksuver_override = cmd; + + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) + return 0; + } + + // WARNING!!! triple ptr zone! *** + // https://wiki.c2.com/?ThreeStarProgrammer + if (magic2 == CHANGE_SPOOF_UNAME) { + + char release_buf[65]; + char version_buf[65]; + static char original_release_buf[65] = {0}; + static char original_version_buf[65] = {0}; + + // basically void * void __user * void __user *arg + void ***ppptr = (void ***)(uintptr_t)arg; + + // user pointer storage + // init this as zero so this works on 32-on-64 compat (LE) + uint64_t u_pptr = 0; + uint64_t u_ptr = 0; + + pr_info("sys_reboot: ppptr: 0x%lx \n", (uintptr_t)ppptr); + + // arg here is ***, dereference to pull out ** + if (copy_from_user(&u_pptr, (void __user *)*ppptr, sizeof(u_pptr))) + return 0; + + pr_info("sys_reboot: u_pptr: 0x%lx \n", (uintptr_t)u_pptr); + + // now we got the __user ** + // we cannot dereference this as this is __user + // we just do another copy_from_user to get it + if (copy_from_user(&u_ptr, (void __user *)u_pptr, sizeof(u_ptr))) + return 0; + + pr_info("sys_reboot: u_ptr: 0x%lx \n", (uintptr_t)u_ptr); + + // for release + if (strncpy_from_user(release_buf, (char __user *)u_ptr, sizeof(release_buf)) < 0) + return 0; + release_buf[sizeof(release_buf) - 1] = '\0'; + + // for version: read from the same user buffer, right after the NUL that ends release + if (strncpy_from_user(version_buf, (char __user *)(u_ptr + strlen(release_buf) + 1), sizeof(version_buf)) < 0) + return 0; + version_buf[sizeof(version_buf) - 1] = '\0'; + + if (original_release_buf[0] == '\0') { + struct new_utsname *u_curr = utsname(); + // we save current version as the original before modifying + strncpy(original_release_buf, u_curr->release, sizeof(original_release_buf)); + strncpy(original_version_buf, u_curr->version, 
sizeof(original_version_buf)); + pr_info("sys_reboot: original uname saved: %s %s\n", original_release_buf, original_version_buf); + } + + // so user can reset: the literal "default" restores the value saved above + if (!strcmp(release_buf, "default")) { + memcpy(release_buf, original_release_buf, sizeof(release_buf)); + } + if (!strcmp(version_buf, "default")) { + memcpy(version_buf, original_version_buf, sizeof(version_buf)); + } + + pr_info("sys_reboot: spoofing kernel to: %s - %s\n", release_buf, version_buf); + + struct new_utsname *u = utsname(); + + down_write(&uts_sem); + strncpy(u->release, release_buf, sizeof(u->release)); + strncpy(u->version, version_buf, sizeof(u->version)); + up_write(&uts_sem); + + // we write our confirmation on ** + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply))) + return 0; + } + + if (magic2 == CHANGE_KSUFLAGS) { + pr_info("sys_reboot: ksu_change_ksuflags to: %d\n", cmd); + ksuflags_override = cmd; + + if (copy_to_user((void __user *)*arg, &reply, sizeof(reply) )) + return 0; + } + + return 0; +} + +/* Init: calls ksu_supercall_dump_commands(), then tiny_sulog_init_heap() for the sulog memory. */ +void __init ksu_supercalls_init(void) +{ + ksu_supercall_dump_commands(); + + tiny_sulog_init_heap(); // grab heap memory for sulog +} + +/* Exit counterpart of ksu_supercalls_init(); currently has nothing to do. */ +void __exit ksu_supercalls_exit(void) { } diff --git a/drivers/kernelsu/supercall/supercall.h b/drivers/kernelsu/supercall/supercall.h new file mode 100644 index 000000000000..1c9e5a0a27ed --- /dev/null +++ b/drivers/kernelsu/supercall/supercall.h @@ -0,0 +1,32 @@ +#ifndef __KSU_H_SUPERCALL +#define __KSU_H_SUPERCALL + +// IOCTL handler types +typedef int (*ksu_ioctl_handler_t)(void __user *arg); +typedef bool (*ksu_perm_check_t)(void); + +// IOCTL command mapping +struct ksu_ioctl_cmd_map { + unsigned int cmd; + const char *name; + ksu_ioctl_handler_t handler; + ksu_perm_check_t perm_check; // Permission check function +}; + +// Install KSU fd to current process +int ksu_install_fd(void); + +void ksu_supercalls_init(void); +void ksu_supercalls_exit(void); + +// extensions +#define CHANGE_MANAGER_UID 10006 +#define KSU_UMOUNT_GETSIZE 107 
// get list size; kept small because the mode field carrying it is a u8 and cannot hold a 10k+ ID +#define KSU_UMOUNT_GETLIST 108 // get list +#define GET_SULOG_DUMP 10009 // get sulog dump, max, last 100 escalations +#define GET_SULOG_DUMP_V2 10010 // get sulog dump, timestamped, last 250 escalations +#define CHANGE_KSUVER 10011 // change ksu version +#define CHANGE_SPOOF_UNAME 10012 // spoof uname +#define CHANGE_KSUFLAGS 10013 // change ksuflags; caller passes the already-combined bit mask (e.g. 1|2|4|8) + +#endif // __KSU_H_SUPERCALL diff --git a/drivers/kernelsu/supercalls.c b/drivers/kernelsu/supercalls.c deleted file mode 100644 index 12c7e284cfd1..000000000000 --- a/drivers/kernelsu/supercalls.c +++ /dev/null @@ -1,847 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) -#include -#else -#include -#endif - -#include "supercalls.h" -#include "arch.h" -#include "allowlist.h" -#include "feature.h" -#include "klog.h" // IWYU pragma: keep -#include "ksu.h" -#include "ksud.h" -#ifdef CONFIG_KSU_SYSCALL_HOOK -#include "kp_hook.h" -#include "syscall_handler.h" -#endif -#include "kernel_compat.h" -#include "kernel_umount.h" -#include "manager.h" -#include "selinux/selinux.h" -#include "file_wrapper.h" - -// Permission check functions -bool only_manager(void) -{ - return is_manager(); -} - -bool only_root(void) -{ - return current_uid().val == 0; -} - -bool manager_or_root(void) -{ - return current_uid().val == 0 || is_manager(); -} - -bool always_allow(void) -{ - return true; // No permission check -} - -bool allowed_for_su(void) -{ - return is_manager() || ksu_is_allow_uid_for_current(current_uid().val); -} - -static int do_grant_root(void __user *arg) -{ - // we already check uid above on allowed_for_su() - - pr_info("allow root for: %d\n", current_uid().val); - escape_with_root_profile(); - - return 0; -} - -static int do_get_info(void __user *arg) -{ - struct ksu_get_info_cmd cmd = { 
.version = KERNEL_SU_VERSION, - .flags = 0 }; - -#ifdef MODULE - cmd.flags |= 0x1; -#endif - - if (is_manager()) { - cmd.flags |= 0x2; - } - cmd.features = KSU_FEATURE_MAX; - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_version: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_report_event(void __user *arg) -{ - struct ksu_report_event_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - switch (cmd.event) { - case EVENT_POST_FS_DATA: { - static bool post_fs_data_lock = false; - if (!post_fs_data_lock) { - post_fs_data_lock = true; - pr_info("post-fs-data triggered\n"); - on_post_fs_data(); - } - break; - } - case EVENT_BOOT_COMPLETED: { - static bool boot_complete_lock = false; - if (!boot_complete_lock) { - boot_complete_lock = true; - pr_info("boot_complete triggered\n"); - on_boot_completed(); - } - break; - } - case EVENT_MODULE_MOUNTED: { - pr_info("module mounted!\n"); - on_module_mounted(); - break; - } - default: - break; - } - - return 0; -} - -static int do_set_sepolicy(void __user *arg) -{ - struct ksu_set_sepolicy_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - return handle_sepolicy(cmd.cmd, (void __user *)cmd.arg); -} - -static int do_check_safemode(void __user *arg) -{ - struct ksu_check_safemode_cmd cmd; - - cmd.in_safe_mode = ksu_is_safe_mode(); - - if (cmd.in_safe_mode) { - pr_warn("safemode enabled!\n"); - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("check_safemode: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_get_allow_list(void __user *arg) -{ - struct ksu_get_allow_list_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - bool success = - ksu_get_allow_list((int *)cmd.uids, (int *)&cmd.count, true); - - if (!success) { - return -EFAULT; - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_allow_list: copy_to_user failed\n"); - return 
-EFAULT; - } - - return 0; -} - -static int do_get_deny_list(void __user *arg) -{ - struct ksu_get_allow_list_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - bool success = - ksu_get_allow_list((int *)cmd.uids, (int *)&cmd.count, false); - - if (!success) { - return -EFAULT; - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_deny_list: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_uid_granted_root(void __user *arg) -{ - struct ksu_uid_granted_root_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - cmd.granted = ksu_is_allow_uid_for_current(cmd.uid); - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("uid_granted_root: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_uid_should_umount(void __user *arg) -{ - struct ksu_uid_should_umount_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - cmd.should_umount = ksu_uid_should_umount(cmd.uid); - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("uid_should_umount: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_get_manager_appid(void __user *arg) -{ - struct ksu_get_manager_appid_cmd cmd; - - cmd.appid = ksu_get_manager_appid(); - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_manager_appid: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_get_app_profile(void __user *arg) -{ - struct ksu_get_app_profile_cmd cmd; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("get_app_profile: copy_from_user failed\n"); - return -EFAULT; - } - - if (!ksu_get_app_profile(&cmd.profile)) { - return -ENOENT; - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_app_profile: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_set_app_profile(void __user *arg) -{ - struct ksu_set_app_profile_cmd cmd; - - if 
(copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("set_app_profile: copy_from_user failed\n"); - return -EFAULT; - } - - if (!ksu_set_app_profile(&cmd.profile, true)) { - return -EFAULT; - } - - return 0; -} - -static int do_get_feature(void __user *arg) -{ - struct ksu_get_feature_cmd cmd; - bool supported; - int ret; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("get_feature: copy_from_user failed\n"); - return -EFAULT; - } - - ret = ksu_get_feature(cmd.feature_id, &cmd.value, &supported); - cmd.supported = supported ? 1 : 0; - - if (ret && supported) { - pr_err("get_feature: failed for feature %u: %d\n", - cmd.feature_id, ret); - return ret; - } - - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("get_feature: copy_to_user failed\n"); - return -EFAULT; - } - - return 0; -} - -static int do_set_feature(void __user *arg) -{ - struct ksu_set_feature_cmd cmd; - int ret; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("set_feature: copy_from_user failed\n"); - return -EFAULT; - } - - ret = ksu_set_feature(cmd.feature_id, cmd.value); - if (ret) { - pr_err("set_feature: failed for feature %u: %d\n", - cmd.feature_id, ret); - return ret; - } - - return 0; -} - -static int do_get_wrapper_fd(void __user *arg) -{ - if (!ksu_file_sid) { - return -EINVAL; - } - - struct ksu_get_wrapper_fd_cmd cmd; - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("get_wrapper_fd: copy_from_user failed\n"); - return -EFAULT; - } - - return ksu_install_file_wrapper(cmd.fd); -} - -static int do_manage_mark(void __user *arg) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - struct ksu_manage_mark_cmd cmd; - int ret = 0; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - pr_err("manage_mark: copy_from_user failed\n"); - return -EFAULT; - } - - switch (cmd.operation) { - case KSU_MARK_GET: { - // Get task mark status - ret = ksu_get_task_mark(cmd.pid); - if (ret < 0) { - pr_err("manage_mark: get failed for pid %d: %d\n", - cmd.pid, ret); - return ret; - } - cmd.result = 
(u32)ret; - break; - } - case KSU_MARK_MARK: { - if (cmd.pid == 0) { - ksu_mark_all_process(); - } else { - ret = ksu_set_task_mark(cmd.pid, true); - if (ret < 0) { - pr_err("manage_mark: set_mark failed for pid %d: %d\n", - cmd.pid, ret); - return ret; - } - } - break; - } - case KSU_MARK_UNMARK: { - if (cmd.pid == 0) { - ksu_unmark_all_process(); - } else { - ret = ksu_set_task_mark(cmd.pid, false); - if (ret < 0) { - pr_err("manage_mark: set_unmark failed for pid %d: %d\n", - cmd.pid, ret); - return ret; - } - } - break; - } - case KSU_MARK_REFRESH: { - ksu_mark_running_process(); - pr_info("manage_mark: refreshed running processes\n"); - break; - } - default: { - pr_err("manage_mark: invalid operation %u\n", cmd.operation); - return -EINVAL; - } - } - if (copy_to_user(arg, &cmd, sizeof(cmd))) { - pr_err("manage_mark: copy_to_user failed\n"); - return -EFAULT; - } - return 0; -#else - // We don't care, just return -ENOTSUPP - pr_warn("manage_mark: this supercalls is not implemented for manual hook.\n"); - return -ENOTSUPP; -#endif -} - -struct list_head mount_list = LIST_HEAD_INIT(mount_list); -DECLARE_RWSEM(mount_list_lock); - -static int add_try_umount(void __user *arg) -{ - struct mount_entry *new_entry, *entry, *tmp; - struct ksu_add_try_umount_cmd cmd; - char buf[256] = { 0 }; - - // When userspace disable kernel_umount, don't do anything. 
- if (!ksu_kernel_umount_enabled) { - pr_warn("add_try_umount supercall is not available when kernel_umount is disabled!\n"); - return -ENOTSUPP; - } - - if (copy_from_user(&cmd, arg, sizeof(cmd))) { - return -EFAULT; - } - - switch (cmd.mode) { - case KSU_UMOUNT_WIPE: { - struct mount_entry *entry, *tmp; - down_write(&mount_list_lock); - list_for_each_entry_safe (entry, tmp, &mount_list, list) { - pr_info("wipe_umount_list: removing entry: %s\n", - entry->umountable); - list_del(&entry->list); - kfree(entry->umountable); - kfree(entry); - } - up_write(&mount_list_lock); - - return 0; - } - - case KSU_UMOUNT_ADD: { - long len = strncpy_from_user(buf, (const char __user *)cmd.arg, - 256); - if (len <= 0) - return -EFAULT; - - buf[sizeof(buf) - 1] = '\0'; - - new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); - if (!new_entry) - return -ENOMEM; - - new_entry->umountable = kstrdup(buf, GFP_KERNEL); - if (!new_entry->umountable) { - kfree(new_entry); - return -1; - } - - down_write(&mount_list_lock); - - // disallow dupes - // if this gets too many, we can consider moving this whole task to a kthread - list_for_each_entry (entry, &mount_list, list) { - if (!strcmp(entry->umountable, buf)) { - pr_info("cmd_add_try_umount: %s is already here!\n", - buf); - up_write(&mount_list_lock); - kfree(new_entry->umountable); - kfree(new_entry); - return -1; - } - } - - // now check flags and add - // this also serves as a null check - if (cmd.flags) - new_entry->flags = cmd.flags; - else - new_entry->flags = 0; - - // debug - list_add(&new_entry->list, &mount_list); - up_write(&mount_list_lock); - pr_info("cmd_add_try_umount: %s added!\n", buf); - - return 0; - } - - // this is just strcmp'd wipe anyway - case KSU_UMOUNT_DEL: { - long len = strncpy_from_user(buf, (const char __user *)cmd.arg, - sizeof(buf) - 1); - if (len <= 0) - return -EFAULT; - - buf[sizeof(buf) - 1] = '\0'; - - down_write(&mount_list_lock); - list_for_each_entry_safe (entry, tmp, &mount_list, list) { - if 
(!strcmp(entry->umountable, buf)) { - pr_info("cmd_add_try_umount: entry removed: %s\n", - entry->umountable); - list_del(&entry->list); - kfree(entry->umountable); - kfree(entry); - } - } - up_write(&mount_list_lock); - - return 0; - } - - // this way userspace can deduce the memory it has to prepare. - case KSU_UMOUNT_GETSIZE: { - // check for pointer first - if (!cmd.arg) - return -EFAULT; - - size_t total_size = 0; // size of list in bytes - - down_read(&mount_list_lock); - list_for_each_entry (entry, &mount_list, list) { - // + 1 for \0 - total_size = total_size + strlen(entry->umountable) + 1; - } - up_read(&mount_list_lock); - - pr_info("cmd_add_try_umount: total_size: %zu\n", total_size); - - if (copy_to_user((size_t __user *)cmd.arg, &total_size, - sizeof(total_size))) - return -EFAULT; - - return 0; - } - - // WARNING! this is straight up pointerwalking. - // this way we dont need to redefine the ioctl defs. - // this also avoids us needing to kmalloc - // userspace have to send pointer to memory (malloc/alloca) or pointer to a VLA. - case KSU_UMOUNT_GETLIST: { - if (!cmd.arg) - return -EFAULT; - - char *user_buf = (char *)cmd.arg; - - down_read(&mount_list_lock); - list_for_each_entry (entry, &mount_list, list) { - pr_info("cmd_add_try_umount: entry: %s\n", - entry->umountable); - - if (copy_to_user((char __user *)user_buf, - entry->umountable, - strlen(entry->umountable) + 1)) { - up_read(&mount_list_lock); - return -EFAULT; - } - - // walk it! 
+1 for null terminator - user_buf = user_buf + strlen(entry->umountable) + 1; - } - up_read(&mount_list_lock); - - return 0; - } - - default: { - pr_err("cmd_add_try_umount: invalid operation %u\n", cmd.mode); - return -EINVAL; - } - - } // switch(cmd.mode) - - return 0; -} - -static int do_nuke_ext4_sysfs(void __user *arg) -{ - struct ksu_nuke_ext4_sysfs_cmd cmd; - char mnt[256]; - long ret; - - if (copy_from_user(&cmd, arg, sizeof(cmd))) - return -EFAULT; - - if (!cmd.arg) - return -EINVAL; - - memset(mnt, 0, sizeof(mnt)); - - ret = strncpy_from_user(mnt, cmd.arg, sizeof(mnt)); - if (ret < 0) { - pr_err("nuke ext4 copy mnt failed: %ld\n", ret); - return -EFAULT; // 或者 return ret; - } - - if (ret == sizeof(mnt)) { - pr_err("nuke ext4 mnt path too long\n"); - return -ENAMETOOLONG; - } - - pr_info("do_nuke_ext4_sysfs: %s\n", mnt); - - return nuke_ext4_sysfs(mnt); -} - -// IOCTL handlers mapping table -static const struct ksu_ioctl_cmd_map ksu_ioctl_handlers[] = { - KSU_IOCTL(GRANT_ROOT, "GRANT_ROOT", do_grant_root, allowed_for_su), - KSU_IOCTL(GET_INFO, "GET_INFO", do_get_info, always_allow), - KSU_IOCTL(REPORT_EVENT, "REPORT_EVENT", do_report_event, only_root), - KSU_IOCTL(SET_SEPOLICY, "SET_SEPOLICY", do_set_sepolicy, only_root), - KSU_IOCTL(CHECK_SAFEMODE, "CHECK_SAFEMODE", do_check_safemode, - always_allow), - KSU_IOCTL(GET_ALLOW_LIST, "GET_ALLOW_LIST", do_get_allow_list, - manager_or_root), - KSU_IOCTL(GET_DENY_LIST, "GET_DENY_LIST", do_get_deny_list, - manager_or_root), - KSU_IOCTL(UID_GRANTED_ROOT, "UID_GRANTED_ROOT", do_uid_granted_root, - manager_or_root), - KSU_IOCTL(UID_SHOULD_UMOUNT, "UID_SHOULD_UMOUNT", do_uid_should_umount, - manager_or_root), - KSU_IOCTL(GET_MANAGER_APPID, "GET_MANAGER_APPID", do_get_manager_appid, - manager_or_root), - KSU_IOCTL(GET_APP_PROFILE, "GET_APP_PROFILE", do_get_app_profile, - only_manager), - KSU_IOCTL(SET_APP_PROFILE, "SET_APP_PROFILE", do_set_app_profile, - only_manager), - KSU_IOCTL(GET_FEATURE, "GET_FEATURE", 
do_get_feature, manager_or_root), - KSU_IOCTL(SET_FEATURE, "SET_FEATURE", do_set_feature, manager_or_root), - KSU_IOCTL(GET_WRAPPER_FD, "GET_WRAPPER_FD", do_get_wrapper_fd, - manager_or_root), - KSU_IOCTL(MANAGE_MARK, "MANAGE_MARK", do_manage_mark, manager_or_root), - KSU_IOCTL(NUKE_EXT4_SYSFS, "NUKE_EXT4_SYSFS", do_nuke_ext4_sysfs, - manager_or_root), - KSU_IOCTL(ADD_TRY_UMOUNT, "ADD_TRY_UMOUNT", add_try_umount, - manager_or_root), - - // Sentinel - { .cmd = 0, .name = NULL, .handler = NULL, .perm_check = NULL } -}; - -#ifdef CONFIG_KSU_SYSCALL_HOOK -struct ksu_install_fd_tw { - struct callback_head cb; - int __user *outp; -}; - -static void ksu_install_fd_tw_func(struct callback_head *cb) -{ - struct ksu_install_fd_tw *tw = - container_of(cb, struct ksu_install_fd_tw, cb); - int fd = ksu_install_fd(); - - if (copy_to_user(tw->outp, &fd, sizeof(fd))) { - pr_err("install ksu fd reply err\n"); - do_close_fd(fd); - } - - kfree(tw); -} - -static int ksu_handle_fd_request(void __user *arg) -{ - struct ksu_install_fd_tw *tw; - - tw = kzalloc(sizeof(*tw), GFP_ATOMIC); - if (!tw) - return -ENOMEM; - - tw->outp = (int __user *)arg; - tw->cb.func = ksu_install_fd_tw_func; - - if (task_work_add(current, &tw->cb, TWA_RESUME)) { - kfree(tw); - pr_warn("install fd add task_work failed\n"); - return -EINVAL; - } - - return 0; -} -#else -static int ksu_handle_fd_request(void __user *arg) -{ - int fd = ksu_install_fd(); - - if (copy_to_user(arg, &fd, sizeof(fd))) { - pr_err("install ksu fd reply err\n"); - do_close_fd(fd); - return -EFAULT; - } - - return 0; -} -#endif - -int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, - void __user **arg) -{ - if (magic1 != KSU_INSTALL_MAGIC1) - return -EINVAL; - - // Rare case that unlikely to happen - if (unlikely(!arg)) - return -EINVAL; - -#ifdef CONFIG_KSU_DEBUG - pr_info("sys_reboot: magic: 0x%x (id: %d)\n", magic1, magic2); -#endif - - // Dereference **arg.. with IS_ERR check. 
- void __user *argp = (void __user *)*arg; - if (IS_ERR(argp)) { - pr_err("Failed to deref user arg, err: %lu\n", PTR_ERR(argp)); - return -EINVAL; - } - - // Check if this is a request to install KSU fd - if (magic2 == KSU_INSTALL_MAGIC2) { - return ksu_handle_fd_request(argp); - } - - return 0; -} - -void ksu_supercalls_init(void) -{ - int i; - - pr_info("KernelSU IOCTL Commands:\n"); - for (i = 0; ksu_ioctl_handlers[i].handler; i++) { - pr_info(" %-18s = 0x%08x\n", ksu_ioctl_handlers[i].name, - ksu_ioctl_handlers[i].cmd); - } -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_supercalls_init(); -#endif -} - -void ksu_supercalls_exit(void) -{ -#ifdef CONFIG_KSU_SYSCALL_HOOK - kp_handle_supercalls_exit(); -#endif -} - -// IOCTL dispatcher -static long anon_ksu_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int i; - -#ifdef CONFIG_KSU_DEBUG - pr_info("ksu ioctl: cmd=0x%x from uid=%d\n", cmd, current_uid().val); -#endif - - for (i = 0; ksu_ioctl_handlers[i].handler; i++) { - if (cmd == ksu_ioctl_handlers[i].cmd) { - // Check permission first - if (ksu_ioctl_handlers[i].perm_check && - !ksu_ioctl_handlers[i].perm_check()) { - pr_warn("ksu ioctl: permission denied for cmd=0x%x uid=%d\n", - cmd, current_uid().val); - return -EPERM; - } - // Execute handler - return ksu_ioctl_handlers[i].handler(argp); - } - } - - pr_warn("ksu ioctl: unsupported command 0x%x\n", cmd); - return -ENOTTY; -} - -// File release handler -static int anon_ksu_release(struct inode *inode, struct file *filp) -{ -#ifdef CONFIG_KSU_DEBUG - pr_info("ksu fd released\n"); -#endif - return 0; -} - -// File operations structure -static const struct file_operations anon_ksu_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = anon_ksu_ioctl, - .compat_ioctl = anon_ksu_ioctl, - .release = anon_ksu_release, -}; - -// Install KSU fd to current process -int ksu_install_fd(void) -{ - struct file *filp; - int fd; - - // Get unused fd - fd = 
get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - pr_err("ksu_install_fd: failed to get unused fd\n"); - return fd; - } - - // Create anonymous inode file - filp = anon_inode_getfile("[ksu_driver]", &anon_ksu_fops, NULL, - O_RDWR | O_CLOEXEC); - if (IS_ERR(filp)) { - pr_err("ksu_install_fd: failed to create anon inode file\n"); - put_unused_fd(fd); - return PTR_ERR(filp); - } - - // Install fd - fd_install(fd, filp); - -#ifdef CONFIG_KSU_DEBUG - pr_info("ksu fd[%d] installed for %s/%d\n", fd, current->comm, - current->pid); -#endif - - return fd; -} diff --git a/drivers/kernelsu/supercalls.h b/drivers/kernelsu/supercalls.h deleted file mode 100644 index f6ba38c498d3..000000000000 --- a/drivers/kernelsu/supercalls.h +++ /dev/null @@ -1,152 +0,0 @@ -#ifndef __KSU_H_SUPERCALLS -#define __KSU_H_SUPERCALLS - -#include -#include -#include "app_profile.h" - -// Magic numbers for reboot hook to install fd -#define KSU_INSTALL_MAGIC1 0xDEADBEEF -#define KSU_INSTALL_MAGIC2 0xCAFEBABE - -// Command structures for ioctl - -struct ksu_become_daemon_cmd { - __u8 token[65]; // Input: daemon token (null-terminated) -}; - -struct ksu_get_info_cmd { - __u32 version; // Output: KERNEL_SU_VERSION - __u32 flags; // Output: flags (bit 0: MODULE mode) - __u32 features; // Output: max feature ID supported -}; - -struct ksu_report_event_cmd { - __u32 event; // Input: EVENT_POST_FS_DATA, EVENT_BOOT_COMPLETED, etc. 
-}; - -struct ksu_set_sepolicy_cmd { - __u64 cmd; // Input: sepolicy command - __aligned_u64 arg; // Input: sepolicy argument pointer -}; - -struct ksu_check_safemode_cmd { - __u8 in_safe_mode; // Output: true if in safe mode, false otherwise -}; - -struct ksu_get_allow_list_cmd { - __u32 uids[128]; // Output: array of allowed/denied UIDs - __u32 count; // Output: number of UIDs in array - __u8 allow; // Input: true for allow list, false for deny list -}; - -struct ksu_uid_granted_root_cmd { - __u32 uid; // Input: target UID to check - __u8 granted; // Output: true if granted, false otherwise -}; - -struct ksu_uid_should_umount_cmd { - __u32 uid; // Input: target UID to check - __u8 should_umount; // Output: true if should umount, false otherwise -}; - -struct ksu_get_manager_appid_cmd { - __u32 appid; // Output: manager app id -}; - -struct ksu_get_app_profile_cmd { - struct app_profile profile; // Input/Output: app profile structure -}; - -struct ksu_set_app_profile_cmd { - struct app_profile profile; // Input: app profile structure -}; - -struct ksu_get_feature_cmd { - __u32 feature_id; // Input: feature ID (enum ksu_feature_id) - __u64 value; // Output: feature value/state - __u8 supported; // Output: true if feature is supported, false otherwise -}; - -struct ksu_set_feature_cmd { - __u32 feature_id; // Input: feature ID (enum ksu_feature_id) - __u64 value; // Input: feature value/state to set -}; - -struct ksu_get_wrapper_fd_cmd { - __u32 fd; // Input: userspace fd - __u32 flags; // Input: flags of userspace fd -}; - -struct ksu_manage_mark_cmd { - __u32 operation; // Input: KSU_MARK_* - __s32 pid; // Input: target pid (0 for all processes) - __u32 result; // Output: for get operation - mark status or reg_count -}; - -struct ksu_nuke_ext4_sysfs_cmd { - __aligned_u64 arg; // Input: mnt pointer -}; - -#define KSU_MARK_GET 1 -#define KSU_MARK_MARK 2 -#define KSU_MARK_UNMARK 3 -#define KSU_MARK_REFRESH 4 - -struct ksu_add_try_umount_cmd { - __aligned_u64 arg; // 
char ptr, this is the mountpoint - __u32 flags; // this is the flag we use for it - __u8 mode; // denotes what to do with it 0:wipe_list 1:add_to_list 2:delete_entry -}; - -#define KSU_UMOUNT_WIPE 0 // ignore everything and wipe list -#define KSU_UMOUNT_ADD 1 // add entry (path + flags) -#define KSU_UMOUNT_DEL 2 // delete entry, strcmp -#define KSU_UMOUNT_GETSIZE 3 // get list size -#define KSU_UMOUNT_GETLIST 4 // get list - -// IOCTL command definitions -#define KSU_IOCTL_GRANT_ROOT _IOC(_IOC_NONE, 'K', 1, 0) -#define KSU_IOCTL_GET_INFO _IOC(_IOC_READ, 'K', 2, 0) -#define KSU_IOCTL_REPORT_EVENT _IOC(_IOC_WRITE, 'K', 3, 0) -#define KSU_IOCTL_SET_SEPOLICY _IOC(_IOC_READ | _IOC_WRITE, 'K', 4, 0) -#define KSU_IOCTL_CHECK_SAFEMODE _IOC(_IOC_READ, 'K', 5, 0) -#define KSU_IOCTL_GET_ALLOW_LIST _IOC(_IOC_READ | _IOC_WRITE, 'K', 6, 0) -#define KSU_IOCTL_GET_DENY_LIST _IOC(_IOC_READ | _IOC_WRITE, 'K', 7, 0) -#define KSU_IOCTL_UID_GRANTED_ROOT _IOC(_IOC_READ | _IOC_WRITE, 'K', 8, 0) -#define KSU_IOCTL_UID_SHOULD_UMOUNT _IOC(_IOC_READ | _IOC_WRITE, 'K', 9, 0) -#define KSU_IOCTL_GET_MANAGER_APPID _IOC(_IOC_READ, 'K', 10, 0) -#define KSU_IOCTL_GET_APP_PROFILE _IOC(_IOC_READ | _IOC_WRITE, 'K', 11, 0) -#define KSU_IOCTL_SET_APP_PROFILE _IOC(_IOC_WRITE, 'K', 12, 0) -#define KSU_IOCTL_GET_FEATURE _IOC(_IOC_READ | _IOC_WRITE, 'K', 13, 0) -#define KSU_IOCTL_SET_FEATURE _IOC(_IOC_WRITE, 'K', 14, 0) -#define KSU_IOCTL_GET_WRAPPER_FD _IOC(_IOC_WRITE, 'K', 15, 0) -#define KSU_IOCTL_MANAGE_MARK _IOC(_IOC_READ | _IOC_WRITE, 'K', 16, 0) -#define KSU_IOCTL_NUKE_EXT4_SYSFS _IOC(_IOC_WRITE, 'K', 17, 0) -#define KSU_IOCTL_ADD_TRY_UMOUNT _IOC(_IOC_WRITE, 'K', 18, 0) - -// IOCTL handler types -typedef int (*ksu_ioctl_handler_t)(void __user *arg); -typedef bool (*ksu_perm_check_t)(void); - -// IOCTL command mapping -struct ksu_ioctl_cmd_map { - unsigned int cmd; - const char *name; - ksu_ioctl_handler_t handler; - ksu_perm_check_t perm_check; // Permission check function -}; - -#define 
KSU_IOCTL(CMD, NAME, HANDLER, PERM) \ - { \ - .cmd = KSU_IOCTL_##CMD, .name = NAME, .handler = HANDLER, \ - .perm_check = PERM \ - } - -// Install KSU fd to current process -int ksu_install_fd(void); - -void ksu_supercalls_init(void); -void ksu_supercalls_exit(void); - -#endif // __KSU_H_SUPERCALLS diff --git a/drivers/kernelsu/syscall_handler.c b/drivers/kernelsu/syscall_handler.c deleted file mode 100644 index 499967165bce..000000000000 --- a/drivers/kernelsu/syscall_handler.c +++ /dev/null @@ -1,374 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -// Tracepoint registration count management -// == 1: just us -// > 1: someone else is also using syscall tracepoint e.g. ftrace -static int tracepoint_reg_count = 0; -static DEFINE_SPINLOCK(tracepoint_reg_lock); - -void ksu_clear_task_tracepoint_flag_if_needed(struct task_struct *t) -{ - unsigned long flags; - spin_lock_irqsave(&tracepoint_reg_lock, flags); - if (tracepoint_reg_count <= 1) { - ksu_clear_task_tracepoint_flag(t); - } - spin_unlock_irqrestore(&tracepoint_reg_lock, flags); -} - -// Process marking management -static void handle_process_mark(bool mark) -{ - struct task_struct *p, *t; - read_lock(&tasklist_lock); - for_each_process_thread (p, t) { - if (mark) - ksu_set_task_tracepoint_flag(t); - else - ksu_clear_task_tracepoint_flag(t); - } - read_unlock(&tasklist_lock); -} - -void ksu_mark_all_process(void) -{ - handle_process_mark(true); - pr_info("hook_manager: mark all user process done!\n"); -} - -void ksu_unmark_all_process(void) -{ - handle_process_mark(false); - pr_info("hook_manager: unmark all user process done!\n"); -} - -static void ksu_mark_running_process_locked(void) -{ - struct task_struct *p, *t; - read_lock(&tasklist_lock); - for_each_process_thread (p, t) { - if (!t->mm) { // only user processes - continue; - } - int uid = task_uid(t).val; - const struct cred *cred = get_task_cred(t); - bool ksu_root_process = uid == 0 && 
is_task_ksu_domain(cred); - bool is_zygote_process = is_zygote(cred); - bool is_shell = uid == 2000; - // before boot completed, we shall mark init for marking zygote - bool is_init = t->pid == 1; - if (ksu_root_process || is_zygote_process || is_shell || - is_init || ksu_is_allow_uid(uid)) { - ksu_set_task_tracepoint_flag(t); - pr_info("hook_manager: mark process: pid:%d, uid: %d, comm:%s\n", - t->pid, uid, t->comm); - } else { - ksu_clear_task_tracepoint_flag(t); - pr_info("hook_manager: unmark process: pid:%d, uid: %d, comm:%s\n", - t->pid, uid, t->comm); - } - put_cred(cred); - } - read_unlock(&tasklist_lock); -} - -void ksu_mark_running_process(void) -{ - unsigned long flags; - spin_lock_irqsave(&tracepoint_reg_lock, flags); - if (tracepoint_reg_count <= 1) { - ksu_mark_running_process_locked(); - } else { - pr_info("hook_manager: not mark running process since syscall tracepoint is in use\n"); - } - spin_unlock_irqrestore(&tracepoint_reg_lock, flags); -} - -// Get task mark status -// Returns: 1 if marked, 0 if not marked, -ESRCH if task not found -int ksu_get_task_mark(pid_t pid) -{ - struct task_struct *task; - int marked = -ESRCH; - - rcu_read_lock(); - task = find_task_by_vpid(pid); - if (task) { - get_task_struct(task); - rcu_read_unlock(); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - marked = test_task_syscall_work(task, SYSCALL_TRACEPOINT) ? 1 : 0; -#else - marked = test_tsk_thread_flag(task, TIF_SYSCALL_TRACEPOINT) ? 
1 : 0; -#endif - put_task_struct(task); - } else { - rcu_read_unlock(); - } - - return marked; -} - -// Set task mark status -// Returns: 0 on success, -ESRCH if task not found -int ksu_set_task_mark(pid_t pid, bool mark) -{ - struct task_struct *task; - int ret = -ESRCH; - - rcu_read_lock(); - task = find_task_by_vpid(pid); - if (task) { - get_task_struct(task); - rcu_read_unlock(); - if (mark) { - ksu_set_task_tracepoint_flag(task); - pr_info("hook_manager: marked task pid=%d comm=%s\n", - pid, task->comm); - } else { - ksu_clear_task_tracepoint_flag(task); - pr_info("hook_manager: unmarked task pid=%d comm=%s\n", - pid, task->comm); - } - put_task_struct(task); - ret = 0; - } else { - rcu_read_unlock(); - } - - return ret; -} - -#ifdef CONFIG_KRETPROBES - -static struct kretprobe *init_kretprobe(const char *name, - kretprobe_handler_t handler) -{ - struct kretprobe *rp = kzalloc(sizeof(struct kretprobe), GFP_KERNEL); - if (!rp) - return NULL; - rp->kp.symbol_name = name; - rp->handler = handler; - rp->data_size = 0; - rp->maxactive = 0; - - int ret = register_kretprobe(rp); - pr_info("hook_manager: register_%s kretprobe: %d\n", name, ret); - if (ret) { - kfree(rp); - return NULL; - } - - return rp; -} - -static void destroy_kretprobe(struct kretprobe **rp_ptr) -{ - struct kretprobe *rp = *rp_ptr; - if (!rp) - return; - unregister_kretprobe(rp); - synchronize_rcu(); - kfree(rp); - *rp_ptr = NULL; -} - -static int syscall_regfunc_handler(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long flags; - spin_lock_irqsave(&tracepoint_reg_lock, flags); - if (tracepoint_reg_count < 1) { - // while install our tracepoint, mark our processes - ksu_mark_running_process_locked(); - } else if (tracepoint_reg_count == 1) { - // while other tracepoint first added, mark all processes - ksu_mark_all_process(); - } - tracepoint_reg_count++; - spin_unlock_irqrestore(&tracepoint_reg_lock, flags); - return 0; -} - -static int syscall_unregfunc_handler(struct 
kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long flags; - spin_lock_irqsave(&tracepoint_reg_lock, flags); - tracepoint_reg_count--; - if (tracepoint_reg_count <= 0) { - // while no tracepoint left, unmark all processes - ksu_unmark_all_process(); - } else if (tracepoint_reg_count == 1) { - // while just our tracepoint left, unmark disallowed processes - ksu_mark_running_process_locked(); - } - spin_unlock_irqrestore(&tracepoint_reg_lock, flags); - return 0; -} - -static struct kretprobe *syscall_regfunc_rp = NULL; -static struct kretprobe *syscall_unregfunc_rp = NULL; -#endif - -static inline bool check_syscall_fastpath(int nr) -{ - switch (nr) { - case __NR_newfstatat: - case __NR_faccessat: - case __NR_execve: - case __NR_setresuid: - return true; - default: - return false; - } -} - -// Unmark init's child that are not zygote, adbd or ksud -int ksu_handle_init_mark_tracker(const char __user **filename_user) -{ - char path[64]; - - if (unlikely(!filename_user)) - return 0; - if (!ksu_retry_filename_access(filename_user, path, sizeof(path), - false)) - return 0; - - if (unlikely(strcmp(path, KSUD_PATH) == 0)) { - pr_info("hook_manager: escape to root for init executing ksud: %d\n", - current->pid); - escape_to_root_for_init(); - } else if (likely(strstr(path, "/app_process") == NULL && - strstr(path, "/adbd") == NULL)) { - pr_info("hook_manager: unmark %d exec %s\n", current->pid, - path); - ksu_clear_task_tracepoint_flag_if_needed(current); - } - - return 0; -} - -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -static int ksu_handle_setresuid(uid_t ruid, uid_t euid, uid_t suid) -{ - return ksu_handle_setuid_common(ruid, current_uid().val, euid); -} - -// Generic sys_enter handler that dispatches to specific handlers -static void ksu_sys_enter_handler(void *data, struct pt_regs *regs, long id) -{ - if (unlikely(check_syscall_fastpath(id))) { - if (ksu_su_compat_enabled) { - // Handle newfstatat - if (id == __NR_newfstatat) { - int *dfd = (int 
*)&PT_REGS_PARM1(regs); - const char __user **filename_user = - (const char __user **)&PT_REGS_PARM2( - regs); - int *flags = - (int *)&PT_REGS_SYSCALL_PARM4(regs); - ksu_handle_stat(dfd, filename_user, flags); - return; - } - - // Handle faccessat - if (id == __NR_faccessat) { - int *dfd = (int *)&PT_REGS_PARM1(regs); - const char __user **filename_user = - (const char __user **)&PT_REGS_PARM2( - regs); - int *mode = (int *)&PT_REGS_PARM3(regs); - ksu_handle_faccessat(dfd, filename_user, mode, - NULL); - return; - } - - // Handle execve - if (id == __NR_execve) { - const char __user **filename_user = - (const char __user **)&PT_REGS_PARM1( - regs); - if (current->pid != 1 && - is_init(get_current_cred())) { - ksu_handle_init_mark_tracker( - filename_user); - } else { - ksu_handle_execve_sucompat( - NULL, filename_user, NULL, NULL, - NULL); - } - return; - } - } - - // Handle setresuid - if (id == __NR_setresuid) { - uid_t ruid = (uid_t)PT_REGS_PARM1(regs); - uid_t euid = (uid_t)PT_REGS_PARM2(regs); - uid_t suid = (uid_t)PT_REGS_PARM3(regs); - ksu_handle_setresuid(ruid, euid, suid); - return; - } - } -} -#endif - -void ksu_syscall_hook_manager_init(void) -{ - int ret; - pr_info("hook_manager: ksu_hook_manager_init called\n"); - -#ifdef CONFIG_KRETPROBES - // Register kretprobe for syscall_regfunc - syscall_regfunc_rp = - init_kretprobe("syscall_regfunc", syscall_regfunc_handler); - // Register kretprobe for syscall_unregfunc - syscall_unregfunc_rp = - init_kretprobe("syscall_unregfunc", syscall_unregfunc_handler); -#endif - -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - ret = register_trace_sys_enter(ksu_sys_enter_handler, NULL); -#ifndef CONFIG_KRETPROBES - ksu_mark_running_process_locked(); -#endif - if (ret) { - pr_err("hook_manager: failed to register sys_enter tracepoint: %d\n", - ret); - } else { - pr_info("hook_manager: sys_enter tracepoint registered\n"); - } -#endif - - ksu_setuid_hook_init(); - ksu_sucompat_init(); -} - -void 
ksu_syscall_hook_manager_exit(void) -{ - pr_info("hook_manager: ksu_hook_manager_exit called\n"); -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - unregister_trace_sys_enter(ksu_sys_enter_handler, NULL); - tracepoint_synchronize_unregister(); - pr_info("hook_manager: sys_enter tracepoint unregistered\n"); -#endif - -#ifdef CONFIG_KRETPROBES - destroy_kretprobe(&syscall_regfunc_rp); - destroy_kretprobe(&syscall_unregfunc_rp); -#endif - - ksu_sucompat_exit(); - ksu_setuid_hook_exit(); -} diff --git a/drivers/kernelsu/syscall_handler.h b/drivers/kernelsu/syscall_handler.h deleted file mode 100644 index 463617fd97d9..000000000000 --- a/drivers/kernelsu/syscall_handler.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef __KSU_H_HOOK_MANAGER -#define __KSU_H_HOOK_MANAGER - -#include -#include -#include - -// Hook manager initialization and cleanup -void ksu_syscall_hook_manager_init(void); -void ksu_syscall_hook_manager_exit(void); - -// Process marking for tracepoint -void ksu_mark_all_process(void); -void ksu_unmark_all_process(void); -void ksu_mark_running_process(void); - -// Per-task mark operations -int ksu_get_task_mark(pid_t pid); -int ksu_set_task_mark(pid_t pid, bool mark); - -static inline void ksu_set_task_tracepoint_flag(struct task_struct *t) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - set_task_syscall_work(t, SYSCALL_TRACEPOINT); -#else - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); -#endif -} - -static inline void ksu_clear_task_tracepoint_flag(struct task_struct *t) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) - clear_task_syscall_work(t, SYSCALL_TRACEPOINT); -#else - clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); -#endif -} - -void ksu_clear_task_tracepoint_flag_if_needed(struct task_struct *t); -#endif diff --git a/drivers/kernelsu/throne_tracker.h b/drivers/kernelsu/throne_tracker.h deleted file mode 100644 index 8bb3b9a29b51..000000000000 --- a/drivers/kernelsu/throne_tracker.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef 
__KSU_H_THRONE_TRACKER -#define __KSU_H_THRONE_TRACKER - -void ksu_throne_tracker_init(void); - -void ksu_throne_tracker_exit(void); - -void track_throne(bool prune_only); - -#endif diff --git a/drivers/kernelsu/tiny_sulog.c b/drivers/kernelsu/tiny_sulog.c new file mode 100644 index 000000000000..1fc8a5b1e3dd --- /dev/null +++ b/drivers/kernelsu/tiny_sulog.c @@ -0,0 +1,125 @@ +// half assed ringbuffer +// 8 bytes +struct sulog_entry { + uint32_t s_time; // uptime in seconds + uint32_t data; // uint8_t[0,1,2] = uid, basically uint24_t, uint8_t[3] = symbol +} __attribute__((packed)); + +#define SULOG_ENTRY_MAX 250 +#define SULOG_BUFSIZ SULOG_ENTRY_MAX * (sizeof (struct sulog_entry)) + +static void *sulog_buf_ptr = NULL; +static uint8_t sulog_index_next = 0; + +static DEFINE_SPINLOCK(sulog_lock); + +static void tiny_sulog_init_heap() +{ + sulog_buf_ptr = kzalloc(SULOG_BUFSIZ, GFP_KERNEL); + if (!sulog_buf_ptr) + return; + + pr_info("sulog_init: allocated %lu bytes on 0x%p \n", SULOG_BUFSIZ, sulog_buf_ptr); +} + +/** + * + * boottime_s_get, get kernel uptime in seconds + * + * - handles sub 4.10 compat + * - we do this forced pointer cast to cut down on compat, pre 4.10, ktime is a union + * + * - bs handling 64-bit division on 32-bit (do_div) + * - remainder = do_div(dividend, divisor); dividend will hold the quotient + * - for 64-bit we can straight up just use divide + * + */ +static inline uint32_t boottime_s_get() +{ + ktime_t boottime_kt = ktime_get_boottime(); + +#ifdef CONFIG_64BIT + uint64_t boottime_s = *(uint64_t *)&boottime_kt / 1000000000; +#else + uint64_t boottime_s = *(uint64_t *)&boottime_kt; + do_div(boottime_s, 1000000000); +#endif + + return (uint32_t)boottime_s; +} + +static void write_sulog(uint8_t sym) +{ + if (!sulog_buf_ptr) + return; + + unsigned int offset = sulog_index_next * sizeof(struct sulog_entry); + struct sulog_entry entry = {0}; + + // WARNING!!! this is LE only! 
+ entry.s_time = boottime_s_get(); + entry.data = (uint32_t)current_uid().val; + *((char *)&entry.data + 3) = sym; + + // we can perform this write atomic on 64-bit + // however this still has to be locked for exclusion as theres a reader + + spin_lock(&sulog_lock); + +#ifdef CONFIG_64BIT + *(volatile uint64_t *)(sulog_buf_ptr + offset) = *(uint64_t *)&entry; +#else + __builtin_memcpy(sulog_buf_ptr + offset, &entry, sizeof(entry)); +#endif + + // move ptr for next iteration + sulog_index_next = sulog_index_next + 1; + + if (sulog_index_next >= SULOG_ENTRY_MAX) + sulog_index_next = 0; + + spin_unlock(&sulog_lock); + + return; +} + +struct sulog_entry_rcv_ptr { + uint64_t index_ptr; // send index here + uint64_t buf_ptr; // send buf here + uint64_t uptime_ptr; // uptime +}; + +static int send_sulog_dump(void __user *uptr) +{ + if (!sulog_buf_ptr) + return 1; + + struct sulog_entry_rcv_ptr sbuf = {0}; + + if (copy_from_user(&sbuf, uptr, sizeof(sbuf) )) + return 1; + + if (!sbuf.index_ptr || !sbuf.buf_ptr || !sbuf.uptime_ptr ) + return 1; + + // send uptime + + uint32_t uptime = boottime_s_get(); + + if (copy_to_user((void __user *)(uintptr_t)sbuf.uptime_ptr, &uptime, sizeof(uptime) )) + return 1; + + // send index + if (copy_to_user((void __user *)(uintptr_t)sbuf.index_ptr, &sulog_index_next, sizeof(sulog_index_next) )) + return 1; + + // send buffer data + spin_lock(&sulog_lock); + if (copy_to_user((void __user *)(uintptr_t)sbuf.buf_ptr, sulog_buf_ptr, SULOG_BUFSIZ )) { + spin_unlock(&sulog_lock); + return 1; + } + spin_unlock(&sulog_lock); + + return 0; +} diff --git a/drivers/rekernel/Kconfig b/drivers/rekernel/Kconfig deleted file mode 100644 index dadf14779fde..000000000000 --- a/drivers/rekernel/Kconfig +++ /dev/null @@ -1,15 +0,0 @@ -menu "Re:Kernel" - -config REKERNEL - bool "Re:Kernel support" - default n - help - Make tombstone users get a better experience. 
- -config REKERNEL_NETWORK - bool "Re:Kernel NetReceive unfreeze support" - depends on REKERNEL - default n - help - Make tombstone users get a better experience. -endmenu diff --git a/drivers/rekernel/Makefile b/drivers/rekernel/Makefile deleted file mode 100644 index bb613644a5f4..000000000000 --- a/drivers/rekernel/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_REKERNEL) += rekernel.o diff --git a/drivers/rekernel/rekernel.c b/drivers/rekernel/rekernel.c deleted file mode 100644 index d3783225c33c..000000000000 --- a/drivers/rekernel/rekernel.c +++ /dev/null @@ -1,333 +0,0 @@ -#include -#include - -#include -#include -#if IS_ENABLED(CONFIG_IPV6) -#include -#endif /* IS_ENABLED(CONFIG_IPV6) */ -#include -#include -#include -#include -#if IS_ENABLED(CONFIG_IPV6) -#include -#endif /* IS_ENABLED(CONFIG_IPV6) */ - -#include -#include -#include "rekernel.h" - -#define MIN_USERAPP_UID 10000 -#define MAX_SYSTEM_UID 2000 -#define SYSTEM_APP_UID 1000 -#define INTERFACETOKEN_BUFF_SIZE 140 -#define PARCEL_OFFSET 16 -#define LINE_ERROR 1 -#define LINE_SUCCESS 0 - -#define NETLINK_REKERNEL_MAX 26 -#define NETLINK_REKERNEL_MIN 22 -#define USER_PORT 100 -#define PACKET_SIZE 256 - -static const char* binder_type[] = { - "reply", - "transaction", - "free_buffer_full", -}; -static const char* rpc_type[] = { - "SYNC_BINDER_REPLY", - "SYNC_BINDER", - "FREE_BUFFER_FULL", -}; -static struct sock* netlink_socket; -extern struct net init_net; -static unsigned long netlink_unit = 0; -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry* rekernel_dir, * rekernel_unit_entry; -#endif /* CONFIG_PROC_FS */ - -static int sendMessage(char* packet_buffer, uint16_t len) { - struct sk_buff* socket_buffer; - struct nlmsghdr* netlink_hdr; - - socket_buffer = nlmsg_new(len, GFP_ATOMIC); - if (!socket_buffer) { - pr_err("netlink alloc failure.\n"); - return -LINE_ERROR; - } - - netlink_hdr = nlmsg_put(socket_buffer, 0, 0, netlink_unit, len, 0); - if (!netlink_hdr) { - pr_err("nlmsg_put 
failaure.\n"); - nlmsg_free(socket_buffer); - return -LINE_ERROR; - } - - memcpy(nlmsg_data(netlink_hdr), packet_buffer, len); - return netlink_unicast(netlink_socket, socket_buffer, USER_PORT, MSG_DONTWAIT); -} -static void netlink_rcv_msg(struct sk_buff* socket_buffer) { - struct nlmsghdr* nlhdr = NULL; - char* umsg = NULL; - - if (socket_buffer->len >= nlmsg_total_size(0)) { - nlhdr = nlmsg_hdr(socket_buffer); - umsg = nlmsg_data(nlhdr); - if (umsg) { -#ifdef CONFIG_PROC_FS - if (!memcmp(umsg, "#proc_remove", nlmsg_len(nlhdr))) { - if (rekernel_dir) { - proc_remove(rekernel_dir); - } - } -#endif /* CONFIG_PROC_FS */ - } - } -} -#ifdef CONFIG_REKERNEL_NETWORK -static unsigned int rekernel_pkg_ipv4_ipv6_in(void* priv, struct sk_buff* socket_buffer, - const struct nf_hook_state* state) { - struct sock* sk; - unsigned int thoff = 0; - unsigned short frag_off = 0; - uid_t uid; - uint hook; - struct net_device* dev = NULL; - struct tcphdr *th; - int data_len = 0; - - if (!socket_buffer || !socket_buffer->len || !state) - return NF_ACCEPT; - - hook = state->hook; - if (NF_INET_LOCAL_IN == hook) - dev = state->in; - - if (NULL == dev) - return NF_ACCEPT; - - if (ip_hdr(socket_buffer)->version == 4) { - struct iphdr *iph4 = ip_hdr(socket_buffer); - if (iph4->protocol != IPPROTO_TCP) - return NF_ACCEPT; - if (!pskb_may_pull(socket_buffer, (iph4->ihl << 2) + sizeof(struct tcphdr))) - return NF_ACCEPT; - th = (struct tcphdr *)((unsigned char *)iph4 + (iph4->ihl << 2)); - data_len = ntohs(iph4->tot_len) - (iph4->ihl << 2) - (th->doff << 2); -#if IS_ENABLED(CONFIG_IPV6) - } else if (ip_hdr(socket_buffer)->version == 6) { - struct ipv6hdr *iph6 = ipv6_hdr(socket_buffer); - if (ipv6_find_hdr(socket_buffer, &thoff, -1, &frag_off, NULL) != IPPROTO_TCP) - return NF_ACCEPT; - if (!pskb_may_pull(socket_buffer, thoff + sizeof(struct tcphdr))) - return NF_ACCEPT; - th = (struct tcphdr *)(skb_network_header(socket_buffer) + thoff); - data_len = ntohs(iph6->payload_len) - (thoff - 
sizeof(struct ipv6hdr)) - (th->doff << 2); -#endif - } else { - return NF_ACCEPT; - } - - sk = skb_to_full_sk(socket_buffer); - if (sk == NULL || !sk_fullsock(sk)) - return NF_ACCEPT; - - uid = sock_i_uid(sk).val; - if (uid < MIN_USERAPP_UID) - return NF_ACCEPT; - - if (data_len <= 0 && !th->syn && !th->fin && !th->rst) - return NF_ACCEPT; - - rekernel_report(NETWORK, ip_hdr(socket_buffer)->version, data_len, NULL, uid, NULL, true, NULL); - return NF_ACCEPT; -} -/* Only monitor input network packages */ -static struct nf_hook_ops rekernel_nf_ops[] = { - { - .hook = rekernel_pkg_ipv4_ipv6_in, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_SELINUX_LAST + 1, - }, -#if IS_ENABLED(CONFIG_IPV6) - { - .hook = rekernel_pkg_ipv4_ipv6_in, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_SELINUX_LAST + 1, - } -#endif -}; - -int register_netfilter(void) { - int rc; - struct net* net = NULL; - for_each_net(net) { - rc = nf_register_net_hooks(net, rekernel_nf_ops, ARRAY_SIZE(rekernel_nf_ops)); - if (rc) { - pr_err("register netfilter hooks failed, rc=%d\n", rc); - break; - } - } - if (rc) { - for_each_net(net) { - nf_unregister_net_hooks(net, rekernel_nf_ops, ARRAY_SIZE(rekernel_nf_ops)); - } - return -1; - } - - return LINE_SUCCESS; -} -#endif /* CONFIG_REKERNEL_NETWORK */ -struct netlink_kernel_cfg cfg = { - .input = netlink_rcv_msg, // set recv callback -}; -#ifdef CONFIG_PROC_FS -static int rekernel_unit_show(struct seq_file* m, void* v) { - seq_printf(m, "%d\n", netlink_unit); - return LINE_SUCCESS; -} -static int rekernel_unit_open(struct inode* inode, struct file* file) { - return single_open(file, rekernel_unit_show, NULL); -} -static const struct file_operations rekernel_unit_fops = { - .open = rekernel_unit_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release -}; -#endif /* CONFIG_PROC_FS */ -// init -static int start_rekernel(void) { - if (netlink_unit) - return 0; - - pr_info("Thank 
you for choosing Re:Kernel!\n"); -#ifdef CONFIG_REKERNEL_NETWORK - pr_info("NetFilter is enabled!\n"); -#endif - pr_info("Re:Kernel v8.6 | DEVELOPER: Sakion Team | Timeline | USER PORT: %d\n", USER_PORT); - pr_info("Trying to create Re:Kernel Server......\n"); - - for (netlink_unit = NETLINK_REKERNEL_MIN; netlink_unit < NETLINK_REKERNEL_MAX; netlink_unit++) { - netlink_socket = netlink_kernel_create(&init_net, netlink_unit, &cfg); - if (netlink_socket != NULL) - break; - } - if (netlink_socket == NULL) { - netlink_unit = 0; - pr_err("Failed to create Re:Kernel server!\n"); - return -LINE_ERROR; - } - pr_info("Created Re:Kernel server! NETLINK UNIT: %d\n", netlink_unit); - -#ifdef CONFIG_PROC_FS - rekernel_dir = proc_mkdir("rekernel", NULL); - if (!rekernel_dir) { - pr_err("create /proc/rekernel failed!\n"); - } else { - char buff[32]; - sprintf(buff, "%d", netlink_unit); - rekernel_unit_entry = proc_create(buff, 0644, rekernel_dir, &rekernel_unit_fops); - if (!rekernel_unit_entry) { - pr_err("create rekernel unit failed!\n"); - } - } -#endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_REKERNEL_NETWORK - if (register_netfilter()) { - pr_err("%s: Failed to hook netfilter!\n", __func__); - return -LINE_ERROR; - } -#endif /* CONFIG_REKERNEL_NETWORK */ - return LINE_SUCCESS; -} - -void rekernel_report(int reporttype, int type, pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) { - char binder_kmsg[PACKET_SIZE]; - char buf_data[INTERFACETOKEN_BUFF_SIZE]; - size_t buf_data_size; - char buf[INTERFACETOKEN_BUFF_SIZE] = { 0 }; - char* p; - int i = 0; - int j = 0; - - if (start_rekernel()) - return; - -#ifdef CONFIG_REKERNEL_NETWORK - if (reporttype == NETWORK) { - char binder_kmsg[PACKET_SIZE]; - snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Network,target=%d,proto=ipv%d,data_len=%d;", dst_pid, type, src_pid); - sendMessage(binder_kmsg, strlen(binder_kmsg)); - return; - } -#endif /* 
CONFIG_REKERNEL_NETWORK */ - - if (!frozen_task_group(dst)) - return; - - if (task_uid(src).val == task_uid(dst).val) - return; - - switch (reporttype) { - case BINDER: - if (oneway && type == TRANSACTION) { - if (tr->code < 29 || tr->code > 32) - return; - buf_data_size = tr->data_size > INTERFACETOKEN_BUFF_SIZE ? INTERFACETOKEN_BUFF_SIZE : tr->data_size; - if (copy_from_user(buf_data, (char*)tr->data.ptr.buffer, buf_data_size)) - return; - j = PARCEL_OFFSET + 1; - p = (char*)(buf_data)+PARCEL_OFFSET; - while (i < INTERFACETOKEN_BUFF_SIZE && j < buf_data_size && *p != '\0') { - buf[i++] = *p; - j += 2; - p += 2; - } - if (i == INTERFACETOKEN_BUFF_SIZE) { - buf[i - 1] = '\0'; - } - snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Binder,bindertype=%s,oneway=%d,from_pid=%d,from=%d,target_pid=%d,target=%d,rpc_name=%s,code=%d;", binder_type[type], oneway, src_pid, task_uid(src).val, dst_pid, task_uid(dst).val, buf, tr->code); - } else { - snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Binder,bindertype=%s,oneway=%d,from_pid=%d,from=%d,target_pid=%d,target=%d;", binder_type[type], oneway, src_pid, task_uid(src).val, dst_pid, task_uid(dst).val, rpc_type[type], -1); - } - break; - case SIGNAL: - snprintf(binder_kmsg, sizeof(binder_kmsg), "type=Signal,signal=%d,killer_pid=%d,killer=%d,dst_pid=%d,dst=%d;", type, src_pid, task_uid(src).val, dst_pid, task_uid(dst).val); - break; - default: - return; - } - sendMessage(binder_kmsg, strlen(binder_kmsg)); -} - -void binder_reply_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) { - if (unlikely(!dst)) - return; - if (task_uid(dst).val > MAX_SYSTEM_UID || src_pid == dst_pid) - return; - - // oneway=0 - rekernel_report(BINDER, REPLY, src_pid, src, dst_pid, dst, oneway, tr); -} - -void binder_trans_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) { - if 
(unlikely(!dst)) - return; - if ((task_uid(dst).val <= MIN_USERAPP_UID) || src_pid == dst_pid) - return; - - rekernel_report(BINDER, TRANSACTION, src_pid, src, dst_pid, dst, oneway, tr); -} - -void binder_overflow_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr) { - if (unlikely(!dst)) - return; - - // oneway=1 - rekernel_report(BINDER, OVERFLOW, src_pid, src, dst_pid, dst, oneway, tr); -} diff --git a/drivers/rekernel/rekernel.h b/drivers/rekernel/rekernel.h deleted file mode 100644 index af7022a8535c..000000000000 --- a/drivers/rekernel/rekernel.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __REKERNEL_H -#define __REKERNEL_H - -#include -#include -#include -#include - -enum report_type { - BINDER, - SIGNAL, -#ifdef CONFIG_REKERNEL_NETWORK - NETWORK, -#endif /* CONFIG_REKERNEL_NETWORK */ -}; -enum binder_type { - REPLY, - TRANSACTION, - OVERFLOW, -}; - -static inline bool jobctl_frozen(struct task_struct* task) { - return ((task->jobctl & JOBCTL_TRAP_FREEZE) != 0); -} -static inline bool frozen_task_group(struct task_struct* task) { - return (jobctl_frozen(task) || cgroup_freezing(task)); -} - -extern void rekernel_report(int reporttype, int type, pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr); -extern void binder_reply_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr); -extern void binder_trans_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr); -extern void binder_overflow_handler(pid_t src_pid, struct task_struct* src, pid_t dst_pid, struct task_struct* dst, bool oneway, struct binder_transaction_data* tr); - -#endif /* __REKERNEL_H */ diff --git a/fs/exec.c b/fs/exec.c index c2530fed584d..351ce34f1226 100644 --- a/fs/exec.c +++ 
b/fs/exec.c @@ -1676,13 +1676,6 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -#ifdef CONFIG_KSU_MANUAL_HOOK -extern bool ksu_execveat_hook __read_mostly; -extern int ksu_handle_execveat(int *fd, struct filename **filename_ptr, void *argv, - void *envp, int *flags); -extern int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr, - void *argv, void *envp, int *flags); -#endif static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, @@ -1694,13 +1687,6 @@ static int do_execveat_common(int fd, struct filename *filename, struct files_struct *displaced; int retval; -#ifdef CONFIG_KSU_MANUAL_HOOK - if (unlikely(ksu_execveat_hook)) - ksu_handle_execveat(&fd, &filename, &argv, &envp, &flags); - else - ksu_handle_execveat_sucompat(&fd, &filename, &argv, &envp, &flags); -#endif - if (IS_ERR(filename)) return PTR_ERR(filename); @@ -1851,12 +1837,21 @@ static int do_execveat_common(int fd, struct filename *filename, return retval; } +#ifdef CONFIG_KSU +__attribute__((hot)) +extern int ksu_handle_execveat(int *fd, struct filename **filename_ptr, + void *argv, void *envp, int *flags); +#endif + int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; +#ifdef CONFIG_KSU + ksu_handle_execveat((int *)AT_FDCWD, &filename, &argv, &envp, 0); +#endif return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } @@ -1884,6 +1879,9 @@ static int compat_do_execve(struct filename *filename, .is_compat = true, .ptr.compat = __envp, }; +#ifdef CONFIG_KSU // 32-bit ksud and 32-on-64 support + ksu_handle_execveat((int *)AT_FDCWD, &filename, &argv, &envp, 0); +#endif return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } diff --git a/fs/file.c b/fs/file.c index 
73b85f676357..be0792c0a231 100644 --- a/fs/file.c +++ b/fs/file.c @@ -656,37 +656,6 @@ int __close_fd(struct files_struct *files, unsigned fd) return -EBADF; } -/* - * variant of close_fd that gets a ref on the file for later fput. - * The caller must ensure that filp_close() called on the file, and then - * an fput(). - */ -int close_fd_get_file(unsigned int fd, struct file **res) -{ - struct files_struct *files = current->files; - struct file *file; - struct fdtable *fdt; - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (fd >= fdt->max_fds) - goto out_unlock; - file = fdt->fd[fd]; - if (!file) - goto out_unlock; - rcu_assign_pointer(fdt->fd[fd], NULL); - __put_unused_fd(files, fd); - spin_unlock(&files->file_lock); - get_file(file); - *res = file; - return 0; - -out_unlock: - spin_unlock(&files->file_lock); - *res = NULL; - return -ENOENT; -} - void do_close_on_exec(struct files_struct *files) { unsigned i; diff --git a/fs/internal.h b/fs/internal.h index 380bae4c5ff7..3e58863de514 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -68,7 +68,6 @@ extern int finish_automount(struct vfsmount *, struct path *); extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -int path_umount(struct path *path, int flags); extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index b70288a713b3..27358c854203 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -516,7 +516,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) goto out_put; rc = 0; - of->mmapped = true; + of->mmapped = 1; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 1c2ea6ca0381..d5b149a45be1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -123,10 +123,8 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, return dentry; knparent = 
find_next_ancestor(kn, NULL); - if (WARN_ON(!knparent)) { - dput(dentry); + if (WARN_ON(!knparent)) return ERR_PTR(-EINVAL); - } do { struct dentry *dtmp; @@ -135,11 +133,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, if (kn == knparent) return dentry; kntmp = find_next_ancestor(kn, knparent); - if (WARN_ON(!kntmp)) { - dput(dentry); + if (WARN_ON(!kntmp)) return ERR_PTR(-EINVAL); - } - dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name)); + dtmp = lookup_one_len_unlocked(kntmp->name, dentry, + strlen(kntmp->name)); dput(dentry); if (IS_ERR(dtmp)) return dtmp; diff --git a/fs/open.c b/fs/open.c index 66fadbdfd17a..7dc516777071 100644 --- a/fs/open.c +++ b/fs/open.c @@ -355,15 +355,17 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) return error; } +#ifdef CONFIG_KSU +__attribute__((hot)) +extern int ksu_handle_faccessat(int *dfd, const char __user **filename_user, + int *mode, int *flags); +#endif + /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. 
*/ -#ifdef CONFIG_KSU_MANUAL_HOOK -extern int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, - int *flags); -#endif SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { const struct cred *old_cred; @@ -373,7 +375,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) struct vfsmount *mnt; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; -#ifdef CONFIG_KSU_MANUAL_HOOK + +#ifdef CONFIG_KSU ksu_handle_faccessat(&dfd, &filename, &mode, NULL); #endif diff --git a/fs/read_write.c b/fs/read_write.c index 4f892b7649d5..901231269242 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -456,19 +456,10 @@ ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, } EXPORT_SYMBOL(__vfs_read); -#ifdef CONFIG_KSU_MANUAL_HOOK -extern bool ksu_vfs_read_hook __read_mostly; -extern int ksu_handle_vfs_read(struct file **file_ptr, char __user **buf_ptr, - size_t *count_ptr, loff_t **pos); -#endif ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; -#ifdef CONFIG_KSU_MANUAL_HOOK - if (unlikely(ksu_vfs_read_hook)) - ksu_handle_vfs_read(&file, &buf, &count, &pos); -#endif if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) diff --git a/fs/stat.c b/fs/stat.c index 0d099fff8b82..6c795dd237bc 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -87,9 +87,6 @@ int vfs_fstat(unsigned int fd, struct kstat *stat) } EXPORT_SYMBOL(vfs_fstat); -#ifdef CONFIG_KSU_MANUAL_HOOK -extern int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags); -#endif int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flag) { @@ -97,9 +94,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; unsigned int lookup_flags = 0; -#ifdef CONFIG_KSU_MANUAL_HOOK - ksu_handle_stat(&dfd, &filename, &flag); -#endif if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | 
AT_EMPTY_PATH)) != 0) goto out; @@ -293,6 +287,12 @@ SYSCALL_DEFINE2(newlstat, const char __user *, filename, return cp_new_stat(&stat, statbuf); } +#ifdef CONFIG_KSU +__attribute__((hot)) +extern int ksu_handle_stat(int *dfd, const char __user **filename_user, + int *flags); +#endif + #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, struct stat __user *, statbuf, int, flag) @@ -300,6 +300,9 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, struct kstat stat; int error; +#ifdef CONFIG_KSU + ksu_handle_stat(&dfd, &filename, &flag); +#endif error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; @@ -307,6 +310,13 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, } #endif +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) +extern void ksu_handle_newfstat_ret(unsigned int *fd, struct stat __user **statbuf_ptr); +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) +extern void ksu_handle_fstat64_ret(unsigned long *fd, struct stat64 __user **statbuf_ptr); // for 32-bit +#endif +#endif + SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) { struct kstat stat; @@ -315,6 +325,9 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) if (!error) error = cp_new_stat(&stat, statbuf); +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) + ksu_handle_newfstat_ret(&fd, &statbuf); +#endif return error; } @@ -433,6 +446,9 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf) if (!error) error = cp_new_stat64(&stat, statbuf); +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) // for 32-bit + ksu_handle_fstat64_ret(&fd, &statbuf); +#endif return error; } @@ -442,6 +458,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, struct kstat stat; int error; +#ifdef CONFIG_KSU // 32-bit su + 
ksu_handle_stat(&dfd, &filename, &flag); +#endif error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1778f36ac1ce..9c41956dc9ca 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -45,7 +45,7 @@ struct pr_ops; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ -#define BLKCG_MAX_POLS 3 +#define BLKCG_MAX_POLS 2 typedef void (rq_end_io_fn)(struct request *, int); diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ab429b48f8bd..35a28e4fb2dd 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -63,38 +62,18 @@ enum { * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, - - /* Control group has to be frozen. */ - CGRP_FREEZE, - - /* Cgroup is frozen. */ - CGRP_FROZEN, }; /* cgroup_root->flags */ enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ - - /* - * Consider namespaces as delegation boundaries. If this flag is - * set, controller specific interface files in a namespace root - * aren't writeable from inside the namespace. - */ - CGRP_ROOT_NS_DELEGATE = (1 << 3), - - /* - * Enable cpuset controller in v1 cgroup to use v2 behavior. 
- */ - CGRP_ROOT_CPUSET_V2_MODE = (1 << 4), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ - CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ - CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_PRESSURE = (1 << 6), /* only if pressure feature is enabled */ @@ -131,6 +110,9 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; + /* PI: the parent css */ + struct cgroup_subsys_state *parent; + /* siblings list anchored at the parent's ->children */ struct list_head sibling; struct list_head children; @@ -160,12 +142,6 @@ struct cgroup_subsys_state { /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; - - /* - * PI: the parent css. Placed here for cache proximity to following - * fields of the containing structure. - */ - struct cgroup_subsys_state *parent; }; /* @@ -176,29 +152,14 @@ struct cgroup_subsys_state { * set for a task. */ struct css_set { - /* - * Set of subsystem states, one for each subsystem. This array is - * immutable after creation apart from the init_css_set during - * subsystem registration (at boot time). - */ - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; - - /* reference count */ - refcount_t refcount; + /* Reference count */ + atomic_t refcount; /* - * For a domain cgroup, the following points to self. If threaded, - * to the matching cset of the nearest domain ancestor. The - * dom_cset provides access to the domain cgroup and its csses to - * which domain level resource consumptions should be charged. + * List running through all cgroup groups in the same hash + * slot. 
Protected by css_set_lock */ - struct css_set *dom_cset; - - /* the default cgroup associated with this css_set */ - struct cgroup *dfl_cgrp; - - /* internal task count, protected by css_set_lock */ - int nr_tasks; + struct hlist_node hlist; /* * Lists running through all tasks using this cgroup group. @@ -209,42 +170,28 @@ struct css_set { */ struct list_head tasks; struct list_head mg_tasks; - struct list_head dying_tasks; - - /* all css_task_iters currently walking this cset */ - struct list_head task_iters; /* - * On the default hierarhcy, ->subsys[ssid] may point to a css - * attached to an ancestor instead of the cgroup this css_set is - * associated with. The following node is anchored at - * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to - * iterate through all css's attached to a given cgroup. + * List of cgrp_cset_links pointing at cgroups referenced from this + * css_set. Protected by css_set_lock. */ - struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; - - /* all threaded csets whose ->dom_cset points to this cset */ - struct list_head threaded_csets; - struct list_head threaded_csets_node; + struct list_head cgrp_links; - /* - * List running through all cgroup groups in the same hash - * slot. Protected by css_set_lock - */ - struct hlist_node hlist; + /* the default cgroup associated with this css_set */ + struct cgroup *dfl_cgrp; /* - * List of cgrp_cset_links pointing at cgroups referenced from this - * css_set. Protected by css_set_lock. + * Set of subsystem states, one for each subsystem. This array is + * immutable after creation apart from the init_css_set during + * subsystem registration (at boot time). */ - struct list_head cgrp_links; + struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; /* * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. 
*/ - struct list_head mg_src_preload_node; - struct list_head mg_dst_preload_node; + struct list_head mg_preload_node; struct list_head mg_node; /* @@ -258,6 +205,18 @@ struct css_set { struct cgroup *mg_dst_cgrp; struct css_set *mg_dst_cset; + /* + * On the default hierarhcy, ->subsys[ssid] may point to a css + * attached to an ancestor instead of the cgroup this css_set is + * associated with. The following node is anchored at + * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to + * iterate through all css's attached to a given cgroup. + */ + struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + + /* all css_task_iters currently walking this cset */ + struct list_head task_iters; + /* dead and being drained, ignore for migration */ bool dead; @@ -265,25 +224,6 @@ struct css_set { struct rcu_head rcu_head; }; -struct cgroup_freezer_state { - /* Should the cgroup and its descendants be frozen. */ - bool freeze; - - /* Should the cgroup actually be frozen? */ - int e_freeze; - - /* Fields below are protected by css_set_lock */ - - /* Number of frozen descendant cgroups */ - int nr_frozen_descendants; - - /* - * Number of tasks, which are counted as frozen: - * frozen, SIGSTOPped, and PTRACEd. - */ - int nr_frozen_tasks; -}; - struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; @@ -308,40 +248,13 @@ struct cgroup { */ int level; - /* Maximum allowed descent tree depth */ - int max_depth; - - /* - * Keep track of total numbers of visible and dying descent cgroups. - * Dying cgroups are cgroups which were deleted by a user, - * but are still existing because someone else is holding a reference. - * max_descendants is a maximum allowed number of descent cgroups. - * - * nr_descendants and nr_dying_descendants are protected - * by cgroup_mutex and css_set_lock. It's fine to read them holding - * any of cgroup_mutex and css_set_lock; for writing both locks - * should be held. 
- */ - int nr_descendants; - int nr_dying_descendants; - int max_descendants; - /* * Each non-empty css_set associated with this cgroup contributes - * one to nr_populated_csets. The counter is zero iff this cgroup - * doesn't have any tasks. - * - * All children which have non-zero nr_populated_csets and/or - * nr_populated_children of their own contribute one to either - * nr_populated_domain_children or nr_populated_threaded_children - * depending on their type. Each counter is zero iff all cgroups - * of the type in the subtree proper don't have any tasks. + * one to populated_cnt. All children with non-zero popuplated_cnt + * of their own contribute one. The count is zero iff there's no + * task in this cgroup or its subtree. */ - int nr_populated_csets; - int nr_populated_domain_children; - int nr_populated_threaded_children; - - int nr_threaded_children; /* # of live threaded child cgroups */ + int populated_cnt; struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ @@ -379,16 +292,6 @@ struct cgroup { */ struct list_head e_csets[CGROUP_SUBSYS_COUNT]; - /* - * If !threaded, self. If threaded, it points to the nearest - * domain ancestor. Inside a threaded subtree, cgroups are exempt - * from process granularity and no-internal-task constraint. - * Domain level resource consumptions which aren't tied to a - * specific task are charged to the dom_cgrp. - */ - struct cgroup *dom_cgrp; - struct cgroup *old_dom_cgrp; /* used while enabling threaded */ - /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. @@ -408,9 +311,6 @@ struct cgroup { /* used to store eBPF programs */ struct cgroup_bpf bpf; - /* Used to store internal freezer state */ - struct cgroup_freezer_state freezer; - /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; @@ -548,7 +448,7 @@ struct cftype { /* * Control Group subsystem type. 
- * See Documentation/cgroup-v1/cgroups.txt for details + * See Documentation/cgroups/cgroups.txt for details */ struct cgroup_subsys { struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); @@ -567,7 +467,7 @@ struct cgroup_subsys { void (*cancel_fork)(struct task_struct *task); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); - void (*release)(struct task_struct *task); + void (*free)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); bool early_init:1; @@ -585,18 +485,6 @@ struct cgroup_subsys { */ bool implicit_on_dfl:1; - /* - * If %true, the controller, supports threaded mode on the default - * hierarchy. In a threaded subtree, both process granularity and - * no-internal-process constraint are ignored and a threaded - * controllers should be able to handle that. - * - * Note that as an implicit controller is automatically enabled on - * all cgroups on the default hierarchy, it should also be - * threaded. implicit && !threaded is not supported. 
- */ - bool threaded:1; - /* * If %false, this subsystem is properly hierarchical - * configuration, resource accounting and restriction on a parent diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 55a878aebe21..4e93ff0e45ba 100755 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -17,11 +17,11 @@ #include #include #include +#include #include #include #include #include -#include #include @@ -36,33 +36,18 @@ #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 -/* walk only threadgroup leaders */ -#define CSS_TASK_ITER_PROCS (1U << 0) -/* walk all threaded css_sets in the domain */ -#define CSS_TASK_ITER_THREADED (1U << 1) - -/* internal flags */ -#define CSS_TASK_ITER_SKIPPED (1U << 16) - /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; - unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; - struct list_head *tcset_pos; - struct list_head *tcset_head; - struct list_head *task_pos; struct list_head *tasks_head; struct list_head *mg_tasks_head; - struct list_head *dying_tasks_head; - struct list_head *cur_tasks_head; struct css_set *cur_cset; - struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; @@ -122,7 +107,6 @@ extern int cgroup_can_fork(struct task_struct *p); extern void cgroup_cancel_fork(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); void cgroup_exit(struct task_struct *p); -void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); @@ -145,7 +129,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); -void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, +void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it); 
struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); @@ -282,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it); * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css - * @tset: taskset to iterate + * @tset: takset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. @@ -563,27 +547,6 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, return cgrp->ancestor_ids[ancestor->level] == ancestor->id; } -/** - * cgroup_ancestor - find ancestor of cgroup - * @cgrp: cgroup to find ancestor of - * @ancestor_level: level of ancestor to find starting from root - * - * Find ancestor of cgroup at specified level starting from root if it exists - * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at - * @ancestor_level. - * - * This function is safe to call as long as @cgrp is accessible. 
- */ -static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, - int ancestor_level) -{ - if (cgrp->level < ancestor_level) - return NULL; - while (cgrp && cgrp->level > ancestor_level) - cgrp = cgroup_parent(cgrp); - return cgrp; -} - /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested @@ -604,8 +567,7 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + - cgrp->nr_populated_threaded_children; + return cgrp->populated_cnt; } /* returns ino associated with a cgroup */ @@ -709,7 +671,6 @@ static inline int cgroup_can_fork(struct task_struct *p) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p) {} static inline void cgroup_post_fork(struct task_struct *p) {} static inline void cgroup_exit(struct task_struct *p) {} -static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } @@ -788,7 +749,7 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { - refcount_t count; + atomic_t count; struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; @@ -823,56 +784,13 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) - refcount_inc(&ns->count); + atomic_inc(&ns->count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->count)) + if (ns && atomic_dec_and_test(&ns->count)) free_cgroup_ns(ns); } -#ifdef CONFIG_CGROUPS - -void cgroup_enter_frozen(void); -void cgroup_leave_frozen(bool always_leave); -void 
cgroup_update_frozen(struct cgroup *cgrp); -void cgroup_freeze(struct cgroup *cgrp, bool freeze); -void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, - struct cgroup *dst); -void cgroup_freezer_frozen_exit(struct task_struct *task); -static inline bool cgroup_task_freeze(struct task_struct *task) -{ - bool ret; - - if (task->flags & PF_KTHREAD) - return false; - - rcu_read_lock(); - ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags); - rcu_read_unlock(); - - return ret; -} - -static inline bool cgroup_task_frozen(struct task_struct *task) -{ - return task->frozen; -} - -#else /* !CONFIG_CGROUPS */ - -static inline void cgroup_enter_frozen(void) { } -static inline void cgroup_leave_frozen(bool always_leave) { } -static inline bool cgroup_task_freeze(struct task_struct *task) -{ - return false; -} -static inline bool cgroup_task_frozen(struct task_struct *task) -{ - return false; -} - -#endif /* !CONFIG_CGROUPS */ - #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h deleted file mode 100644 index e94290b29e99..000000000000 --- a/include/linux/cgroup_rdma.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (C) 2016 Parav Pandit - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. - */ - -#ifndef _CGROUP_RDMA_H -#define _CGROUP_RDMA_H - -#include - -enum rdmacg_resource_type { - RDMACG_RESOURCE_HCA_HANDLE, - RDMACG_RESOURCE_HCA_OBJECT, - RDMACG_RESOURCE_MAX, -}; - -#ifdef CONFIG_CGROUP_RDMA - -struct rdma_cgroup { - struct cgroup_subsys_state css; - - /* - * head to keep track of all resource pools - * that belongs to this cgroup. 
- */ - struct list_head rpools; -}; - -struct rdmacg_device { - struct list_head dev_node; - struct list_head rpools; - char *name; -}; - -/* - * APIs for RDMA/IB stack to publish when a device wants to - * participate in resource accounting - */ -int rdmacg_register_device(struct rdmacg_device *device); -void rdmacg_unregister_device(struct rdmacg_device *device); - -/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */ -int rdmacg_try_charge(struct rdma_cgroup **rdmacg, - struct rdmacg_device *device, - enum rdmacg_resource_type index); -void rdmacg_uncharge(struct rdma_cgroup *cg, - struct rdmacg_device *device, - enum rdmacg_resource_type index); -#endif /* CONFIG_CGROUP_RDMA */ -#endif /* _CGROUP_RDMA_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ff4cad3a2275..7f4a2a5a2a77 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -60,10 +60,6 @@ SUBSYS(hugetlb) SUBSYS(pids) #endif -#if IS_ENABLED(CONFIG_CGROUP_RDMA) -SUBSYS(rdma) -#endif - /* * The following subsystems are not supported on the default hierarchy. 
*/ diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 3cfe2d27811b..d807fa9b2051 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -56,7 +56,7 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); -extern void cpuset_update_active_cpus(void); +extern void cpuset_update_active_cpus(bool cpu_online); extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); @@ -172,7 +172,7 @@ static inline void cpuset_init_smp(void) {} static inline void cpuset_force_rebuild(void) { } -static inline void cpuset_update_active_cpus(void) +static inline void cpuset_update_active_cpus(bool cpu_online) { partition_sched_domains(1, NULL, NULL); } diff --git a/include/linux/cred.h b/include/linux/cred.h index 796dc4380de0..09debf2e047f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -252,18 +252,6 @@ static inline const struct cred *get_cred(const struct cred *cred) return get_new_cred(nonconst_cred); } -static inline const struct cred *get_cred_rcu(const struct cred *cred) -{ - struct cred *nonconst_cred = (struct cred *) cred; - if (!cred) - return NULL; - if (!atomic_inc_not_zero(&nonconst_cred->usage)) - return NULL; - validate_creds(cred); - nonconst_cred->non_rcu = 0; - return cred; -} - /** * put_cred - Release a reference to a set of credentials * @cred: The credentials to release diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index b01be50dbb24..442b54a14cbc 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -120,7 +120,6 @@ extern void __fd_install(struct files_struct *files, unsigned int fd, struct file *file); extern int __close_fd(struct files_struct *files, unsigned int fd); -extern int close_fd_get_file(unsigned int fd, struct file **res); extern struct kmem_cache 
*files_cachep; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 4df9b50cb1c3..44e529353b6b 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -188,7 +188,7 @@ struct kernfs_open_file { char *prealloc_buf; size_t atomic_write_len; - bool mmapped:1; + bool mmapped; bool released:1; const struct vm_operations_struct *vm_ops; }; diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a548961be39..d2d7208b2274 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,28 +32,6 @@ struct user_struct; struct writeback_control; struct bdi_writeback; -/** - * mmgrab() - Pin a &struct mm_struct. - * @mm: The &struct mm_struct to pin. - * - * Make sure that @mm will not get freed even after the owning task - * exits. This doesn't guarantee that the associated address space - * will still exist later on and mmget_not_zero() has to be used before - * accessing it. - * - * This is a preferred way to pin @mm for a longer/unbounded amount - * of time. - * - * Use mmdrop() to release the reference acquired by mmgrab(). - * - * See also for an in-depth explanation - * of &mm_struct.mm_count vs &mm_struct.mm_users. - */ -static inline void mmgrab(struct mm_struct *mm) -{ - atomic_inc(&mm->mm_count); -} - #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; diff --git a/include/linux/sched.h b/include/linux/sched.h index d8106413464f..32111634c69b 100755 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1829,8 +1829,6 @@ struct task_struct { #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; - /* task is frozen/stopped (used by the cgroup freezer) */ - unsigned frozen:1; #endif unsigned long atomic_flags; /* Flags needing atomic access. 
*/ @@ -2650,7 +2648,6 @@ TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ -#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -2659,7 +2656,6 @@ TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) #define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) -#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 975be862e083..9089a2ae913d 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -1,7 +1,5 @@ -#ifndef _LINUX_SCHED_DEADLINE_H -#define _LINUX_SCHED_DEADLINE_H - -#include +#ifndef _SCHED_DEADLINE_H +#define _SCHED_DEADLINE_H /* * SCHED_DEADLINE tasks has negative priorities, reflecting @@ -28,4 +26,4 @@ static inline bool dl_time_before(u64 a, u64 b) return (s64)(a - b) < 0; } -#endif /* _LINUX_SCHED_DEADLINE_H */ +#endif /* _SCHED_DEADLINE_H */ diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 2cc450f6ec54..d9cf5a5762d9 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -1,5 +1,5 @@ -#ifndef _LINUX_SCHED_PRIO_H -#define _LINUX_SCHED_PRIO_H +#ifndef _SCHED_PRIO_H +#define _SCHED_PRIO_H #define MAX_NICE 19 #define MIN_NICE -20 @@ -57,4 +57,4 @@ static inline long rlimit_to_nice(long prio) return (MAX_NICE - prio + 1); } -#endif /* _LINUX_SCHED_PRIO_H */ +#endif /* _SCHED_PRIO_H */ diff --git a/include/linux/sched/rt.h 
b/include/linux/sched/rt.h index 3bd668414f61..a30b172df6e1 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -1,9 +1,7 @@ -#ifndef _LINUX_SCHED_RT_H -#define _LINUX_SCHED_RT_H +#ifndef _SCHED_RT_H +#define _SCHED_RT_H -#include - -struct task_struct; +#include static inline int rt_prio(int prio) { @@ -59,4 +57,4 @@ extern void normalize_rt_tasks(void); */ #define RR_TIMESLICE (100 * HZ / 1000) -#endif /* _LINUX_SCHED_RT_H */ +#endif /* _SCHED_RT_H */ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 07207044b5f4..9ff03d20b986 100755 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -1,9 +1,5 @@ -#ifndef _LINUX_SCHED_SYSCTL_H -#define _LINUX_SCHED_SYSCTL_H - -#include - -struct ctl_table; +#ifndef _SCHED_SYSCTL_H +#define _SCHED_SYSCTL_H #ifdef CONFIG_DETECT_HUNG_TASK extern int sysctl_hung_task_check_count; @@ -152,4 +148,4 @@ extern int sched_little_cluster_coloc_fmin_khz_handler(struct ctl_table *table, extern char sched_lib_name[LIB_PATH_LENGTH]; extern unsigned int sched_lib_mask_force; extern bool is_sched_lib_based_app(pid_t pid); -#endif /* _LINUX_SCHED_SYSCTL_H */ +#endif /* _SCHED_SYSCTL_H */ diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index d75248d81499..e305b66a9fb9 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -139,20 +139,6 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); -#define DEFINE_SHOW_ATTRIBUTE(__name) \ -static int __name ## _open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, __name ## _show, inode->i_private); \ -} \ - \ -static const struct file_operations __name ## _fops = { \ - .owner = THIS_MODULE, \ - .open = __name ## _open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -} - static inline struct 
user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index dab24c19c82a..3558b58da3e4 100755 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -1,4 +1,3 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * Copyright (C) 2008 Google, Inc. * @@ -67,7 +66,6 @@ enum flat_binder_object_flags { * @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds. */ FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, - /** * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy * @@ -89,6 +87,7 @@ enum flat_binder_object_flags { * scheduling policy from the caller (for synchronous transactions). */ FLAT_BINDER_FLAG_INHERIT_RT = 0x800, +#ifdef __KERNEL__ /** * @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts @@ -97,6 +96,7 @@ enum flat_binder_object_flags { * context */ FLAT_BINDER_FLAG_TXN_SECURITY_CTX = 0x1000, +#endif /* __KERNEL__ */ }; #ifdef BINDER_IPC_32BIT @@ -265,25 +265,6 @@ struct binder_node_info_for_ref { __u32 reserved3; }; -struct binder_freeze_info { - __u32 pid; - __u32 enable; - __u32 timeout_ms; -}; - -struct binder_frozen_status_info { - __u32 pid; - - /* process received sync transactions since last frozen - * bit 0: received sync transaction after being frozen - * bit 1: new pending sync transaction during freezing - */ - __u32 sync_recv; - - /* process received async transactions since last frozen */ - __u32 async_recv; -}; - #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -294,9 +275,6 @@ struct binder_frozen_status_info { #define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) #define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref) #define BINDER_SET_CONTEXT_MGR_EXT _IOW('b', 13, struct flat_binder_object) 
-#define BINDER_FREEZE _IOW('b', 14, struct binder_freeze_info) -#define BINDER_GET_FROZEN_INFO _IOWR('b', 15, struct binder_frozen_status_info) -#define BINDER_ENABLE_ONEWAY_SPAM_DETECTION _IOW('b', 16, __u32) /* * NOTE: Two special error codes you should check for when calling @@ -319,7 +297,6 @@ enum transaction_flags { TF_STATUS_CODE = 0x08, /* contents are a 32-bit status code */ TF_ACCEPT_FDS = 0x10, /* allow replies with file descriptors */ TF_CLEAR_BUF = 0x20, /* clear buffer on txn complete */ - TF_UPDATE_TXN = 0x40, /* update the outdated pending async txn */ }; struct binder_transaction_data { @@ -357,11 +334,13 @@ struct binder_transaction_data { } data; }; +#ifdef __KERNEL__ struct binder_transaction_data_secctx { struct binder_transaction_data transaction_data; binder_uintptr_t secctx; }; +#endif /* __KERNEL__ */ struct binder_transaction_data_sg { struct binder_transaction_data transaction_data; binder_size_t buffers_size; @@ -397,11 +376,13 @@ enum binder_driver_return_protocol { BR_OK = _IO('r', 1), /* No parameters! */ +#ifdef __KERNEL__ BR_TRANSACTION_SEC_CTX = _IOR('r', 2, struct binder_transaction_data_secctx), /* * binder_transaction_data_secctx: the received command. */ +#endif /* __KERNEL__ */ BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data), BR_REPLY = _IOR('r', 3, struct binder_transaction_data), /* @@ -476,22 +457,9 @@ enum binder_driver_return_protocol { BR_FAILED_REPLY = _IO('r', 17), /* - * The last transaction (either a bcTRANSACTION or + * The the last transaction (either a bcTRANSACTION or * a bcATTEMPT_ACQUIRE) failed (e.g. out of memory). No parameters. */ - - BR_FROZEN_REPLY = _IO('r', 18), - /* - * The target of the last transaction (either a bcTRANSACTION or - * a bcATTEMPT_ACQUIRE) is frozen. No parameters. 
- */ - - BR_ONEWAY_SPAM_SUSPECT = _IO('r', 19), - /* - * Current process sent too many oneway calls to target, and the last - * asynchronous transaction makes the allocated async buffer size exceed - * detection threshold. No parameters. - */ }; enum binder_driver_command_protocol { @@ -578,3 +546,4 @@ enum binder_driver_command_protocol { }; #endif /* _UAPI_LINUX_BINDER_H */ + diff --git a/include/uapi/linux/android/binderfs.h b/include/uapi/linux/android/binderfs.h deleted file mode 100644 index 87410477aea9..000000000000 --- a/include/uapi/linux/android/binderfs.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2018 Canonical Ltd. - * - */ - -#ifndef _UAPI_LINUX_BINDERFS_H -#define _UAPI_LINUX_BINDERFS_H - -#include -#include -#include - -#define BINDERFS_MAX_NAME 255 - -/** - * struct binderfs_device - retrieve information about a new binder device - * @name: the name to use for the new binderfs binder device - * @major: major number allocated for binderfs binder devices - * @minor: minor number allocated for the new binderfs binder device - * - */ -struct binderfs_device { - char name[BINDERFS_MAX_NAME + 1]; - __u32 major; - __u32 minor; -}; - -/** - * Allocate a new binder device. - */ -#define BINDER_CTL_ADD _IOWR('b', 1, struct binderfs_device) - -#endif /* _UAPI_LINUX_BINDERFS_H */ - diff --git a/init/Kconfig b/init/Kconfig index bd93c3f5015a..25fb46dd2b56 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -858,16 +858,6 @@ config CGROUP_PIDS since the PIDs limit only affects a process's ability to fork, not to attach to a cgroup. -config CGROUP_RDMA - bool "RDMA controller" - help - Provides enforcement of RDMA resources defined by IB stack. - It is fairly easy for consumers to exhaust RDMA resources, which - can result into resource unavailability to other consumers. - RDMA controller is designed to stop this from happening. 
- Attaching processes with active RDMA resources to the cgroup - hierarchy is allowed even if can cross the hierarchy's limit. - config CGROUP_FREEZER bool "Freezer controller" help @@ -946,14 +936,11 @@ config CGROUP_BPF inet sockets. config CGROUP_DEBUG - bool "Debug controller" + bool "Example controller" default n - depends on DEBUG_KERNEL help This option enables a simple controller that exports - debugging information about the cgroups framework. This - controller is for control cgroup debugging only. Its - interfaces are not stable. + debugging information about the cgroups framework. Say N. diff --git a/kernel/Makefile b/kernel/Makefile index 9fec7d39f4b0..f3a91fa080bf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -65,7 +65,10 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o -obj-$(CONFIG_CGROUPS) += cgroup/ +obj-$(CONFIG_CGROUPS) += cgroup.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o +obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o +obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup.c similarity index 70% rename from kernel/cgroup/cgroup.c rename to kernel/cgroup.c index 37cdbeb85a92..5c6deb033c96 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup.c @@ -28,13 +28,15 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include "cgroup-internal.h" - +#include #include +#include #include #include #include +#include #include +#include #include #include #include @@ -45,11 +47,16 @@ #include #include #include +#include +#include +#include +#include #include +#include #include +#include /* TODO: replace with more sophisticated array */ #include #include -#include #include #include #include @@ -61,6 +68,14 @@ #define CREATE_TRACE_POINTS #include +/* + * pidlists linger the following amount 
before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) @@ -74,12 +89,14 @@ * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ +#ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); - -#ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); +#else +static DEFINE_MUTEX(cgroup_mutex); +static DEFINE_SPINLOCK(css_set_lock); #endif /* @@ -94,6 +111,12 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); + struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ @@ -109,9 +132,15 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem; */ static struct workqueue_struct *cgroup_destroy_wq; +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. 
+ */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, -struct cgroup_subsys *cgroup_subsys[] = { +static struct cgroup_subsys *cgroup_subsys[] = { #include }; #undef SUBSYS @@ -158,17 +187,18 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); */ static bool cgrp_dfl_visible; +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; + /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static u16 cgrp_dfl_implicit_ss_mask; - -/* some controllers can be threaded on the default hierarchy */ -static u16 cgrp_dfl_threaded_ss_mask; +static unsigned long cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ -LIST_HEAD(cgroup_roots); + +static LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ @@ -184,25 +214,29 @@ static DEFINE_IDR(cgroup_hierarchy_idr); static u64 css_serial_nr_next = 1; /* - * These bitmasks identify subsystems with specific features to avoid - * having to do iterative checks repeatedly. + * These bitmask flags indicate whether tasks in the fork and exit paths have + * fork/exit handlers to call. This avoids us having to do extra work in the + * fork/exit path to check which subsystems have fork/exit callbacks. 
*/ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; -static u16 have_release_callback __read_mostly; -static u16 have_canfork_callback __read_mostly; +static u16 have_free_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = REFCOUNT_INIT(2), + .count = { .counter = 2, }, .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, .root_cset = &init_css_set, }; +/* Ditto for the can_fork callback. */ +static u16 have_canfork_callback __read_mostly; + static struct file_system_type cgroup2_fs_type; -static struct cftype cgroup_base_files[]; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[]; /* cgroup optional features */ enum cgroup_opt_features { @@ -220,10 +254,11 @@ static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { static u16 cgroup_feature_disable_mask __read_mostly; +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_skip(struct css_task_iter *it, - struct task_struct *task); +static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -241,7 +276,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. 
*/ -bool cgroup_ssid_enabled(int ssid) +static bool cgroup_ssid_enabled(int ssid) { if (CGROUP_SUBSYS_COUNT == 0) return false; @@ -249,6 +284,11 @@ bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -302,7 +342,7 @@ bool cgroup_ssid_enabled(int ssid) * * - debug: disallowed on the default hierarchy. */ -bool cgroup_on_dfl(const struct cgroup *cgrp) +static bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } @@ -338,103 +378,14 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static bool cgroup_has_tasks(struct cgroup *cgrp) -{ - return cgrp->nr_populated_csets; -} - -bool cgroup_is_threaded(struct cgroup *cgrp) -{ - return cgrp->dom_cgrp != cgrp; -} - -/* can @cgrp host both domain and threaded children? */ -static bool cgroup_is_mixable(struct cgroup *cgrp) -{ - /* - * Root isn't under domain level resource control exempting it from - * the no-internal-process constraint, so it can serve as a thread - * root and a parent of resource domains at the same time. - */ - return !cgroup_parent(cgrp); -} - -/* can @cgrp become a thread root? should always be true for a thread root */ -static bool cgroup_can_be_thread_root(struct cgroup *cgrp) -{ - /* mixables don't care */ - if (cgroup_is_mixable(cgrp)) - return true; - - /* domain roots can't be nested under threaded */ - if (cgroup_is_threaded(cgrp)) - return false; - - /* can only have either domain or threaded children */ - if (cgrp->nr_populated_domain_children) - return false; - - /* and no domain controllers can be enabled */ - if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) - return false; - - return true; -} - -/* is @cgrp root of a threaded subtree? 
*/ -bool cgroup_is_thread_root(struct cgroup *cgrp) -{ - /* thread root should be a domain */ - if (cgroup_is_threaded(cgrp)) - return false; - - /* a domain w/ threaded children is a thread root */ - if (cgrp->nr_threaded_children) - return true; - - /* - * A domain which has tasks and explicit threaded controllers - * enabled is a thread root. - */ - if (cgroup_has_tasks(cgrp) && - (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) - return true; - - return false; -} - -/* a domain which isn't connected to the root w/o brekage can't be used */ -static bool cgroup_is_valid_domain(struct cgroup *cgrp) -{ - /* the cgroup itself can be a thread root */ - if (cgroup_is_threaded(cgrp)) - return false; - - /* but the ancestors can't be unless mixable */ - while ((cgrp = cgroup_parent(cgrp))) { - if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) - return false; - if (cgroup_is_threaded(cgrp)) - return false; - } - - return true; -} - /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; - if (parent) { - u16 ss_mask = parent->subtree_control; - - /* threaded cgroups can only have threaded controllers */ - if (cgroup_is_threaded(cgrp)) - ss_mask &= cgrp_dfl_threaded_ss_mask; - return ss_mask; - } + if (parent) + return parent->subtree_control; if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | @@ -447,14 +398,8 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - if (parent) { - u16 ss_mask = parent->subtree_ss_mask; - - /* threaded cgroups can only have threaded controllers */ - if (cgroup_is_threaded(cgrp)) - ss_mask &= cgrp_dfl_threaded_ss_mask; - return ss_mask; - } + if (parent) + return parent->subtree_ss_mask; return cgrp->root->subsys_mask; } @@ -544,37 +489,10 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, return css; } -/** - * 
__cgroup_task_count - count the number of tasks in a cgroup. The caller - * is responsible for taking the css_set_lock. - * @cgrp: the cgroup in question - */ -int __cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - lockdep_assert_held(&css_set_lock); - - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += link->cset->nr_tasks; - - return count; -} - -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) +/* convenient tests for these bits */ +static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - int count; - - spin_lock_irq(&css_set_lock); - count = __cgroup_task_count(cgrp); - spin_unlock_irq(&css_set_lock); - - return count; + return !(cgrp->self.flags & CSS_ONLINE); } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) @@ -597,6 +515,11 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); +static int notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor @@ -626,6 +549,15 @@ EXPORT_SYMBOL_GPL(of_css); ; \ else +/** + * for_each_subsys - iterate all enabled cgroup subsystems + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + */ +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) + /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor @@ -650,6 +582,10 @@ EXPORT_SYMBOL_GPL(of_css); } \ } while (false) +/* iterate across the hierarchies */ +#define for_each_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) + /* iterate over child cgrps, lock should be held throughout iteration */ #define 
cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ @@ -676,6 +612,29 @@ EXPORT_SYMBOL_GPL(of_css); ; \ else +static void cgroup_release_agent(struct work_struct *work); +static void check_for_release(struct cgroup *cgrp); + +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; +}; + /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state @@ -684,42 +643,20 @@ EXPORT_SYMBOL_GPL(of_css); * haven't been created. 
*/ struct css_set init_css_set = { - .refcount = REFCOUNT_INIT(1), - .dom_cset = &init_css_set, + .refcount = ATOMIC_INIT(1), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), - .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), - .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), - .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), - .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), - - /* - * The following field is re-initialized when this cset gets linked - * in cgroup_init(). However, let's initialize the field - * statically too so that the default cgroup can be accessed safely - * early during boot. - */ - .dfl_cgrp = &cgrp_dfl_root.cgrp, + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), }; static int css_set_count = 1; /* 1 for init_css_set */ -static bool css_set_threaded(struct css_set *cset) -{ - return cset->dom_cset != cset; -} - /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set - * - * css_set_populated() should be the same as !!cset->nr_tasks at steady - * state. However, css_set_populated() can be called while a task is being - * added to or removed from the linked list before the nr_tasks is - * properly updated. Hence, we can't just look at ->nr_tasks here. 
*/ static bool css_set_populated(struct css_set *cset) { @@ -729,48 +666,39 @@ static bool css_set_populated(struct css_set *cset) } /** - * cgroup_update_populated - update the populated count of a cgroup + * cgroup_update_populated - updated populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first - * task or losing the last. Update @cgrp->nr_populated_* accordingly. The - * count is propagated towards root so that a given cgroup's - * nr_populated_children is zero iff none of its descendants contain any - * tasks. - * - * @cgrp's interface file "cgroup.populated" is zero if both - * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and - * 1 otherwise. When the sum changes from or to zero, userland is notified - * that the content of the interface file has changed. This can be used to - * detect when @cgrp and its descendants become populated or empty. + * task or losing the last. Update @cgrp->populated_cnt accordingly. The + * count is propagated towards root so that a given cgroup's populated_cnt + * is zero iff the cgroup and all its descendants don't contain any tasks. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { - struct cgroup *child = NULL; - int adj = populated ? 
1 : -1; - lockdep_assert_held(&css_set_lock); do { - bool was_populated = cgroup_is_populated(cgrp); + bool trigger; - if (!child) { - cgrp->nr_populated_csets += adj; - } else { - if (cgroup_is_threaded(child)) - cgrp->nr_populated_threaded_children += adj; - else - cgrp->nr_populated_domain_children += adj; - } + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; - if (was_populated == cgroup_is_populated(cgrp)) + if (!trigger) break; - cgroup1_check_for_release(cgrp); + check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); - child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -781,7 +709,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the - * populated counters of all associated cgroups accordingly. + * ->populated_cnt of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { @@ -793,21 +721,6 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } -/* - * @task is leaving, advance task iterators which are pointing to it so - * that they can resume at the next position. Advancing an iterator might - * remove it from the list, use safe walk. See css_task_iter_skip() for - * details. - */ -static void css_set_skip_task_iters(struct css_set *cset, - struct task_struct *task) -{ - struct css_task_iter *it, *pos; - - list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) - css_task_iter_skip(it, task); -} - /** * css_set_move_task - move a task from one css_set to another * @task: task being moved @@ -819,7 +732,7 @@ static void css_set_skip_task_iters(struct css_set *cset, * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. 
* - * This function automatically handles populated counter updates and + * This function automatically handles populated_cnt updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ @@ -833,9 +746,22 @@ static void css_set_move_task(struct task_struct *task, css_set_update_populated(to_cset, true); if (from_cset) { + struct css_task_iter *it, *pos; + WARN_ON_ONCE(list_empty(&task->cg_list)); - css_set_skip_task_iters(from_cset, task); + /* + * @task is leaving, advance task iterators which are + * pointing to it so that they can resume at the next + * position. Advancing an iterator might remove it from + * the list, use safe walk. See css_task_iter_advance*() + * for details. + */ + list_for_each_entry_safe(it, pos, &from_cset->task_iters, + iters_node) + if (it->task_pos == &task->cg_list) + css_task_iter_advance(it); + list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -879,7 +805,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -void put_css_set_locked(struct css_set *cset) +static void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; @@ -887,11 +813,9 @@ void put_css_set_locked(struct css_set *cset) lockdep_assert_held(&css_set_lock); - if (!refcount_dec_and_test(&cset->refcount)) + if (!atomic_dec_and_test(&cset->refcount)) return; - WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); - /* This css_set is dead. 
unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); @@ -908,14 +832,34 @@ void put_css_set_locked(struct css_set *cset) kfree(link); } - if (css_set_threaded(cset)) { - list_del(&cset->threaded_csets_node); - put_css_set_locked(cset->dom_cset); - } - kfree_rcu(cset, rcu_head); } +static void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + atomic_inc(&cset->refcount); +} + /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -931,7 +875,6 @@ static bool compare_css_sets(struct css_set *cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { - struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* @@ -942,16 +885,6 @@ static bool compare_css_sets(struct css_set *cset, if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; - - /* @cset's domain should match the default cgroup's */ - if (cgroup_on_dfl(new_cgrp)) - new_dfl_cgrp = new_cgrp; - else - new_dfl_cgrp = old_cset->dfl_cgrp; - - if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) - return false; - /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. 
As different cgroups may @@ -1158,18 +1091,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, return NULL; } - refcount_set(&cset->refcount, 1); - cset->dom_cset = cset; + atomic_set(&cset->refcount, 1); + INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); - INIT_LIST_HEAD(&cset->dying_tasks); + INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_node); INIT_LIST_HEAD(&cset->task_iters); - INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); - INIT_LIST_HEAD(&cset->cgrp_links); - INIT_LIST_HEAD(&cset->mg_src_preload_node); - INIT_LIST_HEAD(&cset->mg_dst_preload_node); - INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ @@ -1203,32 +1132,10 @@ static struct css_set *find_css_set(struct css_set *old_cset, spin_unlock_irq(&css_set_lock); - /* - * If @cset should be threaded, look up the matching dom_cset and - * link them up. We first fully initialize @cset then look for the - * dom_cset. It's simpler this way and safe as @cset is guaranteed - * to stay empty until we return. 
- */ - if (cgroup_is_threaded(cset->dfl_cgrp)) { - struct css_set *dcset; - - dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); - if (!dcset) { - put_css_set(cset); - return NULL; - } - - spin_lock_irq(&css_set_lock); - cset->dom_cset = dcset; - list_add_tail(&cset->threaded_csets_node, - &dcset->threaded_csets); - spin_unlock_irq(&css_set_lock); - } - return cset; } -struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -1256,7 +1163,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } -void cgroup_free_root(struct cgroup_root *root) +static void cgroup_free_root(struct cgroup_root *root) { if (root) { idr_destroy(&root->cgroup_idr); @@ -1352,8 +1259,6 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, if (cset == &init_css_set) { res = &root->cgrp; - } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; @@ -1375,8 +1280,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex and css_set_lock held. */ -struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroup_root *root) +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the @@ -1413,6 +1318,7 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; +static const struct file_operations proc_cgroupstats_operations; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) @@ -1425,7 +1331,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? 
ss->name : ss->legacy_name, cft->name); else - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1506,7 +1412,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ -void cgroup_kn_unlock(struct kernfs_node *kn) +static void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -1538,7 +1444,8 @@ void cgroup_kn_unlock(struct kernfs_node *kn) * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, + bool drain_offline) { struct cgroup *cgrp; @@ -1601,17 +1508,8 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - if (!css->ss) { - if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; - else - cfts = cgroup1_base_files; - + list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); - } else { - list_for_each_entry(cfts, &css->ss->cfts, node) - cgroup_addrm_files(css, cgrp, cfts, false); - } } /** @@ -1631,20 +1529,18 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; + cfts = cgroup_dfl_base_files; else - cfts = cgroup1_base_files; + cfts = cgroup_legacy_base_files; - ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - if (ret < 0) - return ret; - } else { - list_for_each_entry(cfts, &css->ss->cfts, node) { - ret = cgroup_addrm_files(css, cgrp, cfts, true); - if (ret < 0) { - failed_cfts = cfts; - goto err; - } + return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + } + + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if 
(ret < 0) { + failed_cfts = cfts; + goto err; } } @@ -1660,7 +1556,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) return ret; } -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; @@ -1753,8 +1649,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } -int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root) +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; @@ -1780,56 +1676,245 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) { - char *token; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_subsys *ss; + int ssid; + + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); + if (root->flags & CGRP_ROOT_NOPREFIX) + seq_puts(seq, ",noprefix"); + if (root->flags & CGRP_ROOT_XATTR) + seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); + if (strlen(root->release_agent_path)) + seq_show_option(seq, "release_agent", + root->release_agent_path); + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) + seq_puts(seq, ",clone_children"); + if (strlen(root->name)) + seq_show_option(seq, "name", root->name); + return 0; +} - *root_flags = 0; +struct cgroup_sb_opts { + u16 subsys_mask; + unsigned int flags; + char *release_agent; + bool cpuset_clone_children; + char *name; + /* User explicitly requested empty subsystem */ + bool none; +}; - if (!data || *data == '\0') - return 
0; +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) +{ + char *token, *o = data; + bool all_ss = false, one_ss = false; + u16 mask = U16_MAX; + struct cgroup_subsys *ss; + int nr_opts = 0; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif - while ((token = strsep(&data, ",")) != NULL) { - if (!strcmp(token, "nsdelegate")) { - *root_flags |= CGRP_ROOT_NS_DELEGATE; + memset(opts, 0, sizeof(*opts)); + + while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + + if (!*token) + return -EINVAL; + if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { + opts->flags |= CGRP_ROOT_NOPREFIX; + continue; + } + if (!strcmp(token, "clone_children")) { + opts->cpuset_clone_children = true; + continue; + } + if (!strcmp(token, "xattr")) { + opts->flags |= CGRP_ROOT_XATTR; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; continue; } + if (!strncmp(token, "name=", 5)) { + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN - 1, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; - pr_err("cgroup2: unknown option \"%s\"\n", token); - return -EINVAL; - } + continue; + } - return 0; -} + 
for_each_subsys(ss, i) { + if (strcmp(token, ss->legacy_name)) + continue; + if (!cgroup_ssid_enabled(i)) + continue; + if (cgroup_ssid_no_v1(i)) + continue; -static void apply_cgroup_root_flags(unsigned int root_flags) -{ - if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { - if (root_flags & CGRP_ROOT_NS_DELEGATE) - cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; - else - cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + opts->subsys_mask |= (1 << i); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; } -} -static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) -{ - if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) - seq_puts(seq, ",nsdelegate"); + /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* + * Option noprefix was introduced just for backward compatibility + * with the old cpuset, so we allow noprefix only if mounting just + * the cpuset subsystem. 
+ */ + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) + return -EINVAL; + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_mask && opts->none) + return -EINVAL; + return 0; } static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { - unsigned int root_flags; - int ret; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) - return ret; + int ret = 0; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_sb_opts opts; + u16 added_mask, removed_mask; - apply_cgroup_root_flags(root_flags); - return 0; + if (root == &cgrp_dfl_root) { + pr_err("remount is not allowed\n"); + return -EINVAL; + } + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + + /* See cgroup_mount release_agent handling */ + if (opts.release_agent && + ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { + ret = -EINVAL; + goto out_unlock; + } + + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + + /* Don't allow flags or name to change at remount */ + if ((opts.flags ^ root->flags) || + (opts.name && strcmp(opts.name, root->name))) { + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", + opts.flags, opts.name ?: "", root->flags, root->name); + ret = -EINVAL; + goto out_unlock; + } + + /* remounting is not allowed for populated hierarchies */ + if (!list_empty(&root->cgrp.self.children)) { + ret = -EBUSY; + goto out_unlock; + } + + ret = rebind_subsystems(root, added_mask); + if (ret) + goto out_unlock; + + WARN_ON(rebind_subsystems(&cgrp_dfl_root, 
removed_mask)); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); + strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } + + trace_cgroup_remount(root); + + out_unlock: + kfree(opts.release_agent); + kfree(opts.name); + mutex_unlock(&cgroup_mutex); + return ret; } /* @@ -1882,7 +1967,6 @@ static void cgroup_enable_task_cg_lists(void) css_set_update_populated(cset, true); list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); - cset->nr_tasks++; } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); @@ -1903,18 +1987,16 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; - cgrp->dom_cgrp = cgrp; - cgrp->max_descendants = INT_MAX; - cgrp->max_depth = INT_MAX; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); + INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); } -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) +static void init_cgroup_root(struct cgroup_root *root, + struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->cgrp; @@ -1926,18 +2008,17 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); + strcpy(root->release_agent_path, opts->release_agent); if (opts->name) - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); + strcpy(root->name, opts->name); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; - struct kernfs_syscall_ops *kf_sops; 
struct css_set *cset; int i, ret; @@ -1949,8 +2030,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root_cgrp->id = ret; root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, - 0, GFP_KERNEL); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, + GFP_KERNEL); if (ret) goto out; @@ -1969,10 +2050,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto cancel_ref; - kf_sops = root == &cgrp_dfl_root ? - &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; - - root->kf_root = kernfs_create_root(kf_sops, + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED, root_cgrp); if (IS_ERR(root->kf_root)) { @@ -2033,52 +2111,20 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) return ret; } -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns) -{ - struct dentry *dentry; - bool new_sb = false; - - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); - - /* - * In non-init cgroup namespace, instead of root cgroup's dentry, - * we return the dentry corresponding to the cgroupns->root_cgrp. 
- */ - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { - struct dentry *nsdentry; - struct super_block *sb = dentry->d_sb; - struct cgroup *cgrp; - - mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); - - cgrp = cset_cgroup_from_root(ns->root_cset, root); - - spin_unlock_bh(&css_set_lock); - mutex_unlock(&cgroup_mutex); - - nsdentry = kernfs_node_dentry(cgrp->kn, sb); - dput(dentry); - if (IS_ERR(nsdentry)) - deactivate_locked_super(sb); - dentry = nsdentry; - } - - if (!new_sb) - cgroup_put(&root->cgrp); - - return dentry; -} - static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; + struct super_block *pinned_sb = NULL; struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_subsys *ss; + struct cgroup_root *root; + struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + int i; + bool new_sb; get_cgroup_ns(ns); @@ -2095,25 +2141,190 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - if (fs_type == &cgroup2_fs_type) { - unsigned int root_flags; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) { + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); put_cgroup_ns(ns); - return ERR_PTR(ret); + return ERR_PTR(-EINVAL); } - cgrp_dfl_visible = true; - cgroup_get(&cgrp_dfl_root.cgrp); + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, - CGROUP2_SUPER_MAGIC, ns); - if (!IS_ERR(dentry)) - apply_cgroup_root_flags(root_flags); - } else { - dentry = cgroup1_mount(&cgroup_fs_type, flags, data, - CGROUP_SUPER_MAGIC, ns); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* + * 
Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + + for_each_root(root) { + bool name_match = false; + + if (root == &cgrp_dfl_root) + continue; + + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } + + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. + */ + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } + + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); + + /* + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. 
+ */ + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + + ret = 0; + goto out_unlock; + } + + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { + ret = -EPERM; + goto out_unlock; + } + + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if (opts.release_agent && + ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { + ret = -EINVAL; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; + } + + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); + +out_unlock: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(opts.release_agent); + kfree(opts.name); + + if (ret) { + put_cgroup_ns(ns); + return ERR_PTR(ret); + } +out_mount: + dentry = kernfs_mount(fs_type, flags, root->kf_root, + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); + + /* + * In non-init cgroup namespace, instead of root cgroup's + * dentry, we return the dentry corresponding to the + * cgroupns->root_cgrp. 
+ */ + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; + } + + if (IS_ERR(dentry) || !new_sb) + cgroup_put(&root->cgrp); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) { + WARN_ON(new_sb); + deactivate_super(pinned_sb); } put_cgroup_ns(ns); @@ -2126,20 +2337,22 @@ static void cgroup_kill_sb(struct super_block *sb) struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* - * If @root doesn't have any children, start killing it. + * If @root doesn't have any mounts or children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. * * And don't kill the default root. 
*/ - if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && - !percpu_ref_is_dying(&root->cgrp.self.refcnt)) + if (!list_empty(&root->cgrp.self.children) || + root == &cgrp_dfl_root) + cgroup_put(&root->cgrp); + else percpu_ref_kill(&root->cgrp.self.refcnt); - cgroup_put(&root->cgrp); + kernfs_kill_sb(sb); } -struct file_system_type cgroup_fs_type = { +static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, @@ -2153,8 +2366,8 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); @@ -2217,18 +2430,49 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); +/* used to track tasks and other necessary states during migration */ +struct cgroup_taskset { + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* the subsys currently being processed */ + int ssid; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. 
+ */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; +}; + +#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + /** - * cgroup_migrate_add_task - add a migration target task to a migration context + * cgroup_taskset_add - try to add a migration target task to a taskset * @task: target task - * @mgctx: target migration context + * @tset: target taskset * - * Add @task, which is a migration target, to @mgctx->tset. This function - * becomes noop if @task doesn't need to be migrated. @task's css_set - * should have been added as a migration source and @task->cg_list will be - * moved from the css_set's tasks list to mg_tasks one. + * Add @task, which is a migration target, to @tset. This function becomes + * noop if @task doesn't need to be migrated. @task's css_set should have + * been added as a migration source and @task->cg_list will be moved from + * the css_set's tasks list to mg_tasks one. 
*/ -static void cgroup_migrate_add_task(struct task_struct *task, - struct cgroup_mgctx *mgctx) +static void cgroup_taskset_add(struct task_struct *task, + struct cgroup_taskset *tset) { struct css_set *cset; @@ -2246,15 +2490,12 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (!cset->mg_src_cgrp) return; - mgctx->tset.nr_tasks++; - list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, - &mgctx->tset.src_csets); + list_add_tail(&cset->mg_node, &tset->src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) - list_add_tail(&cset->mg_dst_cset->mg_node, - &mgctx->tset.dst_csets); + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset->dst_csets); } /** @@ -2321,34 +2562,37 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset - * @mgctx: migration context + * @tset: taget taskset + * @root: cgroup root the migration is taking place on * - * Migrate tasks in @mgctx as setup by migration preparation functions. + * Migrate tasks in @tset as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and - * guarantees that either all or none of the tasks in @mgctx are migrated. - * @mgctx is consumed regardless of success. + * guarantees that either all or none of the tasks in @tset are migrated. + * @tset is consumed regardless of success. 
*/ -static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) +static int cgroup_taskset_migrate(struct cgroup_taskset *tset, + struct cgroup_root *root) { - struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; + /* methods shouldn't be called if no task is actually migrating */ + if (list_empty(&tset->src_csets)) + return 0; + /* check that we can legitimately attach to the cgroup */ - if (tset->nr_tasks) { - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->can_attach) { - tset->ssid = ssid; - ret = ss->can_attach(tset); - if (ret) { - failed_ssid = ssid; - goto out_cancel_attach; - } + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); + if (ret) { + failed_ssid = ssid; + goto out_cancel_attach; } - } while_each_subsys_mask(); - } + } + } while_each_subsys_mask(); /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2362,17 +2606,8 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); - to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); - from_cset->nr_tasks--; - /* - * If the source or destination cgroup is frozen, - * the task might require to change its state. 
- */ - cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, - to_cset->dfl_cgrp); put_css_set_locked(from_cset); - } } spin_unlock_irq(&css_set_lock); @@ -2384,29 +2619,25 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) */ tset->csets = &tset->dst_csets; - if (tset->nr_tasks) { - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->attach) { - tset->ssid = ssid; - ss->attach(tset); - } - } while_each_subsys_mask(); - } + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); + } + } while_each_subsys_mask(); ret = 0; goto out_release_tset; out_cancel_attach: - if (tset->nr_tasks) { - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ssid == failed_ssid) - break; - if (ss->cancel_attach) { - tset->ssid = ssid; - ss->cancel_attach(tset); - } - } while_each_subsys_mask(); - } + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ssid == failed_ssid) + break; + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); + } + } while_each_subsys_mask(); out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); @@ -2415,87 +2646,44 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); - - /* - * Re-initialize the cgroup_taskset structure in case it is reused - * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() - * iteration. 
- */ - tset->nr_tasks = 0; - tset->csets = &tset->src_csets; return ret; } /** - * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * - * On the default hierarchy, except for the mixable, (possible) thread root - * and threaded cgroups, subtree_control must be zero for migration - * destination cgroups with tasks so that child cgroups don't compete - * against tasks. + * On the default hierarchy, except for the root, subtree_control must be + * zero for migration destination cgroups with tasks so that child cgroups + * don't compete against tasks. */ -int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) { - /* v1 doesn't have any restriction */ - if (!cgroup_on_dfl(dst_cgrp)) - return 0; - - /* verify @dst_cgrp can host resources */ - if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) - return -EOPNOTSUPP; - - /* mixables don't care */ - if (cgroup_is_mixable(dst_cgrp)) - return 0; - - /* - * If @dst_cgrp is already or can become a thread root or is - * threaded, it doesn't matter. - */ - if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) - return 0; - - /* apply no-internal-process constraint */ - if (dst_cgrp->subtree_control) - return -EBUSY; - - return 0; + return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || + !dst_cgrp->subtree_control; } /** * cgroup_migrate_finish - cleanup after attach - * @mgctx: migration context + * @preloaded_csets: list of preloaded css_sets * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. 
*/ -void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) +static void cgroup_migrate_finish(struct list_head *preloaded_csets) { struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - - list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_src_preload_node) { - cset->mg_src_cgrp = NULL; - cset->mg_dst_cgrp = NULL; - cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_src_preload_node); - put_css_set_locked(cset); - } - - list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, - mg_dst_preload_node) { + list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_dst_preload_node); + list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } - spin_unlock_irq(&css_set_lock); } @@ -2503,10 +2691,10 @@ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup - * @mgctx: migration context + * @preloaded_csets: list of preloaded css_sets * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin - * @src_cset and add it to @mgctx->src_csets, which should later be cleaned + * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem @@ -2515,9 +2703,9 @@ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) * into play and the preloaded css_sets are guaranteed to cover all * migrations. 
*/ -void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct cgroup_mgctx *mgctx) +static void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) { struct cgroup *src_cgrp; @@ -2534,7 +2722,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - if (!list_empty(&src_cset->mg_src_preload_node)) + if (!list_empty(&src_cset->mg_preload_node)) return; WARN_ON(src_cset->mg_src_cgrp); @@ -2545,39 +2733,37 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); + list_add(&src_cset->mg_preload_node, preloaded_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @mgctx: migration context + * @preloaded_csets: list of preloaded source css_sets * * Tasks are about to be moved and all the source css_sets have been - * preloaded to @mgctx->preloaded_src_csets. This function looks up and - * pins all destination css_sets, links each to its source, and append them - * to @mgctx->preloaded_dst_csets. + * preloaded to @preloaded_csets. This function looks up and pins all + * destination css_sets, links each to its source, and append them to + * @preloaded_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on - * @mgctx. + * @preloaded_csets. 
*/ -int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { + LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_src_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - struct cgroup_subsys *ss; - int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) - return -ENOMEM; + goto err; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); @@ -2589,7 +2775,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; - list_del_init(&src_cset->mg_src_preload_node); + list_del_init(&src_cset->mg_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; @@ -2597,25 +2783,24 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) src_cset->mg_dst_cset = dst_cset; - if (list_empty(&dst_cset->mg_dst_preload_node)) - list_add_tail(&dst_cset->mg_dst_preload_node, - &mgctx->preloaded_dst_csets); + if (list_empty(&dst_cset->mg_preload_node)) + list_add(&dst_cset->mg_preload_node, &csets); else put_css_set(dst_cset); - - for_each_subsys(ss, ssid) - if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) - mgctx->ss_mask |= 1 << ssid; } + list_splice_tail(&csets, preloaded_csets); return 0; +err: + cgroup_migrate_finish(&csets); + return -ENOMEM; } /** * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @mgctx: migration context + * @root: cgroup root migration is taking place on * * Migrate a process or task denoted by @leader. 
If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also @@ -2629,9 +2814,10 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ -int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx) +static int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_root *root) { + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; /* @@ -2643,14 +2829,14 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_lock(); task = leader; do { - cgroup_migrate_add_task(task, mgctx); + cgroup_taskset_add(task, &tset); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_migrate_execute(mgctx); + return cgroup_taskset_migrate(&tset, root); } /** @@ -2661,23 +2847,23 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. 
*/ -int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, - bool threadgroup) +static int cgroup_attach_task(struct cgroup *dst_cgrp, + struct task_struct *leader, bool threadgroup) { - DEFINE_CGROUP_MGCTX(mgctx); + LIST_HEAD(preloaded_csets); struct task_struct *task; int ret; - ret = cgroup_migrate_vet_dst(dst_cgrp); - if (ret) - return ret; + if (!cgroup_may_migrate_to(dst_cgrp)) + return -EBUSY; /* look up all src csets */ spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { - cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, + &preloaded_csets); if (!threadgroup) break; } while_each_thread(leader, task); @@ -2685,11 +2871,11 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(&mgctx); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (!ret) - ret = cgroup_migrate(leader, threadgroup, &mgctx); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); - cgroup_migrate_finish(&mgctx); + cgroup_migrate_finish(&preloaded_csets); if (!ret) trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); @@ -2697,65 +2883,222 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) - __acquires(&cgroup_threadgroup_rwsem) +int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) { - struct task_struct *tsk; - pid_t pid; + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct cgroup_subsys_state *css; - if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return ERR_PTR(-EINVAL); + if (capable(CAP_SYS_NICE)) + return 0; - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_taskset_for_each(task, css, tset) { + tcred = __task_cred(task); - rcu_read_lock(); - if (pid) { - tsk = 
find_task_by_vpid(pid); - if (!tsk) { - tsk = ERR_PTR(-ESRCH); - goto out_unlock_threadgroup; - } - } else { - tsk = current; + if (current != task && !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + return -EACCES; } - if (threadgroup) - tsk = tsk->group_leader; + return 0; +} - /* +static int cgroup_procs_write_permission(struct task_struct *task, + struct cgroup *dst_cgrp, + struct kernfs_open_file *of) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = get_task_cred(task); + int ret = 0; + + /* + * even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_NICE)) + ret = -EACCES; + + if (!ret && cgroup_on_dfl(dst_cgrp)) { + struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup *cgrp; + struct inode *inode; + + spin_lock_irq(&css_set_lock); + cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + while (!cgroup_is_descendant(dst_cgrp, cgrp)) + cgrp = cgroup_parent(cgrp); + + ret = -ENOMEM; + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); + if (inode) { + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + } + } + + put_cred(tcred); + return ret; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will lock + * cgroup_mutex and threadgroup. 
+ */ +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) +{ + struct task_struct *tsk; + struct cgroup_subsys *ss; + struct cgroup *cgrp; + pid_t pid; + int ssid, ret; + + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + percpu_down_write(&cgroup_threadgroup_rwsem); + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + ret = -ESRCH; + goto out_unlock_rcu; + } + } else { + tsk = current; + } + + if (threadgroup) + tsk = tsk->group_leader; + + /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { - tsk = ERR_PTR(-EINVAL); - goto out_unlock_threadgroup; + ret = -EINVAL; + goto out_unlock_rcu; } get_task_struct(tsk); - goto out_unlock_rcu; + rcu_read_unlock(); + + ret = cgroup_procs_write_permission(tsk, cgrp, of); + if (!ret) + ret = cgroup_attach_task(cgrp, tsk, threadgroup); + + put_task_struct(tsk); + goto out_unlock_threadgroup; -out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); out_unlock_rcu: rcu_read_unlock(); - return tsk; +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; } -void cgroup_procs_write_finish(struct task_struct *task) - __releases(&cgroup_threadgroup_rwsem) +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { - struct cgroup_subsys *ss; 
- int ssid; + struct cgroup_root *root; + int retval = 0; - /* release reference from cgroup_procs_write_start() */ - put_task_struct(task); + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + for_each_root(root) { + struct cgroup *from_cgrp; + if (root == &cgrp_dfl_root) + continue; + + spin_lock_irq(&css_set_lock); + from_cgrp = task_cgroup_from_root(from, root); + spin_unlock_irq(&css_set_lock); + + retval = cgroup_attach_task(from_cgrp, tsk, false); + if (retval) + break; + } percpu_up_write(&cgroup_threadgroup_rwsem); - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, false); +} + +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, true); +} + +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. 
+ */ + if ((of->file->f_cred->user_ns != &init_user_ns) || + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + spin_lock(&release_agent_path_lock); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); + spin_unlock(&release_agent_path_lock); + cgroup_kn_unlock(of->kn); + return nbytes; +} + +static int cgroup_release_agent_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + spin_lock(&release_agent_path_lock); + seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); + seq_putc(seq, '\n'); + return 0; +} + +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, "0\n"); + return 0; } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -2803,7 +3146,8 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { - DEFINE_CGROUP_MGCTX(mgctx); + LIST_HEAD(preloaded_csets); + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; @@ -2819,29 +3163,33 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, dsct, &mgctx); + cgroup_migrate_add_src(link->cset, dsct, + &preloaded_csets); } spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(&mgctx); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, - mg_src_preload_node) { + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { struct task_struct *task, *ntask; + /* src_csets precede dst_csets, break on the first 
dst_cset */ + if (!src_cset->mg_src_cgrp) + break; + /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) - cgroup_migrate_add_task(task, &mgctx); + cgroup_taskset_add(task, &tset); } spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_execute(&mgctx); + ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: - cgroup_migrate_finish(&mgctx); + cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } @@ -2854,7 +3202,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ -void cgroup_lock_and_drain_offline(struct cgroup *cgrp) +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; @@ -2888,12 +3236,11 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) } /** - * cgroup_save_control - save control masks and dom_cgrp of a subtree + * cgroup_save_control - save control masks of a subtree * @cgrp: root of the target subtree * - * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the - * respective old_ prefixed fields for @cgrp's subtree including @cgrp - * itself. + * Save ->subtree_control and ->subtree_ss_mask to the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. 
*/ static void cgroup_save_control(struct cgroup *cgrp) { @@ -2903,7 +3250,6 @@ static void cgroup_save_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; - dsct->old_dom_cgrp = dsct->dom_cgrp; } } @@ -2929,12 +3275,11 @@ static void cgroup_propagate_control(struct cgroup *cgrp) } /** - * cgroup_restore_control - restore control masks and dom_cgrp of a subtree + * cgroup_restore_control - restore control masks of a subtree * @cgrp: root of the target subtree * - * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the - * respective old_ prefixed fields for @cgrp's subtree including @cgrp - * itself. + * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { @@ -2944,7 +3289,6 @@ static void cgroup_restore_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; - dsct->dom_cgrp = dsct->old_dom_cgrp; } } @@ -2984,6 +3328,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; @@ -2993,8 +3339,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) return PTR_ERR(css); } - WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); - if (css_visible(css)) { ret = css_populate_dir(css); if (ret) @@ -3030,11 +3374,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + if (!css) continue; - 
WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); - if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); @@ -3103,46 +3447,6 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } -static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) -{ - u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; - - /* if nothing is getting enabled, nothing to worry about */ - if (!enable) - return 0; - - /* can @cgrp host any resources? */ - if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) - return -EOPNOTSUPP; - - /* mixables don't care */ - if (cgroup_is_mixable(cgrp)) - return 0; - - if (domain_enable) { - /* can't enable domain controllers inside a thread subtree */ - if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) - return -EOPNOTSUPP; - } else { - /* - * Threaded controllers can handle internal competitions - * and are always allowed inside a (prospective) thread - * subtree. - */ - if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) - return 0; - } - - /* - * Controllers can't be enabled for a cgroup with tasks to avoid - * child cgroups competing against tasks. - */ - if (cgroup_has_tasks(cgrp)) - return -EBUSY; - - return 0; -} - /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -3218,9 +3522,33 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - ret = cgroup_vet_subtree_control_enable(cgrp, enable); - if (ret) - goto out_unlock; + /* + * Except for the root, subtree_control must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgroup_parent(cgrp)) { + struct cgrp_cset_link *link; + + /* + * Because namespaces pin csets too, @cgrp->cset_links + * might not be empty even when @cgrp is empty. Walk and + * verify each cset. 
+ */ + spin_lock_irq(&css_set_lock); + + ret = 0; + list_for_each_entry(link, &cgrp->cset_links, cset_link) { + if (css_set_populated(link->cset)) { + ret = -EBUSY; + break; + } + } + + spin_unlock_irq(&css_set_lock); + + if (ret) + goto out_unlock; + } /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); @@ -3239,193 +3567,10 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return ret ?: nbytes; } -/** - * cgroup_enable_threaded - make @cgrp threaded - * @cgrp: the target cgroup - * - * Called when "threaded" is written to the cgroup.type interface file and - * tries to make @cgrp threaded and join the parent's resource domain. - * This function is never called on the root cgroup as cgroup.type doesn't - * exist on it. - */ -static int cgroup_enable_threaded(struct cgroup *cgrp) -{ - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup *dom_cgrp = parent->dom_cgrp; - struct cgroup *dsct; - struct cgroup_subsys_state *d_css; - int ret; - - lockdep_assert_held(&cgroup_mutex); - - /* noop if already threaded */ - if (cgroup_is_threaded(cgrp)) - return 0; - - /* - * If @cgroup is populated or has domain controllers enabled, it - * can't be switched. While the below cgroup_can_be_thread_root() - * test can catch the same conditions, that's only when @parent is - * not mixable, so let's check it explicitly. - */ - if (cgroup_is_populated(cgrp) || - cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) - return -EOPNOTSUPP; - - /* we're joining the parent's domain, ensure its validity */ - if (!cgroup_is_valid_domain(dom_cgrp) || - !cgroup_can_be_thread_root(dom_cgrp)) - return -EOPNOTSUPP; - - /* - * The following shouldn't cause actual migrations and should - * always succeed. 
- */ - cgroup_save_control(cgrp); - - cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) - if (dsct == cgrp || cgroup_is_threaded(dsct)) - dsct->dom_cgrp = dom_cgrp; - - ret = cgroup_apply_control(cgrp); - if (!ret) - parent->nr_threaded_children++; - - cgroup_finalize_control(cgrp, ret); - return ret; -} - -static int cgroup_type_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - if (cgroup_is_threaded(cgrp)) - seq_puts(seq, "threaded\n"); - else if (!cgroup_is_valid_domain(cgrp)) - seq_puts(seq, "domain invalid\n"); - else if (cgroup_is_thread_root(cgrp)) - seq_puts(seq, "domain threaded\n"); - else - seq_puts(seq, "domain\n"); - - return 0; -} - -static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - int ret; - - /* only switching to threaded mode is supported */ - if (strcmp(strstrip(buf), "threaded")) - return -EINVAL; - - /* drain dying csses before we re-apply (threaded) subtree control */ - cgrp = cgroup_kn_lock_live(of->kn, true); - if (!cgrp) - return -ENOENT; - - /* threaded can only be enabled */ - ret = cgroup_enable_threaded(cgrp); - - cgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -static int cgroup_max_descendants_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - int descendants = READ_ONCE(cgrp->max_descendants); - - if (descendants == INT_MAX) - seq_puts(seq, "max\n"); - else - seq_printf(seq, "%d\n", descendants); - - return 0; -} - -static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - int descendants; - ssize_t ret; - - buf = strstrip(buf); - if (!strcmp(buf, "max")) { - descendants = INT_MAX; - } else { - ret = kstrtoint(buf, 0, &descendants); - if (ret) - return ret; - } - - if (descendants < 0) - return -ERANGE; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENOENT; - - 
cgrp->max_descendants = descendants; - - cgroup_kn_unlock(of->kn); - - return nbytes; -} - -static int cgroup_max_depth_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - int depth = READ_ONCE(cgrp->max_depth); - - if (depth == INT_MAX) - seq_puts(seq, "max\n"); - else - seq_printf(seq, "%d\n", depth); - - return 0; -} - -static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - ssize_t ret; - int depth; - - buf = strstrip(buf); - if (!strcmp(buf, "max")) { - depth = INT_MAX; - } else { - ret = kstrtoint(buf, 0, &depth); - if (ret) - return ret; - } - - if (depth < 0) - return -ERANGE; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENOENT; - - cgrp->max_depth = depth; - - cgroup_kn_unlock(of->kn); - - return nbytes; -} - static int cgroup_events_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; - - seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); - seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); - + seq_printf(seq, "populated %d\n", + cgroup_is_populated(seq_css(seq)->cgroup)); return 0; } @@ -3519,108 +3664,31 @@ bool cgroup_psi_enabled(void) #endif /* CONFIG_PSI */ -static int cgroup_stat_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgroup = seq_css(seq)->cgroup; - - seq_printf(seq, "nr_descendants %d\n", - cgroup->nr_descendants); - seq_printf(seq, "nr_dying_descendants %d\n", - cgroup->nr_dying_descendants); - - return 0; -} - -static int cgroup_freeze_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - seq_printf(seq, "%d\n", cgrp->freezer.freeze); - - return 0; -} - -static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - ssize_t ret; - int freeze; - - ret = kstrtoint(strstrip(buf), 0, &freeze); - if (ret) - return ret; - - if (freeze < 
0 || freeze > 1) - return -ERANGE; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENOENT; - - cgroup_freeze(cgrp, freeze); - - cgroup_kn_unlock(of->kn); - - return nbytes; -} - static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; - struct cgroup_file_ctx *ctx; - int ret; - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->ns = current->nsproxy->cgroup_ns; - get_cgroup_ns(ctx->ns); - of->priv = ctx; - if (!cft->open) - return 0; - - ret = cft->open(of); - if (ret) { - put_cgroup_ns(ctx->ns); - kfree(ctx); - } - return ret; + if (cft->open) + return cft->open(of); + return 0; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; - struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); - put_cgroup_ns(ctx->ns); - kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; struct cgroup_subsys_state *css; int ret; - /* - * If namespaces are delegation boundaries, disallow writes to - * files in an non-init namespace root from inside the namespace - * except for the files explicitly marked delegatable - - * cgroup.procs and cgroup.subtree_control. - */ - if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && - !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) - return -EPERM; - if (cft->write) return cft->write(of, buf, nbytes, off); @@ -3715,6 +3783,52 @@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; +/* + * cgroup_rename - Only allow simple rename of directories in place. 
+ */ +static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) +{ + struct cgroup *cgrp = kn->priv; + int ret; + + /* do not accept '\n' to prevent making /proc//cgroup unparsable */ + if (strchr(new_name_str, '\n')) + return -EINVAL; + + if (kernfs_type(kn) != KERNFS_DIR) + return -ENOTDIR; + if (kn->parent != new_parent) + return -EIO; + + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow on the default hierarchy. + */ + if (cgroup_on_dfl(cgrp)) + return -EPERM; + + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_mutex. + */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; +} + /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -3814,6 +3928,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { + LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; @@ -4014,6 +4129,26 @@ void cgroup_file_notify(struct cgroup_file *cfile) spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. 
+ */ +static int cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); + spin_unlock_irq(&css_set_lock); + return count; +} + /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) @@ -4241,58 +4376,6 @@ bool css_has_online_children(struct cgroup_subsys_state *css) return ret; } -static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) -{ - struct list_head *l; - struct cgrp_cset_link *link; - struct css_set *cset; - - lockdep_assert_held(&css_set_lock); - - /* find the next threaded cset */ - if (it->tcset_pos) { - l = it->tcset_pos->next; - - if (l != it->tcset_head) { - it->tcset_pos = l; - return container_of(l, struct css_set, - threaded_csets_node); - } - - it->tcset_pos = NULL; - } - - /* find the next cset */ - l = it->cset_pos; - l = l->next; - if (l == it->cset_head) { - it->cset_pos = NULL; - return NULL; - } - - if (it->ss) { - cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); - } else { - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } - - it->cset_pos = l; - - /* initialize threaded css_set walking */ - if (it->flags & CSS_TASK_ITER_THREADED) { - if (it->cur_dcset) - put_css_set_locked(it->cur_dcset); - it->cur_dcset = cset; - get_css_set(cset); - - it->tcset_head = &cset->threaded_csets; - it->tcset_pos = &cset->threaded_csets; - } - - return cset; -} - /** * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance @@ -4301,33 +4384,39 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { + struct list_head *l = it->cset_pos; + struct cgrp_cset_link *link; struct css_set *cset; 
lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set */ do { - cset = css_task_iter_next_css_set(it); - if (!cset) { + l = l->next; + if (l == it->cset_head) { + it->cset_pos = NULL; it->task_pos = NULL; return; } - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - if (!list_empty(&cset->tasks)) { + if (it->ss) { + cset = container_of(l, struct css_set, + e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } + } while (!css_set_populated(cset)); + + it->cset_pos = l; + + if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; - it->cur_tasks_head = &cset->tasks; - } else if (!list_empty(&cset->mg_tasks)) { + else it->task_pos = cset->mg_tasks.next; - it->cur_tasks_head = &cset->mg_tasks; - } else { - it->task_pos = cset->dying_tasks.next; - it->cur_tasks_head = &cset->dying_tasks; - } it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; - it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus @@ -4353,74 +4442,32 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) list_add(&it->iters_node, &cset->task_iters); } -static void css_task_iter_skip(struct css_task_iter *it, - struct task_struct *task) -{ - lockdep_assert_held(&css_set_lock); - - if (it->task_pos == &task->cg_list) { - it->task_pos = it->task_pos->next; - it->flags |= CSS_TASK_ITER_SKIPPED; - } -} - static void css_task_iter_advance(struct css_task_iter *it) { - struct task_struct *task; + struct list_head *l = it->task_pos; lockdep_assert_held(&css_set_lock); -repeat: - if (it->task_pos) { - /* - * Advance iterator to find next entry. cset->tasks is - * consumed first and then ->mg_tasks. After ->mg_tasks, - * we move onto the next cset. 
- */ - if (it->flags & CSS_TASK_ITER_SKIPPED) - it->flags &= ~CSS_TASK_ITER_SKIPPED; - else - it->task_pos = it->task_pos->next; + WARN_ON_ONCE(!l); - if (it->task_pos == it->tasks_head) { - it->task_pos = it->mg_tasks_head->next; - it->cur_tasks_head = it->mg_tasks_head; - } - if (it->task_pos == it->mg_tasks_head) { - it->task_pos = it->dying_tasks_head->next; - it->cur_tasks_head = it->dying_tasks_head; - } - if (it->task_pos == it->dying_tasks_head) - css_task_iter_advance_css_set(it); - } else { - /* called from start, proceed to the first cset */ - css_task_iter_advance_css_set(it); - } - - if (!it->task_pos) - return; - - task = list_entry(it->task_pos, struct task_struct, cg_list); + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ + l = l->next; - if (it->flags & CSS_TASK_ITER_PROCS) { - /* if PROCS, skip over tasks which aren't group leaders */ - if (!thread_group_leader(task)) - goto repeat; + if (l == it->tasks_head) + l = it->mg_tasks_head->next; - /* and dying leaders w/o live member threads */ - if (it->cur_tasks_head == it->dying_tasks_head && - !atomic_read(&task->signal->live)) - goto repeat; - } else { - /* skip all dying ones */ - if (it->cur_tasks_head == it->dying_tasks_head) - goto repeat; - } + if (l == it->mg_tasks_head) + css_task_iter_advance_css_set(it); + else + it->task_pos = l; } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of - * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call @@ -4428,7 +4475,7 @@ static void css_task_iter_advance(struct css_task_iter *it) * returns NULL. On completion of iteration, css_task_iter_end() must be * called. 
*/ -void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, +void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) { /* no one should try to iterate before mounting cgroups */ @@ -4439,7 +4486,6 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, spin_lock_irq(&css_set_lock); it->ss = css->ss; - it->flags = flags; if (it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; @@ -4448,7 +4494,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, it->cset_head = it->cset_pos; - css_task_iter_advance(it); + css_task_iter_advance_css_set(it); spin_unlock_irq(&css_set_lock); } @@ -4470,10 +4516,6 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) spin_lock_irq(&css_set_lock); - /* @it may be half-advanced by skips, finish advancing */ - if (it->flags & CSS_TASK_ITER_SKIPPED) - css_task_iter_advance(it); - if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); @@ -4501,276 +4543,576 @@ void css_task_iter_end(struct css_task_iter *it) spin_unlock_irq(&css_set_lock); } - if (it->cur_dcset) - put_css_set(it->cur_dcset); - if (it->cur_task) put_task_struct(it->cur_task); } -static void cgroup_procs_release(struct kernfs_open_file *of) +/** + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + * + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. 
+ */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + LIST_HEAD(preloaded_csets); + struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; + int ret; + + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + + mutex_lock(&cgroup_mutex); + + percpu_down_write(&cgroup_threadgroup_rwsem); + + /* all tasks in @from are being moved, all csets are source */ + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_migrate_prepare_dst(&preloaded_csets); + if (ret) + goto out_err; + + /* + * Migrate tasks one-by-one until @from is empty. This fails iff + * ->can_attach() fails. + */ + do { + css_task_iter_start(&from->self, &it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + return ret; +} + +/* + * Stuff for reading the 'tasks'/'procs' files. + * + * Reading this file can return large amounts of data if a cgroup has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + */ + +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). 
We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* for delayed destruction */ + struct delayed_work destroy_dwork; +}; + +/* + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) { - struct cgroup_file_ctx *ctx = of->priv; + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} - if (ctx->procs.started) - css_task_iter_end(&ctx->procs.iter); +static void pidlist_free(void *p) +{ + kvfree(p); } -static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) +/* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. 
+ */ +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) { - struct kernfs_open_file *of = s->private; - struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l, *tmp_l; - if (pos) - (*pos)++; + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); - return css_task_iter_next(&ctx->procs.iter); + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); } -static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, - unsigned int iter_flags) +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) { - struct kernfs_open_file *of = s->private; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct cgroup_file_ctx *ctx = of->priv; - struct css_task_iter *it = &ctx->procs.iter; + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); /* - * When a seq_file is seeked, it's always traversed sequentially - * from position 0, so we can simply keep iterating on !0 *pos. + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. 
*/ - if (!ctx->procs.started) { - if (WARN_ON_ONCE((*pos))) - return ERR_PTR(-EINVAL); - css_task_iter_start(&cgrp->self, iter_flags, it); - ctx->procs.started = true; - } else if (!(*pos)) { - css_task_iter_end(it); - css_task_iter_start(&cgrp->self, iter_flags, it); - } else - return it->cur_task; + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } - return cgroup_procs_next(s, NULL, NULL); + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); } -static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * Returns the number of unique elements. + */ +static int pidlist_uniq(pid_t *list, int length) { - struct cgroup *cgrp = seq_css(s)->cgroup; + int src, dest = 1; /* - * All processes of a threaded subtree belong to the domain cgroup - * of the subtree. Only threads can be distributed across the - * subtree. Reject reads on cgroup.procs in the subtree proper. - * They're always empty anyway. + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either */ - if (cgroup_is_threaded(cgrp)) - return ERR_PTR(-EOPNOTSUPP); + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + return dest; +} + +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. 
As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + * + * All this extra complexity was caused by the original implementation + * committing to an entirely unnecessary property. In the long term, we + * want to do away with it. Explicitly scramble sort order if on the + * default hierarchy so that no such expectation exists in the new + * interface. + * + * Scrambling is done by swapping every two consecutive bits, which is + * non-identity one-to-one mapping which disturbs sort order sufficiently. + */ +static pid_t pid_fry(pid_t pid) +{ + unsigned a = pid & 0x55555555; + unsigned b = pid & 0xAAAAAAAA; - return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | - CSS_TASK_ITER_THREADED); + return (a << 1) | (b >> 1); } -static int cgroup_procs_show(struct seq_file *s, void *v) +static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) { - seq_printf(s, "%d\n", task_pid_vnr(v)); - return 0; + if (cgroup_on_dfl(cgrp)) + return pid_fry(pid); + else + return pid; } -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) +static int cmppid(const void *a, const void *b) { - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; + return *(pid_t *)a - *(pid_t *)b; +} - if (capable(CAP_SYS_NICE)) - return 0; +static int fried_cmppid(const void *a, const void *b) +{ + return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); +} - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct 
pid_namespace *ns = task_active_pid_ns(current); - if (current != task && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - return -EACCES; - } + lockdep_assert_held(&cgrp->pidlist_mutex); - return 0; + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; } -static int cgroup_procs_write_permission(struct cgroup *src_cgrp, - struct cgroup *dst_cgrp, - struct super_block *sb, - struct cgroup_namespace *ns) +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. + */ +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) { - struct cgroup *com_cgrp = src_cgrp; - struct inode *inode; - int ret; + struct cgroup_pidlist *l; - lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgrp->pidlist_mutex); - /* find the common ancestor */ - while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) - com_cgrp = cgroup_parent(com_cgrp); + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; - /* %current should be authorized to migrate to the common ancestor */ - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); - if (!inode) - return -ENOMEM; + /* entry not found; create a new one */ + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) + return l; - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - if (ret) - return ret; + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); + l->key.type = type; + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + return l; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, 
enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ + struct css_task_iter it; + struct task_struct *tsk; + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); /* - * If namespaces are delegation boundaries, %current must be able - * to see both source and destination cgroups from its namespace. + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. */ - if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && - (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || - !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) - return -ENOENT; + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + if (unlikely(n == length)) + break; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; + } + css_task_iter_end(&it); + length = n; + /* now sort & (if procs) strip out duplicates */ + if (cgroup_on_dfl(cgrp)) + sort(array, length, sizeof(pid_t), fried_cmppid, NULL); + else + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(array, length); + l = cgroup_pidlist_find_create(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + + /* store array, freeing old if necessary */ + pidlist_free(l->list); + l->list = array; + l->length = length; + *lp = l; return 0; } -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +/** + * 
cgroupstats_build - build and fill cgroupstats + * @stats: cgroupstats to fill information into + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. + */ +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *src_cgrp, *dst_cgrp; - struct task_struct *task; - const struct cred *saved_cred; - ssize_t ret; - - dst_cgrp = cgroup_kn_lock_live(of->kn, false); - if (!dst_cgrp) - return -ENODEV; + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup *cgrp; + struct css_task_iter it; + struct task_struct *tsk; - task = cgroup_procs_write_start(buf, true); - ret = PTR_ERR_OR_ZERO(task); - if (ret) - goto out_unlock; + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); + mutex_lock(&cgroup_mutex); /* - * Process and thread migrations follow same delegation rule. Check - * permissions using the credentials from file open to protect against - * inherited fd attacks. + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_online_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. 
*/ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - ctx->ns); - revert_creds(saved_cred); - if (ret) - goto out_finish; - - ret = cgroup_attach_task(dst_cgrp, task, true); + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); -out_finish: - cgroup_procs_write_finish(task); -out_unlock: - cgroup_kn_unlock(of->kn); + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + switch (tsk->state) { + case TASK_RUNNING: + stats->nr_running++; + break; + case TASK_INTERRUPTIBLE: + stats->nr_sleeping++; + break; + case TASK_UNINTERRUPTIBLE: + stats->nr_uninterruptible++; + break; + case TASK_STOPPED: + stats->nr_stopped++; + break; + default: + if (delayacct_is_task_waiting_on_io(tsk)) + stats->nr_io_wait++; + break; + } + } + css_task_iter_end(&it); - return ret ?: nbytes; + mutex_unlock(&cgroup_mutex); + return 0; } -static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) -{ - return __cgroup_procs_start(s, pos, 0); -} -static ssize_t cgroup_threads_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +/* + * seq_file methods for the tasks/procs files. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->l->list array. + */ + +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *src_cgrp, *dst_cgrp; - struct task_struct *task; - const struct cred *saved_cred; - ssize_t ret; + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). 
Use a binary-search to find the + * next pid to display, if any + */ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; + int index = 0, pid = *pos; + int *iter, ret; - buf = strstrip(buf); + mutex_lock(&cgrp->pidlist_mutex); - dst_cgrp = cgroup_kn_lock_live(of->kn, false); - if (!dst_cgrp) - return -ENODEV; + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); - task = cgroup_procs_write_start(buf, false); - ret = PTR_ERR_OR_ZERO(task); - if (ret) - goto out_unlock; + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); + if (pid) { + int end = l->length; + + while (index < end) { + int mid = (index + end) / 2; + if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { + index = mid; + break; + } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= l->length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = l->list + index; + *pos = cgroup_pid_fry(cgrp, *iter); + return iter; +} + +static void cgroup_pidlist_stop(struct seq_file *s, void *v) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + 
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); +} +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + pid_t *p = v; + pid_t *end = l->list + l->length; /* - * Process and thread migrations follow same delegation rule. Check - * permissions using the credentials from file open to protect against - * inherited fd attacks. + * Advance to the next pid in the array. If this goes off the + * end, we're done */ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - ctx->ns); - revert_creds(saved_cred); - if (ret) - goto out_finish; + p++; + if (p >= end) { + return NULL; + } else { + *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); + return p; + } +} - /* and must be contained in the same domain */ - ret = -EOPNOTSUPP; - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) - goto out_finish; +static int cgroup_pidlist_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", *(int *)v); - ret = cgroup_attach_task(dst_cgrp, task, false); + return 0; +} -out_finish: - cgroup_procs_write_finish(task); -out_unlock: - cgroup_kn_unlock(of->kn); +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return notify_on_release(css->cgroup); +} - return ret ?: nbytes; +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + return 0; +} + +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); +} + +static int cgroup_clone_children_write(struct 
cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + else + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + return 0; } /* cgroup core interface files for the default hierarchy */ -static struct cftype cgroup_base_files[] = { - { - .name = "cgroup.type", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cgroup_type_show, - .write = cgroup_type_write, - }, +static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", - .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), - .release = cgroup_procs_release, - .seq_start = cgroup_procs_start, - .seq_next = cgroup_procs_next, - .seq_show = cgroup_procs_show, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, }, - { - .name = "cgroup.threads", - .flags = CFTYPE_NS_DELEGATABLE, - .release = cgroup_procs_release, - .seq_start = cgroup_threads_start, - .seq_next = cgroup_procs_next, - .seq_show = cgroup_procs_show, - .write = cgroup_threads_write, - }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", - .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, @@ -4806,25 +5148,50 @@ static struct cftype cgroup_base_files[] = { .release = cgroup_pressure_release, }, #endif /* CONFIG_PSI */ + { } /* terminate */ +}; + +/* cgroup core interface files for the legacy hierarchies */ +static struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + }, { - .name = "cgroup.max.descendants", - .seq_show = 
cgroup_max_descendants_show, - .write = cgroup_max_descendants_write, + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, }, { - .name = "cgroup.max.depth", - .seq_show = cgroup_max_depth_show, - .write = cgroup_max_depth_write, + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, }, { - .name = "cgroup.stat", - .seq_show = cgroup_stat_show, + .name = "tasks", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, + .write = cgroup_tasks_write, }, { - .name = "cgroup.freeze", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cgroup_freeze_show, - .write = cgroup_freeze_write, + .name = "notify_on_release", + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_release_agent_show, + .write = cgroup_release_agent_write, + .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; @@ -4874,7 +5241,7 @@ static void css_free_work_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup1_pidlist_destroy_all(cgrp); + cgroup_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { @@ -4927,17 +5294,9 @@ static void css_release_work_fn(struct work_struct *work) if (ss->css_released) ss->css_released(css); } else { - struct cgroup *tcgrp; - /* cgroup release path */ trace_cgroup_release(cgrp); - spin_lock_irq(&css_set_lock); - for (tcgrp = cgroup_parent(cgrp); tcgrp; - tcgrp = cgroup_parent(tcgrp)) - tcgrp->nr_dying_descendants--; - spin_unlock_irq(&css_set_lock); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5024,6 +5383,9 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & 
CSS_ONLINE)) return; + if (ss->css_reset) + ss->css_reset(css); + if (ss->css_offline) ss->css_offline(css); @@ -5136,40 +5498,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_idr_free; - /* - * New cgroup inherits effective freeze counter, and - * if the parent has to be frozen, the child has too. - */ - cgrp->freezer.e_freeze = parent->freezer.e_freeze; - if (cgrp->freezer.e_freeze) { - /* - * Set the CGRP_FREEZE flag, so when a process will be - * attached to the child cgroup, it will become frozen. - * At this point the new cgroup is unpopulated, so we can - * consider it frozen immediately. - */ - set_bit(CGRP_FREEZE, &cgrp->flags); - set_bit(CGRP_FROZEN, &cgrp->flags); - } - - spin_lock_irq(&css_set_lock); - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - if (tcgrp != cgrp) { - tcgrp->nr_descendants++; - - /* - * If the new cgroup is frozen, all ancestor cgroups - * get a new frozen descendant, but their state can't - * change because of this. 
- */ - if (cgrp->freezer.e_freeze) - tcgrp->freezer.nr_frozen_descendants++; - } - } - spin_unlock_irq(&css_set_lock); - if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5215,30 +5546,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return ERR_PTR(ret); } -static bool cgroup_check_hierarchy_limits(struct cgroup *parent) -{ - struct cgroup *cgroup; - int ret = false; - int level = 1; - - lockdep_assert_held(&cgroup_mutex); - - for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { - if (cgroup->nr_descendants >= cgroup->max_descendants) - goto fail; - - if (level > cgroup->max_depth) - goto fail; - - level++; - } - - ret = true; -fail: - return ret; -} - -int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { struct cgroup *parent, *cgrp; struct kernfs_node *kn; @@ -5252,11 +5561,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) if (!parent) return -ENODEV; - if (!cgroup_check_hierarchy_limits(parent)) { - ret = -EAGAIN; - goto out_unlock; - } - cgrp = cgroup_create(parent); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); @@ -5408,7 +5712,6 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; @@ -5447,27 +5750,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_css(css, ssid, cgrp) kill_css(css); - /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ - css_clear_dir(&cgrp->self); + /* + * Remove @cgrp directory along with the base files. @cgrp has an + * extra ref on its kn. 
+ */ kernfs_remove(cgrp->kn); - if (parent && cgroup_is_threaded(cgrp)) - parent->nr_threaded_children--; - - spin_lock_irq(&css_set_lock); - for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { - tcgrp->nr_descendants--; - tcgrp->nr_dying_descendants++; - /* - * If the dying cgroup is frozen, decrease frozen descendants - * counters of ancestor cgroups. - */ - if (test_bit(CGRP_FROZEN, &cgrp->flags)) - tcgrp->freezer.nr_frozen_descendants--; - } - spin_unlock_irq(&css_set_lock); - - cgroup1_check_for_release(parent); + check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -5475,7 +5764,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; -int cgroup_rmdir(struct kernfs_node *kn) +static int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; @@ -5494,10 +5783,11 @@ int cgroup_rmdir(struct kernfs_node *kn) } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { - .show_options = cgroup_show_options, .remount_fs = cgroup_remount, + .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, + .rename = cgroup_rename, .show_path = cgroup_show_path, }; @@ -5541,7 +5831,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; - have_release_callback |= (bool)ss->release << ss->id; + have_free_callback |= (bool)ss->free << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5603,8 +5893,8 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); /* 
* The latency of the synchronize_sched() is too high for cgroups, @@ -5650,23 +5940,17 @@ int __init cgroup_init(void) if (!cgroup_ssid_enabled(ssid)) continue; - if (cgroup1_ssid_disabled(ssid)) + if (cgroup_ssid_no_v1(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", ss->name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; - /* implicit controllers must be threaded too */ - WARN_ON(ss->implicit_on_dfl && !ss->threaded); - if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; - if (ss->threaded) - cgrp_dfl_threaded_ss_mask |= 1 << ss->id; - if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { @@ -5707,6 +5991,15 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); + + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; } core_initcall(cgroup_wq_init); @@ -5789,6 +6082,42 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, return retval; } +/* Display information about each subsystem and each hierarchy */ +static int proc_cgroupstats_show(struct seq_file *m, void *v) +{ + struct cgroup_subsys *ss; + int i; + + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. 
+ */ + mutex_lock(&cgroup_mutex); + + for_each_subsys(ss, i) + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->legacy_name, ss->root->hierarchy_id, + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroupstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cgroupstats_show, NULL); +} + +static const struct file_operations proc_cgroupstats_operations = { + .open = cgroupstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5895,29 +6224,8 @@ void cgroup_post_fork(struct task_struct *child) cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); - cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } - - /* - * If the cgroup has to be frozen, the new task has too. - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get - * the task into the frozen state. - */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); - - /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later - * from do_freezer_trap(). So we avoid cgroup's - * transient switch from the frozen state and back. 
- */ - } - spin_unlock_irq(&css_set_lock); } @@ -5965,13 +6273,6 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); - cset->nr_tasks--; - - if (unlikely(cgroup_task_frozen(tsk))) - cgroup_freezer_frozen_exit(tsk); - else if (unlikely(cgroup_task_freeze(tsk))) - cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); @@ -5983,27 +6284,87 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_release(struct task_struct *task) +void cgroup_free(struct task_struct *task) { + struct css_set *cset = task_css_set(task); struct cgroup_subsys *ss; int ssid; - do_each_subsys_mask(ss, ssid, have_release_callback) { - ss->release(task); + do_each_subsys_mask(ss, ssid, have_free_callback) { + ss->free(task); } while_each_subsys_mask(); - if (use_task_css_set_links) { - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); - } + put_css_set(cset); } -void cgroup_free(struct task_struct *task) +static void check_for_release(struct cgroup *cgrp) { - struct css_set *cset = task_css_set(task); - put_css_set(cset); + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); +} + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. + */ +static void cgroup_release_agent(struct work_struct *work) +{ + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL; + char *argv[3], *envp[3]; + int ret; + + mutex_lock(&cgroup_mutex); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + spin_lock_irq(&css_set_lock); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_irq(&css_set_lock); + if (ret < 0 || ret >= PATH_MAX) + goto out; + + argv[0] = agentbuf; + argv[1] = pathbuf; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); } static int __init cgroup_disable(char *str) @@ -6039,6 +6400,33 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + 
while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -6068,7 +6456,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ - cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + cgrp = rcu_dereference(kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); @@ -6238,6 +6626,154 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ +/* cgroup namespaces */ + +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) 
+{ + struct cgroup_namespace *new_ns; + struct ucounts *ucounts; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + /* It is not safe to take cgroup_mutex here */ + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + dec_cgroup_namespaces(ucounts); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? 
&ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, + .owner = cgroupns_owner, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags) @@ -6261,69 +6797,148 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, } #endif /* CONFIG_CGROUP_BPF */ -#ifdef CONFIG_SYSFS -static ssize_t show_delegatable_files(struct cftype *files, char *buf, - ssize_t size, const char *prefix) +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) { - struct cftype *cft; - ssize_t ret = 0; + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - for (cft = files; cft && cft->name[0] != '\0'; cft++) { - if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) - continue; + if (!css) + return ERR_PTR(-ENOMEM); - if (prefix) - ret += snprintf(buf + ret, size - ret, "%s.", prefix); + return css; +} - ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} - if (WARN_ON(ret >= size)) - break; - } +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} - return ret; +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; } -static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) +static u64 
current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct cgroup_subsys *ss; - int ssid; - ssize_t ret = 0; + u64 count; - ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, - NULL); + rcu_read_lock(); + count = atomic_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} - for_each_subsys(ss, ssid) - ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, - PAGE_SIZE - ret, - cgroup_subsys_name[ssid]); +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; - return ret; + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; } -static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); -static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); -} -static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; -static struct attribute *cgroup_sysfs_attrs[] = { - &cgroup_delegate_attr.attr, - &cgroup_features_attr.attr, - NULL, -}; + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; -static const struct attribute_group cgroup_sysfs_attr_group = { - .attrs = 
cgroup_sysfs_attrs, - .name = "cgroup", -}; + seq_printf(seq, "css_set %pK\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} -static int __init cgroup_sysfs_init(void) +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); } -subsys_initcall(cgroup_sysfs_init); -#endif /* CONFIG_SYSFS */ + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile deleted file mode 100644 index 0a3e87cc648d..000000000000 --- a/kernel/cgroup/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -obj-y := cgroup.o namespace.o cgroup-v1.o freezer.o - -obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o -obj-$(CONFIG_CGROUP_PIDS) += pids.o -obj-$(CONFIG_CGROUP_RDMA) += rdma.o -obj-$(CONFIG_CPUSETS) += 
cpuset.o -obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h deleted file mode 100644 index 90104f82593d..000000000000 --- a/kernel/cgroup/cgroup-internal.h +++ /dev/null @@ -1,242 +0,0 @@ -#ifndef __CGROUP_INTERNAL_H -#define __CGROUP_INTERNAL_H - -#include -#include -#include -#include -#include - -struct cgroup_pidlist; - -struct cgroup_file_ctx { - struct cgroup_namespace *ns; - - struct { - void *trigger; - } psi; - - struct { - bool started; - struct css_task_iter iter; - } procs; - - struct { - struct cgroup_pidlist *pidlist; - } procs1; -}; - -/* - * A cgroup can be associated with multiple css_sets as different tasks may - * belong to different cgroups on different hierarchies. In the other - * direction, a css_set is naturally associated with multiple cgroups. - * This M:N relationship is represented by the following link structure - * which exists for each association and allows traversing the associations - * from both sides. - */ -struct cgrp_cset_link { - /* the cgroup and css_set this link associates */ - struct cgroup *cgrp; - struct css_set *cset; - - /* list of cgrp_cset_links anchored at cgrp->cset_links */ - struct list_head cset_link; - - /* list of cgrp_cset_links anchored at css_set->cgrp_links */ - struct list_head cgrp_link; -}; - -/* used to track tasks and csets during migration */ -struct cgroup_taskset { - /* the src and dst cset list running through cset->mg_node */ - struct list_head src_csets; - struct list_head dst_csets; - - /* the number of tasks in the set */ - int nr_tasks; - - /* the subsys currently being processed */ - int ssid; - - /* - * Fields for cgroup_taskset_*() iteration. - * - * Before migration is committed, the target migration tasks are on - * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of - * the csets on ->dst_csets. ->csets point to either ->src_csets - * or ->dst_csets depending on whether migration is committed. 
- * - * ->cur_csets and ->cur_task point to the current task position - * during iteration. - */ - struct list_head *csets; - struct css_set *cur_cset; - struct task_struct *cur_task; -}; - -/* migration context also tracks preloading */ -struct cgroup_mgctx { - /* - * Preloaded source and destination csets. Used to guarantee - * atomic success or failure on actual migration. - */ - struct list_head preloaded_src_csets; - struct list_head preloaded_dst_csets; - - /* tasks and csets to migrate */ - struct cgroup_taskset tset; - - /* subsystems affected by migration */ - u16 ss_mask; -}; - -#define CGROUP_TASKSET_INIT(tset) \ -{ \ - .src_csets = LIST_HEAD_INIT(tset.src_csets), \ - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ - .csets = &tset.src_csets, \ -} - -#define CGROUP_MGCTX_INIT(name) \ -{ \ - LIST_HEAD_INIT(name.preloaded_src_csets), \ - LIST_HEAD_INIT(name.preloaded_dst_csets), \ - CGROUP_TASKSET_INIT(name.tset), \ -} - -#define DEFINE_CGROUP_MGCTX(name) \ - struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) - -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - -extern struct mutex cgroup_mutex; -extern spinlock_t css_set_lock; -extern struct cgroup_subsys *cgroup_subsys[]; -extern struct list_head cgroup_roots; -extern struct file_system_type cgroup_fs_type; - -/* iterate across the hierarchies */ -#define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) - -/** - * for_each_subsys - iterate all enabled cgroup subsystems - * @ss: the iteration cursor - * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - */ -#define for_each_subsys(ss, ssid) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) - -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - 
-static inline bool notify_on_release(const struct cgroup *cgrp) -{ - return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -} - -void put_css_set_locked(struct css_set *cset); - -static inline void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (refcount_dec_not_one(&cset->refcount)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - refcount_inc(&cset->refcount); -} - -bool cgroup_ssid_enabled(int ssid); -bool cgroup_on_dfl(const struct cgroup *cgrp); -bool cgroup_is_thread_root(struct cgroup *cgrp); -bool cgroup_is_threaded(struct cgroup *cgrp); - -struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); -struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroup_root *root); -struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); -void cgroup_kn_unlock(struct kernfs_node *kn); -int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns); - -void cgroup_free_root(struct cgroup_root *root); -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns); - -int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); -void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); -void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, - struct cgroup_mgctx *mgctx); -int 
cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); -int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx); - -int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, - bool threadgroup); -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) - __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task) - __releases(&cgroup_threadgroup_rwsem); - -void cgroup_lock_and_drain_offline(struct cgroup *cgrp); - -int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); -int cgroup_rmdir(struct kernfs_node *kn); -int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root); - -int __cgroup_task_count(const struct cgroup *cgrp); -int cgroup_task_count(const struct cgroup *cgrp); - -/* - * namespace.c - */ -extern const struct proc_ns_operations cgroupns_operations; - -/* - * cgroup-v1.c - */ -extern struct cftype cgroup1_base_files[]; -extern const struct file_operations proc_cgroupstats_operations; -extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; - -bool cgroup1_ssid_disabled(int ssid); -void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); -void cgroup1_release_agent(struct work_struct *work); -void cgroup1_check_for_release(struct cgroup *cgrp); -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns); - -#endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c deleted file mode 100644 index fc576131fdf2..000000000000 --- a/kernel/cgroup/cgroup-v1.c +++ /dev/null @@ -1,1314 +0,0 @@ -#include "cgroup-internal.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * pidlists linger the following amount before being destroyed. 
The goal - * is avoiding frequent destruction in the middle of consecutive read calls - * Expiring in the middle is a performance problem not a correctness one. - * 1 sec should be enough. - */ -#define CGROUP_PIDLIST_DESTROY_DELAY HZ - -/* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; - -/* disable named v1 mounts */ -static bool cgroup_no_v1_named; - -/* - * pidlist destructions need to be flushed on cgroup destruction. Use a - * separate workqueue as flush domain. - */ -static struct workqueue_struct *cgroup_pidlist_destroy_wq; - -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ -static DEFINE_SPINLOCK(release_agent_path_lock); - -bool cgroup1_ssid_disabled(int ssid) -{ - return cgroup_no_v1_mask & (1 << ssid); -} - -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroup_root *root; - int retval = 0; - - mutex_lock(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - for_each_root(root) { - struct cgroup *from_cgrp; - - if (root == &cgrp_dfl_root) - continue; - - spin_lock_irq(&css_set_lock); - from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_irq(&css_set_lock); - - retval = cgroup_attach_task(from_cgrp, tsk, false); - if (retval) - break; - } - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -/** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another - * @to: cgroup to which the tasks will be moved - * @from: cgroup in which the tasks currently reside - * - * Locking rules between cgroup_post_fork() and the migration path - * guarantee that, if a task is forking while being migrated, 
the new child - * is guaranteed to be either visible in the source cgroup after the - * parent's migration is complete or put into the target cgroup. No task - * can slip out of migration through forking. - */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) -{ - DEFINE_CGROUP_MGCTX(mgctx); - struct cgrp_cset_link *link; - struct css_task_iter it; - struct task_struct *task; - int ret; - - if (cgroup_on_dfl(to)) - return -EINVAL; - - ret = cgroup_migrate_vet_dst(to); - if (ret) - return ret; - - mutex_lock(&cgroup_mutex); - - percpu_down_write(&cgroup_threadgroup_rwsem); - - /* all tasks in @from are being moved, all csets are source */ - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &from->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, to, &mgctx); - spin_unlock_irq(&css_set_lock); - - ret = cgroup_migrate_prepare_dst(&mgctx); - if (ret) - goto out_err; - - /* - * Migrate tasks one-by-one until @from is empty. This fails iff - * ->can_attach() fails. - */ - do { - css_task_iter_start(&from->self, 0, &it); - - do { - task = css_task_iter_next(&it); - } while (task && (task->flags & PF_EXITING)); - - if (task) - get_task_struct(task); - css_task_iter_end(&it); - - if (task) { - ret = cgroup_migrate(task, false, &mgctx); - if (!ret) - trace_cgroup_transfer_tasks(to, task, false); - put_task_struct(task); - } - } while (task && !ret); -out_err: - cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - return ret; -} - -/* - * Stuff for reading the 'tasks'/'procs' files. - * - * Reading this file can return large amounts of data if a cgroup has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - */ - -/* which pidlist file are we talking about? 
*/ -enum cgroup_filetype { - CGROUP_FILE_PROCS, - CGROUP_FILE_TASKS, -}; - -/* - * A pidlist is a list of pids that virtually represents the contents of one - * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, - * a pair (one each for procs, tasks) for each pid namespace that's relevant - * to the cgroup. - */ -struct cgroup_pidlist { - /* - * used to find which pidlist is wanted. doesn't change as long as - * this particular list stays in the list. - */ - struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; - /* array of xids */ - pid_t *list; - /* how many elements the above list has */ - int length; - /* each of these stored in a list by its cgroup */ - struct list_head links; - /* pointer to the cgroup we belong to, for list removal purposes */ - struct cgroup *owner; - /* for delayed destruction */ - struct delayed_work destroy_dwork; -}; - -/* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. - * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); - else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); -} - -static void pidlist_free(void *p) -{ - kvfree(p); -} - -/* - * Used to destroy all pidlists lingering waiting for destroy timer. None - * should be left afterwards. 
- */ -void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) -{ - struct cgroup_pidlist *l, *tmp_l; - - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); - mutex_unlock(&cgrp->pidlist_mutex); - - flush_workqueue(cgroup_pidlist_destroy_wq); - BUG_ON(!list_empty(&cgrp->pidlists)); -} - -static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, - destroy_dwork); - struct cgroup_pidlist *tofree = NULL; - - mutex_lock(&l->owner->pidlist_mutex); - - /* - * Destroy iff we didn't get queued again. The state won't change - * as destroy_dwork can only be queued while locked. - */ - if (!delayed_work_pending(dwork)) { - list_del(&l->links); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - tofree = l; - } - - mutex_unlock(&l->owner->pidlist_mutex); - kfree(tofree); -} - -/* - * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * Returns the number of unique elements. - */ -static int pidlist_uniq(pid_t *list, int length) -{ - int src, dest = 1; - - /* - * we presume the 0th element is unique, so i starts at 1. trivial - * edge cases first; no work needs to be done for either - */ - if (length == 0 || length == 1) - return length; - /* src and dest walk down the list; dest counts unique elements */ - for (src = 1; src < length; src++) { - /* find next unique element */ - while (list[src] == list[src-1]) { - src++; - if (src == length) - goto after; - } - /* dest always points to where the next unique element goes */ - list[dest] = list[src]; - dest++; - } -after: - return dest; -} - -/* - * The two pid files - task and cgroup.procs - guaranteed that the result - * is sorted, which forced this whole pidlist fiasco. 
As pid order is - * different per namespace, each namespace needs differently sorted list, - * making it impossible to use, for example, single rbtree of member tasks - * sorted by task pointer. As pidlists can be fairly large, allocating one - * per open file is dangerous, so cgroup had to implement shared pool of - * pidlists keyed by cgroup and namespace. - */ -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - - lockdep_assert_held(&cgrp->pidlist_mutex); - - list_for_each_entry(l, &cgrp->pidlists, links) - if (l->key.type == type && l->key.ns == ns) - return l; - return NULL; -} - -/* - * find the appropriate pidlist for our purpose (given procs vs tasks) - * returns with the lock on that pidlist already held, and takes care - * of the use count, or returns NULL with no locks held if we're out of - * memory. 
- */ -static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - l = cgroup_pidlist_find(cgrp, type); - if (l) - return l; - - /* entry not found; create a new one */ - l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) - return l; - - INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); - l->key.type = type; - /* don't need task_nsproxy() if we're looking at ourself */ - l->key.ns = get_pid_ns(task_active_pid_ns(current)); - l->owner = cgrp; - list_add(&l->links, &cgrp->pidlists); - return l; -} - -/* - * Load a cgroup's pidarray with either procs' tgids or tasks' pids - */ -static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, - struct cgroup_pidlist **lp) -{ - pid_t *array; - int length; - int pid, n = 0; /* used for populating the array */ - struct css_task_iter it; - struct task_struct *tsk; - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. 
- */ - length = cgroup_task_count(cgrp); - array = pidlist_allocate(length); - if (!array) - return -ENOMEM; - /* now, populate the array */ - css_task_iter_start(&cgrp->self, 0, &it); - while ((tsk = css_task_iter_next(&it))) { - if (unlikely(n == length)) - break; - /* get tgid or pid for procs or tasks file respectively */ - if (type == CGROUP_FILE_PROCS) - pid = task_tgid_vnr(tsk); - else - pid = task_pid_vnr(tsk); - if (pid > 0) /* make sure to only use valid results */ - array[n++] = pid; - } - css_task_iter_end(&it); - length = n; - /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(array, length); - - l = cgroup_pidlist_find_create(cgrp, type); - if (!l) { - pidlist_free(array); - return -ENOMEM; - } - - /* store array, freeing old if necessary */ - pidlist_free(l->list); - l->list = array; - l->length = length; - *lp = l; - return 0; -} - -/* - * seq_file methods for the tasks/procs files. The seq_file position is the - * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->l->list array. - */ - -static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) -{ - /* - * Initially we receive a position value that corresponds to - * one more than the last pid shown (or 0 on the first call or - * after a seek to the start). Use a binary-search to find the - * next pid to display, if any - */ - struct kernfs_open_file *of = s->private; - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct cgroup_pidlist *l; - enum cgroup_filetype type = seq_cft(s)->private; - int index = 0, pid = *pos; - int *iter, ret; - - mutex_lock(&cgrp->pidlist_mutex); - - /* - * !NULL @ctx->procs1.pidlist indicates that this isn't the first - * start() after open. If the matching pidlist is around, we can use - * that. Look for it. Note that @ctx->procs1.pidlist can't be used - * directly. 
It could already have been destroyed. - */ - if (ctx->procs1.pidlist) - ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); - - /* - * Either this is the first start() after open or the matching - * pidlist has been destroyed inbetween. Create a new one. - */ - if (!ctx->procs1.pidlist) { - ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); - if (ret) - return ERR_PTR(ret); - } - l = ctx->procs1.pidlist; - - if (pid) { - int end = l->length; - - while (index < end) { - int mid = (index + end) / 2; - if (l->list[mid] == pid) { - index = mid; - break; - } else if (l->list[mid] <= pid) - index = mid + 1; - else - end = mid; - } - } - /* If we're off the end of the array, we're done */ - if (index >= l->length) - return NULL; - /* Update the abstract position to be the actual pid that we found */ - iter = l->list + index; - *pos = *iter; - return iter; -} - -static void cgroup_pidlist_stop(struct seq_file *s, void *v) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup_pidlist *l = ctx->procs1.pidlist; - - if (l) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, - CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); -} - -static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_file_ctx *ctx = of->priv; - struct cgroup_pidlist *l = ctx->procs1.pidlist; - pid_t *p = v; - pid_t *end = l->list + l->length; - /* - * Advance to the next pid in the array. 
If this goes off the - * end, we're done - */ - p++; - if (p >= end) { - (*pos)++; - return NULL; - } else { - *pos = *p; - return p; - } -} - -static int cgroup_pidlist_show(struct seq_file *s, void *v) -{ - seq_printf(s, "%d\n", *(int *)v); - - return 0; -} - -static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, - bool threadgroup) -{ - struct cgroup *cgrp; - struct task_struct *task; - const struct cred *cred, *tcred; - ssize_t ret; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - - task = cgroup_procs_write_start(buf, threadgroup); - ret = PTR_ERR_OR_ZERO(task); - if (ret) - goto out_unlock; - - /* - * Even if we're attaching all tasks in the thread group, we only need - * to check permissions on one of them. Check permissions using the - * credentials from file open to protect against inherited fd attacks. - */ - cred = of->file->f_cred; - tcred = get_task_cred(task); - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid) && - !ns_capable(tcred->user_ns, CAP_SYS_NICE)) - ret = -EACCES; - put_cred(tcred); - if (ret) - goto out_finish; - - ret = cgroup_attach_task(cgrp, task, threadgroup); - -out_finish: - cgroup_procs_write_finish(task); -out_unlock: - cgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup1_procs_write(of, buf, nbytes, off, true); -} - -static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup1_procs_write(of, buf, nbytes, off, false); -} - -static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - - /* - * Release agent gets called with all capabilities, - * 
require capabilities to set release agent. - */ - if ((of->file->f_cred->user_ns != &init_user_ns) || - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, strstrip(buf), - sizeof(cgrp->root->release_agent_path)); - spin_unlock(&release_agent_path_lock); - cgroup_kn_unlock(of->kn); - return nbytes; -} - -static int cgroup_release_agent_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - spin_lock(&release_agent_path_lock); - seq_puts(seq, cgrp->root->release_agent_path); - spin_unlock(&release_agent_path_lock); - seq_putc(seq, '\n'); - return 0; -} - -static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) -{ - seq_puts(seq, "0\n"); - return 0; -} - -static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return notify_on_release(css->cgroup); -} - -static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - return 0; -} - -static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); -} - -static int cgroup_clone_children_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - return 0; -} - -/* cgroup core interface files for the legacy hierarchies */ -struct cftype cgroup1_base_files[] = { - { - .name = "cgroup.procs", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = 
CGROUP_FILE_PROCS, - .write = cgroup1_procs_write, - }, - { - .name = "cgroup.clone_children", - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { - .name = "tasks", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_TASKS, - .write = cgroup1_tasks_write, - }, - { - .name = "notify_on_release", - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_release_agent_show, - .write = cgroup_release_agent_write, - .max_write_len = PATH_MAX - 1, - }, - { } /* terminate */ -}; - -/* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) -{ - struct cgroup_subsys *ss; - int i; - - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); - /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. 
- */ - mutex_lock(&cgroup_mutex); - - for_each_subsys(ss, i) - seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), - cgroup_ssid_enabled(i)); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/** - * cgroupstats_build - build and fill cgroupstats - * @stats: cgroupstats to fill information into - * @dentry: A dentry entry belonging to the cgroup for which stats have - * been requested. - * - * Build and fill cgroupstats so that taskstats can export it to user - * space. - */ -int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) -{ - struct kernfs_node *kn = kernfs_node_from_dentry(dentry); - struct cgroup *cgrp; - struct css_task_iter it; - struct task_struct *tsk; - - /* it should be kernfs_node belonging to cgroupfs and is a directory */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) - return -EINVAL; - - mutex_lock(&cgroup_mutex); - - /* - * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_online_from_dir(), - * @kn->priv is RCU safe. Let's do the RCU dancing. 
- */ - rcu_read_lock(); - cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { - rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); - return -ENOENT; - } - rcu_read_unlock(); - - css_task_iter_start(&cgrp->self, 0, &it); - while ((tsk = css_task_iter_next(&it))) { - switch (tsk->state) { - case TASK_RUNNING: - stats->nr_running++; - break; - case TASK_INTERRUPTIBLE: - stats->nr_sleeping++; - break; - case TASK_UNINTERRUPTIBLE: - stats->nr_uninterruptible++; - break; - case TASK_STOPPED: - stats->nr_stopped++; - break; - default: - if (delayacct_is_task_waiting_on_io(tsk)) - stats->nr_io_wait++; - break; - } - } - css_task_iter_end(&it); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -void cgroup1_check_for_release(struct cgroup *cgrp) -{ - if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && - !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); -} - -/* - * Notify userspace when a cgroup is released, by running the - * configured release agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. - * - * Most likely, this user command will try to rmdir this cgroup. - * - * This races with the possibility that some other task will be - * attached to this cgroup before it is removed, or that some other - * user task will 'mkdir' a child cgroup of this cgroup. That's ok. - * The presumed 'rmdir' will fail quietly if this cgroup is no longer - * unused, and this cgroup will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which - * means only wait until the task is successfully execve()'d. 
The - * separate release agent task is forked by call_usermodehelper(), - * then control in this thread returns here, without waiting for the - * release agent task. We don't bother to wait because the caller of - * this routine has no use for the exit status of the release agent - * task, so no sense holding our caller up for that. - */ -void cgroup1_release_agent(struct work_struct *work) -{ - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; - - mutex_lock(&cgroup_mutex); - - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf || !strlen(agentbuf)) - goto out; - - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); - if (ret < 0 || ret >= PATH_MAX) - goto out; - - argv[0] = agentbuf; - argv[1] = pathbuf; - argv[2] = NULL; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(agentbuf); - kfree(pathbuf); -} - -/* - * cgroup_rename - Only allow simple rename of directories in place. - */ -static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) -{ - struct cgroup *cgrp = kn->priv; - int ret; - - /* do not accept '\n' to prevent making /proc//cgroup unparsable */ - if (strchr(new_name_str, '\n')) - return -EINVAL; - - if (kernfs_type(kn) != KERNFS_DIR) - return -ENOTDIR; - if (kn->parent != new_parent) - return -EIO; - - /* - * We're gonna grab cgroup_mutex which nests outside kernfs - * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_mutex. 
- */ - kernfs_break_active_protection(new_parent); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_mutex); - - ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) - trace_cgroup_rename(cgrp); - - mutex_unlock(&cgroup_mutex); - - kernfs_unbreak_active_protection(kn); - kernfs_unbreak_active_protection(new_parent); - return ret; -} - -static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) -{ - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_subsys *ss; - int ssid; - - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); - if (root->flags & CGRP_ROOT_NOPREFIX) - seq_puts(seq, ",noprefix"); - if (root->flags & CGRP_ROOT_XATTR) - seq_puts(seq, ",xattr"); - if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) - seq_puts(seq, ",cpuset_v2_mode"); - - spin_lock(&release_agent_path_lock); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); - spin_unlock(&release_agent_path_lock); - - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) - seq_puts(seq, ",clone_children"); - if (strlen(root->name)) - seq_show_option(seq, "name", root->name); - return 0; -} - -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); - - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; - - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= 
CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "cpuset_v2_mode")) { - opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - - /* blocked by boot param? */ - if (cgroup_no_v1_named) - return -ENOENT; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; - - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) - continue; - if (!cgroup_ssid_enabled(i)) - continue; - if (cgroup1_ssid_disabled(i)) - continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by 
name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - - /* - * Option noprefix was introduced just for backward compatibility - * with the old cpuset, so we allow noprefix only if mounting just - * the cpuset subsystem. - */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; - - return 0; -} - -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) -{ - int ret = 0; - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup_sb_opts opts; - u16 added_mask, removed_mask; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); - /* See cgroup1_mount release_agent handling */ - if (opts.release_agent && - ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { - ret = -EINVAL; - goto out_unlock; - } - - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; - - /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); - ret = -EINVAL; - goto out_unlock; - } - - /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.self.children)) { - ret = -EBUSY; - goto out_unlock; - } - - ret = rebind_subsystems(root, added_mask); - if (ret) - goto out_unlock; - 
- WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - - if (opts.release_agent) { - spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); - spin_unlock(&release_agent_path_lock); - } - - trace_cgroup_remount(root); - - out_unlock: - kfree(opts.release_agent); - kfree(opts.name); - mutex_unlock(&cgroup_mutex); - return ret; -} - -struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { - .rename = cgroup1_rename, - .show_options = cgroup1_show_options, - .remount_fs = cgroup1_remount, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .show_path = cgroup_show_path, -}; - -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns) -{ - struct cgroup_sb_opts opts; - struct cgroup_root *root = NULL; - struct cgroup_subsys *ss; - struct dentry *dentry; - int i, ret; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* - * Destruction of cgroup root is asynchronous, so subsystems may - * still be dying after the previous unmount. Let's drain the - * dying subsystems. We just need to ensure that the ones - * unmounted previously finish dying and don't care about new ones - * starting. Testing ref liveliness is good enough. - */ - for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || - ss->root == &cgrp_dfl_root) - continue; - - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - cgroup_put(&ss->root->cgrp); - } - - for_each_root(root) { - bool name_match = false; - - if (root == &cgrp_dfl_root) - continue; - - /* - * If we asked for a name then it must match. Also, if - * name matches but sybsys_mask doesn't, we should fail. - * Remember whether name matched. 
- */ - if (opts.name) { - if (strcmp(opts.name, root->name)) - continue; - name_match = true; - } - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match. - */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { - if (!name_match) - continue; - ret = -EBUSY; - goto out_unlock; - } - - if (root->flags ^ opts.flags) - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - - ret = 0; - goto out_unlock; - } - - /* - * No such thing, create a new one. name= matching without subsys - * specification is allowed for already existing hierarchies but we - * can't create new one without subsys specification. - */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } - - /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } - /* - * Release agent gets called with all capabilities, - * require capabilities to set release agent. 
- */ - if (opts.release_agent && - ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { - ret = -EINVAL; - goto out_unlock; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - - init_cgroup_root(root, &opts); - - ret = cgroup_setup_root(root, opts.subsys_mask); - if (ret) - cgroup_free_root(root); - -out_unlock: - if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); - - if (ret) - return ERR_PTR(ret); - - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); - - if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - struct super_block *sb = dentry->d_sb; - dput(dentry); - deactivate_locked_super(sb); - msleep(10); - dentry = ERR_PTR(restart_syscall()); - } - return dentry; -} - -static int __init cgroup1_wq_init(void) -{ - /* - * Used to destroy pidlists and separate to serve as flush domain. - * Cap @max_active to 1 too. 
- */ - cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); - BUG_ON(!cgroup_pidlist_destroy_wq); - return 0; -} -core_initcall(cgroup1_wq_init); - -static int __init cgroup_no_v1(char *str) -{ - struct cgroup_subsys *ss; - char *token; - int i; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - - if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; - continue; - } - - if (!strcmp(token, "named")) { - cgroup_no_v1_named = true; - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->name) && - strcmp(token, ss->legacy_name)) - continue; - - cgroup_no_v1_mask |= 1 << i; - } - } - return 1; -} -__setup("cgroup_no_v1=", cgroup_no_v1); diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c deleted file mode 100644 index f661b4cc5efd..000000000000 --- a/kernel/cgroup/debug.c +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Debug controller - * - * WARNING: This controller is for cgroup core debugging only. - * Its interfaces are unstable and subject to changes at any time. - */ -#include -#include -#include - -#include "cgroup-internal.h" - -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -/* - * debug_taskcount_read - return the number of tasks in a cgroup. 
- * @cgrp: the cgroup in question - */ -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static int current_css_set_read(struct seq_file *seq, void *v) -{ - struct kernfs_open_file *of = seq->private; - struct css_set *cset; - struct cgroup_subsys *ss; - struct cgroup_subsys_state *css; - int i, refcnt; - - if (!cgroup_kn_lock_live(of->kn, false)) - return -ENODEV; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - refcnt = refcount_read(&cset->refcount); - seq_printf(seq, "css_set %pK %d", cset, refcnt); - if (refcnt > cset->nr_tasks) - seq_printf(seq, " +%d", refcnt - cset->nr_tasks); - seq_puts(seq, "\n"); - - /* - * Print the css'es stored in the current css_set. - */ - for_each_subsys(ss, i) { - css = cset->subsys[ss->id]; - if (!css) - continue; - seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, - (unsigned long)css, css->id); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - cgroup_kn_unlock(of->kn); - return 0; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = refcount_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define 
MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - int dead_cnt = 0, extra_refs = 0, threaded_csets = 0; - - spin_lock_irq(&css_set_lock); - - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - int refcnt = refcount_read(&cset->refcount); - - /* - * Print out the proc_cset and threaded_cset relationship - * and highlight difference between refcount and task_count. - */ - seq_printf(seq, "css_set %pK", cset); - if (rcu_dereference_protected(cset->dom_cset, 1) != cset) { - threaded_csets++; - seq_printf(seq, "=>%pK", cset->dom_cset); - } - if (!list_empty(&cset->threaded_csets)) { - struct css_set *tcset; - int idx = 0; - - list_for_each_entry(tcset, &cset->threaded_csets, - threaded_csets_node) { - seq_puts(seq, idx ? "," : "<="); - seq_printf(seq, "%pK", tcset); - idx++; - } - } else { - seq_printf(seq, " %d", refcnt); - if (refcnt - cset->nr_tasks > 0) { - int extra = refcnt - cset->nr_tasks; - - seq_printf(seq, " +%d", extra); - /* - * Take out the one additional reference in - * init_css_set. - */ - if (cset == &init_css_set) - extra--; - extra_refs += extra; - } - } - seq_puts(seq, "\n"); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ <= MAX_TASKS_SHOWN_PER_CSS) - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ <= MAX_TASKS_SHOWN_PER_CSS) - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } - /* show # of overflowed tasks */ - if (count > MAX_TASKS_SHOWN_PER_CSS) - seq_printf(seq, " ... 
(%d)\n", - count - MAX_TASKS_SHOWN_PER_CSS); - - if (cset->dead) { - seq_puts(seq, " [dead]\n"); - dead_cnt++; - } - - WARN_ON(count != cset->nr_tasks); - } - spin_unlock_irq(&css_set_lock); - - if (!dead_cnt && !extra_refs && !threaded_csets) - return 0; - - seq_puts(seq, "\n"); - if (threaded_csets) - seq_printf(seq, "threaded css_sets = %d\n", threaded_csets); - if (extra_refs) - seq_printf(seq, "extra references = %d\n", extra_refs); - if (dead_cnt) - seq_printf(seq, "dead css_sets = %d\n", dead_cnt); - - return 0; -} - -static int cgroup_subsys_states_read(struct seq_file *seq, void *v) -{ - struct kernfs_open_file *of = seq->private; - struct cgroup *cgrp; - struct cgroup_subsys *ss; - struct cgroup_subsys_state *css; - char pbuf[16]; - int i; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - - for_each_subsys(ss, i) { - css = rcu_dereference_check(cgrp->subsys[ss->id], true); - if (!css) - continue; - - pbuf[0] = '\0'; - - /* Show the parent CSS if applicable*/ - if (css->parent) - snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", - css->parent->id); - seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, - (unsigned long)css, css->id, - atomic_read(&css->online_cnt), pbuf); - } - - cgroup_kn_unlock(of->kn); - return 0; -} - -static void cgroup_masks_read_one(struct seq_file *seq, const char *name, - u16 mask) -{ - struct cgroup_subsys *ss; - int ssid; - bool first = true; - - seq_printf(seq, "%-17s: ", name); - for_each_subsys(ss, ssid) { - if (!(mask & (1 << ssid))) - continue; - if (!first) - seq_puts(seq, ", "); - seq_puts(seq, ss->name); - first = false; - } - seq_putc(seq, '\n'); -} - -static int cgroup_masks_read(struct seq_file *seq, void *v) -{ - struct kernfs_open_file *of = seq->private; - struct cgroup *cgrp; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - - cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); - cgroup_masks_read_one(seq, "subtree_ss_mask", 
cgrp->subtree_ss_mask); - - cgroup_kn_unlock(of->kn); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_legacy_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .seq_show = current_css_set_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "cgroup_subsys_states", - .seq_show = cgroup_subsys_states_read, - }, - - { - .name = "cgroup_masks", - .seq_show = cgroup_masks_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .seq_show = current_css_set_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - .flags = CFTYPE_ONLY_ON_ROOT, - }, - - { - .name = "css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "csses", - .seq_show = cgroup_subsys_states_read, - }, - - { - .name = "masks", - .seq_show = cgroup_masks_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_legacy_files, -}; - -/* - * On v2, debug is an implicit controller enabled by "cgroup_debug" boot - * 
parameter. - */ -static int __init enable_cgroup_debug(char *str) -{ - debug_cgrp_subsys.dfl_cftypes = debug_files; - debug_cgrp_subsys.implicit_on_dfl = true; - debug_cgrp_subsys.threaded = true; - return 1; -} -__setup("cgroup_debug", enable_cgroup_debug); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c deleted file mode 100644 index 103938f25757..000000000000 --- a/kernel/cgroup/freezer.c +++ /dev/null @@ -1,315 +0,0 @@ -//SPDX-License-Identifier: GPL-2.0 -#include -#include - -#include "cgroup-internal.h" - -/* - * Propagate the cgroup frozen state upwards by the cgroup tree. - */ -static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) -{ - int desc = 1; - - /* - * If the new state is frozen, some freezing ancestor cgroups may change - * their state too, depending on if all their descendants are frozen. - * - * Otherwise, all ancestor cgroups are forced into the non-frozen state. - */ - while ((cgrp = cgroup_parent(cgrp))) { - if (frozen) { - cgrp->freezer.nr_frozen_descendants += desc; - if (!test_bit(CGRP_FROZEN, &cgrp->flags) && - test_bit(CGRP_FREEZE, &cgrp->flags) && - cgrp->freezer.nr_frozen_descendants == - cgrp->nr_descendants) { - set_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - desc++; - } - } else { - cgrp->freezer.nr_frozen_descendants -= desc; - if (test_bit(CGRP_FROZEN, &cgrp->flags)) { - clear_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - desc++; - } - } - } -} - -/* - * Revisit the cgroup frozen state. - * Checks if the cgroup is really frozen and perform all state transitions. - */ -void cgroup_update_frozen(struct cgroup *cgrp) -{ - bool frozen; - - lockdep_assert_held(&css_set_lock); - - /* - * If the cgroup has to be frozen (CGRP_FREEZE bit set), - * and all tasks are frozen and/or stopped, let's consider - * the cgroup frozen. Otherwise it's not frozen. 
- */ - frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && - cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); - - if (frozen) { - /* Already there? */ - if (test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - set_bit(CGRP_FROZEN, &cgrp->flags); - } else { - /* Already there? */ - if (!test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - clear_bit(CGRP_FROZEN, &cgrp->flags); - } - cgroup_file_notify(&cgrp->events_file); - - /* Update the state of ancestor cgroups. */ - cgroup_propagate_frozen(cgrp, frozen); -} - -/* - * Increment cgroup's nr_frozen_tasks. - */ -static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) -{ - cgrp->freezer.nr_frozen_tasks++; -} - -/* - * Decrement cgroup's nr_frozen_tasks. - */ -static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) -{ - cgrp->freezer.nr_frozen_tasks--; - WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); -} - -/* - * Enter frozen/stopped state, if not yet there. Update cgroup's counters, - * and revisit the state of the cgroup, if necessary. - */ -void cgroup_enter_frozen(void) -{ - struct cgroup *cgrp; - - if (current->frozen) - return; - - spin_lock_irq(&css_set_lock); - current->frozen = true; - cgrp = task_dfl_cgroup(current); - cgroup_inc_frozen_cnt(cgrp); - cgroup_update_frozen(cgrp); - spin_unlock_irq(&css_set_lock); -} - -/* - * Conditionally leave frozen/stopped state. Update cgroup's counters, - * and revisit the state of the cgroup, if necessary. - * - * If always_leave is not set, and the cgroup is freezing, - * we're racing with the cgroup freezing. In this case, we don't - * drop the frozen counter to avoid a transient switch to - * the unfrozen state. 
- */ -void cgroup_leave_frozen(bool always_leave) -{ - struct cgroup *cgrp; - - spin_lock_irq(&css_set_lock); - cgrp = task_dfl_cgroup(current); - if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { - cgroup_dec_frozen_cnt(cgrp); - cgroup_update_frozen(cgrp); - WARN_ON_ONCE(!current->frozen); - current->frozen = false; - } - spin_unlock_irq(&css_set_lock); - - if (unlikely(current->frozen)) { - /* - * If the task remained in the frozen state, - * make sure it won't reach userspace without - * entering the signal handling loop. - */ - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } -} - -/* - * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE - * jobctl bit. - */ -static void cgroup_freeze_task(struct task_struct *task, bool freeze) -{ - unsigned long flags; - - /* If the task is about to die, don't bother with freezing it. */ - if (!lock_task_sighand(task, &flags)) - return; - - if (freeze) { - task->jobctl |= JOBCTL_TRAP_FREEZE; - signal_wake_up(task, false); - } else { - task->jobctl &= ~JOBCTL_TRAP_FREEZE; - wake_up_process(task); - } - - unlock_task_sighand(task, &flags); -} - -/* - * Freeze or unfreeze all tasks in the given cgroup. - */ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) -{ - struct css_task_iter it; - struct task_struct *task; - - lockdep_assert_held(&cgroup_mutex); - - spin_lock_irq(&css_set_lock); - if (freeze) - set_bit(CGRP_FREEZE, &cgrp->flags); - else - clear_bit(CGRP_FREEZE, &cgrp->flags); - spin_unlock_irq(&css_set_lock); - - css_task_iter_start(&cgrp->self, 0, &it); - while ((task = css_task_iter_next(&it))) { - /* - * Ignore kernel threads here. Freezing cgroups containing - * kthreads isn't supported. 
- */ - if (task->flags & PF_KTHREAD) - continue; - cgroup_freeze_task(task, freeze); - } - css_task_iter_end(&it); - - /* - * Cgroup state should be revisited here to cover empty leaf cgroups - * and cgroups which descendants are already in the desired state. - */ - spin_lock_irq(&css_set_lock); - if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) - cgroup_update_frozen(cgrp); - spin_unlock_irq(&css_set_lock); -} - -/* - * Adjust the task state (freeze or unfreeze) and revisit the state of - * source and destination cgroups. - */ -void cgroup_freezer_migrate_task(struct task_struct *task, - struct cgroup *src, struct cgroup *dst) -{ - lockdep_assert_held(&css_set_lock); - - /* - * Kernel threads are not supposed to be frozen at all. - */ - if (task->flags & PF_KTHREAD) - return; - - /* - * Adjust counters of freezing and frozen tasks. - * Note, that if the task is frozen, but the destination cgroup is not - * frozen, we bump both counters to keep them balanced. - */ - if (task->frozen) { - cgroup_inc_frozen_cnt(dst); - cgroup_dec_frozen_cnt(src); - } - cgroup_update_frozen(dst); - cgroup_update_frozen(src); - - /* - * Force the task to the desired state. - */ - cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); -} - -void cgroup_freezer_frozen_exit(struct task_struct *task) -{ - struct cgroup *cgrp = task_dfl_cgroup(task); - - lockdep_assert_held(&css_set_lock); - - cgroup_dec_frozen_cnt(cgrp); - cgroup_update_frozen(cgrp); -} - -void cgroup_freeze(struct cgroup *cgrp, bool freeze) -{ - struct cgroup_subsys_state *css; - struct cgroup *dsct; - bool applied = false; - - lockdep_assert_held(&cgroup_mutex); - - /* - * Nothing changed? Just exit. - */ - if (cgrp->freezer.freeze == freeze) - return; - - cgrp->freezer.freeze = freeze; - - /* - * Propagate changes downwards the cgroup tree. 
- */ - css_for_each_descendant_pre(css, &cgrp->self) { - dsct = css->cgroup; - - if (cgroup_is_dead(dsct)) - continue; - - if (freeze) { - dsct->freezer.e_freeze++; - /* - * Already frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 1) - continue; - } else { - dsct->freezer.e_freeze--; - /* - * Still frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 0) - continue; - - WARN_ON_ONCE(dsct->freezer.e_freeze < 0); - } - - /* - * Do change actual state: freeze or unfreeze. - */ - cgroup_do_freeze(dsct, freeze); - applied = true; - } - - /* - * Even if the actual state hasn't changed, let's notify a user. - * The state can be enforced by an ancestor cgroup: the cgroup - * can already be in the desired state or it can be locked in the - * opposite state, so that the transition will never happen. - * In both cases it's better to notify a user, that there is - * nothing to wait for. - */ - if (!applied) - cgroup_file_notify(&cgrp->events_file); -} diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c deleted file mode 100644 index 86e9bbeb57ec..000000000000 --- a/kernel/cgroup/namespace.c +++ /dev/null @@ -1,155 +0,0 @@ -#include "cgroup-internal.h" - -#include -#include -#include -#include - - -/* cgroup namespaces */ - -static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) -{ - return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); -} - -static void dec_cgroup_namespaces(struct ucounts *ucounts) -{ - dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); -} - -static struct cgroup_namespace *alloc_cgroup_ns(void) -{ - struct cgroup_namespace *new_ns; - int ret; - - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); - if (!new_ns) - return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - return ERR_PTR(ret); - } - refcount_set(&new_ns->count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; -} - -void free_cgroup_ns(struct 
cgroup_namespace *ns) -{ - put_css_set(ns->root_cset); - dec_cgroup_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); -} -EXPORT_SYMBOL(free_cgroup_ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - struct cgroup_namespace *new_ns; - struct ucounts *ucounts; - struct css_set *cset; - - BUG_ON(!old_ns); - - if (!(flags & CLONE_NEWCGROUP)) { - get_cgroup_ns(old_ns); - return old_ns; - } - - /* Allow only sysadmin to create cgroup namespace. */ - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - ucounts = inc_cgroup_namespaces(user_ns); - if (!ucounts) - return ERR_PTR(-ENOSPC); - - /* It is not safe to take cgroup_mutex here */ - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - get_css_set(cset); - spin_unlock_irq(&css_set_lock); - - new_ns = alloc_cgroup_ns(); - if (IS_ERR(new_ns)) { - put_css_set(cset); - dec_cgroup_namespaces(ucounts); - return new_ns; - } - - new_ns->user_ns = get_user_ns(user_ns); - new_ns->ucounts = ucounts; - new_ns->root_cset = cset; - - return new_ns; -} - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) -{ - struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || - !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - /* Don't need to do anything if we are attaching to our own cgroupns. 
*/ - if (cgroup_ns == nsproxy->cgroup_ns) - return 0; - - get_cgroup_ns(cgroup_ns); - put_cgroup_ns(nsproxy->cgroup_ns); - nsproxy->cgroup_ns = cgroup_ns; - - return 0; -} - -static struct ns_common *cgroupns_get(struct task_struct *task) -{ - struct cgroup_namespace *ns = NULL; - struct nsproxy *nsproxy; - - task_lock(task); - nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->cgroup_ns; - get_cgroup_ns(ns); - } - task_unlock(task); - - return ns ? &ns->ns : NULL; -} - -static void cgroupns_put(struct ns_common *ns) -{ - put_cgroup_ns(to_cg_ns(ns)); -} - -static struct user_namespace *cgroupns_owner(struct ns_common *ns) -{ - return to_cg_ns(ns)->user_ns; -} - -const struct proc_ns_operations cgroupns_operations = { - .name = "cgroup", - .type = CLONE_NEWCGROUP, - .get = cgroupns_get, - .put = cgroupns_put, - .install = cgroupns_install, - .owner = cgroupns_owner, -}; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c deleted file mode 100644 index defad3c5e7dc..000000000000 --- a/kernel/cgroup/rdma.c +++ /dev/null @@ -1,619 +0,0 @@ -/* - * RDMA resource limiting controller for cgroups. - * - * Used to allow a cgroup hierarchy to stop processes from consuming - * additional RDMA resources after a certain limit is reached. - * - * Copyright (C) 2016 Parav Pandit - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. - */ - -#include -#include -#include -#include -#include -#include - -#define RDMACG_MAX_STR "max" - -/* - * Protects list of resource pools maintained on per cgroup basis - * and rdma device list. 
- */ -static DEFINE_MUTEX(rdmacg_mutex); -static LIST_HEAD(rdmacg_devices); - -enum rdmacg_file_type { - RDMACG_RESOURCE_TYPE_MAX, - RDMACG_RESOURCE_TYPE_STAT, -}; - -/* - * resource table definition as to be seen by the user. - * Need to add entries to it when more resources are - * added/defined at IB verb/core layer. - */ -static char const *rdmacg_resource_names[] = { - [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", - [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", -}; - -/* resource tracker for each resource of rdma cgroup */ -struct rdmacg_resource { - int max; - int usage; -}; - -/* - * resource pool object which represents per cgroup, per device - * resources. There are multiple instances of this object per cgroup, - * therefore it cannot be embedded within rdma_cgroup structure. It - * is maintained as list. - */ -struct rdmacg_resource_pool { - struct rdmacg_device *device; - struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; - - struct list_head cg_node; - struct list_head dev_node; - - /* count active user tasks of this pool */ - u64 usage_sum; - /* total number counts which are set to max */ - int num_max_cnt; -}; - -static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) -{ - return container_of(css, struct rdma_cgroup, css); -} - -static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) -{ - return css_rdmacg(cg->css.parent); -} - -static inline struct rdma_cgroup *get_current_rdmacg(void) -{ - return css_rdmacg(task_get_css(current, rdma_cgrp_id)); -} - -static void set_resource_limit(struct rdmacg_resource_pool *rpool, - int index, int new_max) -{ - if (new_max == S32_MAX) { - if (rpool->resources[index].max != S32_MAX) - rpool->num_max_cnt++; - } else { - if (rpool->resources[index].max == S32_MAX) - rpool->num_max_cnt--; - } - rpool->resources[index].max = new_max; -} - -static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) -{ - int i; - - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) - 
set_resource_limit(rpool, i, S32_MAX); -} - -static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) -{ - lockdep_assert_held(&rdmacg_mutex); - - list_del(&rpool->cg_node); - list_del(&rpool->dev_node); - kfree(rpool); -} - -static struct rdmacg_resource_pool * -find_cg_rpool_locked(struct rdma_cgroup *cg, - struct rdmacg_device *device) - -{ - struct rdmacg_resource_pool *pool; - - lockdep_assert_held(&rdmacg_mutex); - - list_for_each_entry(pool, &cg->rpools, cg_node) - if (pool->device == device) - return pool; - - return NULL; -} - -static struct rdmacg_resource_pool * -get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) -{ - struct rdmacg_resource_pool *rpool; - - rpool = find_cg_rpool_locked(cg, device); - if (rpool) - return rpool; - - rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); - if (!rpool) - return ERR_PTR(-ENOMEM); - - rpool->device = device; - set_all_resource_max_limit(rpool); - - INIT_LIST_HEAD(&rpool->cg_node); - INIT_LIST_HEAD(&rpool->dev_node); - list_add_tail(&rpool->cg_node, &cg->rpools); - list_add_tail(&rpool->dev_node, &device->rpools); - return rpool; -} - -/** - * uncharge_cg_locked - uncharge resource for rdma cgroup - * @cg: pointer to cg to uncharge and all parents in hierarchy - * @device: pointer to rdmacg device - * @index: index of the resource to uncharge in cg (resource pool) - * - * It also frees the resource pool which was created as part of - * charging operation when there are no resources attached to - * resource pool. - */ -static void -uncharge_cg_locked(struct rdma_cgroup *cg, - struct rdmacg_device *device, - enum rdmacg_resource_type index) -{ - struct rdmacg_resource_pool *rpool; - - rpool = find_cg_rpool_locked(cg, device); - - /* - * rpool cannot be null at this stage. Let kernel operate in case - * if there a bug in IB stack or rdma controller, instead of crashing - * the system. 
- */ - if (unlikely(!rpool)) { - pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); - return; - } - - rpool->resources[index].usage--; - - /* - * A negative count (or overflow) is invalid, - * it indicates a bug in the rdma controller. - */ - WARN_ON_ONCE(rpool->resources[index].usage < 0); - rpool->usage_sum--; - if (rpool->usage_sum == 0 && - rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { - /* - * No user of the rpool and all entries are set to max, so - * safe to delete this rpool. - */ - free_cg_rpool_locked(rpool); - } -} - -/** - * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count - * @device: pointer to rdmacg device - * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup - * stop uncharging - * @index: index of the resource to uncharge in cg in given resource pool - */ -static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, - struct rdmacg_device *device, - struct rdma_cgroup *stop_cg, - enum rdmacg_resource_type index) -{ - struct rdma_cgroup *p; - - mutex_lock(&rdmacg_mutex); - - for (p = cg; p != stop_cg; p = parent_rdmacg(p)) - uncharge_cg_locked(p, device, index); - - mutex_unlock(&rdmacg_mutex); - - css_put(&cg->css); -} - -/** - * rdmacg_uncharge - hierarchically uncharge rdma resource count - * @device: pointer to rdmacg device - * @index: index of the resource to uncharge in cgroup in given resource pool - */ -void rdmacg_uncharge(struct rdma_cgroup *cg, - struct rdmacg_device *device, - enum rdmacg_resource_type index) -{ - if (index >= RDMACG_RESOURCE_MAX) - return; - - rdmacg_uncharge_hierarchy(cg, device, NULL, index); -} -EXPORT_SYMBOL(rdmacg_uncharge); - -/** - * rdmacg_try_charge - hierarchically try to charge the rdma resource - * @rdmacg: pointer to rdma cgroup which will own this resource - * @device: pointer to rdmacg device - * @index: index of the resource to charge in cgroup (resource pool) - * - * This function follows charging resource in hierarchical way. 
- * It will fail if the charge would cause the new value to exceed the - * hierarchical limit. - * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. - * Returns pointer to rdmacg for this resource when charging is successful. - * - * Charger needs to account resources on two criteria. - * (a) per cgroup & (b) per device resource usage. - * Per cgroup resource usage ensures that tasks of cgroup doesn't cross - * the configured limits. Per device provides granular configuration - * in multi device usage. It allocates resource pool in the hierarchy - * for each parent it come across for first resource. Later on resource - * pool will be available. Therefore it will be much faster thereon - * to charge/uncharge. - */ -int rdmacg_try_charge(struct rdma_cgroup **rdmacg, - struct rdmacg_device *device, - enum rdmacg_resource_type index) -{ - struct rdma_cgroup *cg, *p; - struct rdmacg_resource_pool *rpool; - s64 new; - int ret = 0; - - if (index >= RDMACG_RESOURCE_MAX) - return -EINVAL; - - /* - * hold on to css, as cgroup can be removed but resource - * accounting happens on css. - */ - cg = get_current_rdmacg(); - - mutex_lock(&rdmacg_mutex); - for (p = cg; p; p = parent_rdmacg(p)) { - rpool = get_cg_rpool_locked(p, device); - if (IS_ERR(rpool)) { - ret = PTR_ERR(rpool); - goto err; - } else { - new = rpool->resources[index].usage + 1; - if (new > rpool->resources[index].max) { - ret = -EAGAIN; - goto err; - } else { - rpool->resources[index].usage = new; - rpool->usage_sum++; - } - } - } - mutex_unlock(&rdmacg_mutex); - - *rdmacg = cg; - return 0; - -err: - mutex_unlock(&rdmacg_mutex); - rdmacg_uncharge_hierarchy(cg, device, p, index); - return ret; -} -EXPORT_SYMBOL(rdmacg_try_charge); - -/** - * rdmacg_register_device - register rdmacg device to rdma controller. - * @device: pointer to rdmacg device whose resources need to be accounted. 
- * - * If IB stack wish a device to participate in rdma cgroup resource - * tracking, it must invoke this API to register with rdma cgroup before - * any user space application can start using the RDMA resources. - * Returns 0 on success or EINVAL when table length given is beyond - * supported size. - */ -int rdmacg_register_device(struct rdmacg_device *device) -{ - INIT_LIST_HEAD(&device->dev_node); - INIT_LIST_HEAD(&device->rpools); - - mutex_lock(&rdmacg_mutex); - list_add_tail(&device->dev_node, &rdmacg_devices); - mutex_unlock(&rdmacg_mutex); - return 0; -} -EXPORT_SYMBOL(rdmacg_register_device); - -/** - * rdmacg_unregister_device - unregister rdmacg device from rdma controller. - * @device: pointer to rdmacg device which was previously registered with rdma - * controller using rdmacg_register_device(). - * - * IB stack must invoke this after all the resources of the IB device - * are destroyed and after ensuring that no more resources will be created - * when this API is invoked. - */ -void rdmacg_unregister_device(struct rdmacg_device *device) -{ - struct rdmacg_resource_pool *rpool, *tmp; - - /* - * Synchronize with any active resource settings, - * usage query happening via configfs. - */ - mutex_lock(&rdmacg_mutex); - list_del_init(&device->dev_node); - - /* - * Now that this device is off the cgroup list, its safe to free - * all the rpool resources. 
- */ - list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) - free_cg_rpool_locked(rpool); - - mutex_unlock(&rdmacg_mutex); -} -EXPORT_SYMBOL(rdmacg_unregister_device); - -static int parse_resource(char *c, int *intval) -{ - substring_t argstr; - const char **table = &rdmacg_resource_names[0]; - char *name, *value = c; - size_t len; - int ret, i = 0; - - name = strsep(&value, "="); - if (!name || !value) - return -EINVAL; - - len = strlen(value); - - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - if (strcmp(table[i], name)) - continue; - - argstr.from = value; - argstr.to = value + len; - - ret = match_int(&argstr, intval); - if (ret >= 0) { - if (*intval < 0) - break; - return i; - } - if (strncmp(value, RDMACG_MAX_STR, len) == 0) { - *intval = S32_MAX; - return i; - } - break; - } - return -EINVAL; -} - -static int rdmacg_parse_limits(char *options, - int *new_limits, unsigned long *enables) -{ - char *c; - int err = -EINVAL; - - /* parse resource options */ - while ((c = strsep(&options, " ")) != NULL) { - int index, intval; - - index = parse_resource(c, &intval); - if (index < 0) - goto err; - - new_limits[index] = intval; - *enables |= BIT(index); - } - return 0; - -err: - return err; -} - -static struct rdmacg_device *rdmacg_get_device_locked(const char *name) -{ - struct rdmacg_device *device; - - lockdep_assert_held(&rdmacg_mutex); - - list_for_each_entry(device, &rdmacg_devices, dev_node) - if (!strcmp(name, device->name)) - return device; - - return NULL; -} - -static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdma_cgroup *cg = css_rdmacg(of_css(of)); - const char *dev_name; - struct rdmacg_resource_pool *rpool; - struct rdmacg_device *device; - char *options = strstrip(buf); - int *new_limits; - unsigned long enables = 0; - int i = 0, ret = 0; - - /* extract the device name first */ - dev_name = strsep(&options, " "); - if (!dev_name) { - ret = -EINVAL; - goto err; - } - - 
new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); - if (!new_limits) { - ret = -ENOMEM; - goto err; - } - - ret = rdmacg_parse_limits(options, new_limits, &enables); - if (ret) - goto parse_err; - - /* acquire lock to synchronize with hot plug devices */ - mutex_lock(&rdmacg_mutex); - - device = rdmacg_get_device_locked(dev_name); - if (!device) { - ret = -ENODEV; - goto dev_err; - } - - rpool = get_cg_rpool_locked(cg, device); - if (IS_ERR(rpool)) { - ret = PTR_ERR(rpool); - goto dev_err; - } - - /* now set the new limits of the rpool */ - for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) - set_resource_limit(rpool, i, new_limits[i]); - - if (rpool->usage_sum == 0 && - rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { - /* - * No user of the rpool and all entries are set to max, so - * safe to delete this rpool. - */ - free_cg_rpool_locked(rpool); - } - -dev_err: - mutex_unlock(&rdmacg_mutex); - -parse_err: - kfree(new_limits); - -err: - return ret ?: nbytes; -} - -static void print_rpool_values(struct seq_file *sf, - struct rdmacg_resource_pool *rpool) -{ - enum rdmacg_file_type sf_type; - int i; - u32 value; - - sf_type = seq_cft(sf)->private; - - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - seq_puts(sf, rdmacg_resource_names[i]); - seq_putc(sf, '='); - if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { - if (rpool) - value = rpool->resources[i].max; - else - value = S32_MAX; - } else { - if (rpool) - value = rpool->resources[i].usage; - else - value = 0; - } - - if (value == S32_MAX) - seq_puts(sf, RDMACG_MAX_STR); - else - seq_printf(sf, "%d", value); - seq_putc(sf, ' '); - } -} - -static int rdmacg_resource_read(struct seq_file *sf, void *v) -{ - struct rdmacg_device *device; - struct rdmacg_resource_pool *rpool; - struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); - - mutex_lock(&rdmacg_mutex); - - list_for_each_entry(device, &rdmacg_devices, dev_node) { - seq_printf(sf, "%s ", device->name); - - rpool = find_cg_rpool_locked(cg, device); - 
print_rpool_values(sf, rpool); - - seq_putc(sf, '\n'); - } - - mutex_unlock(&rdmacg_mutex); - return 0; -} - -static struct cftype rdmacg_files[] = { - { - .name = "max", - .write = rdmacg_resource_set_max, - .seq_show = rdmacg_resource_read, - .private = RDMACG_RESOURCE_TYPE_MAX, - .flags = CFTYPE_NOT_ON_ROOT, - }, - { - .name = "current", - .seq_show = rdmacg_resource_read, - .private = RDMACG_RESOURCE_TYPE_STAT, - .flags = CFTYPE_NOT_ON_ROOT, - }, - { } /* terminate */ -}; - -static struct cgroup_subsys_state * -rdmacg_css_alloc(struct cgroup_subsys_state *parent) -{ - struct rdma_cgroup *cg; - - cg = kzalloc(sizeof(*cg), GFP_KERNEL); - if (!cg) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&cg->rpools); - return &cg->css; -} - -static void rdmacg_css_free(struct cgroup_subsys_state *css) -{ - struct rdma_cgroup *cg = css_rdmacg(css); - - kfree(cg); -} - -/** - * rdmacg_css_offline - cgroup css_offline callback - * @css: css of interest - * - * This function is called when @css is about to go away and responsible - * for shooting down all rdmacg associated with @css. As part of that it - * marks all the resource pool entries to max value, so that when resources are - * uncharged, associated resource pool can be freed as well. 
- */ -static void rdmacg_css_offline(struct cgroup_subsys_state *css) -{ - struct rdma_cgroup *cg = css_rdmacg(css); - struct rdmacg_resource_pool *rpool; - - mutex_lock(&rdmacg_mutex); - - list_for_each_entry(rpool, &cg->rpools, cg_node) - set_all_resource_max_limit(rpool); - - mutex_unlock(&rdmacg_mutex); -} - -struct cgroup_subsys rdma_cgrp_subsys = { - .css_alloc = rdmacg_css_alloc, - .css_free = rdmacg_css_free, - .css_offline = rdmacg_css_offline, - .legacy_cftypes = rdmacg_files, - .dfl_cftypes = rdmacg_files, -}; diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup_freezer.c similarity index 99% rename from kernel/cgroup/legacy_freezer.c rename to kernel/cgroup_freezer.c index 08236798d173..1b72d56edce5 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup_freezer.c @@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) rcu_read_unlock(); /* are all tasks frozen? */ - css_task_iter_start(css, 0, &it); + css_task_iter_start(css, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task)) { @@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, 0, &it); + css_task_iter_start(&freezer->css, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); @@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, 0, &it); + css_task_iter_start(&freezer->css, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); diff --git a/kernel/cgroup/pids.c b/kernel/cgroup_pids.c similarity index 98% rename from kernel/cgroup/pids.c rename to kernel/cgroup_pids.c index 6f064cce257a..b8b898e21c19 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup_pids.c @@ -248,7 +248,7 @@ static void pids_cancel_fork(struct task_struct *task) pids_uncharge(pids, 1); } 
-static void pids_release(struct task_struct *task) +static void pids_free(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -343,8 +343,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .release = pids_release, + .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, - .threaded = true, }; diff --git a/kernel/cgroup/cpuset.c b/kernel/cpuset.c similarity index 97% rename from kernel/cgroup/cpuset.c rename to kernel/cpuset.c index 4890211f5709..df64cb9ba63a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cpuset.c @@ -298,16 +298,6 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); -/* - * Cgroup v2 behavior is used when on default hierarchy or the - * cgroup_v2_mode flag is set. - */ -static inline bool is_in_v2_mode(void) -{ - return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); -} - /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -504,7 +494,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. 
*/ ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_cpuset_subset(trial, par)) goto out; /* @@ -889,7 +880,7 @@ static void update_tasks_cpumask(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, 0, &it); + css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) update_cpus_allowed(cs, task, cs->effective_cpus); css_task_iter_end(&it); @@ -922,7 +913,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (is_in_v2_mode() && cpumask_empty(new_cpus)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -939,7 +931,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!is_in_v2_mode() && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1134,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_task_iter_start(&cs->css, 0, &it); + css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; @@ -1192,7 +1184,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. 
*/ - if (is_in_v2_mode() && nodes_empty(*new_mems)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1209,7 +1202,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!is_in_v2_mode() && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1326,7 +1319,7 @@ static void update_tasks_flags(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, 0, &it); + css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) cpuset_update_task_spread_flag(cs, task); css_task_iter_end(&it); @@ -1500,7 +1493,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!is_in_v2_mode() && + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1557,7 +1550,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - get_online_cpus(); mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1613,7 +1605,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); - put_online_cpus(); } /* The various types of files and directories in a cpuset file system */ @@ -2032,7 +2023,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (is_in_v2_mode()) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2113,7 +2104,7 @@ static void cpuset_bind(struct 
cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (is_in_v2_mode()) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2183,9 +2174,12 @@ int __init cpuset_init(void) { int err = 0; - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)); + if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) + BUG(); + if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) + BUG(); + if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); cpumask_setall(top_cpuset.cpus_requested); @@ -2201,7 +2195,8 @@ int __init cpuset_init(void) if (err < 0) return err; - BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); + if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) + BUG(); return 0; } @@ -2327,7 +2322,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (is_in_v2_mode()) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2365,7 +2360,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = is_in_v2_mode(); + bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); mutex_lock(&cpuset_mutex); @@ -2424,7 +2419,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } } -void cpuset_update_active_cpus(void) +void cpuset_update_active_cpus(bool cpu_online) { /* * We're inside cpu hotplug critical region which usually nests @@ -2469,11 +2464,8 @@ static struct notifier_block 
cpuset_track_online_nodes_nb = { */ void __init cpuset_init_smp(void) { - /* - * cpus_allowd/mems_allowed set to v2 values in the initial - * cpuset_bind() call will be reset to v1 values in another - * cpuset_bind() call when v1 cpuset is mounted. - */ + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); + top_cpuset.mems_allowed = node_states[N_MEMORY]; top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); @@ -2507,23 +2499,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) spin_unlock_irqrestore(&callback_lock, flags); } -/** - * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. - * @tsk: pointer to task_struct with which the scheduler is struggling - * - * Description: In the case that the scheduler cannot find an allowed cpu in - * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy - * mode however, this value is the same as task_cs(tsk)->effective_cpus, - * which will not contain a sane cpumask during cases such as cpu hotplugging. - * This is the absolute last resort for the scheduler and it is only used if - * _every_ other avenue has been traveled. - **/ - void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { rcu_read_lock(); - do_set_cpus_allowed(tsk, is_in_v2_mode() ? 
- task_cs(tsk)->cpus_allowed : cpu_possible_mask); + do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); rcu_read_unlock(); /* diff --git a/kernel/cred.c b/kernel/cred.c index ad24a4cb25c0..d63a2d861ac2 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -196,7 +196,7 @@ const struct cred *get_task_cred(struct task_struct *task) do { cred = __task_cred((task)); BUG_ON(!cred); - } while (!get_cred_rcu(cred)); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); rcu_read_unlock(); return cred; diff --git a/kernel/events/core.c b/kernel/events/core.c index 540256086e91..547184b71dce 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11458,12 +11458,5 @@ struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .attach = perf_cgroup_attach, - /* - * Implicitly enable on dfl hierarchy so that perf events can - * always be filtered by cgroup2 path as long as perf_event - * controller is not mounted on a legacy hierarchy. 
- */ - .implicit_on_dfl = true, - .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/exit.c b/kernel/exit.c index 5e0ca9c806a6..09beccfb0977 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -178,7 +178,6 @@ void release_task(struct task_struct *p) rcu_read_unlock(); proc_flush_task(p); - cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); diff --git a/kernel/fork.c b/kernel/fork.c index cbce7b33193b..00f93deb2829 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1069,9 +1069,7 @@ static int wait_for_vfork_done(struct task_struct *child, int killed; freezer_do_not_count(); - cgroup_enter_frozen(); killed = wait_for_completion_killable(vfork); - cgroup_leave_frozen(false); freezer_count(); if (killed) { diff --git a/kernel/reboot.c b/kernel/reboot.c index 48445cb61e8f..a5ff5d0ef572 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -269,6 +269,10 @@ EXPORT_SYMBOL_GPL(kernel_power_off); static DEFINE_MUTEX(reboot_mutex); +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) +extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg); +#endif + /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers @@ -277,9 +281,6 @@ static DEFINE_MUTEX(reboot_mutex); * * reboot doesn't sync: do that yourself before calling this. */ -#ifdef CONFIG_KSU_MANUAL_HOOK -extern int ksu_handle_sys_reboot(int magic1, int magic2, unsigned int cmd, void __user **arg); -#endif SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { @@ -287,10 +288,9 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, char buffer[256]; int ret = 0; -#ifdef CONFIG_KSU_MANUAL_HOOK +#if defined(CONFIG_KSU) && !defined(CONFIG_KSU_KPROBES_KSUD) ksu_handle_sys_reboot(magic1, magic2, cmd, &arg); #endif - /* We only trust the superuser with rebooting the system. 
*/ if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) return -EPERM; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 29e696d490f1..f312d7a3b914 100755 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8055,7 +8055,7 @@ static void cpuset_cpu_active(void) */ cpuset_force_rebuild(); } - cpuset_update_active_cpus(); + cpuset_update_active_cpus(true); } static int cpuset_cpu_inactive(unsigned int cpu) @@ -8078,7 +8078,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) if (overflow) return -EBUSY; - cpuset_update_active_cpus(); + cpuset_update_active_cpus(false); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 5b2edc6341f8..8051e3741aed 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -38,7 +38,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include @@ -48,10 +47,6 @@ #include #include #include -#ifdef CONFIG_REKERNEL -#include -#include <../drivers/rekernel/rekernel.h> -#endif /* CONFIG_REKERNEL */ #include "audit.h" /* audit_signal_info() */ /* @@ -151,10 +146,9 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) static int recalc_sigpending_tsk(struct task_struct *t) { - if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || + if ((t->jobctl & JOBCTL_PENDING_MASK) || PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked) || - cgroup_task_frozen(t)) { + PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return 1; } @@ -1212,10 +1206,6 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, { unsigned long flags; int ret = -ESRCH; -#ifdef CONFIG_REKERNEL - if (sig == SIGKILL || sig == SIGTERM || sig == SIGABRT || sig == SIGQUIT) - rekernel_report(SIGNAL, sig, task_tgid_nr(current), current, task_tgid_nr(p), p, false, NULL); -#endif /* CONFIG_REKERNEL */ if (lock_task_sighand(p, &flags)) { ret = send_signal(sig, info, p, 
group); @@ -1939,10 +1929,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) */ preempt_disable(); read_unlock(&tasklist_lock); - cgroup_enter_frozen(); preempt_enable_no_resched(); freezable_schedule(); - cgroup_leave_frozen(true); } else { /* * By the time we got the lock, our tracer went away. @@ -2120,7 +2108,6 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ - cgroup_enter_frozen(); freezable_schedule(); return true; } else { @@ -2167,43 +2154,6 @@ static void do_jobctl_trap(void) } } -/** - * do_freezer_trap - handle the freezer jobctl trap - * - * Puts the task into frozen state, if only the task is not about to quit. - * In this case it drops JOBCTL_TRAP_FREEZE. - * - * CONTEXT: - * Must be called with @current->sighand->siglock held, - * which is always released before returning. - */ -static void do_freezer_trap(void) - __releases(¤t->sighand->siglock) -{ - /* - * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, - * let's make another loop to give it a chance to be handled. - * In any case, we'll return back. - */ - if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != - JOBCTL_TRAP_FREEZE) { - spin_unlock_irq(¤t->sighand->siglock); - return; - } - - /* - * Now we're sure that there is no pending fatal signal and no - * pending traps. Clear TIF_SIGPENDING to not get out of schedule() - * immediately (if there is a non-fatal signal pending), and - * put the task into sleep. 
- */ - __set_current_state(TASK_INTERRUPTIBLE); - clear_thread_flag(TIF_SIGPENDING); - spin_unlock_irq(¤t->sighand->siglock); - cgroup_enter_frozen(); - freezable_schedule(); -} - static int ptrace_signal(int signr, siginfo_t *info) { ptrace_signal_deliver(); @@ -2316,10 +2266,6 @@ int get_signal(struct ksignal *ksig) trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, &sighand->action[SIGKILL - 1]); recalc_sigpending(); - current->jobctl &= ~JOBCTL_TRAP_FREEZE; - spin_unlock_irq(&sighand->siglock); - if (unlikely(cgroup_task_frozen(current))) - cgroup_leave_frozen(true); goto fatal; } @@ -2330,24 +2276,9 @@ int get_signal(struct ksignal *ksig) do_signal_stop(0)) goto relock; - if (unlikely(current->jobctl & - (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { - if (current->jobctl & JOBCTL_TRAP_MASK) { - do_jobctl_trap(); - spin_unlock_irq(&sighand->siglock); - } else if (current->jobctl & JOBCTL_TRAP_FREEZE) - do_freezer_trap(); - - goto relock; - } - - /* - * If the task is leaving the frozen state, let's update - * cgroup counters and reset the frozen bit. - */ - if (unlikely(cgroup_task_frozen(current))) { + if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { + do_jobctl_trap(); spin_unlock_irq(&sighand->siglock); - cgroup_leave_frozen(true); goto relock; } @@ -2441,8 +2372,8 @@ int get_signal(struct ksignal *ksig) continue; } - spin_unlock_irq(&sighand->siglock); fatal: + spin_unlock_irq(&sighand->siglock); /* * Anything else is fatal, maybe with a core dump. 
@@ -2477,7 +2408,7 @@ int get_signal(struct ksignal *ksig) } /** - * signal_delivered - + * signal_delivered - * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * @@ -3540,7 +3471,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { - return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); + return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); } #endif @@ -3665,7 +3596,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { sigset_to_compat(&mask, &old_ka.sa.sa_mask); - ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), + ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); @@ -3843,7 +3774,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) return -EFAULT; return sigsuspend(&newset); } - + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a49cdf33a62e..b508b47ae3ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -966,7 +966,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&iter->css, 0, &it); + css_task_iter_start(&iter->css, &it); while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index dad43d1924db..db65b0cdfc4c 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -128,7 +128,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, cs->classid = (u32)value; - 
css_task_iter_start(css, 0, &it); + css_task_iter_start(css, &it); while ((p = css_task_iter_next(&it))) { update_classid_task(p, cs->classid); cond_resched(); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 7e1caf9ee106..ac0c60389581 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2314,10 +2314,6 @@ static u32 ptrace_parent_sid(struct task_struct *task) return sid; } -#ifdef CONFIG_KSU_MANUAL_HOOK -extern bool is_ksu_transition(const struct task_security_struct *old_tsec, - const struct task_security_struct *new_tsec); -#endif static int check_nnp_nosuid(const struct linux_binprm *bprm, const struct task_security_struct *old_tsec, const struct task_security_struct *new_tsec) @@ -2332,11 +2328,6 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, if (new_tsec->sid == old_tsec->sid) return 0; /* No change in credentials */ -#ifdef CONFIG_KSU_MANUAL_HOOK - if (is_ksu_transition(old_tsec, new_tsec)) - return 0; -#endif - /* * The only transitions we permit under NNP or nosuid * are transitions to bounded SIDs, i.e. SIDs that are @@ -5916,6 +5907,10 @@ static int selinux_getprocattr(struct task_struct *p, return -EINVAL; } +#ifdef CONFIG_KSU +extern int ksu_hide_setprocattr(const char *name, void *value, size_t size); +#endif + static int selinux_setprocattr(struct task_struct *p, char *name, void *value, size_t size) { @@ -5925,6 +5920,10 @@ static int selinux_setprocattr(struct task_struct *p, int error; char *str = value; +#ifdef CONFIG_KSU + ksu_hide_setprocattr(name, value, size); +#endif + if (current != p) { /* SELinux only allows a process to change its own security attributes. 
*/ diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index b818410d2418..58e5ccf6b1e9 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -76,7 +76,11 @@ int selinux_policycap_netpeer; int selinux_policycap_openperm; int selinux_policycap_alwaysnetwork; +#ifdef CONFIG_KSU +DEFINE_RWLOCK(policy_rwlock); +#else static DEFINE_RWLOCK(policy_rwlock); +#endif static struct sidtab sidtab; struct policydb policydb; diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index 34156826c14f..5bc2b92ace6d 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) { FILE *fp; char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; - char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path; char *token, *saved_ptr = NULL; + int found = 0; fp = fopen("/proc/mounts", "r"); if (!fp) @@ -24,43 +24,31 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) * and inspect every cgroupfs mount point to find one that has * perf_event subsystem */ - path_v1[0] = '\0'; - path_v2[0] = '\0'; - while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %" STR(PATH_MAX)"s %*d %*d\n", mountpoint, type, tokens) == 3) { - if (!path_v1[0] && !strcmp(type, "cgroup")) { + if (!strcmp(type, "cgroup")) { token = strtok_r(tokens, ",", &saved_ptr); while (token != NULL) { if (!strcmp(token, "perf_event")) { - strcpy(path_v1, mountpoint); + found = 1; break; } token = strtok_r(NULL, ",", &saved_ptr); } } - - if (!path_v2[0] && !strcmp(type, "cgroup2")) - strcpy(path_v2, mountpoint); - - if (path_v1[0] && path_v2[0]) + if (found) break; } fclose(fp); - - if (path_v1[0]) - path = path_v1; - else if (path_v2[0]) - path = path_v2; - else + if (!found) return -1; - if (strlen(path) < maxlen) { - strcpy(buf, path); + if (strlen(mountpoint) < maxlen) { + strcpy(buf, mountpoint); return 0; } return -1;