diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 948e5d193a8c..e35f5377f681 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -27,9 +27,9 @@ jobs:
- ursa
steps:
- name: 安装软件包
+ if: env.PACKAGES != ''
env:
PACKAGES:
- ccache
binutils-aarch64-linux-gnu
binutils-arm-linux-gnueabi
run: |
@@ -39,7 +39,7 @@ jobs:
- name: 安装make4.4.1-2
run: |
curl -LSs http://ftp.debian.org/debian/pool/main/m/make-dfsg/make_4.4.1-2_amd64.deb -o make.deb
- sudo apt-get install -y ./make.deb
+ sudo apt-get install -y -q ./make.deb
rm ./make.deb
- name: 同步仓库
@@ -49,17 +49,25 @@ jobs:
- name: 缓存Clang
id: cache-clang
- uses: actions/cache@main
+ uses: actions/cache/restore@main
with:
path: clang
key: clang-${{ env.AOSP_TOOLCHAIN_BRANCH }}-${{ env.AOSP_CLANG_VERSION }}
- name: 下载Clang
+ id: download_clang
if: steps.cache-clang.outputs.cache-hit != 'true'
- run:
- mkdir -p clang &&
- curl -LSs "https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86/+archive/refs/heads/${{ env.AOSP_TOOLCHAIN_BRANCH }}-release/clang-${{ env.AOSP_CLANG_VERSION }}.tar.gz" |
- tar xz -C clang
+ run: |
+ mkdir -p clang
+ wget -c -t 10 "https://android.googlesource.com/platform/prebuilts/clang/host/linux-x86/+archive/refs/heads/${{ env.AOSP_TOOLCHAIN_BRANCH }}-release/clang-${{ env.AOSP_CLANG_VERSION }}.tar.gz" -O clang.tgz
+ tar -zxvf clang.tgz -C clang/
+
+ - name: 保存Clang
+ if: always() && steps.cache-clang.outputs.cache-hit != 'true' && steps.download_clang.outcome == 'success'
+ uses: actions/cache/save@main
+ with:
+ path: clang
+ key: clang-${{ env.AOSP_TOOLCHAIN_BRANCH }}-${{ env.AOSP_CLANG_VERSION }}
- name: 缓存ccache
uses: hendrikmuhs/ccache-action@main
@@ -73,17 +81,18 @@ jobs:
env:
MAKE_ARGS:
-j$(nproc --all)
+ O=out
+ LLVM=1
+ LLVM_IAS=1
CC="ccache clang"
LD=ld.lld
ARCH=arm64
- LLVM=1
- LLVM_IAS=1
- O=out
CROSS_COMPILE=aarch64-linux-gnu-
CROSS_COMPILE_ARM32=arm-linux-gnueabi-
CONFIG_FILES:
vendor/xiaomi/mi845_defconfig
vendor/xiaomi/${{ matrix.device }}.config
+ lxc.config
run: |
export PATH=$GITHUB_WORKSPACE/clang/bin:$PATH
export KBUILD_BUILD_USER=${{ github.repository_owner }}
@@ -112,31 +121,15 @@ jobs:
EOF
zip -qr9 Anykernel3-${{ matrix.device }}.zip * -x .git .github README.md *placeholder
- - name: 打包(boot)
- run: |
- git clone https://android.googlesource.com/platform/system/tools/mkbootimg --depth=1 mkbootimg
- cp kernel/out/arch/arm64/boot/Image.gz-dtb mkbootimg/
- cd mkbootimg
- boot_url=$(curl -LSs https://download.lineageos.org/api/v2/devices/${{ matrix.device }}/builds | jq -r '.[0].files[1].url')
- curl -LSs $boot_url -o boot.img
- mkbootimg_args=$(./unpack_bootimg.py --out out --boot_img boot.img --format mkbootimg)
- mv Image.gz-dtb out/kernel
- eval "./mkbootimg.py $mkbootimg_args -o boot-lineage-${{ matrix.device }}.img"
-
- name: 上传文件
uses: actions/upload-artifact@main
with:
name: kernel-${{ matrix.device }}-ak3
path: ak3/Anykernel3-${{ matrix.device }}.zip
- - name: 上传文件
- uses: actions/upload-artifact@main
- with:
- name: kernel-${{ matrix.device }}-boot
- path: mkbootimg/boot-lineage-${{ matrix.device }}.img
-
release:
name: 发布
+ if: github.event_name == 'push'
permissions: { contents: write }
runs-on: ubuntu-latest
needs: build
@@ -151,54 +144,61 @@ jobs:
- name: 获取当前时间
id: time
run: |
- echo "time=$(TZ='Asia/Shanghai' date -u +'%Y%m%d%H%M')" >> $GITHUB_OUTPUT
- echo "timestamp=$(date +%s)" >> $GITHUB_OUTPUT
-
- - name: 下载ci管理器
- continue-on-error: true
- uses: dawidd6/action-download-artifact@master
- with:
- repo: rsuntk/KernelSU
- workflow_conclusion: success
- name: manager
- workflow: build-manager.yml
- path: manager
- check_artifacts: true
- search_artifacts: true
+ NOW=$(date +%s)
+ TIME_STR=$(TZ='Asia/Shanghai' date -d "@$NOW" +'%Y%m%d%H%M')
+ echo "timestamp=$NOW" >> $GITHUB_OUTPUT
+ echo "time=$TIME_STR" >> $GITHUB_OUTPUT
- name: 发布
uses: softprops/action-gh-release@master
+ id: release
with:
tag_name: rel-${{ steps.time.outputs.timestamp }}
name: Kernel build ${{ steps.time.outputs.time }}
prerelease: ${{ startsWith(github.ref_name, 'dev/') }}
files: |
kernel/*
- manager/*
- name: 发送Telegram通知
continue-on-error: true
- env:
- COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
- COMMIT_URL: ${{ github.event.head_commit.url }}
- RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
- RELEASE_URL: ${{ github.server_url }}/${{ github.repository }}/releases/tag/rel-${{ steps.time.outputs.timestamp }}
run: |
- msg="*CI ${{ steps.time.outputs.time }}*
- > Branch/分支: \`${{ github.ref_name }}\`
- \`\`\`
- $COMMIT_MESSAGE
- \`\`\`
- [Download/下载]($RELEASE_URL)
- [Commit/提交]($COMMIT_URL)
- [Run/工作流]($RUN_URL)
+ IDS=(${{ join(github.event.commits.*.id, ' ') }})
+ MAX=6
+ if [ "${#IDS[@]}" -gt "$MAX" ]; then
+ COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]:0:$MAX}"; echo "......")"
+ else
+ COMMIT_IDS_TEXT="$(printf "%s\n" "${IDS[@]}")"
+ fi
+ MSG="\
+ CI ${{ steps.time.outputs.time }}
+
\
+ 项目: ${{ github.repository }}
+ 分支: ${{ github.ref_name }}\
+
+ 提交ID:
+ $COMMIT_IDS_TEXT
\
"
- curl -LSs https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \
- -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \
- -F 'message_thread_id=${{ secrets.TELEGRAM_MESSAGE_THREAD_ID }}' \
- -F 'parse_mode="markdownv2"' \
- -F "text=\"$msg\"" | tee Markdown.txt
- ! ${{ startsWith(github.ref_name, 'stable/') }} || \
- curl https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \
- -F 'chat_id="${{ secrets.TELEGRAM_CHAT_ID }}"' \
- -F message_id=$(jq '.result.message_id' Markdown.txt)
+ PREVIEW_OPTIONS="{ \
+ \"url\": \"${{ steps.release.outputs.url }}\", \
+ \"prefer_small_media\": true, \
+ \"show_above_text\": true \
+ }"
+ BUTTONS="{\"inline_keyboard\": [ [ \
+ { \"text\": \"下载链接\", \"url\": \"${{ steps.release.outputs.url }}\" }, \
+ { \"text\": \"对比差异\", \"url\": \"${{ github.event.compare }}\" } \
+ ] ] }"
+ curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage \
+ -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \
+ -d "message_thread_id=${{ vars.TELEGRAM_MESSAGE_THREAD_ID }}" \
+ -d "parse_mode=HTML" \
+ --data-urlencode "text=$MSG" \
+ -d "link_preview_options=$PREVIEW_OPTIONS" \
+ -d "reply_markup=$BUTTONS" \
+ -o response.txt && \
+ (! ${{ startsWith(github.ref_name, 'stable/') }} || \
+ curl -LSs -X POST https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/pinChatMessage \
+ -d "chat_id=${{ vars.TELEGRAM_CHAT_ID }}" \
+ -d "message_id=$(jq '.result.message_id' response.txt)")
+ if [ "${{ runner.debug }}" = "1" ]; then
+ cat response.txt
+ fi
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index f8614b3d49f9..a542b9f2a30d 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,7 +1,5 @@
00-INDEX
- This file
-bfq-iosched.txt
- - BFQ IO scheduler and its tunables
biodoc.txt
- Notes on the Generic Block Layer Rewrite in Linux 2.5
biovecs.txt
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
deleted file mode 100644
index 0539e87962ed..000000000000
--- a/Documentation/block/bfq-iosched.txt
+++ /dev/null
@@ -1,545 +0,0 @@
-BFQ (Budget Fair Queueing)
-==========================
-
-BFQ is a proportional-share I/O scheduler, with some extra
-low-latency capabilities. In addition to cgroups support (blkio or io
-controllers), BFQ's main features are:
-- BFQ guarantees a high system and application responsiveness, and a
- low latency for time-sensitive applications, such as audio or video
- players;
-- BFQ distributes bandwidth, and not just time, among processes or
- groups (switching back to time distribution when needed to keep
- throughput high).
-
-In its default configuration, BFQ privileges latency over
-throughput. So, when needed for achieving a lower latency, BFQ builds
-schedules that may lead to a lower throughput. If your main or only
-goal, for a given device, is to achieve the maximum-possible
-throughput at all times, then do switch off all low-latency heuristics
-for that device, by setting low_latency to 0. Full details in Section 3.
-
-On average CPUs, the current version of BFQ can handle devices
-performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
-reference, 30-50 KIOPS correspond to very high bandwidths with
-sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
-to 120-200 MB/s with 4KB random I/O.
-
-The table of contents follow. Impatients can just jump to Section 3.
-
-CONTENTS
-
-1. When may BFQ be useful?
- 1-1 Personal systems
- 1-2 Server systems
-2. How does BFQ work?
-3. What are BFQ's tunable?
-4. BFQ group scheduling
- 4-1 Service guarantees provided
- 4-2 Interface
-
-1. When may BFQ be useful?
-==========================
-
-BFQ provides the following benefits on personal and server systems.
-
-1-1 Personal systems
---------------------
-
-Low latency for interactive applications
-
-Regardless of the actual background workload, BFQ guarantees that, for
-interactive tasks, the storage device is virtually as responsive as if
-it was idle. For example, even if one or more of the following
-background workloads are being executed:
-- one or more large files are being read, written or copied,
-- a tree of source files is being compiled,
-- one or more virtual machines are performing I/O,
-- a software update is in progress,
-- indexing daemons are scanning filesystems and updating their
- databases,
-starting an application or loading a file from within an application
-takes about the same time as if the storage device was idle. As a
-comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
-applications experience high latencies, or even become unresponsive
-until the background workload terminates (also on SSDs).
-
-Low latency for soft real-time applications
-
-Also soft real-time applications, such as audio and video
-players/streamers, enjoy a low latency and a low drop rate, regardless
-of the background I/O workload. As a consequence, these applications
-do not suffer from almost any glitch due to the background workload.
-
-Higher speed for code-development tasks
-
-If some additional workload happens to be executed in parallel, then
-BFQ executes the I/O-related components of typical code-development
-tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
-NOOP or DEADLINE.
-
-High throughput
-
-On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
-up to 150% higher throughput than DEADLINE and NOOP, with all the
-sequential workloads considered in our tests. With random workloads,
-and with all the workloads on flash-based devices, BFQ achieves,
-instead, about the same throughput as the other schedulers.
-
-Strong fairness, bandwidth and delay guarantees
-
-BFQ distributes the device throughput, and not just the device time,
-among I/O-bound applications in proportion their weights, with any
-workload and regardless of the device parameters. From these bandwidth
-guarantees, it is possible to compute tight per-I/O-request delay
-guarantees by a simple formula. If not configured for strict service
-guarantees, BFQ switches to time-based resource sharing (only) for
-applications that would otherwise cause a throughput loss.
-
-1-2 Server systems
-------------------
-
-Most benefits for server systems follow from the same service
-properties as above. In particular, regardless of whether additional,
-possibly heavy workloads are being served, BFQ guarantees:
-
-. audio and video-streaming with zero or very low jitter and drop
- rate;
-
-. fast retrieval of WEB pages and embedded objects;
-
-. real-time recording of data in live-dumping applications (e.g.,
- packet logging);
-
-. responsiveness in local and remote access to a server.
-
-
-2. How does BFQ work?
-=====================
-
-BFQ is a proportional-share I/O scheduler, whose general structure,
-plus a lot of code, are borrowed from CFQ.
-
-- Each process doing I/O on a device is associated with a weight and a
- (bfq_)queue.
-
-- BFQ grants exclusive access to the device, for a while, to one queue
- (process) at a time, and implements this service model by
- associating every queue with a budget, measured in number of
- sectors.
-
- - After a queue is granted access to the device, the budget of the
- queue is decremented, on each request dispatch, by the size of the
- request.
-
- - The in-service queue is expired, i.e., its service is suspended,
- only if one of the following events occurs: 1) the queue finishes
- its budget, 2) the queue empties, 3) a "budget timeout" fires.
-
- - The budget timeout prevents processes doing random I/O from
- holding the device for too long and dramatically reducing
- throughput.
-
- - Actually, as in CFQ, a queue associated with a process issuing
- sync requests may not be expired immediately when it empties. In
- contrast, BFQ may idle the device for a short time interval,
- giving the process the chance to go on being served if it issues
- a new request in time. Device idling typically boosts the
- throughput on rotational devices, if processes do synchronous
- and sequential I/O. In addition, under BFQ, device idling is
- also instrumental in guaranteeing the desired throughput
- fraction to processes issuing sync requests (see the description
- of the slice_idle tunable in this document, or [1, 2], for more
- details).
-
- - With respect to idling for service guarantees, if several
- processes are competing for the device at the same time, but
- all processes (and groups, after the following commit) have
- the same weight, then BFQ guarantees the expected throughput
- distribution without ever idling the device. Throughput is
- thus as high as possible in this common scenario.
-
- - If low-latency mode is enabled (default configuration), BFQ
- executes some special heuristics to detect interactive and soft
- real-time applications (e.g., video or audio players/streamers),
- and to reduce their latency. The most important action taken to
- achieve this goal is to give to the queues associated with these
- applications more than their fair share of the device
- throughput. For brevity, we call just "weight-raising" the whole
- sets of actions taken by BFQ to privilege these queues. In
- particular, BFQ provides a milder form of weight-raising for
- interactive applications, and a stronger form for soft real-time
- applications.
-
- - BFQ automatically deactivates idling for queues born in a burst of
- queue creations. In fact, these queues are usually associated with
- the processes of applications and services that benefit mostly
- from a high throughput. Examples are systemd during boot, or git
- grep.
-
- - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
- performing random I/O that becomes mostly sequential if
- merged. Differently from CFQ, BFQ achieves this goal with a more
- reactive mechanism, called Early Queue Merge (EQM). EQM is so
- responsive in detecting interleaved I/O (cooperating processes),
- that it enables BFQ to achieve a high throughput, by queue
- merging, even for queues for which CFQ needs a different
- mechanism, preemption, to get a high throughput. As such EQM is a
- unified mechanism to achieve a high throughput with interleaved
- I/O.
-
- - Queues are scheduled according to a variant of WF2Q+, named
- B-WF2Q+, and implemented using an augmented rb-tree to preserve an
- O(log N) overall complexity. See [2] for more details. B-WF2Q+ is
- also ready for hierarchical scheduling. However, for a cleaner
- logical breakdown, the code that enables and completes
- hierarchical support is provided in the next commit, which focuses
- exactly on this feature.
-
- - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
- perfectly fair, and smooth service. In particular, B-WF2Q+
- guarantees that each queue receives a fraction of the device
- throughput proportional to its weight, even if the throughput
- fluctuates, and regardless of: the device parameters, the current
- workload and the budgets assigned to the queue.
-
- - The last, budget-independence, property (although probably
- counterintuitive in the first place) is definitely beneficial, for
- the following reasons:
-
- - First, with any proportional-share scheduler, the maximum
- deviation with respect to an ideal service is proportional to
- the maximum budget (slice) assigned to queues. As a consequence,
- BFQ can keep this deviation tight not only because of the
- accurate service of B-WF2Q+, but also because BFQ *does not*
- need to assign a larger budget to a queue to let the queue
- receive a higher fraction of the device throughput.
-
- - Second, BFQ is free to choose, for every process (queue), the
- budget that best fits the needs of the process, or best
- leverages the I/O pattern of the process. In particular, BFQ
- updates queue budgets with a simple feedback-loop algorithm that
- allows a high throughput to be achieved, while still providing
- tight latency guarantees to time-sensitive applications. When
- the in-service queue expires, this algorithm computes the next
- budget of the queue so as to:
-
- - Let large budgets be eventually assigned to the queues
- associated with I/O-bound applications performing sequential
- I/O: in fact, the longer these applications are served once
- got access to the device, the higher the throughput is.
-
- - Let small budgets be eventually assigned to the queues
- associated with time-sensitive applications (which typically
- perform sporadic and short I/O), because, the smaller the
- budget assigned to a queue waiting for service is, the sooner
- B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
-
-- If several processes are competing for the device at the same time,
- but all processes and groups have the same weight, then BFQ
- guarantees the expected throughput distribution without ever idling
- the device. It uses preemption instead. Throughput is then much
- higher in this common scenario.
-
-- ioprio classes are served in strict priority order, i.e.,
- lower-priority queues are not served as long as there are
- higher-priority queues. Among queues in the same class, the
- bandwidth is distributed in proportion to the weight of each
- queue. A very thin extra bandwidth is however guaranteed to
- the Idle class, to prevent it from starving.
-
-
-3. What are BFQ's tunable?
-==========================
-
-The tunables back_seek-max, back_seek_penalty, fifo_expire_async and
-fifo_expire_sync below are the same as in CFQ. Their description is
-just copied from that for CFQ. Some considerations in the description
-of slice_idle are copied from CFQ too.
-
-per-process ioprio and weight
------------------------------
-
-Unless the cgroups interface is used (see "4. BFQ group scheduling"),
-weights can be assigned to processes only indirectly, through I/O
-priorities, and according to the relation:
-weight = (IOPRIO_BE_NR - ioprio) * 10.
-
-Beware that, if low-latency is set, then BFQ automatically raises the
-weight of the queues associated with interactive and soft real-time
-applications. Unset this tunable if you need/want to control weights.
-
-slice_idle
-----------
-
-This parameter specifies how long BFQ should idle for next I/O
-request, when certain sync BFQ queues become empty. By default
-slice_idle is a non-zero value. Idling has a double purpose: boosting
-throughput and making sure that the desired throughput distribution is
-respected (see the description of how BFQ works, and, if needed, the
-papers referred there).
-
-As for throughput, idling can be very helpful on highly seeky media
-like single spindle SATA/SAS disks where we can cut down on overall
-number of seeks and see improved throughput.
-
-Setting slice_idle to 0 will remove all the idling on queues and one
-should see an overall improved throughput on faster storage devices
-like multiple SATA/SAS disks in hardware RAID configuration.
-
-So depending on storage and workload, it might be useful to set
-slice_idle=0. In general for SATA/SAS disks and software RAID of
-SATA/SAS disks keeping slice_idle enabled should be useful. For any
-configurations where there are multiple spindles behind single LUN
-(Host based hardware RAID controller or for storage arrays), setting
-slice_idle=0 might end up in better throughput and acceptable
-latencies.
-
-Idling is however necessary to have service guarantees enforced in
-case of differentiated weights or differentiated I/O-request lengths.
-To see why, suppose that a given BFQ queue A must get several I/O
-requests served for each request served for another queue B. Idling
-ensures that, if A makes a new I/O request slightly after becoming
-empty, then no request of B is dispatched in the middle, and thus A
-does not lose the possibility to get more than one request dispatched
-before the next request of B is dispatched. Note that idling
-guarantees the desired differentiated treatment of queues only in
-terms of I/O-request dispatches. To guarantee that the actual service
-order then corresponds to the dispatch order, the strict_guarantees
-tunable must be set too.
-
-There is an important flipside for idling: apart from the above cases
-where it is beneficial also for throughput, idling can severely impact
-throughput. One important case is random workload. Because of this
-issue, BFQ tends to avoid idling as much as possible, when it is not
-beneficial also for throughput. As a consequence of this behavior, and
-of further issues described for the strict_guarantees tunable,
-short-term service guarantees may be occasionally violated. And, in
-some cases, these guarantees may be more important than guaranteeing
-maximum throughput. For example, in video playing/streaming, a very
-low drop rate may be more important than maximum throughput. In these
-cases, consider setting the strict_guarantees parameter.
-
-strict_guarantees
------------------
-
-If this parameter is set (default: unset), then BFQ
-
-- always performs idling when the in-service queue becomes empty;
-
-- forces the device to serve one I/O request at a time, by dispatching a
- new request only if there is no outstanding request.
-
-In the presence of differentiated weights or I/O-request sizes, both
-the above conditions are needed to guarantee that every BFQ queue
-receives its allotted share of the bandwidth. The first condition is
-needed for the reasons explained in the description of the slice_idle
-tunable. The second condition is needed because all modern storage
-devices reorder internally-queued requests, which may trivially break
-the service guarantees enforced by the I/O scheduler.
-
-Setting strict_guarantees may evidently affect throughput.
-
-back_seek_max
--------------
-
-This specifies, given in Kbytes, the maximum "distance" for backward seeking.
-The distance is the amount of space from the current head location to the
-sectors that are backward in terms of distance.
-
-This parameter allows the scheduler to anticipate requests in the "backward"
-direction and consider them as being the "next" if they are within this
-distance from the current head location.
-
-back_seek_penalty
------------------
-
-This parameter is used to compute the cost of backward seeking. If the
-backward distance of request is just 1/back_seek_penalty from a "front"
-request, then the seeking cost of two requests is considered equivalent.
-
-So scheduler will not bias toward one or the other request (otherwise scheduler
-will bias toward front request). Default value of back_seek_penalty is 2.
-
-fifo_expire_async
------------------
-
-This parameter is used to set the timeout of asynchronous requests. Default
-value of this is 248ms.
-
-fifo_expire_sync
-----------------
-
-This parameter is used to set the timeout of synchronous requests. Default
-value of this is 124ms. In case to favor synchronous requests over asynchronous
-one, this value should be decreased relative to fifo_expire_async.
-
-low_latency
------------
-
-This parameter is used to enable/disable BFQ's low latency mode. By
-default, low latency mode is enabled. If enabled, interactive and soft
-real-time applications are privileged and experience a lower latency,
-as explained in more detail in the description of how BFQ works.
-
-DISABLE this mode if you need full control on bandwidth
-distribution. In fact, if it is enabled, then BFQ automatically
-increases the bandwidth share of privileged applications, as the main
-means to guarantee a lower latency to them.
-
-In addition, as already highlighted at the beginning of this document,
-DISABLE this mode if your only goal is to achieve a high throughput.
-In fact, privileging the I/O of some application over the rest may
-entail a lower throughput. To achieve the highest-possible throughput
-on a non-rotational device, setting slice_idle to 0 may be needed too
-(at the cost of giving up any strong guarantee on fairness and low
-latency).
-
-timeout_sync
-------------
-
-Maximum amount of device time that can be given to a task (queue) once
-it has been selected for service. On devices with costly seeks,
-increasing this time usually increases maximum throughput. On the
-opposite end, increasing this time coarsens the granularity of the
-short-term bandwidth and latency guarantees, especially if the
-following parameter is set to zero.
-
-max_budget
-----------
-
-Maximum amount of service, measured in sectors, that can be provided
-to a BFQ queue once it is set in service (of course within the limits
-of the above timeout). According to what said in the description of
-the algorithm, larger values increase the throughput in proportion to
-the percentage of sequential I/O requests issued. The price of larger
-values is that they coarsen the granularity of short-term bandwidth
-and latency guarantees.
-
-The default value is 0, which enables auto-tuning: BFQ sets max_budget
-to the maximum number of sectors that can be served during
-timeout_sync, according to the estimated peak rate.
-
-weights
--------
-
-Read-only parameter, used to show the weights of the currently active
-BFQ queues.
-
-
-wr_ tunables
-------------
-
-BFQ exports a few parameters to control/tune the behavior of
-low-latency heuristics.
-
-wr_coeff
-
-Factor by which the weight of a weight-raised queue is multiplied. If
-the queue is deemed soft real-time, then the weight is further
-multiplied by an additional, constant factor.
-
-wr_max_time
-
-Maximum duration of a weight-raising period for an interactive task
-(ms). If set to zero (default value), then this value is computed
-automatically, as a function of the peak rate of the device. In any
-case, when the value of this parameter is read, it always reports the
-current duration, regardless of whether it has been set manually or
-computed automatically.
-
-wr_max_softrt_rate
-
-Maximum service rate below which a queue is deemed to be associated
-with a soft real-time application, and is then weight-raised
-accordingly (sectors/sec).
-
-wr_min_idle_time
-
-Minimum idle period after which interactive weight-raising may be
-reactivated for a queue (in ms).
-
-wr_rt_max_time
-
-Maximum weight-raising duration for soft real-time queues (in ms). The
-start time from which this duration is considered is automatically
-moved forward if the queue is detected to be still soft real-time
-before the current soft real-time weight-raising period finishes.
-
-wr_min_inter_arr_async
-
-Minimum period between I/O request arrivals after which weight-raising
-may be reactivated for an already busy async queue (in ms).
-
-
-4. Group scheduling with BFQ
-============================
-
-BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
-blkio and io. In particular, BFQ supports weight-based proportional
-share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
-
-4-1 Service guarantees provided
--------------------------------
-
-With BFQ, proportional share means true proportional share of the
-device bandwidth, according to group weights. For example, a group
-with weight 200 gets twice the bandwidth, and not just twice the time,
-of a group with weight 100.
-
-BFQ supports hierarchies (group trees) of any depth. Bandwidth is
-distributed among groups and processes in the expected way: for each
-group, the children of the group share the whole bandwidth of the
-group in proportion to their weights. In particular, this implies
-that, for each leaf group, every process of the group receives the
-same share of the whole group bandwidth, unless the ioprio of the
-process is modified.
-
-The resource-sharing guarantee for a group may partially or totally
-switch from bandwidth to time, if providing bandwidth guarantees to
-the group lowers the throughput too much. This switch occurs on a
-per-process basis: if a process of a leaf group causes throughput loss
-if served in such a way to receive its share of the bandwidth, then
-BFQ switches back to just time-based proportional share for that
-process.
-
-4-2 Interface
--------------
-
-To get proportional sharing of bandwidth with BFQ for a given device,
-BFQ must of course be the active scheduler for that device.
-
-Within each group directory, the names of the files associated with
-BFQ-specific cgroup parameters and stats begin with the "bfq."
-prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
-BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
-parameter to set the weight of a group with BFQ is blkio.bfq.weight
-or io.bfq.weight.
-
-Parameters to set
------------------
-
-For each group, there is only the following parameter to set.
-
-weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
-group inside its parent. Available values: 1..10000 (default 100). The
-linear mapping between ioprio and weights, described at the beginning
-of the tunable section, is still valid, but all weights higher than
-IOPRIO_BE_NR*10 are mapped to ioprio 0.
-
-Recall that, if low-latency is set, then BFQ automatically raises the
-weight of the queues associated with interactive and soft real-time
-applications. Unset this tunable if you need/want to control weights.
-
-
-[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
- Scheduler", Proceedings of the First Workshop on Mobile System
- Technologies (MST-2015), May 2015.
- http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
-
-[2] P. Valente and M. Andreolini, "Improving Application
- Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
- the 5th Annual International Systems and Storage Conference
- (SYSTOR '12), June 2012.
- Slightly extended version:
- http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
- results.pdf
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
deleted file mode 100644
index af618171e0eb..000000000000
--- a/Documentation/cgroup-v1/rdma.txt
+++ /dev/null
@@ -1,109 +0,0 @@
- RDMA Controller
- ----------------
-
-Contents
---------
-
-1. Overview
- 1-1. What is RDMA controller?
- 1-2. Why RDMA controller needed?
- 1-3. How is RDMA controller implemented?
-2. Usage Examples
-
-1. Overview
-
-1-1. What is RDMA controller?
------------------------------
-
-RDMA controller allows user to limit RDMA/IB specific resources that a given
-set of processes can use. These processes are grouped using RDMA controller.
-
-RDMA controller defines two resources which can be limited for processes of a
-cgroup.
-
-1-2. Why RDMA controller needed?
---------------------------------
-
-Currently user space applications can easily take away all the rdma verb
-specific resources such as AH, CQ, QP, MR etc. Due to which other applications
-in other cgroup or kernel space ULPs may not even get chance to allocate any
-rdma resources. This can leads to service unavailability.
-
-Therefore RDMA controller is needed through which resource consumption
-of processes can be limited. Through this controller different rdma
-resources can be accounted.
-
-1-3. How is RDMA controller implemented?
-----------------------------------------
-
-RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
-resource accounting per cgroup, per device using resource pool structure.
-Each such resource pool is limited up to 64 resources in given resource pool
-by rdma cgroup, which can be extended later if required.
-
-This resource pool object is linked to the cgroup css. Typically there
-are 0 to 4 resource pool instances per cgroup, per device in most use cases.
-But nothing limits to have it more. At present hundreds of RDMA devices per
-single cgroup may not be handled optimally, however there is no
-known use case or requirement for such configuration either.
-
-Since RDMA resources can be allocated from any process and can be freed by any
-of the child processes which shares the address space, rdma resources are
-always owned by the creator cgroup css. This allows process migration from one
-to other cgroup without major complexity of transferring resource ownership;
-because such ownership is not really present due to shared nature of
-rdma resources. Linking resources around css also ensures that cgroups can be
-deleted after processes migrated. This allow progress migration as well with
-active resources, even though that is not a primary use case.
-
-Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
-the caller. Same rdma cgroup should be passed while uncharging the resource.
-This also allows process migrated with active RDMA resource to charge
-to new owner cgroup for new resource. It also allows to uncharge resource of
-a process from previously charged cgroup which is migrated to new cgroup,
-even though that is not a primary use case.
-
-Resource pool object is created in following situations.
-(a) User sets the limit and no previous resource pool exist for the device
-of interest for the cgroup.
-(b) No resource limits were configured, but IB/RDMA stack tries to
-charge the resource. So that it correctly uncharge them when applications are
-running without limits and later on when limits are enforced during uncharging,
-otherwise usage count will drop to negative.
-
-Resource pool is destroyed if all the resource limits are set to max and
-it is the last resource getting deallocated.
-
-User should set all the limit to max value if it intents to remove/unconfigure
-the resource pool for a particular device.
-
-IB stack honors limits enforced by the rdma controller. When application
-query about maximum resource limits of IB device, it returns minimum of
-what is configured by user for a given cgroup and what is supported by
-IB device.
-
-Following resources can be accounted by rdma controller.
- hca_handle Maximum number of HCA Handles
- hca_object Maximum number of HCA Objects
-
-2. Usage Examples
------------------
-
-(a) Configure resource limit:
-echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
-echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
-
-(b) Query resource limit:
-cat /sys/fs/cgroup/rdma/2/rdma.max
-#Output:
-mlx4_0 hca_handle=2 hca_object=2000
-ocrdma1 hca_handle=3 hca_object=max
-
-(c) Query current usage:
-cat /sys/fs/cgroup/rdma/2/rdma.current
-#Output:
-mlx4_0 hca_handle=1 hca_object=20
-ocrdma1 hca_handle=1 hca_object=23
-
-(d) Delete resource limit:
-echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index e4b6bf4de837..73950fdea31a 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -16,9 +16,7 @@ CONTENTS
1-2. What is cgroup?
2. Basic Operations
2-1. Mounting
- 2-2. Organizing Processes and Threads
- 2-2-1. Processes
- 2-2-2. Threads
+ 2-2. Organizing Processes
2-3. [Un]populated Notification
2-4. Controlling Controllers
2-4-1. Enabling and Disabling
@@ -49,12 +47,6 @@ CONTENTS
5-3. IO
5-3-1. IO Interface Files
5-3-2. Writeback
- 5-4. PID
- 5-4-1. PID Interface Files
- 5-5. Misc
- 5-5-1. perf_event
- 5-6. RDMA
- 5-6-1. RDMA Interface Files
6. Namespace
6-1. Basics
6-2. The Root and Views
@@ -151,20 +143,8 @@ during boot, before manual intervention is possible. To make testing
and experimenting easier, the kernel parameter cgroup_no_v1= allows
disabling controllers in v1 and make them always available in v2.
-cgroup v2 currently supports the following mount options.
- nsdelegate
-
- Consider cgroup namespaces as delegation boundaries. This
- option is system wide and can only be set on mount or modified
- through remount from the init namespace. The mount option is
- ignored on non-init namespace mounts. Please refer to the
- Delegation section for details.
-
-
-2-2. Organizing Processes and Threads
-
-2-2-1. Processes
+2-2. Organizing Processes
Initially, only the root cgroup exists to which all processes belong.
A child cgroup can be created by creating a sub-directory.
@@ -215,104 +195,6 @@ is removed subsequently, " (deleted)" is appended to the path.
0::/test-cgroup/test-cgroup-nested (deleted)
-2-2-2. Threads
-
-cgroup v2 supports thread granularity for a subset of controllers to
-support use cases requiring hierarchical resource distribution across
-the threads of a group of processes. By default, all threads of a
-process belong to the same cgroup, which also serves as the resource
-domain to host resource consumptions which are not specific to a
-process or thread. The thread mode allows threads to be spread across
-a subtree while still maintaining the common resource domain for them.
-
-Controllers which support thread mode are called threaded controllers.
-The ones which don't are called domain controllers.
-
-Marking a cgroup threaded makes it join the resource domain of its
-parent as a threaded cgroup. The parent may be another threaded
-cgroup whose resource domain is further up in the hierarchy. The root
-of a threaded subtree, that is, the nearest ancestor which is not
-threaded, is called threaded domain or thread root interchangeably and
-serves as the resource domain for the entire subtree.
-
-Inside a threaded subtree, threads of a process can be put in
-different cgroups and are not subject to the no internal process
-constraint - threaded controllers can be enabled on non-leaf cgroups
-whether they have threads in them or not.
-
-As the threaded domain cgroup hosts all the domain resource
-consumptions of the subtree, it is considered to have internal
-resource consumptions whether there are processes in it or not and
-can't have populated child cgroups which aren't threaded. Because the
-root cgroup is not subject to no internal process constraint, it can
-serve both as a threaded domain and a parent to domain cgroups.
-
-The current operation mode or type of the cgroup is shown in the
-"cgroup.type" file which indicates whether the cgroup is a normal
-domain, a domain which is serving as the domain of a threaded subtree,
-or a threaded cgroup.
-
-On creation, a cgroup is always a domain cgroup and can be made
-threaded by writing "threaded" to the "cgroup.type" file. The
-operation is single direction::
-
- # echo threaded > cgroup.type
-
-Once threaded, the cgroup can't be made a domain again. To enable the
-thread mode, the following conditions must be met.
-
-- As the cgroup will join the parent's resource domain. The parent
- must either be a valid (threaded) domain or a threaded cgroup.
-
-- When the parent is an unthreaded domain, it must not have any domain
- controllers enabled or populated domain children. The root is
- exempt from this requirement.
-
-Topology-wise, a cgroup can be in an invalid state. Please consider
-the following toplogy::
-
- A (threaded domain) - B (threaded) - C (domain, just created)
-
-C is created as a domain but isn't connected to a parent which can
-host child domains. C can't be used until it is turned into a
-threaded cgroup. "cgroup.type" file will report "domain (invalid)" in
-these cases. Operations which fail due to invalid topology use
-EOPNOTSUPP as the errno.
-
-A domain cgroup is turned into a threaded domain when one of its child
-cgroup becomes threaded or threaded controllers are enabled in the
-"cgroup.subtree_control" file while there are processes in the cgroup.
-A threaded domain reverts to a normal domain when the conditions
-clear.
-
-When read, "cgroup.threads" contains the list of the thread IDs of all
-threads in the cgroup. Except that the operations are per-thread
-instead of per-process, "cgroup.threads" has the same format and
-behaves the same way as "cgroup.procs". While "cgroup.threads" can be
-written to in any cgroup, as it can only move threads inside the same
-threaded domain, its operations are confined inside each threaded
-subtree.
-
-The threaded domain cgroup serves as the resource domain for the whole
-subtree, and, while the threads can be scattered across the subtree,
-all the processes are considered to be in the threaded domain cgroup.
-"cgroup.procs" in a threaded domain cgroup contains the PIDs of all
-processes in the subtree and is not readable in the subtree proper.
-However, "cgroup.procs" can be written to from anywhere in the subtree
-to migrate all threads of the matching process to the cgroup.
-
-Only threaded controllers can be enabled in a threaded subtree. When
-a threaded controller is enabled inside a threaded subtree, it only
-accounts for and controls resource consumptions associated with the
-threads in the cgroup and its descendants. All consumptions which
-aren't tied to a specific thread belong to the threaded domain cgroup.
-
-Because a threaded subtree is exempt from no internal process
-constraint, a threaded controller must be able to handle competition
-between threads in a non-leaf cgroup and its child cgroups. Each
-threaded controller defines how such competitions are handled.
-
-
2-3. [Un]populated Notification
Each non-root cgroup has a "cgroup.events" file which contains
@@ -391,15 +273,15 @@ disabled if one or more children have it enabled.
2-4-3. No Internal Process Constraint
-Non-root cgroups can distribute domain resources to their children
-only when they don't have any processes of their own. In other words,
-only domain cgroups which don't contain any processes can have domain
-controllers enabled in their "cgroup.subtree_control" files.
+Non-root cgroups can only distribute resources to their children when
+they don't have any processes of their own. In other words, only
+cgroups which don't contain any processes can have controllers enabled
+in their "cgroup.subtree_control" files.
-This guarantees that, when a domain controller is looking at the part
-of the hierarchy which has it enabled, processes are always only on
-the leaves. This rules out situations where child cgroups compete
-against internal processes of the parent.
+This guarantees that, when a controller is looking at the part of the
+hierarchy which has it enabled, processes are always only on the
+leaves. This rules out situations where child cgroups compete against
+internal processes of the parent.
The root cgroup is exempt from this restriction. Root contains
processes and anonymous resource consumption which can't be associated
@@ -420,27 +302,18 @@ file.
2-5-1. Model of Delegation
-A cgroup can be delegated in two ways. First, to a less privileged
-user by granting write access of the directory and its "cgroup.procs",
-"cgroup.threads" and "cgroup.subtree_control" files to the user.
-Second, if the "nsdelegate" mount option is set, automatically to a
-cgroup namespace on namespace creation.
-
-Because the resource control interface files in a given directory
-control the distribution of the parent's resources, the delegatee
-shouldn't be allowed to write to them. For the first method, this is
-achieved by not granting access to these files. For the second, the
-kernel rejects writes to all files other than "cgroup.procs" and
-"cgroup.subtree_control" on a namespace root from inside the
-namespace.
-
-The end results are equivalent for both delegation types. Once
-delegated, the user can build sub-hierarchy under the directory,
-organize processes inside it as it sees fit and further distribute the
-resources it received from the parent. The limits and other settings
-of all resource controllers are hierarchical and regardless of what
-happens in the delegated sub-hierarchy, nothing can escape the
-resource restrictions imposed by the parent.
+A cgroup can be delegated to a less privileged user by granting that
+user write access to the directory and its "cgroup.procs" file. Note
+that resource control interface files in a given directory control the
+distribution of the parent's resources and thus must not be delegated
+along with the directory.
+
+Once delegated, the user can build sub-hierarchy under the directory,
+organize processes as it sees fit and further distribute the resources
+it received from the parent. The limits and other settings of all
+resource controllers are hierarchical and regardless of what happens
+in the delegated sub-hierarchy, nothing can escape the resource
+restrictions imposed by the parent.
Currently, cgroup doesn't impose any restrictions on the number of
cgroups in or nesting depth of a delegated sub-hierarchy; however,
@@ -450,19 +323,19 @@ this may be limited explicitly in the future.
2-5-2. Delegation Containment
A delegated sub-hierarchy is contained in the sense that processes
-can't be moved into or out of the sub-hierarchy by the delegatee.
+can't be moved into or out of the sub-hierarchy by the delegatee. For
+a process with a non-root euid to migrate a target process into a
+cgroup by writing its PID to the "cgroup.procs" file, the following
+conditions must be met.
-For delegations to a less privileged user, this is achieved by
-requiring the following conditions for a process with a non-root euid
-to migrate a target process into a cgroup by writing its PID to the
-"cgroup.procs" file.
+- The writer's euid must match either uid or suid of the target process.
- The writer must have write access to the "cgroup.procs" file.
- The writer must have write access to the "cgroup.procs" file of the
common ancestor of the source and destination cgroups.
-The above two constraints ensure that while a delegatee may migrate
+The above three constraints ensure that while a delegatee may migrate
processes around freely in the delegated sub-hierarchy it can't pull
in from or push out to outside the sub-hierarchy.
@@ -477,15 +350,10 @@ all processes under C0 and C1 belong to U0.
Let's also say U0 wants to write the PID of a process which is
currently in C10 into "C00/cgroup.procs". U0 has write access to the
-file; however, the common ancestor of the source cgroup C10 and the
-destination cgroup C00 is above the points of delegation and U0 would
-not have write access to its "cgroup.procs" files and thus the write
-will be denied with -EACCES.
-
-For delegations to namespaces, containment is achieved by requiring
-that both the source and destination cgroups are reachable from the
-namespace of the process which is attempting the migration. If either
-is not reachable, the migration is rejected with -ENOENT.
+file and its uid matches that of the process; however, the common
+ancestor of the source cgroup C10 and the destination cgroup C00 is
+above the points of delegation and U0 would not have write access to
+its "cgroup.procs" files and thus the write will be denied with -EACCES.
2-6. Guidelines
@@ -718,29 +586,6 @@ may be specified in any order and not all pairs have to be specified.
All cgroup core files are prefixed with "cgroup."
- cgroup.type
-
- A read-write single value file which exists on non-root
- cgroups.
-
- When read, it indicates the current type of the cgroup, which
- can be one of the following values.
-
- - "domain" : A normal valid domain cgroup.
-
- - "domain threaded" : A threaded domain cgroup which is
- serving as the root of a threaded subtree.
-
- - "domain invalid" : A cgroup which is in an invalid state.
- It can't be populated or have controllers enabled. It may
- be allowed to become a threaded cgroup.
-
- - "threaded" : A threaded cgroup which is a member of a
- threaded subtree.
-
- A cgroup can be turned into a threaded cgroup by writing
- "threaded" to this file.
-
cgroup.procs
A read-write new-line separated values file which exists on
@@ -756,36 +601,10 @@ All cgroup core files are prefixed with "cgroup."
the PID to the cgroup. The writer should match all of the
following conditions.
- - It must have write access to the "cgroup.procs" file.
+ - Its euid must either be root or match the uid or suid of
+ the target process.
- - It must have write access to the "cgroup.procs" file of the
- common ancestor of the source and destination cgroups.
-
- When delegating a sub-hierarchy, write access to this file
- should be granted along with the containing directory.
-
- In a threaded cgroup, reading this file fails with EOPNOTSUPP
- as all the processes belong to the thread root. Writing is
- supported and moves every thread of the process to the cgroup.
-
- cgroup.threads
- A read-write new-line separated values file which exists on
- all cgroups.
-
- When read, it lists the TIDs of all threads which belong to
- the cgroup one-per-line. The TIDs are not ordered and the
- same TID may show up more than once if the thread got moved to
- another cgroup and then back or the TID got recycled while
- reading.
-
- A TID can be written to migrate the thread associated with the
- TID to the cgroup. The writer should match all of the
- following conditions.
-
- - It must have write access to the "cgroup.threads" file.
-
- - The cgroup that the thread is currently in must be in the
- same resource domain as the destination cgroup.
+ - It must have write access to the "cgroup.procs" file.
- It must have write access to the "cgroup.procs" file of the
common ancestor of the source and destination cgroups.
@@ -829,38 +648,6 @@ All cgroup core files are prefixed with "cgroup."
1 if the cgroup or its descendants contains any live
processes; otherwise, 0.
- cgroup.max.descendants
- A read-write single value files. The default is "max".
-
- Maximum allowed number of descent cgroups.
- If the actual number of descendants is equal or larger,
- an attempt to create a new cgroup in the hierarchy will fail.
-
- cgroup.max.depth
- A read-write single value files. The default is "max".
-
- Maximum allowed descent depth below the current cgroup.
- If the actual descent depth is equal or larger,
- an attempt to create a new child cgroup will fail.
-
- cgroup.stat
- A read-only flat-keyed file with the following entries:
-
- nr_descendants
- Total number of visible descendant cgroups.
-
- nr_dying_descendants
- Total number of dying descendant cgroups. A cgroup becomes
- dying after being deleted by a user. The cgroup will remain
- in dying state for some time undefined time (which can depend
- on system load) before being completely destroyed.
-
- A process can't enter a dying cgroup under any circumstances,
- a dying cgroup can't revive.
-
- A dying cgroup can consume system resources not exceeding
- limits, which were active at the moment of cgroup deletion.
-
5. Controllers
@@ -1350,92 +1137,6 @@ writeback as follows.
vm.dirty[_background]_ratio.
-5-4. PID
-
-The process number controller is used to allow a cgroup to stop any
-new tasks from being fork()'d or clone()'d after a specified limit is
-reached.
-
-The number of tasks in a cgroup can be exhausted in ways which other
-controllers cannot prevent, thus warranting its own controller. For
-example, a fork bomb is likely to exhaust the number of tasks before
-hitting memory restrictions.
-
-Note that PIDs used in this controller refer to TIDs, process IDs as
-used by the kernel.
-
-
-5-4-1. PID Interface Files
-
- pids.max
-
- A read-write single value file which exists on non-root
- cgroups. The default is "max".
-
- Hard limit of number of processes.
-
- pids.current
-
- A read-only single value file which exists on all cgroups.
-
- The number of processes currently in the cgroup and its
- descendants.
-
-Organisational operations are not blocked by cgroup policies, so it is
-possible to have pids.current > pids.max. This can be done by either
-setting the limit to be smaller than pids.current, or attaching enough
-processes to the cgroup such that pids.current is larger than
-pids.max. However, it is not possible to violate a cgroup PID policy
-through fork() or clone(). These will return -EAGAIN if the creation
-of a new process would cause a cgroup policy to be violated.
-
-
-5-5. Misc
-
-5-5-1. perf_event
-
-perf_event controller, if not mounted on a legacy hierarchy, is
-automatically enabled on the v2 hierarchy so that perf events can
-always be filtered by cgroup v2 path. The controller can still be
-moved to a legacy hierarchy after v2 hierarchy is populated.
-
-
-5-6. RDMA
-
-The "rdma" controller regulates the distribution and accounting of
-of RDMA resources.
-
-5-6-1. RDMA Interface Files
-
- rdma.max
- A readwrite nested-keyed file that exists for all the cgroups
- except root that describes current configured resource limit
- for a RDMA/IB device.
-
- Lines are keyed by device name and are not ordered.
- Each line contains space separated resource name and its configured
- limit that can be distributed.
-
- The following nested keys are defined.
-
- hca_handle Maximum number of HCA Handles
- hca_object Maximum number of HCA Objects
-
- An example for mlx4 and ocrdma device follows.
-
- mlx4_0 hca_handle=2 hca_object=2000
- ocrdma1 hca_handle=3 hca_object=max
-
- rdma.current
- A read-only file that describes current resource usage.
- It exists for all the cgroup except root.
-
- An example for mlx4 and ocrdma device follows.
-
- mlx4_0 hca_handle=1 hca_object=20
- ocrdma1 hca_handle=1 hca_object=23
-
-
6. Namespace
6-1. Basics
@@ -1623,7 +1324,7 @@ D. Deprecated v1 Core Features
- Multiple hierarchies including named ones are not supported.
-- All v1 mount options are not supported.
+- Mount options and remounting are not supported.
- The "tasks" file is removed and "cgroup.procs" is not sorted.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9afba613a5c3..a66de7db0118 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -692,14 +692,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Specifying "pressure" disables per-cgroup pressure
stall information accounting feature
- cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1
- Format: { { controller | "all" | "named" }
- [,{ controller | "all" | "named" }...] }
+ cgroup_no_v1= [KNL] Disable one, multiple, or all cgroup controllers in v1
+ Format: { controller[,controller...] | "all" }
Like cgroup_disable, but only applies to cgroup v1;
the blacklisted controllers remain available in cgroup2.
- "all" blacklists all controllers and "named" disables
- named mounts. Specifying both "all" and "named" disables
- all v1 hierarchies.
cgroup.memory= [KNL] Pass options to the cgroup memory controller.
Format:
diff --git a/Makefile b/Makefile
index 64d8b31c7b5d..a2946c935d83 100644
--- a/Makefile
+++ b/Makefile
@@ -87,10 +87,16 @@ endif
# If the user is running make -s (silent mode), suppress echoing of
# commands
+# make-4.0 (and later) keeps single-letter options in the first word of MAKEFLAGS.
-ifneq ($(findstring s,$(filter-out --%,$(MAKEFLAGS))),)
- quiet=silent_
- tools_silent=s
+ifeq ($(filter 3.%,$(MAKE_VERSION)),)
+silence:=$(findstring s,$(firstword -$(MAKEFLAGS)))
+else
+silence:=$(findstring s,$(filter-out --%,$(MAKEFLAGS)))
+endif
+
+ifeq ($(silence),s)
+quiet=silent_
endif
export quiet Q KBUILD_VERBOSE
diff --git a/README.md b/README.md
deleted file mode 100644
index 4a7cd21638cd..000000000000
--- a/README.md
+++ /dev/null
@@ -1 +0,0 @@
-已停更,随缘更新
\ No newline at end of file
diff --git a/arch/arm64/configs/lxc.config b/arch/arm64/configs/lxc.config
new file mode 100644
index 000000000000..7c2967ce7218
--- /dev/null
+++ b/arch/arm64/configs/lxc.config
@@ -0,0 +1,44 @@
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+
+CONFIG_NAMESPACES=y
+CONFIG_UTS_NS=y
+CONFIG_IPC_NS=y
+CONFIG_PID_NS=y
+CONFIG_USER_NS=y
+CONFIG_NET_NS=y
+
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_MEMCG=y
+CONFIG_CPUSETS=y
+
+CONFIG_VETH=y
+CONFIG_MACVLAN=y
+CONFIG_VLAN_8021Q=y
+CONFIG_BRIDGE=y
+CONFIG_NETFILTER_ADVANCED=y
+CONFIG_NF_NAT_IPV4=y
+CONFIG_NF_NAT_IPV6=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP6_NF_TARGET_MASQUERADE=y
+CONFIG_NETFILTER_XT_TARGET_CHECKSUM=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_FUSE_FS=y
+
+CONFIG_CHECKPOINT_RESTORE=y
+CONFIG_FHANDLE=y
+CONFIG_EVENTFD=y
+CONFIG_EPOLL=y
+CONFIG_UNIX_DIAG=y
+CONFIG_INET_DIAG=y
+CONFIG_PACKET_DIAG=y
+CONFIG_NETLINK_DIAG=y
+
+CONFIG_BINFMT_MISC=y
+
+CONFIG_ANDROID_PARANOID_NETWORK=n
\ No newline at end of file
diff --git a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig
index 3768440716ca..1ff7a9286950 100644
--- a/arch/arm64/configs/vendor/xiaomi/mi845_defconfig
+++ b/arch/arm64/configs/vendor/xiaomi/mi845_defconfig
@@ -71,7 +71,7 @@ CONFIG_PCI_MSM=y
CONFIG_SCHED_MC=y
CONFIG_NR_CPUS=8
CONFIG_PREEMPT=y
-CONFIG_HZ_100=y
+CONFIG_HZ_300=y
CONFIG_ANON_MIN_KBYTES=196608
CONFIG_CLEAN_LOW_KBYTES=393216
CONFIG_CLEAN_MIN_KBYTES=196608
@@ -218,6 +218,7 @@ CONFIG_IP6_NF_IPTABLES_128=y
CONFIG_IP6_NF_MATCH_RPFILTER=y
CONFIG_IP6_NF_TARGET_HL=y
CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_NAT=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
CONFIG_IP6_NF_RAW=y
@@ -634,5 +635,4 @@ CONFIG_SND_SOC_WCD_MBHC_ADC=y
CONFIG_SND_SOC_WCD_SPI=y
CONFIG_SOUNDWIRE=y
CONFIG_WCD_SPI_AC=y
-CONFIG_REKERNEL=y
-CONFIG_REKERNEL_NETWORK=y
+CONFIG_KSU=y
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 75ee7ba34ebb..421bef9c4c48 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -39,28 +39,9 @@ config CFQ_GROUP_IOSCHED
---help---
Enable group IO scheduling in CFQ.
-config IOSCHED_BFQ
- tristate "BFQ I/O scheduler"
- default y
- ---help---
- The BFQ I/O scheduler distributes bandwidth among all
- processes according to their weights, regardless of the
- device parameters and with any workload. It also guarantees
- a low latency to interactive and soft real-time applications.
- Details in Documentation/block/bfq-iosched.txt
-
-config BFQ_GROUP_IOSCHED
- bool "BFQ hierarchical scheduling support"
- depends on IOSCHED_BFQ && BLK_CGROUP
- default n
- ---help---
-
- Enable hierarchical scheduling in BFQ, using the blkio
- (cgroups-v1) or io (cgroups-v2) controller.
-
choice
prompt "Default I/O scheduler"
- default DEFAULT_BFQ
+ default DEFAULT_CFQ
help
Select the I/O scheduler which will be used by default for all
block devices.
@@ -74,16 +55,6 @@ choice
config DEFAULT_NOOP
bool "No-op"
- config DEFAULT_BFQ
- bool "BFQ" if IOSCHED_BFQ=y
- help
- Selects BFQ as the default I/O scheduler which will be
- used by default for all block devices.
- The BFQ I/O scheduler aims at distributing the bandwidth
- as desired, independently of the disk parameters and with
- any workload. It also tries to guarantee low latency to
- interactive and soft real-time applications.
-
endchoice
config DEFAULT_IOSCHED
@@ -91,7 +62,6 @@ config DEFAULT_IOSCHED
default "deadline" if DEFAULT_DEADLINE
default "cfq" if DEFAULT_CFQ
default "noop" if DEFAULT_NOOP
- default "bfq" if DEFAULT_BFQ
endmenu
diff --git a/block/Makefile b/block/Makefile
index 736e91a2ca1c..36acdd7545be 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
-obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
deleted file mode 100644
index 52484f10bb6f..000000000000
--- a/block/bfq-cgroup.c
+++ /dev/null
@@ -1,1191 +0,0 @@
-/*
- * BFQ: CGROUPS support.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe
- *
- * Copyright (C) 2008 Fabio Checconi
- * Paolo Valente
- *
- * Copyright (C) 2015 Paolo Valente
- *
- * Copyright (C) 2016 Paolo Valente
- *
- * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
- * file.
- */
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-
-/* bfqg stats flags */
-enum bfqg_stats_flags {
- BFQG_stats_waiting = 0,
- BFQG_stats_idling,
- BFQG_stats_empty,
-};
-
-#define BFQG_FLAG_FNS(name) \
-static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
-{ \
- stats->flags |= (1 << BFQG_stats_##name); \
-} \
-static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
-{ \
- stats->flags &= ~(1 << BFQG_stats_##name); \
-} \
-static int bfqg_stats_##name(struct bfqg_stats *stats) \
-{ \
- return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
-} \
-
-BFQG_FLAG_FNS(waiting)
-BFQG_FLAG_FNS(idling)
-BFQG_FLAG_FNS(empty)
-#undef BFQG_FLAG_FNS
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
-{
- unsigned long long now;
-
- if (!bfqg_stats_waiting(stats))
- return;
-
- now = sched_clock();
- if (time_after64(now, stats->start_group_wait_time))
- blkg_stat_add(&stats->group_wait_time,
- now - stats->start_group_wait_time);
- bfqg_stats_clear_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
- struct bfq_group *curr_bfqg)
-{
- struct bfqg_stats *stats = &bfqg->stats;
-
- if (bfqg_stats_waiting(stats))
- return;
- if (bfqg == curr_bfqg)
- return;
- stats->start_group_wait_time = sched_clock();
- bfqg_stats_mark_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
-{
- unsigned long long now;
-
- if (!bfqg_stats_empty(stats))
- return;
-
- now = sched_clock();
- if (time_after64(now, stats->start_empty_time))
- blkg_stat_add(&stats->empty_time,
- now - stats->start_empty_time);
- bfqg_stats_clear_empty(stats);
-}
-
-static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
-{
- blkg_stat_add(&bfqg->stats.dequeue, 1);
-}
-
-static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
-{
- struct bfqg_stats *stats = &bfqg->stats;
-
- if (blkg_rwstat_total(&stats->queued))
- return;
-
- /*
- * group is already marked empty. This can happen if bfqq got new
- * request in parent group and moved to this group while being added
- * to service tree. Just ignore the event and move on.
- */
- if (bfqg_stats_empty(stats))
- return;
-
- stats->start_empty_time = sched_clock();
- bfqg_stats_mark_empty(stats);
-}
-
-static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
-{
- struct bfqg_stats *stats = &bfqg->stats;
-
- if (bfqg_stats_idling(stats)) {
- unsigned long long now = sched_clock();
-
- if (time_after64(now, stats->start_idle_time))
- blkg_stat_add(&stats->idle_time,
- now - stats->start_idle_time);
- bfqg_stats_clear_idling(stats);
- }
-}
-
-static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
-{
- struct bfqg_stats *stats = &bfqg->stats;
-
- stats->start_idle_time = sched_clock();
- bfqg_stats_mark_idling(stats);
-}
-
-static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
-{
- struct bfqg_stats *stats = &bfqg->stats;
-
- blkg_stat_add(&stats->avg_queue_size_sum,
- blkg_rwstat_total(&stats->queued));
- blkg_stat_add(&stats->avg_queue_size_samples, 1);
- bfqg_stats_update_group_wait_time(stats);
-}
-
-static struct blkcg_policy blkcg_policy_bfq;
-
-/*
- * blk-cgroup policy-related handlers
- * The following functions help in converting between blk-cgroup
- * internal structures and BFQ-specific structures.
- */
-
-static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
-{
- return pd ? container_of(pd, struct bfq_group, pd) : NULL;
-}
-
-static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
-{
- return pd_to_blkg(&bfqg->pd);
-}
-
-static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
-{
- struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
-
- return pd_to_bfqg(pd);
-}
-
-/*
- * bfq_group handlers
- * The following functions help in navigating the bfq_group hierarchy
- * by allowing to find the parent of a bfq_group or the bfq_group
- * associated to a bfq_queue.
- */
-
-static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
-{
- struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
-
- return pblkg ? blkg_to_bfqg(pblkg) : NULL;
-}
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
-{
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
- return group_entity ? container_of(group_entity, struct bfq_group,
- entity) :
- bfqq->bfqd->root_group;
-}
-
-/*
- * The following two functions handle get and put of a bfq_group by
- * wrapping the related blk-cgroup hooks.
- */
-
-static void bfqg_get(struct bfq_group *bfqg)
-{
- return blkg_get(bfqg_to_blkg(bfqg));
-}
-
-static void bfqg_put(struct bfq_group *bfqg)
-{
- return blkg_put(bfqg_to_blkg(bfqg));
-}
-
-static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
- struct bfq_queue *bfqq,
- int op, int op_flags)
-{
- blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, 1);
- bfqg_stats_end_empty_time(&bfqg->stats);
- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
- bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
-}
-
-static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op,
- int op_flags)
-{
- blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, -1);
-}
-
-static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op,
- int op_flags)
-{
- blkg_rwstat_add(&bfqg->stats.merged, op, op_flags, 1);
-}
-
-static void bfqg_stats_update_completion(struct bfq_group *bfqg,
- uint64_t start_time, uint64_t io_start_time, int op,
- int op_flags)
-{
- struct bfqg_stats *stats = &bfqg->stats;
- unsigned long long now = sched_clock();
-
- if (time_after64(now, io_start_time))
- blkg_rwstat_add(&stats->service_time, op, op_flags,
- now - io_start_time);
- if (time_after64(io_start_time, start_time))
- blkg_rwstat_add(&stats->wait_time, op, op_flags,
- io_start_time - start_time);
-}
-
-/* @stats = 0 */
-static void bfqg_stats_reset(struct bfqg_stats *stats)
-{
- /* queued stats shouldn't be cleared */
- blkg_rwstat_reset(&stats->merged);
- blkg_rwstat_reset(&stats->service_time);
- blkg_rwstat_reset(&stats->wait_time);
- blkg_stat_reset(&stats->time);
- blkg_stat_reset(&stats->avg_queue_size_sum);
- blkg_stat_reset(&stats->avg_queue_size_samples);
- blkg_stat_reset(&stats->dequeue);
- blkg_stat_reset(&stats->group_wait_time);
- blkg_stat_reset(&stats->idle_time);
- blkg_stat_reset(&stats->empty_time);
-}
-
-/* @to += @from */
-static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
-{
- if (!to || !from)
- return;
-
- /* queued stats shouldn't be cleared */
- blkg_rwstat_add_aux(&to->merged, &from->merged);
- blkg_rwstat_add_aux(&to->service_time, &from->service_time);
- blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
- blkg_stat_add_aux(&from->time, &from->time);
- blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
- blkg_stat_add_aux(&to->avg_queue_size_samples,
- &from->avg_queue_size_samples);
- blkg_stat_add_aux(&to->dequeue, &from->dequeue);
- blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
- blkg_stat_add_aux(&to->idle_time, &from->idle_time);
- blkg_stat_add_aux(&to->empty_time, &from->empty_time);
-}
-
-/*
- * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
- * recursive stats can still account for the amount used by this bfqg after
- * it's gone.
- */
-static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
-{
- struct bfq_group *parent;
-
- if (!bfqg) /* root_group */
- return;
-
- parent = bfqg_parent(bfqg);
-
- lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
-
- if (unlikely(!parent))
- return;
-
- bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
- bfqg_stats_reset(&bfqg->stats);
-}
-
-static void bfq_init_entity(struct bfq_entity *entity,
- struct bfq_group *bfqg)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
- entity->weight = entity->new_weight;
- entity->orig_weight = entity->new_weight;
- if (bfqq) {
- bfqq->ioprio = bfqq->new_ioprio;
- bfqq->ioprio_class = bfqq->new_ioprio_class;
- bfqg_get(bfqg);
- }
- entity->parent = bfqg->my_entity; /* NULL for root group */
- entity->sched_data = &bfqg->sched_data;
-}
-
-static void bfqg_stats_exit(struct bfqg_stats *stats)
-{
- blkg_rwstat_exit(&stats->merged);
- blkg_rwstat_exit(&stats->service_time);
- blkg_rwstat_exit(&stats->wait_time);
- blkg_rwstat_exit(&stats->queued);
- blkg_stat_exit(&stats->time);
- blkg_stat_exit(&stats->avg_queue_size_sum);
- blkg_stat_exit(&stats->avg_queue_size_samples);
- blkg_stat_exit(&stats->dequeue);
- blkg_stat_exit(&stats->group_wait_time);
- blkg_stat_exit(&stats->idle_time);
- blkg_stat_exit(&stats->empty_time);
-}
-
-static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
-{
- if (blkg_rwstat_init(&stats->merged, gfp) ||
- blkg_rwstat_init(&stats->service_time, gfp) ||
- blkg_rwstat_init(&stats->wait_time, gfp) ||
- blkg_rwstat_init(&stats->queued, gfp) ||
- blkg_stat_init(&stats->time, gfp) ||
- blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
- blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
- blkg_stat_init(&stats->dequeue, gfp) ||
- blkg_stat_init(&stats->group_wait_time, gfp) ||
- blkg_stat_init(&stats->idle_time, gfp) ||
- blkg_stat_init(&stats->empty_time, gfp)) {
- bfqg_stats_exit(stats);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
-{
- return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
-}
-
-static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
-{
- return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
-}
-
-static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
-{
- struct bfq_group_data *bgd;
-
- bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
- if (!bgd)
- return NULL;
- return &bgd->pd;
-}
-
-static void bfq_cpd_init(struct blkcg_policy_data *cpd)
-{
- struct bfq_group_data *d = cpd_to_bfqgd(cpd);
-
- d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
- CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
-}
-
-static void bfq_cpd_free(struct blkcg_policy_data *cpd)
-{
- kfree(cpd_to_bfqgd(cpd));
-}
-
-static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
-{
- struct bfq_group *bfqg;
-
- bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
- if (!bfqg)
- return NULL;
-
- if (bfqg_stats_init(&bfqg->stats, gfp)) {
- kfree(bfqg);
- return NULL;
- }
-
- return &bfqg->pd;
-}
-
-static void bfq_pd_init(struct blkg_policy_data *pd)
-{
- struct blkcg_gq *blkg;
- struct bfq_group *bfqg;
- struct bfq_data *bfqd;
- struct bfq_entity *entity;
- struct bfq_group_data *d;
-
- blkg = pd_to_blkg(pd);
- BUG_ON(!blkg);
- bfqg = blkg_to_bfqg(blkg);
- bfqd = blkg->q->elevator->elevator_data;
- entity = &bfqg->entity;
- d = blkcg_to_bfqgd(blkg->blkcg);
-
- entity->orig_weight = entity->weight = entity->new_weight = d->weight;
- entity->my_sched_data = &bfqg->sched_data;
- bfqg->my_entity = entity; /*
- * the root_group's will be set to NULL
- * in bfq_init_queue()
- */
- bfqg->bfqd = bfqd;
- bfqg->active_entities = 0;
- bfqg->rq_pos_tree = RB_ROOT;
-}
-
-static void bfq_pd_free(struct blkg_policy_data *pd)
-{
- struct bfq_group *bfqg = pd_to_bfqg(pd);
-
- bfqg_stats_exit(&bfqg->stats);
- return kfree(bfqg);
-}
-
-static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
-{
- struct bfq_group *bfqg = pd_to_bfqg(pd);
-
- bfqg_stats_reset(&bfqg->stats);
-}
-
-static void bfq_group_set_parent(struct bfq_group *bfqg,
- struct bfq_group *parent)
-{
- struct bfq_entity *entity;
-
- BUG_ON(!parent);
- BUG_ON(!bfqg);
- BUG_ON(bfqg == parent);
-
- entity = &bfqg->entity;
- entity->parent = parent->my_entity;
- entity->sched_data = &parent->sched_data;
-}
-
-static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
- struct blkcg *blkcg)
-{
- struct blkcg_gq *blkg;
-
- blkg = blkg_lookup(blkcg, bfqd->queue);
- if (likely(blkg))
- return blkg_to_bfqg(blkg);
- return NULL;
-}
-
-static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
- struct blkcg *blkcg)
-{
- struct bfq_group *bfqg, *parent;
- struct bfq_entity *entity;
-
- assert_spin_locked(bfqd->queue->queue_lock);
-
- bfqg = bfq_lookup_bfqg(bfqd, blkcg);
-
- if (unlikely(!bfqg))
- return NULL;
-
- /*
- * Update chain of bfq_groups as we might be handling a leaf group
- * which, along with some of its relatives, has not been hooked yet
- * to the private hierarchy of BFQ.
- */
- entity = &bfqg->entity;
- for_each_entity(entity) {
- bfqg = container_of(entity, struct bfq_group, entity);
- BUG_ON(!bfqg);
- if (bfqg != bfqd->root_group) {
- parent = bfqg_parent(bfqg);
- if (!parent)
- parent = bfqd->root_group;
- BUG_ON(!parent);
- bfq_group_set_parent(bfqg, parent);
- }
- }
-
- return bfqg;
-}
-
-static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
- struct bfq_queue *bfqq);
-
-static void bfq_bfqq_expire(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- bool compensate,
- enum bfqq_expiration reason);
-
-/**
- * bfq_bfqq_move - migrate @bfqq to @bfqg.
- * @bfqd: queue descriptor.
- * @bfqq: the queue to move.
- * @bfqg: the group to move to.
- *
- * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
- * it on the new one. Avoid putting the entity on the old group idle tree.
- *
- * Must be called under the queue lock; the cgroup owning @bfqg must
- * not disappear (by now this just means that we are called under
- * rcu_read_lock()).
- */
-static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct bfq_group *bfqg)
-{
- struct bfq_entity *entity = &bfqq->entity;
-
- BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list));
- BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st);
- BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)
- && entity->on_st &&
- bfqq != bfqd->in_service_queue);
- BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue);
-
- /* If bfqq is empty, then bfq_bfqq_expire also invokes
- * bfq_del_bfqq_busy, thereby removing bfqq and its entity
- * from data structures related to current group. Otherwise we
- * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
- * we do below.
- */
- if (bfqq == bfqd->in_service_queue)
- bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
- false, BFQ_BFQQ_PREEMPTED);
-
- BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
- && &bfq_entity_service_tree(entity)->idle !=
- entity->tree);
-
- BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
-
- if (bfq_bfqq_busy(bfqq))
- bfq_deactivate_bfqq(bfqd, bfqq, false, false);
- else if (entity->on_st) {
- BUG_ON(&bfq_entity_service_tree(entity)->idle !=
- entity->tree);
- bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
- }
- bfqg_put(bfqq_group(bfqq));
-
- /*
- * Here we use a reference to bfqg. We don't need a refcounter
- * as the cgroup reference will not be dropped, so that its
- * destroy() callback will not be invoked.
- */
- entity->parent = bfqg->my_entity;
- entity->sched_data = &bfqg->sched_data;
- bfqg_get(bfqg);
-
- BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
- if (bfq_bfqq_busy(bfqq)) {
- bfq_pos_tree_add_move(bfqd, bfqq);
- bfq_activate_bfqq(bfqd, bfqq);
- }
-
- if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
- bfq_schedule_dispatch(bfqd);
- BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
- && &bfq_entity_service_tree(entity)->idle !=
- entity->tree);
-}
-
-/**
- * __bfq_bic_change_cgroup - move @bic to @cgroup.
- * @bfqd: the queue descriptor.
- * @bic: the bic to move.
- * @blkcg: the blk-cgroup to move to.
- *
- * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
- * has to make sure that the reference to cgroup is valid across the call.
- *
- * NOTE: an alternative approach might have been to store the current
- * cgroup in bfqq and getting a reference to it, reducing the lookup
- * time here, at the price of slightly more complex code.
- */
-static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
- struct bfq_io_cq *bic,
- struct blkcg *blkcg)
-{
- struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
- struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
- struct bfq_group *bfqg;
- struct bfq_entity *entity;
-
- lockdep_assert_held(bfqd->queue->queue_lock);
-
- bfqg = bfq_find_set_group(bfqd, blkcg);
-
- if (unlikely(!bfqg))
- bfqg = bfqd->root_group;
-
- if (async_bfqq) {
- entity = &async_bfqq->entity;
-
- if (entity->sched_data != &bfqg->sched_data) {
- bic_set_bfqq(bic, NULL, 0);
- bfq_log_bfqq(bfqd, async_bfqq,
- "bic_change_group: %p %d",
- async_bfqq,
- async_bfqq->ref);
- bfq_put_queue(async_bfqq);
- }
- }
-
- if (sync_bfqq) {
- entity = &sync_bfqq->entity;
- if (entity->sched_data != &bfqg->sched_data)
- bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
- }
-
- return bfqg;
-}
-
-static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
-{
- struct bfq_data *bfqd = bic_to_bfqd(bic);
- struct bfq_group *bfqg = NULL;
- uint64_t serial_nr;
-
- rcu_read_lock();
- serial_nr = bio_blkcg(bio)->css.serial_nr;
-
- /*
- * Check whether blkcg has changed. The condition may trigger
- * spuriously on a newly created cic but there's no harm.
- */
- if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
- goto out;
-
- bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
- bic->blkcg_serial_nr = serial_nr;
-out:
- rcu_read_unlock();
-}
-
-/**
- * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
- * @st: the service tree being flushed.
- */
-static void bfq_flush_idle_tree(struct bfq_service_tree *st)
-{
- struct bfq_entity *entity = st->first_idle;
-
- for (; entity ; entity = st->first_idle)
- __bfq_deactivate_entity(entity, false);
-}
-
-/**
- * bfq_reparent_leaf_entity - move leaf entity to the root_group.
- * @bfqd: the device data structure with the root group.
- * @entity: the entity to move.
- */
-static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
- struct bfq_entity *entity)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
- BUG_ON(!bfqq);
- bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
-}
-
-/**
- * bfq_reparent_active_entities - move to the root group all active
- * entities.
- * @bfqd: the device data structure with the root group.
- * @bfqg: the group to move from.
- * @st: the service tree with the entities.
- *
- * Needs queue_lock to be taken and reference to be valid over the call.
- */
-static void bfq_reparent_active_entities(struct bfq_data *bfqd,
- struct bfq_group *bfqg,
- struct bfq_service_tree *st)
-{
- struct rb_root *active = &st->active;
- struct bfq_entity *entity = NULL;
-
- if (!RB_EMPTY_ROOT(&st->active))
- entity = bfq_entity_of(rb_first(active));
-
- for (; entity ; entity = bfq_entity_of(rb_first(active)))
- bfq_reparent_leaf_entity(bfqd, entity);
-
- if (bfqg->sched_data.in_service_entity)
- bfq_reparent_leaf_entity(bfqd,
- bfqg->sched_data.in_service_entity);
-}
-
-/**
- * bfq_pd_offline - deactivate the entity associated with @pd,
- * and reparent its children entities.
- * @pd: descriptor of the policy going offline.
- *
- * blkio already grabs the queue_lock for us, so no need to use
- * RCU-based magic
- */
-static void bfq_pd_offline(struct blkg_policy_data *pd)
-{
- struct bfq_service_tree *st;
- struct bfq_group *bfqg;
- struct bfq_data *bfqd;
- struct bfq_entity *entity;
- int i;
-
- BUG_ON(!pd);
- bfqg = pd_to_bfqg(pd);
- BUG_ON(!bfqg);
- bfqd = bfqg->bfqd;
- BUG_ON(bfqd && !bfqd->root_group);
-
- entity = bfqg->my_entity;
-
- if (!entity) /* root group */
- return;
-
- /*
- * Empty all service_trees belonging to this group before
- * deactivating the group itself.
- */
- for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
- st = bfqg->sched_data.service_tree + i;
- /*
- * The idle tree may still contain bfq_queues belonging
- * to exited task because they never migrated to a different
- * cgroup from the one being destroyed now. No one else
- * can access them so it's safe to act without any lock.
- */
- bfq_flush_idle_tree(st);
-
- /*
- * It may happen that some queues are still active
- * (busy) upon group destruction (if the corresponding
- * processes have been forced to terminate). We move
- * all the leaf entities corresponding to these queues
- * to the root_group.
- * Also, it may happen that the group has an entity
- * in service, which is disconnected from the active
- * tree: it must be moved, too.
- * There is no need to put the sync queues, as the
- * scheduler has taken no reference.
- */
- bfq_reparent_active_entities(bfqd, bfqg, st);
- BUG_ON(!RB_EMPTY_ROOT(&st->active));
- BUG_ON(!RB_EMPTY_ROOT(&st->idle));
- }
- BUG_ON(bfqg->sched_data.next_in_service);
- BUG_ON(bfqg->sched_data.in_service_entity);
-
- __bfq_deactivate_entity(entity, false);
- bfq_put_async_queues(bfqd, bfqg);
-
- /*
- * @blkg is going offline and will be ignored by
- * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
- * that they don't get lost. If IOs complete after this point, the
- * stats for them will be lost. Oh well...
- */
- bfqg_stats_xfer_dead(bfqg);
-}
-
-static void bfq_end_wr_async(struct bfq_data *bfqd)
-{
- struct blkcg_gq *blkg;
-
- list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
- struct bfq_group *bfqg = blkg_to_bfqg(blkg);
- BUG_ON(!bfqg);
-
- bfq_end_wr_async_queues(bfqd, bfqg);
- }
- bfq_end_wr_async_queues(bfqd, bfqd->root_group);
-}
-
-static int bfq_io_show_weight(struct seq_file *sf, void *v)
-{
- struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
- struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
- unsigned int val = 0;
-
- if (bfqgd)
- val = bfqgd->weight;
-
- seq_printf(sf, "%u\n", val);
-
- return 0;
-}
-
-static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
- struct cftype *cftype,
- u64 val)
-{
- struct blkcg *blkcg = css_to_blkcg(css);
- struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
- struct blkcg_gq *blkg;
- int ret = -ERANGE;
-
- if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
- return ret;
-
- ret = 0;
- spin_lock_irq(&blkcg->lock);
- bfqgd->weight = (unsigned short)val;
- hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
- struct bfq_group *bfqg = blkg_to_bfqg(blkg);
-
- if (!bfqg)
- continue;
- /*
- * Setting the prio_changed flag of the entity
- * to 1 with new_weight == weight would re-set
- * the value of the weight to its ioprio mapping.
- * Set the flag only if necessary.
- */
- if ((unsigned short)val != bfqg->entity.new_weight) {
- bfqg->entity.new_weight = (unsigned short)val;
- /*
- * Make sure that the above new value has been
- * stored in bfqg->entity.new_weight before
- * setting the prio_changed flag. In fact,
- * this flag may be read asynchronously (in
- * critical sections protected by a different
- * lock than that held here), and finding this
- * flag set may cause the execution of the code
- * for updating parameters whose value may
- * depend also on bfqg->entity.new_weight (in
- * __bfq_entity_update_weight_prio).
- * This barrier makes sure that the new value
- * of bfqg->entity.new_weight is correctly
- * seen in that code.
- */
- smp_wmb();
- bfqg->entity.prio_changed = 1;
- }
- }
- spin_unlock_irq(&blkcg->lock);
-
- return ret;
-}
-
-static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
-{
- u64 weight;
- /* First unsigned long found in the file is used */
- int ret = kstrtoull(strim(buf), 0, &weight);
-
- if (ret)
- return ret;
-
- return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
-}
-
-static int bfqg_print_stat(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
- &blkcg_policy_bfq, seq_cft(sf)->private, false);
- return 0;
-}
-
-static int bfqg_print_rwstat(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
- &blkcg_policy_bfq, seq_cft(sf)->private, true);
- return 0;
-}
-
-static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
- &blkcg_policy_bfq, off);
- return __blkg_prfill_u64(sf, pd, sum);
-}
-
-static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
- &blkcg_policy_bfq,
- off);
- return __blkg_prfill_rwstat(sf, pd, &sum);
-}
-
-static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, false);
- return 0;
-}
-
-static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, true);
- return 0;
-}
-
-static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
- int off)
-{
- u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
-
- return __blkg_prfill_u64(sf, pd, sum >> 9);
-}
-
-static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
- return 0;
-}
-
-static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
- offsetof(struct blkcg_gq, stat_bytes));
- u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
- atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
-
- return __blkg_prfill_u64(sf, pd, sum >> 9);
-}
-
-static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
- false);
- return 0;
-}
-
-
-static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- struct bfq_group *bfqg = pd_to_bfqg(pd);
- u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
- u64 v = 0;
-
- if (samples) {
- v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
- v = div64_u64(v, samples);
- }
- __blkg_prfill_u64(sf, pd, v);
- return 0;
-}
-
-/* print avg_queue_size */
-static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
- 0, false);
- return 0;
-}
-
-static struct bfq_group *
-bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
-{
- int ret;
-
- ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
- if (ret)
- return NULL;
-
- return blkg_to_bfqg(bfqd->queue->root_blkg);
-}
-
-static struct cftype bfq_blkcg_legacy_files[] = {
- {
- .name = "bfq.weight",
- .flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = bfq_io_show_weight,
- .write_u64 = bfq_io_set_weight_legacy,
- },
-
- /* statistics, covers only the tasks in the bfqg */
- {
- .name = "bfq.time",
- .private = offsetof(struct bfq_group, stats.time),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.sectors",
- .seq_show = bfqg_print_stat_sectors,
- },
- {
- .name = "bfq.io_service_bytes",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_bytes,
- },
- {
- .name = "bfq.io_serviced",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_ios,
- },
- {
- .name = "bfq.io_service_time",
- .private = offsetof(struct bfq_group, stats.service_time),
- .seq_show = bfqg_print_rwstat,
- },
- {
- .name = "bfq.io_wait_time",
- .private = offsetof(struct bfq_group, stats.wait_time),
- .seq_show = bfqg_print_rwstat,
- },
- {
- .name = "bfq.io_merged",
- .private = offsetof(struct bfq_group, stats.merged),
- .seq_show = bfqg_print_rwstat,
- },
- {
- .name = "bfq.io_queued",
- .private = offsetof(struct bfq_group, stats.queued),
- .seq_show = bfqg_print_rwstat,
- },
-
- /* the same statictics which cover the bfqg and its descendants */
- {
- .name = "bfq.time_recursive",
- .private = offsetof(struct bfq_group, stats.time),
- .seq_show = bfqg_print_stat_recursive,
- },
- {
- .name = "bfq.sectors_recursive",
- .seq_show = bfqg_print_stat_sectors_recursive,
- },
- {
- .name = "bfq.io_service_bytes_recursive",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_bytes_recursive,
- },
- {
- .name = "bfq.io_serviced_recursive",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_ios_recursive,
- },
- {
- .name = "bfq.io_service_time_recursive",
- .private = offsetof(struct bfq_group, stats.service_time),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.io_wait_time_recursive",
- .private = offsetof(struct bfq_group, stats.wait_time),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.io_merged_recursive",
- .private = offsetof(struct bfq_group, stats.merged),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.io_queued_recursive",
- .private = offsetof(struct bfq_group, stats.queued),
- .seq_show = bfqg_print_rwstat_recursive,
- },
- {
- .name = "bfq.avg_queue_size",
- .seq_show = bfqg_print_avg_queue_size,
- },
- {
- .name = "bfq.group_wait_time",
- .private = offsetof(struct bfq_group, stats.group_wait_time),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.idle_time",
- .private = offsetof(struct bfq_group, stats.idle_time),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.empty_time",
- .private = offsetof(struct bfq_group, stats.empty_time),
- .seq_show = bfqg_print_stat,
- },
- {
- .name = "bfq.dequeue",
- .private = offsetof(struct bfq_group, stats.dequeue),
- .seq_show = bfqg_print_stat,
- },
- { } /* terminate */
-};
-
-static struct cftype bfq_blkg_files[] = {
- {
- .name = "bfq.weight",
- .flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = bfq_io_show_weight,
- .write = bfq_io_set_weight,
- },
- {} /* terminate */
-};
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-
-static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
- struct bfq_queue *bfqq, int op, int op_flags) { }
-static inline void
-bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, int op_flags) { }
-static inline void
-bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, int op_flags) { }
-static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
- uint64_t start_time, uint64_t io_start_time, int op,
- int op_flags) { }
-static inline void
-bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
- struct bfq_group *curr_bfqg) { }
-static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
-static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
-
-static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct bfq_group *bfqg) {}
-
-static void bfq_init_entity(struct bfq_entity *entity,
- struct bfq_group *bfqg)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
- entity->weight = entity->new_weight;
- entity->orig_weight = entity->new_weight;
- if (bfqq) {
- bfqq->ioprio = bfqq->new_ioprio;
- bfqq->ioprio_class = bfqq->new_ioprio_class;
- }
- entity->sched_data = &bfqg->sched_data;
-}
-
-static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
-
-static void bfq_end_wr_async(struct bfq_data *bfqd)
-{
- bfq_end_wr_async_queues(bfqd, bfqd->root_group);
-}
-
-static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
- struct blkcg *blkcg)
-{
- return bfqd->root_group;
-}
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
-{
- return bfqq->bfqd->root_group;
-}
-
-static struct bfq_group *
-bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
-{
- struct bfq_group *bfqg;
- int i;
-
- bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
- if (!bfqg)
- return NULL;
-
- for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
- bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
-
- return bfqg;
-}
-#endif
diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
deleted file mode 100644
index fb7bb8f08b75..000000000000
--- a/block/bfq-ioc.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * BFQ: I/O context handling.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe
- *
- * Copyright (C) 2008 Fabio Checconi
- * Paolo Valente
- *
- * Copyright (C) 2010 Paolo Valente
- */
-
-/**
- * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
- * @icq: the iocontext queue.
- */
-static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
-{
- /* bic->icq is the first member, %NULL will convert to %NULL */
- return container_of(icq, struct bfq_io_cq, icq);
-}
-
-/**
- * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
- * @bfqd: the lookup key.
- * @ioc: the io_context of the process doing I/O.
- *
- * Queue lock must be held.
- */
-static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
- struct io_context *ioc)
-{
- if (ioc)
- return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
- return NULL;
-}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
deleted file mode 100644
index 6e6025dacfc6..000000000000
--- a/block/bfq-iosched.c
+++ /dev/null
@@ -1,5403 +0,0 @@
-/*
- * Budget Fair Queueing (BFQ) I/O scheduler.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe
- *
- * Copyright (C) 2008 Fabio Checconi
- * Paolo Valente
- *
- * Copyright (C) 2015 Paolo Valente
- *
- * Copyright (C) 2017 Paolo Valente
- *
- * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
- * file.
- *
- * BFQ is a proportional-share I/O scheduler, with some extra
- * low-latency capabilities. BFQ also supports full hierarchical
- * scheduling through cgroups. Next paragraphs provide an introduction
- * on BFQ inner workings. Details on BFQ benefits and usage can be
- * found in Documentation/block/bfq-iosched.txt.
- *
- * BFQ is a proportional-share storage-I/O scheduling algorithm based
- * on the slice-by-slice service scheme of CFQ. But BFQ assigns
- * budgets, measured in number of sectors, to processes instead of
- * time slices. The device is not granted to the in-service process
- * for a given time slice, but until it has exhausted its assigned
- * budget. This change from the time to the service domain enables BFQ
- * to distribute the device throughput among processes as desired,
- * without any distortion due to throughput fluctuations, or to device
- * internal queueing. BFQ uses an ad hoc internal scheduler, called
- * B-WF2Q+, to schedule processes according to their budgets. More
- * precisely, BFQ schedules queues associated with processes. Thanks to
- * the accurate policy of B-WF2Q+, BFQ can afford to assign high
- * budgets to I/O-bound processes issuing sequential requests (to
- * boost the throughput), and yet guarantee a low latency to
- * interactive and soft real-time applications.
- *
- * NOTE: if the main or only goal, with a given device, is to achieve
- * the maximum-possible throughput at all times, then do switch off
- * all low-latency heuristics for that device, by setting low_latency
- * to 0.
- *
- * BFQ is described in [1], where also a reference to the initial, more
- * theoretical paper on BFQ can be found. The interested reader can find
- * in the latter paper full details on the main algorithm, as well as
- * formulas of the guarantees and formal proofs of all the properties.
- * With respect to the version of BFQ presented in these papers, this
- * implementation adds a few more heuristics, such as the one that
- * guarantees a low latency to soft real-time applications, and a
- * hierarchical extension based on H-WF2Q+.
- *
- * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
- * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
- * complexity derives from the one introduced with EEVDF in [3].
- *
- * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
- * Scheduler", Proceedings of the First Workshop on Mobile System
- * Technologies (MST-2015), May 2015.
- * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
- *
- * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
- *
- * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
- * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
- * Oct 1997.
- *
- * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
- *
- * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
- * First: A Flexible and Accurate Mechanism for Proportional Share
- * Resource Allocation,'' technical report.
- *
- * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
- */
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "bfq.h"
-#include "blk.h"
-
-/* Expiration time of sync (0) and async (1) requests, in ns. */
-static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
-
-/* Maximum backwards seek, in KiB. */
-static const int bfq_back_max = (16 * 1024);
-
-/* Penalty of a backwards seek, in number of sectors. */
-static const int bfq_back_penalty = 2;
-
-/* Idling period duration, in ns. */
-static u32 bfq_slice_idle = (NSEC_PER_SEC / 125);
-
-/* Minimum number of assigned budgets for which stats are safe to compute. */
-static const int bfq_stats_min_budgets = 194;
-
-/* Default maximum budget values, in sectors and number of requests. */
-static const int bfq_default_max_budget = (16 * 1024);
-
-/*
- * Async to sync throughput distribution is controlled as follows:
- * when an async request is served, the entity is charged the number
- * of sectors of the request, multiplied by the factor below
- */
-static const int bfq_async_charge_factor = 10;
-
-/* Default timeout values, in jiffies, approximating CFQ defaults. */
-static const int bfq_timeout = (HZ / 8);
-
-static struct kmem_cache *bfq_pool;
-
-/* Below this threshold (in ns), we consider thinktime immediate. */
-#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
-
-/* hw_tag detection: parallel requests threshold and min samples needed. */
-#define BFQ_HW_QUEUE_THRESHOLD 4
-#define BFQ_HW_QUEUE_SAMPLES 32
-
-#define BFQQ_SEEK_THR (sector_t)(8 * 100)
-#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
-#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
-#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
-
-/* Min number of samples required to perform peak-rate update */
-#define BFQ_RATE_MIN_SAMPLES 32
-/* Min observation time interval required to perform a peak-rate update (ns) */
-#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
-/* Target observation time interval for a peak-rate update (ns) */
-#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
-
-/* Shift used for peak rate fixed precision calculations. */
-#define BFQ_RATE_SHIFT 16
-
-/*
- * By default, BFQ computes the duration of the weight raising for
- * interactive applications automatically, using the following formula:
- * duration = (R / r) * T, where r is the peak rate of the device, and
- * R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see below),
- * and T is a reference time: given the systems that are likely to be
- * installed on the reference device according to its speed class, T is
- * about the maximum time needed, under BFQ and while reading two files in
- * parallel, to load typical large applications on these systems.
- * In practice, the slower/faster the device at hand is, the more/less it
- * takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to interactive
- * applications.
- *
- * BFQ uses four different reference pairs (R, T), depending on:
- * . whether the device is rotational or non-rotational;
- * . whether the device is slow, such as old or portable HDDs, as well as
- * SD cards, or fast, such as newer HDDs and SSDs.
- *
- * The device's speed class is dynamically (re)detected in
- * bfq_update_peak_rate() every time the estimated peak rate is updated.
- *
- * In the following definitions, R_slow[0]/R_fast[0] and
- * T_slow[0]/T_fast[0] are the reference values for a slow/fast
- * rotational device, whereas R_slow[1]/R_fast[1] and
- * T_slow[1]/T_fast[1] are the reference values for a slow/fast
- * non-rotational device. Finally, device_speed_thresh are the
- * thresholds used to switch between speed classes. The reference
- * rates are not the actual peak rates of the devices used as a
- * reference, but slightly lower values. The reason for using these
- * slightly lower values is that the peak-rate estimator tends to
- * yield slightly lower values than the actual peak rate (it can yield
- * the actual peak rate only if there is only one process doing I/O,
- * and the process does sequential I/O).
- *
- * Both the reference peak rates and the thresholds are measured in
- * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
- */
-static int R_slow[2] = {1000, 10700};
-static int R_fast[2] = {14000, 33000};
-/*
- * To improve readability, a conversion function is used to initialize the
- * following arrays, which entails that they can be initialized only in a
- * function.
- */
-static int T_slow[2];
-static int T_fast[2];
-static int device_speed_thresh[2];
-
-#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
- { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
-
-#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
-#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
-
-static void bfq_schedule_dispatch(struct bfq_data *bfqd);
-
-#include "bfq-ioc.c"
-#include "bfq-sched.c"
-#include "bfq-cgroup.c"
-
-#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
-#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
-
-#define bfq_sample_valid(samples) ((samples) > 80)
-
-/*
- * We regard a request as SYNC, if either it's a read or has the SYNC bit
- * set (in which case it could also be a direct WRITE).
- */
-static int bfq_bio_sync(struct bio *bio)
-{
- return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC);
-}
-
-/*
- * Scheduler run of queue, if there are requests pending and no one in the
- * driver that will restart queueing.
- */
-static void bfq_schedule_dispatch(struct bfq_data *bfqd)
-{
- if (bfqd->queued != 0) {
- bfq_log(bfqd, "schedule dispatch");
- kblockd_schedule_work(&bfqd->unplug_work);
- }
-}
-
-/*
- * Lifted from AS - choose which of rq1 and rq2 that is best served now.
- * We choose the request that is closesr to the head right now. Distance
- * behind the head is penalized and only allowed to a certain extent.
- */
-static struct request *bfq_choose_req(struct bfq_data *bfqd,
- struct request *rq1,
- struct request *rq2,
- sector_t last)
-{
- sector_t s1, s2, d1 = 0, d2 = 0;
- unsigned long back_max;
-#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
-#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
- unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
-
- if (!rq1 || rq1 == rq2)
- return rq2;
- if (!rq2)
- return rq1;
-
- if (rq_is_sync(rq1) && !rq_is_sync(rq2))
- return rq1;
- else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
- return rq2;
- if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
- return rq1;
- else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
- return rq2;
-
- s1 = blk_rq_pos(rq1);
- s2 = blk_rq_pos(rq2);
-
- /*
- * By definition, 1KiB is 2 sectors.
- */
- back_max = bfqd->bfq_back_max * 2;
-
- /*
- * Strict one way elevator _except_ in the case where we allow
- * short backward seeks which are biased as twice the cost of a
- * similar forward seek.
- */
- if (s1 >= last)
- d1 = s1 - last;
- else if (s1 + back_max >= last)
- d1 = (last - s1) * bfqd->bfq_back_penalty;
- else
- wrap |= BFQ_RQ1_WRAP;
-
- if (s2 >= last)
- d2 = s2 - last;
- else if (s2 + back_max >= last)
- d2 = (last - s2) * bfqd->bfq_back_penalty;
- else
- wrap |= BFQ_RQ2_WRAP;
-
- /* Found required data */
-
- /*
- * By doing switch() on the bit mask "wrap" we avoid having to
- * check two variables for all permutations: --> faster!
- */
- switch (wrap) {
- case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
- if (d1 < d2)
- return rq1;
- else if (d2 < d1)
- return rq2;
-
- if (s1 >= s2)
- return rq1;
- else
- return rq2;
-
- case BFQ_RQ2_WRAP:
- return rq1;
- case BFQ_RQ1_WRAP:
- return rq2;
- case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
- default:
- /*
- * Since both rqs are wrapped,
- * start with the one that's further behind head
- * (--> only *one* back seek required),
- * since back seek takes more time than forward.
- */
- if (s1 <= s2)
- return rq1;
- else
- return rq2;
- }
-}
-
-static struct bfq_queue *
-bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
- sector_t sector, struct rb_node **ret_parent,
- struct rb_node ***rb_link)
-{
- struct rb_node **p, *parent;
- struct bfq_queue *bfqq = NULL;
-
- parent = NULL;
- p = &root->rb_node;
- while (*p) {
- struct rb_node **n;
-
- parent = *p;
- bfqq = rb_entry(parent, struct bfq_queue, pos_node);
-
- /*
- * Sort strictly based on sector. Smallest to the left,
- * largest to the right.
- */
- if (sector > blk_rq_pos(bfqq->next_rq))
- n = &(*p)->rb_right;
- else if (sector < blk_rq_pos(bfqq->next_rq))
- n = &(*p)->rb_left;
- else
- break;
- p = n;
- bfqq = NULL;
- }
-
- *ret_parent = parent;
- if (rb_link)
- *rb_link = p;
-
- bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
- (unsigned long long) sector,
- bfqq ? bfqq->pid : 0);
-
- return bfqq;
-}
-
-static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- struct rb_node **p, *parent;
- struct bfq_queue *__bfqq;
-
- if (bfqq->pos_root) {
- rb_erase(&bfqq->pos_node, bfqq->pos_root);
- bfqq->pos_root = NULL;
- }
-
- if (bfq_class_idle(bfqq))
- return;
- if (!bfqq->next_rq)
- return;
-
- bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
- __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
- blk_rq_pos(bfqq->next_rq), &parent, &p);
- if (!__bfqq) {
- rb_link_node(&bfqq->pos_node, parent, p);
- rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
- } else
- bfqq->pos_root = NULL;
-}
-
-/*
- * Tell whether there are active queues or groups with differentiated weights.
- */
-static bool bfq_differentiated_weights(struct bfq_data *bfqd)
-{
- /*
- * For weights to differ, at least one of the trees must contain
- * at least two nodes.
- */
- return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
- (bfqd->queue_weights_tree.rb_node->rb_left ||
- bfqd->queue_weights_tree.rb_node->rb_right)
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- ) ||
- (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
- (bfqd->group_weights_tree.rb_node->rb_left ||
- bfqd->group_weights_tree.rb_node->rb_right)
-#endif
- );
-}
-
-/*
- * The following function returns true if every queue must receive the
- * same share of the throughput (this condition is used when deciding
- * whether idling may be disabled, see the comments in the function
- * bfq_bfqq_may_idle()).
- *
- * Such a scenario occurs when:
- * 1) all active queues have the same weight,
- * 2) all active groups at the same level in the groups tree have the same
- * weight,
- * 3) all active groups at the same level in the groups tree have the same
- * number of children.
- *
- * Unfortunately, keeping the necessary state for evaluating exactly the
- * above symmetry conditions would be quite complex and time-consuming.
- * Therefore this function evaluates, instead, the following stronger
- * sub-conditions, for which it is much easier to maintain the needed
- * state:
- * 1) all active queues have the same weight,
- * 2) all active groups have the same weight,
- * 3) all active groups have at most one active child each.
- * In particular, the last two conditions are always true if hierarchical
- * support and the cgroups interface are not enabled, thus no state needs
- * to be maintained in this case.
- */
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
-{
- return !bfq_differentiated_weights(bfqd);
-}
-
-/*
- * If the weight-counter tree passed as input contains no counter for
- * the weight of the input entity, then add that counter; otherwise just
- * increment the existing counter.
- *
- * Note that weight-counter trees contain few nodes in mostly symmetric
- * scenarios. For example, if all queues have the same weight, then the
- * weight-counter tree for the queues may contain at most one node.
- * This holds even if low_latency is on, because weight-raised queues
- * are not inserted in the tree.
- * In most scenarios, the rate at which nodes are created/destroyed
- * should be low too.
- */
-static void bfq_weights_tree_add(struct bfq_data *bfqd,
- struct bfq_entity *entity,
- struct rb_root *root)
-{
- struct rb_node **new = &(root->rb_node), *parent = NULL;
-
- /*
- * Do not insert if the entity is already associated with a
- * counter, which happens if:
- * 1) the entity is associated with a queue,
- * 2) a request arrival has caused the queue to become both
- * non-weight-raised, and hence change its weight, and
- * backlogged; in this respect, each of the two events
- * causes an invocation of this function,
- * 3) this is the invocation of this function caused by the
- * second event. This second invocation is actually useless,
- * and we handle this fact by exiting immediately. More
- * efficient or clearer solutions might possibly be adopted.
- */
- if (entity->weight_counter)
- return;
-
- while (*new) {
- struct bfq_weight_counter *__counter = container_of(*new,
- struct bfq_weight_counter,
- weights_node);
- parent = *new;
-
- if (entity->weight == __counter->weight) {
- entity->weight_counter = __counter;
- goto inc_counter;
- }
- if (entity->weight < __counter->weight)
- new = &((*new)->rb_left);
- else
- new = &((*new)->rb_right);
- }
-
- entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
- GFP_ATOMIC);
-
- /*
- * In the unlucky event of an allocation failure, we just
- * exit. This will cause the weight of entity to not be
- * considered in bfq_differentiated_weights, which, in its
- * turn, causes the scenario to be deemed wrongly symmetric in
- * case entity's weight would have been the only weight making
- * the scenario asymmetric. On the bright side, no unbalance
- * will however occur when entity becomes inactive again (the
- * invocation of this function is triggered by an activation
- * of entity). In fact, bfq_weights_tree_remove does nothing
- * if !entity->weight_counter.
- */
- if (unlikely(!entity->weight_counter))
- return;
-
- entity->weight_counter->weight = entity->weight;
- rb_link_node(&entity->weight_counter->weights_node, parent, new);
- rb_insert_color(&entity->weight_counter->weights_node, root);
-
-inc_counter:
- entity->weight_counter->num_active++;
-}
-
-/*
- * Decrement the weight counter associated with the entity, and, if the
- * counter reaches 0, remove the counter from the tree.
- * See the comments to the function bfq_weights_tree_add() for considerations
- * about overhead.
- */
-static void bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_entity *entity,
- struct rb_root *root)
-{
- if (!entity->weight_counter)
- return;
-
- BUG_ON(RB_EMPTY_ROOT(root));
- BUG_ON(entity->weight_counter->weight != entity->weight);
-
- BUG_ON(!entity->weight_counter->num_active);
- entity->weight_counter->num_active--;
- if (entity->weight_counter->num_active > 0)
- goto reset_entity_pointer;
-
- rb_erase(&entity->weight_counter->weights_node, root);
- kfree(entity->weight_counter);
-
-reset_entity_pointer:
- entity->weight_counter = NULL;
-}
-
-/*
- * Return expired entry, or NULL to just start from scratch in rbtree.
- */
-static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
- struct request *last)
-{
- struct request *rq;
-
- if (bfq_bfqq_fifo_expire(bfqq))
- return NULL;
-
- bfq_mark_bfqq_fifo_expire(bfqq);
-
- rq = rq_entry_fifo(bfqq->fifo.next);
-
- if (rq == last || ktime_get_ns() < rq->fifo_time)
- return NULL;
-
- bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
- BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
- return rq;
-}
-
-static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- struct request *last)
-{
- struct rb_node *rbnext = rb_next(&last->rb_node);
- struct rb_node *rbprev = rb_prev(&last->rb_node);
- struct request *next, *prev = NULL;
-
- BUG_ON(list_empty(&bfqq->fifo));
-
- /* Follow expired path, else get first next available. */
- next = bfq_check_fifo(bfqq, last);
- if (next) {
- BUG_ON(next == last);
- return next;
- }
-
- BUG_ON(RB_EMPTY_NODE(&last->rb_node));
-
- if (rbprev)
- prev = rb_entry_rq(rbprev);
-
- if (rbnext)
- next = rb_entry_rq(rbnext);
- else {
- rbnext = rb_first(&bfqq->sort_list);
- if (rbnext && rbnext != &last->rb_node)
- next = rb_entry_rq(rbnext);
- }
-
- return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
-}
-
-/* see the definition of bfq_async_charge_factor for details */
-static unsigned long bfq_serv_to_charge(struct request *rq,
- struct bfq_queue *bfqq)
-{
- if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
- return blk_rq_sectors(rq);
-
- /*
- * If there are no weight-raised queues, then amplify service
- * by just the async charge factor; otherwise amplify service
- * by twice the async charge factor, to further reduce latency
- * for weight-raised queues.
- */
- if (bfqq->bfqd->wr_busy_queues == 0)
- return blk_rq_sectors(rq) * bfq_async_charge_factor;
-
- return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
-}
-
-/**
- * bfq_updated_next_req - update the queue after a new next_rq selection.
- * @bfqd: the device data the queue belongs to.
- * @bfqq: the queue to update.
- *
- * If the first request of a queue changes we make sure that the queue
- * has enough budget to serve at least its first request (if the
- * request has grown). We do this because if the queue has not enough
- * budget for its first request, it has to go through two dispatch
- * rounds to actually get it dispatched.
- */
-static void bfq_updated_next_req(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- struct bfq_entity *entity = &bfqq->entity;
- struct bfq_service_tree *st = bfq_entity_service_tree(entity);
- struct request *next_rq = bfqq->next_rq;
- unsigned long new_budget;
-
- if (!next_rq)
- return;
-
- if (bfqq == bfqd->in_service_queue)
- /*
- * In order not to break guarantees, budgets cannot be
- * changed after an entity has been selected.
- */
- return;
-
- BUG_ON(entity->tree != &st->active);
- BUG_ON(entity == entity->sched_data->in_service_entity);
-
- new_budget = max_t(unsigned long, bfqq->max_budget,
- bfq_serv_to_charge(next_rq, bfqq));
- if (entity->budget != new_budget) {
- entity->budget = new_budget;
- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
- new_budget);
- bfq_requeue_bfqq(bfqd, bfqq);
- }
-}
-
-static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
-{
- u64 dur;
-
- if (bfqd->bfq_wr_max_time > 0)
- return bfqd->bfq_wr_max_time;
-
- dur = bfqd->RT_prod;
- do_div(dur, bfqd->peak_rate);
-
- /*
- * Limit duration between 3 and 13 seconds. Tests show that
- * higher values than 13 seconds often yield the opposite of
- * the desired result, i.e., worsen responsiveness by letting
- * non-interactive and non-soft-real-time applications
- * preserve weight raising for a too long time interval.
- *
- * On the other end, lower values than 3 seconds make it
- * difficult for most interactive tasks to complete their jobs
- * before weight-raising finishes.
- */
- if (dur > msecs_to_jiffies(13000))
- dur = msecs_to_jiffies(13000);
- else if (dur < msecs_to_jiffies(3000))
- dur = msecs_to_jiffies(3000);
-
- return dur;
-}
-
-static void
-bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
- struct bfq_io_cq *bic, bool bfq_already_existing)
-{
- unsigned int old_wr_coeff;
- bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
-
- if (bic->saved_has_short_ttime)
- bfq_mark_bfqq_has_short_ttime(bfqq);
- else
- bfq_clear_bfqq_has_short_ttime(bfqq);
-
- if (bic->saved_IO_bound)
- bfq_mark_bfqq_IO_bound(bfqq);
- else
- bfq_clear_bfqq_IO_bound(bfqq);
-
- if (unlikely(busy))
- old_wr_coeff = bfqq->wr_coeff;
-
- bfqq->wr_coeff = bic->saved_wr_coeff;
- bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
- BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
- bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
- bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
- BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
-
- if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
- time_is_before_jiffies(bfqq->last_wr_start_finish +
- bfqq->wr_cur_max_time))) {
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "resume state: switching off wr (%lu + %lu < %lu)",
- bfqq->last_wr_start_finish, bfqq->wr_cur_max_time,
- jiffies);
-
- bfqq->wr_coeff = 1;
- }
-
- /* make sure weight will be updated, however we got here */
- bfqq->entity.prio_changed = 1;
-
- if (likely(!busy))
- return;
-
- if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) {
- bfqd->wr_busy_queues++;
- BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues);
- } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) {
- bfqd->wr_busy_queues--;
- BUG_ON(bfqd->wr_busy_queues < 0);
- }
-}
-
-static int bfqq_process_refs(struct bfq_queue *bfqq)
-{
- int process_refs, io_refs;
-
- lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
-
- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
- process_refs = bfqq->ref - io_refs - bfqq->entity.on_st;
- BUG_ON(process_refs < 0);
- return process_refs;
-}
-
-/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
-static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- struct bfq_queue *item;
- struct hlist_node *n;
-
- hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
- hlist_del_init(&item->burst_list_node);
- hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
- bfqd->burst_size = 1;
- bfqd->burst_parent_entity = bfqq->entity.parent;
-}
-
-/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
-static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- /* Increment burst size to take into account also bfqq */
- bfqd->burst_size++;
-
- bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size);
-
- BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh);
-
- if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
- struct bfq_queue *pos, *bfqq_item;
- struct hlist_node *n;
-
- /*
- * Enough queues have been activated shortly after each
- * other to consider this burst as large.
- */
- bfqd->large_burst = true;
- bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started");
-
- /*
- * We can now mark all queues in the burst list as
- * belonging to a large burst.
- */
- hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
- burst_list_node) {
- bfq_mark_bfqq_in_large_burst(bfqq_item);
- bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst");
- }
- bfq_mark_bfqq_in_large_burst(bfqq);
- bfq_log_bfqq(bfqd, bfqq, "marked in large burst");
-
- /*
- * From now on, and until the current burst finishes, any
- * new queue being activated shortly after the last queue
- * was inserted in the burst can be immediately marked as
- * belonging to a large burst. So the burst list is not
- * needed any more. Remove it.
- */
- hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
- burst_list_node)
- hlist_del_init(&pos->burst_list_node);
- } else /*
- * Burst not yet large: add bfqq to the burst list. Do
- * not increment the ref counter for bfqq, because bfqq
- * is removed from the burst list before freeing bfqq
- * in put_queue.
- */
- hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
-}
-
-/*
- * If many queues belonging to the same group happen to be created
- * shortly after each other, then the processes associated with these
- * queues have typically a common goal. In particular, bursts of queue
- * creations are usually caused by services or applications that spawn
- * many parallel threads/processes. Examples are systemd during boot,
- * or git grep. To help these processes get their job done as soon as
- * possible, it is usually better to not grant either weight-raising
- * or device idling to their queues.
- *
- * In this comment we describe, firstly, the reasons why this fact
- * holds, and, secondly, the next function, which implements the main
- * steps needed to properly mark these queues so that they can then be
- * treated in a different way.
- *
- * The above services or applications benefit mostly from a high
- * throughput: the quicker the requests of the activated queues are
- * cumulatively served, the sooner the target job of these queues gets
- * completed. As a consequence, weight-raising any of these queues,
- * which also implies idling the device for it, is almost always
- * counterproductive. In most cases it just lowers throughput.
- *
- * On the other hand, a burst of queue creations may be caused also by
- * the start of an application that does not consist of a lot of
- * parallel I/O-bound threads. In fact, with a complex application,
- * several short processes may need to be executed to start-up the
- * application. In this respect, to start an application as quickly as
- * possible, the best thing to do is in any case to privilege the I/O
- * related to the application with respect to all other
- * I/O. Therefore, the best strategy to start as quickly as possible
- * an application that causes a burst of queue creations is to
- * weight-raise all the queues created during the burst. This is the
- * exact opposite of the best strategy for the other type of bursts.
- *
- * In the end, to take the best action for each of the two cases, the
- * two types of bursts need to be distinguished. Fortunately, this
- * seems relatively easy, by looking at the sizes of the bursts. In
- * particular, we found a threshold such that only bursts with a
- * larger size than that threshold are apparently caused by
- * services or commands such as systemd or git grep. For brevity,
- * hereafter we call just 'large' these bursts. BFQ *does not*
- * weight-raise queues whose creation occurs in a large burst. In
- * addition, for each of these queues BFQ performs or does not perform
- * idling depending on which choice boosts the throughput more. The
- * exact choice depends on the device and request pattern at
- * hand.
- *
- * Unfortunately, false positives may occur while an interactive task
- * is starting (e.g., an application is being started). The
- * consequence is that the queues associated with the task do not
- * enjoy weight raising as expected. Fortunately these false positives
- * are very rare. They typically occur if some service happens to
- * start doing I/O exactly when the interactive task starts.
- *
- * Turning back to the next function, it implements all the steps
- * needed to detect the occurrence of a large burst and to properly
- * mark all the queues belonging to it (so that they can then be
- * treated in a different way). This goal is achieved by maintaining a
- * "burst list" that holds, temporarily, the queues that belong to the
- * burst in progress. The list is then used to mark these queues as
- * belonging to a large burst if the burst does become large. The main
- * steps are the following.
- *
- * . when the very first queue is created, the queue is inserted into the
- * list (as it could be the first queue in a possible burst)
- *
- * . if the current burst has not yet become large, and a queue Q that does
- * not yet belong to the burst is activated shortly after the last time
- * at which a new queue entered the burst list, then the function appends
- * Q to the burst list
- *
- * . if, as a consequence of the previous step, the burst size reaches
- * the large-burst threshold, then
- *
- * . all the queues in the burst list are marked as belonging to a
- * large burst
- *
- * . the burst list is deleted; in fact, the burst list already served
- * its purpose (keeping temporarily track of the queues in a burst,
- * so as to be able to mark them as belonging to a large burst in the
- * previous sub-step), and now is not needed any more
- *
- * . the device enters a large-burst mode
- *
- * . if a queue Q that does not belong to the burst is created while
- * the device is in large-burst mode and shortly after the last time
- * at which a queue either entered the burst list or was marked as
- * belonging to the current large burst, then Q is immediately marked
- * as belonging to a large burst.
- *
- * . if a queue Q that does not belong to the burst is created a while
- * later, i.e., not shortly after, than the last time at which a queue
- * either entered the burst list or was marked as belonging to the
- * current large burst, then the current burst is deemed as finished and:
- *
- * . the large-burst mode is reset if set
- *
- * . the burst list is emptied
- *
- * . Q is inserted in the burst list, as Q may be the first queue
- * in a possible new burst (then the burst list contains just Q
- * after this step).
- */
-static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- /*
- * If bfqq is already in the burst list or is part of a large
- * burst, or finally has just been split, then there is
- * nothing else to do.
- */
- if (!hlist_unhashed(&bfqq->burst_list_node) ||
- bfq_bfqq_in_large_burst(bfqq) ||
- time_is_after_eq_jiffies(bfqq->split_time +
- msecs_to_jiffies(10)))
- return;
-
- /*
- * If bfqq's creation happens late enough, or bfqq belongs to
- * a different group than the burst group, then the current
- * burst is finished, and related data structures must be
- * reset.
- *
- * In this respect, consider the special case where bfqq is
- * the very first queue created after BFQ is selected for this
- * device. In this case, last_ins_in_burst and
- * burst_parent_entity are not yet significant when we get
- * here. But it is easy to verify that, whether or not the
- * following condition is true, bfqq will end up being
- * inserted into the burst list. In particular the list will
- * happen to contain only bfqq. And this is exactly what has
- * to happen, as bfqq may be the first queue of the first
- * burst.
- */
- if (time_is_before_jiffies(bfqd->last_ins_in_burst +
- bfqd->bfq_burst_interval) ||
- bfqq->entity.parent != bfqd->burst_parent_entity) {
- bfqd->large_burst = false;
- bfq_reset_burst_list(bfqd, bfqq);
- bfq_log_bfqq(bfqd, bfqq,
- "handle_burst: late activation or different group");
- goto end;
- }
-
- /*
- * If we get here, then bfqq is being activated shortly after the
- * last queue. So, if the current burst is also large, we can mark
- * bfqq as belonging to this large burst immediately.
- */
- if (bfqd->large_burst) {
- bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst");
- bfq_mark_bfqq_in_large_burst(bfqq);
- goto end;
- }
-
- /*
- * If we get here, then a large-burst state has not yet been
- * reached, but bfqq is being activated shortly after the last
- * queue. Then we add bfqq to the burst.
- */
- bfq_add_to_burst(bfqd, bfqq);
-end:
- /*
- * At this point, bfqq either has been added to the current
- * burst or has caused the current burst to terminate and a
- * possible new burst to start. In particular, in the second
- * case, bfqq has become the first queue in the possible new
- * burst. In both cases last_ins_in_burst needs to be moved
- * forward.
- */
- bfqd->last_ins_in_burst = jiffies;
-
-}
-
-static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
-{
- struct bfq_entity *entity = &bfqq->entity;
-
- return entity->budget - entity->service;
-}
-
-/*
- * If enough samples have been computed, return the current max budget
- * stored in bfqd, which is dynamically updated according to the
- * estimated disk peak rate; otherwise return the default max budget
- */
-static int bfq_max_budget(struct bfq_data *bfqd)
-{
- if (bfqd->budgets_assigned < bfq_stats_min_budgets)
- return bfq_default_max_budget;
- else
- return bfqd->bfq_max_budget;
-}
-
-/*
- * Return min budget, which is a fraction of the current or default
- * max budget (trying with 1/32)
- */
-static int bfq_min_budget(struct bfq_data *bfqd)
-{
- if (bfqd->budgets_assigned < bfq_stats_min_budgets)
- return bfq_default_max_budget / 32;
- else
- return bfqd->bfq_max_budget / 32;
-}
-
-static void bfq_bfqq_expire(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- bool compensate,
- enum bfqq_expiration reason);
-
-/*
- * The next function, invoked after the input queue bfqq switches from
- * idle to busy, updates the budget of bfqq. The function also tells
- * whether the in-service queue should be expired, by returning
- * true. The purpose of expiring the in-service queue is to give bfqq
- * the chance to possibly preempt the in-service queue, and the reason
- * for preempting the in-service queue is to achieve one of the two
- * goals below.
- *
- * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
- * expired because it has remained idle. In particular, bfqq may have
- * expired for one of the following two reasons:
- *
- * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and
- * did not make it to issue a new request before its last request
- * was served;
- *
- * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue
- * a new request before the expiration of the idling-time.
- *
- * Even if bfqq has expired for one of the above reasons, the process
- * associated with the queue may be however issuing requests greedily,
- * and thus be sensitive to the bandwidth it receives (bfqq may have
- * remained idle for other reasons: CPU high load, bfqq not enjoying
- * idling, I/O throttling somewhere in the path from the process to
- * the I/O scheduler, ...). But if, after every expiration for one of
- * the above two reasons, bfqq has to wait for the service of at least
- * one full budget of another queue before being served again, then
- * bfqq is likely to get a much lower bandwidth or resource time than
- * its reserved ones. To address this issue, two countermeasures need
- * to be taken.
- *
- * First, the budget and the timestamps of bfqq need to be updated in
- * a special way on bfqq reactivation: they need to be updated as if
- * bfqq did not remain idle and did not expire. In fact, if they are
- * computed as if bfqq expired and remained idle until reactivation,
- * then the process associated with bfqq is treated as if, instead of
- * being greedy, it stopped issuing requests when bfqq remained idle,
- * and restarts issuing requests only on this reactivation. In other
- * words, the scheduler does not help the process recover the "service
- * hole" between bfqq expiration and reactivation. As a consequence,
- * the process receives a lower bandwidth than its reserved one. In
- * contrast, to recover this hole, the budget must be updated as if
- * bfqq was not expired at all before this reactivation, i.e., it must
- * be set to the value of the remaining budget when bfqq was
- * expired. Along the same line, timestamps need to be assigned the
- * value they had the last time bfqq was selected for service, i.e.,
- * before last expiration. Thus timestamps need to be back-shifted
- * with respect to their normal computation (see [1] for more details
- * on this tricky aspect).
- *
- * Secondly, to allow the process to recover the hole, the in-service
- * queue must be expired too, to give bfqq the chance to preempt it
- * immediately. In fact, if bfqq has to wait for a full budget of the
- * in-service queue to be completed, then it may become impossible to
- * let the process recover the hole, even if the back-shifted
- * timestamps of bfqq are lower than those of the in-service queue. If
- * this happens for most or all of the holes, then the process may not
- * receive its reserved bandwidth. In this respect, it is worth noting
- * that, being the service of outstanding requests unpreemptible, a
- * little fraction of the holes may however be unrecoverable, thereby
- * causing a little loss of bandwidth.
- *
- * The last important point is detecting whether bfqq does need this
- * bandwidth recovery. In this respect, the next function deems the
- * process associated with bfqq greedy, and thus allows it to recover
- * the hole, if: 1) the process is waiting for the arrival of a new
- * request (which implies that bfqq expired for one of the above two
- * reasons), and 2) such a request has arrived soon. The first
- * condition is controlled through the flag non_blocking_wait_rq,
- * while the second through the flag arrived_in_time. If both
- * conditions hold, then the function computes the budget in the
- * above-described special way, and signals that the in-service queue
- * should be expired. Timestamp back-shifting is done later in
- * __bfq_activate_entity.
- *
- * 2. Reduce latency. Even if timestamps are not backshifted to let
- * the process associated with bfqq recover a service hole, bfqq may
- * however happen to have, after being (re)activated, a lower finish
- * timestamp than the in-service queue. That is, the next budget of
- * bfqq may have to be completed before the one of the in-service
- * queue. If this is the case, then preempting the in-service queue
- * allows this goal to be achieved, apart from the unpreemptible,
- * outstanding requests mentioned above.
- *
- * Unfortunately, regardless of which of the above two goals one wants
- * to achieve, service trees need first to be updated to know whether
- * the in-service queue must be preempted. To have service trees
- * correctly updated, the in-service queue must be expired and
- * rescheduled, and bfqq must be scheduled too. This is one of the
- * most costly operations (in future versions, the scheduling
- * mechanism may be re-designed in such a way to make it possible to
- * know whether preemption is needed without needing to update service
- * trees). In addition, queue preemptions almost always cause random
- * I/O, and thus loss of throughput. Because of these facts, the next
- * function adopts the following simple scheme to avoid both costly
- * operations and too frequent preemptions: it requests the expiration
- * of the in-service queue (unconditionally) only for queues that need
- * to recover a hole, or that either are weight-raised or deserve to
- * be weight-raised.
- */
-static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- bool arrived_in_time,
- bool wr_or_deserves_wr)
-{
- struct bfq_entity *entity = &bfqq->entity;
-
- if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
- /*
- * We do not clear the flag non_blocking_wait_rq here, as
- * the latter is used in bfq_activate_bfqq to signal
- * that timestamps need to be back-shifted (and is
- * cleared right after).
- */
-
- /*
- * In next assignment we rely on that either
- * entity->service or entity->budget are not updated
- * on expiration if bfqq is empty (see
- * __bfq_bfqq_recalc_budget). Thus both quantities
- * remain unchanged after such an expiration, and the
- * following statement therefore assigns to
- * entity->budget the remaining budget on such an
- * expiration. For clarity, entity->service is not
- * updated on expiration in any case, and, in normal
- * operation, is reset only when bfqq is selected for
- * service (see bfq_get_next_queue).
- */
- BUG_ON(bfqq->max_budget < 0);
- entity->budget = min_t(unsigned long,
- bfq_bfqq_budget_left(bfqq),
- bfqq->max_budget);
-
- BUG_ON(entity->budget < 0);
- return true;
- }
-
- BUG_ON(bfqq->max_budget < 0);
- entity->budget = max_t(unsigned long, bfqq->max_budget,
- bfq_serv_to_charge(bfqq->next_rq, bfqq));
- BUG_ON(entity->budget < 0);
-
- bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
- return wr_or_deserves_wr;
-}
-
-static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- unsigned int old_wr_coeff,
- bool wr_or_deserves_wr,
- bool interactive,
- bool in_burst,
- bool soft_rt)
-{
- if (old_wr_coeff == 1 && wr_or_deserves_wr) {
- /* start a weight-raising period */
- if (interactive) {
- bfqq->wr_coeff = bfqd->bfq_wr_coeff;
- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
- } else {
- bfqq->wr_start_at_switch_to_srt = jiffies;
- bfqq->wr_coeff = bfqd->bfq_wr_coeff *
- BFQ_SOFTRT_WEIGHT_FACTOR;
- bfqq->wr_cur_max_time =
- bfqd->bfq_wr_rt_max_time;
- }
- /*
- * If needed, further reduce budget to make sure it is
- * close to bfqq's backlog, so as to reduce the
- * scheduling-error component due to a too large
- * budget. Do not care about throughput consequences,
- * but only about latency. Finally, do not assign a
- * too small budget either, to avoid increasing
- * latency by causing too frequent expirations.
- */
- bfqq->entity.budget = min_t(unsigned long,
- bfqq->entity.budget,
- 2 * bfq_min_budget(bfqd));
-
- bfq_log_bfqq(bfqd, bfqq,
- "wrais starting at %lu, rais_max_time %u",
- jiffies,
- jiffies_to_msecs(bfqq->wr_cur_max_time));
- } else if (old_wr_coeff > 1) {
- if (interactive) { /* update wr coeff and duration */
- bfqq->wr_coeff = bfqd->bfq_wr_coeff;
- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
- } else if (in_burst) {
- bfqq->wr_coeff = 1;
- bfq_log_bfqq(bfqd, bfqq,
- "wrais ending at %lu, rais_max_time %u",
- jiffies,
- jiffies_to_msecs(bfqq->
- wr_cur_max_time));
- } else if (soft_rt) {
- /*
- * The application is now or still meeting the
- * requirements for being deemed soft rt. We
- * can then correctly and safely (re)charge
- * the weight-raising duration for the
- * application with the weight-raising
- * duration for soft rt applications.
- *
- * In particular, doing this recharge now, i.e.,
- * before the weight-raising period for the
- * application finishes, reduces the probability
- * of the following negative scenario:
- * 1) the weight of a soft rt application is
- * raised at startup (as for any newly
- * created application),
- * 2) since the application is not interactive,
- * at a certain time weight-raising is
- * stopped for the application,
- * 3) at that time the application happens to
- * still have pending requests, and hence
- * is destined to not have a chance to be
- * deemed soft rt before these requests are
- * completed (see the comments to the
- * function bfq_bfqq_softrt_next_start()
- * for details on soft rt detection),
- * 4) these pending requests experience a high
- * latency because the application is not
- * weight-raised while they are pending.
- */
- if (bfqq->wr_cur_max_time !=
- bfqd->bfq_wr_rt_max_time) {
- bfqq->wr_start_at_switch_to_srt =
- bfqq->last_wr_start_finish;
- BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
-
- bfqq->wr_cur_max_time =
- bfqd->bfq_wr_rt_max_time;
- bfqq->wr_coeff = bfqd->bfq_wr_coeff *
- BFQ_SOFTRT_WEIGHT_FACTOR;
- bfq_log_bfqq(bfqd, bfqq,
- "switching to soft_rt wr");
- } else
- bfq_log_bfqq(bfqd, bfqq,
- "moving forward soft_rt wr duration");
- bfqq->last_wr_start_finish = jiffies;
- }
- }
-}
-
-static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- return bfqq->dispatched == 0 &&
- time_is_before_jiffies(
- bfqq->budget_timeout +
- bfqd->bfq_wr_min_idle_time);
-}
-
-static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- int old_wr_coeff,
- struct request *rq,
- bool *interactive)
-{
- bool soft_rt, in_burst, wr_or_deserves_wr,
- bfqq_wants_to_preempt,
- idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
- /*
- * See the comments on
- * bfq_bfqq_update_budg_for_activation for
- * details on the usage of the next variable.
- */
- arrived_in_time = ktime_get_ns() <=
- RQ_BIC(rq)->ttime.last_end_request +
- bfqd->bfq_slice_idle * 3;
-
- bfq_log_bfqq(bfqd, bfqq,
- "bfq_add_request non-busy: "
- "jiffies %lu, in_time %d, idle_long %d busyw %d "
- "wr_coeff %u",
- jiffies, arrived_in_time,
- idle_for_long_time,
- bfq_bfqq_non_blocking_wait_rq(bfqq),
- old_wr_coeff);
-
- BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
- BUG_ON(bfqq == bfqd->in_service_queue);
- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
- req_op(rq), rq->cmd_flags);
-
- /*
- * bfqq deserves to be weight-raised if:
- * - it is sync,
- * - it does not belong to a large burst,
- * - it has been idle for enough time or is soft real-time,
- * - is linked to a bfq_io_cq (it is not shared in any sense)
- */
- in_burst = bfq_bfqq_in_large_burst(bfqq);
- soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
- !in_burst &&
- time_is_before_jiffies(bfqq->soft_rt_next_start);
- *interactive =
- !in_burst &&
- idle_for_long_time;
- wr_or_deserves_wr = bfqd->low_latency &&
- (bfqq->wr_coeff > 1 ||
- (bfq_bfqq_sync(bfqq) &&
- bfqq->bic && (*interactive || soft_rt)));
-
- bfq_log_bfqq(bfqd, bfqq,
- "bfq_add_request: "
- "in_burst %d, "
- "soft_rt %d (next %lu), inter %d, bic %p",
- bfq_bfqq_in_large_burst(bfqq), soft_rt,
- bfqq->soft_rt_next_start,
- *interactive,
- bfqq->bic);
-
- /*
- * Using the last flag, update budget and check whether bfqq
- * may want to preempt the in-service queue.
- */
- bfqq_wants_to_preempt =
- bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
- arrived_in_time,
- wr_or_deserves_wr);
-
- /*
- * If bfqq happened to be activated in a burst, but has been
- * idle for much more than an interactive queue, then we
- * assume that, in the overall I/O initiated in the burst, the
- * I/O associated with bfqq is finished. So bfqq does not need
- * to be treated as a queue belonging to a burst
- * anymore. Accordingly, we reset bfqq's in_large_burst flag
- * if set, and remove bfqq from the burst list if it's
- * there. We do not decrement burst_size, because the fact
- * that bfqq does not need to belong to the burst list any
- * more does not invalidate the fact that bfqq was created in
- * a burst.
- */
- if (likely(!bfq_bfqq_just_created(bfqq)) &&
- idle_for_long_time &&
- time_is_before_jiffies(
- bfqq->budget_timeout +
- msecs_to_jiffies(10000))) {
- hlist_del_init(&bfqq->burst_list_node);
- bfq_clear_bfqq_in_large_burst(bfqq);
- }
-
- bfq_clear_bfqq_just_created(bfqq);
-
- if (!bfq_bfqq_IO_bound(bfqq)) {
- if (arrived_in_time) {
- bfqq->requests_within_timer++;
- if (bfqq->requests_within_timer >=
- bfqd->bfq_requests_within_timer)
- bfq_mark_bfqq_IO_bound(bfqq);
- } else
- bfqq->requests_within_timer = 0;
- bfq_log_bfqq(bfqd, bfqq, "requests in time %d",
- bfqq->requests_within_timer);
- }
-
- if (bfqd->low_latency) {
- if (unlikely(time_is_after_jiffies(bfqq->split_time)))
- /* wraparound */
- bfqq->split_time =
- jiffies - bfqd->bfq_wr_min_idle_time - 1;
-
- if (time_is_before_jiffies(bfqq->split_time +
- bfqd->bfq_wr_min_idle_time)) {
- bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
- old_wr_coeff,
- wr_or_deserves_wr,
- *interactive,
- in_burst,
- soft_rt);
-
- if (old_wr_coeff != bfqq->wr_coeff)
- bfqq->entity.prio_changed = 1;
- }
- }
-
- bfqq->last_idle_bklogged = jiffies;
- bfqq->service_from_backlogged = 0;
- bfq_clear_bfqq_softrt_update(bfqq);
-
- bfq_add_bfqq_busy(bfqd, bfqq);
-
- /*
- * Expire in-service queue only if preemption may be needed
- * for guarantees. In this respect, the function
- * next_queue_may_preempt just checks a simple, necessary
- * condition, and not a sufficient condition based on
- * timestamps. In fact, for the latter condition to be
- * evaluated, timestamps would need first to be updated, and
- * this operation is quite costly (see the comments on the
- * function bfq_bfqq_update_budg_for_activation).
- */
- if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
- bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
- next_queue_may_preempt(bfqd)) {
- struct bfq_queue *in_serv =
- bfqd->in_service_queue;
- BUG_ON(in_serv == bfqq);
-
- bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
- false, BFQ_BFQQ_PREEMPTED);
- }
-}
-
-static void bfq_add_request(struct request *rq)
-{
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
- struct bfq_data *bfqd = bfqq->bfqd;
- struct request *next_rq, *prev;
- unsigned int old_wr_coeff = bfqq->wr_coeff;
- bool interactive = false;
-
- bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s",
- blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A");
-
- if (bfqq->wr_coeff > 1) /* queue is being weight-raised */
- bfq_log_bfqq(bfqd, bfqq,
- "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
- jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
- jiffies_to_msecs(bfqq->wr_cur_max_time),
- bfqq->wr_coeff,
- bfqq->entity.weight, bfqq->entity.orig_weight);
-
- bfqq->queued[rq_is_sync(rq)]++;
- bfqd->queued++;
-
- elv_rb_add(&bfqq->sort_list, rq);
-
- /*
- * Check if this request is a better next-to-serve candidate.
- */
- prev = bfqq->next_rq;
- next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
- BUG_ON(!next_rq);
- bfqq->next_rq = next_rq;
-
- /*
- * Adjust priority tree position, if next_rq changes.
- */
- if (prev != bfqq->next_rq)
- bfq_pos_tree_add_move(bfqd, bfqq);
-
- if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
- bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
- rq, &interactive);
- else {
- if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
- time_is_before_jiffies(
- bfqq->last_wr_start_finish +
- bfqd->bfq_wr_min_inter_arr_async)) {
- bfqq->wr_coeff = bfqd->bfq_wr_coeff;
- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-
- bfqd->wr_busy_queues++;
- BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues);
- bfqq->entity.prio_changed = 1;
- bfq_log_bfqq(bfqd, bfqq,
- "non-idle wrais starting, "
- "wr_max_time %u wr_busy %d",
- jiffies_to_msecs(bfqq->wr_cur_max_time),
- bfqd->wr_busy_queues);
- }
- if (prev != bfqq->next_rq)
- bfq_updated_next_req(bfqd, bfqq);
- }
-
- /*
- * Assign jiffies to last_wr_start_finish in the following
- * cases:
- *
- * . if bfqq is not going to be weight-raised, because, for
- * non weight-raised queues, last_wr_start_finish stores the
- * arrival time of the last request; as of now, this piece
- * of information is used only for deciding whether to
- * weight-raise async queues
- *
- * . if bfqq is not weight-raised, because, if bfqq is now
- * switching to weight-raised, then last_wr_start_finish
- * stores the time when weight-raising starts
- *
- * . if bfqq is interactive, because, regardless of whether
- * bfqq is currently weight-raised, the weight-raising
- * period must start or restart (this case is considered
- * separately because it is not detected by the above
- * conditions, if bfqq is already weight-raised)
- *
- * last_wr_start_finish has to be updated also if bfqq is soft
- * real-time, because the weight-raising period is constantly
- * restarted on idle-to-busy transitions for these queues, but
- * this is already done in bfq_bfqq_handle_idle_busy_switch if
- * needed.
- */
- if (bfqd->low_latency &&
- (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
- bfqq->last_wr_start_finish = jiffies;
-}
-
-static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
- struct bio *bio)
-{
- struct task_struct *tsk = current;
- struct bfq_io_cq *bic;
- struct bfq_queue *bfqq;
-
- bic = bfq_bic_lookup(bfqd, tsk->io_context);
- if (!bic)
- return NULL;
-
- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
- if (bfqq)
- return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
-
- return NULL;
-}
-
-static sector_t get_sdist(sector_t last_pos, struct request *rq)
-{
- sector_t sdist = 0;
-
- if (last_pos) {
- if (last_pos < blk_rq_pos(rq))
- sdist = blk_rq_pos(rq) - last_pos;
- else
- sdist = last_pos - blk_rq_pos(rq);
- }
-
- return sdist;
-}
-
-static void bfq_activate_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
- bfqd->rq_in_driver++;
-}
-
-static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
-
- BUG_ON(bfqd->rq_in_driver == 0);
- bfqd->rq_in_driver--;
-}
-
-static void bfq_remove_request(struct request *rq)
-{
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
- struct bfq_data *bfqd = bfqq->bfqd;
- const int sync = rq_is_sync(rq);
-
- BUG_ON(bfqq->entity.service > bfqq->entity.budget &&
- bfqq == bfqd->in_service_queue);
-
- if (bfqq->next_rq == rq) {
- bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
- bfq_updated_next_req(bfqd, bfqq);
- }
-
- if (rq->queuelist.prev != &rq->queuelist)
- list_del_init(&rq->queuelist);
- BUG_ON(bfqq->queued[sync] == 0);
- bfqq->queued[sync]--;
- bfqd->queued--;
- elv_rb_del(&bfqq->sort_list, rq);
-
- if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
- bfqq->next_rq = NULL;
-
- BUG_ON(bfqq->entity.budget < 0);
-
- if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
- BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */
- bfq_del_bfqq_busy(bfqd, bfqq, false);
- /*
- * bfqq emptied. In normal operation, when
- * bfqq is empty, bfqq->entity.service and
- * bfqq->entity.budget must contain,
- * respectively, the service received and the
- * budget used last time bfqq emptied. These
- * facts do not hold in this case, as at least
- * this last removal occurred while bfqq is
- * not in service. To avoid inconsistencies,
- * reset both bfqq->entity.service and
- * bfqq->entity.budget, if bfqq has still a
- * process that may issue I/O requests to it.
- */
- bfqq->entity.budget = bfqq->entity.service = 0;
- }
-
- /*
- * Remove queue from request-position tree as it is empty.
- */
- if (bfqq->pos_root) {
- rb_erase(&bfqq->pos_node, bfqq->pos_root);
- bfqq->pos_root = NULL;
- }
- }
-
- if (rq->cmd_flags & REQ_META) {
- BUG_ON(bfqq->meta_pending == 0);
- bfqq->meta_pending--;
- }
- bfqg_stats_update_io_remove(bfqq_group(bfqq), req_op(rq),
- rq->cmd_flags);
-}
-
-static int bfq_merge(struct request_queue *q, struct request **req,
- struct bio *bio)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
- struct request *__rq;
-
- __rq = bfq_find_rq_fmerge(bfqd, bio);
- if (__rq && elv_bio_merge_ok(__rq, bio)) {
- *req = __rq;
- return ELEVATOR_FRONT_MERGE;
- }
-
- return ELEVATOR_NO_MERGE;
-}
-
-static void bfq_merged_request(struct request_queue *q, struct request *req,
- int type)
-{
- if (type == ELEVATOR_FRONT_MERGE &&
- rb_prev(&req->rb_node) &&
- blk_rq_pos(req) <
- blk_rq_pos(container_of(rb_prev(&req->rb_node),
- struct request, rb_node))) {
- struct bfq_queue *bfqq = RQ_BFQQ(req);
- struct bfq_data *bfqd = bfqq->bfqd;
- struct request *prev, *next_rq;
-
- /* Reposition request in its sort_list */
- elv_rb_del(&bfqq->sort_list, req);
- elv_rb_add(&bfqq->sort_list, req);
- /* Choose next request to be served for bfqq */
- prev = bfqq->next_rq;
- next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
- bfqd->last_position);
- BUG_ON(!next_rq);
- bfqq->next_rq = next_rq;
- /*
- * If next_rq changes, update both the queue's budget to
- * fit the new request and the queue's position in its
- * rq_pos_tree.
- */
- if (prev != bfqq->next_rq) {
- bfq_updated_next_req(bfqd, bfqq);
- bfq_pos_tree_add_move(bfqd, bfqq);
- }
- }
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static void bfq_bio_merged(struct request_queue *q, struct request *req,
- struct bio *bio)
-{
- bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio_op(bio),
- bio->bi_opf);
-}
-#endif
-
-static void bfq_merged_requests(struct request_queue *q, struct request *rq,
- struct request *next)
-{
- struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
-
- /*
- * If next and rq belong to the same bfq_queue and next is older
- * than rq, then reposition rq in the fifo (by substituting next
- * with rq). Otherwise, if next and rq belong to different
- * bfq_queues, never reposition rq: in fact, we would have to
- * reposition it with respect to next's position in its own fifo,
- * which would most certainly be too expensive with respect to
- * the benefits.
- */
- if (bfqq == next_bfqq &&
- !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
- next->fifo_time < rq->fifo_time) {
- list_del_init(&rq->queuelist);
- list_replace_init(&next->queuelist, &rq->queuelist);
- rq->fifo_time = next->fifo_time;
- }
-
- if (bfqq->next_rq == next)
- bfqq->next_rq = rq;
-
- bfq_remove_request(next);
- bfqg_stats_update_io_merged(bfqq_group(bfqq), req_op(next),
- next->cmd_flags);
-}
-
-/* Must be called with bfqq != NULL */
-static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
-{
- BUG_ON(!bfqq);
-
- if (bfq_bfqq_busy(bfqq)) {
- bfqq->bfqd->wr_busy_queues--;
- BUG_ON(bfqq->bfqd->wr_busy_queues < 0);
- }
- bfqq->wr_coeff = 1;
- bfqq->wr_cur_max_time = 0;
- bfqq->last_wr_start_finish = jiffies;
- /*
- * Trigger a weight change on the next invocation of
- * __bfq_entity_update_weight_prio.
- */
- bfqq->entity.prio_changed = 1;
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "end_wr: wrais ending at %lu, rais_max_time %u",
- bfqq->last_wr_start_finish,
- jiffies_to_msecs(bfqq->wr_cur_max_time));
- bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d",
- bfqq->bfqd->wr_busy_queues);
-}
-
-static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
- struct bfq_group *bfqg)
-{
- int i, j;
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < IOPRIO_BE_NR; j++)
- if (bfqg->async_bfqq[i][j])
- bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
- if (bfqg->async_idle_bfqq)
- bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
-}
-
-static void bfq_end_wr(struct bfq_data *bfqd)
-{
- struct bfq_queue *bfqq;
-
- spin_lock_irq(bfqd->queue->queue_lock);
-
- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
- bfq_bfqq_end_wr(bfqq);
- list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
- bfq_bfqq_end_wr(bfqq);
- bfq_end_wr_async(bfqd);
-
- spin_unlock_irq(bfqd->queue->queue_lock);
-}
-
-static sector_t bfq_io_struct_pos(void *io_struct, bool request)
-{
- if (request)
- return blk_rq_pos(io_struct);
- else
- return ((struct bio *)io_struct)->bi_iter.bi_sector;
-}
-
-static int bfq_rq_close_to_sector(void *io_struct, bool request,
- sector_t sector)
-{
- return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
- BFQQ_CLOSE_THR;
-}
-
-static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- sector_t sector)
-{
- struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
- struct rb_node *parent, *node;
- struct bfq_queue *__bfqq;
-
- if (RB_EMPTY_ROOT(root))
- return NULL;
-
- /*
- * First, if we find a request starting at the end of the last
- * request, choose it.
- */
- __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
- if (__bfqq)
- return __bfqq;
-
- /*
- * If the exact sector wasn't found, the parent of the NULL leaf
- * will contain the closest sector (rq_pos_tree sorted by
- * next_request position).
- */
- __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
- if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
- return __bfqq;
-
- if (blk_rq_pos(__bfqq->next_rq) < sector)
- node = rb_next(&__bfqq->pos_node);
- else
- node = rb_prev(&__bfqq->pos_node);
- if (!node)
- return NULL;
-
- __bfqq = rb_entry(node, struct bfq_queue, pos_node);
- if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
- return __bfqq;
-
- return NULL;
-}
-
-static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
- struct bfq_queue *cur_bfqq,
- sector_t sector)
-{
- struct bfq_queue *bfqq;
-
- /*
- * We shall notice if some of the queues are cooperating,
- * e.g., working closely on the same area of the device. In
- * that case, we can group them together and: 1) don't waste
- * time idling, and 2) serve the union of their requests in
- * the best possible order for throughput.
- */
- bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
- if (!bfqq || bfqq == cur_bfqq)
- return NULL;
-
- return bfqq;
-}
-
-static struct bfq_queue *
-bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
-{
- int process_refs, new_process_refs;
- struct bfq_queue *__bfqq;
-
- /*
- * If there are no process references on the new_bfqq, then it is
- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
- * may have dropped their last reference (not just their last process
- * reference).
- */
- if (!bfqq_process_refs(new_bfqq))
- return NULL;
-
- /* Avoid a circular list and skip interim queue merges. */
- while ((__bfqq = new_bfqq->new_bfqq)) {
- if (__bfqq == bfqq)
- return NULL;
- new_bfqq = __bfqq;
- }
-
- process_refs = bfqq_process_refs(bfqq);
- new_process_refs = bfqq_process_refs(new_bfqq);
- /*
- * If the process for the bfqq has gone away, there is no
- * sense in merging the queues.
- */
- if (process_refs == 0 || new_process_refs == 0)
- return NULL;
-
- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
- new_bfqq->pid);
-
- /*
- * Merging is just a redirection: the requests of the process
- * owning one of the two queues are redirected to the other queue.
- * The latter queue, in its turn, is set as shared if this is the
- * first time that the requests of some process are redirected to
- * it.
- *
- * We redirect bfqq to new_bfqq and not the opposite, because we
- * are in the context of the process owning bfqq, hence we have
- * the io_cq of this process. So we can immediately configure this
- * io_cq to redirect the requests of the process to new_bfqq.
- *
- * NOTE, even if new_bfqq coincides with the in-service queue, the
- * io_cq of new_bfqq is not available, because, if the in-service
- * queue is shared, bfqd->in_service_bic may not point to the
- * io_cq of the in-service queue.
- * Redirecting the requests of the process owning bfqq to the
- * currently in-service queue is in any case the best option, as
- * we feed the in-service queue with new requests close to the
- * last request served and, by doing so, hopefully increase the
- * throughput.
- */
- bfqq->new_bfqq = new_bfqq;
- new_bfqq->ref += process_refs;
- return new_bfqq;
-}
-
-static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
- struct bfq_queue *new_bfqq)
-{
- if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
- (bfqq->ioprio_class != new_bfqq->ioprio_class))
- return false;
-
- /*
- * If either of the queues has already been detected as seeky,
- * then merging it with the other queue is unlikely to lead to
- * sequential I/O.
- */
- if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
- return false;
-
- /*
- * Interleaved I/O is known to be done by (some) applications
- * only for reads, so it does not make sense to merge async
- * queues.
- */
- if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
- return false;
-
- return true;
-}
-
-/*
- * If this function returns true, then bfqq cannot be merged. The idea
- * is that true cooperation happens very early after processes start
- * to do I/O. Usually, late cooperations are just accidental false
- * positives. In case bfqq is weight-raised, such false positives
- * would evidently degrade latency guarantees for bfqq.
- */
-static bool wr_from_too_long(struct bfq_queue *bfqq)
-{
- return bfqq->wr_coeff > 1 &&
- time_is_before_jiffies(bfqq->last_wr_start_finish +
- msecs_to_jiffies(100));
-}
-
-/*
- * Attempt to schedule a merge of bfqq with the currently in-service
- * queue or with a close queue among the scheduled queues. Return
- * NULL if no merge was scheduled, a pointer to the shared bfq_queue
- * structure otherwise.
- *
- * The OOM queue is not allowed to participate to cooperation: in fact, since
- * the requests temporarily redirected to the OOM queue could be redirected
- * again to dedicated queues at any time, the state needed to correctly
- * handle merging with the OOM queue would be quite complex and expensive
- * to maintain. Besides, in such a critical condition as an out of memory,
- * the benefits of queue merging may be little relevant, or even negligible.
- *
- * Weight-raised queues can be merged only if their weight-raising
- * period has just started. In fact cooperating processes are usually
- * started together. Thus, with this filter we avoid false positives
- * that would jeopardize low-latency guarantees.
- *
- * WARNING: queue merging may impair fairness among non-weight raised
- * queues, for at least two reasons: 1) the original weight of a
- * merged queue may change during the merged state, 2) even being the
- * weight the same, a merged queue may be bloated with many more
- * requests than the ones produced by its originally-associated
- * process.
- */
-static struct bfq_queue *
-bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- void *io_struct, bool request)
-{
- struct bfq_queue *in_service_bfqq, *new_bfqq;
-
- if (bfqq->new_bfqq)
- return bfqq->new_bfqq;
-
- if (io_struct && wr_from_too_long(bfqq) &&
- likely(bfqq != &bfqd->oom_bfqq))
- bfq_log_bfqq(bfqd, bfqq,
- "would have looked for coop, but bfq%d wr",
- bfqq->pid);
-
- if (!io_struct ||
- wr_from_too_long(bfqq) ||
- unlikely(bfqq == &bfqd->oom_bfqq))
- return NULL;
-
- /* If there is only one backlogged queue, don't search. */
- if (bfqd->busy_queues == 1)
- return NULL;
-
- in_service_bfqq = bfqd->in_service_queue;
-
- if (in_service_bfqq && in_service_bfqq != bfqq &&
- bfqd->in_service_bic && wr_from_too_long(in_service_bfqq)
- && likely(in_service_bfqq == &bfqd->oom_bfqq))
- bfq_log_bfqq(bfqd, bfqq,
- "would have tried merge with in-service-queue, but wr");
-
- if (!in_service_bfqq || in_service_bfqq == bfqq ||
- !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
- unlikely(in_service_bfqq == &bfqd->oom_bfqq))
- goto check_scheduled;
-
- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
- bfqq->entity.parent == in_service_bfqq->entity.parent &&
- bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
- new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
- if (new_bfqq)
- return new_bfqq;
- }
- /*
- * Check whether there is a cooperator among currently scheduled
- * queues. The only thing we need is that the bio/request is not
- * NULL, as we need it to establish whether a cooperator exists.
- */
-check_scheduled:
- new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
- bfq_io_struct_pos(io_struct, request));
-
- BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);
-
- if (new_bfqq && wr_from_too_long(new_bfqq) &&
- likely(new_bfqq != &bfqd->oom_bfqq) &&
- bfq_may_be_close_cooperator(bfqq, new_bfqq))
- bfq_log_bfqq(bfqd, bfqq,
- "would have merged with bfq%d, but wr",
- new_bfqq->pid);
-
- if (new_bfqq && !wr_from_too_long(new_bfqq) &&
- likely(new_bfqq != &bfqd->oom_bfqq) &&
- bfq_may_be_close_cooperator(bfqq, new_bfqq))
- return bfq_setup_merge(bfqq, new_bfqq);
-
- return NULL;
-}
-
-static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
-{
- struct bfq_io_cq *bic = bfqq->bic;
-
- /*
- * If !bfqq->bic, the queue is already shared or its requests
- * have already been redirected to a shared queue; both idle window
- * and weight raising state have already been saved. Do nothing.
- */
- if (!bic)
- return;
-
- bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
- bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
- bic->saved_wr_coeff = bfqq->wr_coeff;
- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
- BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
-}
-
-static void bfq_get_bic_reference(struct bfq_queue *bfqq)
-{
- /*
- * If bfqq->bic has a non-NULL value, the bic to which it belongs
- * is about to begin using a shared bfq_queue.
- */
- if (bfqq->bic)
- atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
-}
-
-static void
-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
- struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
-{
- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
- (unsigned long) new_bfqq->pid);
- /* Save weight raising and idle window of the merged queues */
- bfq_bfqq_save_state(bfqq);
- bfq_bfqq_save_state(new_bfqq);
- if (bfq_bfqq_IO_bound(bfqq))
- bfq_mark_bfqq_IO_bound(new_bfqq);
- bfq_clear_bfqq_IO_bound(bfqq);
-
- /*
- * If bfqq is weight-raised, then let new_bfqq inherit
- * weight-raising. To reduce false positives, neglect the case
- * where bfqq has just been created, but has not yet made it
- * to be weight-raised (which may happen because EQM may merge
- * bfqq even before bfq_add_request is executed for the first
- * time for bfqq). Handling this case would however be very
- * easy, thanks to the flag just_created.
- */
- if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
- new_bfqq->wr_coeff = bfqq->wr_coeff;
- new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
- new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
- new_bfqq->wr_start_at_switch_to_srt =
- bfqq->wr_start_at_switch_to_srt;
- if (bfq_bfqq_busy(new_bfqq)) {
- bfqd->wr_busy_queues++;
- BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues);
- }
-
- new_bfqq->entity.prio_changed = 1;
- bfq_log_bfqq(bfqd, new_bfqq,
- "wr start after merge with %d, rais_max_time %u",
- bfqq->pid,
- jiffies_to_msecs(bfqq->wr_cur_max_time));
- }
-
- if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
- bfqq->wr_coeff = 1;
- bfqq->entity.prio_changed = 1;
- if (bfq_bfqq_busy(bfqq)) {
- bfqd->wr_busy_queues--;
- BUG_ON(bfqd->wr_busy_queues < 0);
- }
-
- }
-
- bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
- bfqd->wr_busy_queues);
-
- /*
- * Grab a reference to the bic, to prevent it from being destroyed
- * before being possibly touched by a bfq_split_bfqq().
- */
- bfq_get_bic_reference(bfqq);
- bfq_get_bic_reference(new_bfqq);
- /*
- * Merge queues (that is, let bic redirect its requests to new_bfqq)
- */
- bic_set_bfqq(bic, new_bfqq, 1);
- bfq_mark_bfqq_coop(new_bfqq);
- /*
- * new_bfqq now belongs to at least two bics (it is a shared queue):
- * set new_bfqq->bic to NULL. bfqq either:
- * - does not belong to any bic any more, and hence bfqq->bic must
- * be set to NULL, or
- * - is a queue whose owning bics have already been redirected to a
- * different queue, hence the queue is destined to not belong to
- * any bic soon and bfqq->bic is already NULL (therefore the next
- * assignment causes no harm).
- */
- new_bfqq->bic = NULL;
- bfqq->bic = NULL;
- /* release process reference to bfqq */
- bfq_put_queue(bfqq);
-}
-
-static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
- struct bio *bio)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
- struct bfq_io_cq *bic;
- struct bfq_queue *bfqq, *new_bfqq;
-
- /*
- * Disallow merge of a sync bio into an async request.
- */
- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
- return false;
-
- /*
- * Lookup the bfqq that this bio will be queued with. Allow
- * merge only if rq is queued there.
- * Queue lock is held here.
- */
- bic = bfq_bic_lookup(bfqd, current->io_context);
- if (!bic)
- return false;
-
- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
- /*
- * We take advantage of this function to perform an early merge
- * of the queues of possible cooperating processes.
- */
- if (bfqq) {
- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
- if (new_bfqq) {
- bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
- /*
- * If we get here, the bio will be queued in the
- * shared queue, i.e., new_bfqq, so use new_bfqq
- * to decide whether bio and rq can be merged.
- */
- bfqq = new_bfqq;
- }
- }
-
- return bfqq == RQ_BFQQ(rq);
-}
-
-static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq,
- struct request *next)
-{
- return RQ_BFQQ(rq) == RQ_BFQQ(next);
-}
-
-/*
- * Set the maximum time for the in-service queue to consume its
- * budget. This prevents seeky processes from lowering the throughput.
- * In practice, a time-slice service scheme is used with seeky
- * processes.
- */
-static void bfq_set_budget_timeout(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- unsigned int timeout_coeff;
-
- if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
- timeout_coeff = 1;
- else
- timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
-
- bfqd->last_budget_start = ktime_get();
-
- bfqq->budget_timeout = jiffies +
- bfqd->bfq_timeout * timeout_coeff;
-
- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
- jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
-}
-
-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- if (bfqq) {
- bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
- bfq_mark_bfqq_must_alloc(bfqq);
- bfq_clear_bfqq_fifo_expire(bfqq);
-
- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
-
- BUG_ON(bfqq == bfqd->in_service_queue);
- BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
-
- if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
- bfqq->wr_coeff > 1 &&
- bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
- time_is_before_jiffies(bfqq->budget_timeout)) {
- /*
- * For soft real-time queues, move the start
- * of the weight-raising period forward by the
- * time the queue has not received any
- * service. Otherwise, a relatively long
- * service delay is likely to cause the
- * weight-raising period of the queue to end,
- * because of the short duration of the
- * weight-raising period of a soft real-time
- * queue. It is worth noting that this move
- * is not so dangerous for the other queues,
- * because soft real-time queues are not
- * greedy.
- *
- * To not add a further variable, we use the
- * overloaded field budget_timeout to
- * determine for how long the queue has not
- * received service, i.e., how much time has
- * elapsed since the queue expired. However,
- * this is a little imprecise, because
- * budget_timeout is set to jiffies if bfqq
- * not only expires, but also remains with no
- * request.
- */
- if (time_after(bfqq->budget_timeout,
- bfqq->last_wr_start_finish))
- bfqq->last_wr_start_finish +=
- jiffies - bfqq->budget_timeout;
- else
- bfqq->last_wr_start_finish = jiffies;
-
- if (time_is_after_jiffies(bfqq->last_wr_start_finish)) {
- pr_crit(
- "BFQ WARNING:last %lu budget %lu jiffies %lu",
- bfqq->last_wr_start_finish,
- bfqq->budget_timeout,
- jiffies);
- pr_crit("diff %lu", jiffies -
- max_t(unsigned long,
- bfqq->last_wr_start_finish,
- bfqq->budget_timeout));
- bfqq->last_wr_start_finish = jiffies;
- }
- }
-
- bfq_set_budget_timeout(bfqd, bfqq);
- bfq_log_bfqq(bfqd, bfqq,
- "set_in_service_queue, cur-budget = %d",
- bfqq->entity.budget);
- } else
- bfq_log(bfqd, "set_in_service_queue: NULL");
-
- bfqd->in_service_queue = bfqq;
-}
-
-/*
- * Get and set a new queue for service.
- */
-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
-{
- struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
-
- __bfq_set_in_service_queue(bfqd, bfqq);
- return bfqq;
-}
-
-static void bfq_arm_slice_timer(struct bfq_data *bfqd)
-{
- struct bfq_queue *bfqq = bfqd->in_service_queue;
- struct bfq_io_cq *bic;
- u32 sl;
-
- BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
-
- /* Processes have exited, don't wait. */
- bic = bfqd->in_service_bic;
- if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
- return;
-
- bfq_mark_bfqq_wait_request(bfqq);
-
- /*
- * We don't want to idle for seeks, but we do want to allow
- * fair distribution of slice time for a process doing back-to-back
- * seeks. So allow a little bit of time for him to submit a new rq.
- *
- * To prevent processes with (partly) seeky workloads from
- * being too ill-treated, grant them a small fraction of the
- * assigned budget before reducing the waiting time to
- * BFQ_MIN_TT. This happened to help reduce latency.
- */
- sl = bfqd->bfq_slice_idle;
- /*
- * Unless the queue is being weight-raised or the scenario is
- * asymmetric, grant only minimum idle time if the queue
- * is seeky. A long idling is preserved for a weight-raised
- * queue, or, more in general, in an asymemtric scenario,
- * because a long idling is needed for guaranteeing to a queue
- * its reserved share of the throughput (in particular, it is
- * needed if the queue has a higher weight than some other
- * queue).
- */
- if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
- bfq_symmetric_scenario(bfqd))
- sl = min_t(u32, sl, BFQ_MIN_TT);
-
- bfqd->last_idling_start = ktime_get();
- hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
- HRTIMER_MODE_REL);
- bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
- bfq_log(bfqd, "arm idle: %ld/%ld ms",
- sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
-}
-
-/*
- * In autotuning mode, max_budget is dynamically recomputed as the
- * amount of sectors transferred in timeout at the estimated peak
- * rate. This enables BFQ to utilize a full timeslice with a full
- * budget, even if the in-service queue is served at peak rate. And
- * this maximises throughput with sequential workloads.
- */
-static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
-{
- return (u64)bfqd->peak_rate * USEC_PER_MSEC *
- jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
-}
-
-/*
- * Update parameters related to throughput and responsiveness, as a
- * function of the estimated peak rate. See comments on
- * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
- */
-static void update_thr_responsiveness_params(struct bfq_data *bfqd)
-{
- int dev_type = blk_queue_nonrot(bfqd->queue);
-
- if (bfqd->bfq_user_max_budget == 0) {
- bfqd->bfq_max_budget =
- bfq_calc_max_budget(bfqd);
- BUG_ON(bfqd->bfq_max_budget < 0);
- bfq_log(bfqd, "new max_budget = %d",
- bfqd->bfq_max_budget);
- }
-
- if (bfqd->device_speed == BFQ_BFQD_FAST &&
- bfqd->peak_rate < device_speed_thresh[dev_type]) {
- bfqd->device_speed = BFQ_BFQD_SLOW;
- bfqd->RT_prod = R_slow[dev_type] *
- T_slow[dev_type];
- } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
- bfqd->peak_rate > device_speed_thresh[dev_type]) {
- bfqd->device_speed = BFQ_BFQD_FAST;
- bfqd->RT_prod = R_fast[dev_type] *
- T_fast[dev_type];
- }
-
- bfq_log(bfqd,
-"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
- dev_type == 0 ? "ROT" : "NONROT",
- bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
- bfqd->device_speed == BFQ_BFQD_FAST ?
- (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
- (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
- (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
- BFQ_RATE_SHIFT);
-}
-
-static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
-{
- if (rq != NULL) { /* new rq dispatch now, reset accordingly */
- bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ;
- bfqd->peak_rate_samples = 1;
- bfqd->sequential_samples = 0;
- bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
- blk_rq_sectors(rq);
- } else /* no new rq dispatched, just reset the number of samples */
- bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
-
- bfq_log(bfqd,
- "reset_rate_computation at end, sample %u/%u tot_sects %llu",
- bfqd->peak_rate_samples, bfqd->sequential_samples,
- bfqd->tot_sectors_dispatched);
-}
-
-static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
-{
- u32 rate, weight, divisor;
-
- /*
- * For the convergence property to hold (see comments on
- * bfq_update_peak_rate()) and for the assessment to be
- * reliable, a minimum number of samples must be present, and
- * a minimum amount of time must have elapsed. If not so, do
- * not compute new rate. Just reset parameters, to get ready
- * for a new evaluation attempt.
- */
- if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
- bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
- bfq_log(bfqd,
- "update_rate_reset: only resetting, delta_first %lluus samples %d",
- bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
- goto reset_computation;
- }
-
- /*
- * If a new request completion has occurred after last
- * dispatch, then, to approximate the rate at which requests
- * have been served by the device, it is more precise to
- * extend the observation interval to the last completion.
- */
- bfqd->delta_from_first =
- max_t(u64, bfqd->delta_from_first,
- bfqd->last_completion - bfqd->first_dispatch);
-
- BUG_ON(bfqd->delta_from_first == 0);
- /*
- * Rate computed in sects/usec, and not sects/nsec, for
- * precision issues.
- */
- rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC));
-
- bfq_log(bfqd,
-"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
- bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
- ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
- rate > 20< 20M sectors/sec)
- */
- if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
- rate <= bfqd->peak_rate) ||
- rate > 20<peak_rate_samples, bfqd->sequential_samples,
- ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
- ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
- goto reset_computation;
- } else {
- bfq_log(bfqd,
- "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
- bfqd->peak_rate_samples, bfqd->sequential_samples,
- ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
- ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
- }
-
- /*
- * We have to update the peak rate, at last! To this purpose,
- * we use a low-pass filter. We compute the smoothing constant
- * of the filter as a function of the 'weight' of the new
- * measured rate.
- *
- * As can be seen in next formulas, we define this weight as a
- * quantity proportional to how sequential the workload is,
- * and to how long the observation time interval is.
- *
- * The weight runs from 0 to 8. The maximum value of the
- * weight, 8, yields the minimum value for the smoothing
- * constant. At this minimum value for the smoothing constant,
- * the measured rate contributes for half of the next value of
- * the estimated peak rate.
- *
- * So, the first step is to compute the weight as a function
- * of how sequential the workload is. Note that the weight
- * cannot reach 9, because bfqd->sequential_samples cannot
- * become equal to bfqd->peak_rate_samples, which, in its
- * turn, holds true because bfqd->sequential_samples is not
- * incremented for the first sample.
- */
- weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
-
- /*
- * Second step: further refine the weight as a function of the
- * duration of the observation interval.
- */
- weight = min_t(u32, 8,
- div_u64(weight * bfqd->delta_from_first,
- BFQ_RATE_REF_INTERVAL));
-
- /*
- * Divisor ranging from 10, for minimum weight, to 2, for
- * maximum weight.
- */
- divisor = 10 - weight;
- BUG_ON(divisor == 0);
-
- /*
- * Finally, update peak rate:
- *
- * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
- */
- bfqd->peak_rate *= divisor-1;
- bfqd->peak_rate /= divisor;
- rate /= divisor; /* smoothing constant alpha = 1/divisor */
-
- bfq_log(bfqd,
- "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
- divisor,
- ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
- (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
-
- BUG_ON(bfqd->peak_rate == 0);
- BUG_ON(bfqd->peak_rate > 20<peak_rate += rate;
- update_thr_responsiveness_params(bfqd);
- BUG_ON(bfqd->peak_rate > 20<peak_rate_samples == 0) { /* first dispatch */
- bfq_log(bfqd,
- "update_peak_rate: goto reset, samples %d",
- bfqd->peak_rate_samples) ;
- bfq_reset_rate_computation(bfqd, rq);
- goto update_last_values; /* will add one sample */
- }
-
- /*
- * Device idle for very long: the observation interval lasting
- * up to this dispatch cannot be a valid observation interval
- * for computing a new peak rate (similarly to the late-
- * completion event in bfq_completed_request()). Go to
- * update_rate_and_reset to have the following three steps
- * taken:
- * - close the observation interval at the last (previous)
- * request dispatch or completion
- * - compute rate, if possible, for that observation interval
- * - start a new observation interval with this dispatch
- */
- if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
- bfqd->rq_in_driver == 0) {
- bfq_log(bfqd,
-"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
- (now_ns - bfqd->last_dispatch)>>10,
- bfqd->peak_rate_samples) ;
- goto update_rate_and_reset;
- }
-
- /* Update sampling information */
- bfqd->peak_rate_samples++;
-
- if ((bfqd->rq_in_driver > 0 ||
- now_ns - bfqd->last_completion < BFQ_MIN_TT)
- && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
- bfqd->sequential_samples++;
-
- bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
-
- /* Reset max observed rq size every 32 dispatches */
- if (likely(bfqd->peak_rate_samples % 32))
- bfqd->last_rq_max_size =
- max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
- else
- bfqd->last_rq_max_size = blk_rq_sectors(rq);
-
- bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
-
- bfq_log(bfqd,
- "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
- bfqd->peak_rate_samples, bfqd->sequential_samples,
- bfqd->tot_sectors_dispatched,
- bfqd->delta_from_first>>10);
-
- /* Target observation interval not yet reached, go on sampling */
- if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
- goto update_last_values;
-
-update_rate_and_reset:
- bfq_update_rate_reset(bfqd, rq);
-update_last_values:
- bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
- bfqd->last_dispatch = now_ns;
-
- bfq_log(bfqd,
- "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
- (now_ns - bfqd->first_dispatch)>>10,
- (unsigned long long) bfqd->last_position,
- ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
- bfq_log(bfqd,
- "update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
-}
-
-/*
- * Move request from internal lists to the dispatch list of the request queue
- */
-static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
-{
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
-
- /*
- * For consistency, the next instruction should have been executed
- * after removing the request from the queue and dispatching it.
- * We execute instead this instruction before bfq_remove_request()
- * (and hence introduce a temporary inconsistency), for efficiency.
- * In fact, in a forced_dispatch, this prevents two counters related
- * to bfqq->dispatched to risk to be uselessly decremented if bfqq
- * is not in service, and then to be incremented again after
- * incrementing bfqq->dispatched.
- */
- bfqq->dispatched++;
- bfq_update_peak_rate(q->elevator->elevator_data, rq);
-
- bfq_remove_request(rq);
- elv_dispatch_sort(q, rq);
-}
-
-static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- BUG_ON(bfqq != bfqd->in_service_queue);
-
- /*
- * If this bfqq is shared between multiple processes, check
- * to make sure that those processes are still issuing I/Os
- * within the mean seek distance. If not, it may be time to
- * break the queues apart again.
- */
- if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
- bfq_mark_bfqq_split_coop(bfqq);
-
- if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
- if (bfqq->dispatched == 0)
- /*
- * Overloading budget_timeout field to store
- * the time at which the queue remains with no
- * backlog and no outstanding request; used by
- * the weight-raising mechanism.
- */
- bfqq->budget_timeout = jiffies;
-
- bfq_del_bfqq_busy(bfqd, bfqq, true);
- } else {
- bfq_requeue_bfqq(bfqd, bfqq);
- /*
- * Resort priority tree of potential close cooperators.
- */
- bfq_pos_tree_add_move(bfqd, bfqq);
- }
-
- /*
- * All in-service entities must have been properly deactivated
- * or requeued before executing the next function, which
- * resets all in-service entites as no more in service.
- */
- __bfq_bfqd_reset_in_service(bfqd);
-}
-
-/**
- * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
- * @bfqd: device data.
- * @bfqq: queue to update.
- * @reason: reason for expiration.
- *
- * Handle the feedback on @bfqq budget at queue expiration.
- * See the body for detailed comments.
- */
-static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- enum bfqq_expiration reason)
-{
- struct request *next_rq;
- int budget, min_budget;
-
- BUG_ON(bfqq != bfqd->in_service_queue);
-
- min_budget = bfq_min_budget(bfqd);
-
- if (bfqq->wr_coeff == 1)
- budget = bfqq->max_budget;
- else /*
- * Use a constant, low budget for weight-raised queues,
- * to help achieve a low latency. Keep it slightly higher
- * than the minimum possible budget, to cause a little
- * bit fewer expirations.
- */
- budget = 2 * min_budget;
-
- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
- bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
- budget, bfq_min_budget(bfqd));
- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
- bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
-
- if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
- switch (reason) {
- /*
- * Caveat: in all the following cases we trade latency
- * for throughput.
- */
- case BFQ_BFQQ_TOO_IDLE:
- /*
- * This is the only case where we may reduce
- * the budget: if there is no request of the
- * process still waiting for completion, then
- * we assume (tentatively) that the timer has
- * expired because the batch of requests of
- * the process could have been served with a
- * smaller budget. Hence, betting that
- * process will behave in the same way when it
- * becomes backlogged again, we reduce its
- * next budget. As long as we guess right,
- * this budget cut reduces the latency
- * experienced by the process.
- *
- * However, if there are still outstanding
- * requests, then the process may have not yet
- * issued its next request just because it is
- * still waiting for the completion of some of
- * the still outstanding ones. So in this
- * subcase we do not reduce its budget, on the
- * contrary we increase it to possibly boost
- * the throughput, as discussed in the
- * comments to the BUDGET_TIMEOUT case.
- */
- if (bfqq->dispatched > 0) /* still outstanding reqs */
- budget = min(budget * 2, bfqd->bfq_max_budget);
- else {
- if (budget > 5 * min_budget)
- budget -= 4 * min_budget;
- else
- budget = min_budget;
- }
- break;
- case BFQ_BFQQ_BUDGET_TIMEOUT:
- /*
- * We double the budget here because it gives
- * the chance to boost the throughput if this
- * is not a seeky process (and has bumped into
- * this timeout because of, e.g., ZBR).
- */
- budget = min(budget * 2, bfqd->bfq_max_budget);
- break;
- case BFQ_BFQQ_BUDGET_EXHAUSTED:
- /*
- * The process still has backlog, and did not
- * let either the budget timeout or the disk
- * idling timeout expire. Hence it is not
- * seeky, has a short thinktime and may be
- * happy with a higher budget too. So
- * definitely increase the budget of this good
- * candidate to boost the disk throughput.
- */
- budget = min(budget * 4, bfqd->bfq_max_budget);
- break;
- case BFQ_BFQQ_NO_MORE_REQUESTS:
- /*
- * For queues that expire for this reason, it
- * is particularly important to keep the
- * budget close to the actual service they
- * need. Doing so reduces the timestamp
- * misalignment problem described in the
- * comments in the body of
- * __bfq_activate_entity. In fact, suppose
- * that a queue systematically expires for
- * BFQ_BFQQ_NO_MORE_REQUESTS and presents a
- * new request in time to enjoy timestamp
- * back-shifting. The larger the budget of the
- * queue is with respect to the service the
- * queue actually requests in each service
- * slot, the more times the queue can be
- * reactivated with the same virtual finish
- * time. It follows that, even if this finish
- * time is pushed to the system virtual time
- * to reduce the consequent timestamp
- * misalignment, the queue unjustly enjoys for
- * many re-activations a lower finish time
- * than all newly activated queues.
- *
- * The service needed by bfqq is measured
- * quite precisely by bfqq->entity.service.
- * Since bfqq does not enjoy device idling,
- * bfqq->entity.service is equal to the number
- * of sectors that the process associated with
- * bfqq requested to read/write before waiting
- * for request completions, or blocking for
- * other reasons.
- */
- budget = max_t(int, bfqq->entity.service, min_budget);
- break;
- default:
- return;
- }
- } else if (!bfq_bfqq_sync(bfqq))
- /*
- * Async queues get always the maximum possible
- * budget, as for them we do not care about latency
- * (in addition, their ability to dispatch is limited
- * by the charging factor).
- */
- budget = bfqd->bfq_max_budget;
-
- bfqq->max_budget = budget;
-
- if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
- !bfqd->bfq_user_max_budget)
- bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
-
- /*
- * If there is still backlog, then assign a new budget, making
- * sure that it is large enough for the next request. Since
- * the finish time of bfqq must be kept in sync with the
- * budget, be sure to call __bfq_bfqq_expire() *after* this
- * update.
- *
- * If there is no backlog, then no need to update the budget;
- * it will be updated on the arrival of a new request.
- */
- next_rq = bfqq->next_rq;
- if (next_rq) {
- BUG_ON(reason == BFQ_BFQQ_TOO_IDLE ||
- reason == BFQ_BFQQ_NO_MORE_REQUESTS);
- bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
- bfq_serv_to_charge(next_rq, bfqq));
- BUG_ON(!bfq_bfqq_busy(bfqq));
- BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
- }
-
- bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
- next_rq ? blk_rq_sectors(next_rq) : 0,
- bfqq->entity.budget);
-}
-
-/*
- * Return true if the process associated with bfqq is "slow". The slow
- * flag is used, in addition to the budget timeout, to reduce the
- * amount of service provided to seeky processes, and thus reduce
- * their chances to lower the throughput. More details in the comments
- * on the function bfq_bfqq_expire().
- *
- * An important observation is in order: as discussed in the comments
- * on the function bfq_update_peak_rate(), with devices with internal
- * queues, it is hard, if at all possible, to know when and for how long
- * an I/O request is processed by the device (apart from the trivial
- * I/O pattern where a new request is dispatched only after the
- * previous one has been completed). This makes it hard to evaluate
- * the real rate at which the I/O requests of each bfq_queue are
- * served. In fact, for an I/O scheduler like BFQ, serving a
- * bfq_queue means just dispatching its requests during its service
- * slot (i.e., until the budget of the queue is exhausted, or the
- * queue remains idle, or, finally, a timeout fires). But, during the
- * service slot of a bfq_queue, around 100 ms at most, the device may
- * be even still processing requests of bfq_queues served in previous
- * service slots. On the opposite end, the requests of the in-service
- * bfq_queue may be completed after the service slot of the queue
- * finishes.
- *
- * Anyway, unless more sophisticated solutions are used
- * (where possible), the sum of the sizes of the requests dispatched
- * during the service slot of a bfq_queue is probably the only
- * approximation available for the service received by the bfq_queue
- * during its service slot. And this sum is the quantity used in this
- * function to evaluate the I/O speed of a process.
- */
-static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			     bool compensate, enum bfqq_expiration reason,
-			     unsigned long *delta_ms)
-{
-	/* Seekiness is the verdict whenever the interval is too short. */
-	bool is_slow = BFQQ_SEEKY(bfqq);
-	ktime_t slot_end;
-	u32 elapsed_us;
-
-	/* Async queues are never deemed slow; *delta_ms is left untouched. */
-	if (!bfq_bfqq_sync(bfqq))
-		return false;
-
-	/*
-	 * The measured interval starts when the budget was assigned and
-	 * ends either now or, if compensating, when idling started.
-	 */
-	slot_end = compensate ? bfqd->last_idling_start : ktime_get();
-	elapsed_us = ktime_to_us(ktime_sub(slot_end, bfqd->last_budget_start));
-
-	if (elapsed_us >= 1000) {
-		*delta_ms = elapsed_us / USEC_PER_MSEC;
-
-		/*
-		 * Only long (> 20ms) intervals are used to re-evaluate
-		 * slowness, so as to filter out excessive spikes in the
-		 * service rate estimation.
-		 */
-		if (elapsed_us > 20000) {
-			/*
-			 * Caveat for rotational devices: I/O in the
-			 * slower disk zones tends to be slow(er) even
-			 * if not seeky, and the estimated peak rate is
-			 * likely an average over the disk surface. To
-			 * not be too harsh with unlucky processes, deem
-			 * a process slow only if the service it
-			 * received is below half of the max budget.
-			 */
-			is_slow = bfqq->entity.service <
-				  bfqd->bfq_max_budget / 2;
-			bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d",
-				bfqq->entity.service, bfqd->bfq_max_budget);
-		}
-
-		bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", is_slow);
-
-		return is_slow;
-	}
-
-	/*
-	 * Interval too short to be trusted: report a conventional
-	 * duration instead of the measured one.
-	 */
-	if (blk_queue_nonrot(bfqd->queue)) {
-		/* give same worst-case guarantees as idling for seeky */
-		*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
-	} else {
-		/* charge at least one seek */
-		*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
-	}
-
-	bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", elapsed_us);
-
-	return is_slow;
-}
-
-/*
- * To be deemed as soft real-time, an application must meet two
- * requirements. First, the application must not require an average
- * bandwidth higher than the approximate bandwidth required to playback or
- * record a compressed high-definition video.
- * The next function is invoked on the completion of the last request of a
- * batch, to compute the next-start time instant, soft_rt_next_start, such
- * that, if the next request of the application does not arrive before
- * soft_rt_next_start, then the above requirement on the bandwidth is met.
- *
- * The second requirement is that the request pattern of the application is
- * isochronous, i.e., that, after issuing a request or a batch of requests,
- * the application stops issuing new requests until all its pending requests
- * have been completed. After that, the application may issue a new batch,
- * and so on.
- * For this reason the next function is invoked to compute
- * soft_rt_next_start only for applications that meet this requirement,
- * whereas soft_rt_next_start is set to infinity for applications that do
- * not.
- *
- * Unfortunately, even a greedy application may happen to behave in an
- * isochronous way if the CPU load is high. In fact, the application may
- * stop issuing requests while the CPUs are busy serving other processes,
- * then restart, then stop again for a while, and so on. In addition, if
- * the disk achieves a low enough throughput with the request pattern
- * issued by the application (e.g., because the request pattern is random
- * and/or the device is slow), then the application may meet the above
- * bandwidth requirement too. To prevent such a greedy application from
- * being deemed as soft real-time, a further rule is used in the computation of
- * soft_rt_next_start: soft_rt_next_start must be higher than the current
- * time plus the maximum time for which the arrival of a request is waited
- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
- * This filters out greedy applications, as the latter issue instead their
- * next request as soon as possible after the last one has been completed
- * (in contrast, when a batch of requests is completed, a soft real-time
- * application spends some time processing data).
- *
- * Unfortunately, the last filter may easily generate false positives if
- * only bfqd->bfq_slice_idle is used as a reference time interval and one
- * or both of the following cases occur:
- * 1) HZ is so low that the duration of a jiffy is comparable to or higher
- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
- * HZ=100.
- * 2) jiffies, instead of increasing at a constant rate, may stop increasing
- * for a while, then suddenly 'jump' by several units to recover the lost
- * increments. This seems to happen, e.g., inside virtual machines.
- * To address this issue, we do not use as a reference time interval just
- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
- * particular we add the minimum number of jiffies for which the filter
- * seems to be quite precise also in embedded systems and KVM/QEMU virtual
- * machines.
- */
-static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
-						struct bfq_queue *bfqq)
-{
-	/*
-	 * Jiffies needed to receive service_from_backlogged sectors at
-	 * the maximum soft real-time rate.
-	 */
-	unsigned long bw_interval = HZ * bfqq->service_from_backlogged /
-				    bfqd->bfq_wr_max_softrt_rate;
-	unsigned long bw_bound, idle_bound;
-
-	bfq_log_bfqq(bfqd, bfqq,
-		     "softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u",
-		     bfqq->service_from_backlogged,
-		     bfqd->bfq_wr_max_softrt_rate,
-		     jiffies_to_msecs(bw_interval));
-
-	/* Bound enforcing the bandwidth requirement. */
-	bw_bound = bfqq->last_idle_bklogged + bw_interval;
-
-	/*
-	 * Bound filtering out greedy applications: the idling period plus
-	 * a few jiffies of slack.
-	 */
-	idle_bound = jiffies +
-		     nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4;
-
-	return max(bw_bound, idle_bound);
-}
-
-/*
- * Farthest time instant in the future, according to the jiffies
- * macros: the current jiffies value moved forward by
- * MAX_JIFFY_OFFSET.
- */
-static unsigned long bfq_greatest_from_now(void)
-{
-	unsigned long now = jiffies;
-
-	return now + MAX_JIFFY_OFFSET;
-}
-
-/*
- * Farthest time instant in the past, according to the jiffies
- * macros: the current jiffies value moved backward by
- * MAX_JIFFY_OFFSET.
- */
-static unsigned long bfq_smallest_from_now(void)
-{
-	unsigned long now = jiffies;
-
-	return now - MAX_JIFFY_OFFSET;
-}
-
-/**
- * bfq_bfqq_expire - expire a queue.
- * @bfqd: device owning the queue.
- * @bfqq: the queue to expire.
- * @compensate: if true, compensate for the time spent idling.
- * @reason: the reason causing the expiration.
- *
- * If the process associated with bfqq does slow I/O (e.g., because it
- * issues random requests), we charge bfqq with the time it has been
- * in service instead of the service it has received (see
- * bfq_bfqq_charge_time for details on how this goal is achieved). As
- * a consequence, bfqq will typically get higher timestamps upon
- * reactivation, and hence it will be rescheduled as if it had
- * received more service than what it has actually received. In the
- * end, bfqq receives less service in proportion to how slowly its
- * associated process consumes its budgets (and hence how seriously it
- * tends to lower the throughput). In addition, this time-charging
- * strategy guarantees time fairness among slow processes. In
- * contrast, if the process associated with bfqq is not slow, we
- * charge bfqq exactly with the service it has received.
- *
- * Charging time to the first type of queues and the exact service to
- * the other has the effect of using the WF2Q+ policy to schedule the
- * former on a timeslice basis, without violating service domain
- * guarantees among the latter.
- */
-static void bfq_bfqq_expire(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- bool compensate,
- enum bfqq_expiration reason)
-{
- bool slow;
- /*
- * Time in service, in ms, as reported by bfq_bfqq_is_slow(); it
- * stays 0 when that function returns early for an async queue.
- */
- unsigned long delta = 0;
- struct bfq_entity *entity = &bfqq->entity;
- /* reference count of bfqq, sampled before __bfq_bfqq_expire() */
- int ref;
-
- /* Only the in-service queue can be expired. */
- BUG_ON(bfqq != bfqd->in_service_queue);
-
- /*
- * Check whether the process is slow (see bfq_bfqq_is_slow).
- */
- slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
-
- /*
- * Increase service_from_backlogged before next statement,
- * because the possible next invocation of
- * bfq_bfqq_charge_time would likely inflate
- * entity->service. In contrast, service_from_backlogged must
- * contain real service, to enable the soft real-time
- * heuristic to correctly compute the bandwidth consumed by
- * bfqq.
- */
- bfqq->service_from_backlogged += entity->service;
-
- /*
- * As above explained, charge slow (typically seeky) and
- * timed-out queues with the time and not the service
- * received, to favor sequential workloads.
- *
- * Processes doing I/O in the slower disk zones will tend to
- * be slow(er) even if not seeky. Therefore, since the
- * estimated peak rate is actually an average over the disk
- * surface, these processes may timeout just for bad luck. To
- * avoid punishing them, do not charge time to processes that
- * succeeded in consuming at least 2/3 of their budget. This
- * allows BFQ to preserve enough elasticity to still perform
- * bandwidth, and not time, distribution with little unlucky
- * or quasi-sequential processes.
- *
- * Time is charged only to queues that are not being
- * weight-raised (wr_coeff == 1).
- */
- if (bfqq->wr_coeff == 1 &&
- (slow ||
- (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
- bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
- bfq_bfqq_charge_time(bfqd, bfqq, delta);
-
- BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
- /*
- * A queue expired for being too idle after consuming at most
- * 20% of its budget is no longer flagged as I/O bound.
- */
- if (reason == BFQ_BFQQ_TOO_IDLE &&
- entity->service <= 2 * entity->budget / 10)
- bfq_clear_bfqq_IO_bound(bfqq);
-
- /*
- * In low-latency mode, stamp non-weight-raised queues with the
- * current time.
- */
- if (bfqd->low_latency && bfqq->wr_coeff == 1)
- bfqq->last_wr_start_finish = jiffies;
-
- if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
- RB_EMPTY_ROOT(&bfqq->sort_list)) {
- /*
- * If we get here, and there are no outstanding
- * requests, then the request pattern is isochronous
- * (see the comments on the function
- * bfq_bfqq_softrt_next_start()). Thus we can compute
- * soft_rt_next_start. If, instead, the queue still
- * has outstanding requests, then we have to wait for
- * the completion of all the outstanding requests to
- * discover whether the request pattern is actually
- * isochronous.
- */
- BUG_ON(bfqd->busy_queues < 1);
- if (bfqq->dispatched == 0) {
- bfqq->soft_rt_next_start =
- bfq_bfqq_softrt_next_start(bfqd, bfqq);
- bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu",
- bfqq->soft_rt_next_start);
- } else {
- /*
- * The application is still waiting for the
- * completion of one or more requests:
- * prevent it from possibly being incorrectly
- * deemed as soft real-time by setting its
- * soft_rt_next_start to infinity. In fact,
- * without this assignment, the application
- * would be incorrectly deemed as soft
- * real-time if:
- * 1) it issued a new request before the
- * completion of all its in-flight
- * requests, and
- * 2) at that time, its soft_rt_next_start
- * happened to be in the past.
- */
- bfqq->soft_rt_next_start =
- bfq_greatest_from_now();
- /*
- * Schedule an update of soft_rt_next_start to when
- * the task may be discovered to be isochronous.
- */
- bfq_mark_bfqq_softrt_update(bfqq);
- }
- }
-
- bfq_log_bfqq(bfqd, bfqq,
- "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)",
- reason, slow, bfqq->dispatched,
- bfq_bfqq_has_short_ttime(bfqq), entity->weight);
-
- /*
- * Increase, decrease or leave budget unchanged according to
- * reason.
- */
- BUG_ON(bfqq->entity.budget < bfqq->entity.service);
- __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
- BUG_ON(bfqq->next_rq == NULL &&
- bfqq->entity.budget < bfqq->entity.service);
- /*
- * Sample the reference count before expiring the queue: bfqq
- * is accessed below only if this value is greater than 1.
- */
- ref = bfqq->ref;
- __bfq_bfqq_expire(bfqd, bfqq);
-
- BUG_ON(ref > 1 &&
- !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&
- !bfq_class_idle(bfqq));
-
- /* mark bfqq as waiting a request only if a bic still points to it */
- if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
- reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
- reason != BFQ_BFQQ_BUDGET_EXHAUSTED)
- bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
-}
-
-/*
- * Tell whether the budget timeout of bfqq has elapsed. No dedicated
- * timer exists for the budget timeout: the deadline stored in
- * bfqq->budget_timeout is simply compared against jiffies on request
- * arrivals and completions, as well as on idle timer expirations.
- */
-static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
-{
-	unsigned long deadline = bfqq->budget_timeout;
-
-	return time_is_before_eq_jiffies(deadline);
-}
-
-/*
- * If we expire a queue that is actively waiting (i.e., with the
- * device idled) for the arrival of a new request, then we may incur
- * the timestamp misalignment problem described in the body of the
- * function __bfq_activate_entity. Hence we return true only if this
- * condition does not hold, or if the queue is slow enough to deserve
- * only to be kicked off for preserving a high throughput.
- */
-static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
-{
-	/* non-zero if the device is being idled waiting for bfqq */
-	int waiting = bfq_bfqq_wait_request(bfqq);
-	/* at least one third of the budget is still unused */
-	int third_left = bfq_bfqq_budget_left(bfqq) >=
-			 bfqq->entity.budget / 3;
-	bool timed_out = bfq_bfqq_budget_timeout(bfqq);
-
-	bfq_log_bfqq(bfqq->bfqd, bfqq,
-		     "may_budget_timeout: wait_request %d left %d timeout %d",
-		     waiting, third_left, timed_out);
-
-	/*
-	 * A queue being waited for is spared, unless it is so slow that
-	 * at least one third of its budget is still left.
-	 */
-	if (waiting && !third_left)
-		return false;
-
-	return timed_out;
-}
-
-/*
- * For a queue that becomes empty, device idling is allowed only if
- * this function returns true for that queue. As a consequence, since
- * device idling plays a critical role for both throughput boosting
- * and service guarantees, the return value of this function plays a
- * critical role as well.
- *
- * In a nutshell, this function returns true only if idling is
- * beneficial for throughput or, even if detrimental for throughput,
- * idling is however necessary to preserve service guarantees (low
- * latency, desired throughput distribution, ...). In particular, on
- * NCQ-capable devices, this function tries to return false, so as to
- * help keep the drives' internal queues full, whenever this helps the
- * device boost the throughput without causing any service-guarantee
- * issue.
- *
- * In more detail, the return value of this function is obtained by,
- * first, computing a number of boolean variables that take into
- * account throughput and service-guarantee issues, and, then,
- * combining these variables in a logical expression. Most of the
- * issues taken into account are not trivial. We discuss these issues
- * while introducing the variables.
- */
-static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
-{
- struct bfq_data *bfqd = bfqq->bfqd;
- /* true for a rotational device with no internal queueing detected */
- bool rot_without_queueing =
- !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
- bfqq_sequential_and_IO_bound,
- idling_boosts_thr, idling_boosts_thr_without_issues,
- idling_needed_for_service_guarantees,
- asymmetric_scenario;
-
- /* With strict guarantees, idling is always allowed. */
- if (bfqd->strict_guarantees)
- return true;
-
- /*
- * Idling is performed only if slice_idle > 0. In addition, we
- * do not idle if
- * (a) bfqq is async
- * (b) bfqq is in the idle io prio class: in this case we do
- * not idle because we want to minimize the bandwidth that
- * queues in this class can steal from higher-priority queues
- */
- if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
- bfq_class_idle(bfqq))
- return false;
-
- bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
- bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
- /*
- * The next variable takes into account the cases where idling
- * boosts the throughput.
- *
- * The value of the variable is computed considering, first, that
- * idling is virtually always beneficial for the throughput if:
- * (a) the device is not NCQ-capable and rotational, or
- * (b) regardless of the presence of NCQ, the device is rotational and
- * the request pattern for bfqq is I/O-bound and sequential, or
- * (c) regardless of whether it is rotational, the device is
- * not NCQ-capable and the request pattern for bfqq is
- * I/O-bound and sequential.
- *
- * Secondly, and in contrast to the above item (b), idling an
- * NCQ-capable flash-based device would not boost the
- * throughput even with sequential I/O; rather it would lower
- * the throughput in proportion to how fast the device
- * is. Accordingly, the next variable is true if any of the
- * above conditions (a), (b) or (c) is true, and, in
- * particular, happens to be false if bfqd is an NCQ-capable
- * flash-based device.
- */
- idling_boosts_thr = rot_without_queueing ||
- ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) &&
- bfqq_sequential_and_IO_bound);
-
- /*
- * The value of the next variable,
- * idling_boosts_thr_without_issues, is equal to that of
- * idling_boosts_thr, unless a special case holds. In this
- * special case, described below, idling may cause problems to
- * weight-raised queues.
- *
- * When the request pool is saturated (e.g., in the presence
- * of write hogs), if the processes associated with
- * non-weight-raised queues ask for requests at a lower rate,
- * then processes associated with weight-raised queues have a
- * higher probability to get a request from the pool
- * immediately (or at least soon) when they need one. Thus
- * they have a higher probability to actually get a fraction
- * of the device throughput proportional to their high
- * weight. This is especially true with NCQ-capable drives,
- * which enqueue several requests in advance, and further
- * reorder internally-queued requests.
- *
- * For this reason, we force to false the value of
- * idling_boosts_thr_without_issues if there are weight-raised
- * busy queues. In this case, and if bfqq is not weight-raised,
- * this guarantees that the device is not idled for bfqq (if,
- * instead, bfqq is weight-raised, then idling will be
- * guaranteed by another variable, see below). Combined with
- * the timestamping rules of BFQ (see [1] for details), this
- * behavior causes bfqq, and hence any sync non-weight-raised
- * queue, to get a lower number of requests served, and thus
- * to ask for a lower number of requests from the request
- * pool, before the busy weight-raised queues get served
- * again. This often mitigates starvation problems in the
- * presence of heavy write workloads and NCQ, thereby
- * guaranteeing a higher application and system responsiveness
- * in these hostile scenarios.
- */
- idling_boosts_thr_without_issues = idling_boosts_thr &&
- bfqd->wr_busy_queues == 0;
-
- /*
- * There is then a case where idling must be performed not
- * for throughput concerns, but to preserve service
- * guarantees.
- *
- * To introduce this case, we can note that allowing the drive
- * to enqueue more than one request at a time, and hence
- * delegating de facto final scheduling decisions to the
- * drive's internal scheduler, entails loss of control on the
- * actual request service order. In particular, the critical
- * situation is when requests from different processes happen
- * to be present, at the same time, in the internal queue(s)
- * of the drive. In such a situation, the drive, by deciding
- * the service order of the internally-queued requests, does
- * determine also the actual throughput distribution among
- * these processes. But the drive typically has no notion or
- * concern about per-process throughput distribution, and
- * makes its decisions only on a per-request basis. Therefore,
- * the service distribution enforced by the drive's internal
- * scheduler is likely to coincide with the desired
- * device-throughput distribution only in a completely
- * symmetric scenario where:
- * (i) each of these processes must get the same throughput as
- * the others;
- * (ii) all these processes have the same I/O pattern
- * (either sequential or random).
- * In fact, in such a scenario, the drive will tend to treat
- * the requests of each of these processes in about the same
- * way as the requests of the others, and thus to provide
- * each of these processes with about the same throughput
- * (which is exactly the desired throughput distribution). In
- * contrast, in any asymmetric scenario, device idling is
- * certainly needed to guarantee that bfqq receives its
- * assigned fraction of the device throughput (see [1] for
- * details).
- *
- * We address this issue by controlling, actually, only the
- * symmetry sub-condition (i), i.e., provided that
- * sub-condition (i) holds, idling is not performed,
- * regardless of whether sub-condition (ii) holds. In other
- * words, only if sub-condition (i) does not hold, then idling
- * is allowed, and the device tends to be prevented from
- * queueing many requests, possibly of several processes. The
- * reason for not controlling also sub-condition (ii) is that we
- * exploit preemption to preserve guarantees in case of
- * symmetric scenarios, even if (ii) does not hold, as
- * explained in the next two paragraphs.
- *
- * Even if a queue, say Q, is expired when it remains idle, Q
- * can still preempt the new in-service queue if the next
- * request of Q arrives soon (see the comments on
- * bfq_bfqq_update_budg_for_activation). If all queues and
- * groups have the same weight, this form of preemption,
- * combined with the hole-recovery heuristic described in the
- * comments on function bfq_bfqq_update_budg_for_activation,
- * are enough to preserve a correct bandwidth distribution in
- * the mid term, even without idling. In fact, even if not
- * idling allows the internal queues of the device to contain
- * many requests, and thus to reorder requests, we can rather
- * safely assume that the internal scheduler still preserves a
- * minimum of mid-term fairness. The motivation for using
- * preemption instead of idling is that, by not idling,
- * service guarantees are preserved without sacrificing
- * throughput in the least. In other words, both a high
- * throughput and its desired distribution are obtained.
- *
- * More precisely, this preemption-based, idleless approach
- * provides fairness in terms of IOPS, and not sectors per
- * second. This can be seen with a simple example. Suppose
- * that there are two queues with the same weight, but that
- * the first queue receives requests of 8 sectors, while the
- * second queue receives requests of 1024 sectors. In
- * addition, suppose that each of the two queues contains at
- * most one request at a time, which implies that each queue
- * always remains idle after it is served. Finally, after
- * remaining idle, each queue receives very quickly a new
- * request. It follows that the two queues are served
- * alternately, preempting each other if needed. This
- * implies that, although both queues have the same weight,
- * the queue with large requests receives a service that is
- * 1024/8 times as high as the service received by the other
- * queue.
- *
- * On the other hand, device idling is performed, and thus
- * pure sector-domain guarantees are provided, for the
- * following queues, which are likely to need stronger
- * throughput guarantees: weight-raised queues, and queues
- * with a higher weight than other queues. When such queues
- * are active, sub-condition (i) is false, which triggers
- * device idling.
- *
- * According to the above considerations, the next variable is
- * true (only) if sub-condition (i) does not hold. To compute
- * the value of this variable, we not only use the return value
- * of the function bfq_symmetric_scenario(), but also check
- * whether bfqq is being weight-raised, because
- * bfq_symmetric_scenario() does not take weight-raised
- * queues into account (see comments on
- * bfq_weights_tree_add()).
- *
- * As a side note, it is worth considering that the above
- * device-idling countermeasures may however fail in the
- * following unlucky scenario: if idling is (correctly)
- * disabled in a time period during which all symmetry
- * sub-conditions hold, and hence the device is allowed to
- * enqueue many requests, but at some later point in time some
- * sub-condition stops holding, then it may become impossible
- * to let requests be served in the desired order until all
- * the requests already queued in the device have been served.
- */
- asymmetric_scenario = bfqq->wr_coeff > 1 ||
- !bfq_symmetric_scenario(bfqd);
-
- /*
- * Finally, there is a case where maximizing throughput is the
- * best choice even if it may cause unfairness toward
- * bfqq. Such a case is when bfqq became active in a burst of
- * queue activations. Queues that became active during a large
- * burst benefit only from throughput, as discussed in the
- * comments on bfq_handle_burst. Thus, if bfqq became active
- * in a burst and not idling the device maximizes throughput,
- * then the device must not be idled, because not idling the
- * device provides bfqq and all other queues in the burst with
- * maximum benefit. Combining this and the above case, we can
- * now establish when idling is actually needed to preserve
- * service guarantees.
- */
- idling_needed_for_service_guarantees =
- asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
-
- /*
- * We have now all the components we need to compute the
- * return value of the function, which is true only if idling
- * either boosts the throughput (without issues), or is
- * necessary to preserve service guarantees.
- */
- bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d",
- bfq_bfqq_sync(bfqq), idling_boosts_thr);
-
- bfq_log_bfqq(bfqd, bfqq,
- "may_idle: wr_busy %d boosts %d IO-bound %d guar %d",
- bfqd->wr_busy_queues,
- idling_boosts_thr_without_issues,
- bfq_bfqq_IO_bound(bfqq),
- idling_needed_for_service_guarantees);
-
- return idling_boosts_thr_without_issues ||
- idling_needed_for_service_guarantees;
-}
-
-/*
- * Tell whether the device has to be idled for bfqq, which is the case
- * when bfqq has no queued request (its sort_list is empty) and
- * bfq_bfqq_may_idle() returns true for it. If this holds for the
- * in-service queue, then:
- * 1) the queue must remain in service and cannot be expired, and
- * 2) the device must be idled to wait for the possible arrival of a
- *    new request for the queue.
- * See the comments on bfq_bfqq_may_idle() for the reasons why device
- * idling is then the best choice to boost the throughput and preserve
- * service guarantees.
- */
-static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
-{
-	/* A queue that still has queued requests never needs idling. */
-	if (!RB_EMPTY_ROOT(&bfqq->sort_list))
-		return false;
-
-	return bfq_bfqq_may_idle(bfqq);
-}
-
-/*
- * Select a queue for service. If we have a current queue in service,
- * check whether to continue servicing it, or retrieve and set a new one.
- */
-static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
-{
- struct bfq_queue *bfqq;
- struct request *next_rq;
- /*
- * Expiration reason passed to bfq_bfqq_expire(); the initial
- * value is the one used by the budget-timeout check below.
- */
- enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
-
- bfqq = bfqd->in_service_queue;
- if (!bfqq)
- goto new_queue;
-
- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
-
- /*
- * Expire the in-service queue for budget timeout, unless the
- * idle timer is pending or the device must be idled for the
- * queue.
- */
- if (bfq_may_expire_for_budg_timeout(bfqq) &&
- !hrtimer_active(&bfqd->idle_slice_timer) &&
- !bfq_bfqq_must_idle(bfqq))
- goto expire;
-
-check_queue:
- /*
- * This loop is rarely executed more than once. Even when it
- * happens, it is much more convenient to re-execute this loop
- * than to return NULL and trigger a new dispatch to get a
- * request served.
- */
- next_rq = bfqq->next_rq;
- /*
- * If bfqq has requests queued and it has enough budget left to
- * serve them, keep the queue, otherwise expire it.
- */
- if (next_rq) {
- BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
-
- if (bfq_serv_to_charge(next_rq, bfqq) >
- bfq_bfqq_budget_left(bfqq)) {
- /*
- * Expire the queue for budget exhaustion,
- * which makes sure that the next budget is
- * enough to serve the next request, even if
- * it comes from the fifo expired path.
- */
- reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
- goto expire;
- } else {
- /*
- * The idle timer may be pending because we may
- * not disable disk idling even when a new request
- * arrives.
- */
- if (bfq_bfqq_wait_request(bfqq)) {
- BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer));
- /*
- * If we get here: 1) at least a new request
- * has arrived but we have not disabled the
- * timer because the request was too small,
- * 2) then the block layer has unplugged
- * the device, causing the dispatch to be
- * invoked.
- *
- * Since the device is unplugged, now the
- * requests are probably large enough to
- * provide a reasonable throughput.
- * So we disable idling.
- */
- bfq_clear_bfqq_wait_request(bfqq);
- hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
- bfqg_stats_update_idle_time(bfqq_group(bfqq));
- }
- goto keep_queue;
- }
- }
-
- /*
- * No requests pending. However, if the in-service queue is idling
- * for a new request, or has requests waiting for a completion and
- * may idle after their completion, then keep it anyway.
- */
- if (hrtimer_active(&bfqd->idle_slice_timer) ||
- (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
- /*
- * The queue is not expired here, but NULL is returned to
- * the caller: there is no request to dispatch from it.
- */
- bfqq = NULL;
- goto keep_queue;
- }
-
- reason = BFQ_BFQQ_NO_MORE_REQUESTS;
-expire:
- /* Expire the current queue (no compensation for idling time). */
- bfq_bfqq_expire(bfqd, bfqq, false, reason);
-new_queue:
- /* Pick a new in-service queue and, if there is one, check it. */
- bfqq = bfq_set_in_service_queue(bfqd);
- if (bfqq) {
- bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
- goto check_queue;
- }
-keep_queue:
- /* Common exit: bfqq is the queue to serve, or NULL if none. */
- if (bfqq)
- bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
- else
- bfq_log(bfqd, "select_queue: no queue returned");
-
- return bfqq;
-}
-
-/*
- * Update the weight-raising state of bfqq. If bfqq is being
- * weight-raised, end weight raising when bfqq belongs to a large burst
- * or when its raising period has elapsed; in the latter case, if the
- * current period has the duration bfqd->bfq_wr_rt_max_time and the
- * interval bfq_wr_duration() measured from wr_start_at_switch_to_srt
- * has not elapsed yet, switch back to interactive weight raising
- * instead. Finally, update the weight of the entity if it is not
- * consistent with the (possibly new) value of wr_coeff.
- */
-static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- struct bfq_entity *entity = &bfqq->entity;
-
- if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
- BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
- time_is_after_jiffies(bfqq->last_wr_start_finish));
-
- bfq_log_bfqq(bfqd, bfqq,
- "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
- jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
- jiffies_to_msecs(bfqq->wr_cur_max_time),
- bfqq->wr_coeff,
- bfqq->entity.weight, bfqq->entity.orig_weight);
-
- /*
- * Unless bfqq is in service, its weight must already be
- * equal to the original weight times the raising
- * coefficient.
- */
- BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
- entity->orig_weight * bfqq->wr_coeff);
- if (entity->prio_changed)
- bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
-
- /*
- * If the queue was activated in a burst, or too much
- * time has elapsed from the beginning of this
- * weight-raising period, then end weight raising.
- */
- if (bfq_bfqq_in_large_burst(bfqq))
- bfq_bfqq_end_wr(bfqq);
- else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
- bfqq->wr_cur_max_time)) {
- /*
- * The current period is over: end weight raising
- * altogether, unless the period had the duration
- * bfq_wr_rt_max_time and the interactive period
- * that started at wr_start_at_switch_to_srt is
- * still running.
- */
- if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
- time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
- bfq_wr_duration(bfqd)))
- bfq_bfqq_end_wr(bfqq);
- else {
- /* switch back to interactive wr */
- bfqq->wr_coeff = bfqd->bfq_wr_coeff;
- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
- bfqq->last_wr_start_finish =
- bfqq->wr_start_at_switch_to_srt;
- BUG_ON(time_is_after_jiffies(
- bfqq->last_wr_start_finish));
- bfqq->entity.prio_changed = 1;
- bfq_log_bfqq(bfqd, bfqq,
- "back to interactive wr");
- }
- }
- }
- /*
- * To improve latency (for this or other queues), immediately
- * update weight both if it must be raised and if it must be
- * lowered. Since, entity may be on some active tree here, and
- * might have a pending change of its ioprio class, invoke
- * next function with the last parameter unset (see the
- * comments on the function).
- *
- * The condition below holds when the weight is raised but
- * wr_coeff is 1, or the weight is not raised but wr_coeff is
- * greater than 1.
- */
- if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
- __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity),
- entity, false);
-}
-
-/*
- * Dispatch one request from bfqq, moving it to the request queue
- * dispatch list.
- */
-static int bfq_dispatch_request(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- int dispatched = 0;
- struct request *rq = bfqq->next_rq;
- unsigned long service_to_charge;
-
- BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
- BUG_ON(!rq);
- service_to_charge = bfq_serv_to_charge(rq, bfqq);
-
- BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq));
-
- BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
- bfq_bfqq_served(bfqq, service_to_charge);
-
- BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
- bfq_dispatch_insert(bfqd->queue, rq);
-
- /*
- * If weight raising has to terminate for bfqq, then next
- * function causes an immediate update of bfqq's weight,
- * without waiting for next activation. As a consequence, on
- * expiration, bfqq will be timestamped as if has never been
- * weight-raised during this service slot, even if it has
- * received part or even most of the service as a
- * weight-raised queue. This inflates bfqq's timestamps, which
- * is beneficial, as bfqq is then more willing to leave the
- * device immediately to possible other weight-raised queues.
- */
- bfq_update_wr_data(bfqd, bfqq);
-
- bfq_log_bfqq(bfqd, bfqq,
- "dispatched %u sec req (%llu), budg left %d",
- blk_rq_sectors(rq),
- (unsigned long long) blk_rq_pos(rq),
- bfq_bfqq_budget_left(bfqq));
-
- dispatched++;
-
- if (!bfqd->in_service_bic) {
- atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
- bfqd->in_service_bic = RQ_BIC(rq);
- }
-
- if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
- goto expire;
-
- return dispatched;
-
-expire:
- bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
- return dispatched;
-}
-
-static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
-{
- int dispatched = 0;
-
- while (bfqq->next_rq) {
- bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
- dispatched++;
- }
-
- BUG_ON(!list_empty(&bfqq->fifo));
- return dispatched;
-}
-
-/*
- * Drain our current requests.
- * Used for barriers and when switching io schedulers on-the-fly.
- */
-static int bfq_forced_dispatch(struct bfq_data *bfqd)
-{
- struct bfq_queue *bfqq, *n;
- struct bfq_service_tree *st;
- int dispatched = 0;
-
- bfqq = bfqd->in_service_queue;
- if (bfqq)
- __bfq_bfqq_expire(bfqd, bfqq);
-
- /*
- * Loop through classes, and be careful to leave the scheduler
- * in a consistent state, as feedback mechanisms and vtime
- * updates cannot be disabled during the process.
- */
- list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
- st = bfq_entity_service_tree(&bfqq->entity);
-
- dispatched += __bfq_forced_dispatch_bfqq(bfqq);
-
- bfqq->max_budget = bfq_max_budget(bfqd);
- bfq_forget_idle(st);
- }
-
- BUG_ON(bfqd->busy_queues != 0);
-
- return dispatched;
-}
-
-static int bfq_dispatch_requests(struct request_queue *q, int force)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
- struct bfq_queue *bfqq;
-
- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
-
- if (bfqd->busy_queues == 0)
- return 0;
-
- if (unlikely(force))
- return bfq_forced_dispatch(bfqd);
-
- /*
- * Force device to serve one request at a time if
- * strict_guarantees is true. Forcing this service scheme is
- * currently the ONLY way to guarantee that the request
- * service order enforced by the scheduler is respected by a
- * queueing device. Otherwise the device is free even to make
- * some unlucky request wait for as long as the device
- * wishes.
- *
- * Of course, serving one request at at time may cause loss of
- * throughput.
- */
- if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
- return 0;
-
- bfqq = bfq_select_queue(bfqd);
- if (!bfqq)
- return 0;
-
- BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-
- BUG_ON(bfq_bfqq_wait_request(bfqq));
-
- if (!bfq_dispatch_request(bfqd, bfqq))
- return 0;
-
- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
- bfq_bfqq_sync(bfqq) ? "sync" : "async");
-
- BUG_ON(bfqq->next_rq == NULL &&
- bfqq->entity.budget < bfqq->entity.service);
- return 1;
-}
-
-/*
- * Task holds one reference to the queue, dropped when task exits. Each rq
- * in-flight on this queue also holds a reference, dropped when rq is freed.
- *
- * Queue lock must be held here. Recall not to use bfqq after calling
- * this function on it.
- */
-static void bfq_put_queue(struct bfq_queue *bfqq)
-{
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_group *bfqg = bfqq_group(bfqq);
-#endif
-
- BUG_ON(bfqq->ref <= 0);
-
- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
- bfqq->ref--;
- if (bfqq->ref)
- return;
-
- BUG_ON(rb_first(&bfqq->sort_list));
- BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
- BUG_ON(bfqq->entity.tree);
- BUG_ON(bfq_bfqq_busy(bfqq));
-
- if (bfq_bfqq_sync(bfqq))
- /*
- * The fact that this queue is being destroyed does not
- * invalidate the fact that this queue may have been
- * activated during the current burst. As a consequence,
- * although the queue does not exist anymore, and hence
- * needs to be removed from the burst list if there,
- * the burst size has not to be decremented.
- */
- hlist_del_init(&bfqq->burst_list_node);
-
- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
-
- kmem_cache_free(bfq_pool, bfqq);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_put(bfqg);
-#endif
-}
-
-static void bfq_put_cooperator(struct bfq_queue *bfqq)
-{
- struct bfq_queue *__bfqq, *next;
-
- /*
- * If this queue was scheduled to merge with another queue, be
- * sure to drop the reference taken on that queue (and others in
- * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
- */
- __bfqq = bfqq->new_bfqq;
- while (__bfqq) {
- if (__bfqq == bfqq)
- break;
- next = __bfqq->new_bfqq;
- bfq_put_queue(__bfqq);
- __bfqq = next;
- }
-}
-
-static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- if (bfqq == bfqd->in_service_queue) {
- __bfq_bfqq_expire(bfqd, bfqq);
- bfq_schedule_dispatch(bfqd);
- }
-
- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
-
- bfq_put_cooperator(bfqq);
-
- bfq_put_queue(bfqq); /* release process reference */
-}
-
-static void bfq_init_icq(struct io_cq *icq)
-{
- icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
-}
-
-static void bfq_exit_icq(struct io_cq *icq)
-{
- struct bfq_io_cq *bic = icq_to_bic(icq);
- struct bfq_data *bfqd = bic_to_bfqd(bic);
-
- if (bic_to_bfqq(bic, false)) {
- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
- bic_set_bfqq(bic, NULL, false);
- }
-
- if (bic_to_bfqq(bic, true)) {
- /*
- * If the bic is using a shared queue, put the reference
- * taken on the io_context when the bic started using a
- * shared bfq_queue.
- */
- if (bfq_bfqq_coop(bic_to_bfqq(bic, true)))
- put_io_context(icq->ioc);
- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true));
- bic_set_bfqq(bic, NULL, true);
- }
-}
-
-/*
- * Update the entity prio values; note that the new values will not
- * be used until the next (re)activation.
- */
-static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq,
- struct bfq_io_cq *bic)
-{
- struct task_struct *tsk = current;
- int ioprio_class;
-
- ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
- switch (ioprio_class) {
- default:
- dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
- "bfq: bad prio class %d\n", ioprio_class);
- case IOPRIO_CLASS_NONE:
- /*
- * No prio set, inherit CPU scheduling settings.
- */
- bfqq->new_ioprio = task_nice_ioprio(tsk);
- bfqq->new_ioprio_class = task_nice_ioclass(tsk);
- break;
- case IOPRIO_CLASS_RT:
- bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
- bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
- break;
- case IOPRIO_CLASS_BE:
- bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
- bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
- break;
- case IOPRIO_CLASS_IDLE:
- bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
- bfqq->new_ioprio = 7;
- break;
- }
-
- if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
- pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
- bfqq->new_ioprio);
- BUG();
- }
-
- bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
- bfqq->entity.prio_changed = 1;
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "set_next_ioprio_data: bic_class %d prio %d class %d",
- ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class);
-}
-
-static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
-{
- struct bfq_data *bfqd = bic_to_bfqd(bic);
- struct bfq_queue *bfqq;
- unsigned long uninitialized_var(flags);
- int ioprio = bic->icq.ioc->ioprio;
-
- /*
- * This condition may trigger on a newly created bic, be sure to
- * drop the lock before returning.
- */
- if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
- return;
-
- bic->ioprio = ioprio;
-
- bfqq = bic_to_bfqq(bic, false);
- if (bfqq) {
- /* release process reference on this queue */
- bfq_put_queue(bfqq);
- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
- bic_set_bfqq(bic, bfqq, false);
- bfq_log_bfqq(bfqd, bfqq,
- "check_ioprio_change: bfqq %p %d",
- bfqq, bfqq->ref);
- }
-
- bfqq = bic_to_bfqq(bic, true);
- if (bfqq)
- bfq_set_next_ioprio_data(bfqq, bic);
-}
-
-static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct bfq_io_cq *bic, pid_t pid, int is_sync)
-{
- RB_CLEAR_NODE(&bfqq->entity.rb_node);
- INIT_LIST_HEAD(&bfqq->fifo);
- INIT_HLIST_NODE(&bfqq->burst_list_node);
- BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
-
- bfqq->ref = 0;
- bfqq->bfqd = bfqd;
-
- if (bic)
- bfq_set_next_ioprio_data(bfqq, bic);
-
- if (is_sync) {
- /*
- * No need to mark as has_short_ttime if in
- * idle_class, because no device idling is performed
- * for queues in idle class
- */
- if (!bfq_class_idle(bfqq))
- /* tentatively mark as has_short_ttime */
- bfq_mark_bfqq_has_short_ttime(bfqq);
- bfq_mark_bfqq_sync(bfqq);
- bfq_mark_bfqq_just_created(bfqq);
- } else
- bfq_clear_bfqq_sync(bfqq);
- bfq_mark_bfqq_IO_bound(bfqq);
-
- /* Tentative initial value to trade off between thr and lat */
- bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
- bfqq->pid = pid;
-
- bfqq->wr_coeff = 1;
- bfqq->last_wr_start_finish = jiffies;
- bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
- bfqq->budget_timeout = bfq_smallest_from_now();
- bfqq->split_time = bfq_smallest_from_now();
-
- /*
- * Set to the value for which bfqq will not be deemed as
- * soft rt when it becomes backlogged.
- */
- bfqq->soft_rt_next_start = bfq_greatest_from_now();
-
- /* first request is almost certainly seeky */
- bfqq->seek_history = 1;
-}
-
-static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
- struct bfq_group *bfqg,
- int ioprio_class, int ioprio)
-{
- switch (ioprio_class) {
- case IOPRIO_CLASS_RT:
- return &bfqg->async_bfqq[0][ioprio];
- case IOPRIO_CLASS_NONE:
- ioprio = IOPRIO_NORM;
- /* fall through */
- case IOPRIO_CLASS_BE:
- return &bfqg->async_bfqq[1][ioprio];
- case IOPRIO_CLASS_IDLE:
- return &bfqg->async_idle_bfqq;
- default:
- BUG();
- }
-}
-
-static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
- struct bio *bio, bool is_sync,
- struct bfq_io_cq *bic)
-{
- const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
- const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
- struct bfq_queue **async_bfqq = NULL;
- struct bfq_queue *bfqq;
- struct bfq_group *bfqg;
-
- rcu_read_lock();
-
- bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
- if (!bfqg) {
- bfqq = &bfqd->oom_bfqq;
- goto out;
- }
-
- if (!is_sync) {
- async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
- ioprio);
- bfqq = *async_bfqq;
- if (bfqq)
- goto out;
- }
-
- bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,
- bfqd->queue->node);
-
- if (bfqq) {
- bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
- is_sync);
- bfq_init_entity(&bfqq->entity, bfqg);
- bfq_log_bfqq(bfqd, bfqq, "allocated");
- } else {
- bfqq = &bfqd->oom_bfqq;
- bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
- goto out;
- }
-
- /*
- * Pin the queue now that it's allocated, scheduler exit will
- * prune it.
- */
- if (async_bfqq) {
- bfqq->ref++; /*
- * Extra group reference, w.r.t. sync
- * queue. This extra reference is removed
- * only if bfqq->bfqg disappears, to
- * guarantee that this queue is not freed
- * until its group goes away.
- */
- bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
- bfqq, bfqq->ref);
- *async_bfqq = bfqq;
- }
-
-out:
- bfqq->ref++; /* get a process reference to this queue */
- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
- rcu_read_unlock();
- return bfqq;
-}
-
-static void bfq_update_io_thinktime(struct bfq_data *bfqd,
- struct bfq_io_cq *bic)
-{
- struct bfq_ttime *ttime = &bic->ttime;
- u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request;
-
- elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle);
-
- ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
- ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
- ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
- ttime->ttime_samples);
-}
-
-static void
-bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct request *rq)
-{
- bfqq->seek_history <<= 1;
- bfqq->seek_history |=
- get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
- (!blk_queue_nonrot(bfqd->queue) ||
- blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
-}
-
-static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- struct bfq_io_cq *bic)
-{
- bool has_short_ttime = true;
-
- /*
- * No need to update has_short_ttime if bfqq is async or in
- * idle io prio class, or if bfq_slice_idle is zero, because
- * no device idling is performed for bfqq in this case.
- */
- if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) ||
- bfqd->bfq_slice_idle == 0)
- return;
-
- /* Idle window just restored, statistics are meaningless. */
- if (time_is_after_eq_jiffies(bfqq->split_time +
- bfqd->bfq_wr_min_idle_time))
- return;
-
- /* Think time is infinite if no process is linked to
- * bfqq. Otherwise check average think time to
- * decide whether to mark as has_short_ttime
- */
- if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
- (bfq_sample_valid(bic->ttime.ttime_samples) &&
- bic->ttime.ttime_mean > bfqd->bfq_slice_idle))
- has_short_ttime = false;
-
- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
- has_short_ttime);
-
- if (has_short_ttime)
- bfq_mark_bfqq_has_short_ttime(bfqq);
- else
- bfq_clear_bfqq_has_short_ttime(bfqq);
-}
-
-/*
- * Called when a new fs request (rq) is added to bfqq. Check if there's
- * something we should do about it.
- */
-static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct request *rq)
-{
- struct bfq_io_cq *bic = RQ_BIC(rq);
-
- if (rq->cmd_flags & REQ_META)
- bfqq->meta_pending++;
-
- bfq_update_io_thinktime(bfqd, bic);
- bfq_update_has_short_ttime(bfqd, bfqq, bic);
- bfq_update_io_seektime(bfqd, bfqq, rq);
-
- bfq_log_bfqq(bfqd, bfqq,
- "rq_enqueued: has_short_ttime=%d (seeky %d)",
- bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
-
- bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
-
- if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
- bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
- blk_rq_sectors(rq) < 32;
- bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
-
- /*
- * There is just this request queued: if the request
- * is small and the queue is not to be expired, then
- * just exit.
- *
- * In this way, if the device is being idled to wait
- * for a new request from the in-service queue, we
- * avoid unplugging the device and committing the
- * device to serve just a small request. On the
- * contrary, we wait for the block layer to decide
- * when to unplug the device: hopefully, new requests
- * will be merged to this one quickly, then the device
- * will be unplugged and larger requests will be
- * dispatched.
- */
- if (small_req && !budget_timeout)
- return;
-
- /*
- * A large enough request arrived, or the queue is to
- * be expired: in both cases disk idling is to be
- * stopped, so clear wait_request flag and reset
- * timer.
- */
- bfq_clear_bfqq_wait_request(bfqq);
- hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
- bfqg_stats_update_idle_time(bfqq_group(bfqq));
-
- /*
- * The queue is not empty, because a new request just
- * arrived. Hence we can safely expire the queue, in
- * case of budget timeout, without risking that the
- * timestamps of the queue are not updated correctly.
- * See [1] for more details.
- */
- if (budget_timeout)
- bfq_bfqq_expire(bfqd, bfqq, false,
- BFQ_BFQQ_BUDGET_TIMEOUT);
-
- /*
- * Let the request rip immediately, or let a new queue be
- * selected if bfqq has just been expired.
- */
- __blk_run_queue(bfqd->queue);
- }
-}
-
-static void bfq_insert_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
- struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
-
- assert_spin_locked(bfqd->queue->queue_lock);
-
- /*
- * An unplug may trigger a requeue of a request from the device
- * driver: make sure we are in process context while trying to
- * merge two bfq_queues.
- */
- if (!in_interrupt()) {
- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
- if (new_bfqq) {
- if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
- new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
- /*
- * Release the request's reference to the old bfqq
- * and make sure one is taken to the shared queue.
- */
- new_bfqq->allocated[rq_data_dir(rq)]++;
- bfqq->allocated[rq_data_dir(rq)]--;
- new_bfqq->ref++;
- bfq_clear_bfqq_just_created(bfqq);
- if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
- bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
- bfqq, new_bfqq);
- /*
- * rq is about to be enqueued into new_bfqq,
- * release rq reference on bfqq
- */
- bfq_put_queue(bfqq);
- rq->elv.priv[1] = new_bfqq;
- bfqq = new_bfqq;
- }
- }
-
- bfq_add_request(rq);
-
- rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
- list_add_tail(&rq->queuelist, &bfqq->fifo);
-
- bfq_rq_enqueued(bfqd, bfqq, rq);
-}
-
-static void bfq_update_hw_tag(struct bfq_data *bfqd)
-{
- bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
- bfqd->rq_in_driver);
-
- if (bfqd->hw_tag == 1)
- return;
-
- /*
- * This sample is valid if the number of outstanding requests
- * is large enough to allow a queueing behavior. Note that the
- * sum is not exact, as it's not taking into account deactivated
- * requests.
- */
- if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
- return;
-
- if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
- return;
-
- bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
- bfqd->max_rq_in_driver = 0;
- bfqd->hw_tag_samples = 0;
-}
-
-static void bfq_completed_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
- struct bfq_data *bfqd = bfqq->bfqd;
- u64 now_ns;
- u32 delta_us;
-
- bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
- blk_rq_sectors(rq));
-
- assert_spin_locked(bfqd->queue->queue_lock);
- bfq_update_hw_tag(bfqd);
-
- BUG_ON(!bfqd->rq_in_driver);
- BUG_ON(!bfqq->dispatched);
- bfqd->rq_in_driver--;
- bfqq->dispatched--;
- bfqg_stats_update_completion(bfqq_group(bfqq),
- rq_start_time_ns(rq),
- rq_io_start_time_ns(rq), req_op(rq),
- rq->cmd_flags);
-
- if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
- BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
- /*
- * Set budget_timeout (which we overload to store the
- * time at which the queue remains with no backlog and
- * no outstanding request; used by the weight-raising
- * mechanism).
- */
- bfqq->budget_timeout = jiffies;
-
- bfq_weights_tree_remove(bfqd, &bfqq->entity,
- &bfqd->queue_weights_tree);
- }
-
- now_ns = ktime_get_ns();
-
- RQ_BIC(rq)->ttime.last_end_request = now_ns;
-
- /*
- * Using us instead of ns, to get a reasonable precision in
- * computing rate in next check.
- */
- delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
-
- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
- delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
- (USEC_PER_SEC*
- (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
- >>BFQ_RATE_SHIFT,
- (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
-
- /*
- * If the request took rather long to complete, and, according
- * to the maximum request size recorded, this completion latency
- * implies that the request was certainly served at a very low
- * rate (less than 1M sectors/sec), then the whole observation
- * interval that lasts up to this time instant cannot be a
- * valid time interval for computing a new peak rate. Invoke
- * bfq_update_rate_reset to have the following three steps
- * taken:
- * - close the observation interval at the last (previous)
- * request dispatch or completion
- * - compute rate, if possible, for that observation interval
- * - reset to zero samples, which will trigger a proper
- * re-initialization of the observation interval on next
- * dispatch
- */
- if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
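- (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
- 1UL<<(BFQ_RATE_SHIFT - 10))
- bfq_update_rate_reset(bfqd, NULL);
- bfqd->last_completion = now_ns;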
-
- /*
- * If we are waiting to discover whether the request pattern
- * of the task associated with the queue is actually
- * isochronous, and both requisites for this condition to hold
- * are now satisfied, then compute soft_rt_next_start (see the
- * comments on the function bfq_bfqq_softrt_next_start()). We
- * schedule this delayed check when bfqq expires, if it still
- * has in-flight requests.
- */
- if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
- RB_EMPTY_ROOT(&bfqq->sort_list))
- bfqq->soft_rt_next_start =
- bfq_bfqq_softrt_next_start(bfqd, bfqq);
-
- /*
- * If this is the in-service queue, check if it needs to be expired,
- * or if we want to idle in case it has no pending requests.
- */
- if (bfqd->in_service_queue == bfqq) {
- if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
- bfq_arm_slice_timer(bfqd);
- goto out;
- } else if (bfq_may_expire_for_budg_timeout(bfqq))
- bfq_bfqq_expire(bfqd, bfqq, false,
- BFQ_BFQQ_BUDGET_TIMEOUT);
- else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
- (bfqq->dispatched == 0 ||
- !bfq_bfqq_may_idle(bfqq)))
- bfq_bfqq_expire(bfqd, bfqq, false,
- BFQ_BFQQ_NO_MORE_REQUESTS);
- }
-
- if (!bfqd->rq_in_driver)
- bfq_schedule_dispatch(bfqd);
-
-out:
- return;
-}
-
-static int __bfq_may_queue(struct bfq_queue *bfqq)
-{
- if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
- bfq_clear_bfqq_must_alloc(bfqq);
- return ELV_MQUEUE_MUST;
- }
-
- return ELV_MQUEUE_MAY;
-}
-
-static int bfq_may_queue(struct request_queue *q, int op, int op_flags)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
- struct task_struct *tsk = current;
- struct bfq_io_cq *bic;
- struct bfq_queue *bfqq;
-
- /*
- * Don't force setup of a queue from here, as a call to may_queue
- * does not necessarily imply that a request actually will be
- * queued. So just lookup a possibly existing queue, or return
- * 'may queue' if that fails.
- */
- bic = bfq_bic_lookup(bfqd, tsk->io_context);
- if (!bic)
- return ELV_MQUEUE_MAY;
-
- bfqq = bic_to_bfqq(bic, rw_is_sync(op, op_flags));
- if (bfqq)
- return __bfq_may_queue(bfqq);
-
- return ELV_MQUEUE_MAY;
-}
-
-/*
- * Queue lock held here.
- */
-static void bfq_put_request(struct request *rq)
-{
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
-
- if (bfqq) {
- const int rw = rq_data_dir(rq);
-
- BUG_ON(!bfqq->allocated[rw]);
- bfqq->allocated[rw]--;
-
- rq->elv.priv[0] = NULL;
- rq->elv.priv[1] = NULL;
-
- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
- bfqq, bfqq->ref);
- bfq_put_queue(bfqq);
- }
-}
-
-/*
- * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
- * was the last process referring to that bfqq.
- */
-static struct bfq_queue *
-bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
-{
- bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
-
- put_io_context(bic->icq.ioc);
-
- if (bfqq_process_refs(bfqq) == 1) {
- bfqq->pid = current->pid;
- bfq_clear_bfqq_coop(bfqq);
- bfq_clear_bfqq_split_coop(bfqq);
- return bfqq;
- }
-
- bic_set_bfqq(bic, NULL, 1);
-
- bfq_put_cooperator(bfqq);
-
- bfq_put_queue(bfqq);
- return NULL;
-}
-
-/*
- * Allocate bfq data structures associated with this request.
- */
-static int bfq_set_request(struct request_queue *q, struct request *rq,
- struct bio *bio, gfp_t gfp_mask)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
- struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
- const int rw = rq_data_dir(rq);
- const int is_sync = rq_is_sync(rq);
- struct bfq_queue *bfqq;
- unsigned long flags;
- bool bfqq_already_existing = false, split = false;
-
- spin_lock_irqsave(q->queue_lock, flags);
-
- if (!bic)
- goto queue_fail;
-
- bfq_check_ioprio_change(bic, bio);
-
- bfq_bic_update_cgroup(bic, bio);
-
-new_queue:
- bfqq = bic_to_bfqq(bic, is_sync);
- if (!bfqq || bfqq == &bfqd->oom_bfqq) {
- if (bfqq)
- bfq_put_queue(bfqq);
- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
- BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
-
- bic_set_bfqq(bic, bfqq, is_sync);
- if (split && is_sync) {
- bfq_log_bfqq(bfqd, bfqq,
- "set_request: was_in_list %d "
- "was_in_large_burst %d "
- "large burst in progress %d",
- bic->was_in_burst_list,
- bic->saved_in_large_burst,
- bfqd->large_burst);
-
- if ((bic->was_in_burst_list && bfqd->large_burst) ||
- bic->saved_in_large_burst) {
- bfq_log_bfqq(bfqd, bfqq,
- "set_request: marking in "
- "large burst");
- bfq_mark_bfqq_in_large_burst(bfqq);
- } else {
- bfq_log_bfqq(bfqd, bfqq,
- "set_request: clearing in "
- "large burst");
- bfq_clear_bfqq_in_large_burst(bfqq);
- if (bic->was_in_burst_list)
- hlist_add_head(&bfqq->burst_list_node,
- &bfqd->burst_list);
- }
- bfqq->split_time = jiffies;
- }
- } else {
- /* If the queue was seeky for too long, break it apart. */
- if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
- bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
-
- /* Update bic before losing reference to bfqq */
- if (bfq_bfqq_in_large_burst(bfqq))
- bic->saved_in_large_burst = true;
-
- bfqq = bfq_split_bfqq(bic, bfqq);
- split = true;
- if (!bfqq)
- goto new_queue;
- else
- bfqq_already_existing = true;
- }
- }
-
- bfqq->allocated[rw]++;
- bfqq->ref++;
- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
-
- rq->elv.priv[0] = bic;
- rq->elv.priv[1] = bfqq;
-
- /*
- * If a bfq_queue has only one process reference, it is owned
- * by only one bfq_io_cq: we can set the bic field of the
- * bfq_queue to the address of that structure. Also, if the
- * queue has just been split, mark a flag so that the
- * information is available to the other scheduler hooks.
- */
- if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
- bfqq->bic = bic;
- if (split) {
- /*
- * If the queue has just been split from a shared
- * queue, restore the idle window and the possible
- * weight raising period.
- */
- bfq_bfqq_resume_state(bfqq, bfqd, bic,
- bfqq_already_existing);
- }
- }
-
- if (unlikely(bfq_bfqq_just_created(bfqq)))
- bfq_handle_burst(bfqd, bfqq);
-
- spin_unlock_irqrestore(q->queue_lock, flags);
-
- return 0;
-
-queue_fail:
- bfq_schedule_dispatch(bfqd);
- spin_unlock_irqrestore(q->queue_lock, flags);
-
- return 1;
-}
-
-static void bfq_kick_queue(struct work_struct *work)
-{
- struct bfq_data *bfqd =
- container_of(work, struct bfq_data, unplug_work);
- struct request_queue *q = bfqd->queue;
-
- spin_lock_irq(q->queue_lock);
- __blk_run_queue(q);
- spin_unlock_irq(q->queue_lock);
-}
-
-/*
- * Handler of the expiration of the timer running if the in-service queue
- * is idling inside its time slice.
- */
-static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
-{
- struct bfq_data *bfqd = container_of(timer, struct bfq_data,
- idle_slice_timer);
- struct bfq_queue *bfqq;
- unsigned long flags;
- enum bfqq_expiration reason;
-
- spin_lock_irqsave(bfqd->queue->queue_lock, flags);
-
- bfqq = bfqd->in_service_queue;
- /*
- * Theoretical race here: the in-service queue can be NULL or
- * different from the queue that was idling if the timer handler
- * spins on the queue_lock and a new request arrives for the
- * current queue and there is a full dispatch cycle that changes
- * the in-service queue. This can hardly happen, but in the worst
- * case we just expire a queue too early.
- */
- if (bfqq) {
- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
- bfq_clear_bfqq_wait_request(bfqq);
-
- if (bfq_bfqq_budget_timeout(bfqq))
- /*
- * Also here the queue can be safely expired
- * for budget timeout without wasting
- * guarantees
- */
- reason = BFQ_BFQQ_BUDGET_TIMEOUT;
- else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
- /*
- * The queue may not be empty upon timer expiration,
- * because we may not disable the timer when the
- * first request of the in-service queue arrives
- * during disk idling.
- */
- reason = BFQ_BFQQ_TOO_IDLE;
- else
- goto schedule_dispatch;
-
- bfq_bfqq_expire(bfqd, bfqq, true, reason);
- }
-
-schedule_dispatch:
- bfq_schedule_dispatch(bfqd);
-
- spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
- return HRTIMER_NORESTART;
-}
-
-static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
-{
- hrtimer_cancel(&bfqd->idle_slice_timer);
- cancel_work_sync(&bfqd->unplug_work);
-}
-
-static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
- struct bfq_queue **bfqq_ptr)
-{
- struct bfq_group *root_group = bfqd->root_group;
- struct bfq_queue *bfqq = *bfqq_ptr;
-
- bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
- if (bfqq) {
- bfq_bfqq_move(bfqd, bfqq, root_group);
- bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
- bfqq, bfqq->ref);
- bfq_put_queue(bfqq);
- *bfqq_ptr = NULL;
- }
-}
-
-/*
- * Release all the bfqg references to its async queues. If we are
- * deallocating the group these queues may still contain requests, so
- * we reparent them to the root cgroup (i.e., the only one that will
- * exist for sure until all the requests on a device are gone).
- */
-static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
-{
- int i, j;
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < IOPRIO_BE_NR; j++)
- __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
-
- __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
-}
-
-static void bfq_exit_queue(struct elevator_queue *e)
-{
- struct bfq_data *bfqd = e->elevator_data;
- struct request_queue *q = bfqd->queue;
- struct bfq_queue *bfqq, *n;
-
- bfq_shutdown_timer_wq(bfqd);
-
- spin_lock_irq(q->queue_lock);
-
- BUG_ON(bfqd->in_service_queue);
- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
- bfq_deactivate_bfqq(bfqd, bfqq, false, false);
-
- spin_unlock_irq(q->queue_lock);
-
- bfq_shutdown_timer_wq(bfqd);
-
- BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- blkcg_deactivate_policy(q, &blkcg_policy_bfq);
-#else
- bfq_put_async_queues(bfqd, bfqd->root_group);
- kfree(bfqd->root_group);
-#endif
-
- kfree(bfqd);
-}
-
-static void bfq_init_root_group(struct bfq_group *root_group,
- struct bfq_data *bfqd)
-{
- int i;
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- root_group->entity.parent = NULL;
- root_group->my_entity = NULL;
- root_group->bfqd = bfqd;
-#endif
- root_group->rq_pos_tree = RB_ROOT;
- for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
- root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
- root_group->sched_data.bfq_class_idle_last_service = jiffies;
-}
-
-static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
-{
- struct bfq_data *bfqd;
- struct elevator_queue *eq;
-
- eq = elevator_alloc(q, e);
- if (!eq)
- return -ENOMEM;
-
- bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
- if (!bfqd) {
- kobject_put(&eq->kobj);
- return -ENOMEM;
- }
- eq->elevator_data = bfqd;
-
- /*
- * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
- * Grab a permanent reference to it, so that the normal code flow
- * will not attempt to free it.
- */
- bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
- bfqd->oom_bfqq.ref++;
- bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
- bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
- bfqd->oom_bfqq.entity.new_weight =
- bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
-
- /* oom_bfqq does not participate to bursts */
- bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
- /*
- * Trigger weight initialization, according to ioprio, at the
- * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
- * class won't be changed any more.
- */
- bfqd->oom_bfqq.entity.prio_changed = 1;
-
- bfqd->queue = q;
-
- spin_lock_irq(q->queue_lock);
- q->elevator = eq;
- spin_unlock_irq(q->queue_lock);
-
- bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
- if (!bfqd->root_group)
- goto out_free;
- bfq_init_root_group(bfqd->root_group, bfqd);
- bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
-
- hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
-
- bfqd->queue_weights_tree = RB_ROOT;
- bfqd->group_weights_tree = RB_ROOT;
-
- INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
-
- INIT_LIST_HEAD(&bfqd->active_list);
- INIT_LIST_HEAD(&bfqd->idle_list);
- INIT_HLIST_HEAD(&bfqd->burst_list);
-
- bfqd->hw_tag = -1;
-
- bfqd->bfq_max_budget = bfq_default_max_budget;
-
- bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
- bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
- bfqd->bfq_back_max = bfq_back_max;
- bfqd->bfq_back_penalty = bfq_back_penalty;
- bfqd->bfq_slice_idle = bfq_slice_idle;
- bfqd->bfq_timeout = bfq_timeout;
-
- bfqd->bfq_requests_within_timer = 120;
-
- bfqd->bfq_large_burst_thresh = 8;
- bfqd->bfq_burst_interval = msecs_to_jiffies(180);
-
- bfqd->low_latency = true;
-
- /*
- * Trade-off between responsiveness and fairness.
- */
- bfqd->bfq_wr_coeff = 30;
- bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
- bfqd->bfq_wr_max_time = 0;
- bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
- bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
- bfqd->bfq_wr_max_softrt_rate = 7000; /*
- * Approximate rate required
- * to playback or record a
- * high-definition compressed
- * video.
- */
- bfqd->wr_busy_queues = 0;
-
- /*
- * Begin by assuming, optimistically, that the device is a
- * high-speed one, and that its peak rate is equal to 2/3 of
- * the highest reference rate.
- */
- bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
- T_fast[blk_queue_nonrot(bfqd->queue)];
- bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
- bfqd->device_speed = BFQ_BFQD_FAST;
-
- return 0;
-
-out_free:
- kfree(bfqd);
- kobject_put(&eq->kobj);
- return -ENOMEM;
-}
-
-static void bfq_slab_kill(void)
-{
- kmem_cache_destroy(bfq_pool);
-}
-
-static int __init bfq_slab_setup(void)
-{
- bfq_pool = KMEM_CACHE(bfq_queue, 0);
- if (!bfq_pool)
- return -ENOMEM;
- return 0;
-}
-
-static ssize_t bfq_var_show(unsigned int var, char *page)
-{
- return sprintf(page, "%u\n", var);
-}
-
-static ssize_t bfq_var_store(unsigned long *var, const char *page,
- size_t count)
-{
- unsigned long new_val;
- int ret = kstrtoul(page, 10, &new_val);
-
- if (ret == 0)
- *var = new_val;
-
- return count;
-}
-
-static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
-{
- struct bfq_data *bfqd = e->elevator_data;
-
- return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
- jiffies_to_msecs(bfqd->bfq_wr_max_time) :
- jiffies_to_msecs(bfq_wr_duration(bfqd)));
-}
-
-static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
-{
- struct bfq_queue *bfqq;
- struct bfq_data *bfqd = e->elevator_data;
- ssize_t num_char = 0;
-
- num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
- bfqd->queued);
-
- spin_lock_irq(bfqd->queue->queue_lock);
-
- num_char += sprintf(page + num_char, "Active:\n");
- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
- num_char += sprintf(page + num_char,
- "pid%d: weight %hu, nr_queued %d %d, ",
- bfqq->pid,
- bfqq->entity.weight,
- bfqq->queued[0],
- bfqq->queued[1]);
- num_char += sprintf(page + num_char,
- "dur %d/%u\n",
- jiffies_to_msecs(
- jiffies -
- bfqq->last_wr_start_finish),
- jiffies_to_msecs(bfqq->wr_cur_max_time));
- }
-
- num_char += sprintf(page + num_char, "Idle:\n");
- list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
- num_char += sprintf(page + num_char,
- "pid%d: weight %hu, dur %d/%u\n",
- bfqq->pid,
- bfqq->entity.weight,
- jiffies_to_msecs(jiffies -
- bfqq->last_wr_start_finish),
- jiffies_to_msecs(bfqq->wr_cur_max_time));
- }
-
- spin_unlock_irq(bfqd->queue->queue_lock);
-
- return num_char;
-}
-
-#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
-static ssize_t __FUNC(struct elevator_queue *e, char *page) \
-{ \
- struct bfq_data *bfqd = e->elevator_data; \
- u64 __data = __VAR; \
- if (__CONV == 1) \
- __data = jiffies_to_msecs(__data); \
- else if (__CONV == 2) \
- __data = div_u64(__data, NSEC_PER_MSEC); \
- return bfq_var_show(__data, (page)); \
-}
-SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
-SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
-SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
-SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
-SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
-SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
-SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
-SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
-SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
-SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
-SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
-SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
-SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
- 1);
-SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
-#undef SHOW_FUNCTION
-
-#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
-static ssize_t __FUNC(struct elevator_queue *e, char *page) \
-{ \
- struct bfq_data *bfqd = e->elevator_data; \
- u64 __data = __VAR; \
- __data = div_u64(__data, NSEC_PER_USEC); \
- return bfq_var_show(__data, (page)); \
-}
-USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
-#undef USEC_SHOW_FUNCTION
-
-#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
-static ssize_t \
-__FUNC(struct elevator_queue *e, const char *page, size_t count) \
-{ \
- struct bfq_data *bfqd = e->elevator_data; \
- unsigned long uninitialized_var(__data); \
- int ret = bfq_var_store(&__data, (page), count); \
- if (__data < (MIN)) \
- __data = (MIN); \
- else if (__data > (MAX)) \
- __data = (MAX); \
- if (__CONV == 1) \
- *(__PTR) = msecs_to_jiffies(__data); \
- else if (__CONV == 2) \
- *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
- else \
- *(__PTR) = __data; \
- return ret; \
-}
-STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
- INT_MAX, 2);
-STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
- INT_MAX, 2);
-STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
-STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
- INT_MAX, 0);
-STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
-STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
-STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
-STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
- 1);
-STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
- INT_MAX, 1);
-STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
- &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
-STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
- INT_MAX, 0);
-#undef STORE_FUNCTION
-
-#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
-static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
-{ \
- struct bfq_data *bfqd = e->elevator_data; \
- unsigned long uninitialized_var(__data); \
- int ret = bfq_var_store(&__data, (page), count); \
- if (__data < (MIN)) \
- __data = (MIN); \
- else if (__data > (MAX)) \
- __data = (MAX); \
- *(__PTR) = (u64)__data * NSEC_PER_USEC; \
- return ret; \
-}
-USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
- UINT_MAX);
-#undef USEC_STORE_FUNCTION
-
-/* do nothing for the moment */
-static ssize_t bfq_weights_store(struct elevator_queue *e,
- const char *page, size_t count)
-{
- return count;
-}
-
-static ssize_t bfq_max_budget_store(struct elevator_queue *e,
- const char *page, size_t count)
-{
- struct bfq_data *bfqd = e->elevator_data;
- unsigned long uninitialized_var(__data);
- int ret = bfq_var_store(&__data, (page), count);
-
- if (__data == 0)
- bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
- else {
- if (__data > INT_MAX)
- __data = INT_MAX;
- bfqd->bfq_max_budget = __data;
- }
-
- bfqd->bfq_user_max_budget = __data;
-
- return ret;
-}
-
-/*
- * Leaving this name to preserve name compatibility with cfq
- * parameters, but this timeout is used for both sync and async.
- */
-static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
- const char *page, size_t count)
-{
- struct bfq_data *bfqd = e->elevator_data;
- unsigned long uninitialized_var(__data);
- int ret = bfq_var_store(&__data, (page), count);
-
- if (__data < 1)
- __data = 1;
- else if (__data > INT_MAX)
- __data = INT_MAX;
-
- bfqd->bfq_timeout = msecs_to_jiffies(__data);
- if (bfqd->bfq_user_max_budget == 0)
- bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
-
- return ret;
-}
-
-static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
- const char *page, size_t count)
-{
- struct bfq_data *bfqd = e->elevator_data;
- unsigned long uninitialized_var(__data);
- int ret = bfq_var_store(&__data, (page), count);
-
- if (__data > 1)
- __data = 1;
- if (!bfqd->strict_guarantees && __data == 1
- && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
- bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
-
- bfqd->strict_guarantees = __data;
-
- return ret;
-}
-
-static ssize_t bfq_low_latency_store(struct elevator_queue *e,
- const char *page, size_t count)
-{
- struct bfq_data *bfqd = e->elevator_data;
- unsigned long uninitialized_var(__data);
- int ret = bfq_var_store(&__data, (page), count);
-
- if (__data > 1)
- __data = 1;
- if (__data == 0 && bfqd->low_latency != 0)
- bfq_end_wr(bfqd);
- bfqd->low_latency = __data;
-
- return ret;
-}
-
-#define BFQ_ATTR(name) \
- __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
-
-static struct elv_fs_entry bfq_attrs[] = {
- BFQ_ATTR(fifo_expire_sync),
- BFQ_ATTR(fifo_expire_async),
- BFQ_ATTR(back_seek_max),
- BFQ_ATTR(back_seek_penalty),
- BFQ_ATTR(slice_idle),
- BFQ_ATTR(slice_idle_us),
- BFQ_ATTR(max_budget),
- BFQ_ATTR(timeout_sync),
- BFQ_ATTR(strict_guarantees),
- BFQ_ATTR(low_latency),
- BFQ_ATTR(wr_coeff),
- BFQ_ATTR(wr_max_time),
- BFQ_ATTR(wr_rt_max_time),
- BFQ_ATTR(wr_min_idle_time),
- BFQ_ATTR(wr_min_inter_arr_async),
- BFQ_ATTR(wr_max_softrt_rate),
- BFQ_ATTR(weights),
- __ATTR_NULL
-};
-
-static struct elevator_type iosched_bfq = {
- .ops = {
- .elevator_merge_fn = bfq_merge,
- .elevator_merged_fn = bfq_merged_request,
- .elevator_merge_req_fn = bfq_merged_requests,
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- .elevator_bio_merged_fn = bfq_bio_merged,
-#endif
- .elevator_allow_bio_merge_fn = bfq_allow_bio_merge,
- .elevator_allow_rq_merge_fn = bfq_allow_rq_merge,
- .elevator_dispatch_fn = bfq_dispatch_requests,
- .elevator_add_req_fn = bfq_insert_request,
- .elevator_activate_req_fn = bfq_activate_request,
- .elevator_deactivate_req_fn = bfq_deactivate_request,
- .elevator_completed_req_fn = bfq_completed_request,
- .elevator_former_req_fn = elv_rb_former_request,
- .elevator_latter_req_fn = elv_rb_latter_request,
- .elevator_init_icq_fn = bfq_init_icq,
- .elevator_exit_icq_fn = bfq_exit_icq,
- .elevator_set_req_fn = bfq_set_request,
- .elevator_put_req_fn = bfq_put_request,
- .elevator_may_queue_fn = bfq_may_queue,
- .elevator_init_fn = bfq_init_queue,
- .elevator_exit_fn = bfq_exit_queue,
- },
- .icq_size = sizeof(struct bfq_io_cq),
- .icq_align = __alignof__(struct bfq_io_cq),
- .elevator_attrs = bfq_attrs,
- .elevator_name = "bfq",
- .elevator_owner = THIS_MODULE,
-};
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static struct blkcg_policy blkcg_policy_bfq = {
- .dfl_cftypes = bfq_blkg_files,
- .legacy_cftypes = bfq_blkcg_legacy_files,
-
- .cpd_alloc_fn = bfq_cpd_alloc,
- .cpd_init_fn = bfq_cpd_init,
- .cpd_bind_fn = bfq_cpd_init,
- .cpd_free_fn = bfq_cpd_free,
-
- .pd_alloc_fn = bfq_pd_alloc,
- .pd_init_fn = bfq_pd_init,
- .pd_offline_fn = bfq_pd_offline,
- .pd_free_fn = bfq_pd_free,
- .pd_reset_stats_fn = bfq_pd_reset_stats,
-};
-#endif
-
-static int __init bfq_init(void)
-{
- int ret;
- char msg[60] = "BFQ I/O-scheduler: v8r12";
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- ret = blkcg_policy_register(&blkcg_policy_bfq);
- if (ret)
- return ret;
-#endif
-
- ret = -ENOMEM;
- if (bfq_slab_setup())
- goto err_pol_unreg;
-
- /*
- * Times to load large popular applications for the typical
- * systems installed on the reference devices (see the
- * comments before the definitions of the next two
- * arrays). Actually, we use slightly slower values, as the
- * estimated peak rate tends to be smaller than the actual
- * peak rate. The reason for this last fact is that estimates
- * are computed over much shorter time intervals than the long
- * intervals typically used for benchmarking. Why? First, to
- * adapt more quickly to variations. Second, because an I/O
- * scheduler cannot rely on a peak-rate-evaluation workload to
- * be run for a long time.
- */
- T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
- T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
- T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
- T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
-
- /*
- * Thresholds that determine the switch between speed classes
- * (see the comments before the definition of the array
- * device_speed_thresh). These thresholds are biased towards
- * transitions to the fast class. This is safer than the
- * opposite bias. In fact, a wrong transition to the slow
- * class results in short weight-raising periods, because the
- * speed of the device then tends to be higher that the
- * reference peak rate. On the opposite end, a wrong
- * transition to the fast class tends to increase
- * weight-raising periods, because of the opposite reason.
- */
- device_speed_thresh[0] = (4 * R_slow[0]) / 3;
- device_speed_thresh[1] = (4 * R_slow[1]) / 3;
-
- ret = elv_register(&iosched_bfq);
- if (ret)
- goto err_pol_unreg;
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- strcat(msg, " (with cgroups support)");
-#endif
- pr_info("%s", msg);
-
- return 0;
-
-err_pol_unreg:
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- blkcg_policy_unregister(&blkcg_policy_bfq);
-#endif
- return ret;
-}
-
-static void __exit bfq_exit(void)
-{
- elv_unregister(&iosched_bfq);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- blkcg_policy_unregister(&blkcg_policy_bfq);
-#endif
- bfq_slab_kill();
-}
-
-module_init(bfq_init);
-module_exit(bfq_exit);
-
-MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente");
-MODULE_LICENSE("GPL");
diff --git a/block/bfq-sched.c b/block/bfq-sched.c
deleted file mode 100644
index be985d9d5f17..000000000000
--- a/block/bfq-sched.c
+++ /dev/null
@@ -1,2025 +0,0 @@
-/*
- * BFQ: Hierarchical B-WF2Q+ scheduler.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe
- *
- * Copyright (C) 2008 Fabio Checconi
- * Paolo Valente
- *
- * Copyright (C) 2015 Paolo Valente
- *
- * Copyright (C) 2016 Paolo Valente
- */
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-
-/**
- * bfq_gt - compare two timestamps.
- * @a: first ts.
- * @b: second ts.
- *
- * Return @a > @b, dealing with wrapping correctly.
- */
-static int bfq_gt(u64 a, u64 b)
-{
- return (s64)(a - b) > 0;
-}
-
-static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
-{
- struct rb_node *node = tree->rb_node;
-
- return rb_entry(node, struct bfq_entity, rb_node);
-}
-
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
-
-static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
-
-/**
- * bfq_update_next_in_service - update sd->next_in_service
- * @sd: sched_data for which to perform the update.
- * @new_entity: if not NULL, pointer to the entity whose activation,
- * requeueing or repositionig triggered the invocation of
- * this function.
- *
- * This function is called to update sd->next_in_service, which, in
- * its turn, may change as a consequence of the insertion or
- * extraction of an entity into/from one of the active trees of
- * sd. These insertions/extractions occur as a consequence of
- * activations/deactivations of entities, with some activations being
- * 'true' activations, and other activations being requeueings (i.e.,
- * implementing the second, requeueing phase of the mechanism used to
- * reposition an entity in its active tree; see comments on
- * __bfq_activate_entity and __bfq_requeue_entity for details). In
- * both the last two activation sub-cases, new_entity points to the
- * just activated or requeued entity.
- *
- * Returns true if sd->next_in_service changes in such a way that
- * entity->parent may become the next_in_service for its parent
- * entity.
- */
-static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
- struct bfq_entity *new_entity)
-{
- struct bfq_entity *next_in_service = sd->next_in_service;
- struct bfq_queue *bfqq;
- bool parent_sched_may_change = false;
-
- /*
- * If this update is triggered by the activation, requeueing
- * or repositiong of an entity that does not coincide with
- * sd->next_in_service, then a full lookup in the active tree
- * can be avoided. In fact, it is enough to check whether the
- * just-modified entity has a higher priority than
- * sd->next_in_service, or, even if it has the same priority
- * as sd->next_in_service, is eligible and has a lower virtual
- * finish time than sd->next_in_service. If this compound
- * condition holds, then the new entity becomes the new
- * next_in_service. Otherwise no change is needed.
- */
- if (new_entity && new_entity != sd->next_in_service) {
- /*
- * Flag used to decide whether to replace
- * sd->next_in_service with new_entity. Tentatively
- * set to true, and left as true if
- * sd->next_in_service is NULL.
- */
- bool replace_next = true;
-
- /*
- * If there is already a next_in_service candidate
- * entity, then compare class priorities or timestamps
- * to decide whether to replace sd->service_tree with
- * new_entity.
- */
- if (next_in_service) {
- unsigned int new_entity_class_idx =
- bfq_class_idx(new_entity);
- struct bfq_service_tree *st =
- sd->service_tree + new_entity_class_idx;
-
- /*
- * For efficiency, evaluate the most likely
- * sub-condition first.
- */
- replace_next =
- (new_entity_class_idx ==
- bfq_class_idx(next_in_service)
- &&
- !bfq_gt(new_entity->start, st->vtime)
- &&
- bfq_gt(next_in_service->finish,
- new_entity->finish))
- ||
- new_entity_class_idx <
- bfq_class_idx(next_in_service);
- }
-
- if (replace_next)
- next_in_service = new_entity;
- } else /* invoked because of a deactivation: lookup needed */
- next_in_service = bfq_lookup_next_entity(sd);
-
- if (next_in_service) {
- parent_sched_may_change = !sd->next_in_service ||
- bfq_update_parent_budget(next_in_service);
- }
-
- sd->next_in_service = next_in_service;
-
- if (!next_in_service)
- return parent_sched_may_change;
-
- bfqq = bfq_entity_to_bfqq(next_in_service);
- if (bfqq)
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "update_next_in_service: chosen this queue");
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- struct bfq_group *bfqg =
- container_of(next_in_service,
- struct bfq_group, entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "update_next_in_service: chosen this entity");
- }
-#endif
- return parent_sched_may_change;
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-/* both next loops stop at one of the child entities of the root group */
-#define for_each_entity(entity) \
- for (; entity ; entity = entity->parent)
-
-/*
- * For each iteration, compute parent in advance, so as to be safe if
- * entity is deallocated during the iteration. Such a deallocation may
- * happen as a consequence of a bfq_put_queue that frees the bfq_queue
- * containing entity.
- */
-#define for_each_entity_safe(entity, parent) \
- for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
-
-/*
- * Returns true if this budget changes may let next_in_service->parent
- * become the next_in_service entity for its parent entity.
- */
-static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
-{
- struct bfq_entity *bfqg_entity;
- struct bfq_group *bfqg;
- struct bfq_sched_data *group_sd;
- bool ret = false;
-
- BUG_ON(!next_in_service);
-
- group_sd = next_in_service->sched_data;
-
- bfqg = container_of(group_sd, struct bfq_group, sched_data);
- /*
- * bfq_group's my_entity field is not NULL only if the group
- * is not the root group. We must not touch the root entity
- * as it must never become an in-service entity.
- */
- bfqg_entity = bfqg->my_entity;
- if (bfqg_entity) {
- if (bfqg_entity->budget > next_in_service->budget)
- ret = true;
- bfqg_entity->budget = next_in_service->budget;
- }
-
- return ret;
-}
-
-/*
- * This function tells whether entity stops being a candidate for next
- * service, according to the restrictive definition of the field
- * next_in_service. In particular, this function is invoked for an
- * entity that is about to be set in service.
- *
- * If entity is a queue, then the entity is no longer a candidate for
- * next service according to the that definition, because entity is
- * about to become the in-service queue. This function then returns
- * true if entity is a queue.
- *
- * In contrast, entity could still be a candidate for next service if
- * it is not a queue, and has more than one active child. In fact,
- * even if one of its children is about to be set in service, other
- * active children may still be the next to serve, for the parent
- * entity, even according to the above definition. As a consequence, a
- * non-queue entity is not a candidate for next-service only if it has
- * only one active child. And only if this condition holds, then this
- * function returns true for a non-queue entity.
- */
-static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
-{
- struct bfq_group *bfqg;
-
- if (bfq_entity_to_bfqq(entity))
- return true;
-
- bfqg = container_of(entity, struct bfq_group, entity);
-
- BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group);
- BUG_ON(bfqg->active_entities == 0);
- /*
- * The field active_entities does not always contain the
- * actual number of active children entities: it happens to
- * not account for the in-service entity in case the latter is
- * removed from its active tree (which may get done after
- * invoking the function bfq_no_longer_next_in_service in
- * bfq_get_next_queue). Fortunately, here, i.e., while
- * bfq_no_longer_next_in_service is not yet completed in
- * bfq_get_next_queue, bfq_active_extract has not yet been
- * invoked, and thus active_entities still coincides with the
- * actual number of active entities.
- */
- if (bfqg->active_entities == 1)
- return true;
-
- return false;
-}
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-#define for_each_entity(entity) \
- for (; entity ; entity = NULL)
-
-#define for_each_entity_safe(entity, parent) \
- for (parent = NULL; entity ; entity = parent)
-
-static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
-{
- return false;
-}
-
-static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
-{
- return true;
-}
-
-#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-
-/*
- * Shift for timestamp calculations. This actually limits the maximum
- * service allowed in one timestamp delta (small shift values increase it),
- * the maximum total weight that can be used for the queues in the system
- * (big shift values increase it), and the period of virtual time
- * wraparounds.
- */
-#define WFQ_SERVICE_SHIFT 22
-
-static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
-{
- struct bfq_queue *bfqq = NULL;
-
- BUG_ON(!entity);
-
- if (!entity->my_sched_data)
- bfqq = container_of(entity, struct bfq_queue, entity);
-
- return bfqq;
-}
-
-
-/**
- * bfq_delta - map service into the virtual time domain.
- * @service: amount of service.
- * @weight: scale factor (weight of an entity or weight sum).
- */
-static u64 bfq_delta(unsigned long service, unsigned long weight)
-{
- u64 d = (u64)service << WFQ_SERVICE_SHIFT;
-
- do_div(d, weight);
- return d;
-}
-
-/**
- * bfq_calc_finish - assign the finish time to an entity.
- * @entity: the entity to act upon.
- * @service: the service to be charged to the entity.
- */
-static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- unsigned long long start, finish, delta;
-
- BUG_ON(entity->weight == 0);
-
- entity->finish = entity->start +
- bfq_delta(service, entity->weight);
-
- start = ((entity->start>>10)*1000)>>12;
- finish = ((entity->finish>>10)*1000)>>12;
- delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12;
-
- if (bfqq) {
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "calc_finish: serv %lu, w %d",
- service, entity->weight);
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "calc_finish: start %llu, finish %llu, delta %llu",
- start, finish, delta);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- } else {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "calc_finish group: serv %lu, w %d",
- service, entity->weight);
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "calc_finish group: start %llu, finish %llu, delta %llu",
- start, finish, delta);
-#endif
- }
-}
-
-/**
- * bfq_entity_of - get an entity from a node.
- * @node: the node field of the entity.
- *
- * Convert a node pointer to the relative entity. This is used only
- * to simplify the logic of some functions and not as the generic
- * conversion mechanism because, e.g., in the tree walking functions,
- * the check for a %NULL value would be redundant.
- */
-static struct bfq_entity *bfq_entity_of(struct rb_node *node)
-{
- struct bfq_entity *entity = NULL;
-
- if (node)
- entity = rb_entry(node, struct bfq_entity, rb_node);
-
- return entity;
-}
-
-/**
- * bfq_extract - remove an entity from a tree.
- * @root: the tree root.
- * @entity: the entity to remove.
- */
-static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
-{
- BUG_ON(entity->tree != root);
-
- entity->tree = NULL;
- rb_erase(&entity->rb_node, root);
-}
-
-/**
- * bfq_idle_extract - extract an entity from the idle tree.
- * @st: the service tree of the owning @entity.
- * @entity: the entity being removed.
- */
-static void bfq_idle_extract(struct bfq_service_tree *st,
- struct bfq_entity *entity)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- struct rb_node *next;
-
- BUG_ON(entity->tree != &st->idle);
-
- if (entity == st->first_idle) {
- next = rb_next(&entity->rb_node);
- st->first_idle = bfq_entity_of(next);
- }
-
- if (entity == st->last_idle) {
- next = rb_prev(&entity->rb_node);
- st->last_idle = bfq_entity_of(next);
- }
-
- bfq_extract(&st->idle, entity);
-
- if (bfqq)
- list_del(&bfqq->bfqq_list);
-}
-
-/**
- * bfq_insert - generic tree insertion.
- * @root: tree root.
- * @entity: entity to insert.
- *
- * This is used for the idle and the active tree, since they are both
- * ordered by finish time.
- */
-static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
-{
- struct bfq_entity *entry;
- struct rb_node **node = &root->rb_node;
- struct rb_node *parent = NULL;
-
- BUG_ON(entity->tree);
-
- while (*node) {
- parent = *node;
- entry = rb_entry(parent, struct bfq_entity, rb_node);
-
- if (bfq_gt(entry->finish, entity->finish))
- node = &parent->rb_left;
- else
- node = &parent->rb_right;
- }
-
- rb_link_node(&entity->rb_node, parent, node);
- rb_insert_color(&entity->rb_node, root);
-
- entity->tree = root;
-}
-
-/**
- * bfq_update_min - update the min_start field of a entity.
- * @entity: the entity to update.
- * @node: one of its children.
- *
- * This function is called when @entity may store an invalid value for
- * min_start due to updates to the active tree. The function assumes
- * that the subtree rooted at @node (which may be its left or its right
- * child) has a valid min_start value.
- */
-static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
-{
- struct bfq_entity *child;
-
- if (node) {
- child = rb_entry(node, struct bfq_entity, rb_node);
- if (bfq_gt(entity->min_start, child->min_start))
- entity->min_start = child->min_start;
- }
-}
-
-/**
- * bfq_update_active_node - recalculate min_start.
- * @node: the node to update.
- *
- * @node may have changed position or one of its children may have moved,
- * this function updates its min_start value. The left and right subtrees
- * are assumed to hold a correct min_start value.
- */
-static void bfq_update_active_node(struct rb_node *node)
-{
- struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
- entity->min_start = entity->start;
- bfq_update_min(entity, node->rb_right);
- bfq_update_min(entity, node->rb_left);
-
- if (bfqq) {
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "update_active_node: new min_start %llu",
- ((entity->min_start>>10)*1000)>>12);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- } else {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "update_active_node: new min_start %llu",
- ((entity->min_start>>10)*1000)>>12);
-#endif
- }
-}
-
-/**
- * bfq_update_active_tree - update min_start for the whole active tree.
- * @node: the starting node.
- *
- * @node must be the deepest modified node after an update. This function
- * updates its min_start using the values held by its children, assuming
- * that they did not change, and then updates all the nodes that may have
- * changed in the path to the root. The only nodes that may have changed
- * are the ones in the path or their siblings.
- */
-static void bfq_update_active_tree(struct rb_node *node)
-{
- struct rb_node *parent;
-
-up:
- bfq_update_active_node(node);
-
- parent = rb_parent(node);
- if (!parent)
- return;
-
- if (node == parent->rb_left && parent->rb_right)
- bfq_update_active_node(parent->rb_right);
- else if (parent->rb_left)
- bfq_update_active_node(parent->rb_left);
-
- node = parent;
- goto up;
-}
-
-static void bfq_weights_tree_add(struct bfq_data *bfqd,
- struct bfq_entity *entity,
- struct rb_root *root);
-
-static void bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_entity *entity,
- struct rb_root *root);
-
-
-/**
- * bfq_active_insert - insert an entity in the active tree of its
- * group/device.
- * @st: the service tree of the entity.
- * @entity: the entity being inserted.
- *
- * The active tree is ordered by finish time, but an extra key is kept
- * per each node, containing the minimum value for the start times of
- * its children (and the node itself), so it's possible to search for
- * the eligible node with the lowest finish time in logarithmic time.
- */
-static void bfq_active_insert(struct bfq_service_tree *st,
- struct bfq_entity *entity)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- struct rb_node *node = &entity->rb_node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd = NULL;
- struct bfq_group *bfqg = NULL;
- struct bfq_data *bfqd = NULL;
-#endif
-
- bfq_insert(&st->active, entity);
-
- if (node->rb_left)
- node = node->rb_left;
- else if (node->rb_right)
- node = node->rb_right;
-
- bfq_update_active_tree(node);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- BUG_ON(!bfqg);
- bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
- if (bfqq)
- list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else { /* bfq_group */
- BUG_ON(!bfqd);
- bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
- }
- if (bfqg != bfqd->root_group) {
- BUG_ON(!bfqg);
- BUG_ON(!bfqd);
- bfqg->active_entities++;
- }
-#endif
-}
-
-/**
- * bfq_ioprio_to_weight - calc a weight from an ioprio.
- * @ioprio: the ioprio value to convert.
- */
-static unsigned short bfq_ioprio_to_weight(int ioprio)
-{
- BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
- return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
-}
-
-/**
- * bfq_weight_to_ioprio - calc an ioprio from a weight.
- * @weight: the weight value to convert.
- *
- * To preserve as much as possible the old only-ioprio user interface,
- * 0 is used as an escape ioprio value for weights (numerically) equal or
- * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
- */
-static unsigned short bfq_weight_to_ioprio(int weight)
-{
- BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
- return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?
- 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;
-}
-
-static void bfq_get_entity(struct bfq_entity *entity)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
- if (bfqq) {
- bfqq->ref++;
- bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
- bfqq, bfqq->ref);
- }
-}
-
-/**
- * bfq_find_deepest - find the deepest node that an extraction can modify.
- * @node: the node being removed.
- *
- * Do the first step of an extraction in an rb tree, looking for the
- * node that will replace @node, and returning the deepest node that
- * the following modifications to the tree can touch. If @node is the
- * last node in the tree return %NULL.
- */
-static struct rb_node *bfq_find_deepest(struct rb_node *node)
-{
- struct rb_node *deepest;
-
- if (!node->rb_right && !node->rb_left)
- deepest = rb_parent(node);
- else if (!node->rb_right)
- deepest = node->rb_left;
- else if (!node->rb_left)
- deepest = node->rb_right;
- else {
- deepest = rb_next(node);
- if (deepest->rb_right)
- deepest = deepest->rb_right;
- else if (rb_parent(deepest) != node)
- deepest = rb_parent(deepest);
- }
-
- return deepest;
-}
-
-/**
- * bfq_active_extract - remove an entity from the active tree.
- * @st: the service_tree containing the tree.
- * @entity: the entity being removed.
- */
-static void bfq_active_extract(struct bfq_service_tree *st,
- struct bfq_entity *entity)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- struct rb_node *node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd = NULL;
- struct bfq_group *bfqg = NULL;
- struct bfq_data *bfqd = NULL;
-#endif
-
- node = bfq_find_deepest(&entity->rb_node);
- bfq_extract(&st->active, entity);
-
- if (node)
- bfq_update_active_tree(node);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- BUG_ON(!bfqg);
- bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
- if (bfqq)
- list_del(&bfqq->bfqq_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else { /* bfq_group */
- BUG_ON(!bfqd);
- bfq_weights_tree_remove(bfqd, entity,
- &bfqd->group_weights_tree);
- }
- if (bfqg != bfqd->root_group) {
- BUG_ON(!bfqg);
- BUG_ON(!bfqd);
- BUG_ON(!bfqg->active_entities);
- bfqg->active_entities--;
- }
-#endif
-}
-
-/**
- * bfq_idle_insert - insert an entity into the idle tree.
- * @st: the service tree containing the tree.
- * @entity: the entity to insert.
- */
-static void bfq_idle_insert(struct bfq_service_tree *st,
- struct bfq_entity *entity)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- struct bfq_entity *first_idle = st->first_idle;
- struct bfq_entity *last_idle = st->last_idle;
-
- if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
- st->first_idle = entity;
- if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
- st->last_idle = entity;
-
- bfq_insert(&st->idle, entity);
-
- if (bfqq)
- list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
-}
-
-/**
- * bfq_forget_entity - do not consider entity any longer for scheduling
- * @st: the service tree.
- * @entity: the entity being removed.
- * @is_in_service: true if entity is currently the in-service entity.
- *
- * Forget everything about @entity. In addition, if entity represents
- * a queue, and the latter is not in service, then release the service
- * reference to the queue (the one taken through bfq_get_entity). In
- * fact, in this case, there is really no more service reference to
- * the queue, as the latter is also outside any service tree. If,
- * instead, the queue is in service, then __bfq_bfqd_reset_in_service
- * will take care of putting the reference when the queue finally
- * stops being served.
- */
-static void bfq_forget_entity(struct bfq_service_tree *st,
- struct bfq_entity *entity,
- bool is_in_service)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- BUG_ON(!entity->on_st);
-
- entity->on_st = false;
- st->wsum -= entity->weight;
- if (bfqq && !is_in_service) {
- bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d",
- bfqq, bfqq->ref);
- bfq_put_queue(bfqq);
- }
-}
-
-/**
- * bfq_put_idle_entity - release the idle tree ref of an entity.
- * @st: service tree for the entity.
- * @entity: the entity being released.
- */
-static void bfq_put_idle_entity(struct bfq_service_tree *st,
- struct bfq_entity *entity)
-{
- bfq_idle_extract(st, entity);
- bfq_forget_entity(st, entity,
- entity == entity->sched_data->in_service_entity);
-}
-
-/**
- * bfq_forget_idle - update the idle tree if necessary.
- * @st: the service tree to act upon.
- *
- * To preserve the global O(log N) complexity we only remove one entry here;
- * as the idle tree will not grow indefinitely this can be done safely.
- */
-static void bfq_forget_idle(struct bfq_service_tree *st)
-{
- struct bfq_entity *first_idle = st->first_idle;
- struct bfq_entity *last_idle = st->last_idle;
-
- if (RB_EMPTY_ROOT(&st->active) && last_idle &&
- !bfq_gt(last_idle->finish, st->vtime)) {
- /*
- * Forget the whole idle tree, increasing the vtime past
- * the last finish time of idle entities.
- */
- st->vtime = last_idle->finish;
- }
-
- if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
- bfq_put_idle_entity(st, first_idle);
-}
-
-/*
- * Update weight and priority of entity. If update_class_too is true,
- * then update the ioprio_class of entity too.
- *
- * The reason why the update of ioprio_class is controlled through the
- * last parameter is as follows. Changing the ioprio class of an
- * entity implies changing the destination service trees for that
- * entity. If such a change occurred when the entity is already on one
- * of the service trees for its previous class, then the state of the
- * entity would become more complex: none of the new possible service
- * trees for the entity, according to bfq_entity_service_tree(), would
- * match any of the possible service trees on which the entity
- * is. Complex operations involving these trees, such as entity
- * activations and deactivations, should take into account this
- * additional complexity. To avoid this issue, this function is
- * invoked with update_class_too unset in the points in the code where
- * entity may happen to be on some tree.
- */
-static struct bfq_service_tree *
-__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
- struct bfq_entity *entity,
- bool update_class_too)
-{
- struct bfq_service_tree *new_st = old_st;
-
- if (entity->prio_changed) {
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- unsigned int prev_weight, new_weight;
- struct bfq_data *bfqd = NULL;
- struct rb_root *root;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd;
- struct bfq_group *bfqg;
-#endif
-
- if (bfqq)
- bfqd = bfqq->bfqd;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- sd = entity->my_sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- BUG_ON(!bfqg);
- bfqd = (struct bfq_data *)bfqg->bfqd;
- BUG_ON(!bfqd);
- }
-#endif
-
- BUG_ON(entity->tree && update_class_too);
- BUG_ON(old_st->wsum < entity->weight);
- old_st->wsum -= entity->weight;
-
- if (entity->new_weight != entity->orig_weight) {
- if (entity->new_weight < BFQ_MIN_WEIGHT ||
- entity->new_weight > BFQ_MAX_WEIGHT) {
- pr_crit("update_weight_prio: new_weight %d\n",
- entity->new_weight);
- if (entity->new_weight < BFQ_MIN_WEIGHT)
- entity->new_weight = BFQ_MIN_WEIGHT;
- else
- entity->new_weight = BFQ_MAX_WEIGHT;
- }
- entity->orig_weight = entity->new_weight;
- if (bfqq)
- bfqq->ioprio =
- bfq_weight_to_ioprio(entity->orig_weight);
- }
-
- if (bfqq && update_class_too)
- bfqq->ioprio_class = bfqq->new_ioprio_class;
-
- /*
- * Reset prio_changed only if the ioprio_class change
- * is not pending any longer.
- */
- if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class)
- entity->prio_changed = 0;
-
- /*
- * NOTE: here we may be changing the weight too early,
- * this will cause unfairness. The correct approach
- * would have required additional complexity to defer
- * weight changes to the proper time instants (i.e.,
- * when entity->finish <= old_st->vtime).
- */
- new_st = bfq_entity_service_tree(entity);
-
- prev_weight = entity->weight;
- new_weight = entity->orig_weight *
- (bfqq ? bfqq->wr_coeff : 1);
- /*
- * If the weight of the entity changes, remove the entity
- * from its old weight counter (if there is a counter
- * associated with the entity), and add it to the counter
- * associated with its new weight.
- */
- if (prev_weight != new_weight) {
- if (bfqq)
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "weight changed %d %d(%d %d)",
- prev_weight, new_weight,
- entity->orig_weight,
- bfqq->wr_coeff);
-
- root = bfqq ? &bfqd->queue_weights_tree :
- &bfqd->group_weights_tree;
- bfq_weights_tree_remove(bfqd, entity, root);
- }
- entity->weight = new_weight;
- /*
- * Add the entity to its weights tree only if it is
- * not associated with a weight-raised queue.
- */
- if (prev_weight != new_weight &&
- (bfqq ? bfqq->wr_coeff == 1 : 1))
- /* If we get here, root has been initialized. */
- bfq_weights_tree_add(bfqd, entity, root);
-
- new_st->wsum += entity->weight;
-
- if (new_st != old_st) {
- BUG_ON(!update_class_too);
- entity->start = new_st->vtime;
- }
- }
-
- return new_st;
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
-#endif
-
-/**
- * bfq_bfqq_served - update the scheduler status after selection for
- * service.
- * @bfqq: the queue being served.
- * @served: bytes to transfer.
- *
- * NOTE: this can be optimized, as the timestamps of upper level entities
- * are synchronized every time a new bfqq is selected for service. By now,
- * we keep it to better check consistency.
- */
-static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
-{
- struct bfq_entity *entity = &bfqq->entity;
- struct bfq_service_tree *st;
-
- for_each_entity(entity) {
- st = bfq_entity_service_tree(entity);
-
- entity->service += served;
-
- BUG_ON(st->wsum == 0);
-
- st->vtime += bfq_delta(served, st->wsum);
- bfq_forget_idle(st);
- }
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
-#endif
- st = bfq_entity_service_tree(&bfqq->entity);
- bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p",
- served, ((st->vtime>>10)*1000)>>12, st);
-}
-
-/**
- * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
- * of the time interval during which bfqq has been in
- * service.
- * @bfqd: the device
- * @bfqq: the queue that needs a service update.
- * @time_ms: the amount of time during which the queue has received service
- *
- * If a queue does not consume its budget fast enough, then providing
- * the queue with service fairness may impair throughput, more or less
- * severely. For this reason, queues that consume their budget slowly
- * are provided with time fairness instead of service fairness. This
- * goal is achieved through the BFQ scheduling engine, even if such an
- * engine works in the service, and not in the time domain. The trick
- * is charging these queues with an inflated amount of service, equal
- * to the amount of service that they would have received during their
- * service slot if they had been fast, i.e., if their requests had
- * been dispatched at a rate equal to the estimated peak rate.
- *
- * It is worth noting that time fairness can cause important
- * distortions in terms of bandwidth distribution, on devices with
- * internal queueing. The reason is that I/O requests dispatched
- * during the service slot of a queue may be served after that service
- * slot is finished, and may have a total processing time loosely
- * correlated with the duration of the service slot. This is
- * especially true for short service slots.
- */
-static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- unsigned long time_ms)
-{
- struct bfq_entity *entity = &bfqq->entity;
- int tot_serv_to_charge = entity->service;
- unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
-
- if (time_ms > 0 && time_ms < timeout_ms)
- tot_serv_to_charge =
- (bfqd->bfq_max_budget * time_ms) / timeout_ms;
-
- if (tot_serv_to_charge < entity->service)
- tot_serv_to_charge = entity->service;
-
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "charge_time: %lu/%u ms, %d/%d/%d sectors",
- time_ms, timeout_ms, entity->service,
- tot_serv_to_charge, entity->budget);
-
- /* Increase budget to avoid inconsistencies */
- if (tot_serv_to_charge > entity->budget)
- entity->budget = tot_serv_to_charge;
-
- bfq_bfqq_served(bfqq,
- max_t(int, 0, tot_serv_to_charge - entity->service));
-}
-
-static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
- struct bfq_service_tree *st,
- bool backshifted)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- struct bfq_sched_data *sd = entity->sched_data;
-
- /*
- * When this function is invoked, entity is not in any service
- * tree, then it is safe to invoke next function with the last
- * parameter set (see the comments on the function).
- */
- BUG_ON(entity->tree);
- st = __bfq_entity_update_weight_prio(st, entity, true);
- bfq_calc_finish(entity, entity->budget);
-
- /*
- * If some queues enjoy backshifting for a while, then their
- * (virtual) finish timestamps may happen to become lower and
- * lower than the system virtual time. In particular, if
- * these queues often happen to be idle for short time
- * periods, and during such time periods other queues with
- * higher timestamps happen to be busy, then the backshifted
- * timestamps of the former queues can become much lower than
- * the system virtual time. In fact, to serve the queues with
- * higher timestamps while the ones with lower timestamps are
- * idle, the system virtual time may be pushed-up to much
- * higher values than the finish timestamps of the idle
- * queues. As a consequence, the finish timestamps of all new
- * or newly activated queues may end up being much larger than
- * those of lucky queues with backshifted timestamps. The
- * latter queues may then monopolize the device for a lot of
- * time. This would simply break service guarantees.
- *
- * To reduce this problem, push up a little bit the
- * backshifted timestamps of the queue associated with this
- * entity (only a queue can happen to have the backshifted
- * flag set): just enough to let the finish timestamp of the
- * queue be equal to the current value of the system virtual
- * time. This may introduce a little unfairness among queues
- * with backshifted timestamps, but it does not break
- * worst-case fairness guarantees.
- *
- * As a special case, if bfqq is weight-raised, push up
- * timestamps much less, to keep very low the probability that
- * this push up causes the backshifted finish timestamps of
- * weight-raised queues to become higher than the backshifted
- * finish timestamps of non weight-raised queues.
- */
- if (backshifted && bfq_gt(st->vtime, entity->finish)) {
- unsigned long delta = st->vtime - entity->finish;
-
- if (bfqq)
- delta /= bfqq->wr_coeff;
-
- entity->start += delta;
- entity->finish += delta;
-
- if (bfqq) {
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "__activate_entity: new queue finish %llu",
- ((entity->finish>>10)*1000)>>12);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- } else {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "__activate_entity: new group finish %llu",
- ((entity->finish>>10)*1000)>>12);
-#endif
- }
- }
-
- bfq_active_insert(st, entity);
-
- if (bfqq) {
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "__activate_entity: queue %seligible in st %p",
- entity->start <= st->vtime ? "" : "non ", st);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- } else {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "__activate_entity: group %seligible in st %p",
- entity->start <= st->vtime ? "" : "non ", st);
-#endif
- }
- BUG_ON(RB_EMPTY_ROOT(&st->active));
- BUG_ON(&st->active != &sd->service_tree->active &&
- &st->active != &(sd->service_tree+1)->active &&
- &st->active != &(sd->service_tree+2)->active);
-}
-
-/**
- * __bfq_activate_entity - handle activation of entity.
- * @entity: the entity being activated.
- * @non_blocking_wait_rq: true if entity was waiting for a request
- *
- * Called for a 'true' activation, i.e., if entity is not active and
- * one of its children receives a new request.
- *
- * Basically, this function updates the timestamps of entity and
- * inserts entity into its active tree, ater possibly extracting it
- * from its idle tree.
- */
-static void __bfq_activate_entity(struct bfq_entity *entity,
- bool non_blocking_wait_rq)
-{
- struct bfq_sched_data *sd = entity->sched_data;
- struct bfq_service_tree *st = bfq_entity_service_tree(entity);
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- bool backshifted = false;
- unsigned long long min_vstart;
-
- BUG_ON(!sd);
- BUG_ON(!st);
-
- /* See comments on bfq_fqq_update_budg_for_activation */
- if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
- backshifted = true;
- min_vstart = entity->finish;
- } else
- min_vstart = st->vtime;
-
- if (entity->tree == &st->idle) {
- /*
- * Must be on the idle tree, bfq_idle_extract() will
- * check for that.
- */
- bfq_idle_extract(st, entity);
- BUG_ON(entity->tree);
- entity->start = bfq_gt(min_vstart, entity->finish) ?
- min_vstart : entity->finish;
- } else {
- BUG_ON(entity->tree);
- /*
- * The finish time of the entity may be invalid, and
- * it is in the past for sure, otherwise the queue
- * would have been on the idle tree.
- */
- entity->start = min_vstart;
- st->wsum += entity->weight;
- /*
- * entity is about to be inserted into a service tree,
- * and then set in service: get a reference to make
- * sure entity does not disappear until it is no
- * longer in service or scheduled for service.
- */
- bfq_get_entity(entity);
-
- BUG_ON(entity->on_st && bfqq);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (entity->on_st && !bfqq) {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group,
- entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd,
- bfqg,
- "activate bug, class %d in_service %p",
- bfq_class_idx(entity), sd->in_service_entity);
- }
-#endif
- BUG_ON(entity->on_st && !bfqq);
- entity->on_st = true;
- }
-
- bfq_update_fin_time_enqueue(entity, st, backshifted);
-}
-
-/**
- * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
- * @entity: the entity being requeued or repositioned.
- *
- * Requeueing is needed if this entity stops being served, which
- * happens if a leaf descendant entity has expired. On the other hand,
- * repositioning is needed if the next_inservice_entity for the child
- * entity has changed. See the comments inside the function for
- * details.
- *
- * Basically, this function: 1) removes entity from its active tree if
- * present there, 2) updates the timestamps of entity and 3) inserts
- * entity back into its active tree (in the new, right position for
- * the new values of the timestamps).
- */
-static void __bfq_requeue_entity(struct bfq_entity *entity)
-{
- struct bfq_sched_data *sd = entity->sched_data;
- struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-
- BUG_ON(!sd);
- BUG_ON(!st);
-
- BUG_ON(entity != sd->in_service_entity &&
- entity->tree != &st->active);
-
- if (entity == sd->in_service_entity) {
- /*
- * We are requeueing the current in-service entity,
- * which may have to be done for one of the following
- * reasons:
- * - entity represents the in-service queue, and the
- * in-service queue is being requeued after an
- * expiration;
- * - entity represents a group, and its budget has
- * changed because one of its child entities has
- * just been either activated or requeued for some
- * reason; the timestamps of the entity need then to
- * be updated, and the entity needs to be enqueued
- * or repositioned accordingly.
- *
- * In particular, before requeueing, the start time of
- * the entity must be moved forward to account for the
- * service that the entity has received while in
- * service. This is done by the next instructions. The
- * finish time will then be updated according to this
- * new value of the start time, and to the budget of
- * the entity.
- */
- bfq_calc_finish(entity, entity->service);
- entity->start = entity->finish;
- BUG_ON(entity->tree && entity->tree == &st->idle);
- BUG_ON(entity->tree && entity->tree != &st->active);
- /*
- * In addition, if the entity had more than one child
- * when set in service, then it was not extracted from
- * the active tree. This implies that the position of
- * the entity in the active tree may need to be
- * changed now, because we have just updated the start
- * time of the entity, and we will update its finish
- * time in a moment (the requeueing is then, more
- * precisely, a repositioning in this case). To
- * implement this repositioning, we: 1) dequeue the
- * entity here, 2) update the finish time and requeue
- * the entity according to the new timestamps below.
- */
- if (entity->tree)
- bfq_active_extract(st, entity);
- } else { /* The entity is already active, and not in service */
- /*
- * In this case, this function gets called only if the
- * next_in_service entity below this entity has
- * changed, and this change has caused the budget of
- * this entity to change, which, finally implies that
- * the finish time of this entity must be
- * updated. Such an update may cause the scheduling,
- * i.e., the position in the active tree, of this
- * entity to change. We handle this change by: 1)
- * dequeueing the entity here, 2) updating the finish
- * time and requeueing the entity according to the new
- * timestamps below. This is the same approach as the
- * non-extracted-entity sub-case above.
- */
- bfq_active_extract(st, entity);
- }
-
- bfq_update_fin_time_enqueue(entity, st, false);
-}
-
-static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
- struct bfq_sched_data *sd,
- bool non_blocking_wait_rq)
-{
- struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-
- if (sd->in_service_entity == entity || entity->tree == &st->active)
- /*
- * in service or already queued on the active tree,
- * requeue or reposition
- */
- __bfq_requeue_entity(entity);
- else
- /*
- * Not in service and not queued on its active tree:
- * the activity is idle and this is a true activation.
- */
- __bfq_activate_entity(entity, non_blocking_wait_rq);
-}
-
-
-/**
- * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue,
- * and activate, requeue or reposition all ancestors
- * for which such an update becomes necessary.
- * @entity: the entity to activate.
- * @non_blocking_wait_rq: true if this entity was waiting for a request
- * @requeue: true if this is a requeue, which implies that bfqq is
- * being expired; thus ALL its ancestors stop being served and must
- * therefore be requeued
- */
-static void bfq_activate_requeue_entity(struct bfq_entity *entity,
- bool non_blocking_wait_rq,
- bool requeue)
-{
- struct bfq_sched_data *sd;
-
- for_each_entity(entity) {
- BUG_ON(!entity);
- sd = entity->sched_data;
- __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
-
- BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) &&
- RB_EMPTY_ROOT(&(sd->service_tree+1)->active) &&
- RB_EMPTY_ROOT(&(sd->service_tree+2)->active));
-
- if (!bfq_update_next_in_service(sd, entity) && !requeue) {
- BUG_ON(!sd->next_in_service);
- break;
- }
- BUG_ON(!sd->next_in_service);
- }
-}
-
-/**
- * __bfq_deactivate_entity - deactivate an entity from its service tree.
- * @entity: the entity to deactivate.
- * @ins_into_idle_tree: if false, the entity will not be put into the
- * idle tree.
- *
- * Deactivates an entity, independently of its previous state. Must
- * be invoked only if entity is on a service tree. Extracts the entity
- * from that tree, and if necessary and allowed, puts it into the idle
- * tree.
- */
-static bool __bfq_deactivate_entity(struct bfq_entity *entity,
- bool ins_into_idle_tree)
-{
- struct bfq_sched_data *sd = entity->sched_data;
- struct bfq_service_tree *st;
- bool is_in_service;
-
- if (!entity->on_st) { /* entity never activated, or already inactive */
- BUG_ON(sd && entity == sd->in_service_entity);
- return false;
- }
-
- /*
- * If we get here, then entity is active, which implies that
- * bfq_group_set_parent has already been invoked for the group
- * represented by entity. Therefore, the field
- * entity->sched_data has been set, and we can safely use it.
- */
- st = bfq_entity_service_tree(entity);
- is_in_service = entity == sd->in_service_entity;
-
- BUG_ON(is_in_service && entity->tree && entity->tree != &st->active);
-
- if (is_in_service) {
- bfq_calc_finish(entity, entity->service);
- sd->in_service_entity = NULL;
- }
-
- if (entity->tree == &st->active)
- bfq_active_extract(st, entity);
- else if (!is_in_service && entity->tree == &st->idle)
- bfq_idle_extract(st, entity);
- else if (entity->tree)
- BUG();
-
- if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
- bfq_forget_entity(st, entity, is_in_service);
- else
- bfq_idle_insert(st, entity);
-
- return true;
-}
-
-/**
- * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
- * @entity: the entity to deactivate.
- * @ins_into_idle_tree: true if the entity can be put into the idle tree
- */
-static void bfq_deactivate_entity(struct bfq_entity *entity,
- bool ins_into_idle_tree,
- bool expiration)
-{
- struct bfq_sched_data *sd;
- struct bfq_entity *parent = NULL;
-
- for_each_entity_safe(entity, parent) {
- sd = entity->sched_data;
-
- BUG_ON(sd == NULL); /*
- * It would mean that this is the
- * root group.
- */
-
- BUG_ON(expiration && entity != sd->in_service_entity);
-
- BUG_ON(entity != sd->in_service_entity &&
- entity->tree ==
- &bfq_entity_service_tree(entity)->active &&
- !sd->next_in_service);
-
- if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
- /*
- * entity is not in any tree any more, so
- * this deactivation is a no-op, and there is
- * nothing to change for upper-level entities
- * (in case of expiration, this can never
- * happen).
- */
- BUG_ON(expiration); /*
- * entity cannot be already out of
- * any tree
- */
- return;
- }
-
- if (sd->next_in_service == entity)
- /*
- * entity was the next_in_service entity,
- * then, since entity has just been
- * deactivated, a new one must be found.
- */
- bfq_update_next_in_service(sd, NULL);
-
- if (sd->next_in_service || sd->in_service_entity) {
- /*
- * The parent entity is still active, because
- * either next_in_service or in_service_entity
- * is not NULL. So, no further upwards
- * deactivation must be performed. Yet,
- * next_in_service has changed. Then the
- * schedule does need to be updated upwards.
- *
- * NOTE If in_service_entity is not NULL, then
- * next_in_service may happen to be NULL,
- * although the parent entity is evidently
- * active. This happens if 1) the entity
- * pointed by in_service_entity is the only
- * active entity in the parent entity, and 2)
- * according to the definition of
- * next_in_service, the in_service_entity
- * cannot be considered as
- * next_in_service. See the comments on the
- * definition of next_in_service for details.
- */
- BUG_ON(sd->next_in_service == entity);
- BUG_ON(sd->in_service_entity == entity);
- break;
- }
-
- /*
- * If we get here, then the parent is no more
- * backlogged and we need to propagate the
- * deactivation upwards. Thus let the loop go on.
- */
-
- /*
- * Also let parent be queued into the idle tree on
- * deactivation, to preserve service guarantees, and
- * assuming that who invoked this function does not
- * need parent entities too to be removed completely.
- */
- ins_into_idle_tree = true;
- }
-
- /*
- * If the deactivation loop is fully executed, then there are
- * no more entities to touch and next loop is not executed at
- * all. Otherwise, requeue remaining entities if they are
- * about to stop receiving service, or reposition them if this
- * is not the case.
- */
- entity = parent;
- for_each_entity(entity) {
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
- /*
- * Invoke __bfq_requeue_entity on entity, even if
- * already active, to requeue/reposition it in the
- * active tree (because sd->next_in_service has
- * changed)
- */
- __bfq_requeue_entity(entity);
-
- sd = entity->sched_data;
- BUG_ON(expiration && sd->in_service_entity != entity);
-
- if (bfqq)
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "invoking udpdate_next for this queue");
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- struct bfq_group *bfqg =
- container_of(entity,
- struct bfq_group, entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "invoking udpdate_next for this entity");
- }
-#endif
- if (!bfq_update_next_in_service(sd, entity) &&
- !expiration)
- /*
- * next_in_service unchanged or not causing
- * any change in entity->parent->sd, and no
- * requeueing needed for expiration: stop
- * here.
- */
- break;
- }
-}
-
-/**
- * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
- * if needed, to have at least one entity eligible.
- * @st: the service tree to act upon.
- *
- * Assumes that st is not empty.
- */
-static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
-{
- struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
-
- if (bfq_gt(root_entity->min_start, st->vtime)) {
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity);
-
- if (bfqq)
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "calc_vtime_jump: new value %llu",
- root_entity->min_start);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- struct bfq_group *bfqg =
- container_of(root_entity, struct bfq_group,
- entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "calc_vtime_jump: new value %llu",
- root_entity->min_start);
- }
-#endif
- return root_entity->min_start;
- }
- return st->vtime;
-}
-
-static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
-{
- if (new_value > st->vtime) {
- st->vtime = new_value;
- bfq_forget_idle(st);
- }
-}
-
-/**
- * bfq_first_active_entity - find the eligible entity with
- * the smallest finish time
- * @st: the service tree to select from.
- * @vtime: the system virtual to use as a reference for eligibility
- *
- * This function searches the first schedulable entity, starting from the
- * root of the tree and going on the left every time on this side there is
- * a subtree with at least one eligible (start >= vtime) entity. The path on
- * the right is followed only if a) the left subtree contains no eligible
- * entities and b) no eligible entity has been found yet.
- */
-static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
- u64 vtime)
-{
- struct bfq_entity *entry, *first = NULL;
- struct rb_node *node = st->active.rb_node;
-
- while (node) {
- entry = rb_entry(node, struct bfq_entity, rb_node);
-left:
- if (!bfq_gt(entry->start, vtime))
- first = entry;
-
- BUG_ON(bfq_gt(entry->min_start, vtime));
-
- if (node->rb_left) {
- entry = rb_entry(node->rb_left,
- struct bfq_entity, rb_node);
- if (!bfq_gt(entry->min_start, vtime)) {
- node = node->rb_left;
- goto left;
- }
- }
- if (first)
- break;
- node = node->rb_right;
- }
-
- BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
- return first;
-}
-
-/**
- * __bfq_lookup_next_entity - return the first eligible entity in @st.
- * @st: the service tree.
- *
- * If there is no in-service entity for the sched_data st belongs to,
- * then return the entity that will be set in service if:
- * 1) the parent entity this st belongs to is set in service;
- * 2) no entity belonging to such parent entity undergoes a state change
- * that would influence the timestamps of the entity (e.g., becomes idle,
- * becomes backlogged, changes its budget, ...).
- *
- * In this first case, update the virtual time in @st too (see the
- * comments on this update inside the function).
- *
- * In constrast, if there is an in-service entity, then return the
- * entity that would be set in service if not only the above
- * conditions, but also the next one held true: the currently
- * in-service entity, on expiration,
- * 1) gets a finish time equal to the current one, or
- * 2) is not eligible any more, or
- * 3) is idle.
- */
-static struct bfq_entity *
-__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service
-#if 0
- , bool force
-#endif
- )
-{
- struct bfq_entity *entity
-#if 0
- , *new_next_in_service = NULL
-#endif
- ;
- u64 new_vtime;
- struct bfq_queue *bfqq;
-
- if (RB_EMPTY_ROOT(&st->active))
- return NULL;
-
- /*
- * Get the value of the system virtual time for which at
- * least one entity is eligible.
- */
- new_vtime = bfq_calc_vtime_jump(st);
-
- /*
- * If there is no in-service entity for the sched_data this
- * active tree belongs to, then push the system virtual time
- * up to the value that guarantees that at least one entity is
- * eligible. If, instead, there is an in-service entity, then
- * do not make any such update, because there is already an
- * eligible entity, namely the in-service one (even if the
- * entity is not on st, because it was extracted when set in
- * service).
- */
- if (!in_service)
- bfq_update_vtime(st, new_vtime);
-
- entity = bfq_first_active_entity(st, new_vtime);
- BUG_ON(bfq_gt(entity->start, new_vtime));
-
- /* Log some information */
- bfqq = bfq_entity_to_bfqq(entity);
- if (bfqq)
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "__lookup_next: start %llu vtime %llu st %p",
- ((entity->start>>10)*1000)>>12,
- ((new_vtime>>10)*1000)>>12, st);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "__lookup_next: start %llu vtime %llu st %p",
- ((entity->start>>10)*1000)>>12,
- ((new_vtime>>10)*1000)>>12, st);
- }
-#endif
-
- BUG_ON(!entity);
-
- return entity;
-}
-
-/**
- * bfq_lookup_next_entity - return the first eligible entity in @sd.
- * @sd: the sched_data.
- *
- * This function is invoked when there has been a change in the trees
- * for sd, and we need know what is the new next entity after this
- * change.
- */
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
-{
- struct bfq_service_tree *st = sd->service_tree;
- struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
- struct bfq_entity *entity = NULL;
- struct bfq_queue *bfqq;
- int class_idx = 0;
-
- BUG_ON(!sd);
- BUG_ON(!st);
- /*
- * Choose from idle class, if needed to guarantee a minimum
- * bandwidth to this class (and if there is some active entity
- * in idle class). This should also mitigate
- * priority-inversion problems in case a low priority task is
- * holding file system resources.
- */
- if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
- BFQ_CL_IDLE_TIMEOUT)) {
- if (!RB_EMPTY_ROOT(&idle_class_st->active))
- class_idx = BFQ_IOPRIO_CLASSES - 1;
- /* About to be served if backlogged, or not yet backlogged */
- sd->bfq_class_idle_last_service = jiffies;
- }
-
- /*
- * Find the next entity to serve for the highest-priority
- * class, unless the idle class needs to be served.
- */
- for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
- entity = __bfq_lookup_next_entity(st + class_idx,
- sd->in_service_entity);
-
- if (entity)
- break;
- }
-
- BUG_ON(!entity &&
- (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) ||
- !RB_EMPTY_ROOT(&(st+2)->active)));
-
- if (!entity)
- return NULL;
-
- /* Log some information */
- bfqq = bfq_entity_to_bfqq(entity);
- if (bfqq)
- bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d",
- st + class_idx, class_idx);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "chosen from st %p %d",
- st + class_idx, class_idx);
- }
-#endif
-
- return entity;
-}
-
-static bool next_queue_may_preempt(struct bfq_data *bfqd)
-{
- struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
-
- return sd->next_in_service != sd->in_service_entity;
-}
-
-/*
- * Get next queue for service.
- */
-static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
-{
- struct bfq_entity *entity = NULL;
- struct bfq_sched_data *sd;
- struct bfq_queue *bfqq;
-
- BUG_ON(bfqd->in_service_queue);
-
- if (bfqd->busy_queues == 0)
- return NULL;
-
- /*
- * Traverse the path from the root to the leaf entity to
- * serve. Set in service all the entities visited along the
- * way.
- */
- sd = &bfqd->root_group->sched_data;
- for (; sd ; sd = entity->my_sched_data) {
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (entity) {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
-
- bfq_log_bfqg(bfqd, bfqg,
- "get_next_queue: lookup in this group");
- if (!sd->next_in_service)
- pr_crit("get_next_queue: lookup in this group");
- } else {
- bfq_log_bfqg(bfqd, bfqd->root_group,
- "get_next_queue: lookup in root group");
- if (!sd->next_in_service)
- pr_crit("get_next_queue: lookup in root group");
- }
-#endif
-
- BUG_ON(!sd->next_in_service);
-
- /*
- * WARNING. We are about to set the in-service entity
- * to sd->next_in_service, i.e., to the (cached) value
- * returned by bfq_lookup_next_entity(sd) the last
- * time it was invoked, i.e., the last time when the
- * service order in sd changed as a consequence of the
- * activation or deactivation of an entity. In this
- * respect, if we execute bfq_lookup_next_entity(sd)
- * in this very moment, it may, although with low
- * probability, yield a different entity than that
- * pointed to by sd->next_in_service. This rare event
- * happens in case there was no CLASS_IDLE entity to
- * serve for sd when bfq_lookup_next_entity(sd) was
- * invoked for the last time, while there is now one
- * such entity.
- *
- * If the above event happens, then the scheduling of
- * such entity in CLASS_IDLE is postponed until the
- * service of the sd->next_in_service entity
- * finishes. In fact, when the latter is expired,
- * bfq_lookup_next_entity(sd) gets called again,
- * exactly to update sd->next_in_service.
- */
-
- /* Make next_in_service entity become in_service_entity */
- entity = sd->next_in_service;
- sd->in_service_entity = entity;
-
- /*
- * Reset the accumulator of the amount of service that
- * the entity is about to receive.
- */
- entity->service = 0;
-
- /*
- * If entity is no longer a candidate for next
- * service, then it must be extracted from its active
- * tree, so as to make sure that it won't be
- * considered when computing next_in_service. See the
- * comments on the function
- * bfq_no_longer_next_in_service() for details.
- */
- if (bfq_no_longer_next_in_service(entity))
- bfq_active_extract(bfq_entity_service_tree(entity),
- entity);
-
- /*
- * Even if entity is not to be extracted according to
- * the above check, a descendant entity may get
- * extracted in one of the next iterations of this
- * loop. Such an event could cause a change in
- * next_in_service for the level of the descendant
- * entity, and thus possibly back to this level.
- *
- * However, we cannot perform the resulting needed
- * update of next_in_service for this level before the
- * end of the whole loop, because, to know which is
- * the correct next-to-serve candidate entity for each
- * level, we need first to find the leaf entity to set
- * in service. In fact, only after we know which is
- * the next-to-serve leaf entity, we can discover
- * whether the parent entity of the leaf entity
- * becomes the next-to-serve, and so on.
- */
-
- /* Log some information */
- bfqq = bfq_entity_to_bfqq(entity);
- if (bfqq)
- bfq_log_bfqq(bfqd, bfqq,
- "get_next_queue: this queue, finish %llu",
- (((entity->finish>>10)*1000)>>10)>>2);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
-
- bfq_log_bfqg(bfqd, bfqg,
- "get_next_queue: this entity, finish %llu",
- (((entity->finish>>10)*1000)>>10)>>2);
- }
-#endif
-
- }
-
- BUG_ON(!entity);
- bfqq = bfq_entity_to_bfqq(entity);
- BUG_ON(!bfqq);
-
- /*
- * We can finally update all next-to-serve entities along the
- * path from the leaf entity just set in service to the root.
- */
- for_each_entity(entity) {
- struct bfq_sched_data *sd = entity->sched_data;
-
- if(!bfq_update_next_in_service(sd, NULL))
- break;
- }
-
- return bfqq;
-}
-
-static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
-{
- struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
- struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
- struct bfq_entity *entity = in_serv_entity;
-
- if (bfqd->in_service_bic) {
- put_io_context(bfqd->in_service_bic->icq.ioc);
- bfqd->in_service_bic = NULL;
- }
-
- bfq_clear_bfqq_wait_request(in_serv_bfqq);
- hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
- bfqd->in_service_queue = NULL;
-
- /*
- * When this function is called, all in-service entities have
- * been properly deactivated or requeued, so we can safely
- * execute the final step: reset in_service_entity along the
- * path from entity to the root.
- */
- for_each_entity(entity)
- entity->sched_data->in_service_entity = NULL;
-
- /*
- * in_serv_entity is no longer in service, so, if it is in no
- * service tree either, then release the service reference to
- * the queue it represents (taken with bfq_get_entity).
- */
- if (!in_serv_entity->on_st)
- bfq_put_queue(in_serv_bfqq);
-}
-
-static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- bool ins_into_idle_tree, bool expiration)
-{
- struct bfq_entity *entity = &bfqq->entity;
-
- bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
-}
-
-static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- struct bfq_entity *entity = &bfqq->entity;
- struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-
- BUG_ON(bfqq == bfqd->in_service_queue);
- BUG_ON(entity->tree != &st->active && entity->tree != &st->idle &&
- entity->on_st);
-
- bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
- false);
- bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
-}
-
-static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- struct bfq_entity *entity = &bfqq->entity;
-
- bfq_activate_requeue_entity(entity, false,
- bfqq == bfqd->in_service_queue);
-}
-
-static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
-
-/*
- * Called when the bfqq no longer has requests pending, remove it from
- * the service tree. As a special case, it can be invoked during an
- * expiration.
- */
-static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- bool expiration)
-{
- BUG_ON(!bfq_bfqq_busy(bfqq));
- BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
-
- bfq_log_bfqq(bfqd, bfqq, "del from busy");
-
- bfq_clear_bfqq_busy(bfqq);
-
- BUG_ON(bfqd->busy_queues == 0);
- bfqd->busy_queues--;
-
- if (!bfqq->dispatched)
- bfq_weights_tree_remove(bfqd, &bfqq->entity,
- &bfqd->queue_weights_tree);
-
- if (bfqq->wr_coeff > 1) {
- bfqd->wr_busy_queues--;
- BUG_ON(bfqd->wr_busy_queues < 0);
- }
-
- bfqg_stats_update_dequeue(bfqq_group(bfqq));
-
- BUG_ON(bfqq->entity.budget < 0);
-
- bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
-}
-
-/*
- * Called when an inactive queue receives a new request.
- */
-static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
- BUG_ON(bfq_bfqq_busy(bfqq));
- BUG_ON(bfqq == bfqd->in_service_queue);
-
- bfq_log_bfqq(bfqd, bfqq, "add to busy");
-
- bfq_activate_bfqq(bfqd, bfqq);
-
- bfq_mark_bfqq_busy(bfqq);
- bfqd->busy_queues++;
-
- if (!bfqq->dispatched)
- if (bfqq->wr_coeff == 1)
- bfq_weights_tree_add(bfqd, &bfqq->entity,
- &bfqd->queue_weights_tree);
-
- if (bfqq->wr_coeff > 1) {
- bfqd->wr_busy_queues++;
- BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues);
- }
-
-}
diff --git a/block/bfq.h b/block/bfq.h
deleted file mode 100644
index e35bf89b09f3..000000000000
--- a/block/bfq.h
+++ /dev/null
@@ -1,946 +0,0 @@
-/*
- * BFQ v8r12 for 4.9.0: data structures and common functions prototypes.
- *
- * Based on ideas and code from CFQ:
- * Copyright (C) 2003 Jens Axboe
- *
- * Copyright (C) 2008 Fabio Checconi
- * Paolo Valente
- *
- * Copyright (C) 2015 Paolo Valente
- *
- * Copyright (C) 2017 Paolo Valente
- */
-
-#ifndef _BFQ_H
-#define _BFQ_H
-
-#include
-#include
-#include
-#include
-#include
-
-#define BFQ_IOPRIO_CLASSES 3
-#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
-
-#define BFQ_MIN_WEIGHT 1
-#define BFQ_MAX_WEIGHT 1000
-#define BFQ_WEIGHT_CONVERSION_COEFF 10
-
-#define BFQ_DEFAULT_QUEUE_IOPRIO 4
-
-#define BFQ_WEIGHT_LEGACY_DFL 100
-#define BFQ_DEFAULT_GRP_IOPRIO 0
-#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
-
-/*
- * Soft real-time applications are extremely more latency sensitive
- * than interactive ones. Over-raise the weight of the former to
- * privilege them against the latter.
- */
-#define BFQ_SOFTRT_WEIGHT_FACTOR 100
-
-struct bfq_entity;
-
-/**
- * struct bfq_service_tree - per ioprio_class service tree.
- *
- * Each service tree represents a B-WF2Q+ scheduler on its own. Each
- * ioprio_class has its own independent scheduler, and so its own
- * bfq_service_tree. All the fields are protected by the queue lock
- * of the containing bfqd.
- */
-struct bfq_service_tree {
- /* tree for active entities (i.e., those backlogged) */
- struct rb_root active;
- /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
- struct rb_root idle;
-
- struct bfq_entity *first_idle; /* idle entity with minimum F_i */
- struct bfq_entity *last_idle; /* idle entity with maximum F_i */
-
- u64 vtime; /* scheduler virtual time */
- /* scheduler weight sum; active and idle entities contribute to it */
- unsigned long wsum;
-};
-
-/**
- * struct bfq_sched_data - multi-class scheduler.
- *
- * bfq_sched_data is the basic scheduler queue. It supports three
- * ioprio_classes, and can be used either as a toplevel queue or as an
- * intermediate queue in a hierarchical setup.
- *
- * The supported ioprio_classes are the same as in CFQ, in descending
- * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
- * Requests from higher priority queues are served before all the
- * requests from lower priority queues; among requests of the same
- * queue requests are served according to B-WF2Q+.
- *
- * The schedule is implemented by the service trees, plus the field
- * @next_in_service, which points to the entity on the active trees
- * that will be served next, if 1) no changes in the schedule occurs
- * before the current in-service entity is expired, 2) the in-service
- * queue becomes idle when it expires, and 3) if the entity pointed by
- * in_service_entity is not a queue, then the in-service child entity
- * of the entity pointed by in_service_entity becomes idle on
- * expiration. This peculiar definition allows for the following
- * optimization, not yet exploited: while a given entity is still in
- * service, we already know which is the best candidate for next
- * service among the other active entitities in the same parent
- * entity. We can then quickly compare the timestamps of the
- * in-service entity with those of such best candidate.
- *
- * All the fields are protected by the queue lock of the containing
- * bfqd.
- */
-struct bfq_sched_data {
- struct bfq_entity *in_service_entity; /* entity in service */
- /* head-of-the-line entity in the scheduler (see comments above) */
- struct bfq_entity *next_in_service;
- /* array of service trees, one per ioprio_class */
- struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
- /* last time CLASS_IDLE was served */
- unsigned long bfq_class_idle_last_service;
-
-};
-
-/**
- * struct bfq_weight_counter - counter of the number of all active entities
- * with a given weight.
- */
-struct bfq_weight_counter {
- unsigned int weight; /* weight of the entities this counter refers to */
- unsigned int num_active; /* nr of active entities with this weight */
- /*
- * Weights tree member (see bfq_data's @queue_weights_tree and
- * @group_weights_tree)
- */
- struct rb_node weights_node;
-};
-
-/**
- * struct bfq_entity - schedulable entity.
- *
- * A bfq_entity is used to represent either a bfq_queue (leaf node in the
- * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
- * entity belongs to the sched_data of the parent group in the cgroup
- * hierarchy. Non-leaf entities have also their own sched_data, stored
- * in @my_sched_data.
- *
- * Each entity stores independently its priority values; this would
- * allow different weights on different devices, but this
- * functionality is not exported to userspace by now. Priorities and
- * weights are updated lazily, first storing the new values into the
- * new_* fields, then setting the @prio_changed flag. As soon as
- * there is a transition in the entity state that allows the priority
- * update to take place the effective and the requested priority
- * values are synchronized.
- *
- * Unless cgroups are used, the weight value is calculated from the
- * ioprio to export the same interface as CFQ. When dealing with
- * ``well-behaved'' queues (i.e., queues that do not spend too much
- * time to consume their budget and have true sequential behavior, and
- * when there are no external factors breaking anticipation) the
- * relative weights at each level of the cgroups hierarchy should be
- * guaranteed. All the fields are protected by the queue lock of the
- * containing bfqd.
- */
-struct bfq_entity {
- struct rb_node rb_node; /* service_tree member */
- /* pointer to the weight counter associated with this entity */
- struct bfq_weight_counter *weight_counter;
-
- /*
- * Flag, true if the entity is on a tree (either the active or
- * the idle one of its service_tree) or is in service.
- */
- bool on_st;
-
- u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */
- u64 start; /* B-WF2Q+ start timestamp (aka S_i) */
-
- /* tree the entity is enqueued into; %NULL if not on a tree */
- struct rb_root *tree;
-
- /*
- * minimum start time of the (active) subtree rooted at this
- * entity; used for O(log N) lookups into active trees
- */
- u64 min_start;
-
- /* amount of service received during the last service slot */
- int service;
-
- /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
- int budget;
-
- unsigned int weight; /* weight of the queue */
- unsigned int new_weight; /* next weight if a change is in progress */
-
- /* original weight, used to implement weight boosting */
- unsigned int orig_weight;
-
- /* parent entity, for hierarchical scheduling */
- struct bfq_entity *parent;
-
- /*
- * For non-leaf nodes in the hierarchy, the associated
- * scheduler queue, %NULL on leaf nodes.
- */
- struct bfq_sched_data *my_sched_data;
- /* the scheduler queue this entity belongs to */
- struct bfq_sched_data *sched_data;
-
- /* flag, set to request a weight, ioprio or ioprio_class change */
- int prio_changed;
-};
-
-struct bfq_group;
-
-/**
- * struct bfq_queue - leaf schedulable entity.
- *
- * A bfq_queue is a leaf request queue; it can be associated with an
- * io_context or more, if it is async or shared between cooperating
- * processes. @cgroup holds a reference to the cgroup, to be sure that it
- * does not disappear while a bfqq still references it (mostly to avoid
- * races between request issuing and task migration followed by cgroup
- * destruction).
- * All the fields are protected by the queue lock of the containing bfqd.
- */
-struct bfq_queue {
- /* reference counter */
- int ref;
- /* parent bfq_data */
- struct bfq_data *bfqd;
-
- /* current ioprio and ioprio class */
- unsigned short ioprio, ioprio_class;
- /* next ioprio and ioprio class if a change is in progress */
- unsigned short new_ioprio, new_ioprio_class;
-
- /*
- * Shared bfq_queue if queue is cooperating with one or more
- * other queues.
- */
- struct bfq_queue *new_bfqq;
- /* request-position tree member (see bfq_group's @rq_pos_tree) */
- struct rb_node pos_node;
- /* request-position tree root (see bfq_group's @rq_pos_tree) */
- struct rb_root *pos_root;
-
- /* sorted list of pending requests */
- struct rb_root sort_list;
- /* if fifo isn't expired, next request to serve */
- struct request *next_rq;
- /* number of sync and async requests queued */
- int queued[2];
- /* number of sync and async requests currently allocated */
- int allocated[2];
- /* number of pending metadata requests */
- int meta_pending;
- /* fifo list of requests in sort_list */
- struct list_head fifo;
-
- /* entity representing this queue in the scheduler */
- struct bfq_entity entity;
-
- /* maximum budget allowed from the feedback mechanism */
- int max_budget;
- /* budget expiration (in jiffies) */
- unsigned long budget_timeout;
-
- /* number of requests on the dispatch list or inside driver */
- int dispatched;
-
- unsigned int flags; /* status flags.*/
-
- /* node for active/idle bfqq list inside parent bfqd */
- struct list_head bfqq_list;
-
- /* bit vector: a 1 for each seeky requests in history */
- u32 seek_history;
-
- /* node for the device's burst list */
- struct hlist_node burst_list_node;
-
- /* position of the last request enqueued */
- sector_t last_request_pos;
-
- /* Number of consecutive pairs of request completion and
- * arrival, such that the queue becomes idle after the
- * completion, but the next request arrives within an idle
- * time slice; used only if the queue's IO_bound flag has been
- * cleared.
- */
- unsigned int requests_within_timer;
-
- /* pid of the process owning the queue, used for logging purposes */
- pid_t pid;
-
- /*
- * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
- * if the queue is shared.
- */
- struct bfq_io_cq *bic;
-
- /* current maximum weight-raising time for this queue */
- unsigned long wr_cur_max_time;
- /*
- * Minimum time instant such that, only if a new request is
- * enqueued after this time instant in an idle @bfq_queue with
- * no outstanding requests, then the task associated with the
- * queue it is deemed as soft real-time (see the comments on
- * the function bfq_bfqq_softrt_next_start())
- */
- unsigned long soft_rt_next_start;
- /*
- * Start time of the current weight-raising period if
- * the @bfq-queue is being weight-raised, otherwise
- * finish time of the last weight-raising period.
- */
- unsigned long last_wr_start_finish;
- /* factor by which the weight of this queue is multiplied */
- unsigned int wr_coeff;
- /*
- * Time of the last transition of the @bfq_queue from idle to
- * backlogged.
- */
- unsigned long last_idle_bklogged;
- /*
- * Cumulative service received from the @bfq_queue since the
- * last transition from idle to backlogged.
- */
- unsigned long service_from_backlogged;
- /*
- * Value of wr start time when switching to soft rt
- */
- unsigned long wr_start_at_switch_to_srt;
-
- unsigned long split_time; /* time of last split */
-};
-
-/**
- * struct bfq_ttime - per process thinktime stats.
- */
-struct bfq_ttime {
- u64 last_end_request; /* completion time of last request */
-
- u64 ttime_total; /* total process thinktime */
- unsigned long ttime_samples; /* number of thinktime samples */
- u64 ttime_mean; /* average process thinktime */
-
-};
-
-/**
- * struct bfq_io_cq - per (request_queue, io_context) structure.
- */
-struct bfq_io_cq {
- /* associated io_cq structure */
- struct io_cq icq; /* must be the first member */
- /* array of two process queues, the sync and the async */
- struct bfq_queue *bfqq[2];
- /* associated @bfq_ttime struct */
- struct bfq_ttime ttime;
- /* per (request_queue, blkcg) ioprio */
- int ioprio;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- uint64_t blkcg_serial_nr; /* the current blkcg serial */
-#endif
-
- /*
- * Snapshot of the has_short_time flag before merging; taken
- * to remember its value while the queue is merged, so as to
- * be able to restore it in case of split.
- */
- bool saved_has_short_ttime;
- /*
- * Same purpose as the previous two fields for the I/O bound
- * classification of a queue.
- */
- bool saved_IO_bound;
-
- /*
- * Same purpose as the previous fields for the value of the
- * field keeping the queue's belonging to a large burst
- */
- bool saved_in_large_burst;
- /*
- * True if the queue belonged to a burst list before its merge
- * with another cooperating queue.
- */
- bool was_in_burst_list;
-
- /*
- * Similar to previous fields: save wr information.
- */
- unsigned long saved_wr_coeff;
- unsigned long saved_last_wr_start_finish;
- unsigned long saved_wr_start_at_switch_to_srt;
- unsigned int saved_wr_cur_max_time;
-};
-
-enum bfq_device_speed {
- BFQ_BFQD_FAST,
- BFQ_BFQD_SLOW,
-};
-
-/**
- * struct bfq_data - per-device data structure.
- *
- * All the fields are protected by the @queue lock.
- */
-struct bfq_data {
- /* request queue for the device */
- struct request_queue *queue;
-
- /* root bfq_group for the device */
- struct bfq_group *root_group;
-
- /*
- * rbtree of weight counters of @bfq_queues, sorted by
- * weight. Used to keep track of whether all @bfq_queues have
- * the same weight. The tree contains one counter for each
- * distinct weight associated to some active and not
- * weight-raised @bfq_queue (see the comments to the functions
- * bfq_weights_tree_[add|remove] for further details).
- */
- struct rb_root queue_weights_tree;
- /*
- * rbtree of non-queue @bfq_entity weight counters, sorted by
- * weight. Used to keep track of whether all @bfq_groups have
- * the same weight. The tree contains one counter for each
- * distinct weight associated to some active @bfq_group (see
- * the comments to the functions bfq_weights_tree_[add|remove]
- * for further details).
- */
- struct rb_root group_weights_tree;
-
- /*
- * Number of bfq_queues containing requests (including the
- * queue in service, even if it is idling).
- */
- int busy_queues;
- /* number of weight-raised busy @bfq_queues */
- int wr_busy_queues;
- /* number of queued requests */
- int queued;
- /* number of requests dispatched and waiting for completion */
- int rq_in_driver;
-
- /*
- * Maximum number of requests in driver in the last
- * @hw_tag_samples completed requests.
- */
- int max_rq_in_driver;
- /* number of samples used to calculate hw_tag */
- int hw_tag_samples;
- /* flag set to one if the driver is showing a queueing behavior */
- int hw_tag;
-
- /* number of budgets assigned */
- int budgets_assigned;
-
- /*
- * Timer set when idling (waiting) for the next request from
- * the queue in service.
- */
- struct hrtimer idle_slice_timer;
- /* delayed work to restart dispatching on the request queue */
- struct work_struct unplug_work;
-
- /* bfq_queue in service */
- struct bfq_queue *in_service_queue;
- /* bfq_io_cq (bic) associated with the @in_service_queue */
- struct bfq_io_cq *in_service_bic;
-
- /* on-disk position of the last served request */
- sector_t last_position;
-
- /* time of last request completion (ns) */
- u64 last_completion;
-
- /* time of first rq dispatch in current observation interval (ns) */
- u64 first_dispatch;
- /* time of last rq dispatch in current observation interval (ns) */
- u64 last_dispatch;
-
- /* beginning of the last budget */
- ktime_t last_budget_start;
- /* beginning of the last idle slice */
- ktime_t last_idling_start;
-
- /* number of samples in current observation interval */
- int peak_rate_samples;
- /* num of samples of seq dispatches in current observation interval */
- u32 sequential_samples;
- /* total num of sectors transferred in current observation interval */
- u64 tot_sectors_dispatched;
- /* max rq size seen during current observation interval (sectors) */
- u32 last_rq_max_size;
- /* time elapsed from first dispatch in current observ. interval (us) */
- u64 delta_from_first;
- /* current estimate of device peak rate */
- u32 peak_rate;
-
- /* maximum budget allotted to a bfq_queue before rescheduling */
- int bfq_max_budget;
-
- /* list of all the bfq_queues active on the device */
- struct list_head active_list;
- /* list of all the bfq_queues idle on the device */
- struct list_head idle_list;
-
- /*
- * Timeout for async/sync requests; when it fires, requests
- * are served in fifo order.
- */
- u64 bfq_fifo_expire[2];
- /* weight of backward seeks wrt forward ones */
- unsigned int bfq_back_penalty;
- /* maximum allowed backward seek */
- unsigned int bfq_back_max;
- /* maximum idling time */
- u32 bfq_slice_idle;
-
- /* user-configured max budget value (0 for auto-tuning) */
- int bfq_user_max_budget;
- /*
- * Timeout for bfq_queues to consume their budget; used to
- * prevent seeky queues from imposing long latencies to
- * sequential or quasi-sequential ones (this also implies that
- * seeky queues cannot receive guarantees in the service
- * domain; after a timeout they are charged for the time they
- * have been in service, to preserve fairness among them, but
- * without service-domain guarantees).
- */
- unsigned int bfq_timeout;
-
- /*
- * Number of consecutive requests that must be issued within
- * the idle time slice to set again idling to a queue which
- * was marked as non-I/O-bound (see the definition of the
- * IO_bound flag for further details).
- */
- unsigned int bfq_requests_within_timer;
-
- /*
- * Force device idling whenever needed to provide accurate
- * service guarantees, without caring about throughput
- * issues. CAVEAT: this may even increase latencies, in case
- * of useless idling for processes that did stop doing I/O.
- */
- bool strict_guarantees;
-
- /*
- * Last time at which a queue entered the current burst of
- * queues being activated shortly after each other; for more
- * details about this and the following parameters related to
- * a burst of activations, see the comments on the function
- * bfq_handle_burst.
- */
- unsigned long last_ins_in_burst;
- /*
- * Reference time interval used to decide whether a queue has
- * been activated shortly after @last_ins_in_burst.
- */
- unsigned long bfq_burst_interval;
- /* number of queues in the current burst of queue activations */
- int burst_size;
-
- /* common parent entity for the queues in the burst */
- struct bfq_entity *burst_parent_entity;
- /* Maximum burst size above which the current queue-activation
- * burst is deemed as 'large'.
- */
- unsigned long bfq_large_burst_thresh;
- /* true if a large queue-activation burst is in progress */
- bool large_burst;
- /*
- * Head of the burst list (as for the above fields, more
- * details in the comments on the function bfq_handle_burst).
- */
- struct hlist_head burst_list;
-
- /* if set to true, low-latency heuristics are enabled */
- bool low_latency;
- /*
- * Maximum factor by which the weight of a weight-raised queue
- * is multiplied.
- */
- unsigned int bfq_wr_coeff;
- /* maximum duration of a weight-raising period (jiffies) */
- unsigned int bfq_wr_max_time;
-
- /* Maximum weight-raising duration for soft real-time processes */
- unsigned int bfq_wr_rt_max_time;
- /*
- * Minimum idle period after which weight-raising may be
- * reactivated for a queue (in jiffies).
- */
- unsigned int bfq_wr_min_idle_time;
- /*
- * Minimum period between request arrivals after which
- * weight-raising may be reactivated for an already busy async
- * queue (in jiffies).
- */
- unsigned long bfq_wr_min_inter_arr_async;
-
- /* Max service-rate for a soft real-time queue, in sectors/sec */
- unsigned int bfq_wr_max_softrt_rate;
- /*
- * Cached value of the product R*T, used for computing the
- * maximum duration of weight raising automatically.
- */
- u64 RT_prod;
- /* device-speed class for the low-latency heuristic */
- enum bfq_device_speed device_speed;
-
- /* fallback dummy bfqq for extreme OOM conditions */
- struct bfq_queue oom_bfqq;
-};
-
-enum bfqq_state_flags {
- BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */
- BFQ_BFQQ_FLAG_busy, /* has requests or is in service */
- BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
- BFQ_BFQQ_FLAG_non_blocking_wait_rq, /*
- * waiting for a request
- * without idling the device
- */
- BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
- BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
- BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */
- BFQ_BFQQ_FLAG_sync, /* synchronous queue */
- BFQ_BFQQ_FLAG_IO_bound, /*
- * bfqq has timed-out at least once
- * having consumed at most 2/10 of
- * its budget
- */
- BFQ_BFQQ_FLAG_in_large_burst, /*
- * bfqq activated in a large burst,
- * see comments to bfq_handle_burst.
- */
- BFQ_BFQQ_FLAG_softrt_update, /*
- * may need softrt-next-start
- * update
- */
- BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
- BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */
-};
-
-#define BFQ_BFQQ_FNS(name) \
-static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
-{ \
- (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
-} \
-static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
-{ \
- (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
-} \
-static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
-{ \
- return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
-}
-
-BFQ_BFQQ_FNS(just_created);
-BFQ_BFQQ_FNS(busy);
-BFQ_BFQQ_FNS(wait_request);
-BFQ_BFQQ_FNS(non_blocking_wait_rq);
-BFQ_BFQQ_FNS(must_alloc);
-BFQ_BFQQ_FNS(fifo_expire);
-BFQ_BFQQ_FNS(has_short_ttime);
-BFQ_BFQQ_FNS(sync);
-BFQ_BFQQ_FNS(IO_bound);
-BFQ_BFQQ_FNS(in_large_burst);
-BFQ_BFQQ_FNS(coop);
-BFQ_BFQQ_FNS(split_coop);
-BFQ_BFQQ_FNS(softrt_update);
-#undef BFQ_BFQQ_FNS
-
-/* Logging facilities. */
-#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
- char __pbuf[128]; \
- \
- assert_spin_locked((bfqd)->queue->queue_lock); \
- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
- pr_crit("bfq%d%c %s " fmt "\n", \
- (bfqq)->pid, \
- bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
- __pbuf, ##args); \
-} while (0)
-
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
- char __pbuf[128]; \
- \
- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
- pr_crit("%s " fmt "\n", __pbuf, ##args); \
-} while (0)
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
- pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid, \
- bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
- ##args)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
-
-#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log(bfqd, fmt, args...) \
- pr_crit("bfq " fmt "\n", ##args)
-
-#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
- char __pbuf[128]; \
- \
- assert_spin_locked((bfqd)->queue->queue_lock); \
- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \
- (bfqq)->pid, \
- bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
- __pbuf, ##args); \
-} while (0)
-
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
- char __pbuf[128]; \
- \
- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
-} while (0)
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
- bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
- ##args)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
-
-#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log(bfqd, fmt, args...) \
- blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
-#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
-
-/* Expiration reasons. */
-enum bfqq_expiration {
- BFQ_BFQQ_TOO_IDLE = 0, /*
- * queue has been idling for
- * too long
- */
- BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
- BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
- BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
- BFQ_BFQQ_PREEMPTED /* preemption in progress */
-};
-
-
-struct bfqg_stats {
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- /* number of ios merged */
- struct blkg_rwstat merged;
- /* total time spent on device in ns, may not be accurate w/ queueing */
- struct blkg_rwstat service_time;
- /* total time spent waiting in scheduler queue in ns */
- struct blkg_rwstat wait_time;
- /* number of IOs queued up */
- struct blkg_rwstat queued;
- /* total disk time and nr sectors dispatched by this group */
- struct blkg_stat time;
- /* sum of number of ios queued across all samples */
- struct blkg_stat avg_queue_size_sum;
- /* count of samples taken for average */
- struct blkg_stat avg_queue_size_samples;
- /* how many times this group has been removed from service tree */
- struct blkg_stat dequeue;
- /* total time spent waiting for it to be assigned a timeslice. */
- struct blkg_stat group_wait_time;
- /* time spent idling for this blkcg_gq */
- struct blkg_stat idle_time;
- /* total time with empty current active q with other requests queued */
- struct blkg_stat empty_time;
- /* fields after this shouldn't be cleared on stat reset */
- uint64_t start_group_wait_time;
- uint64_t start_idle_time;
- uint64_t start_empty_time;
- uint16_t flags;
-#endif
-};
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-/*
- * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
- *
- * @ps: @blkcg_policy_storage that this structure inherits
- * @weight: weight of the bfq_group
- */
-struct bfq_group_data {
- /* must be the first member */
- struct blkcg_policy_data pd;
-
- unsigned int weight;
-};
-
-/**
- * struct bfq_group - per (device, cgroup) data structure.
- * @entity: schedulable entity to insert into the parent group sched_data.
- * @sched_data: own sched_data, to contain child entities (they may be
- * both bfq_queues and bfq_groups).
- * @bfqd: the bfq_data for the device this group acts upon.
- * @async_bfqq: array of async queues for all the tasks belonging to
- * the group, one queue per ioprio value per ioprio_class,
- * except for the idle class that has only one queue.
- * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
- * @my_entity: pointer to @entity, %NULL for the toplevel group; used
- * to avoid too many special cases during group creation/
- * migration.
- * @active_entities: number of active entities belonging to the group;
- * unused for the root group. Used to know whether there
- * are groups with more than one active @bfq_entity
- * (see the comments to the function
- * bfq_bfqq_may_idle()).
- * @rq_pos_tree: rbtree sorted by next_request position, used when
- * determining if two or more queues have interleaving
- * requests (see bfq_find_close_cooperator()).
- *
- * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
- * there is a set of bfq_groups, each one collecting the lower-level
- * entities belonging to the group that are acting on the same device.
- *
- * Locking works as follows:
- * o @bfqd is protected by the queue lock, RCU is used to access it
- * from the readers.
- * o All the other fields are protected by the @bfqd queue lock.
- */
-struct bfq_group {
- /* must be the first member */
- struct blkg_policy_data pd;
-
- struct bfq_entity entity;
- struct bfq_sched_data sched_data;
-
- void *bfqd;
-
- struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
- struct bfq_queue *async_idle_bfqq;
-
- struct bfq_entity *my_entity;
-
- int active_entities;
-
- struct rb_root rq_pos_tree;
-
- struct bfqg_stats stats;
-};
-
-#else
-struct bfq_group {
- struct bfq_sched_data sched_data;
-
- struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
- struct bfq_queue *async_idle_bfqq;
-
- struct rb_root rq_pos_tree;
-};
-#endif
-
-static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
-
-static unsigned int bfq_class_idx(struct bfq_entity *entity)
-{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
- return bfqq ? bfqq->ioprio_class - 1 :
- BFQ_DEFAULT_GRP_CLASS - 1;
-}
-
-static struct bfq_service_tree *
-bfq_entity_service_tree(struct bfq_entity *entity)
-{
- struct bfq_sched_data *sched_data = entity->sched_data;
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- unsigned int idx = bfq_class_idx(entity);
-
- BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
- BUG_ON(sched_data == NULL);
-
- if (bfqq)
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "entity_service_tree %p %d",
- sched_data->service_tree + idx, idx);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
-
- bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
- "entity_service_tree %p %d",
- sched_data->service_tree + idx, idx);
- }
-#endif
- return sched_data->service_tree + idx;
-}
-
-static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
-{
- return bic->bfqq[is_sync];
-}
-
-static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
- bool is_sync)
-{
- bic->bfqq[is_sync] = bfqq;
-}
-
-static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
-{
- return bic->icq.q->elevator->elevator_data;
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-
-static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
- if (!group_entity)
- group_entity = &bfqq->bfqd->root_group->entity;
-
- return container_of(group_entity, struct bfq_group, entity);
-}
-
-#else
-
-static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- return bfqq->bfqd->root_group;
-}
-
-#endif
-
-static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
-static void bfq_put_queue(struct bfq_queue *bfqq);
-static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
-static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
- struct bio *bio, bool is_sync,
- struct bfq_io_cq *bic);
-static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
- struct bfq_group *bfqg);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
-#endif
-static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-
-#endif /* _BFQ_H */
diff --git a/drivers/Kconfig b/drivers/Kconfig
index c89e0f383be6..981778f02e56 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -216,8 +216,6 @@ source "drivers/gps/Kconfig"
source "drivers/halls/Kconfig"
-source "drivers/rekernel/Kconfig"
-
source "drivers/kernelsu/Kconfig"
endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index f691364e80c8..8d445b4401be 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -183,5 +183,4 @@ obj-$(CONFIG_SENSORS_SSC) += sensors/
obj-$(CONFIG_TEE) += tee/
obj-$(CONFIG_BCM_GPS_SPI_DRIVER) += gps/
obj-$(CONFIG_HALLS) += halls/
-obj-$(CONFIG_REKERNEL) += rekernel/
obj-$(CONFIG_KSU) += kernelsu/
diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index 491751ab0dbf..bb2a5b581622 100644
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -1,9 +1,8 @@
-# SPDX-License-Identifier: GPL-2.0
menu "Android"
config ANDROID
bool "Android Drivers"
- help
+ ---help---
Enable support for various drivers needed on the Android platform
if ANDROID
@@ -12,7 +11,7 @@ config ANDROID_BINDER_IPC
bool "Android Binder IPC Driver"
depends on MMU
default n
- help
+ ---help---
Binder is used in Android for both communication between processes,
and remote method invocation.
@@ -20,23 +19,11 @@ config ANDROID_BINDER_IPC
Android process, using Binder to identify, invoke and pass arguments
between said processes.
-config ANDROID_BINDERFS
- bool "Android Binderfs filesystem"
- depends on ANDROID_BINDER_IPC
- default n
- help
- Binderfs is a pseudo-filesystem for the Android Binder IPC driver
- which can be mounted per-ipc namespace allowing to run multiple
- instances of Android.
- Each binderfs mount initially only contains a binder-control device.
- It can be used to dynamically allocate new binder IPC devices via
- ioctls.
-
config ANDROID_BINDER_DEVICES
string "Android Binder devices"
depends on ANDROID_BINDER_IPC
default "binder,hwbinder,vndbinder"
- help
+ ---help---
Default value for the binder.devices parameter.
The binder.devices parameter is a comma-separated list of strings
@@ -44,71 +31,29 @@ config ANDROID_BINDER_DEVICES
created. Each binder device has its own context manager, and is
therefore logically separated from the other devices.
+config ANDROID_BINDER_IPC_32BIT
+ bool "Android Binder IPC 32BIT Driver"
+ depends on !64BIT && ANDROID_BINDER_IPC
+ default n
+ ---help---
+ The Binder API has been changed to support both 32 and 64bit
+ applications in a mixed environment.
+
+ Enable this to support an old 32-bit Android user-space (v4.4 and
+ earlier).
+
+ Note that enabling this will break newer Android user-space.
+
config ANDROID_BINDER_IPC_SELFTEST
bool "Android Binder IPC Driver Selftest"
depends on ANDROID_BINDER_IPC
- help
+ ---help---
This feature allows binder selftest to run.
Binder selftest checks the allocation and free of binder buffers
exhaustively with combinations of various buffer sizes and
alignments.
-config ANDROID_DEBUG_SYMBOLS
- bool "Android Debug Symbols"
- help
- Enables export of debug symbols that are useful for offline debugging
- of a kernel. These symbols would be used in vendor modules to find
- addresses of the core kernel symbols for vendor extensions.
-
- This driver is statically compiled into kernel and maintains all the
- required symbol addresses for vendor modules and provides necessary
- interface vendor modules.
-
-config ANDROID_VENDOR_HOOKS
- bool "Android Vendor Hooks"
- depends on TRACEPOINTS
- help
- Enable vendor hooks implemented as tracepoints
-
- Allow vendor modules to attach to tracepoint "hooks" defined via
- DECLARE_HOOK or DECLARE_RESTRICTED_HOOK.
-
-config ANDROID_KABI_RESERVE
- bool "Android KABI reserve padding"
- default y
- help
- This option enables the padding that the Android GKI kernel adds
- to many different kernel structures to support an in-kernel stable ABI
- over the lifespan of support for the kernel.
-
- Only disable this option if you have a system that needs the Android
- kernel drivers, but is NOT an Android GKI kernel image. If disabled
- it has the possibility to make the kernel static and runtime image
- slightly smaller but will NOT be supported by the Google Android
- kernel team.
-
- If even slightly unsure, say Y.
-
-config ANDROID_VENDOR_OEM_DATA
- bool "Android vendor and OEM data padding"
- default y
- help
- This option enables the padding that the Android GKI kernel adds
- to many different kernel structures to support an in-kernel stable ABI
- over the lifespan of support for the kernel as well as OEM additional
- fields that are needed by some of the Android kernel tracepoints. The
- macros enabled by this option are used to enable padding in vendor modules
- used for the above specified purposes.
-
- Only disable this option if you have a system that needs the Android
- kernel drivers, but is NOT an Android GKI kernel image and you do NOT
- use the Android kernel tracepoints. If disabled it has the possibility
- to make the kernel static and runtime image slightly smaller but will
- NOT be supported by the Google Android kernel team.
-
- If even slightly unsure, say Y.
-
endif # if ANDROID
endmenu
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index f1ac44102987..a01254c43ee3 100644
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -1,8 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0-only
ccflags-y += -I$(src) # needed for trace events
-obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
-obj-$(CONFIG_ANDROID_DEBUG_SYMBOLS) += android_debug_symbols.o
-obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o
diff --git a/drivers/android/android_debug_symbols.c b/drivers/android/android_debug_symbols.c
deleted file mode 100644
index dd75ddac2085..000000000000
--- a/drivers/android/android_debug_symbols.c
+++ /dev/null
@@ -1,149 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-/*
- * Copyright (c) 2021, The Linux Foundation. All rights reserved.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include "../../mm/slab.h"
-#include
-#include
-#include
-#include
-#include
-
-struct ads_entry {
- char *name;
- void *addr;
-};
-
-bool ads_page_owner;
-bool ads_slub_debug;
-unsigned long ads_vmalloc_nr_pages;
-unsigned long ads_pcpu_nr_pages;
-
-#define _ADS_ENTRY(index, symbol) \
- [index] = { .name = #symbol, .addr = (void *)symbol }
-#define ADS_ENTRY(index, symbol) _ADS_ENTRY(index, symbol)
-
-#define _ADS_PER_CPU_ENTRY(index, symbol) \
- [index] = { .name = #symbol, .addr = (void *)&symbol }
-#define ADS_PER_CPU_ENTRY(index, symbol) _ADS_PER_CPU_ENTRY(index, symbol)
-
-/*
- * This module maintains static array of symbol and address information.
- * Add all required core kernel symbols and their addresses into ads_entries[] array,
- * so that vendor modules can query and to find address of non-exported symbol.
- */
-static const struct ads_entry ads_entries[ADS_END] = {
- ADS_ENTRY(ADS_SDATA, _sdata),
- ADS_ENTRY(ADS_BSS_END, __bss_stop),
- ADS_ENTRY(ADS_PER_CPU_START, __per_cpu_start),
- ADS_ENTRY(ADS_PER_CPU_END, __per_cpu_end),
- ADS_ENTRY(ADS_START_RO_AFTER_INIT, __start_ro_after_init),
- ADS_ENTRY(ADS_END_RO_AFTER_INIT, __end_ro_after_init),
- ADS_ENTRY(ADS_LINUX_BANNER, linux_banner),
-#ifdef CONFIG_CMA
- ADS_ENTRY(ADS_TOTAL_CMA, &totalcma_pages),
-#endif
- ADS_ENTRY(ADS_SLAB_CACHES, &slab_caches),
- ADS_ENTRY(ADS_SLAB_MUTEX, &slab_mutex),
- ADS_ENTRY(ADS_MIN_LOW_PFN, &min_low_pfn),
- ADS_ENTRY(ADS_MAX_PFN, &max_pfn),
- ADS_ENTRY(ADS_VMALLOC_NR_PAGES, &ads_vmalloc_nr_pages),
- ADS_ENTRY(ADS_PCPU_NR_PAGES, &ads_pcpu_nr_pages),
-#ifdef CONFIG_PAGE_OWNER
- ADS_ENTRY(ADS_PAGE_OWNER_ENABLED, &ads_page_owner),
-#endif
-#ifdef CONFIG_SLUB_DEBUG
- ADS_ENTRY(ADS_SLUB_DEBUG, &ads_slub_debug),
-#endif
-#ifdef CONFIG_SWAP
- ADS_ENTRY(ADS_NR_SWAP_PAGES, &nr_swap_pages),
-#endif
-#ifdef CONFIG_MMU
- ADS_ENTRY(ADS_MMAP_MIN_ADDR, &mmap_min_addr),
-#endif
- ADS_ENTRY(ADS_STACK_GUARD_GAP, &stack_guard_gap),
-#ifdef CONFIG_SYSCTL
- ADS_ENTRY(ADS_SYSCTL_LEGACY_VA_LAYOUT, &sysctl_legacy_va_layout),
-#endif
- ADS_ENTRY(ADS_SHOW_MEM, show_mem),
-#ifdef CONFIG_ARM64
- ADS_ENTRY(ADS_PUT_TASK_STACK, put_task_stack),
-#endif
-};
-
-/*
- * ads_per_cpu_entries array contains all the per_cpu variable address information.
- */
-static const struct ads_entry ads_per_cpu_entries[ADS_DEBUG_PER_CPU_END] = {
-#ifdef CONFIG_ARM64
- ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, irq_stack_ptr),
-#endif
-#ifdef CONFIG_X86
- ADS_PER_CPU_ENTRY(ADS_IRQ_STACK_PTR, hardirq_stack_ptr),
-#endif
-};
-
-/*
- * android_debug_symbol - Provide address inforamtion of debug symbol.
- * @symbol: Index of debug symbol array.
- *
- * Return address of core kernel symbol on success and a negative errno will be
- * returned in error cases.
- *
- */
-void *android_debug_symbol(enum android_debug_symbol symbol)
-{
- if (symbol >= ADS_END)
- return ERR_PTR(-EINVAL);
-
- return ads_entries[symbol].addr;
-}
-EXPORT_SYMBOL_NS_GPL(android_debug_symbol, MINIDUMP);
-
-/*
- * android_debug_per_cpu_symbol - Provide address inforamtion of per cpu debug symbol.
- * @symbol: Index of per cpu debug symbol array.
- *
- * Return address of core kernel symbol on success and a negative errno will be
- * returned in error cases.
- *
- */
-void *android_debug_per_cpu_symbol(enum android_debug_per_cpu_symbol symbol)
-{
- if (symbol >= ADS_DEBUG_PER_CPU_END)
- return ERR_PTR(-EINVAL);
-
- return ads_per_cpu_entries[symbol].addr;
-}
-EXPORT_SYMBOL_NS_GPL(android_debug_per_cpu_symbol, MINIDUMP);
-
-static int __init debug_symbol_init(void)
-{
-#ifdef CONFIG_PAGE_OWNER
- ads_page_owner = page_owner_ops.need();
-#endif
-#ifdef CONFIG_SLUB_DEBUG
- ads_slub_debug = __slub_debug_enabled();
-#endif
- ads_vmalloc_nr_pages = vmalloc_nr_pages();
- ads_pcpu_nr_pages = pcpu_nr_pages();
- return 0;
-}
-module_init(debug_symbol_init);
-
-static void __exit debug_symbol_exit(void)
-{ }
-module_exit(debug_symbol_exit);
-
-MODULE_DESCRIPTION("Debug Symbol Driver");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 977cb783ea0b..20356105e4ba 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -1,9 +1,18 @@
-// SPDX-License-Identifier: GPL-2.0-only
/* binder.c
*
* Android IPC Subsystem
*
* Copyright (C) 2007-2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
*/
/*
@@ -42,6 +51,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include
#include
#include
#include
@@ -54,28 +64,13 @@
#include
#include
#include
-#include
-#include
+#include
#include
-#include
#include
#include
#include
#include
-#include
-#include
-#include
-#include
-
-#include
-#ifdef CONFIG_REKERNEL
-#include <../rekernel/rekernel.h>
-#endif /* CONFIG_REKERNEL */
-#include
-
-#include
-
-#include "binder_internal.h"
+#include "binder_alloc.h"
#include "binder_trace.h"
static HLIST_HEAD(binder_deferred_list);
@@ -92,11 +87,36 @@ static struct dentry *binder_debugfs_dir_entry_root;
static struct dentry *binder_debugfs_dir_entry_proc;
static atomic_t binder_last_id;
-static int proc_show(struct seq_file *m, void *unused);
-DEFINE_SHOW_ATTRIBUTE(proc);
+#define BINDER_DEBUG_ENTRY(name) \
+static int binder_##name##_open(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, binder_##name##_show, inode->i_private); \
+} \
+\
+static const struct file_operations binder_##name##_fops = { \
+ .owner = THIS_MODULE, \
+ .open = binder_##name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+}
+
+static int binder_proc_show(struct seq_file *m, void *unused);
+BINDER_DEBUG_ENTRY(proc);
+
+/* This is only defined in include/asm-arm/sizes.h */
+#ifndef SZ_1K
+#define SZ_1K 0x400
+#endif
+
+#ifndef SZ_4M
+#define SZ_4M 0x400000
+#endif
#define FORBIDDEN_MMAP_FLAGS (VM_WRITE)
+#define BINDER_SMALL_BUF_SIZE (PAGE_SIZE * 64)
+
enum {
BINDER_DEBUG_USER_ERROR = 1U << 0,
BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1,
@@ -118,8 +138,8 @@ static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR |
BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION;
module_param_named(debug_mask, binder_debug_mask, uint, 0644);
-char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
-module_param_named(devices, binder_devices_param, charp, 0444);
+static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
+module_param_named(devices, binder_devices_param, charp, S_IRUGO);
static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait);
static int binder_stop_on_user_error;
@@ -140,13 +160,13 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error,
#define binder_debug(mask, x...) \
do { \
if (binder_debug_mask & mask) \
- pr_info_ratelimited(x); \
+ pr_info(x); \
} while (0)
#define binder_user_error(x...) \
do { \
if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) \
- pr_info_ratelimited(x); \
+ pr_info(x); \
if (binder_stop_on_user_error) \
binder_stop_on_user_error = 2; \
} while (0)
@@ -162,6 +182,24 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error,
#define to_binder_fd_array_object(hdr) \
container_of(hdr, struct binder_fd_array_object, hdr)
+enum binder_stat_types {
+ BINDER_STAT_PROC,
+ BINDER_STAT_THREAD,
+ BINDER_STAT_NODE,
+ BINDER_STAT_REF,
+ BINDER_STAT_DEATH,
+ BINDER_STAT_TRANSACTION,
+ BINDER_STAT_TRANSACTION_COMPLETE,
+ BINDER_STAT_COUNT
+};
+
+struct binder_stats {
+ atomic_t br[_IOC_NR(BR_FAILED_REPLY) + 1];
+ atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1];
+ atomic_t obj_created[BINDER_STAT_COUNT];
+ atomic_t obj_deleted[BINDER_STAT_COUNT];
+};
+
static struct binder_stats binder_stats;
static inline void binder_stats_deleted(enum binder_stat_types type)
@@ -189,26 +227,16 @@ struct binder_transaction_log_entry {
int return_error_line;
uint32_t return_error;
uint32_t return_error_param;
- char context_name[BINDERFS_MAX_NAME + 1];
+ const char *context_name;
};
-
struct binder_transaction_log {
atomic_t cur;
bool full;
struct binder_transaction_log_entry entry[32];
};
-
static struct binder_transaction_log binder_transaction_log;
static struct binder_transaction_log binder_transaction_log_failed;
-static struct kmem_cache *binder_node_pool;
-static struct kmem_cache *binder_proc_pool;
-static struct kmem_cache *binder_ref_death_pool;
-static struct kmem_cache *binder_ref_pool;
-static struct kmem_cache *binder_thread_pool;
-static struct kmem_cache *binder_transaction_pool;
-static struct kmem_cache *binder_work_pool;
-
static struct binder_transaction_log_entry *binder_transaction_log_add(
struct binder_transaction_log *log)
{
@@ -229,9 +257,320 @@ static struct binder_transaction_log_entry *binder_transaction_log_add(
return e;
}
+struct binder_context {
+ struct binder_node *binder_context_mgr_node;
+ struct mutex context_mgr_node_lock;
+
+ kuid_t binder_context_mgr_uid;
+ const char *name;
+};
+
+struct binder_device {
+ struct hlist_node hlist;
+ struct miscdevice miscdev;
+ struct binder_context context;
+};
+
+/**
+ * struct binder_work - work enqueued on a worklist
+ * @entry: node enqueued on list
+ * @type: type of work to be performed
+ *
+ * There are separate work lists for proc, thread, and node (async).
+ */
+struct binder_work {
+ struct list_head entry;
+
+ enum binder_work_type {
+ BINDER_WORK_TRANSACTION = 1,
+ BINDER_WORK_TRANSACTION_COMPLETE,
+ BINDER_WORK_RETURN_ERROR,
+ BINDER_WORK_NODE,
+ BINDER_WORK_DEAD_BINDER,
+ BINDER_WORK_DEAD_BINDER_AND_CLEAR,
+ BINDER_WORK_CLEAR_DEATH_NOTIFICATION,
+ } type;
+};
+
+struct binder_error {
+ struct binder_work work;
+ uint32_t cmd;
+};
+
+/**
+ * struct binder_node - binder node bookkeeping
+ * @debug_id: unique ID for debugging
+ * (invariant after initialized)
+ * @lock: lock for node fields
+ * @work: worklist element for node work
+ * (protected by @proc->inner_lock)
+ * @rb_node: element for proc->nodes tree
+ * (protected by @proc->inner_lock)
+ * @dead_node: element for binder_dead_nodes list
+ * (protected by binder_dead_nodes_lock)
+ * @proc: binder_proc that owns this node
+ * (invariant after initialized)
+ * @refs: list of references on this node
+ * (protected by @lock)
+ * @internal_strong_refs: used to take strong references when
+ * initiating a transaction
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @local_weak_refs: weak user refs from local process
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @local_strong_refs: strong user refs from local process
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @tmp_refs: temporary kernel refs
+ * (protected by @proc->inner_lock while @proc
+ * is valid, and by binder_dead_nodes_lock
+ * if @proc is NULL. During inc/dec and node release
+ * it is also protected by @lock to provide safety
+ * as the node dies and @proc becomes NULL)
+ * @ptr: userspace pointer for node
+ * (invariant, no lock needed)
+ * @cookie: userspace cookie for node
+ * (invariant, no lock needed)
+ * @has_strong_ref: userspace notified of strong ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @pending_strong_ref: userspace has acked notification of strong ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @has_weak_ref: userspace notified of weak ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @pending_weak_ref: userspace has acked notification of weak ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @has_async_transaction: async transaction to node in progress
+ * (protected by @lock)
+ * @sched_policy: minimum scheduling policy for node
+ * (invariant after initialized)
+ * @accept_fds: file descriptor operations supported for node
+ * (invariant after initialized)
+ * @min_priority: minimum scheduling priority
+ * (invariant after initialized)
+ * @inherit_rt: inherit RT scheduling policy from caller
+ * @txn_security_ctx: require sender's security context
+ * (invariant after initialized)
+ * @async_todo: list of async work items
+ * (protected by @proc->inner_lock)
+ *
+ * Bookkeeping structure for binder nodes.
+ */
+struct binder_node {
+ int debug_id;
+ spinlock_t lock;
+ struct binder_work work;
+ union {
+ struct rb_node rb_node;
+ struct hlist_node dead_node;
+ };
+ struct binder_proc *proc;
+ struct hlist_head refs;
+ int internal_strong_refs;
+ int local_weak_refs;
+ int local_strong_refs;
+ int tmp_refs;
+ binder_uintptr_t ptr;
+ binder_uintptr_t cookie;
+ struct {
+ /*
+ * bitfield elements protected by
+ * proc inner_lock
+ */
+ u8 has_strong_ref:1;
+ u8 pending_strong_ref:1;
+ u8 has_weak_ref:1;
+ u8 pending_weak_ref:1;
+ };
+ struct {
+ /*
+ * invariant after initialization
+ */
+ u8 sched_policy:2;
+ u8 inherit_rt:1;
+ u8 accept_fds:1;
+ u8 txn_security_ctx:1;
+ u8 min_priority;
+ };
+ bool has_async_transaction;
+ struct list_head async_todo;
+};
+
+struct binder_ref_death {
+ /**
+ * @work: worklist element for death notifications
+ * (protected by inner_lock of the proc that
+ * this ref belongs to)
+ */
+ struct binder_work work;
+ binder_uintptr_t cookie;
+};
+
+/**
+ * struct binder_ref_data - binder_ref counts and id
+ * @debug_id: unique ID for the ref
+ * @desc: unique userspace handle for ref
+ * @strong: strong ref count (debugging only if not locked)
+ * @weak: weak ref count (debugging only if not locked)
+ *
+ * Structure to hold ref count and ref id information. Since
+ * the actual ref can only be accessed with a lock, this structure
+ * is used to return information about the ref to callers of
+ * ref inc/dec functions.
+ */
+struct binder_ref_data {
+ int debug_id;
+ uint32_t desc;
+ int strong;
+ int weak;
+};
+
+/**
+ * struct binder_ref - struct to track references on nodes
+ * @data: binder_ref_data containing id, handle, and current refcounts
+ * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree
+ * @rb_node_node: node for lookup by @node in proc's rb_tree
+ * @node_entry: list entry for node->refs list in target node
+ * (protected by @node->lock)
+ * @proc: binder_proc containing ref
+ * @node: binder_node of target node. When cleaning up a
+ * ref for deletion in binder_cleanup_ref, a non-NULL
+ * @node indicates the node must be freed
+ * @death: pointer to death notification (ref_death) if requested
+ * (protected by @node->lock)
+ *
+ * Structure to track references from procA to target node (on procB). This
+ * structure is unsafe to access without holding @proc->outer_lock.
+ */
+struct binder_ref {
+ /* Lookups needed: */
+ /* node + proc => ref (transaction) */
+ /* desc + proc => ref (transaction, inc/dec ref) */
+ /* node => refs + procs (proc exit) */
+ struct binder_ref_data data;
+ struct rb_node rb_node_desc;
+ struct rb_node rb_node_node;
+ struct hlist_node node_entry;
+ struct binder_proc *proc;
+ struct binder_node *node;
+ struct binder_ref_death *death;
+};
+
enum binder_deferred_state {
- BINDER_DEFERRED_FLUSH = 0x01,
- BINDER_DEFERRED_RELEASE = 0x02,
+ BINDER_DEFERRED_PUT_FILES = 0x01,
+ BINDER_DEFERRED_FLUSH = 0x02,
+ BINDER_DEFERRED_RELEASE = 0x04,
+};
+
+/**
+ * struct binder_priority - scheduler policy and priority
+ * @sched_policy: scheduler policy
+ * @prio: [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT
+ *
+ * The binder driver supports inheriting the following scheduler policies:
+ * SCHED_NORMAL
+ * SCHED_BATCH
+ * SCHED_FIFO
+ * SCHED_RR
+ */
+struct binder_priority {
+ unsigned int sched_policy;
+ int prio;
+};
+
+/**
+ * struct binder_proc - binder process bookkeeping
+ * @proc_node: element for binder_procs list
+ * @threads: rbtree of binder_threads in this proc
+ * (protected by @inner_lock)
+ * @nodes: rbtree of binder nodes associated with
+ * this proc ordered by node->ptr
+ * (protected by @inner_lock)
+ * @refs_by_desc: rbtree of refs ordered by ref->desc
+ * (protected by @outer_lock)
+ * @refs_by_node: rbtree of refs ordered by ref->node
+ * (protected by @outer_lock)
+ * @waiting_threads: threads currently waiting for proc work
+ * (protected by @inner_lock)
+ * @pid: PID of group_leader of process
+ * (invariant after initialized)
+ * @tsk: task_struct for group_leader of process
+ * (invariant after initialized)
+ * @files: files_struct for process
+ * (protected by @files_lock)
+ * @files_lock: mutex to protect @files
+ * @cred: struct cred associated with the `struct file`
+ * in binder_open()
+ * (invariant after initialized)
+ * @deferred_work_node: element for binder_deferred_list
+ * (protected by binder_deferred_lock)
+ * @deferred_work: bitmap of deferred work to perform
+ * (protected by binder_deferred_lock)
+ * @is_dead: process is dead and awaiting free
+ * when outstanding transactions are cleaned up
+ * (protected by @inner_lock)
+ * @todo: list of work for this process
+ * (protected by @inner_lock)
+ * @stats: per-process binder statistics
+ * (atomics, no lock needed)
+ * @delivered_death: list of delivered death notification
+ * (protected by @inner_lock)
+ * @max_threads: cap on number of binder threads
+ * (protected by @inner_lock)
+ * @requested_threads: number of binder threads requested but not
+ * yet started. In current implementation, can
+ * only be 0 or 1.
+ * (protected by @inner_lock)
+ * @requested_threads_started: number of binder threads started
+ * (protected by @inner_lock)
+ * @tmp_ref: temporary reference to indicate proc is in use
+ * (atomic since @proc->inner_lock cannot
+ * always be acquired)
+ * @default_priority: default scheduler priority
+ * (invariant after initialized)
+ * @debugfs_entry: debugfs node
+ * @alloc: binder allocator bookkeeping
+ * @context: binder_context for this proc
+ * (invariant after initialized)
+ * @inner_lock: can nest under outer_lock and/or node lock
+ * @outer_lock: no nesting under inner or node lock
+ * Lock order: 1) outer, 2) node, 3) inner
+ *
+ * Bookkeeping structure for binder processes
+ */
+struct binder_proc {
+ struct hlist_node proc_node;
+ struct rb_root threads;
+ struct rb_root nodes;
+ struct rb_root refs_by_desc;
+ struct rb_root refs_by_node;
+ struct list_head waiting_threads;
+ int pid;
+ struct task_struct *tsk;
+ struct files_struct *files;
+ struct mutex files_lock;
+ const struct cred *cred;
+ struct hlist_node deferred_work_node;
+ int deferred_work;
+ bool is_dead;
+
+ struct list_head todo;
+ struct binder_stats stats;
+ struct list_head delivered_death;
+ int max_threads;
+ int requested_threads;
+ int requested_threads_started;
+ atomic_t tmp_ref;
+ struct binder_priority default_priority;
+ struct dentry *debugfs_entry;
+ struct binder_alloc alloc;
+ struct binder_context *context;
+ spinlock_t inner_lock;
+ spinlock_t outer_lock;
};
enum {
@@ -243,6 +582,110 @@ enum {
BINDER_LOOPER_STATE_POLL = 0x20,
};
+/**
+ * struct binder_thread - binder thread bookkeeping
+ * @proc: binder process for this thread
+ * (invariant after initialization)
+ * @rb_node: element for proc->threads rbtree
+ * (protected by @proc->inner_lock)
+ * @waiting_thread_node: element for @proc->waiting_threads list
+ * (protected by @proc->inner_lock)
+ * @pid: PID for this thread
+ * (invariant after initialization)
+ * @looper: bitmap of looping state
+ * (only accessed by this thread)
+ * @looper_need_return: looping thread needs to exit driver
+ * (no lock needed)
+ * @transaction_stack: stack of in-progress transactions for this thread
+ * (protected by @proc->inner_lock)
+ * @todo: list of work to do for this thread
+ * (protected by @proc->inner_lock)
+ * @process_todo: whether work in @todo should be processed
+ * (protected by @proc->inner_lock)
+ * @return_error: transaction errors reported by this thread
+ * (only accessed by this thread)
+ * @reply_error: transaction errors reported by target thread
+ * (protected by @proc->inner_lock)
+ * @wait: wait queue for thread work
+ * @stats: per-thread statistics
+ * (atomics, no lock needed)
+ * @tmp_ref: temporary reference to indicate thread is in use
+ * (atomic since @proc->inner_lock cannot
+ * always be acquired)
+ * @is_dead: thread is dead and awaiting free
+ * when outstanding transactions are cleaned up
+ * (protected by @proc->inner_lock)
+ * @task: struct task_struct for this thread
+ *
+ * Bookkeeping structure for binder threads.
+ */
+struct binder_thread {
+ struct binder_proc *proc;
+ struct rb_node rb_node;
+ struct list_head waiting_thread_node;
+ int pid;
+ int looper; /* only modified by this thread */
+ bool looper_need_return; /* can be written by other thread */
+ struct binder_transaction *transaction_stack;
+ struct list_head todo;
+ bool process_todo;
+ struct binder_error return_error;
+ struct binder_error reply_error;
+ wait_queue_head_t wait;
+ struct binder_stats stats;
+ atomic_t tmp_ref;
+ bool is_dead;
+ struct task_struct *task;
+};
+
+struct binder_transaction { /* per-transaction state; released by binder_free_transaction() */
+ int debug_id;
+ struct binder_work work;
+ struct binder_thread *from; /* see @lock below */
+ struct binder_transaction *from_parent;
+ struct binder_proc *to_proc; /* see @lock below */
+ struct binder_thread *to_thread; /* see @lock below */
+ struct binder_transaction *to_parent;
+ unsigned need_reply:1;
+ /* unsigned is_dead:1; */ /* not used at the moment */
+
+ struct binder_buffer *buffer; /* buffer->transaction is cleared in binder_free_transaction() */
+ unsigned int code;
+ unsigned int flags;
+ struct binder_priority priority; /* starting value of the desired priority in binder_transaction_priority() */
+ struct binder_priority saved_priority; /* task's policy/normal_prio recorded by binder_transaction_priority() */
+ bool set_priority_called; /* set by binder_transaction_priority() so it acts only once per transaction */
+ kuid_t sender_euid;
+ binder_uintptr_t security_ctx;
+ /**
+ * @lock: protects @from, @to_proc, and @to_thread
+ *
+ * @from, @to_proc, and @to_thread can be set to NULL
+ * during thread teardown
+ */
+ spinlock_t lock;
+};
+
+/**
+ * struct binder_object - union of flat binder object types
+ * @hdr: generic object header
+ * @fbo: binder object (nodes and refs)
+ * @fdo: file descriptor object
+ * @bbo: binder buffer pointer
+ * @fdao: file descriptor array
+ *
+ * Used for type-independent object copies (see binder_get_object())
+ */
+struct binder_object {
+ union {
+ struct binder_object_header hdr;
+ struct flat_binder_object fbo;
+ struct binder_fd_object fdo;
+ struct binder_buffer_object bbo;
+ struct binder_fd_array_object fdao;
+ };
+};
+
/**
* binder_proc_lock() - Acquire outer lock for given binder_proc
* @proc: struct binder_proc to acquire
@@ -253,7 +696,6 @@ enum {
#define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__)
static void
_binder_proc_lock(struct binder_proc *proc, int line)
- __acquires(&proc->outer_lock)
{
binder_debug(BINDER_DEBUG_SPINLOCKS,
"%s: line=%d\n", __func__, line);
@@ -269,7 +711,6 @@ _binder_proc_lock(struct binder_proc *proc, int line)
#define binder_proc_unlock(_proc) _binder_proc_unlock(_proc, __LINE__)
static void
_binder_proc_unlock(struct binder_proc *proc, int line)
- __releases(&proc->outer_lock)
{
binder_debug(BINDER_DEBUG_SPINLOCKS,
"%s: line=%d\n", __func__, line);
@@ -285,7 +726,6 @@ _binder_proc_unlock(struct binder_proc *proc, int line)
#define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__)
static void
_binder_inner_proc_lock(struct binder_proc *proc, int line)
- __acquires(&proc->inner_lock)
{
binder_debug(BINDER_DEBUG_SPINLOCKS,
"%s: line=%d\n", __func__, line);
@@ -301,7 +741,6 @@ _binder_inner_proc_lock(struct binder_proc *proc, int line)
#define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__)
static void
_binder_inner_proc_unlock(struct binder_proc *proc, int line)
- __releases(&proc->inner_lock)
{
binder_debug(BINDER_DEBUG_SPINLOCKS,
"%s: line=%d\n", __func__, line);
@@ -317,7 +756,6 @@ _binder_inner_proc_unlock(struct binder_proc *proc, int line)
#define binder_node_lock(node) _binder_node_lock(node, __LINE__)
static void
_binder_node_lock(struct binder_node *node, int line)
- __acquires(&node->lock)
{
binder_debug(BINDER_DEBUG_SPINLOCKS,
"%s: line=%d\n", __func__, line);
@@ -333,7 +771,6 @@ _binder_node_lock(struct binder_node *node, int line)
#define binder_node_unlock(node) _binder_node_unlock(node, __LINE__)
static void
_binder_node_unlock(struct binder_node *node, int line)
- __releases(&node->lock)
{
binder_debug(BINDER_DEBUG_SPINLOCKS,
"%s: line=%d\n", __func__, line);
@@ -350,16 +787,12 @@ _binder_node_unlock(struct binder_node *node, int line)
#define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__)
static void
_binder_node_inner_lock(struct binder_node *node, int line)
- __acquires(&node->lock) __acquires(&node->proc->inner_lock)
{
binder_debug(BINDER_DEBUG_SPINLOCKS,
"%s: line=%d\n", __func__, line);
spin_lock(&node->lock);
if (node->proc)
binder_inner_proc_lock(node->proc);
- else
- /* annotation for sparse */
- __acquire(&node->proc->inner_lock);
}
/**
@@ -371,7 +804,6 @@ _binder_node_inner_lock(struct binder_node *node, int line)
#define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__)
static void
_binder_node_inner_unlock(struct binder_node *node, int line)
- __releases(&node->lock) __releases(&node->proc->inner_lock)
{
struct binder_proc *proc = node->proc;
@@ -379,9 +811,6 @@ _binder_node_inner_unlock(struct binder_node *node, int line)
"%s: line=%d\n", __func__, line);
if (proc)
binder_inner_proc_unlock(proc);
- else
- /* annotation for sparse */
- __release(&node->proc->inner_lock);
spin_unlock(&node->lock);
}
@@ -442,7 +871,6 @@ static void
binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread,
struct binder_work *work)
{
- WARN_ON(!list_empty(&thread->waiting_thread_node));
binder_enqueue_work_ilocked(work, &thread->todo);
}
@@ -460,7 +888,6 @@ static void
binder_enqueue_thread_work_ilocked(struct binder_thread *thread,
struct binder_work *work)
{
- WARN_ON(!list_empty(&thread->waiting_thread_node));
binder_enqueue_work_ilocked(work, &thread->todo);
thread->process_todo = true;
}
@@ -521,13 +948,69 @@ static void binder_free_thread(struct binder_thread *thread);
static void binder_free_proc(struct binder_proc *proc);
static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
+static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
+{ /* returns __alloc_fd()'s result for proc->files, or -ESRCH / -EMFILE on the early exits below */
+ unsigned long rlim_cur;
+ unsigned long irqs;
+ int ret;
+
+ mutex_lock(&proc->files_lock); /* proc->files is only read with files_lock held */
+ if (proc->files == NULL) {
+ ret = -ESRCH; /* proc has no files_struct to allocate from */
+ goto err;
+ }
+ if (!lock_task_sighand(proc->tsk, &irqs)) {
+ ret = -EMFILE; /* could not take the sighand lock held around task_rlimit() */
+ goto err;
+ }
+ rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
+ unlock_task_sighand(proc->tsk, &irqs);
+
+ ret = __alloc_fd(proc->files, 0, rlim_cur, flags); /* passes 0 and the task's RLIMIT_NOFILE value as the range arguments */
+err:
+ mutex_unlock(&proc->files_lock); /* shared exit: both the error and success paths drop files_lock here */
+ return ret;
+}
+
+/* task_fd_install() - copied from fd_install.
+ * Calls __fd_install() for @fd/@file on proc->files while holding
+ * proc->files_lock; does nothing when proc->files is NULL. */
+static void task_fd_install(
+ struct binder_proc *proc, unsigned int fd, struct file *file)
+{
+ mutex_lock(&proc->files_lock);
+ if (proc->files)
+ __fd_install(proc->files, fd, file);
+ mutex_unlock(&proc->files_lock);
+}
+
+/* task_close_fd() - copied from sys_close.
+ * Calls __close_fd() for @fd on proc->files under proc->files_lock. Returns
+ * -ESRCH if proc->files is NULL; restart codes are converted to -EINTR. */
+static long task_close_fd(struct binder_proc *proc, unsigned int fd)
+{
+ int retval;
+
+ mutex_lock(&proc->files_lock);
+ if (proc->files == NULL) {
+ retval = -ESRCH;
+ goto err;
+ }
+ retval = __close_fd(proc->files, fd);
+ /* can't restart close syscall because file table entry was cleared */
+ if (unlikely(retval == -ERESTARTSYS ||
+ retval == -ERESTARTNOINTR ||
+ retval == -ERESTARTNOHAND ||
+ retval == -ERESTART_RESTARTBLOCK))
+ retval = -EINTR;
+err:
+ mutex_unlock(&proc->files_lock);
+ return retval;
+}
+
static bool binder_has_work_ilocked(struct binder_thread *thread,
bool do_proc_work)
{
- int ret = 0;
-
- if (ret)
- return true;
return thread->process_todo ||
thread->looper_need_return ||
(do_proc_work &&
@@ -672,7 +1155,7 @@ static int to_userspace_prio(int policy, int kernel_priority)
if (is_fair_policy(policy))
return PRIO_TO_NICE(kernel_priority);
else
- return MAX_RT_PRIO - 1 - kernel_priority;
+ return MAX_USER_RT_PRIO - 1 - kernel_priority;
}
static int to_kernel_prio(int policy, int user_priority)
@@ -680,29 +1163,23 @@ static int to_kernel_prio(int policy, int user_priority)
if (is_fair_policy(policy))
return NICE_TO_PRIO(user_priority);
else
- return MAX_RT_PRIO - 1 - user_priority;
+ return MAX_USER_RT_PRIO - 1 - user_priority;
}
-static void binder_do_set_priority(struct binder_thread *thread,
- const struct binder_priority *desired,
+static void binder_do_set_priority(struct task_struct *task,
+ struct binder_priority desired,
bool verify)
{
- struct task_struct *task = thread->task;
int priority; /* user-space prio value */
bool has_cap_nice;
- unsigned int policy = desired->sched_policy;
+ unsigned int policy = desired.sched_policy;
- if (task->policy == policy && task->normal_prio == desired->prio) {
- spin_lock(&thread->prio_lock);
- if (thread->prio_state == BINDER_PRIO_PENDING)
- thread->prio_state = BINDER_PRIO_SET;
- spin_unlock(&thread->prio_lock);
+ if (task->policy == policy && task->normal_prio == desired.prio)
return;
- }
has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE);
- priority = to_userspace_prio(policy, desired->prio);
+ priority = to_userspace_prio(policy, desired.prio);
if (verify && is_rt_policy(policy) && !has_cap_nice) {
long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO);
@@ -727,30 +1204,16 @@ static void binder_do_set_priority(struct binder_thread *thread,
}
}
- if (policy != desired->sched_policy ||
- to_kernel_prio(policy, priority) != desired->prio)
+ if (policy != desired.sched_policy ||
+ to_kernel_prio(policy, priority) != desired.prio)
binder_debug(BINDER_DEBUG_PRIORITY_CAP,
"%d: priority %d not allowed, using %d instead\n",
- task->pid, desired->prio,
+ task->pid, desired.prio,
to_kernel_prio(policy, priority));
trace_binder_set_priority(task->tgid, task->pid, task->normal_prio,
to_kernel_prio(policy, priority),
- desired->prio);
-
- spin_lock(&thread->prio_lock);
- if (!verify && thread->prio_state == BINDER_PRIO_ABORT) {
- /*
- * A new priority has been set by an incoming nested
- * transaction. Abort this priority restore and allow
- * the transaction to run at the new desired priority.
- */
- spin_unlock(&thread->prio_lock);
- binder_debug(BINDER_DEBUG_PRIORITY_CAP,
- "%d: %s: aborting priority restore\n",
- thread->pid, __func__);
- return;
- }
+ desired.prio);
/* Set the actual priority */
if (task->policy != policy || is_rt_policy(policy)) {
@@ -764,46 +1227,37 @@ static void binder_do_set_priority(struct binder_thread *thread,
}
if (is_fair_policy(policy))
set_user_nice(task, priority);
-
- thread->prio_state = BINDER_PRIO_SET;
- spin_unlock(&thread->prio_lock);
}
-static void binder_set_priority(struct binder_thread *thread,
- const struct binder_priority *desired)
+static void binder_set_priority(struct task_struct *task,
+ struct binder_priority desired)
{
- binder_do_set_priority(thread, desired, /* verify = */ true);
+ binder_do_set_priority(task, desired, /* verify = */ true);
}
-static void binder_restore_priority(struct binder_thread *thread,
- const struct binder_priority *desired)
+static void binder_restore_priority(struct task_struct *task,
+ struct binder_priority desired)
{
- binder_do_set_priority(thread, desired, /* verify = */ false);
+ binder_do_set_priority(task, desired, /* verify = */ false);
}
-static void binder_transaction_priority(struct binder_thread *thread,
+static void binder_transaction_priority(struct task_struct *task,
struct binder_transaction *t,
- struct binder_node *node)
+ struct binder_priority node_prio,
+ bool inherit_rt)
{
- struct task_struct *task = thread->task;
- struct binder_priority desired = t->priority;
- const struct binder_priority node_prio = {
- .sched_policy = node->sched_policy,
- .prio = node->min_priority,
- };
- bool skip = false;
+ struct binder_priority desired_prio = t->priority;
if (t->set_priority_called)
return;
t->set_priority_called = true;
+ t->saved_priority.sched_policy = task->policy;
+ t->saved_priority.prio = task->normal_prio;
- if (skip)
- return;
-
- if (!node->inherit_rt && is_rt_policy(desired.sched_policy)) {
- desired.prio = NICE_TO_PRIO(0);
- desired.sched_policy = SCHED_NORMAL;
+ if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) {
+ desired_prio.prio = NICE_TO_PRIO(0);
+ desired_prio.sched_policy = SCHED_NORMAL;
}
if (node_prio.prio < t->priority.prio ||
@@ -816,29 +1270,10 @@ static void binder_transaction_priority(struct binder_thread *thread,
* SCHED_FIFO, prefer SCHED_FIFO, since it can
* run unbounded, unlike SCHED_RR.
*/
- desired = node_prio;
- }
-
- spin_lock(&thread->prio_lock);
- if (thread->prio_state == BINDER_PRIO_PENDING) {
- /*
- * Task is in the process of changing priorities
- * saving its current values would be incorrect.
- * Instead, save the pending priority and signal
- * the task to abort the priority restore.
- */
- t->saved_priority = thread->prio_next;
- thread->prio_state = BINDER_PRIO_ABORT;
- binder_debug(BINDER_DEBUG_PRIORITY_CAP,
- "%d: saved pending priority %d\n",
- current->pid, thread->prio_next.prio);
- } else {
- t->saved_priority.sched_policy = task->policy;
- t->saved_priority.prio = task->normal_prio;
+ desired_prio = node_prio;
}
- spin_unlock(&thread->prio_lock);
- binder_set_priority(thread, &desired);
+ binder_set_priority(task, desired_prio);
}
static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc,
@@ -945,9 +1380,9 @@ static struct binder_node *binder_init_node_ilocked(
static struct binder_node *binder_new_node(struct binder_proc *proc,
struct flat_binder_object *fp)
{
- struct binder_node *node, *new_node;
+ struct binder_node *node;
+ struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL);
- new_node = kmem_cache_zalloc(binder_node_pool, GFP_KERNEL);
if (!new_node)
return NULL;
binder_inner_proc_lock(proc);
@@ -957,14 +1392,14 @@ static struct binder_node *binder_new_node(struct binder_proc *proc,
/*
* The node was already added by another thread
*/
- kmem_cache_free(binder_node_pool, new_node);
+ kfree(new_node);
return node;
}
static void binder_free_node(struct binder_node *node)
{
- kmem_cache_free(binder_node_pool, node);
+ kfree(node);
binder_stats_deleted(BINDER_STAT_NODE);
}
@@ -982,7 +1417,8 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong,
if (target_list == NULL &&
node->internal_strong_refs == 0 &&
!(node->proc &&
- node == node->proc->context->binder_context_mgr_node &&
+ node == node->proc->context->
+ binder_context_mgr_node &&
node->has_strong_ref)) {
pr_err("invalid inc strong node for %d\n",
node->debug_id);
@@ -992,12 +1428,19 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong,
} else
node->local_strong_refs++;
if (!node->has_strong_ref && target_list) {
- struct binder_thread *thread = container_of(target_list,
- struct binder_thread, todo);
binder_dequeue_work_ilocked(&node->work);
- BUG_ON(&thread->todo != target_list);
- binder_enqueue_deferred_thread_work_ilocked(thread,
- &node->work);
+ /*
+ * Note: this function is the only place where we queue
+ * directly to a thread->todo without using the
+ * corresponding binder_enqueue_thread_work() helper
+ * functions; in this case it's ok to not set the
+ * process_todo flag, since we know this node work will
+ * always be followed by other work that starts queue
+ * processing: in case of synchronous transactions, a
+ * BR_REPLY or BR_ERROR; in case of oneway
+ * transactions, a BR_TRANSACTION_COMPLETE.
+ */
+ binder_enqueue_work_ilocked(&node->work, target_list);
}
} else {
if (!internal)
@@ -1151,14 +1594,10 @@ static void binder_dec_node_tmpref(struct binder_node *node)
binder_node_inner_lock(node);
if (!node->proc)
spin_lock(&binder_dead_nodes_lock);
- else
- __acquire(&binder_dead_nodes_lock);
node->tmp_refs--;
BUG_ON(node->tmp_refs < 0);
if (!node->proc)
spin_unlock(&binder_dead_nodes_lock);
- else
- __release(&binder_dead_nodes_lock);
/*
* Call binder_dec_node() to check if all refcounts are 0
* and cleanup is needed. Calling with strong=0 and internal=1
@@ -1448,9 +1887,8 @@ static void binder_free_ref(struct binder_ref *ref)
{
if (ref->node)
binder_free_node(ref->node);
- if (ref->death)
- kmem_cache_free(binder_ref_death_pool, ref->death);
- kmem_cache_free(binder_ref_pool, ref);
+ kfree(ref->death);
+ kfree(ref);
}
/**
@@ -1543,7 +1981,7 @@ static int binder_inc_ref_for_node(struct binder_proc *proc,
ref = binder_get_ref_for_node_olocked(proc, node, NULL);
if (!ref) {
binder_proc_unlock(proc);
- new_ref = kmem_cache_zalloc(binder_ref_pool, GFP_KERNEL);
+ new_ref = kzalloc(sizeof(*ref), GFP_KERNEL);
if (!new_ref)
return -ENOMEM;
binder_proc_lock(proc);
@@ -1569,7 +2007,7 @@ static int binder_inc_ref_for_node(struct binder_proc *proc,
* Another thread created the ref first so
* free the one we allocated
*/
- kmem_cache_free(binder_ref_pool, new_ref);
+ kfree(new_ref);
return ret;
}
@@ -1628,9 +2066,9 @@ static void binder_thread_dec_tmpref(struct binder_thread *thread)
static void binder_proc_dec_tmpref(struct binder_proc *proc)
{
binder_inner_proc_lock(proc);
- proc->tmp_ref--;
+ atomic_dec(&proc->tmp_ref);
if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) &&
- !proc->tmp_ref) {
+ !atomic_read(&proc->tmp_ref)) {
binder_inner_proc_unlock(proc);
binder_free_proc(proc);
return;
@@ -1674,89 +2112,45 @@ static struct binder_thread *binder_get_txn_from(
*/
static struct binder_thread *binder_get_txn_from_and_acq_inner(
struct binder_transaction *t)
- __acquires(&t->from->proc->inner_lock)
{
struct binder_thread *from;
from = binder_get_txn_from(t);
- if (!from) {
- __acquire(&from->proc->inner_lock);
+ if (!from)
return NULL;
- }
binder_inner_proc_lock(from->proc);
if (t->from) {
BUG_ON(from != t->from);
return from;
}
binder_inner_proc_unlock(from->proc);
- __acquire(&from->proc->inner_lock);
binder_thread_dec_tmpref(from);
return NULL;
}
-/**
- * binder_free_txn_fixups() - free unprocessed fd fixups
- * @t: binder transaction for t->from
- *
- * If the transaction is being torn down prior to being
- * processed by the target process, free all of the
- * fd fixups and fput the file structs. It is safe to
- * call this function after the fixups have been
- * processed -- in that case, the list will be empty.
- */
-static void binder_free_txn_fixups(struct binder_transaction *t)
-{
- struct binder_txn_fd_fixup *fixup, *tmp;
-
- list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) {
- fput(fixup->file);
- list_del(&fixup->fixup_entry);
- kfree(fixup);
- }
-}
-
-static void binder_txn_latency_free(struct binder_transaction *t)
-{
- int from_proc, from_thread, to_proc, to_thread;
-
- spin_lock(&t->lock);
- from_proc = t->from ? t->from->proc->pid : 0;
- from_thread = t->from ? t->from->pid : 0;
- to_proc = t->to_proc ? t->to_proc->pid : 0;
- to_thread = t->to_thread ? t->to_thread->pid : 0;
- spin_unlock(&t->lock);
-
- trace_binder_txn_latency_free(t, from_proc, from_thread, to_proc, to_thread);
-}
-
static void binder_free_transaction(struct binder_transaction *t)
{
- struct binder_proc *target_proc = t->to_proc;
+ struct binder_proc *target_proc;
+ spin_lock(&t->lock);
+ target_proc = t->to_proc;
if (target_proc) {
+ atomic_inc(&target_proc->tmp_ref);
+ spin_unlock(&t->lock);
+
binder_inner_proc_lock(target_proc);
- target_proc->outstanding_txns--;
- if (target_proc->outstanding_txns < 0)
- pr_warn("%s: Unexpected outstanding_txns %d\n",
- __func__, target_proc->outstanding_txns);
- if (!target_proc->outstanding_txns && target_proc->is_frozen)
- wake_up_interruptible_all(&target_proc->freeze_wait);
if (t->buffer)
t->buffer->transaction = NULL;
binder_inner_proc_unlock(target_proc);
+ binder_proc_dec_tmpref(target_proc);
+ } else {
+ /*
+ * If the transaction has no target_proc, then
+ * t->buffer->transaction has already been cleared.
+ */
+ spin_unlock(&t->lock);
}
- if (trace_binder_txn_latency_free_enabled())
- binder_txn_latency_free(t);
- /*
- * If the transaction has no target_proc, then
- * t->buffer->transaction has already been cleared.
- */
- binder_free_txn_fixups(t);
- /*
- * If the transaction has no target_proc, then
- * t->buffer->transaction has already been cleared.
- */
- kmem_cache_free(binder_transaction_pool, t);
+ kfree(t);
binder_stats_deleted(BINDER_STAT_TRANSACTION);
}
@@ -1798,7 +2192,6 @@ static void binder_send_failed_reply(struct binder_transaction *t,
binder_free_transaction(t);
return;
}
- __release(&target_thread->proc->inner_lock);
next = t->from_parent;
binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
@@ -1841,21 +2234,15 @@ static void binder_cleanup_transaction(struct binder_transaction *t,
/**
* binder_get_object() - gets object and checks for valid metadata
* @proc: binder_proc owning the buffer
- * @u: sender's user pointer to base of buffer
* @buffer: binder_buffer that we're parsing.
* @offset: offset in the @buffer at which to validate an object.
* @object: struct binder_object to read into
*
- * Copy the binder object at the given offset into @object. If @u is
- * provided then the copy is from the sender's buffer. If not, then
- * it is copied from the target's @buffer.
- *
- * Return: If there's a valid metadata object at @offset, the
+ * Return: If there's a valid metadata object at @offset in @buffer, the
* size of that object. Otherwise, it returns zero. The object
* is read into the struct binder_object pointed to by @object.
*/
static size_t binder_get_object(struct binder_proc *proc,
- const void __user *u,
struct binder_buffer *buffer,
unsigned long offset,
struct binder_object *object)
@@ -1865,16 +2252,11 @@ static size_t binder_get_object(struct binder_proc *proc,
size_t object_size = 0;
read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset);
- if (offset > buffer->data_size || read_size < sizeof(*hdr))
+ if (offset > buffer->data_size || read_size < sizeof(*hdr) ||
+ !IS_ALIGNED(offset, sizeof(u32)))
return 0;
- if (u) {
- if (copy_from_user(object, u + offset, read_size))
- return 0;
- } else {
- if (binder_alloc_copy_from_buffer(&proc->alloc, object, buffer,
- offset, read_size))
- return 0;
- }
+ binder_alloc_copy_from_buffer(&proc->alloc, object, buffer,
+ offset, read_size);
/* Ok, now see if we read a complete object. */
hdr = &object->hdr;
@@ -1943,11 +2325,9 @@ static struct binder_buffer_object *binder_validate_ptr(
return NULL;
buffer_offset = start_offset + sizeof(binder_size_t) * index;
- if (binder_alloc_copy_from_buffer(&proc->alloc, &object_offset,
- b, buffer_offset,
- sizeof(object_offset)))
- return NULL;
- object_size = binder_get_object(proc, NULL, b, object_offset, object);
+ binder_alloc_copy_from_buffer(&proc->alloc, &object_offset,
+ b, buffer_offset, sizeof(object_offset));
+ object_size = binder_get_object(proc, b, object_offset, object);
if (!object_size || object->hdr.type != BINDER_TYPE_PTR)
return NULL;
if (object_offsetp)
@@ -2012,8 +2392,7 @@ static bool binder_validate_fixup(struct binder_proc *proc,
unsigned long buffer_offset;
struct binder_object last_object;
struct binder_buffer_object *last_bbo;
- size_t object_size = binder_get_object(proc, NULL, b,
- last_obj_offset,
+ size_t object_size = binder_get_object(proc, b, last_obj_offset,
&last_object);
if (object_size != sizeof(*last_bbo))
return false;
@@ -2027,78 +2406,15 @@ static bool binder_validate_fixup(struct binder_proc *proc,
return false;
last_min_offset = last_bbo->parent_offset + sizeof(uintptr_t);
buffer_offset = objects_start_offset +
- sizeof(binder_size_t) * last_bbo->parent;
- if (binder_alloc_copy_from_buffer(&proc->alloc,
- &last_obj_offset,
- b, buffer_offset,
- sizeof(last_obj_offset)))
- return false;
+ sizeof(binder_size_t) * last_bbo->parent,
+ binder_alloc_copy_from_buffer(&proc->alloc, &last_obj_offset,
+ b, buffer_offset,
+ sizeof(last_obj_offset));
}
return (fixup_offset >= last_min_offset);
}
-/**
- * struct binder_task_work_cb - for deferred close
- *
- * @twork: callback_head for task work
- * @fd: fd to close
- *
- * Structure to pass task work to be handled after
- * returning from binder_ioctl() via task_work_add().
- */
-struct binder_task_work_cb {
- struct callback_head twork;
- struct file *file;
-};
-
-/**
- * binder_do_fd_close() - close list of file descriptors
- * @twork: callback head for task work
- *
- * It is not safe to call ksys_close() during the binder_ioctl()
- * function if there is a chance that binder's own file descriptor
- * might be closed. This is to meet the requirements for using
- * fdget() (see comments for __fget_light()). Therefore use
- * task_work_add() to schedule the close operation once we have
- * returned from binder_ioctl(). This function is a callback
- * for that mechanism and does the actual ksys_close() on the
- * given file descriptor.
- */
-static void binder_do_fd_close(struct callback_head *twork)
-{
- struct binder_task_work_cb *twcb = container_of(twork,
- struct binder_task_work_cb, twork);
-
- fput(twcb->file);
- kfree(twcb);
-}
-
-/**
- * binder_deferred_fd_close() - schedule a close for the given file-descriptor
- * @fd: file-descriptor to close
- *
- * See comments in binder_do_fd_close(). This function is used to schedule
- * a file-descriptor to be closed after returning from binder_ioctl().
- */
-static void binder_deferred_fd_close(int fd)
-{
- struct binder_task_work_cb *twcb;
-
- twcb = kzalloc(sizeof(*twcb), GFP_KERNEL);
- if (!twcb)
- return;
- init_task_work(&twcb->twork, binder_do_fd_close);
- close_fd_get_file(fd, &twcb->file);
- if (twcb->file) {
- filp_close(twcb->file, current->files);
- task_work_add(current, &twcb->twork, true);
- } else {
- kfree(twcb);
- }
-}
-
static void binder_transaction_buffer_release(struct binder_proc *proc,
- struct binder_thread *thread,
struct binder_buffer *buffer,
binder_size_t failed_at,
bool is_failure)
@@ -2116,20 +2432,20 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
binder_dec_node(buffer->target_node, 1, 0);
off_start_offset = ALIGN(buffer->data_size, sizeof(void *));
- off_end_offset = is_failure && failed_at ? failed_at :
+ off_end_offset = is_failure ? failed_at :
off_start_offset + buffer->offsets_size;
for (buffer_offset = off_start_offset; buffer_offset < off_end_offset;
buffer_offset += sizeof(binder_size_t)) {
struct binder_object_header *hdr;
- size_t object_size = 0;
+ size_t object_size;
struct binder_object object;
binder_size_t object_offset;
- if (!binder_alloc_copy_from_buffer(&proc->alloc, &object_offset,
- buffer, buffer_offset,
- sizeof(object_offset)))
- object_size = binder_get_object(proc, NULL, buffer,
- object_offset, &object);
+ binder_alloc_copy_from_buffer(&proc->alloc, &object_offset,
+ buffer, buffer_offset,
+ sizeof(object_offset));
+ object_size = binder_get_object(proc, buffer,
+ object_offset, &object);
if (object_size == 0) {
pr_err("transaction release %d bad object at offset %lld, size %zd\n",
debug_id, (u64)object_offset, buffer->data_size);
@@ -2177,15 +2493,12 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
} break;
case BINDER_TYPE_FD: {
- /*
- * No need to close the file here since user-space
- * closes it for for successfully delivered
- * transactions. For transactions that weren't
- * delivered, the new fd was never allocated so
- * there is no need to close and the fput on the
- * file is done when the transaction is torn
- * down.
- */
+ struct binder_fd_object *fp = to_binder_fd_object(hdr);
+
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " fd %d\n", fp->fd);
+ if (failed_at)
+ task_close_fd(proc, fp->fd);
} break;
case BINDER_TYPE_PTR:
/*
@@ -2202,14 +2515,6 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
binder_size_t fd_buf_size;
binder_size_t num_valid;
- if (is_failure) {
- /*
- * The fd fixups have not been applied so no
- * fds need to be closed.
- */
- continue;
- }
-
num_valid = (buffer_offset - off_start_offset) /
sizeof(binder_size_t);
fda = to_binder_fd_array_object(hdr);
@@ -2219,7 +2524,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
NULL,
num_valid);
if (!parent) {
- pr_err("transaction release %d bad parent offset\n",
+ pr_err("transaction release %d bad parent offset\n",
debug_id);
continue;
}
@@ -2249,24 +2554,15 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
for (fd_index = 0; fd_index < fda->num_fds;
fd_index++) {
u32 fd;
- int err;
binder_size_t offset = fda_offset +
fd_index * sizeof(fd);
- err = binder_alloc_copy_from_buffer(
- &proc->alloc, &fd, buffer,
- offset, sizeof(fd));
- WARN_ON(err);
- if (!err) {
- binder_deferred_fd_close(fd);
- /*
- * Need to make sure the thread goes
- * back to userspace to complete the
- * deferred close
- */
- if (thread)
- thread->looper_need_return = true;
- }
+ binder_alloc_copy_from_buffer(&proc->alloc,
+ &fd,
+ buffer,
+ offset,
+ sizeof(fd));
+ task_close_fd(proc, fd);
}
} break;
default:
@@ -2362,15 +2658,11 @@ static int binder_translate_handle(struct flat_binder_object *fp,
fp->cookie = node->cookie;
if (node->proc)
binder_inner_proc_lock(node->proc);
- else
- __acquire(&node->proc->inner_lock);
binder_inc_node_nilocked(node,
fp->hdr.type == BINDER_TYPE_BINDER,
0, NULL);
if (node->proc)
binder_inner_proc_unlock(node->proc);
- else
- __release(&node->proc->inner_lock);
trace_binder_transaction_ref_to_node(t, node, &src_rdata);
binder_debug(BINDER_DEBUG_TRANSACTION,
" ref %d desc %d -> node %d u%016llx\n",
@@ -2403,16 +2695,16 @@ static int binder_translate_handle(struct flat_binder_object *fp,
return ret;
}
-static int binder_translate_fd(u32 fd, binder_size_t fd_offset,
+static int binder_translate_fd(int fd,
struct binder_transaction *t,
struct binder_thread *thread,
struct binder_transaction *in_reply_to)
{
struct binder_proc *proc = thread->proc;
struct binder_proc *target_proc = t->to_proc;
- struct binder_txn_fd_fixup *fixup;
+ int target_fd;
struct file *file;
- int ret = 0;
+ int ret;
bool target_allows_fd;
if (in_reply_to)
@@ -2441,24 +2733,19 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset,
goto err_security;
}
- /*
- * Add fixup record for this transaction. The allocation
- * of the fd in the target needs to be done from a
- * target thread.
- */
- fixup = kzalloc(sizeof(*fixup), GFP_KERNEL);
- if (!fixup) {
+ target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC);
+ if (target_fd < 0) {
ret = -ENOMEM;
- goto err_alloc;
+ goto err_get_unused_fd;
}
- fixup->file = file;
- fixup->offset = fd_offset;
- trace_binder_transaction_fd_send(t, fd, fixup->offset);
- list_add_tail(&fixup->fixup_entry, &t->fd_fixups);
+ task_fd_install(target_proc, target_fd, file);
+ trace_binder_transaction_fd(t, fd, target_fd);
+ binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n",
+ fd, target_fd);
- return ret;
+ return target_fd;
-err_alloc:
+err_get_unused_fd:
err_security:
fput(file);
err_fget:
@@ -2466,266 +2753,17 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset,
return ret;
}
-/**
- * struct binder_ptr_fixup - data to be fixed-up in target buffer
- * @offset offset in target buffer to fixup
- * @skip_size bytes to skip in copy (fixup will be written later)
- * @fixup_data data to write at fixup offset
- * @node list node
- *
- * This is used for the pointer fixup list (pf) which is created and consumed
- * during binder_transaction() and is only accessed locally. No
- * locking is necessary.
- *
- * The list is ordered by @offset.
- */
-struct binder_ptr_fixup {
- binder_size_t offset;
- size_t skip_size;
- binder_uintptr_t fixup_data;
- struct list_head node;
-};
-
-/**
- * struct binder_sg_copy - scatter-gather data to be copied
- * @offset offset in target buffer
- * @sender_uaddr user address in source buffer
- * @length bytes to copy
- * @node list node
- *
- * This is used for the sg copy list (sgc) which is created and consumed
- * during binder_transaction() and is only accessed locally. No
- * locking is necessary.
- *
- * The list is ordered by @offset.
- */
-struct binder_sg_copy {
- binder_size_t offset;
- const void __user *sender_uaddr;
- size_t length;
- struct list_head node;
-};
-
-/**
- * binder_do_deferred_txn_copies() - copy and fixup scatter-gather data
- * @alloc: binder_alloc associated with @buffer
- * @buffer: binder buffer in target process
- * @sgc_head: list_head of scatter-gather copy list
- * @pf_head: list_head of pointer fixup list
- *
- * Processes all elements of @sgc_head, applying fixups from @pf_head
- * and copying the scatter-gather data from the source process' user
- * buffer to the target's buffer. It is expected that the list creation
- * and processing all occurs during binder_transaction() so these lists
- * are only accessed in local context.
- *
- * Return: 0=success, else -errno
- */
-static int binder_do_deferred_txn_copies(struct binder_alloc *alloc,
- struct binder_buffer *buffer,
- struct list_head *sgc_head,
- struct list_head *pf_head)
-{
- int ret = 0;
- struct binder_sg_copy *sgc, *tmpsgc;
- struct binder_ptr_fixup *tmppf;
- struct binder_ptr_fixup *pf =
- list_first_entry_or_null(pf_head, struct binder_ptr_fixup,
- node);
-
- list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) {
- size_t bytes_copied = 0;
-
- while (bytes_copied < sgc->length) {
- size_t copy_size;
- size_t bytes_left = sgc->length - bytes_copied;
- size_t offset = sgc->offset + bytes_copied;
-
- /*
- * We copy up to the fixup (pointed to by pf)
- */
- copy_size = pf ? min(bytes_left, (size_t)pf->offset - offset)
- : bytes_left;
- if (!ret && copy_size)
- ret = binder_alloc_copy_user_to_buffer(
- alloc, buffer,
- offset,
- sgc->sender_uaddr + bytes_copied,
- copy_size);
- bytes_copied += copy_size;
- if (copy_size != bytes_left) {
- BUG_ON(!pf);
- /* we stopped at a fixup offset */
- if (pf->skip_size) {
- /*
- * we are just skipping. This is for
- * BINDER_TYPE_FDA where the translated
- * fds will be fixed up when we get
- * to target context.
- */
- bytes_copied += pf->skip_size;
- } else {
- /* apply the fixup indicated by pf */
- if (!ret)
- ret = binder_alloc_copy_to_buffer(
- alloc, buffer,
- pf->offset,
- &pf->fixup_data,
- sizeof(pf->fixup_data));
- bytes_copied += sizeof(pf->fixup_data);
- }
- list_del(&pf->node);
- kfree(pf);
- pf = list_first_entry_or_null(pf_head,
- struct binder_ptr_fixup, node);
- }
- }
- list_del(&sgc->node);
- kfree(sgc);
- }
- list_for_each_entry_safe(pf, tmppf, pf_head, node) {
- BUG_ON(pf->skip_size == 0);
- list_del(&pf->node);
- kfree(pf);
- }
- BUG_ON(!list_empty(sgc_head));
-
- return ret > 0 ? -EINVAL : ret;
-}
-
-/**
- * binder_cleanup_deferred_txn_lists() - free specified lists
- * @sgc_head: list_head of scatter-gather copy list
- * @pf_head: list_head of pointer fixup list
- *
- * Called to clean up @sgc_head and @pf_head if there is an
- * error.
- */
-static void binder_cleanup_deferred_txn_lists(struct list_head *sgc_head,
- struct list_head *pf_head)
-{
- struct binder_sg_copy *sgc, *tmpsgc;
- struct binder_ptr_fixup *pf, *tmppf;
-
- list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) {
- list_del(&sgc->node);
- kfree(sgc);
- }
- list_for_each_entry_safe(pf, tmppf, pf_head, node) {
- list_del(&pf->node);
- kfree(pf);
- }
-}
-
-/**
- * binder_defer_copy() - queue a scatter-gather buffer for copy
- * @sgc_head: list_head of scatter-gather copy list
- * @offset: binder buffer offset in target process
- * @sender_uaddr: user address in source process
- * @length: bytes to copy
- *
- * Specify a scatter-gather block to be copied. The actual copy must
- * be deferred until all the needed fixups are identified and queued.
- * Then the copy and fixups are done together so un-translated values
- * from the source are never visible in the target buffer.
- *
- * We are guaranteed that repeated calls to this function will have
- * monotonically increasing @offset values so the list will naturally
- * be ordered.
- *
- * Return: 0=success, else -errno
- */
-static int binder_defer_copy(struct list_head *sgc_head, binder_size_t offset,
- const void __user *sender_uaddr, size_t length)
-{
- struct binder_sg_copy *bc = kzalloc(sizeof(*bc), GFP_KERNEL);
-
- if (!bc)
- return -ENOMEM;
-
- bc->offset = offset;
- bc->sender_uaddr = sender_uaddr;
- bc->length = length;
- INIT_LIST_HEAD(&bc->node);
-
- /*
- * We are guaranteed that the deferred copies are in-order
- * so just add to the tail.
- */
- list_add_tail(&bc->node, sgc_head);
-
- return 0;
-}
-
-/**
- * binder_add_fixup() - queue a fixup to be applied to sg copy
- * @pf_head: list_head of binder ptr fixup list
- * @offset: binder buffer offset in target process
- * @fixup: bytes to be copied for fixup
- * @skip_size: bytes to skip when copying (fixup will be applied later)
- *
- * Add the specified fixup to a list ordered by @offset. When copying
- * the scatter-gather buffers, the fixup will be copied instead of
- * data from the source buffer. For BINDER_TYPE_FDA fixups, the fixup
- * will be applied later (in target process context), so we just skip
- * the bytes specified by @skip_size. If @skip_size is 0, we copy the
- * value in @fixup.
- *
- * This function is called *mostly* in @offset order, but there are
- * exceptions. Since out-of-order inserts are relatively uncommon,
- * we insert the new element by searching backward from the tail of
- * the list.
- *
- * Return: 0=success, else -errno
- */
-static int binder_add_fixup(struct list_head *pf_head, binder_size_t offset,
- binder_uintptr_t fixup, size_t skip_size)
-{
- struct binder_ptr_fixup *pf = kzalloc(sizeof(*pf), GFP_KERNEL);
- struct binder_ptr_fixup *tmppf;
-
- if (!pf)
- return -ENOMEM;
-
- pf->offset = offset;
- pf->fixup_data = fixup;
- pf->skip_size = skip_size;
- INIT_LIST_HEAD(&pf->node);
-
- /* Fixups are *mostly* added in-order, but there are some
- * exceptions. Look backwards through list for insertion point.
- */
- list_for_each_entry_reverse(tmppf, pf_head, node) {
- if (tmppf->offset < pf->offset) {
- list_add(&pf->node, &tmppf->node);
- return 0;
- }
- }
- /*
- * if we get here, then the new offset is the lowest so
- * insert at the head
- */
- list_add(&pf->node, pf_head);
- return 0;
-}
-
-static int binder_translate_fd_array(struct list_head *pf_head,
- struct binder_fd_array_object *fda,
- const void __user *sender_ubuffer,
+static int binder_translate_fd_array(struct binder_fd_array_object *fda,
struct binder_buffer_object *parent,
- struct binder_buffer_object *sender_uparent,
struct binder_transaction *t,
struct binder_thread *thread,
struct binder_transaction *in_reply_to)
{
- binder_size_t fdi, fd_buf_size;
+ binder_size_t fdi, fd_buf_size, num_installed_fds;
binder_size_t fda_offset;
- const void __user *sender_ufda_base;
+ int target_fd;
struct binder_proc *proc = thread->proc;
- int ret;
-
- if (fda->num_fds == 0)
- return 0;
+ struct binder_proc *target_proc = t->to_proc;
fd_buf_size = sizeof(u32) * fda->num_fds;
if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
@@ -2749,36 +2787,46 @@ static int binder_translate_fd_array(struct list_head *pf_head,
*/
fda_offset = (parent->buffer - (uintptr_t)t->buffer->user_data) +
fda->parent_offset;
- sender_ufda_base = (void __user *)(uintptr_t)sender_uparent->buffer +
- fda->parent_offset;
-
- if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32)) ||
- !IS_ALIGNED((unsigned long)sender_ufda_base, sizeof(u32))) {
+ if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32))) {
binder_user_error("%d:%d parent offset not aligned correctly.\n",
proc->pid, thread->pid);
return -EINVAL;
}
- ret = binder_add_fixup(pf_head, fda_offset, 0, fda->num_fds * sizeof(u32));
- if (ret)
- return ret;
-
for (fdi = 0; fdi < fda->num_fds; fdi++) {
u32 fd;
+
binder_size_t offset = fda_offset + fdi * sizeof(fd);
- binder_size_t sender_uoffset = fdi * sizeof(fd);
- ret = copy_from_user(&fd, sender_ufda_base + sender_uoffset, sizeof(fd));
- if (!ret)
- ret = binder_translate_fd(fd, offset, t, thread,
- in_reply_to);
- if (ret)
- return ret > 0 ? -EINVAL : ret;
+ binder_alloc_copy_from_buffer(&target_proc->alloc,
+ &fd, t->buffer,
+ offset, sizeof(fd));
+ target_fd = binder_translate_fd(fd, t, thread, in_reply_to);
+ if (target_fd < 0)
+ goto err_translate_fd_failed;
+ binder_alloc_copy_to_buffer(&target_proc->alloc,
+ t->buffer, offset,
+ &target_fd, sizeof(fd));
}
return 0;
+
+err_translate_fd_failed:
+ /*
+ * Failed to allocate fd or security error, free fds
+ * installed so far.
+ */
+ num_installed_fds = fdi;
+ for (fdi = 0; fdi < num_installed_fds; fdi++) {
+ u32 fd;
+ binder_size_t offset = fda_offset + fdi * sizeof(fd);
+ binder_alloc_copy_from_buffer(&target_proc->alloc,
+ &fd, t->buffer,
+ offset, sizeof(fd));
+ task_close_fd(target_proc, fd);
+ }
+ return target_fd;
}
-static int binder_fixup_parent(struct list_head *pf_head,
- struct binder_transaction *t,
+static int binder_fixup_parent(struct binder_transaction *t,
struct binder_thread *thread,
struct binder_buffer_object *bp,
binder_size_t off_start_offset,
@@ -2824,88 +2872,11 @@ static int binder_fixup_parent(struct list_head *pf_head,
}
buffer_offset = bp->parent_offset +
(uintptr_t)parent->buffer - (uintptr_t)b->user_data;
- return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0);
-}
-
-/**
- * binder_can_update_transaction() - Can a txn be superseded by an updated one?
- * @t1: the pending async txn in the frozen process
- * @t2: the new async txn to supersede the outdated pending one
- *
- * Return: true if t2 can supersede t1
- * false if t2 can not supersede t1
- */
-static bool binder_can_update_transaction(struct binder_transaction *t1,
- struct binder_transaction *t2)
-{
-#ifdef CONFIG_REKERNEL
- if ((t1->flags & t2->flags & TF_ONE_WAY) != TF_ONE_WAY || !t1->to_proc || !t2->to_proc)
-#else
- if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) !=
- (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc)
-#endif /* CONFIG_REKERNEL */
- return false;
- if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code &&
- t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid &&
- t1->buffer->target_node->ptr == t2->buffer->target_node->ptr &&
- t1->buffer->target_node->cookie == t2->buffer->target_node->cookie)
- return true;
- return false;
-}
-
-/**
- * binder_find_outdated_transaction_ilocked() - Find the outdated transaction
- * @t: new async transaction
- * @target_list: list to find outdated transaction
- *
- * Return: the outdated transaction if found
- * NULL if no outdated transacton can be found
- *
- * Requires the proc->inner_lock to be held.
- */
-static struct binder_transaction *
-binder_find_outdated_transaction_ilocked(struct binder_transaction *t,
- struct list_head *target_list)
-{
- struct binder_work *w;
+ binder_alloc_copy_to_buffer(&target_proc->alloc, b, buffer_offset,
+ &bp->buffer, sizeof(bp->buffer));
- list_for_each_entry(w, target_list, entry) {
- struct binder_transaction *t_queued;
-
- if (w->type != BINDER_WORK_TRANSACTION)
- continue;
- t_queued = container_of(w, struct binder_transaction, work);
- if (binder_can_update_transaction(t_queued, t))
- return t_queued;
- }
- return NULL;
-}
-
-#ifdef CONFIG_REKERNEL
-void rekernel_binder_transaction(bool reply, struct binder_transaction *t,
- struct binder_node *target_node, struct binder_transaction_data *tr) {
- struct binder_proc *to_proc;
- struct binder_alloc *target_alloc;
- if (!t->to_proc)
- return;
- to_proc = t->to_proc;
-
- if (reply) {
- binder_reply_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, false, tr);
- } else if (t->from) {
- if (t->from->proc) {
- binder_trans_handler(t->from->proc->pid, t->from->proc->tsk, to_proc->pid, to_proc->tsk, false, tr);
- }
- } else { // oneway=1
- binder_trans_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, true, tr);
-
- target_alloc = &to_proc->alloc;
- if (target_alloc->free_async_space < (target_alloc->buffer_size / 10 + 0x300)) {
- binder_overflow_handler(task_tgid_nr(current), current, to_proc->pid, to_proc->tsk, true, tr);
- }
- }
+ return 0;
}
-#endif /* CONFIG_REKERNEL */
/**
* binder_proc_transaction() - sends a transaction to a process and wakes it up
@@ -2921,95 +2892,60 @@ void rekernel_binder_transaction(bool reply, struct binder_transaction *t,
* If the @thread parameter is not NULL, the transaction is always queued
* to the waitlist of that specific thread.
*
- * Return: 0 if the transaction was successfully queued
- * BR_DEAD_REPLY if the target process or thread is dead
- * BR_FROZEN_REPLY if the target process or thread is frozen
+ * Return: true if the transaction was successfully queued
+ * false if the target process or thread is dead
*/
-static int binder_proc_transaction(struct binder_transaction *t,
+static bool binder_proc_transaction(struct binder_transaction *t,
struct binder_proc *proc,
struct binder_thread *thread)
{
struct binder_node *node = t->buffer->target_node;
+ struct binder_priority node_prio;
bool oneway = !!(t->flags & TF_ONE_WAY);
bool pending_async = false;
- bool skip = false;
- struct binder_transaction *t_outdated = NULL;
BUG_ON(!node);
binder_node_lock(node);
+ node_prio.prio = node->min_priority;
+ node_prio.sched_policy = node->sched_policy;
if (oneway) {
BUG_ON(thread);
- if (node->has_async_transaction)
+ if (node->has_async_transaction) {
pending_async = true;
- else
+ } else {
node->has_async_transaction = true;
+ }
}
binder_inner_proc_lock(proc);
- if (proc->is_frozen) {
- proc->sync_recv |= !oneway;
- proc->async_recv |= oneway;
- }
- if ((proc->is_frozen && !oneway) || proc->is_dead ||
- (thread && thread->is_dead)) {
+ if (proc->is_dead || (thread && thread->is_dead)) {
binder_inner_proc_unlock(proc);
binder_node_unlock(node);
- return proc->is_frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY;
+ return false;
}
- if (!thread && !pending_async && !skip)
+ if (!thread && !pending_async)
thread = binder_select_thread_ilocked(proc);
if (thread) {
- binder_transaction_priority(thread, t, node);
+ binder_transaction_priority(thread->task, t, node_prio,
+ node->inherit_rt);
binder_enqueue_thread_work_ilocked(thread, &t->work);
} else if (!pending_async) {
binder_enqueue_work_ilocked(&t->work, &proc->todo);
} else {
-#ifdef CONFIG_REKERNEL
- if (frozen_task_group(proc->tsk)) {
-#else
- if ((t->flags & TF_UPDATE_TXN) && proc->is_frozen) {
-#endif /* CONFIG_REKERNEL */
- t_outdated = binder_find_outdated_transaction_ilocked(t,
- &node->async_todo);
- if (t_outdated) {
- binder_debug(BINDER_DEBUG_TRANSACTION,
- "txn %d supersedes %d\n",
- t->debug_id, t_outdated->debug_id);
- list_del_init(&t_outdated->work.entry);
- proc->outstanding_txns--;
- }
- }
binder_enqueue_work_ilocked(&t->work, &node->async_todo);
}
if (!pending_async)
binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);
- proc->outstanding_txns++;
binder_inner_proc_unlock(proc);
binder_node_unlock(node);
- /*
- * To reduce potential contention, free the outdated transaction and
- * buffer after releasing the locks.
- */
- if (t_outdated) {
- struct binder_buffer *buffer = t_outdated->buffer;
-
- t_outdated->buffer = NULL;
- buffer->transaction = NULL;
- trace_binder_transaction_update_buffer_release(buffer);
- binder_transaction_buffer_release(proc, NULL, buffer, 0, 0);
- binder_alloc_free_buf(&proc->alloc, buffer);
- kfree(t_outdated);
- binder_stats_deleted(BINDER_STAT_TRANSACTION);
- }
-
- return 0;
+ return true;
}
/**
@@ -3045,7 +2981,7 @@ static struct binder_node *binder_get_node_refs_for_txn(
target_node = node;
binder_inc_node_nilocked(node, 1, 0, NULL);
binder_inc_node_tmpref_ilocked(node);
- node->proc->tmp_ref++;
+ atomic_inc(&node->proc->tmp_ref);
*procp = node->proc;
} else
*error = BR_DEAD_REPLY;
@@ -3061,13 +2997,11 @@ static void binder_transaction(struct binder_proc *proc,
{
int ret;
struct binder_transaction *t;
- struct binder_work *w;
struct binder_work *tcomplete;
binder_size_t buffer_offset = 0;
binder_size_t off_start_offset, off_end_offset;
binder_size_t off_min;
binder_size_t sg_buf_offset, sg_buf_end_offset;
- binder_size_t user_offset = 0;
struct binder_proc *target_proc = NULL;
struct binder_thread *target_thread = NULL;
struct binder_node *target_node = NULL;
@@ -3082,13 +3016,6 @@ static void binder_transaction(struct binder_proc *proc,
int t_debug_id = atomic_inc_return(&binder_last_id);
char *secctx = NULL;
u32 secctx_sz = 0;
- bool is_nested = false;
- struct list_head sgc_head;
- struct list_head pf_head;
- const void __user *user_buffer = (const void __user *)
- (uintptr_t)tr->data.ptr.buffer;
- INIT_LIST_HEAD(&sgc_head);
- INIT_LIST_HEAD(&pf_head);
e = binder_transaction_log_add(&binder_transaction_log);
e->debug_id = t_debug_id;
@@ -3098,7 +3025,7 @@ static void binder_transaction(struct binder_proc *proc,
e->target_handle = tr->target.handle;
e->data_size = tr->data_size;
e->offsets_size = tr->offsets_size;
- strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME);
+ e->context_name = proc->context->name;
if (reply) {
binder_inner_proc_lock(proc);
@@ -3132,8 +3059,6 @@ static void binder_transaction(struct binder_proc *proc,
binder_inner_proc_unlock(proc);
target_thread = binder_get_txn_from_and_acq_inner(in_reply_to);
if (target_thread == NULL) {
- /* annotation for sparse */
- __release(&target_thread->proc->inner_lock);
return_error = BR_DEAD_REPLY;
return_error_line = __LINE__;
goto err_dead_binder;
@@ -3153,7 +3078,7 @@ static void binder_transaction(struct binder_proc *proc,
goto err_dead_binder;
}
target_proc = target_thread->proc;
- target_proc->tmp_ref++;
+ atomic_inc(&target_proc->tmp_ref);
binder_inner_proc_unlock(target_thread->proc);
} else {
if (tr->target.handle) {
@@ -3174,8 +3099,8 @@ static void binder_transaction(struct binder_proc *proc,
ref->node, &target_proc,
&return_error);
} else {
- binder_user_error("%d:%d got transaction to invalid handle, %u\n",
- proc->pid, thread->pid, tr->target.handle);
+ binder_user_error("%d:%d got transaction to invalid handle\n",
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
}
binder_proc_unlock(proc);
@@ -3189,7 +3114,7 @@ static void binder_transaction(struct binder_proc *proc,
else
return_error = BR_DEAD_REPLY;
mutex_unlock(&context->context_mgr_node_lock);
- if (target_node && target_proc->pid == proc->pid) {
+ if (target_node && target_proc->pid == proc->pid) {
binder_user_error("%d:%d got transaction to context manager from process owning it\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
@@ -3221,29 +3146,6 @@ static void binder_transaction(struct binder_proc *proc,
goto err_invalid_target_handle;
}
binder_inner_proc_lock(proc);
-
- w = list_first_entry_or_null(&thread->todo,
- struct binder_work, entry);
- if (!(tr->flags & TF_ONE_WAY) && w &&
- w->type == BINDER_WORK_TRANSACTION) {
- /*
- * Do not allow new outgoing transaction from a
- * thread that has a transaction at the head of
- * its todo list. Only need to check the head
- * because binder_select_thread_ilocked picks a
- * thread from proc->waiting_threads to enqueue
- * the transaction, and nothing is queued to the
- * todo list while the thread is on waiting_threads.
- */
- binder_user_error("%d:%d new transaction not allowed when there is a transaction on thread todo\n",
- proc->pid, thread->pid);
- binder_inner_proc_unlock(proc);
- return_error = BR_FAILED_REPLY;
- return_error_param = -EPROTO;
- return_error_line = __LINE__;
- goto err_bad_todo_list;
- }
-
if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) {
struct binder_transaction *tmp;
@@ -3271,7 +3173,6 @@ static void binder_transaction(struct binder_proc *proc,
atomic_inc(&from->tmp_ref);
target_thread = from;
spin_unlock(&tmp->lock);
- is_nested = true;
break;
}
spin_unlock(&tmp->lock);
@@ -3285,18 +3186,17 @@ static void binder_transaction(struct binder_proc *proc,
e->to_proc = target_proc->pid;
/* TODO: reuse incoming transaction for reply */
- t = kmem_cache_zalloc(binder_transaction_pool, GFP_KERNEL);
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
if (t == NULL) {
return_error = BR_FAILED_REPLY;
return_error_param = -ENOMEM;
return_error_line = __LINE__;
goto err_alloc_t_failed;
}
- INIT_LIST_HEAD(&t->fd_fixups);
binder_stats_created(BINDER_STAT_TRANSACTION);
spin_lock_init(&t->lock);
- tcomplete = kmem_cache_zalloc(binder_work_pool, GFP_KERNEL);
+ tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL);
if (tcomplete == NULL) {
return_error = BR_FAILED_REPLY;
return_error_param = -ENOMEM;
@@ -3335,7 +3235,6 @@ static void binder_transaction(struct binder_proc *proc,
t->to_thread = target_thread;
t->code = tr->code;
t->flags = tr->flags;
- t->is_nested = is_nested;
if (!(t->flags & TF_ONE_WAY) &&
binder_supported_policy(current->policy)) {
/* Inherit supported policies for synchronous transactions */
@@ -3363,15 +3262,12 @@ static void binder_transaction(struct binder_proc *proc,
if (extra_buffers_size < added_size) {
/* integer overflow of extra_buffers_size */
return_error = BR_FAILED_REPLY;
- return_error_param = -EINVAL;
+ return_error_param = -EINVAL;
return_error_line = __LINE__;
goto err_bad_extra_size;
}
}
-#ifdef CONFIG_REKERNEL
- rekernel_binder_transaction(reply, t, target_node, tr);
-#endif /* CONFIG_REKERNEL */
trace_binder_transaction(reply, t, target_node);
t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size,
@@ -3389,20 +3285,15 @@ static void binder_transaction(struct binder_proc *proc,
goto err_binder_alloc_buf_failed;
}
if (secctx) {
- int err;
size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) +
ALIGN(tr->offsets_size, sizeof(void *)) +
ALIGN(extra_buffers_size, sizeof(void *)) -
ALIGN(secctx_sz, sizeof(u64));
t->security_ctx = (uintptr_t)t->buffer->user_data + buf_offset;
- err = binder_alloc_copy_to_buffer(&target_proc->alloc,
- t->buffer, buf_offset,
- secctx, secctx_sz);
- if (err) {
- t->security_ctx = 0;
- WARN_ON(1);
- }
+ binder_alloc_copy_to_buffer(&target_proc->alloc,
+ t->buffer, buf_offset,
+ secctx, secctx_sz);
security_release_secctx(secctx, secctx_sz);
secctx = NULL;
}
@@ -3412,6 +3303,19 @@ static void binder_transaction(struct binder_proc *proc,
t->buffer->clear_on_free = !!(t->flags & TF_CLEAR_BUF);
trace_binder_transaction_alloc_buf(t->buffer);
+ if (binder_alloc_copy_user_to_buffer(
+ &target_proc->alloc,
+ t->buffer, 0,
+ (const void __user *)
+ (uintptr_t)tr->data.ptr.buffer,
+ tr->data_size)) {
+ binder_user_error("%d:%d got transaction with invalid data ptr\n",
+ proc->pid, thread->pid);
+ return_error = BR_FAILED_REPLY;
+ return_error_param = -EFAULT;
+ return_error_line = __LINE__;
+ goto err_copy_data_failed;
+ }
if (binder_alloc_copy_user_to_buffer(
&target_proc->alloc,
t->buffer,
@@ -3456,39 +3360,14 @@ static void binder_transaction(struct binder_proc *proc,
size_t object_size;
struct binder_object object;
binder_size_t object_offset;
- binder_size_t copy_size;
- if (binder_alloc_copy_from_buffer(&target_proc->alloc,
- &object_offset,
- t->buffer,
- buffer_offset,
- sizeof(object_offset))) {
- return_error = BR_FAILED_REPLY;
- return_error_param = -EINVAL;
- return_error_line = __LINE__;
- goto err_bad_offset;
- }
-
- /*
- * Copy the source user buffer up to the next object
- * that will be processed.
- */
- copy_size = object_offset - user_offset;
- if (copy_size && (user_offset > object_offset ||
- binder_alloc_copy_user_to_buffer(
- &target_proc->alloc,
- t->buffer, user_offset,
- user_buffer + user_offset,
- copy_size))) {
- binder_user_error("%d:%d got transaction with invalid data ptr\n",
- proc->pid, thread->pid);
- return_error = BR_FAILED_REPLY;
- return_error_param = -EFAULT;
- return_error_line = __LINE__;
- goto err_copy_data_failed;
- }
- object_size = binder_get_object(target_proc, user_buffer,
- t->buffer, object_offset, &object);
+ binder_alloc_copy_from_buffer(&target_proc->alloc,
+ &object_offset,
+ t->buffer,
+ buffer_offset,
+ sizeof(object_offset));
+ object_size = binder_get_object(target_proc, t->buffer,
+ object_offset, &object);
if (object_size == 0 || object_offset < off_min) {
binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n",
proc->pid, thread->pid,
@@ -3500,11 +3379,6 @@ static void binder_transaction(struct binder_proc *proc,
return_error_line = __LINE__;
goto err_bad_offset;
}
- /*
- * Set offset to the next buffer fragment to be
- * copied
- */
- user_offset = object_offset + object_size;
hdr = &object.hdr;
off_min = object_offset + object_size;
@@ -3515,17 +3389,15 @@ static void binder_transaction(struct binder_proc *proc,
fp = to_flat_binder_object(hdr);
ret = binder_translate_binder(fp, t, thread);
-
- if (ret < 0 ||
- binder_alloc_copy_to_buffer(&target_proc->alloc,
- t->buffer,
- object_offset,
- fp, sizeof(*fp))) {
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
return_error_param = ret;
return_error_line = __LINE__;
goto err_translate_failed;
}
+ binder_alloc_copy_to_buffer(&target_proc->alloc,
+ t->buffer, object_offset,
+ fp, sizeof(*fp));
} break;
case BINDER_TYPE_HANDLE:
case BINDER_TYPE_WEAK_HANDLE: {
@@ -3533,42 +3405,37 @@ static void binder_transaction(struct binder_proc *proc,
fp = to_flat_binder_object(hdr);
ret = binder_translate_handle(fp, t, thread);
- if (ret < 0 ||
- binder_alloc_copy_to_buffer(&target_proc->alloc,
- t->buffer,
- object_offset,
- fp, sizeof(*fp))) {
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
return_error_param = ret;
return_error_line = __LINE__;
goto err_translate_failed;
}
+ binder_alloc_copy_to_buffer(&target_proc->alloc,
+ t->buffer, object_offset,
+ fp, sizeof(*fp));
} break;
case BINDER_TYPE_FD: {
struct binder_fd_object *fp = to_binder_fd_object(hdr);
- binder_size_t fd_offset = object_offset +
- (uintptr_t)&fp->fd - (uintptr_t)fp;
- int ret = binder_translate_fd(fp->fd, fd_offset, t,
- thread, in_reply_to);
+ int target_fd = binder_translate_fd(fp->fd, t, thread,
+ in_reply_to);
- fp->pad_binder = 0;
- if (ret < 0 ||
- binder_alloc_copy_to_buffer(&target_proc->alloc,
- t->buffer,
- object_offset,
- fp, sizeof(*fp))) {
+ if (target_fd < 0) {
return_error = BR_FAILED_REPLY;
- return_error_param = ret;
+ return_error_param = target_fd;
return_error_line = __LINE__;
goto err_translate_failed;
}
+ fp->pad_binder = 0;
+ fp->fd = target_fd;
+ binder_alloc_copy_to_buffer(&target_proc->alloc,
+ t->buffer, object_offset,
+ fp, sizeof(*fp));
} break;
case BINDER_TYPE_FDA: {
struct binder_object ptr_object;
binder_size_t parent_offset;
- struct binder_object user_object;
- size_t user_parent_size;
struct binder_fd_array_object *fda =
to_binder_fd_array_object(hdr);
size_t num_valid = (buffer_offset - off_start_offset) /
@@ -3600,35 +3467,11 @@ static void binder_transaction(struct binder_proc *proc,
return_error_line = __LINE__;
goto err_bad_parent;
}
- /*
- * We need to read the user version of the parent
- * object to get the original user offset
- */
- user_parent_size =
- binder_get_object(proc, user_buffer, t->buffer,
- parent_offset, &user_object);
- if (user_parent_size != sizeof(user_object.bbo)) {
- binder_user_error("%d:%d invalid ptr object size: %zd vs %zd\n",
- proc->pid, thread->pid,
- user_parent_size,
- sizeof(user_object.bbo));
- return_error = BR_FAILED_REPLY;
- return_error_param = -EINVAL;
- return_error_line = __LINE__;
- goto err_bad_parent;
- }
- ret = binder_translate_fd_array(&pf_head, fda,
- user_buffer, parent,
- &user_object.bbo, t,
- thread, in_reply_to);
- if (!ret)
- ret = binder_alloc_copy_to_buffer(&target_proc->alloc,
- t->buffer,
- object_offset,
- fda, sizeof(*fda));
- if (ret) {
+ ret = binder_translate_fd_array(fda, parent, t, thread,
+ in_reply_to);
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
- return_error_param = ret > 0 ? -EINVAL : ret;
+ return_error_param = ret;
return_error_line = __LINE__;
goto err_translate_failed;
}
@@ -3650,14 +3493,19 @@ static void binder_transaction(struct binder_proc *proc,
return_error_line = __LINE__;
goto err_bad_offset;
}
- ret = binder_defer_copy(&sgc_head, sg_buf_offset,
- (const void __user *)(uintptr_t)bp->buffer,
- bp->length);
- if (ret) {
+ if (binder_alloc_copy_user_to_buffer(
+ &target_proc->alloc,
+ t->buffer,
+ sg_buf_offset,
+ (const void __user *)
+ (uintptr_t)bp->buffer,
+ bp->length)) {
+ binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
+ proc->pid, thread->pid);
+ return_error_param = -EFAULT;
return_error = BR_FAILED_REPLY;
- return_error_param = ret;
return_error_line = __LINE__;
- goto err_translate_failed;
+ goto err_copy_data_failed;
}
/* Fixup buffer pointer to target proc address space */
bp->buffer = (uintptr_t)
@@ -3666,22 +3514,20 @@ static void binder_transaction(struct binder_proc *proc,
num_valid = (buffer_offset - off_start_offset) /
sizeof(binder_size_t);
- ret = binder_fixup_parent(&pf_head, t,
- thread, bp,
+ ret = binder_fixup_parent(t, thread, bp,
off_start_offset,
num_valid,
last_fixup_obj_off,
last_fixup_min_off);
- if (ret < 0 ||
- binder_alloc_copy_to_buffer(&target_proc->alloc,
- t->buffer,
- object_offset,
- bp, sizeof(*bp))) {
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
return_error_param = ret;
return_error_line = __LINE__;
goto err_translate_failed;
}
+ binder_alloc_copy_to_buffer(&target_proc->alloc,
+ t->buffer, object_offset,
+ bp, sizeof(*bp));
last_fixup_obj_off = object_offset;
last_fixup_min_off = 0;
} break;
@@ -3694,57 +3540,22 @@ static void binder_transaction(struct binder_proc *proc,
goto err_bad_object_type;
}
}
- /* Done processing objects, copy the rest of the buffer */
- if (binder_alloc_copy_user_to_buffer(
- &target_proc->alloc,
- t->buffer, user_offset,
- user_buffer + user_offset,
- tr->data_size - user_offset)) {
- binder_user_error("%d:%d got transaction with invalid data ptr\n",
- proc->pid, thread->pid);
- return_error = BR_FAILED_REPLY;
- return_error_param = -EFAULT;
- return_error_line = __LINE__;
- goto err_copy_data_failed;
- }
-
- ret = binder_do_deferred_txn_copies(&target_proc->alloc, t->buffer,
- &sgc_head, &pf_head);
- if (ret) {
- binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
- proc->pid, thread->pid);
- return_error = BR_FAILED_REPLY;
- return_error_param = ret;
- return_error_line = __LINE__;
- goto err_copy_data_failed;
- }
- if (t->buffer->oneway_spam_suspect)
- tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT;
- else
- tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
+ tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
t->work.type = BINDER_WORK_TRANSACTION;
if (reply) {
binder_enqueue_thread_work(thread, tcomplete);
binder_inner_proc_lock(target_proc);
if (target_thread->is_dead) {
- return_error = BR_DEAD_REPLY;
binder_inner_proc_unlock(target_proc);
goto err_dead_proc_or_thread;
}
BUG_ON(t->buffer->async_transaction != 0);
binder_pop_transaction_ilocked(target_thread, in_reply_to);
binder_enqueue_thread_work_ilocked(target_thread, &t->work);
- target_proc->outstanding_txns++;
binder_inner_proc_unlock(target_proc);
- if (in_reply_to->is_nested) {
- spin_lock(&thread->prio_lock);
- thread->prio_state = BINDER_PRIO_PENDING;
- thread->prio_next = in_reply_to->saved_priority;
- spin_unlock(&thread->prio_lock);
- }
wake_up_interruptible_sync(&target_thread->wait);
- binder_restore_priority(thread, &in_reply_to->saved_priority);
+ binder_restore_priority(current, in_reply_to->saved_priority);
binder_free_transaction(in_reply_to);
} else if (!(t->flags & TF_ONE_WAY)) {
BUG_ON(t->buffer->async_transaction != 0);
@@ -3761,9 +3572,7 @@ static void binder_transaction(struct binder_proc *proc,
t->from_parent = thread->transaction_stack;
thread->transaction_stack = t;
binder_inner_proc_unlock(proc);
- return_error = binder_proc_transaction(t,
- target_proc, target_thread);
- if (return_error) {
+ if (!binder_proc_transaction(t, target_proc, target_thread)) {
binder_inner_proc_lock(proc);
binder_pop_transaction_ilocked(thread, t);
binder_inner_proc_unlock(proc);
@@ -3773,8 +3582,7 @@ static void binder_transaction(struct binder_proc *proc,
BUG_ON(target_node == NULL);
BUG_ON(t->buffer->async_transaction != 1);
binder_enqueue_thread_work(thread, tcomplete);
- return_error = binder_proc_transaction(t, target_proc, NULL);
- if (return_error)
+ if (!binder_proc_transaction(t, target_proc, NULL))
goto err_dead_proc_or_thread;
}
if (target_thread)
@@ -3791,6 +3599,7 @@ static void binder_transaction(struct binder_proc *proc,
return;
err_dead_proc_or_thread:
+ return_error = BR_DEAD_REPLY;
return_error_line = __LINE__;
binder_dequeue_work(proc, tcomplete);
err_translate_failed:
@@ -3798,10 +3607,8 @@ static void binder_transaction(struct binder_proc *proc,
err_bad_offset:
err_bad_parent:
err_copy_data_failed:
- binder_cleanup_deferred_txn_lists(&sgc_head, &pf_head);
- binder_free_txn_fixups(t);
trace_binder_transaction_failed_buffer_release(t->buffer);
- binder_transaction_buffer_release(target_proc, NULL, t->buffer,
+ binder_transaction_buffer_release(target_proc, t->buffer,
buffer_offset, true);
if (target_node)
binder_dec_node_tmpref(target_node);
@@ -3813,15 +3620,12 @@ static void binder_transaction(struct binder_proc *proc,
if (secctx)
security_release_secctx(secctx, secctx_sz);
err_get_secctx_failed:
- kmem_cache_free(binder_work_pool, tcomplete);
+ kfree(tcomplete);
binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
err_alloc_tcomplete_failed:
- if (trace_binder_txn_latency_free_enabled())
- binder_txn_latency_free(t);
- kmem_cache_free(binder_transaction_pool, t);
+ kfree(t);
binder_stats_deleted(BINDER_STAT_TRANSACTION);
err_alloc_t_failed:
-err_bad_todo_list:
err_bad_call_stack:
err_empty_call_stack:
err_dead_binder:
@@ -3855,65 +3659,19 @@ static void binder_transaction(struct binder_proc *proc,
*/
smp_wmb();
WRITE_ONCE(e->debug_id_done, t_debug_id);
- WRITE_ONCE(fe->debug_id_done, t_debug_id);
- }
-
- BUG_ON(thread->return_error.cmd != BR_OK);
- if (in_reply_to) {
- binder_restore_priority(thread, &in_reply_to->saved_priority);
- thread->return_error.cmd = BR_TRANSACTION_COMPLETE;
- binder_enqueue_thread_work(thread, &thread->return_error.work);
- binder_send_failed_reply(in_reply_to, return_error);
- } else {
- thread->return_error.cmd = return_error;
- binder_enqueue_thread_work(thread, &thread->return_error.work);
- }
-}
-
-/**
- * binder_free_buf() - free the specified buffer
- * @proc: binder proc that owns buffer
- * @buffer: buffer to be freed
- * @is_failure: failed to send transaction
- *
- * If buffer for an async transaction, enqueue the next async
- * transaction from the node.
- *
- * Cleanup buffer and free it.
- */
-static void
-binder_free_buf(struct binder_proc *proc,
- struct binder_thread *thread,
- struct binder_buffer *buffer, bool is_failure)
-{
- binder_inner_proc_lock(proc);
- if (buffer->transaction) {
- buffer->transaction->buffer = NULL;
- buffer->transaction = NULL;
- }
- binder_inner_proc_unlock(proc);
- if (buffer->async_transaction && buffer->target_node) {
- struct binder_node *buf_node;
- struct binder_work *w;
-
- buf_node = buffer->target_node;
- binder_node_inner_lock(buf_node);
- BUG_ON(!buf_node->has_async_transaction);
- BUG_ON(buf_node->proc != proc);
- w = binder_dequeue_work_head_ilocked(
- &buf_node->async_todo);
- if (!w) {
- buf_node->has_async_transaction = false;
- } else {
- binder_enqueue_work_ilocked(
- w, &proc->todo);
- binder_wakeup_proc_ilocked(proc);
- }
- binder_node_inner_unlock(buf_node);
+ WRITE_ONCE(fe->debug_id_done, t_debug_id);
+ }
+
+ BUG_ON(thread->return_error.cmd != BR_OK);
+ if (in_reply_to) {
+ binder_restore_priority(current, in_reply_to->saved_priority);
+ thread->return_error.cmd = BR_TRANSACTION_COMPLETE;
+ binder_enqueue_thread_work(thread, &thread->return_error.work);
+ binder_send_failed_reply(in_reply_to, return_error);
+ } else {
+ thread->return_error.cmd = return_error;
+ binder_enqueue_thread_work(thread, &thread->return_error.work);
}
- trace_binder_transaction_buffer_release(buffer);
- binder_transaction_buffer_release(proc, thread, buffer, 0, is_failure);
- binder_alloc_free_buf(&proc->alloc, buffer);
}
static int binder_thread_write(struct binder_proc *proc,
@@ -3957,7 +3715,6 @@ static int binder_thread_write(struct binder_proc *proc,
ret = -1;
if (increment && !target) {
struct binder_node *ctx_mgr_node;
-
mutex_lock(&context->context_mgr_node_lock);
ctx_mgr_node = context->binder_context_mgr_node;
if (ctx_mgr_node) {
@@ -4114,7 +3871,35 @@ static int binder_thread_write(struct binder_proc *proc,
proc->pid, thread->pid, (u64)data_ptr,
buffer->debug_id,
buffer->transaction ? "active" : "finished");
- binder_free_buf(proc, thread, buffer, false);
+
+ binder_inner_proc_lock(proc);
+ if (buffer->transaction) {
+ buffer->transaction->buffer = NULL;
+ buffer->transaction = NULL;
+ }
+ binder_inner_proc_unlock(proc);
+ if (buffer->async_transaction && buffer->target_node) {
+ struct binder_node *buf_node;
+ struct binder_work *w;
+
+ buf_node = buffer->target_node;
+ binder_node_inner_lock(buf_node);
+ BUG_ON(!buf_node->has_async_transaction);
+ BUG_ON(buf_node->proc != proc);
+ w = binder_dequeue_work_head_ilocked(
+ &buf_node->async_todo);
+ if (!w) {
+ buf_node->has_async_transaction = false;
+ } else {
+ binder_enqueue_work_ilocked(
+ w, &proc->todo);
+ binder_wakeup_proc_ilocked(proc);
+ }
+ binder_node_inner_unlock(buf_node);
+ }
+ trace_binder_transaction_buffer_release(buffer);
+ binder_transaction_buffer_release(proc, buffer, 0, false);
+ binder_alloc_free_buf(&proc->alloc, buffer);
break;
}
@@ -4197,7 +3982,7 @@ static int binder_thread_write(struct binder_proc *proc,
* Allocate memory for death notification
* before taking lock
*/
- death = kmem_cache_zalloc(binder_ref_death_pool, GFP_KERNEL);
+ death = kzalloc(sizeof(*death), GFP_KERNEL);
if (death == NULL) {
WARN_ON(thread->return_error.cmd !=
BR_OK);
@@ -4222,8 +4007,7 @@ static int binder_thread_write(struct binder_proc *proc,
"BC_CLEAR_DEATH_NOTIFICATION",
target);
binder_proc_unlock(proc);
- if (death)
- kmem_cache_free(binder_ref_death_pool, death);
+ kfree(death);
break;
}
@@ -4244,7 +4028,7 @@ static int binder_thread_write(struct binder_proc *proc,
proc->pid, thread->pid);
binder_node_unlock(ref->node);
binder_proc_unlock(proc);
- kmem_cache_free(binder_ref_death_pool, death);
+ kfree(death);
break;
}
binder_stats_created(BINDER_STAT_DEATH);
@@ -4427,7 +4211,7 @@ static int binder_wait_for_work(struct binder_thread *thread,
binder_inner_proc_lock(proc);
list_del_init(&thread->waiting_thread_node);
if (signal_pending(current)) {
- ret = -EINTR;
+ ret = -ERESTARTSYS;
break;
}
}
@@ -4438,71 +4222,6 @@ static int binder_wait_for_work(struct binder_thread *thread,
return ret;
}
-/**
- * binder_apply_fd_fixups() - finish fd translation
- * @proc: binder_proc associated @t->buffer
- * @t: binder transaction with list of fd fixups
- *
- * Now that we are in the context of the transaction target
- * process, we can allocate and install fds. Process the
- * list of fds to translate and fixup the buffer with the
- * new fds.
- *
- * If we fail to allocate an fd, then free the resources by
- * fput'ing files that have not been processed and ksys_close'ing
- * any fds that have already been allocated.
- */
-static int binder_apply_fd_fixups(struct binder_proc *proc,
- struct binder_transaction *t)
-{
- struct binder_txn_fd_fixup *fixup, *tmp;
- int ret = 0;
-
- list_for_each_entry(fixup, &t->fd_fixups, fixup_entry) {
- int fd = get_unused_fd_flags(O_CLOEXEC);
-
- if (fd < 0) {
- binder_debug(BINDER_DEBUG_TRANSACTION,
- "failed fd fixup txn %d fd %d\n",
- t->debug_id, fd);
- ret = -ENOMEM;
- break;
- }
- binder_debug(BINDER_DEBUG_TRANSACTION,
- "fd fixup txn %d fd %d\n",
- t->debug_id, fd);
- trace_binder_transaction_fd_recv(t, fd, fixup->offset);
- fd_install(fd, fixup->file);
- fixup->file = NULL;
- if (binder_alloc_copy_to_buffer(&proc->alloc, t->buffer,
- fixup->offset, &fd,
- sizeof(u32))) {
- ret = -EINVAL;
- break;
- }
- }
- list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) {
- if (fixup->file) {
- fput(fixup->file);
- } else if (ret) {
- u32 fd;
- int err;
-
- err = binder_alloc_copy_from_buffer(&proc->alloc, &fd,
- t->buffer,
- fixup->offset,
- sizeof(fd));
- WARN_ON(err);
- if (!err)
- binder_deferred_fd_close(fd);
- }
- list_del(&fixup->fixup_entry);
- kfree(fixup);
- }
-
- return ret;
-}
-
static int binder_thread_read(struct binder_proc *proc,
struct binder_thread *thread,
binder_uintptr_t binder_buffer, size_t size,
@@ -4539,7 +4258,7 @@ static int binder_thread_read(struct binder_proc *proc,
wait_event_interruptible(binder_user_error_wait,
binder_stop_on_user_error < 2);
}
- binder_restore_priority(thread, &proc->default_priority);
+ binder_restore_priority(current, proc->default_priority);
}
if (non_block) {
@@ -4565,8 +4284,6 @@ static int binder_thread_read(struct binder_proc *proc,
size_t trsize = sizeof(*trd);
binder_inner_proc_lock(proc);
- if (list)
- goto skip;
if (!binder_worklist_empty_ilocked(&thread->todo))
list = &thread->todo;
else if (!binder_worklist_empty_ilocked(&proc->todo) &&
@@ -4580,7 +4297,7 @@ static int binder_thread_read(struct binder_proc *proc,
goto retry;
break;
}
-skip:
+
if (end - ptr < sizeof(tr) + 4) {
binder_inner_proc_unlock(proc);
break;
@@ -4606,18 +4323,11 @@ static int binder_thread_read(struct binder_proc *proc,
e->cmd = BR_OK;
ptr += sizeof(uint32_t);
- binder_stat_br(proc, thread, cmd);
+ binder_stat_br(proc, thread, e->cmd);
} break;
- case BINDER_WORK_TRANSACTION_COMPLETE:
- case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: {
- if (proc->oneway_spam_detection_enabled &&
- w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT)
- cmd = BR_ONEWAY_SPAM_SUSPECT;
- else
- cmd = BR_TRANSACTION_COMPLETE;
+ case BINDER_WORK_TRANSACTION_COMPLETE: {
binder_inner_proc_unlock(proc);
- kmem_cache_free(binder_work_pool, w);
- binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
+ cmd = BR_TRANSACTION_COMPLETE;
if (put_user(cmd, (uint32_t __user *)ptr))
return -EFAULT;
ptr += sizeof(uint32_t);
@@ -4626,6 +4336,8 @@ static int binder_thread_read(struct binder_proc *proc,
binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE,
"%d:%d BR_TRANSACTION_COMPLETE\n",
proc->pid, thread->pid);
+ kfree(w);
+ binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
} break;
case BINDER_WORK_NODE: {
struct binder_node *node = container_of(w, struct binder_node, work);
@@ -4737,7 +4449,7 @@ static int binder_thread_read(struct binder_proc *proc,
(u64)cookie);
if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) {
binder_inner_proc_unlock(proc);
- kmem_cache_free(binder_ref_death_pool, death);
+ kfree(death);
binder_stats_deleted(BINDER_STAT_DEATH);
} else {
binder_enqueue_work_ilocked(
@@ -4755,11 +4467,6 @@ static int binder_thread_read(struct binder_proc *proc,
if (cmd == BR_DEAD_BINDER)
goto done; /* DEAD_BINDER notifications can cause transactions */
} break;
- default:
- binder_inner_proc_unlock(proc);
- pr_err("%d:%d: bad work type %d\n",
- proc->pid, thread->pid, w->type);
- break;
}
if (!t)
@@ -4768,10 +4475,14 @@ static int binder_thread_read(struct binder_proc *proc,
BUG_ON(t->buffer == NULL);
if (t->buffer->target_node) {
struct binder_node *target_node = t->buffer->target_node;
+ struct binder_priority node_prio;
trd->target.ptr = target_node->ptr;
trd->cookie = target_node->cookie;
- binder_transaction_priority(thread, t, target_node);
+ node_prio.sched_policy = target_node->sched_policy;
+ node_prio.prio = target_node->min_priority;
+ binder_transaction_priority(current, t, node_prio,
+ target_node->inherit_rt);
cmd = BR_TRANSACTION;
} else {
trd->target.ptr = 0;
@@ -4793,34 +4504,6 @@ static int binder_thread_read(struct binder_proc *proc,
trd->sender_pid = 0;
}
- ret = binder_apply_fd_fixups(proc, t);
- if (ret) {
- struct binder_buffer *buffer = t->buffer;
- bool oneway = !!(t->flags & TF_ONE_WAY);
- int tid = t->debug_id;
-
- if (t_from)
- binder_thread_dec_tmpref(t_from);
- buffer->transaction = NULL;
- binder_cleanup_transaction(t, "fd fixups failed",
- BR_FAILED_REPLY);
- binder_free_buf(proc, thread, buffer, true);
- binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n",
- proc->pid, thread->pid,
- oneway ? "async " :
- (cmd == BR_REPLY ? "reply " : ""),
- tid, BR_FAILED_REPLY, ret, __LINE__);
- if (cmd == BR_REPLY) {
- cmd = BR_FAILED_REPLY;
- if (put_user(cmd, (uint32_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(uint32_t);
- binder_stat_br(proc, thread, cmd);
- break;
- }
- continue;
- }
trd->data_size = t->buffer->data_size;
trd->offsets_size = t->buffer->offsets_size;
trd->data.ptr.buffer = (uintptr_t)t->buffer->user_data;
@@ -4940,7 +4623,7 @@ static void binder_release_work(struct binder_proc *proc,
case BINDER_WORK_TRANSACTION_COMPLETE: {
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
"undelivered TRANSACTION_COMPLETE\n");
- kmem_cache_free(binder_work_pool, w);
+ kfree(w);
binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
} break;
case BINDER_WORK_DEAD_BINDER_AND_CLEAR:
@@ -4951,7 +4634,7 @@ static void binder_release_work(struct binder_proc *proc,
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
"undelivered death notification, %016llx\n",
(u64)death->cookie);
- kmem_cache_free(binder_ref_death_pool, death);
+ kfree(death);
binder_stats_deleted(BINDER_STAT_DEATH);
} break;
case BINDER_WORK_NODE:
@@ -5001,8 +4684,6 @@ static struct binder_thread *binder_get_thread_ilocked(
thread->return_error.cmd = BR_OK;
thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR;
thread->reply_error.cmd = BR_OK;
- spin_lock_init(&thread->prio_lock);
- thread->prio_state = BINDER_PRIO_SET;
INIT_LIST_HEAD(&new_thread->waiting_thread_node);
return thread;
}
@@ -5016,37 +4697,27 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc)
thread = binder_get_thread_ilocked(proc, NULL);
binder_inner_proc_unlock(proc);
if (!thread) {
- new_thread = kmem_cache_zalloc(binder_thread_pool, GFP_KERNEL);
+ new_thread = kzalloc(sizeof(*thread), GFP_KERNEL);
if (new_thread == NULL)
return NULL;
binder_inner_proc_lock(proc);
thread = binder_get_thread_ilocked(proc, new_thread);
binder_inner_proc_unlock(proc);
if (thread != new_thread)
- kmem_cache_free(binder_thread_pool, new_thread);
+ kfree(new_thread);
}
return thread;
}
static void binder_free_proc(struct binder_proc *proc)
{
- struct binder_device *device;
-
BUG_ON(!list_empty(&proc->todo));
BUG_ON(!list_empty(&proc->delivered_death));
- if (proc->outstanding_txns)
- pr_warn("%s: Unexpected outstanding_txns %d\n",
- __func__, proc->outstanding_txns);
- device = container_of(proc->context, struct binder_device, context);
- if (refcount_dec_and_test(&device->ref)) {
- kfree(proc->context->name);
- kfree(device);
- }
binder_alloc_deferred_release(&proc->alloc);
put_task_struct(proc->tsk);
put_cred(proc->cred);
binder_stats_deleted(BINDER_STAT_PROC);
- kmem_cache_free(binder_proc_pool, proc);
+ kfree(proc);
}
static void binder_free_thread(struct binder_thread *thread)
@@ -5055,7 +4726,7 @@ static void binder_free_thread(struct binder_thread *thread)
binder_stats_deleted(BINDER_STAT_THREAD);
binder_proc_dec_tmpref(thread->proc);
put_task_struct(thread->task);
- kmem_cache_free(binder_thread_pool, thread);
+ kfree(thread);
}
static int binder_thread_release(struct binder_proc *proc,
@@ -5073,7 +4744,7 @@ static int binder_thread_release(struct binder_proc *proc,
* The corresponding dec is when we actually
* free the thread in binder_free_thread()
*/
- proc->tmp_ref++;
+ atomic_inc(&proc->tmp_ref);
/*
* take a ref on this thread to ensure it
* survives while we are releasing it
@@ -5085,8 +4756,6 @@ static int binder_thread_release(struct binder_proc *proc,
spin_lock(&t->lock);
if (t->to_thread == thread)
send_reply = t;
- } else {
- __acquire(&t->lock);
}
thread->is_dead = true;
@@ -5100,7 +4769,6 @@ static int binder_thread_release(struct binder_proc *proc,
(t->to_thread == thread) ? "in" : "out");
if (t->to_thread == thread) {
- thread->proc->outstanding_txns--;
t->to_proc = NULL;
t->to_thread = NULL;
if (t->buffer) {
@@ -5116,11 +4784,7 @@ static int binder_thread_release(struct binder_proc *proc,
spin_unlock(&last_t->lock);
if (t)
spin_lock(&t->lock);
- else
- __acquire(&t->lock);
}
- /* annotation for sparse, lock not acquired in last iteration above */
- __release(&t->lock);
/*
* If this thread used poll, make sure we remove the waitqueue from any
@@ -5148,7 +4812,7 @@ static int binder_thread_release(struct binder_proc *proc,
return active_transactions;
}
-static __poll_t binder_poll(struct file *filp,
+static unsigned int binder_poll(struct file *filp,
struct poll_table_struct *wait)
{
struct binder_proc *proc = filp->private_data;
@@ -5168,7 +4832,7 @@ static __poll_t binder_poll(struct file *filp,
poll_wait(filp, &thread->wait, wait);
if (binder_has_work(thread, wait_for_proc_work))
- return EPOLLIN;
+ return POLLIN;
return 0;
}
@@ -5324,8 +4988,7 @@ static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc,
}
static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
- struct binder_node_debug_info *info)
-{
+ struct binder_node_debug_info *info) {
struct rb_node *n;
binder_uintptr_t ptr = info->ptr;
@@ -5348,100 +5011,6 @@ static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
return 0;
}
-static bool binder_txns_pending_ilocked(struct binder_proc *proc)
-{
- struct rb_node *n;
- struct binder_thread *thread;
-
- if (proc->outstanding_txns > 0)
- return true;
-
- for (n = rb_first(&proc->threads); n; n = rb_next(n)) {
- thread = rb_entry(n, struct binder_thread, rb_node);
- if (thread->transaction_stack)
- return true;
- }
- return false;
-}
-
-static int binder_ioctl_freeze(struct binder_freeze_info *info,
- struct binder_proc *target_proc)
-{
- int ret = 0;
-
- if (!info->enable) {
- binder_inner_proc_lock(target_proc);
- target_proc->sync_recv = false;
- target_proc->async_recv = false;
- target_proc->is_frozen = false;
- binder_inner_proc_unlock(target_proc);
- return 0;
- }
-
- /*
- * Freezing the target. Prevent new transactions by
- * setting frozen state. If timeout specified, wait
- * for transactions to drain.
- */
- binder_inner_proc_lock(target_proc);
- target_proc->sync_recv = false;
- target_proc->async_recv = false;
- target_proc->is_frozen = true;
- binder_inner_proc_unlock(target_proc);
-
- if (info->timeout_ms > 0)
- ret = wait_event_interruptible_timeout(
- target_proc->freeze_wait,
- (!target_proc->outstanding_txns),
- msecs_to_jiffies(info->timeout_ms));
-
- /* Check pending transactions that wait for reply */
- if (ret >= 0) {
- binder_inner_proc_lock(target_proc);
- if (binder_txns_pending_ilocked(target_proc))
- ret = -EAGAIN;
- binder_inner_proc_unlock(target_proc);
- }
-
- if (ret < 0) {
- binder_inner_proc_lock(target_proc);
- target_proc->is_frozen = false;
- binder_inner_proc_unlock(target_proc);
- }
-
- return ret;
-}
-
-static int binder_ioctl_get_freezer_info(
- struct binder_frozen_status_info *info)
-{
- struct binder_proc *target_proc;
- bool found = false;
- __u32 txns_pending;
-
- info->sync_recv = 0;
- info->async_recv = 0;
-
- mutex_lock(&binder_procs_lock);
- hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
- if (target_proc->pid == info->pid) {
- found = true;
- binder_inner_proc_lock(target_proc);
- txns_pending = binder_txns_pending_ilocked(target_proc);
- info->sync_recv |= target_proc->sync_recv |
- (txns_pending << 1);
- info->async_recv |= target_proc->async_recv;
- binder_inner_proc_unlock(target_proc);
- }
- }
- mutex_unlock(&binder_procs_lock);
-
- if (!found)
- return -EINVAL;
-
- return 0;
-}
-
static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int ret;
@@ -5560,96 +5129,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
break;
}
- case BINDER_FREEZE: {
- struct binder_freeze_info info;
- struct binder_proc **target_procs = NULL, *target_proc;
- int target_procs_count = 0, i = 0;
-
- ret = 0;
-
- if (copy_from_user(&info, ubuf, sizeof(info))) {
- ret = -EFAULT;
- goto err;
- }
-
- mutex_lock(&binder_procs_lock);
- hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
- if (target_proc->pid == info.pid)
- target_procs_count++;
- }
-
- if (target_procs_count == 0) {
- mutex_unlock(&binder_procs_lock);
- ret = -EINVAL;
- goto err;
- }
-
- target_procs = kcalloc(target_procs_count,
- sizeof(struct binder_proc *),
- GFP_KERNEL);
-
- if (!target_procs) {
- mutex_unlock(&binder_procs_lock);
- ret = -ENOMEM;
- goto err;
- }
-
- hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
- if (target_proc->pid != info.pid)
- continue;
-
- binder_inner_proc_lock(target_proc);
- target_proc->tmp_ref++;
- binder_inner_proc_unlock(target_proc);
-
- target_procs[i++] = target_proc;
- }
- mutex_unlock(&binder_procs_lock);
-
- for (i = 0; i < target_procs_count; i++) {
- if (ret >= 0)
- ret = binder_ioctl_freeze(&info,
- target_procs[i]);
-
- binder_proc_dec_tmpref(target_procs[i]);
- }
-
- kfree(target_procs);
-
- if (ret < 0)
- goto err;
- break;
- }
- case BINDER_GET_FROZEN_INFO: {
- struct binder_frozen_status_info info;
-
- if (copy_from_user(&info, ubuf, sizeof(info))) {
- ret = -EFAULT;
- goto err;
- }
-
- ret = binder_ioctl_get_freezer_info(&info);
- if (ret < 0)
- goto err;
-
- if (copy_to_user(ubuf, &info, sizeof(info))) {
- ret = -EFAULT;
- goto err;
- }
- break;
- }
- case BINDER_ENABLE_ONEWAY_SPAM_DETECTION: {
- uint32_t enable;
-
- if (copy_from_user(&enable, ubuf, sizeof(enable))) {
- ret = -EFAULT;
- goto err;
- }
- binder_inner_proc_lock(proc);
- proc->oneway_spam_detection_enabled = (bool)enable;
- binder_inner_proc_unlock(proc);
- break;
- }
default:
ret = -EINVAL;
goto err;
@@ -5659,7 +5138,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (thread)
thread->looper_need_return = false;
wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
- if (ret && ret != -EINTR)
+ if (ret && ret != -ERESTARTSYS)
pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret);
err_unlocked:
trace_binder_ioctl_done(ret);
@@ -5687,6 +5166,7 @@ static void binder_vma_close(struct vm_area_struct *vma)
(vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
(unsigned long)pgprot_val(vma->vm_page_prot));
binder_alloc_vma_close(&proc->alloc);
+ binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES);
}
static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -5702,11 +5182,16 @@ static const struct vm_operations_struct binder_vm_ops = {
static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
{
+ int ret;
struct binder_proc *proc = filp->private_data;
+ const char *failure_string;
if (proc->tsk != current->group_leader)
return -EINVAL;
+ if ((vma->vm_end - vma->vm_start) > SZ_4M)
+ vma->vm_end = vma->vm_start + SZ_4M;
+
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
"%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n",
__func__, proc->pid, vma->vm_start, vma->vm_end,
@@ -5714,9 +5199,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
(unsigned long)pgprot_val(vma->vm_page_prot));
if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) {
- pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
- proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM);
- return -EPERM;
+ ret = -EPERM;
+ failure_string = "bad vm_flags";
+ goto err_bad_arg;
}
vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP;
vma->vm_flags &= ~VM_MAYWRITE;
@@ -5724,30 +5209,39 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
vma->vm_ops = &binder_vm_ops;
vma->vm_private_data = proc;
- return binder_alloc_mmap_handler(&proc->alloc, vma);
+ ret = binder_alloc_mmap_handler(&proc->alloc, vma);
+ if (ret)
+ return ret;
+ mutex_lock(&proc->files_lock);
+ proc->files = get_files_struct(current);
+ mutex_unlock(&proc->files_lock);
+ return 0;
+
+err_bad_arg:
+ pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
+ proc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
+ return ret;
}
static int binder_open(struct inode *nodp, struct file *filp)
{
- struct binder_proc *proc, *itr;
+ struct binder_proc *proc;
struct binder_device *binder_dev;
- struct binderfs_info *info;
- struct dentry *binder_binderfs_dir_entry_proc = NULL;
- bool existing_pid = false;
binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__,
current->group_leader->pid, current->pid);
- proc = kmem_cache_zalloc(binder_proc_pool, GFP_KERNEL);
+ proc = kzalloc(sizeof(*proc), GFP_KERNEL);
if (proc == NULL)
return -ENOMEM;
spin_lock_init(&proc->inner_lock);
spin_lock_init(&proc->outer_lock);
+ atomic_set(&proc->tmp_ref, 0);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
+ mutex_init(&proc->files_lock);
proc->cred = get_cred(filp->f_cred);
INIT_LIST_HEAD(&proc->todo);
- init_waitqueue_head(&proc->freeze_wait);
if (binder_supported_policy(current->policy)) {
proc->default_priority.sched_policy = current->policy;
proc->default_priority.prio = current->normal_prio;
@@ -5756,16 +5250,8 @@ static int binder_open(struct inode *nodp, struct file *filp)
proc->default_priority.prio = NICE_TO_PRIO(0);
}
- /* binderfs stashes devices in i_private */
- if (is_binderfs_device(nodp)) {
- binder_dev = nodp->i_private;
- info = nodp->i_sb->s_fs_info;
- binder_binderfs_dir_entry_proc = info->proc_log_dir;
- } else {
- binder_dev = container_of(filp->private_data,
- struct binder_device, miscdev);
- }
- refcount_inc(&binder_dev->ref);
+ binder_dev = container_of(filp->private_data, struct binder_device,
+ miscdev);
proc->context = &binder_dev->context;
binder_alloc_init(&proc->alloc);
@@ -5776,52 +5262,24 @@ static int binder_open(struct inode *nodp, struct file *filp)
filp->private_data = proc;
mutex_lock(&binder_procs_lock);
- hlist_for_each_entry(itr, &binder_procs, proc_node) {
- if (itr->pid == proc->pid) {
- existing_pid = true;
- break;
- }
- }
hlist_add_head(&proc->proc_node, &binder_procs);
mutex_unlock(&binder_procs_lock);
- if (binder_debugfs_dir_entry_proc && !existing_pid) {
+
+ if (binder_debugfs_dir_entry_proc) {
char strbuf[11];
snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
/*
- * proc debug entries are shared between contexts.
- * Only create for the first PID to avoid debugfs log spamming
- * The printing code will anyway print all contexts for a given
- * PID so this is not a problem.
+ * proc debug entries are shared between contexts, so
+ * this will fail if the process tries to open the driver
+ * again with a different context. The printing code will
+ * anyway print all contexts that a given PID has, so this
+ * is not a problem.
*/
proc->debugfs_entry = debugfs_create_file(strbuf, 0444,
binder_debugfs_dir_entry_proc,
(void *)(unsigned long)proc->pid,
- &proc_fops);
- }
-
- if (binder_binderfs_dir_entry_proc && !existing_pid) {
- char strbuf[11];
- struct dentry *binderfs_entry;
-
- snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
- /*
- * Similar to debugfs, the process specific log file is shared
- * between contexts. Only create for the first PID.
- * This is ok since same as debugfs, the log file will contain
- * information on all contexts of a given PID.
- */
- binderfs_entry = binderfs_create_file(binder_binderfs_dir_entry_proc,
- strbuf, &proc_fops, (void *)(unsigned long)proc->pid);
- if (!IS_ERR(binderfs_entry)) {
- proc->binderfs_entry = binderfs_entry;
- } else {
- int error;
-
- error = PTR_ERR(binderfs_entry);
- pr_warn("Unable to create file %s in binderfs (error %d)\n",
- strbuf, error);
- }
+ &binder_proc_fops);
}
return 0;
@@ -5863,12 +5321,6 @@ static int binder_release(struct inode *nodp, struct file *filp)
struct binder_proc *proc = filp->private_data;
debugfs_remove(proc->debugfs_entry);
-
- if (proc->binderfs_entry) {
- binderfs_remove_file(proc->binderfs_entry);
- proc->binderfs_entry = NULL;
- }
-
binder_defer_work(proc, BINDER_DEFERRED_RELEASE);
return 0;
@@ -5945,6 +5397,8 @@ static void binder_deferred_release(struct binder_proc *proc)
struct rb_node *n;
int threads, nodes, incoming_refs, outgoing_refs, active_transactions;
+ BUG_ON(proc->files);
+
mutex_lock(&binder_procs_lock);
hlist_del(&proc->proc_node);
mutex_unlock(&binder_procs_lock);
@@ -5963,12 +5417,9 @@ static void binder_deferred_release(struct binder_proc *proc)
* Make sure proc stays alive after we
* remove all the threads
*/
- proc->tmp_ref++;
+ atomic_inc(&proc->tmp_ref);
proc->is_dead = true;
- proc->is_frozen = false;
- proc->sync_recv = false;
- proc->async_recv = false;
threads = 0;
active_transactions = 0;
while ((n = rb_first(&proc->threads))) {
@@ -6029,6 +5480,7 @@ static void binder_deferred_release(struct binder_proc *proc)
static void binder_deferred_func(struct work_struct *work)
{
struct binder_proc *proc;
+ struct files_struct *files;
int defer;
@@ -6046,11 +5498,23 @@ static void binder_deferred_func(struct work_struct *work)
}
mutex_unlock(&binder_deferred_lock);
+ files = NULL;
+ if (defer & BINDER_DEFERRED_PUT_FILES) {
+ mutex_lock(&proc->files_lock);
+ files = proc->files;
+ if (files)
+ proc->files = NULL;
+ mutex_unlock(&proc->files_lock);
+ }
+
if (defer & BINDER_DEFERRED_FLUSH)
binder_deferred_flush(proc);
if (defer & BINDER_DEFERRED_RELEASE)
binder_deferred_release(proc); /* frees proc */
+
+ if (files)
+ put_files_struct(files);
} while (proc);
}
static DECLARE_WORK(binder_deferred_work, binder_deferred_func);
@@ -6321,9 +5785,7 @@ static const char * const binder_return_strings[] = {
"BR_FINISHED",
"BR_DEAD_BINDER",
"BR_CLEAR_DEATH_NOTIFICATION_DONE",
- "BR_FAILED_REPLY",
- "BR_FROZEN_REPLY",
- "BR_ONEWAY_SPAM_SUSPECT",
+ "BR_FAILED_REPLY"
};
static const char * const binder_command_strings[] = {
@@ -6464,7 +5926,8 @@ static void print_binder_proc_stats(struct seq_file *m,
print_binder_stats(m, " ", &proc->stats);
}
-static int state_show(struct seq_file *m, void *unused)
+
+static int binder_state_show(struct seq_file *m, void *unused)
{
struct binder_proc *proc;
struct binder_node *node;
@@ -6503,7 +5966,7 @@ static int state_show(struct seq_file *m, void *unused)
return 0;
}
-static int stats_show(struct seq_file *m, void *unused)
+static int binder_stats_show(struct seq_file *m, void *unused)
{
struct binder_proc *proc;
@@ -6519,7 +5982,7 @@ static int stats_show(struct seq_file *m, void *unused)
return 0;
}
-static int transactions_show(struct seq_file *m, void *unused)
+static int binder_transactions_show(struct seq_file *m, void *unused)
{
struct binder_proc *proc;
@@ -6532,7 +5995,7 @@ static int transactions_show(struct seq_file *m, void *unused)
return 0;
}
-static int proc_show(struct seq_file *m, void *unused)
+static int binder_proc_show(struct seq_file *m, void *unused)
{
struct binder_proc *itr;
int pid = (unsigned long)m->private;
@@ -6575,7 +6038,7 @@ static void print_binder_transaction_log_entry(struct seq_file *m,
"\n" : " (incomplete)\n");
}
-static int transaction_log_show(struct seq_file *m, void *unused)
+static int binder_transaction_log_show(struct seq_file *m, void *unused)
{
struct binder_transaction_log *log = m->private;
unsigned int log_cur = atomic_read(&log->cur);
@@ -6596,7 +6059,7 @@ static int transaction_log_show(struct seq_file *m, void *unused)
return 0;
}
-const struct file_operations binder_fops = {
+static const struct file_operations binder_fops = {
.owner = THIS_MODULE,
.poll = binder_poll,
.unlocked_ioctl = binder_ioctl,
@@ -6607,44 +6070,10 @@ const struct file_operations binder_fops = {
.release = binder_release,
};
-DEFINE_SHOW_ATTRIBUTE(state);
-DEFINE_SHOW_ATTRIBUTE(stats);
-DEFINE_SHOW_ATTRIBUTE(transactions);
-DEFINE_SHOW_ATTRIBUTE(transaction_log);
-
-const struct binder_debugfs_entry binder_debugfs_entries[] = {
- {
- .name = "state",
- .mode = 0444,
- .fops = &state_fops,
- .data = NULL,
- },
- {
- .name = "stats",
- .mode = 0444,
- .fops = &stats_fops,
- .data = NULL,
- },
- {
- .name = "transactions",
- .mode = 0444,
- .fops = &transactions_fops,
- .data = NULL,
- },
- {
- .name = "transaction_log",
- .mode = 0444,
- .fops = &transaction_log_fops,
- .data = &binder_transaction_log,
- },
- {
- .name = "failed_transaction_log",
- .mode = 0444,
- .fops = &transaction_log_fops,
- .data = &binder_transaction_log_failed,
- },
- {} /* terminator */
-};
+BINDER_DEBUG_ENTRY(state);
+BINDER_DEBUG_ENTRY(stats);
+BINDER_DEBUG_ENTRY(transactions);
+BINDER_DEBUG_ENTRY(transaction_log);
static int __init init_binder_device(const char *name)
{
@@ -6659,7 +6088,6 @@ static int __init init_binder_device(const char *name)
binder_device->miscdev.minor = MISC_DYNAMIC_MINOR;
binder_device->miscdev.name = name;
- refcount_set(&binder_device->ref, 1);
binder_device->context.binder_context_mgr_uid = INVALID_UID;
binder_device->context.name = name;
mutex_init(&binder_device->context.context_mgr_node_lock);
@@ -6675,130 +6103,70 @@ static int __init init_binder_device(const char *name)
return ret;
}
-static int __init binder_create_pools(void)
-{
- int ret;
-
- ret = binder_buffer_pool_create();
- if (ret)
- return ret;
-
- binder_node_pool = KMEM_CACHE(binder_node, SLAB_HWCACHE_ALIGN);
- if (!binder_node_pool)
- goto err_node_pool;
-
- binder_proc_pool = KMEM_CACHE(binder_proc, SLAB_HWCACHE_ALIGN);
- if (!binder_proc_pool)
- goto err_proc_pool;
-
- binder_ref_death_pool = KMEM_CACHE(binder_ref_death, SLAB_HWCACHE_ALIGN);
- if (!binder_ref_death_pool)
- goto err_ref_death_pool;
-
- binder_ref_pool = KMEM_CACHE(binder_ref, SLAB_HWCACHE_ALIGN);
- if (!binder_ref_pool)
- goto err_ref_pool;
-
- binder_thread_pool = KMEM_CACHE(binder_thread, SLAB_HWCACHE_ALIGN);
- if (!binder_thread_pool)
- goto err_thread_pool;
-
- binder_transaction_pool = KMEM_CACHE(binder_transaction, SLAB_HWCACHE_ALIGN);
- if (!binder_transaction_pool)
- goto err_transaction_pool;
-
- binder_work_pool = KMEM_CACHE(binder_work, SLAB_HWCACHE_ALIGN);
- if (!binder_work_pool)
- goto err_work_pool;
-
- return 0;
-
-err_work_pool:
- kmem_cache_destroy(binder_transaction_pool);
-err_transaction_pool:
- kmem_cache_destroy(binder_thread_pool);
-err_thread_pool:
- kmem_cache_destroy(binder_ref_pool);
-err_ref_pool:
- kmem_cache_destroy(binder_ref_death_pool);
-err_ref_death_pool:
- kmem_cache_destroy(binder_proc_pool);
-err_proc_pool:
- kmem_cache_destroy(binder_node_pool);
-err_node_pool:
- binder_buffer_pool_destroy();
- return -ENOMEM;
-}
-
-static void __init binder_destroy_pools(void)
-{
- binder_buffer_pool_destroy();
- kmem_cache_destroy(binder_node_pool);
- kmem_cache_destroy(binder_proc_pool);
- kmem_cache_destroy(binder_ref_death_pool);
- kmem_cache_destroy(binder_ref_pool);
- kmem_cache_destroy(binder_thread_pool);
- kmem_cache_destroy(binder_transaction_pool);
- kmem_cache_destroy(binder_work_pool);
-}
-
static int __init binder_init(void)
{
int ret;
- char *device_name, *device_tmp;
+ char *device_name, *device_names, *device_tmp;
struct binder_device *device;
struct hlist_node *tmp;
- char *device_names = NULL;
-
- ret = binder_create_pools();
- if (ret)
- return ret;
ret = binder_alloc_shrinker_init();
if (ret)
- goto err_alloc_shrinker_failed;
+ return ret;
atomic_set(&binder_transaction_log.cur, ~0U);
atomic_set(&binder_transaction_log_failed.cur, ~0U);
binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL);
- if (binder_debugfs_dir_entry_root) {
- const struct binder_debugfs_entry *db_entry;
-
- binder_for_each_debugfs_entry(db_entry)
- debugfs_create_file(db_entry->name,
- db_entry->mode,
- binder_debugfs_dir_entry_root,
- db_entry->data,
- db_entry->fops);
-
+ if (binder_debugfs_dir_entry_root)
binder_debugfs_dir_entry_proc = debugfs_create_dir("proc",
binder_debugfs_dir_entry_root);
- }
- if (!IS_ENABLED(CONFIG_ANDROID_BINDERFS) &&
- strcmp(binder_devices_param, "") != 0) {
- /*
- * Copy the module_parameter string, because we don't want to
- * tokenize it in-place.
- */
- device_names = kstrdup(binder_devices_param, GFP_KERNEL);
- if (!device_names) {
- ret = -ENOMEM;
- goto err_alloc_device_names_failed;
- }
+ if (binder_debugfs_dir_entry_root) {
+ debugfs_create_file("state",
+ 0444,
+ binder_debugfs_dir_entry_root,
+ NULL,
+ &binder_state_fops);
+ debugfs_create_file("stats",
+ 0444,
+ binder_debugfs_dir_entry_root,
+ NULL,
+ &binder_stats_fops);
+ debugfs_create_file("transactions",
+ 0444,
+ binder_debugfs_dir_entry_root,
+ NULL,
+ &binder_transactions_fops);
+ debugfs_create_file("transaction_log",
+ 0444,
+ binder_debugfs_dir_entry_root,
+ &binder_transaction_log,
+ &binder_transaction_log_fops);
+ debugfs_create_file("failed_transaction_log",
+ 0444,
+ binder_debugfs_dir_entry_root,
+ &binder_transaction_log_failed,
+ &binder_transaction_log_fops);
+ }
- device_tmp = device_names;
- while ((device_name = strsep(&device_tmp, ","))) {
- ret = init_binder_device(device_name);
- if (ret)
- goto err_init_binder_device_failed;
- }
+ /*
+ * Copy the module_parameter string, because we don't want to
+ * tokenize it in-place.
+ */
+ device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL);
+ if (!device_names) {
+ ret = -ENOMEM;
+ goto err_alloc_device_names_failed;
}
+ strcpy(device_names, binder_devices_param);
- ret = init_binderfs();
- if (ret)
- goto err_init_binder_device_failed;
+ device_tmp = device_names;
+ while ((device_name = strsep(&device_tmp, ","))) {
+ ret = init_binder_device(device_name);
+ if (ret)
+ goto err_init_binder_device_failed;
+ }
return ret;
@@ -6814,9 +6182,6 @@ static int __init binder_init(void)
err_alloc_device_names_failed:
debugfs_remove_recursive(binder_debugfs_dir_entry_root);
-err_alloc_shrinker_failed:
- binder_destroy_pools();
-
return ret;
}
@@ -6824,7 +6189,5 @@ device_initcall(binder_init);
#define CREATE_TRACE_POINTS
#include "binder_trace.h"
-EXPORT_TRACEPOINT_SYMBOL_GPL(binder_transaction_received);
-EXPORT_TRACEPOINT_SYMBOL_GPL(binder_txn_latency_free);
MODULE_LICENSE("GPL v2");
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 9eb15d712567..5addcd56afb4 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1,13 +1,23 @@
-// SPDX-License-Identifier: GPL-2.0-only
/* binder_alloc.c
*
* Android IPC Subsystem
*
* Copyright (C) 2007-2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include
#include
#include
#include
@@ -18,11 +28,8 @@
#include
#include
#include
-#include
-#include
#include
#include
-#include
#include "binder_alloc.h"
#include "binder_trace.h"
@@ -36,7 +43,7 @@ enum {
BINDER_DEBUG_BUFFER_ALLOC = 1U << 2,
BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3,
};
-static uint32_t binder_alloc_debug_mask = BINDER_DEBUG_USER_ERROR;
+static uint32_t binder_alloc_debug_mask;
module_param_named(debug_mask, binder_alloc_debug_mask,
uint, 0644);
@@ -44,25 +51,9 @@ module_param_named(debug_mask, binder_alloc_debug_mask,
#define binder_alloc_debug(mask, x...) \
do { \
if (binder_alloc_debug_mask & mask) \
- pr_info_ratelimited(x); \
+ pr_info(x); \
} while (0)
-static struct kmem_cache *binder_buffer_pool;
-
-int binder_buffer_pool_create(void)
-{
- binder_buffer_pool = KMEM_CACHE(binder_buffer, SLAB_HWCACHE_ALIGN);
- if (!binder_buffer_pool)
- return -ENOMEM;
-
- return 0;
-}
-
-void binder_buffer_pool_destroy(void)
-{
- kmem_cache_destroy(binder_buffer_pool);
-}
-
static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer)
{
return list_entry(buffer->entry.next, struct binder_buffer, entry);
@@ -173,7 +164,7 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked(
}
/**
- * binder_alloc_prepare_to_free() - get buffer given user ptr
+ * binder_alloc_buffer_lookup() - get buffer given user ptr
* @alloc: binder_alloc for this proc
* @user_ptr: User pointer to buffer data
*
@@ -228,14 +219,18 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
mm = alloc->vma_vm_mm;
if (mm) {
- down_read;
+ down_read(&mm->mmap_sem);
+ if (!mmget_still_valid(mm)) {
+ if (allocate == 0)
+ goto free_range;
+ goto err_no_vma;
+ }
vma = alloc->vma;
}
if (!vma && need_mm) {
- binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
- "%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
- alloc->pid);
+ pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
+ alloc->pid);
goto err_no_vma;
}
@@ -284,15 +279,17 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
alloc->pages_high = index + 1;
trace_binder_alloc_page_end(alloc, index);
+ /* vm_insert_page does not seem to increment the refcount */
}
if (mm) {
- up_read;
+ up_read(&mm->mmap_sem);
mmput(mm);
}
return 0;
free_range:
- for (page_addr = end - PAGE_SIZE; 1; page_addr -= PAGE_SIZE) {
+ for (page_addr = end - PAGE_SIZE; page_addr >= start;
+ page_addr -= PAGE_SIZE) {
bool ret;
size_t index;
@@ -305,8 +302,6 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
WARN_ON(!ret);
trace_binder_free_lru_end(alloc, index);
- if (page_addr == start)
- break;
continue;
err_vm_insert_page_failed:
@@ -314,47 +309,17 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
page->page_ptr = NULL;
err_alloc_page_failed:
err_page_ptr_cleared:
- if (page_addr == start)
- break;
+ ;
}
err_no_vma:
if (mm) {
- up_read;
+ up_read(&mm->mmap_sem);
mmput(mm);
}
return vma ? -ENOMEM : -ESRCH;
}
-
-static inline void binder_alloc_set_vma(struct binder_alloc *alloc,
- struct vm_area_struct *vma)
-{
- if (vma)
- alloc->vma_vm_mm = vma->vm_mm;
- /*
- * If we see alloc->vma is not NULL, buffer data structures set up
- * completely. Look at smp_rmb side binder_alloc_get_vma.
- * We also want to guarantee new alloc->vma_vm_mm is always visible
- * if alloc->vma is set.
- */
- smp_wmb();
- alloc->vma = vma;
-}
-
-static inline struct vm_area_struct *binder_alloc_get_vma(
- struct binder_alloc *alloc)
-{
- struct vm_area_struct *vma = NULL;
-
- if (alloc->vma) {
- /* Look at description in binder_alloc_set_vma */
- smp_rmb();
- vma = alloc->vma;
- }
- return vma;
-}
-
-static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
+static void debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
{
/*
* Find the amount and size of buffers allocated by the current caller;
@@ -363,7 +328,7 @@ static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
* and at some point we'll catch them in the act. This is more efficient
* than keeping a map per pid.
*/
- struct rb_node *n;
+ struct rb_node *n = alloc->free_buffers.rb_node;
struct binder_buffer *buffer;
size_t total_alloc_size = 0;
size_t num_buffers = 0;
@@ -382,19 +347,13 @@ static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
/*
* Warn if this pid has more than 50 transactions, or more than 50% of
- * async space (which is 25% of total buffer size). Oneway spam is only
- * detected when the threshold is exceeded.
+ * async space (which is 25% of total buffer size).
*/
if (num_buffers > 50 || total_alloc_size > alloc->buffer_size / 4) {
binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
"%d: pid %d spamming oneway? %zd buffers allocated for a total size of %zd\n",
alloc->pid, pid, num_buffers, total_alloc_size);
- if (!alloc->oneway_spam_detected) {
- alloc->oneway_spam_detected = true;
- return true;
- }
}
- return false;
}
static struct binder_buffer *binder_alloc_new_buf_locked(
@@ -414,15 +373,11 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
size_t size, data_offsets_size;
int ret;
- down_read;
- if (!binder_alloc_get_vma(alloc)) {
- up_read;
- binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
- "%d: binder_alloc_buf, no vma\n",
- alloc->pid);
+ if (alloc->vma == NULL) {
+ pr_err("%d: binder_alloc_buf, no vma\n",
+ alloc->pid);
return ERR_PTR(-ESRCH);
}
- up_read;
data_offsets_size = ALIGN(data_size, sizeof(void *)) +
ALIGN(offsets_size, sizeof(void *));
@@ -492,14 +447,11 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
if (buffer_size > largest_free_size)
largest_free_size = buffer_size;
}
- binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
- "%d: binder_alloc_buf size %zd failed, no address space\n",
- alloc->pid, size);
- binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
- "allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n",
- total_alloc_size, allocated_buffers,
- largest_alloc_size, total_free_size,
- free_buffers, largest_free_size);
+ pr_err("%d: binder_alloc_buf size %zd failed, no address space\n",
+ alloc->pid, size);
+ pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n",
+ total_alloc_size, allocated_buffers, largest_alloc_size,
+ total_free_size, free_buffers, largest_free_size);
return ERR_PTR(-ENOSPC);
}
if (n == NULL) {
@@ -526,7 +478,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
if (buffer_size != size) {
struct binder_buffer *new_buffer;
- new_buffer = kmem_cache_zalloc(binder_buffer_pool, GFP_KERNEL);
+ new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
if (!new_buffer) {
pr_err("%s: %d failed to alloc new buffer struct\n",
__func__, alloc->pid);
@@ -550,7 +502,6 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
buffer->async_transaction = is_async;
buffer->extra_buffers_size = extra_buffers_size;
buffer->pid = pid;
- buffer->oneway_spam_suspect = false;
if (is_async) {
alloc->free_async_space -= size + sizeof(struct binder_buffer);
binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
@@ -562,9 +513,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
* of async space left (which is less than 10% of total
* buffer size).
*/
- buffer->oneway_spam_suspect = debug_low_async_space_locked(alloc, pid);
- } else {
- alloc->oneway_spam_detected = false;
+ debug_low_async_space_locked(alloc, pid);
}
}
return buffer;
@@ -624,7 +573,6 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc,
{
struct binder_buffer *prev, *next = NULL;
bool to_free = true;
-
BUG_ON(alloc->buffers.next == &buffer->entry);
prev = binder_buffer_prev(buffer);
BUG_ON(!prev->free);
@@ -665,7 +613,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc,
buffer_start_page(buffer) + PAGE_SIZE);
}
list_del(&buffer->entry);
- kmem_cache_free(binder_buffer_pool, buffer);
+ kfree(buffer);
}
static void binder_free_buf_locked(struct binder_alloc *alloc,
@@ -690,7 +638,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc,
BUG_ON(buffer->user_data > alloc->buffer + alloc->buffer_size);
if (buffer->async_transaction) {
- alloc->free_async_space += buffer_size + sizeof(struct binder_buffer);
+ alloc->free_async_space += size + sizeof(struct binder_buffer);
binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
"%d: binder_free_buf size %zd async free %zd\n",
@@ -731,7 +679,7 @@ static void binder_alloc_clear_buf(struct binder_alloc *alloc,
* @alloc: binder_alloc for this proc
* @buffer: kernel pointer to buffer
*
- * Free the buffer allocated via binder_alloc_new_buf()
+ * Free the buffer allocated via binder_alloc_new_buf()
*/
void binder_alloc_free_buf(struct binder_alloc *alloc,
struct binder_buffer *buffer)
@@ -773,34 +721,27 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
const char *failure_string;
struct binder_buffer *buffer;
- if (unlikely(vma->vm_mm != alloc->vma_vm_mm)) {
- ret = -EINVAL;
- failure_string = "invalid vma->vm_mm";
- goto err_invalid_mm;
- }
-
mutex_lock(&binder_alloc_mmap_lock);
- if (alloc->buffer_size) {
+ if (alloc->buffer) {
ret = -EBUSY;
failure_string = "already mapped";
goto err_already_mapped;
}
- alloc->buffer_size = min_t(unsigned long, vma->vm_end - vma->vm_start,
- SZ_4M);
- mutex_unlock(&binder_alloc_mmap_lock);
alloc->buffer = (void __user *)vma->vm_start;
+ mutex_unlock(&binder_alloc_mmap_lock);
- alloc->pages = kcalloc(alloc->buffer_size / PAGE_SIZE,
- sizeof(alloc->pages[0]),
+ alloc->pages = kzalloc(sizeof(alloc->pages[0]) *
+ ((vma->vm_end - vma->vm_start) / PAGE_SIZE),
GFP_KERNEL);
if (alloc->pages == NULL) {
ret = -ENOMEM;
failure_string = "alloc page array";
goto err_alloc_pages_failed;
}
+ alloc->buffer_size = vma->vm_end - vma->vm_start;
- buffer = kmem_cache_zalloc(binder_buffer_pool, GFP_KERNEL);
+ buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
if (!buffer) {
ret = -ENOMEM;
failure_string = "alloc buffer struct";
@@ -812,7 +753,11 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
buffer->free = 1;
binder_insert_free_buffer(alloc, buffer);
alloc->free_async_space = alloc->buffer_size / 2;
- binder_alloc_set_vma(alloc, vma);
+ barrier();
+ alloc->vma = vma;
+ alloc->vma_vm_mm = vma->vm_mm;
+ /* Same as mmgrab() in later kernel versions */
+ atomic_inc(&alloc->vma_vm_mm->mm_count);
return 0;
@@ -820,16 +765,12 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
kfree(alloc->pages);
alloc->pages = NULL;
err_alloc_pages_failed:
- alloc->buffer = NULL;
mutex_lock(&binder_alloc_mmap_lock);
- alloc->buffer_size = 0;
+ alloc->buffer = NULL;
err_already_mapped:
mutex_unlock(&binder_alloc_mmap_lock);
-err_invalid_mm:
- binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
- "%s: %d %lx-%lx %s failed %d\n", __func__,
- alloc->pid, vma->vm_start, vma->vm_end,
- failure_string, ret);
+ pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
+ alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
return ret;
}
@@ -840,10 +781,10 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc)
int buffers, page_count;
struct binder_buffer *buffer;
- buffers = 0;
- mutex_lock(&alloc->mutex);
BUG_ON(alloc->vma);
+ buffers = 0;
+ mutex_lock(&alloc->mutex);
while ((n = rb_first(&alloc->allocated_buffers))) {
buffer = rb_entry(n, struct binder_buffer, rb_node);
@@ -865,7 +806,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc)
list_del(&buffer->entry);
WARN_ON_ONCE(!list_empty(&alloc->buffers));
- kmem_cache_free(binder_buffer_pool, buffer);
+ kfree(buffer);
}
page_count = 0;
@@ -945,18 +886,6 @@ void binder_alloc_print_pages(struct seq_file *m,
int free = 0;
mutex_lock(&alloc->mutex);
- /*
- * Make sure the binder_alloc is fully initialized, otherwise we might
- * read inconsistent state.
- */
-
- down_read;
- if (binder_alloc_get_vma(alloc) == NULL) {
- up_read;
- goto uninitialized;
- }
-
- up_read;
for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) {
page = &alloc->pages[i];
if (!page->page_ptr)
@@ -966,8 +895,6 @@ void binder_alloc_print_pages(struct seq_file *m,
else
lru++;
}
-
-uninitialized:
mutex_unlock(&alloc->mutex);
seq_printf(m, " pages: %d:%d:%d\n", active, lru, free);
seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high);
@@ -1002,7 +929,7 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc)
*/
void binder_alloc_vma_close(struct binder_alloc *alloc)
{
- binder_alloc_set_vma(alloc, NULL);
+ WRITE_ONCE(alloc->vma, NULL);
}
/**
@@ -1018,7 +945,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
struct list_lru_one *lru,
spinlock_t *lock,
void *cb_arg)
- __must_hold(lock)
{
struct mm_struct *mm = NULL;
struct binder_lru_page *page = container_of(item,
@@ -1042,9 +968,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
mm = alloc->vma_vm_mm;
if (!mmget_not_zero(mm))
goto err_mmget;
- if (!*down_read_trylock)
- goto err_down_read_mmap_sem_failed;
- vma = binder_alloc_get_vma(alloc);
+ if (!down_write_trylock(&mm->mmap_sem))
+ goto err_down_write_mmap_sem_failed;
+ vma = alloc->vma;
list_lru_isolate(lru, item);
spin_unlock(lock);
@@ -1056,8 +982,8 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
trace_binder_unmap_user_end(alloc, index);
}
- up_read;
- mmput_async(mm);
+ up_write(&mm->mmap_sem);
+ mmput(mm);
trace_binder_unmap_kernel_start(alloc, index);
@@ -1070,7 +996,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
mutex_unlock(&alloc->mutex);
return LRU_REMOVED_RETRY;
-err_down_read_mmap_sem_failed:
+err_down_write_mmap_sem_failed:
mmput_async(mm);
err_mmget:
err_page_already_freed:
@@ -1112,8 +1038,6 @@ static struct shrinker binder_shrinker = {
void binder_alloc_init(struct binder_alloc *alloc)
{
alloc->pid = current->group_leader->pid;
- alloc->vma_vm_mm = current->mm;
- mmgrab(alloc->vma_vm_mm);
mutex_init(&alloc->mutex);
INIT_LIST_HEAD(&alloc->buffers);
}
@@ -1271,16 +1195,15 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc,
return 0;
}
-static int binder_alloc_do_buffer_copy(struct binder_alloc *alloc,
- bool to_buffer,
- struct binder_buffer *buffer,
- binder_size_t buffer_offset,
- void *ptr,
- size_t bytes)
+static void binder_alloc_do_buffer_copy(struct binder_alloc *alloc,
+ bool to_buffer,
+ struct binder_buffer *buffer,
+ binder_size_t buffer_offset,
+ void *ptr,
+ size_t bytes)
{
/* All copies must be 32-bit aligned and 32-bit size */
- if (!check_buffer(alloc, buffer, buffer_offset, bytes))
- return -EINVAL;
+ BUG_ON(!check_buffer(alloc, buffer, buffer_offset, bytes));
while (bytes) {
unsigned long size;
@@ -1308,25 +1231,25 @@ static int binder_alloc_do_buffer_copy(struct binder_alloc *alloc,
ptr = ptr + size;
buffer_offset += size;
}
- return 0;
}
-int binder_alloc_copy_to_buffer(struct binder_alloc *alloc,
- struct binder_buffer *buffer,
- binder_size_t buffer_offset,
- void *src,
- size_t bytes)
+void binder_alloc_copy_to_buffer(struct binder_alloc *alloc,
+ struct binder_buffer *buffer,
+ binder_size_t buffer_offset,
+ void *src,
+ size_t bytes)
{
- return binder_alloc_do_buffer_copy(alloc, true, buffer, buffer_offset,
- src, bytes);
+ binder_alloc_do_buffer_copy(alloc, true, buffer, buffer_offset,
+ src, bytes);
}
-int binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
- void *dest,
- struct binder_buffer *buffer,
- binder_size_t buffer_offset,
- size_t bytes)
+void binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
+ void *dest,
+ struct binder_buffer *buffer,
+ binder_size_t buffer_offset,
+ size_t bytes)
{
- return binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset,
- dest, bytes);
+ binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset,
+ dest, bytes);
}
+
diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h
index a30eb98d99f4..da025cc94cd9 100644
--- a/drivers/android/binder_alloc.h
+++ b/drivers/android/binder_alloc.h
@@ -1,6 +1,15 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
*/
#ifndef _LINUX_BINDER_ALLOC_H
@@ -13,6 +22,11 @@
#include
#include
#include
+
+#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT
+#define BINDER_IPC_32BIT 1
+#endif
+
#include
extern struct list_lru binder_alloc_lru;
@@ -26,8 +40,6 @@ struct binder_transaction;
* @clear_on_free: %true if buffer must be zeroed after use
* @allow_user_free: %true if user is allowed to free buffer
* @async_transaction: %true if buffer is in use for an async txn
- * @oneway_spam_suspect: %true if total async allocate size just exceed
- * spamming detect threshold
* @debug_id: unique ID for debugging
* @transaction: pointer to associated struct binder_transaction
* @target_node: struct binder_node associated with this buffer
@@ -47,8 +59,7 @@ struct binder_buffer {
unsigned clear_on_free:1;
unsigned allow_user_free:1;
unsigned async_transaction:1;
- unsigned oneway_spam_suspect:1;
- unsigned debug_id:27;
+ unsigned debug_id:28;
struct binder_transaction *transaction;
@@ -90,8 +101,6 @@ struct binder_lru_page {
* @buffer_size: size of address space specified via mmap
* @pid: pid for associated binder_proc (invariant after init)
* @pages_high: high watermark of offset in @pages
- * @oneway_spam_detected: %true if oneway spam detection fired, clear that
- * flag once the async buffer has returned to a healthy state
*
* Bookkeeping structure for per-proc address space management for binder
* buffers. It is normally initialized during binder_init() and binder_mmap()
@@ -112,7 +121,6 @@ struct binder_alloc {
uint32_t buffer_free;
int pid;
size_t pages_high;
- bool oneway_spam_detected;
};
#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
@@ -145,8 +153,6 @@ extern void binder_alloc_print_allocated(struct seq_file *m,
struct binder_alloc *alloc);
void binder_alloc_print_pages(struct seq_file *m,
struct binder_alloc *alloc);
-extern int binder_buffer_pool_create(void);
-extern void binder_buffer_pool_destroy(void);
/**
* binder_alloc_get_free_async_space() - get free space available for async
@@ -172,16 +178,17 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc,
const void __user *from,
size_t bytes);
-int binder_alloc_copy_to_buffer(struct binder_alloc *alloc,
- struct binder_buffer *buffer,
- binder_size_t buffer_offset,
- void *src,
- size_t bytes);
+void binder_alloc_copy_to_buffer(struct binder_alloc *alloc,
+ struct binder_buffer *buffer,
+ binder_size_t buffer_offset,
+ void *src,
+ size_t bytes);
-int binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
- void *dest,
- struct binder_buffer *buffer,
- binder_size_t buffer_offset,
- size_t bytes);
+void binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
+ void *dest,
+ struct binder_buffer *buffer,
+ binder_size_t buffer_offset,
+ size_t bytes);
#endif /* _LINUX_BINDER_ALLOC_H */
+
diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c
index c2b323bc3b3a..c839c490fde3 100644
--- a/drivers/android/binder_alloc_selftest.c
+++ b/drivers/android/binder_alloc_selftest.c
@@ -1,9 +1,18 @@
-// SPDX-License-Identifier: GPL-2.0-only
/* binder_alloc_selftest.c
*
* Android IPC Subsystem
*
* Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
deleted file mode 100644
index 3b6918d8a977..000000000000
--- a/drivers/android/binder_internal.h
+++ /dev/null
@@ -1,603 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _LINUX_BINDER_INTERNAL_H
-#define _LINUX_BINDER_INTERNAL_H
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "binder_alloc.h"
-
-#define ida_alloc_max(a, b, c) ida_simple_get(a, 0, b + 1, c)
-#define ida_free ida_remove
-
-typedef unsigned int __poll_t;
-typedef __bitwise int vm_fault_t;
-
-struct binder_context {
- struct binder_node *binder_context_mgr_node;
- struct mutex context_mgr_node_lock;
- kuid_t binder_context_mgr_uid;
- const char *name;
-};
-
-/**
- * struct binder_device - information about a binder device node
- * @hlist: list of binder devices (only used for devices requested via
- * CONFIG_ANDROID_BINDER_DEVICES)
- * @miscdev: information about a binder character device node
- * @context: binder context information
- * @binderfs_inode: This is the inode of the root dentry of the super block
- * belonging to a binderfs mount.
- */
-struct binder_device {
- struct hlist_node hlist;
- struct miscdevice miscdev;
- struct binder_context context;
- struct inode *binderfs_inode;
- refcount_t ref;
-};
-
-/**
- * binderfs_mount_opts - mount options for binderfs
- * @max: maximum number of allocatable binderfs binder devices
- * @stats_mode: enable binder stats in binderfs.
- */
-struct binderfs_mount_opts {
- int max;
- int stats_mode;
-};
-
-/**
- * binderfs_info - information about a binderfs mount
- * @ipc_ns: The ipc namespace the binderfs mount belongs to.
- * @control_dentry: This records the dentry of this binderfs mount
- * binder-control device.
- * @root_uid: uid that needs to be used when a new binder device is
- * created.
- * @root_gid: gid that needs to be used when a new binder device is
- * created.
- * @mount_opts: The mount options in use.
- * @device_count: The current number of allocated binder devices.
- * @proc_log_dir: Pointer to the directory dentry containing process-specific
- * logs.
- */
-struct binderfs_info {
- struct ipc_namespace *ipc_ns;
- struct dentry *control_dentry;
- kuid_t root_uid;
- kgid_t root_gid;
- struct binderfs_mount_opts mount_opts;
- int device_count;
- struct dentry *proc_log_dir;
-};
-
-extern const struct file_operations binder_fops;
-
-extern char *binder_devices_param;
-
-#ifdef CONFIG_ANDROID_BINDERFS
-extern bool is_binderfs_device(const struct inode *inode);
-extern struct dentry *binderfs_create_file(struct dentry *dir, const char *name,
- const struct file_operations *fops,
- void *data);
-extern void binderfs_remove_file(struct dentry *dentry);
-#else
-static inline bool is_binderfs_device(const struct inode *inode)
-{
- return false;
-}
-static inline struct dentry *binderfs_create_file(struct dentry *dir,
- const char *name,
- const struct file_operations *fops,
- void *data)
-{
- return NULL;
-}
-static inline void binderfs_remove_file(struct dentry *dentry) {}
-#endif
-
-#ifdef CONFIG_ANDROID_BINDERFS
-extern int __init init_binderfs(void);
-#else
-static inline int __init init_binderfs(void)
-{
- return 0;
-}
-#endif
-
-struct binder_debugfs_entry {
- const char *name;
- umode_t mode;
- const struct file_operations *fops;
- void *data;
-};
-
-extern const struct binder_debugfs_entry binder_debugfs_entries[];
-
-#define binder_for_each_debugfs_entry(entry) \
- for ((entry) = binder_debugfs_entries; \
- (entry)->name; \
- (entry)++)
-
-enum binder_stat_types {
- BINDER_STAT_PROC,
- BINDER_STAT_THREAD,
- BINDER_STAT_NODE,
- BINDER_STAT_REF,
- BINDER_STAT_DEATH,
- BINDER_STAT_TRANSACTION,
- BINDER_STAT_TRANSACTION_COMPLETE,
- BINDER_STAT_COUNT
-};
-
-struct binder_stats {
- atomic_t br[_IOC_NR(BR_ONEWAY_SPAM_SUSPECT) + 1];
- atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1];
- atomic_t obj_created[BINDER_STAT_COUNT];
- atomic_t obj_deleted[BINDER_STAT_COUNT];
-};
-
-/**
- * struct binder_work - work enqueued on a worklist
- * @entry: node enqueued on list
- * @type: type of work to be performed
- *
- * There are separate work lists for proc, thread, and node (async).
- */
-struct binder_work {
- struct list_head entry;
-
- enum binder_work_type {
- BINDER_WORK_TRANSACTION = 1,
- BINDER_WORK_TRANSACTION_COMPLETE,
- BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT,
- BINDER_WORK_RETURN_ERROR,
- BINDER_WORK_NODE,
- BINDER_WORK_DEAD_BINDER,
- BINDER_WORK_DEAD_BINDER_AND_CLEAR,
- BINDER_WORK_CLEAR_DEATH_NOTIFICATION,
- } type;
-};
-
-struct binder_error {
- struct binder_work work;
- uint32_t cmd;
-};
-
-/**
- * struct binder_node - binder node bookkeeping
- * @debug_id: unique ID for debugging
- * (invariant after initialized)
- * @lock: lock for node fields
- * @work: worklist element for node work
- * (protected by @proc->inner_lock)
- * @rb_node: element for proc->nodes tree
- * (protected by @proc->inner_lock)
- * @dead_node: element for binder_dead_nodes list
- * (protected by binder_dead_nodes_lock)
- * @proc: binder_proc that owns this node
- * (invariant after initialized)
- * @refs: list of references on this node
- * (protected by @lock)
- * @internal_strong_refs: used to take strong references when
- * initiating a transaction
- * (protected by @proc->inner_lock if @proc
- * and by @lock)
- * @local_weak_refs: weak user refs from local process
- * (protected by @proc->inner_lock if @proc
- * and by @lock)
- * @local_strong_refs: strong user refs from local process
- * (protected by @proc->inner_lock if @proc
- * and by @lock)
- * @tmp_refs: temporary kernel refs
- * (protected by @proc->inner_lock while @proc
- * is valid, and by binder_dead_nodes_lock
- * if @proc is NULL. During inc/dec and node release
- * it is also protected by @lock to provide safety
- * as the node dies and @proc becomes NULL)
- * @ptr: userspace pointer for node
- * (invariant, no lock needed)
- * @cookie: userspace cookie for node
- * (invariant, no lock needed)
- * @has_strong_ref: userspace notified of strong ref
- * (protected by @proc->inner_lock if @proc
- * and by @lock)
- * @pending_strong_ref: userspace has acked notification of strong ref
- * (protected by @proc->inner_lock if @proc
- * and by @lock)
- * @has_weak_ref: userspace notified of weak ref
- * (protected by @proc->inner_lock if @proc
- * and by @lock)
- * @pending_weak_ref: userspace has acked notification of weak ref
- * (protected by @proc->inner_lock if @proc
- * and by @lock)
- * @has_async_transaction: async transaction to node in progress
- * (protected by @lock)
- * @sched_policy: minimum scheduling policy for node
- * (invariant after initialized)
- * @accept_fds: file descriptor operations supported for node
- * (invariant after initialized)
- * @min_priority: minimum scheduling priority
- * (invariant after initialized)
- * @inherit_rt: inherit RT scheduling policy from caller
- * @txn_security_ctx: require sender's security context
- * (invariant after initialized)
- * @async_todo: list of async work items
- * (protected by @proc->inner_lock)
- *
- * Bookkeeping structure for binder nodes.
- */
-struct binder_node {
- int debug_id;
- spinlock_t lock;
- struct binder_work work;
- union {
- struct rb_node rb_node;
- struct hlist_node dead_node;
- };
- struct binder_proc *proc;
- struct hlist_head refs;
- int internal_strong_refs;
- int local_weak_refs;
- int local_strong_refs;
- int tmp_refs;
- binder_uintptr_t ptr;
- binder_uintptr_t cookie;
- struct {
- /*
- * bitfield elements protected by
- * proc inner_lock
- */
- u8 has_strong_ref:1;
- u8 pending_strong_ref:1;
- u8 has_weak_ref:1;
- u8 pending_weak_ref:1;
- };
- struct {
- /*
- * invariant after initialization
- */
- u8 sched_policy:2;
- u8 inherit_rt:1;
- u8 accept_fds:1;
- u8 txn_security_ctx:1;
- u8 min_priority;
- };
- bool has_async_transaction;
- struct list_head async_todo;
-};
-
-struct binder_ref_death {
- /**
- * @work: worklist element for death notifications
- * (protected by inner_lock of the proc that
- * this ref belongs to)
- */
- struct binder_work work;
- binder_uintptr_t cookie;
-};
-
-/**
- * struct binder_ref_data - binder_ref counts and id
- * @debug_id: unique ID for the ref
- * @desc: unique userspace handle for ref
- * @strong: strong ref count (debugging only if not locked)
- * @weak: weak ref count (debugging only if not locked)
- *
- * Structure to hold ref count and ref id information. Since
- * the actual ref can only be accessed with a lock, this structure
- * is used to return information about the ref to callers of
- * ref inc/dec functions.
- */
-struct binder_ref_data {
- int debug_id;
- uint32_t desc;
- int strong;
- int weak;
-};
-
-/**
- * struct binder_ref - struct to track references on nodes
- * @data: binder_ref_data containing id, handle, and current refcounts
- * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree
- * @rb_node_node: node for lookup by @node in proc's rb_tree
- * @node_entry: list entry for node->refs list in target node
- * (protected by @node->lock)
- * @proc: binder_proc containing ref
- * @node: binder_node of target node. When cleaning up a
- * ref for deletion in binder_cleanup_ref, a non-NULL
- * @node indicates the node must be freed
- * @death: pointer to death notification (ref_death) if requested
- * (protected by @node->lock)
- *
- * Structure to track references from procA to target node (on procB). This
- * structure is unsafe to access without holding @proc->outer_lock.
- */
-struct binder_ref {
- /* Lookups needed: */
- /* node + proc => ref (transaction) */
- /* desc + proc => ref (transaction, inc/dec ref) */
- /* node => refs + procs (proc exit) */
- struct binder_ref_data data;
- struct rb_node rb_node_desc;
- struct rb_node rb_node_node;
- struct hlist_node node_entry;
- struct binder_proc *proc;
- struct binder_node *node;
- struct binder_ref_death *death;
-};
-
-/**
- * struct binder_priority - scheduler policy and priority
- * @sched_policy scheduler policy
- * @prio [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT
- *
- * The binder driver supports inheriting the following scheduler policies:
- * SCHED_NORMAL
- * SCHED_BATCH
- * SCHED_FIFO
- * SCHED_RR
- */
-struct binder_priority {
- unsigned int sched_policy;
- int prio;
-};
-
-enum binder_prio_state {
- BINDER_PRIO_SET, /* desired priority set */
- BINDER_PRIO_PENDING, /* initiated a saved priority restore */
- BINDER_PRIO_ABORT, /* abort the pending priority restore */
-};
-
-/**
- * struct binder_proc - binder process bookkeeping
- * @proc_node: element for binder_procs list
- * @threads: rbtree of binder_threads in this proc
- * (protected by @inner_lock)
- * @nodes: rbtree of binder nodes associated with
- * this proc ordered by node->ptr
- * (protected by @inner_lock)
- * @refs_by_desc: rbtree of refs ordered by ref->desc
- * (protected by @outer_lock)
- * @refs_by_node: rbtree of refs ordered by ref->node
- * (protected by @outer_lock)
- * @waiting_threads: threads currently waiting for proc work
- * (protected by @inner_lock)
- * @pid PID of group_leader of process
- * (invariant after initialized)
- * @tsk task_struct for group_leader of process
- * (invariant after initialized)
- * @cred struct cred associated with the `struct file`
- * in binder_open()
- * (invariant after initialized)
- * @deferred_work_node: element for binder_deferred_list
- * (protected by binder_deferred_lock)
- * @deferred_work: bitmap of deferred work to perform
- * (protected by binder_deferred_lock)
- * @outstanding_txns: number of transactions to be transmitted before
- * processes in freeze_wait are woken up
- * (protected by @inner_lock)
- * @is_dead: process is dead and awaiting free
- * when outstanding transactions are cleaned up
- * (protected by @inner_lock)
- * @is_frozen: process is frozen and unable to service
- * binder transactions
- * (protected by @inner_lock)
- * @sync_recv: process received sync transactions since last frozen
- * bit 0: received sync transaction after being frozen
- * bit 1: new pending sync transaction during freezing
- * (protected by @inner_lock)
- * @async_recv: process received async transactions since last frozen
- * (protected by @inner_lock)
- * @freeze_wait: waitqueue of processes waiting for all outstanding
- * transactions to be processed
- * (protected by @inner_lock)
- * @todo: list of work for this process
- * (protected by @inner_lock)
- * @stats: per-process binder statistics
- * (atomics, no lock needed)
- * @delivered_death: list of delivered death notification
- * (protected by @inner_lock)
- * @max_threads: cap on number of binder threads
- * (protected by @inner_lock)
- * @requested_threads: number of binder threads requested but not
- * yet started. In current implementation, can
- * only be 0 or 1.
- * (protected by @inner_lock)
- * @requested_threads_started: number binder threads started
- * (protected by @inner_lock)
- * @tmp_ref: temporary reference to indicate proc is in use
- * (protected by @inner_lock)
- * @default_priority: default scheduler priority
- * (invariant after initialized)
- * @debugfs_entry: debugfs node
- * @alloc: binder allocator bookkeeping
- * @context: binder_context for this proc
- * (invariant after initialized)
- * @inner_lock: can nest under outer_lock and/or node lock
- * @outer_lock: no nesting under innor or node lock
- * Lock order: 1) outer, 2) node, 3) inner
- * @binderfs_entry: process-specific binderfs log file
- * @oneway_spam_detection_enabled: process enabled oneway spam detection
- * or not
- *
- * Bookkeeping structure for binder processes
- */
-struct binder_proc {
- struct hlist_node proc_node;
- struct rb_root threads;
- struct rb_root nodes;
- struct rb_root refs_by_desc;
- struct rb_root refs_by_node;
- struct list_head waiting_threads;
- int pid;
- struct task_struct *tsk;
- const struct cred *cred;
- struct hlist_node deferred_work_node;
- int deferred_work;
- int outstanding_txns;
- bool is_dead;
- bool is_frozen;
- bool sync_recv;
- bool async_recv;
- wait_queue_head_t freeze_wait;
-
- struct list_head todo;
- struct binder_stats stats;
- struct list_head delivered_death;
- int max_threads;
- int requested_threads;
- int requested_threads_started;
- int tmp_ref;
- struct binder_priority default_priority;
- struct dentry *debugfs_entry;
- struct binder_alloc alloc;
- struct binder_context *context;
- spinlock_t inner_lock;
- spinlock_t outer_lock;
- struct dentry *binderfs_entry;
- bool oneway_spam_detection_enabled;
-};
-
-/**
- * struct binder_thread - binder thread bookkeeping
- * @proc: binder process for this thread
- * (invariant after initialization)
- * @rb_node: element for proc->threads rbtree
- * (protected by @proc->inner_lock)
- * @waiting_thread_node: element for @proc->waiting_threads list
- * (protected by @proc->inner_lock)
- * @pid: PID for this thread
- * (invariant after initialization)
- * @looper: bitmap of looping state
- * (only accessed by this thread)
- * @looper_needs_return: looping thread needs to exit driver
- * (no lock needed)
- * @transaction_stack: stack of in-progress transactions for this thread
- * (protected by @proc->inner_lock)
- * @todo: list of work to do for this thread
- * (protected by @proc->inner_lock)
- * @process_todo: whether work in @todo should be processed
- * (protected by @proc->inner_lock)
- * @return_error: transaction errors reported by this thread
- * (only accessed by this thread)
- * @reply_error: transaction errors reported by target thread
- * (protected by @proc->inner_lock)
- * @wait: wait queue for thread work
- * @stats: per-thread statistics
- * (atomics, no lock needed)
- * @tmp_ref: temporary reference to indicate thread is in use
- * (atomic since @proc->inner_lock cannot
- * always be acquired)
- * @is_dead: thread is dead and awaiting free
- * when outstanding transactions are cleaned up
- * (protected by @proc->inner_lock)
- * @task: struct task_struct for this thread
- * @prio_lock: protects thread priority fields
- * @prio_next: saved priority to be restored next
- * (protected by @prio_lock)
- * @prio_state: state of the priority restore process as
- * defined by enum binder_prio_state
- * (protected by @prio_lock)
- *
- * Bookkeeping structure for binder threads.
- */
-struct binder_thread {
- struct binder_proc *proc;
- struct rb_node rb_node;
- struct list_head waiting_thread_node;
- int pid;
- int looper; /* only modified by this thread */
- bool looper_need_return; /* can be written by other thread */
- struct binder_transaction *transaction_stack;
- struct list_head todo;
- bool process_todo;
- struct binder_error return_error;
- struct binder_error reply_error;
- wait_queue_head_t wait;
- struct binder_stats stats;
- atomic_t tmp_ref;
- bool is_dead;
- struct task_struct *task;
- spinlock_t prio_lock;
- struct binder_priority prio_next;
- enum binder_prio_state prio_state;
-};
-
-/**
- * struct binder_txn_fd_fixup - transaction fd fixup list element
- * @fixup_entry: list entry
- * @file: struct file to be associated with new fd
- * @offset: offset in buffer data to this fixup
- *
- * List element for fd fixups in a transaction. Since file
- * descriptors need to be allocated in the context of the
- * target process, we pass each fd to be processed in this
- * struct.
- */
-struct binder_txn_fd_fixup {
- struct list_head fixup_entry;
- struct file *file;
- size_t offset;
-};
-
-struct binder_transaction {
- int debug_id;
- struct binder_work work;
- struct binder_thread *from;
- struct binder_transaction *from_parent;
- struct binder_proc *to_proc;
- struct binder_thread *to_thread;
- struct binder_transaction *to_parent;
- unsigned need_reply:1;
- /* unsigned is_dead:1; */ /* not used at the moment */
-
- struct binder_buffer *buffer;
- unsigned int code;
- unsigned int flags;
- struct binder_priority priority;
- struct binder_priority saved_priority;
- bool set_priority_called;
- bool is_nested;
- kuid_t sender_euid;
- struct list_head fd_fixups;
- binder_uintptr_t security_ctx;
- /**
- * @lock: protects @from, @to_proc, and @to_thread
- *
- * @from, @to_proc, and @to_thread can be set to NULL
- * during thread teardown
- */
- spinlock_t lock;
-};
-
-/**
- * struct binder_object - union of flat binder object types
- * @hdr: generic object header
- * @fbo: binder object (nodes and refs)
- * @fdo: file descriptor object
- * @bbo: binder buffer pointer
- * @fdao: file descriptor array
- *
- * Used for type-independent object copies
- */
-struct binder_object {
- union {
- struct binder_object_header hdr;
- struct flat_binder_object fbo;
- struct binder_fd_object fdo;
- struct binder_buffer_object bbo;
- struct binder_fd_array_object fdao;
- };
-};
-
-#endif /* _LINUX_BINDER_INTERNAL_H */
diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h
index 5d82cf8af88b..7674231af8cb 100644
--- a/drivers/android/binder_trace.h
+++ b/drivers/android/binder_trace.h
@@ -1,6 +1,15 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
*/
#undef TRACE_SYSTEM
@@ -119,35 +128,6 @@ TRACE_EVENT(binder_wait_for_work,
__entry->thread_todo)
);
-TRACE_EVENT(binder_txn_latency_free,
- TP_PROTO(struct binder_transaction *t,
- int from_proc, int from_thread,
- int to_proc, int to_thread),
- TP_ARGS(t, from_proc, from_thread, to_proc, to_thread),
- TP_STRUCT__entry(
- __field(int, debug_id)
- __field(int, from_proc)
- __field(int, from_thread)
- __field(int, to_proc)
- __field(int, to_thread)
- __field(unsigned int, code)
- __field(unsigned int, flags)
- ),
- TP_fast_assign(
- __entry->debug_id = t->debug_id;
- __entry->from_proc = from_proc;
- __entry->from_thread = from_thread;
- __entry->to_proc = to_proc;
- __entry->to_thread = to_thread;
- __entry->code = t->code;
- __entry->flags = t->flags;
- ),
- TP_printk("transaction=%d from %d:%d to %d:%d flags=0x%x code=0x%x",
- __entry->debug_id, __entry->from_proc, __entry->from_thread,
- __entry->to_proc, __entry->to_thread, __entry->code,
- __entry->flags)
-);
-
TRACE_EVENT(binder_transaction,
TP_PROTO(bool reply, struct binder_transaction *t,
struct binder_node *target_node),
@@ -267,40 +247,22 @@ TRACE_EVENT(binder_transaction_ref_to_ref,
__entry->dest_ref_debug_id, __entry->dest_ref_desc)
);
-TRACE_EVENT(binder_transaction_fd_send,
- TP_PROTO(struct binder_transaction *t, int fd, size_t offset),
- TP_ARGS(t, fd, offset),
+TRACE_EVENT(binder_transaction_fd,
+ TP_PROTO(struct binder_transaction *t, int src_fd, int dest_fd),
+ TP_ARGS(t, src_fd, dest_fd),
TP_STRUCT__entry(
__field(int, debug_id)
- __field(int, fd)
- __field(size_t, offset)
+ __field(int, src_fd)
+ __field(int, dest_fd)
),
TP_fast_assign(
__entry->debug_id = t->debug_id;
- __entry->fd = fd;
- __entry->offset = offset;
+ __entry->src_fd = src_fd;
+ __entry->dest_fd = dest_fd;
),
- TP_printk("transaction=%d src_fd=%d offset=%zu",
- __entry->debug_id, __entry->fd, __entry->offset)
-);
-
-TRACE_EVENT(binder_transaction_fd_recv,
- TP_PROTO(struct binder_transaction *t, int fd, size_t offset),
- TP_ARGS(t, fd, offset),
-
- TP_STRUCT__entry(
- __field(int, debug_id)
- __field(int, fd)
- __field(size_t, offset)
- ),
- TP_fast_assign(
- __entry->debug_id = t->debug_id;
- __entry->fd = fd;
- __entry->offset = offset;
- ),
- TP_printk("transaction=%d dest_fd=%d offset=%zu",
- __entry->debug_id, __entry->fd, __entry->offset)
+ TP_printk("transaction=%d src_fd=%d ==> dest_fd=%d",
+ __entry->debug_id, __entry->src_fd, __entry->dest_fd)
);
DECLARE_EVENT_CLASS(binder_buffer_class,
@@ -310,17 +272,14 @@ DECLARE_EVENT_CLASS(binder_buffer_class,
__field(int, debug_id)
__field(size_t, data_size)
__field(size_t, offsets_size)
- __field(size_t, extra_buffers_size)
),
TP_fast_assign(
__entry->debug_id = buf->debug_id;
__entry->data_size = buf->data_size;
__entry->offsets_size = buf->offsets_size;
- __entry->extra_buffers_size = buf->extra_buffers_size;
),
- TP_printk("transaction=%d data_size=%zd offsets_size=%zd extra_buffers_size=%zd",
- __entry->debug_id, __entry->data_size, __entry->offsets_size,
- __entry->extra_buffers_size)
+ TP_printk("transaction=%d data_size=%zd offsets_size=%zd",
+ __entry->debug_id, __entry->data_size, __entry->offsets_size)
);
DEFINE_EVENT(binder_buffer_class, binder_transaction_alloc_buf,
@@ -335,10 +294,6 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release,
TP_PROTO(struct binder_buffer *buffer),
TP_ARGS(buffer));
-DEFINE_EVENT(binder_buffer_class, binder_transaction_update_buffer_release,
- TP_PROTO(struct binder_buffer *buffer),
- TP_ARGS(buffer));
-
TRACE_EVENT(binder_update_page_range,
TP_PROTO(struct binder_alloc *alloc, bool allocate,
void __user *start, void __user *end),
diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
deleted file mode 100644
index f80d1fb9d9b2..000000000000
--- a/drivers/android/binderfs.c
+++ /dev/null
@@ -1,819 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "binder_internal.h"
-
-#define ida_alloc_max(a, b, c) ida_simple_get(a, 0, b + 1, c)
-#define ida_free ida_remove
-
-#define FIRST_INODE 1
-#define SECOND_INODE 2
-#define INODE_OFFSET 3
-#define INTSTRLEN 21
-#define BINDERFS_MAX_MINOR (1U << MINORBITS)
-/* Ensure that the initial ipc namespace always has devices available. */
-#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4)
-
-static dev_t binderfs_dev;
-static DEFINE_MUTEX(binderfs_minors_mutex);
-static DEFINE_IDA(binderfs_minors);
-
-enum binderfs_param {
- Opt_max,
- Opt_stats_mode,
-};
-
-enum binderfs_stats_mode {
- binderfs_stats_mode_unset,
- binderfs_stats_mode_global,
-};
-
-struct binder_features {
- bool oneway_spam_detection;
-};
-
-static const struct constant_table binderfs_param_stats[] = {
- { "global", binderfs_stats_mode_global },
- {}
-};
-
-static const struct fs_parameter_spec binderfs_fs_parameters[] = {
- fsparam_u32("max", Opt_max),
- fsparam_enum("stats", Opt_stats_mode, binderfs_param_stats),
- {}
-};
-
-static struct binder_features binder_features = {
- .oneway_spam_detection = true,
-};
-
-static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-bool is_binderfs_device(const struct inode *inode)
-{
- if (inode->i_sb->s_magic == BINDERFS_SUPER_MAGIC)
- return true;
-
- return false;
-}
-
-/**
- * binderfs_binder_device_create - allocate inode from super block of a
- * binderfs mount
- * @ref_inode: inode from wich the super block will be taken
- * @userp: buffer to copy information about new device for userspace to
- * @req: struct binderfs_device as copied from userspace
- *
- * This function allocates a new binder_device and reserves a new minor
- * number for it.
- * Minor numbers are limited and tracked globally in binderfs_minors. The
- * function will stash a struct binder_device for the specific binder
- * device in i_private of the inode.
- * It will go on to allocate a new inode from the super block of the
- * filesystem mount, stash a struct binder_device in its i_private field
- * and attach a dentry to that inode.
- *
- * Return: 0 on success, negative errno on failure
- */
-static int binderfs_binder_device_create(struct inode *ref_inode,
- struct binderfs_device __user *userp,
- struct binderfs_device *req)
-{
- int minor, ret;
- struct dentry *dentry, *root;
- struct binder_device *device;
- char *name = NULL;
- size_t name_len;
- struct inode *inode = NULL;
- struct super_block *sb = ref_inode->i_sb;
- struct binderfs_info *info = sb->s_fs_info;
-#if defined(CONFIG_IPC_NS)
- bool use_reserve = (info->ipc_ns == &init_ipc_ns);
-#else
- bool use_reserve = true;
-#endif
-
- /* Reserve new minor number for the new device. */
- mutex_lock(&binderfs_minors_mutex);
- if (++info->device_count <= info->mount_opts.max)
- minor = ida_alloc_max(&binderfs_minors,
- use_reserve ? BINDERFS_MAX_MINOR :
- BINDERFS_MAX_MINOR_CAPPED,
- GFP_KERNEL);
- else
- minor = -ENOSPC;
- if (minor < 0) {
- --info->device_count;
- mutex_unlock(&binderfs_minors_mutex);
- return minor;
- }
- mutex_unlock(&binderfs_minors_mutex);
-
- ret = -ENOMEM;
- device = kzalloc(sizeof(*device), GFP_KERNEL);
- if (!device)
- goto err;
-
- inode = new_inode(sb);
- if (!inode)
- goto err;
-
- inode->i_ino = minor + INODE_OFFSET;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- init_special_inode(inode, S_IFCHR | 0600,
- MKDEV(MAJOR(binderfs_dev), minor));
- inode->i_fop = &binder_fops;
- inode->i_uid = info->root_uid;
- inode->i_gid = info->root_gid;
-
- req->name[BINDERFS_MAX_NAME] = '\0'; /* NUL-terminate */
- name_len = strlen(req->name);
- /* Make sure to include terminating NUL byte */
- name = kmemdup(req->name, name_len + 1, GFP_KERNEL);
- if (!name)
- goto err;
-
- refcount_set(&device->ref, 1);
- device->binderfs_inode = inode;
- device->context.binder_context_mgr_uid = INVALID_UID;
- device->context.name = name;
- device->miscdev.name = name;
- device->miscdev.minor = minor;
- mutex_init(&device->context.context_mgr_node_lock);
-
- req->major = MAJOR(binderfs_dev);
- req->minor = minor;
-
- if (userp && copy_to_user(userp, req, sizeof(*req))) {
- ret = -EFAULT;
- goto err;
- }
-
- root = sb->s_root;
- inode_lock(d_inode(root));
-
- /* look it up */
- dentry = lookup_one_len(name, root, name_len);
- if (IS_ERR(dentry)) {
- inode_unlock(d_inode(root));
- ret = PTR_ERR(dentry);
- goto err;
- }
-
- if (d_really_is_positive(dentry)) {
- /* already exists */
- dput(dentry);
- inode_unlock(d_inode(root));
- ret = -EEXIST;
- goto err;
- }
-
- inode->i_private = device;
- d_instantiate(dentry, inode);
- fsnotify_create(root->d_inode, dentry);
- inode_unlock(d_inode(root));
-
- return 0;
-
-err:
- kfree(name);
- kfree(device);
- mutex_lock(&binderfs_minors_mutex);
- --info->device_count;
- ida_free(&binderfs_minors, minor);
- mutex_unlock(&binderfs_minors_mutex);
- iput(inode);
-
- return ret;
-}
-
-/**
- * binderfs_ctl_ioctl - handle binder device node allocation requests
- *
- * The request handler for the binder-control device. All requests operate on
- * the binderfs mount the binder-control device resides in:
- * - BINDER_CTL_ADD
- * Allocate a new binder device.
- *
- * Return: 0 on success, negative errno on failure
- */
-static long binder_ctl_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- int ret = -EINVAL;
- struct inode *inode = file_inode(file);
- struct binderfs_device __user *device = (struct binderfs_device __user *)arg;
- struct binderfs_device device_req;
-
- switch (cmd) {
- case BINDER_CTL_ADD:
- ret = copy_from_user(&device_req, device, sizeof(device_req));
- if (ret) {
- ret = -EFAULT;
- break;
- }
-
- ret = binderfs_binder_device_create(inode, device, &device_req);
- break;
- default:
- break;
- }
-
- return ret;
-}
-
-static void binderfs_evict_inode(struct inode *inode)
-{
- struct binder_device *device = inode->i_private;
- struct binderfs_info *info = BINDERFS_SB(inode->i_sb);
-
- clear_inode(inode);
-
- if (!S_ISCHR(inode->i_mode) || !device)
- return;
-
- mutex_lock(&binderfs_minors_mutex);
- --info->device_count;
- ida_free(&binderfs_minors, device->miscdev.minor);
- mutex_unlock(&binderfs_minors_mutex);
-
- if (refcount_dec_and_test(&device->ref)) {
- kfree(device->context.name);
- kfree(device);
- }
-}
-
-static int binderfs_fs_context_parse_param(struct fs_context *fc,
- struct fs_parameter *param)
-{
- int opt;
- struct binderfs_mount_opts *ctx = fc->fs_private;
- struct fs_parse_result result;
-
- opt = fs_parse(fc, binderfs_fs_parameters, param, &result);
- if (opt < 0)
- return opt;
-
- switch (opt) {
- case Opt_max:
- if (result.uint_32 > BINDERFS_MAX_MINOR)
- return invalfc(fc, "Bad value for '%s'", param->key);
-
- ctx->max = result.uint_32;
- break;
- case Opt_stats_mode:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- ctx->stats_mode = result.uint_32;
- break;
- default:
- return invalfc(fc, "Unsupported parameter '%s'", param->key);
- }
-
- return 0;
-}
-
-static int binderfs_fs_context_reconfigure(struct fs_context *fc)
-{
- struct binderfs_mount_opts *ctx = fc->fs_private;
- struct binderfs_info *info = BINDERFS_SB(fc->root->d_sb);
-
- if (info->mount_opts.stats_mode != ctx->stats_mode)
- return invalfc(fc, "Binderfs stats mode cannot be changed during a remount");
-
- info->mount_opts.stats_mode = ctx->stats_mode;
- info->mount_opts.max = ctx->max;
- return 0;
-}
-
-static int binderfs_show_options(struct seq_file *seq, struct dentry *root)
-{
- struct binderfs_info *info = BINDERFS_SB(root->d_sb);
-
- if (info->mount_opts.max <= BINDERFS_MAX_MINOR)
- seq_printf(seq, ",max=%d", info->mount_opts.max);
-
- switch (info->mount_opts.stats_mode) {
- case binderfs_stats_mode_unset:
- break;
- case binderfs_stats_mode_global:
- seq_printf(seq, ",stats=global");
- break;
- }
-
- return 0;
-}
-
-static void binderfs_put_super(struct super_block *sb)
-{
- struct binderfs_info *info = sb->s_fs_info;
-
- if (info && info->ipc_ns)
- put_ipc_ns(info->ipc_ns);
-
- kfree(info);
- sb->s_fs_info = NULL;
-}
-
-static const struct super_operations binderfs_super_ops = {
- .evict_inode = binderfs_evict_inode,
- .show_options = binderfs_show_options,
- .statfs = simple_statfs,
- .put_super = binderfs_put_super,
-};
-
-static inline bool is_binderfs_control_device(const struct dentry *dentry)
-{
- struct binderfs_info *info = dentry->d_sb->s_fs_info;
-
- return info->control_dentry == dentry;
-}
-
-static int binderfs_rename(struct user_namespace *mnt_userns,
- struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
-{
- if (is_binderfs_control_device(old_dentry) ||
- is_binderfs_control_device(new_dentry))
- return -EPERM;
-
- return simple_rename(&init_user_ns, old_dir, old_dentry, new_dir,
- new_dentry, flags);
-}
-
-static int binderfs_unlink(struct inode *dir, struct dentry *dentry)
-{
- if (is_binderfs_control_device(dentry))
- return -EPERM;
-
- return simple_unlink(dir, dentry);
-}
-
-static const struct file_operations binder_ctl_fops = {
- .owner = THIS_MODULE,
- .open = nonseekable_open,
- .unlocked_ioctl = binder_ctl_ioctl,
- .compat_ioctl = binder_ctl_ioctl,
- .llseek = noop_llseek,
-};
-
-/**
- * binderfs_binder_ctl_create - create a new binder-control device
- * @sb: super block of the binderfs mount
- *
- * This function creates a new binder-control device node in the binderfs mount
- * referred to by @sb.
- *
- * Return: 0 on success, negative errno on failure
- */
-static int binderfs_binder_ctl_create(struct super_block *sb)
-{
- int minor, ret;
- struct dentry *dentry;
- struct binder_device *device;
- struct inode *inode = NULL;
- struct dentry *root = sb->s_root;
- struct binderfs_info *info = sb->s_fs_info;
-#if defined(CONFIG_IPC_NS)
- bool use_reserve = (info->ipc_ns == &init_ipc_ns);
-#else
- bool use_reserve = true;
-#endif
-
- device = kzalloc(sizeof(*device), GFP_KERNEL);
- if (!device)
- return -ENOMEM;
-
- /* If we have already created a binder-control node, return. */
- if (info->control_dentry) {
- ret = 0;
- goto out;
- }
-
- ret = -ENOMEM;
- inode = new_inode(sb);
- if (!inode)
- goto out;
-
- /* Reserve a new minor number for the new device. */
- mutex_lock(&binderfs_minors_mutex);
- minor = ida_alloc_max(&binderfs_minors,
- use_reserve ? BINDERFS_MAX_MINOR :
- BINDERFS_MAX_MINOR_CAPPED,
- GFP_KERNEL);
- mutex_unlock(&binderfs_minors_mutex);
- if (minor < 0) {
- ret = minor;
- goto out;
- }
-
- inode->i_ino = SECOND_INODE;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- init_special_inode(inode, S_IFCHR | 0600,
- MKDEV(MAJOR(binderfs_dev), minor));
- inode->i_fop = &binder_ctl_fops;
- inode->i_uid = info->root_uid;
- inode->i_gid = info->root_gid;
-
- refcount_set(&device->ref, 1);
- device->binderfs_inode = inode;
- device->miscdev.minor = minor;
-
- dentry = d_alloc_name(root, "binder-control");
- if (!dentry)
- goto out;
-
- inode->i_private = device;
- info->control_dentry = dentry;
- d_add(dentry, inode);
-
- return 0;
-
-out:
- kfree(device);
- iput(inode);
-
- return ret;
-}
-
-static const struct inode_operations binderfs_dir_inode_operations = {
- .lookup = simple_lookup,
- .rename = binderfs_rename,
- .unlink = binderfs_unlink,
-};
-
-static struct inode *binderfs_make_inode(struct super_block *sb, int mode)
-{
- struct inode *ret;
-
- ret = new_inode(sb);
- if (ret) {
- ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET);
- ret->i_mode = mode;
- ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret);
- }
- return ret;
-}
-
-static struct dentry *binderfs_create_dentry(struct dentry *parent,
- const char *name)
-{
- struct dentry *dentry;
-
- dentry = lookup_one_len(name, parent, strlen(name));
- if (IS_ERR(dentry))
- return dentry;
-
- /* Return error if the file/dir already exists. */
- if (d_really_is_positive(dentry)) {
- dput(dentry);
- return ERR_PTR(-EEXIST);
- }
-
- return dentry;
-}
-
-void binderfs_remove_file(struct dentry *dentry)
-{
- struct inode *parent_inode;
-
- parent_inode = d_inode(dentry->d_parent);
- inode_lock(parent_inode);
- if (simple_positive(dentry)) {
- dget(dentry);
- simple_unlink(parent_inode, dentry);
- d_delete(dentry);
- dput(dentry);
- }
- inode_unlock(parent_inode);
-}
-
-struct dentry *binderfs_create_file(struct dentry *parent, const char *name,
- const struct file_operations *fops,
- void *data)
-{
- struct dentry *dentry;
- struct inode *new_inode, *parent_inode;
- struct super_block *sb;
-
- parent_inode = d_inode(parent);
- inode_lock(parent_inode);
-
- dentry = binderfs_create_dentry(parent, name);
- if (IS_ERR(dentry))
- goto out;
-
- sb = parent_inode->i_sb;
- new_inode = binderfs_make_inode(sb, S_IFREG | 0444);
- if (!new_inode) {
- dput(dentry);
- dentry = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- new_inode->i_fop = fops;
- new_inode->i_private = data;
- d_instantiate(dentry, new_inode);
- fsnotify_create(parent_inode, dentry);
-
-out:
- inode_unlock(parent_inode);
- return dentry;
-}
-
-static struct dentry *binderfs_create_dir(struct dentry *parent,
- const char *name)
-{
- struct dentry *dentry;
- struct inode *new_inode, *parent_inode;
- struct super_block *sb;
-
- parent_inode = d_inode(parent);
- inode_lock(parent_inode);
-
- dentry = binderfs_create_dentry(parent, name);
- if (IS_ERR(dentry))
- goto out;
-
- sb = parent_inode->i_sb;
- new_inode = binderfs_make_inode(sb, S_IFDIR | 0755);
- if (!new_inode) {
- dput(dentry);
- dentry = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- new_inode->i_fop = &simple_dir_operations;
- new_inode->i_op = &simple_dir_inode_operations;
-
- set_nlink(new_inode, 2);
- d_instantiate(dentry, new_inode);
- inc_nlink(parent_inode);
- fsnotify_mkdir(parent_inode, dentry);
-
-out:
- inode_unlock(parent_inode);
- return dentry;
-}
-
-static int binder_features_show(struct seq_file *m, void *unused)
-{
- bool *feature = m->private;
-
- seq_printf(m, "%d\n", *feature);
-
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(binder_features);
-
-static int init_binder_features(struct super_block *sb)
-{
- struct dentry *dentry, *dir;
-
- dir = binderfs_create_dir(sb->s_root, "features");
- if (IS_ERR(dir))
- return PTR_ERR(dir);
-
- dentry = binderfs_create_file(dir, "oneway_spam_detection",
- &binder_features_fops,
- &binder_features.oneway_spam_detection);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- return 0;
-}
-
-static int init_binder_logs(struct super_block *sb)
-{
- struct dentry *binder_logs_root_dir, *dentry, *proc_log_dir;
- const struct binder_debugfs_entry *db_entry;
- struct binderfs_info *info;
- int ret = 0;
-
- binder_logs_root_dir = binderfs_create_dir(sb->s_root,
- "binder_logs");
- if (IS_ERR(binder_logs_root_dir)) {
- ret = PTR_ERR(binder_logs_root_dir);
- goto out;
- }
-
- binder_for_each_debugfs_entry(db_entry) {
- dentry = binderfs_create_file(binder_logs_root_dir,
- db_entry->name,
- db_entry->fops,
- db_entry->data);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- goto out;
- }
- }
-
- proc_log_dir = binderfs_create_dir(binder_logs_root_dir, "proc");
- if (IS_ERR(proc_log_dir)) {
- ret = PTR_ERR(proc_log_dir);
- goto out;
- }
- info = sb->s_fs_info;
- info->proc_log_dir = proc_log_dir;
-
-out:
- return ret;
-}
-
-static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc)
-{
- int ret;
- struct binderfs_info *info;
- struct binderfs_mount_opts *ctx = fc->fs_private;
- struct inode *inode = NULL;
- struct binderfs_device device_info = {};
- const char *name;
- size_t len;
-
- sb->s_blocksize = PAGE_SIZE;
- sb->s_blocksize_bits = PAGE_SHIFT;
-
- /*
- * The binderfs filesystem can be mounted by userns root in a
- * non-initial userns. By default such mounts have the MS_NODEV flag
- * set in s_iflags to prevent security issues where userns root can
- * just create random device nodes via mknod() since it owns the
- * filesystem mount. But binderfs does not allow to create any files
- * including devices nodes. The only way to create binder devices nodes
- * is through the binder-control device which userns root is explicitly
- * allowed to do. So removing the MS_NODEV flag from s_iflags is both
- * necessary and safe.
- */
- sb->s_iflags &= ~MS_NODEV;
- sb->s_iflags |= SB_I_NOEXEC;
- sb->s_magic = BINDERFS_SUPER_MAGIC;
- sb->s_op = &binderfs_super_ops;
- sb->s_time_gran = 1;
-
- sb->s_fs_info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL);
- if (!sb->s_fs_info)
- return -ENOMEM;
- info = sb->s_fs_info;
-
- info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);
-
- info->root_gid = make_kgid(sb->s_user_ns, 0);
- if (!gid_valid(info->root_gid))
- info->root_gid = GLOBAL_ROOT_GID;
- info->root_uid = make_kuid(sb->s_user_ns, 0);
- if (!uid_valid(info->root_uid))
- info->root_uid = GLOBAL_ROOT_UID;
- info->mount_opts.max = ctx->max;
- info->mount_opts.stats_mode = ctx->stats_mode;
-
- inode = new_inode(sb);
- if (!inode)
- return -ENOMEM;
-
- inode->i_ino = FIRST_INODE;
- inode->i_fop = &simple_dir_operations;
- inode->i_mode = S_IFDIR | 0755;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- inode->i_op = &binderfs_dir_inode_operations;
- set_nlink(inode, 2);
-
- sb->s_root = d_make_root(inode);
- if (!sb->s_root)
- return -ENOMEM;
-
- ret = binderfs_binder_ctl_create(sb);
- if (ret)
- return ret;
-
- name = binder_devices_param;
- for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) {
- strscpy(device_info.name, name, len + 1);
- ret = binderfs_binder_device_create(inode, NULL, &device_info);
- if (ret)
- return ret;
- name += len;
- if (*name == ',')
- name++;
- }
-
- ret = init_binder_features(sb);
- if (ret)
- return ret;
-
- if (info->mount_opts.stats_mode == binderfs_stats_mode_global)
- return init_binder_logs(sb);
-
- return 0;
-}
-
-static int binderfs_fs_context_get_tree(struct fs_context *fc)
-{
- return get_tree_nodev(fc, binderfs_fill_super);
-}
-
-static void binderfs_fs_context_free(struct fs_context *fc)
-{
- struct binderfs_mount_opts *ctx = fc->fs_private;
-
- kfree(ctx);
-}
-
-static const struct fs_context_operations binderfs_fs_context_ops = {
- .free = binderfs_fs_context_free,
- .get_tree = binderfs_fs_context_get_tree,
- .parse_param = binderfs_fs_context_parse_param,
- .reconfigure = binderfs_fs_context_reconfigure,
-};
-
-static int binderfs_init_fs_context(struct fs_context *fc)
-{
- struct binderfs_mount_opts *ctx;
-
- ctx = kzalloc(sizeof(struct binderfs_mount_opts), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
-
- ctx->max = BINDERFS_MAX_MINOR;
- ctx->stats_mode = binderfs_stats_mode_unset;
-
- fc->fs_private = ctx;
- fc->ops = &binderfs_fs_context_ops;
-
- return 0;
-}
-
-static struct file_system_type binder_fs_type = {
- .name = "binder",
- .init_fs_context = binderfs_init_fs_context,
- .parameters = binderfs_fs_parameters,
- .kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
-};
-
-int __init init_binderfs(void)
-{
- int ret;
- const char *name;
- size_t len;
-
- /* Verify that the default binderfs device names are valid. */
- name = binder_devices_param;
- for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) {
- if (len > BINDERFS_MAX_NAME)
- return -E2BIG;
- name += len;
- if (*name == ',')
- name++;
- }
-
- /* Allocate new major number for binderfs. */
- ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR,
- "binder");
- if (ret)
- return ret;
-
- ret = register_filesystem(&binder_fs_type);
- if (ret) {
- unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR);
- return ret;
- }
-
- return ret;
-}
diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c
deleted file mode 100644
index fd718ab02392..000000000000
--- a/drivers/android/vendor_hooks.c
+++ /dev/null
@@ -1,433 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* vendor_hook.c
- *
- * Android Vendor Hook Support
- *
- * Copyright 2020 Google LLC
- */
-
-#ifndef __GENKSYMS__
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include <../fs/mount.h>
-#include <../kernel/audit.h>
-#include <../kernel/locking/mutex.h>
-#include <../net/can/af_can.h>
-#include <../net/tipc/bearer.h>
-#include <../kernel/printk/printk_ringbuffer.h>
-#endif
-
-#define CREATE_TRACE_POINTS
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#ifdef __GENKSYMS__
-#include
-#endif
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#ifdef __GENKSYMS__
-#include
-#endif
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-/*
- * Export tracepoints that act as a bare tracehook (ie: have no trace event
- * associated with them) to allow external modules to probe them.
- */
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_refrigerator);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_alloc);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sk_free);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_alloc);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_nf_conn_free);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_arch_set_freq_scale);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_is_fpsimd_save);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_transaction_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_priority_skip);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_set_priority);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_restore_priority);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wakeup_ilocked);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_send_sig_info);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_futex_plist_add);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_sleep_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_futex);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wait_end);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_traverse_plist);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_this);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_futex_wake_up_q_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_wait_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_process_killed);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_killed_process);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_can_spin_on_owner);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_read_wait_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_wait_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_opt_spin_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_opt_spin_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_can_spin_on_owner);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sched_show_task);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_enter);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_exit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mpam_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_resume);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_suspend);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_wq_lockup_pool);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ipi_stop);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sysrq_crash);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_hotplug);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller_id);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_caller);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_printk_ext_header);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_gic_v3_set_affinity);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_set_affinity);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_v3_affinity_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_suspend_epoch_val);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_resume_epoch_val);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_table_limits);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_resolve_freq);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_fast_switch);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_target);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_offline);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_skip_swapcache_flags);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_gfp_zone_flags);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_readahead_gfp_mask);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_readahead_gfp_mask);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_rmqueue_bulk);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_disable);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_preempt_enable);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_disable);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_irqs_enable);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_attach);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_can_attach);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_cgroup_online);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_enter);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_oops_exit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_size_check);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_format_check);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ftrace_dump_buffer);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_fill_prdt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_complete_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_reprogram_all_keys);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_prepare_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sysfs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_compl_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_set_task);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_syscall_prctl_finished);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_uic_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_tm_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_check_int_errors);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sdev);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_clock_scaling);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_use_mcq_hooks);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_max_tag);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_map_tag);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_set_sqid);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_handler);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_make_hba_operational);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_hba_capabilities);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_print_trs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_send_command);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_config);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_has_oustanding_reqs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_get_outstanding_reqs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_abort);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_clear_cmd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_clear_pending);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_mcq_retry_complete);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_attach);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_setup_dma_ops);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_alloc_iova);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_free_iova);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ptype_head);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kfree_skb);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_timer_calc_index);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_allow_domain_state);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_enter);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_exit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cgroup_force_kthread_migration);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_wait_for_work);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction_entry);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_select_worklist_ilocked);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sync_txn_recvd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_topology_flags_workfn);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpufreq_transition);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_add_request);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_update_request);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_freq_qos_remove_request);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_set_balance_anon_file_reclaim);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_show_max_freq);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_handle_failed_page_trylock);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_clear);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_trylock_get_result);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_page_trylock);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_referenced_check_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drain_all_pages_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_drain_all_pages_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pcplist_add_cma_pages_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_slab_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_insert);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_delete);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_node_replace);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_avc_lookup);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_commit_creds);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_exit_creds);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_override_creds);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_revert_creds);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_mutex_lock_starttime);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_rtmutex_lock_starttime);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_rwsem_lock_starttime);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_record_pcpu_rwsem_starttime);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_nx);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_memory_rw);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_before_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_module_permit_after_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_oom_check_panic);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_mmap_file);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_file_open);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_bpf_syscall);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_logbuf_pr_cont);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_uninterruptible_tasks_dn);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_meminfo_proc_show);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_exit_mm);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_slowpath);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_mem);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_print_slabinfo_header);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_shrink_slab);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cache_show);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_report_bug);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_watchdog_timer_softlockup);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo_logging);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_freeze_todo_unfrozen);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_die_kernel_fault);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sea);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_mem_abort);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_sp_pc_abort);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_undefinstr);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_ptrauth_fault);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_panic_unhandled);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_arm64_serror_panic);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_do_serror);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_vmpressure);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sha256);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_expandkey);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_encrypt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_decrypt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_request_freq);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_target_freq);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_register);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_unregister);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_disable_thermal_cooling_stats);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_enable_thermal_power_throttle);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_throttle_update);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_finished);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_rwsem_list_add);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_power_cap);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_tk_based_time_sync);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kswapd_per_node);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_vendor_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_ep_action);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_synctype);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_suspend);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_audio_usb_offload_connect);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_audio_usb_offload_disconnect);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_atomic_remove_fb);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_drm_atomic_check_modeset);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_get_thermal_zone_device);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_psci_tos_resident_on);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_psci_cpu_suspend);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_usb_new_device_added);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_regmap_update);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_mutex_list_add);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_unlock_slowpath);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake_finish);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dma_buf_release);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dmabuf_heap_flags_validation);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pass_input_event);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_check_status);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmap_region);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_try_to_unmap_one);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_shrink_node_memcgs);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sdio_pm_flag_set);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_scan_type);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_swappiness);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_partial_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_cache_card_properties);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_print_transaction_info);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_handle_tlb_conf);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_init);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_memcgv2_calc_decayed_watermark);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_watermark);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_reset);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_mq_rw_recovery);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sd_update_bus_speed_mode);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_attach_sd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sdhci_get_cd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_gpio_cd_irqt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_update_partition_status);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sd_update_cmdline_timing);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_sd_update_dataline_timing);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_partition_status);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_sd_cmdline_timing);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_mmc_sd_dataline_timing);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cfg80211_set_context);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cfg80211_get_context);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_track_hash);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_id_remove);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_offline);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_css_online);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_free);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mem_cgroup_alloc);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kmalloc_slab);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpuset_fork);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_looper_state_registered);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_thread_read);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_free_proc);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_thread_release);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_has_work_ilocked);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_read_done);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_gic_v2_resume);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_exit_signal);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_alloc_new_buf_locked);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_reply);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_trans);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_preset);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_memcg_scan_type);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_thermal_stats);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_proc_transaction);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_new_ref);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_binder_del_ref);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_show_mapcount_pages);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_traversal_lruvec);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_page_mapcount);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_page_to_lrulist);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_del_page_from_lrulist);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_should_be_protected);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mark_page_accessed);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_ffu_update_cid);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_uid);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_free_user);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_thermal_cpu_get_power);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_page_cache_forced_ra);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reclaim_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_failure_bypass);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpufreq_acct_update_power);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rmqueue);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_tune_inactive_ratio);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_hibernation_swap);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_cpu_resume);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_save_hib_resume_bdev);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_dma_buf_stats_teardown);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_madvise_cold_or_pageout);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_retry);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_encrypt_page);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_init_aes_encrypt);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_skip_swap_map_write);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_post_image_save);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_dm_update_clone_bio);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ctl_dirty_rate);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_direct_io_update_bio);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_loop_prepare_cmd);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psi_event);
-EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_psi_group);
-/*
- * For type visibility
- */
-const struct readahead_control *GKI_struct_readahead_control;
-EXPORT_SYMBOL_GPL(GKI_struct_readahead_control);
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 6cbdf2737004..378717d1b3b4 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -377,19 +377,11 @@ static int input_get_disposition(struct input_dev *dev,
return disposition;
}
-#ifdef CONFIG_KSU_MANUAL_HOOK
-extern bool ksu_input_hook __read_mostly;
-extern int ksu_handle_input_handle_event(unsigned int *type, unsigned int *code, int *value);
-#endif
static void input_handle_event(struct input_dev *dev,
unsigned int type, unsigned int code, int value)
{
int disposition = input_get_disposition(dev, type, code, &value);
-#ifdef CONFIG_KSU_MANUAL_HOOK
- if (unlikely(ksu_input_hook))
- ksu_handle_input_handle_event(&type, &code, &value);
-#endif
if (disposition != INPUT_IGNORE_EVENT && type != EV_SYN)
add_input_randomness(type, code, value);
diff --git a/drivers/input/touchscreen/fts_521/fts.c b/drivers/input/touchscreen/fts_521/fts.c
index 4722476a4c2d..e0750f28a609 100644
--- a/drivers/input/touchscreen/fts_521/fts.c
+++ b/drivers/input/touchscreen/fts_521/fts.c
@@ -2723,15 +2723,15 @@ static void fts_enter_pointer_event_handler(struct fts_ts_info *info,
input_report_key(info->input_dev, BTN_TOOL_FINGER, 1);
/*input_report_abs(info->input_dev, ABS_MT_TRACKING_ID, touchId); */
- input_report_abs(info->input_dev, ABS_MT_POSITION_X, x);
- input_report_abs(info->input_dev, ABS_MT_POSITION_Y, y);
- input_report_abs(info->input_dev, ABS_MT_TOUCH_MAJOR, z);
- input_report_abs(info->input_dev, ABS_MT_TOUCH_MINOR, z);
- input_report_abs(info->input_dev, ABS_MT_DISTANCE, distance);
+ input_report_abs(info->input_dev, ABS_MT_POSITION_X, x);
+ input_report_abs(info->input_dev, ABS_MT_POSITION_Y, y);
+ input_report_abs(info->input_dev, ABS_MT_TOUCH_MAJOR, z);
+ input_report_abs(info->input_dev, ABS_MT_TOUCH_MINOR, z);
+ input_report_abs(info->input_dev, ABS_MT_DISTANCE, distance);
#ifdef CONFIG_INPUT_PRESS_NDT
- input_report_abs(info->input_dev, ABS_MT_PRESSURE, z);
+ input_report_abs(info->input_dev, ABS_MT_PRESSURE, z);
#endif
- input_sync(info->input_dev);
+ input_sync(info->input_dev);
/* pr_info("%s: Event 0x%02x - ID[%d], (x, y, z) = (%3d, %3d, %3d) type = %d\n",
__func__, *event, touchId, x, y, z, touchType); */
diff --git a/drivers/kernelsu/Kbuild b/drivers/kernelsu/Kbuild
deleted file mode 100644
index 800da52d0892..000000000000
--- a/drivers/kernelsu/Kbuild
+++ /dev/null
@@ -1,26 +0,0 @@
-obj-y += ksuinit.o
-obj-y += allowlist.o
-obj-y += app_profile.o
-obj-y += apk_sign.o
-obj-y += sucompat.o
-obj-y += throne_tracker.o
-obj-y += setuid_hook.o
-obj-y += kernel_compat.o
-obj-y += kernel_umount.o
-obj-y += supercalls.o
-obj-y += feature.o
-obj-y += ksud.o
-obj-y += file_wrapper.o
-obj-y += su_mount_ns.o
-obj-y += shim.o
-obj-y += selinux/selinux.o
-obj-y += selinux/sepolicy.o
-obj-y += selinux/rules.o
-
-ccflags-y += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include
-ccflags-y += -I$(objtree)/security/selinux -include $(srctree)/include/uapi/asm-generic/errno.h
-
-ccflags-y += -Wno-strict-prototypes -Wno-int-conversion -Wno-gcc-compat
-ccflags-y += -Wno-declaration-after-statement -Wno-unused-function -Wno-missing-prototypes
-
-# Keep a new line here !! Because someone may append config
diff --git a/drivers/kernelsu/Kconfig b/drivers/kernelsu/Kconfig
index 8464a6c4ca4b..10608831444f 100644
--- a/drivers/kernelsu/Kconfig
+++ b/drivers/kernelsu/Kconfig
@@ -1,48 +1,77 @@
menu "KernelSU"
config KSU
- tristate "KernelSU function support"
- default y
+ bool "KernelSU function support"
+ depends on !CPU_BIG_ENDIAN
+ depends on SECURITY_SELINUX
+ select SECCOMP
+ default n
help
Enable kernel-level root privileges on Android System.
- To compile as a module, choose M here: the
- module will be called kernelsu.
-config KSU_DEBUG
- bool "KernelSU debug mode"
- depends on KSU
+config KSU_KPROBES_KSUD
+ bool "Enable dynamic kprobes for early boot hooks"
+ depends on KPROBES && KRETPROBES
+ default y
+ help
+ Use dynamic hooks via kprobes for functions only
+ on early boot. Hooks are unregistered at boot complete
+ to reduce overhead.
+
+config KSU_TAMPER_SYSCALL_TABLE
+ bool "EXPERIMENTAL: tamper sys_call_table for sucompat + sys_reboot"
+ depends on (ARM || ARM64) && !CFI_CLANG && !CFI
default n
help
- Enable KernelSU debug mode.
+ EXPERIMENTAL: use syscall table hijacking method demonstrated on zx2c4's
+ kernel-assisted-superuser. Replaces sys_reboot, sys_execve, sys_newfstatat,
+ sys_faccessat, sys_newfstat_ret manual hooks.
+ Personally tested on Linux 3.10 ~ 4.14, aarch64.
-config KSU_ALLOWLIST_WORKAROUND
- bool "KernelSU allowlist workaround"
+config KSU_FEATURE_SULOG
+ bool "KernelSU SU Logging feature"
depends on KSU
- default n
+ default y
help
- Enable workaround for broken allowlist save
+ Build KernelSU's SU Log.
-choice
- prompt "KernelSU hooks"
- default KSU_MANUAL_HOOK if !KPROBES
- default KSU_SYSCALL_HOOK if KPROBES && KRETPROBES && HAVE_SYSCALL_TRACEPOINTS
+config KSU_FEATURE_ADBROOT
+ bool "KernelSU ADB Root feature"
+ depends on KSU
+ default y
help
- KernelSU core hooks.
+ Build KernelSU's adb root feature.
-config KSU_MANUAL_HOOK
- bool "KernelSU manual hook mode."
- depends on KSU && KSU != m
+config KSU_FEATURE_SELINUX_HIDE
+ bool "KernelSU SELinux hide feature"
+ depends on KSU
+ default y
help
- Enable manual hook support.
+ Build KernelSU's SELinux hide feature.
+ This is a dumber implementation, but it should be fine for most cases.
-config KSU_SYSCALL_HOOK
- bool "KernelSU syscall hook mode."
+config KSU_DEBUG
+ bool "KernelSU debug mode"
depends on KSU
- depends on KPROBES && KRETPROBES && HAVE_SYSCALL_TRACEPOINTS
+ default n
help
- Enable KPROBES, KRETPROBES and TRACEPOINT hook for KernelSU core.
- This should not be used on kernel below 5.10.
+ Enable KernelSU debug mode.
-endchoice
+config KSU_THRONE_TRACKER_ALWAYS_THREADED
+ bool "Always run throne tracker in a kthread"
+ default n
+ help
+ Enable this option to run throne tracker in a kthread for the first
+ run, which happens at boot time / decryption stage. This can decrease
+ boot time, but can cause crowning failure on some FDE/FBEv1 setups.
+ If unsure, say n.
+
+config KSU_LSM_SECURITY_HOOKS
+ bool "Use LSM security hooks"
+ depends on KSU
+ default y
+ help
+ Disabling this is mostly useful for kernel > 6.8.
+ Make sure to implement manual hooks on security/security.c.
endmenu
diff --git a/drivers/kernelsu/Makefile b/drivers/kernelsu/Makefile
new file mode 100644
index 000000000000..7c2fcedc7eac
--- /dev/null
+++ b/drivers/kernelsu/Makefile
@@ -0,0 +1,78 @@
+# NOTE: unity build. single unit.
+
+obj-$(CONFIG_KSU) := ksu.o
+
+CFLAGS_ksu.o += -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include
+CFLAGS_ksu.o += -I$(objtree)/security/selinux
+
+# uncommon, but wont hurt, check for 3-arg security_add_hooks
+ifeq ($(shell grep -A1 "void security_add_hooks" $(srctree)/include/linux/lsm_hooks.h 2>/dev/null | grep -q lsm 2>/dev/null; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_SECURITY_ADD_HOOKS_V2
+endif
+
+ifeq ($(shell grep -q " current_sid(void)" $(srctree)/security/selinux/include/objsec.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_HAS_CURRENT_SID
+endif
+
+ifeq ($(shell grep -q "struct selinux_state " $(srctree)/security/selinux/include/security.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_HAS_SELINUX_STATE
+endif
+
+ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT
+endif
+
+# half-assed-backport from 5.1
+ifeq ($(shell grep -q "struct type_datum \*\*type_val_to_struct_array;" $(srctree)/security/selinux/ss/policydb.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_TYPE_VAL_TO_STRUCT_ARRAY
+endif
+
+ifeq ($(shell grep -q "^DEFINE_RWLOCK(policy_rwlock);" $(srctree)/security/selinux/ss/services.c; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_HAS_EXPORTED_POLICY_RWLOCK
+endif
+
+ifeq ($(shell grep -q "cpus_ptr;" $(srctree)/include/linux/sched.h; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_COMPAT_HAS_BACKPORTED_CPUS_PTR
+endif
+
+ifeq ($(shell grep -q "^struct security_operations selinux_ops" $(srctree)/security/selinux/hooks.c; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_HAS_EXPORTED_SELINUX_OPS
+endif
+
+# UL, look for read_iter on f_op struct
+ifeq ($(shell grep -q "read_iter" $(srctree)/include/linux/fs.h 2>/dev/null; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_HAS_FOP_READ_ITER
+endif
+
+# UL, look for iterate_dir on fs/readdir.c
+ifeq ($(shell grep -q "^int iterate_dir" $(srctree)/fs/readdir.c 2>/dev/null; echo $$?),0)
+CFLAGS_ksu.o += -DKSU_HAS_ITERATE_DIR
+endif
+
+CFLAGS_ksu.o += -Wno-implicit-function-declaration -Wno-strict-prototypes -Wno-declaration-after-statement
+CFLAGS_ksu.o += -Wno-int-conversion -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast
+CFLAGS_ksu.o += -Wno-unused-variable -Wno-unused-function -Wno-format
+CFLAGS_ksu.o += -Wno-macro-redefined
+
+# dont be too strict
+CFLAGS_REMOVE_ksu.o += -Werror
+
+# so we can see stack use atleast, as we disable all stack safety here
+CFLAGS_ksu.o += $(call cc-option, -Wframe-larger-than=1024)
+
+# to make sure we can use builtins
+CFLAGS_REMOVE_ksu.o += -fno-builtin
+
+ifneq ($(CONFIG_KSU_DEBUG),y)
+# strip, remove tracing / profiling
+# comment out if proper backtrace is needed
+CFLAGS_ksu.o += -g0 -fno-unwind-tables -fno-asynchronous-unwind-tables -fomit-frame-pointer
+CFLAGS_REMOVE_ksu.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_ksu.o += -pg
+
+# if cflags can be macro'd, this will be called 'TRUST_ME'
+CFLAGS_ksu.o += -fno-stack-protector -fno-stack-check
+CFLAGS_REMOVE_ksu.o += -fsanitize=shadow-call-stack
+endif # CONFIG_KSU_DEBUG
+
+# Keep a new line here!! Because someone may append config
diff --git a/drivers/kernelsu/allowlist.c b/drivers/kernelsu/allowlist.c
deleted file mode 100644
index 9152b7174b6c..000000000000
--- a/drivers/kernelsu/allowlist.c
+++ /dev/null
@@ -1,576 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-#include
-#else
-#include
-#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
-#include
-#endif
-
-#include "klog.h" // IWYU pragma: keep
-#include "ksud.h"
-#include "selinux/selinux.h"
-#include "allowlist.h"
-#include "manager.h"
-#include "kernel_compat.h"
-#include "su_mount_ns.h"
-#ifdef CONFIG_KSU_SYSCALL_HOOK
-#include "syscall_handler.h"
-#endif
-
-#define FILE_MAGIC 0x7f4b5355 // ' KSU', u32
-#define FILE_FORMAT_VERSION 3 // u32
-
-#define KSU_APP_PROFILE_PRESERVE_UID 9999 // NOBODY_UID
-#define KSU_DEFAULT_SELINUX_DOMAIN "u:r:" KERNEL_SU_DOMAIN ":s0"
-
-static DEFINE_MUTEX(allowlist_mutex);
-
-// default profiles, these may be used frequently, so we cache it
-static struct root_profile default_root_profile;
-static struct non_root_profile default_non_root_profile;
-
-static int allow_list_arr[PAGE_SIZE / sizeof(int)] __read_mostly
- __aligned(PAGE_SIZE);
-static int allow_list_pointer __read_mostly = 0;
-
-static void remove_uid_from_arr(uid_t uid)
-{
- int *temp_arr;
- int i, j;
-
- if (allow_list_pointer == 0)
- return;
-
- temp_arr = kzalloc(sizeof(allow_list_arr), GFP_KERNEL);
- if (temp_arr == NULL) {
- pr_err("%s: unable to allocate memory\n", __func__);
- return;
- }
-
- for (i = j = 0; i < allow_list_pointer; i++) {
- if (allow_list_arr[i] == uid)
- continue;
- temp_arr[j++] = allow_list_arr[i];
- }
-
- allow_list_pointer = j;
-
- for (; j < ARRAY_SIZE(allow_list_arr); j++)
- temp_arr[j] = -1;
-
- memcpy(&allow_list_arr, temp_arr, PAGE_SIZE);
- kfree(temp_arr);
-}
-
-static void init_default_profiles(void)
-{
- kernel_cap_t full_cap = CAP_FULL_SET;
-
- default_root_profile.uid = 0;
- default_root_profile.gid = 0;
- default_root_profile.groups_count = 1;
- default_root_profile.groups[0] = 0;
- memcpy(&default_root_profile.capabilities.effective, &full_cap,
- sizeof(default_root_profile.capabilities.effective));
- default_root_profile.namespaces = KSU_NS_INHERITED;
- strcpy(default_root_profile.selinux_domain, KSU_DEFAULT_SELINUX_DOMAIN);
-
- // This means that we will umount modules by default!
- default_non_root_profile.umount_modules = true;
-}
-
-struct perm_data {
- struct list_head list;
- struct app_profile profile;
-};
-
-static struct list_head allow_list;
-
-static uint8_t allow_list_bitmap[PAGE_SIZE] __read_mostly __aligned(PAGE_SIZE);
-#define BITMAP_UID_MAX ((sizeof(allow_list_bitmap) * BITS_PER_BYTE) - 1)
-
-#define KERNEL_SU_ALLOWLIST "/data/adb/ksu/.allowlist"
-
-void persistent_allow_list(void);
-
-void ksu_show_allow_list(void)
-{
- struct perm_data *p = NULL;
- struct list_head *pos = NULL;
- pr_info("ksu_show_allow_list\n");
- list_for_each (pos, &allow_list) {
- p = list_entry(pos, struct perm_data, list);
- pr_info("uid :%d, allow: %d\n", p->profile.current_uid,
- p->profile.allow_su);
- }
-}
-
-#ifdef CONFIG_KSU_DEBUG
-static void ksu_grant_root_to_shell(void)
-{
- struct app_profile profile = {
- .version = KSU_APP_PROFILE_VER,
- .allow_su = true,
- .current_uid = 2000,
- };
- strcpy(profile.key, "com.android.shell");
- strcpy(profile.rp_config.profile.selinux_domain,
- KSU_DEFAULT_SELINUX_DOMAIN);
- ksu_set_app_profile(&profile, false);
-}
-#endif
-
-bool ksu_get_app_profile(struct app_profile *profile)
-{
- struct perm_data *p = NULL;
- struct list_head *pos = NULL;
- bool found = false;
-
- list_for_each (pos, &allow_list) {
- p = list_entry(pos, struct perm_data, list);
- bool uid_match = profile->current_uid == p->profile.current_uid;
- if (uid_match) {
- // found it, override it with ours
- memcpy(profile, &p->profile, sizeof(*profile));
- found = true;
- goto exit;
- }
- }
-
-exit:
- return found;
-}
-
-static inline bool forbid_system_uid(uid_t uid)
-{
-#define SHELL_UID 2000
-#define SYSTEM_UID 1000
- return uid < SHELL_UID && uid != SYSTEM_UID;
-}
-
-static bool profile_valid(struct app_profile *profile)
-{
- if (!profile) {
- return false;
- }
-
- if (profile->version < KSU_APP_PROFILE_VER) {
- pr_info("Unsupported profile version: %d\n", profile->version);
- return false;
- }
-
- if (profile->allow_su) {
- if (profile->rp_config.profile.groups_count > KSU_MAX_GROUPS) {
- return false;
- }
-
- if (strlen(profile->rp_config.profile.selinux_domain) == 0) {
- return false;
- }
- }
-
- return true;
-}
-
-bool ksu_set_app_profile(struct app_profile *profile, bool persist)
-{
- struct perm_data *p = NULL;
- struct list_head *pos = NULL;
- bool result = false;
-
- if (!profile_valid(profile)) {
- pr_err("Failed to set app profile: invalid profile!\n");
- return false;
- }
-
- list_for_each (pos, &allow_list) {
- p = list_entry(pos, struct perm_data, list);
- // both uid and package must match, otherwise it will break multiple package with different user id
- if (profile->current_uid == p->profile.current_uid &&
- !strcmp(profile->key, p->profile.key)) {
- // found it, just override it all!
- memcpy(&p->profile, profile, sizeof(*profile));
- result = true;
- goto out;
- }
- }
-
- // not found, alloc a new node!
- p = (struct perm_data *)kzalloc(sizeof(struct perm_data), GFP_KERNEL);
- if (!p) {
- pr_err("ksu_set_app_profile alloc failed\n");
- return false;
- }
-
- memcpy(&p->profile, profile, sizeof(*profile));
- if (profile->allow_su) {
- pr_info("set root profile, key: %s, uid: %d, gid: %d, context: %s\n",
- profile->key, profile->current_uid,
- profile->rp_config.profile.gid,
- profile->rp_config.profile.selinux_domain);
- } else {
- pr_info("set app profile, key: %s, uid: %d, umount modules: %d\n",
- profile->key, profile->current_uid,
- profile->nrp_config.profile.umount_modules);
- }
- list_add_tail(&p->list, &allow_list);
-
-out:
- if (profile->current_uid <= BITMAP_UID_MAX) {
- if (profile->allow_su)
- allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] |=
- 1 << (profile->current_uid % BITS_PER_BYTE);
- else
- allow_list_bitmap[profile->current_uid / BITS_PER_BYTE] &=
- ~(1 << (profile->current_uid % BITS_PER_BYTE));
- } else {
- if (profile->allow_su) {
- /*
- * 1024 apps with uid higher than BITMAP_UID_MAX
- * registered to request superuser?
- */
- if (allow_list_pointer >= ARRAY_SIZE(allow_list_arr)) {
- pr_err("too many apps registered\n");
- WARN_ON(1);
- return false;
- }
- allow_list_arr[allow_list_pointer++] =
- profile->current_uid;
- } else {
- remove_uid_from_arr(profile->current_uid);
- }
- }
- result = true;
-
- // check if the default profiles is changed, cache it to a single struct to accelerate access.
- if (unlikely(!strcmp(profile->key, "$"))) {
- // set default non root profile
- memcpy(&default_non_root_profile, &profile->nrp_config.profile,
- sizeof(default_non_root_profile));
- }
-
- if (unlikely(!strcmp(profile->key, "#"))) {
- // set default root profile
- memcpy(&default_root_profile, &profile->rp_config.profile,
- sizeof(default_root_profile));
- }
-
- if (persist) {
- persistent_allow_list();
-#ifdef CONFIG_KSU_SYSCALL_HOOK
- // FIXME: use a new flag
- ksu_mark_running_process();
-#endif
- }
-
- return result;
-}
-
-bool __ksu_is_allow_uid(uid_t uid)
-{
- int i;
-
- if (forbid_system_uid(uid)) {
- // do not bother going through the list if it's system
- return false;
- }
-
- if (likely(ksu_is_manager_appid_valid()) &&
- unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) {
- // manager is always allowed!
- return true;
- }
-
- if (likely(uid <= BITMAP_UID_MAX)) {
- return !!(allow_list_bitmap[uid / BITS_PER_BYTE] &
- (1 << (uid % BITS_PER_BYTE)));
- } else {
- for (i = 0; i < allow_list_pointer; i++) {
- if (allow_list_arr[i] == uid)
- return true;
- }
- }
-
- return false;
-}
-
-bool __ksu_is_allow_uid_for_current(uid_t uid)
-{
- if (unlikely(uid == 0)) {
- // already root, but only allow our domain.
- return is_ksu_domain();
- }
- return __ksu_is_allow_uid(uid);
-}
-
-bool ksu_uid_should_umount(uid_t uid)
-{
- struct app_profile profile = { .current_uid = uid };
-
- if (likely(ksu_is_manager_appid_valid()) &&
- unlikely(ksu_get_manager_appid() == uid % PER_USER_RANGE)) {
- // we should not umount on manager!
- return false;
- }
-
- bool found = ksu_get_app_profile(&profile);
- if (!found) {
- // no app profile found, it must be non root app
- return default_non_root_profile.umount_modules;
- }
- if (profile.allow_su) {
- // if found and it is granted to su, we shouldn't umount for it
- return false;
- } else {
- // found an app profile
- if (profile.nrp_config.use_default) {
- return default_non_root_profile.umount_modules;
- } else {
- return profile.nrp_config.profile.umount_modules;
- }
- }
-}
-
-struct root_profile *ksu_get_root_profile(uid_t uid)
-{
- struct perm_data *p = NULL;
- struct list_head *pos = NULL;
-
- list_for_each (pos, &allow_list) {
- p = list_entry(pos, struct perm_data, list);
- if (uid == p->profile.current_uid && p->profile.allow_su) {
- if (!p->profile.rp_config.use_default) {
- return &p->profile.rp_config.profile;
- }
- }
- }
-
- // use default profile
- return &default_root_profile;
-}
-
-bool ksu_get_allow_list(int *array, int *length, bool allow)
-{
- struct perm_data *p = NULL;
- struct list_head *pos = NULL;
- int i = 0;
- list_for_each (pos, &allow_list) {
- p = list_entry(pos, struct perm_data, list);
- // pr_info("get_allow_list uid: %d allow: %d\n", p->uid, p->allow);
- if (p->profile.allow_su == allow) {
- array[i++] = p->profile.current_uid;
- }
- }
- *length = i;
-
- return true;
-}
-
-static void do_persistent_allow_list(struct callback_head *_cb)
-{
- u32 magic = FILE_MAGIC;
- u32 version = FILE_FORMAT_VERSION;
- struct perm_data *p = NULL;
- struct list_head *pos = NULL;
- loff_t off = 0;
-
- mutex_lock(&allowlist_mutex);
- struct file *fp = ksu_filp_open_compat(
- KERNEL_SU_ALLOWLIST, O_WRONLY | O_CREAT | O_TRUNC, 0644);
- if (IS_ERR(fp)) {
- pr_err("save_allow_list create file failed: %ld\n",
- PTR_ERR(fp));
- goto unlock;
- }
-
- // store magic and version
- if (ksu_kernel_write_compat(fp, &magic, sizeof(magic), &off) !=
- sizeof(magic)) {
- pr_err("save_allow_list write magic failed.\n");
- goto close_file;
- }
-
- if (ksu_kernel_write_compat(fp, &version, sizeof(version), &off) !=
- sizeof(version)) {
- pr_err("save_allow_list write version failed.\n");
- goto close_file;
- }
-
- list_for_each (pos, &allow_list) {
- p = list_entry(pos, struct perm_data, list);
- pr_info("save allow list, name: %s uid :%d, allow: %d\n",
- p->profile.key, p->profile.current_uid,
- p->profile.allow_su);
-
- ksu_kernel_write_compat(fp, &p->profile, sizeof(p->profile),
- &off);
- }
-
-close_file:
- filp_close(fp, 0);
-unlock:
- mutex_unlock(&allowlist_mutex);
- kfree(_cb);
-}
-
-void persistent_allow_list(void)
-{
- struct task_struct *tsk;
-
- tsk = get_pid_task(find_vpid(1), PIDTYPE_PID);
- if (!tsk) {
- pr_err("save_allow_list find init task err\n");
- return;
- }
-
- struct callback_head *cb =
- kzalloc(sizeof(struct callback_head), GFP_KERNEL);
- if (!cb) {
- pr_err("save_allow_list alloc cb err\b");
- goto put_task;
- }
- cb->func = do_persistent_allow_list;
- if (task_work_add(tsk, cb, TWA_RESUME)) {
- kfree(cb);
- pr_warn("save_allow_list add task_work failed\n");
- }
-
-put_task:
- put_task_struct(tsk);
-}
-
-void ksu_load_allow_list(void)
-{
- loff_t off = 0;
- ssize_t ret = 0;
- struct file *fp = NULL;
- u32 magic;
- u32 version;
-
-#ifdef CONFIG_KSU_DEBUG
- // always allow adb shell by default
- ksu_grant_root_to_shell();
-#endif
-
- // load allowlist now!
- fp = ksu_filp_open_compat(KERNEL_SU_ALLOWLIST, O_RDONLY, 0);
- if (IS_ERR(fp)) {
- pr_err("load_allow_list open file failed: %ld\n", PTR_ERR(fp));
- return;
- }
-
- // verify magic
- if (ksu_kernel_read_compat(fp, &magic, sizeof(magic), &off) !=
- sizeof(magic) ||
- magic != FILE_MAGIC) {
- pr_err("allowlist file invalid: %d!\n", magic);
- goto exit;
- }
-
- if (ksu_kernel_read_compat(fp, &version, sizeof(version), &off) !=
- sizeof(version)) {
- pr_err("allowlist read version: %d failed\n", version);
- goto exit;
- }
-
- pr_info("allowlist version: %d\n", version);
-
- while (true) {
- struct app_profile profile;
-
- ret = ksu_kernel_read_compat(fp, &profile, sizeof(profile),
- &off);
-
- if (ret <= 0) {
- pr_info("load_allow_list read err: %zd\n", ret);
- break;
- }
-
- pr_info("load_allow_uid, name: %s, uid: %d, allow: %d\n",
- profile.key, profile.current_uid, profile.allow_su);
- ksu_set_app_profile(&profile, false);
- }
-
-exit:
- ksu_show_allow_list();
- filp_close(fp, 0);
-}
-
-void ksu_prune_allowlist(bool (*is_uid_valid)(uid_t, char *, void *),
- void *data)
-{
- struct perm_data *np, *n = NULL;
-
- if (!ksu_boot_completed) {
- pr_info("boot not completed, skip prune\n");
- return;
- }
-
- bool modified = false;
- // TODO: use RCU!
- mutex_lock(&allowlist_mutex);
- list_for_each_entry_safe (np, n, &allow_list, list) {
- uid_t uid = np->profile.current_uid;
- char *package = np->profile.key;
- // we use this uid for special cases, don't prune it!
- bool is_preserved_uid = uid == KSU_APP_PROFILE_PRESERVE_UID;
- if (!is_preserved_uid && !is_uid_valid(uid, package, data)) {
- modified = true;
- pr_info("prune uid: %d, package: %s\n", uid, package);
- list_del(&np->list);
- if (likely(uid <= BITMAP_UID_MAX)) {
- allow_list_bitmap[uid / BITS_PER_BYTE] &=
- ~(1 << (uid % BITS_PER_BYTE));
- }
- remove_uid_from_arr(uid);
- smp_mb();
- kfree(np);
- }
- }
- mutex_unlock(&allowlist_mutex);
-
- if (modified) {
- persistent_allow_list();
- }
-}
-
-void ksu_allowlist_init(void)
-{
- int i;
-
- BUILD_BUG_ON(sizeof(allow_list_bitmap) != PAGE_SIZE);
- BUILD_BUG_ON(sizeof(allow_list_arr) != PAGE_SIZE);
-
- for (i = 0; i < ARRAY_SIZE(allow_list_arr); i++)
- allow_list_arr[i] = -1;
-
- INIT_LIST_HEAD(&allow_list);
-
- init_default_profiles();
-}
-
-void ksu_allowlist_exit(void)
-{
- struct perm_data *np, *n = NULL;
-
- // free allowlist
- mutex_lock(&allowlist_mutex);
- list_for_each_entry_safe (np, n, &allow_list, list) {
- list_del(&np->list);
- kfree(np);
- }
- mutex_unlock(&allowlist_mutex);
-}
diff --git a/drivers/kernelsu/app_profile.c b/drivers/kernelsu/app_profile.c
deleted file mode 100644
index 4d2f333ebffd..000000000000
--- a/drivers/kernelsu/app_profile.c
+++ /dev/null
@@ -1,206 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
-#include // signal_struct
-#include
-#endif
-#include
-#include