From fabfc448590f557dfa6fd0b0b116c82d2cc74de4 Mon Sep 17 00:00:00 2001 From: lean Date: Thu, 15 Sep 2022 12:11:45 +0800 Subject: [PATCH 01/11] bpftools: update to standalone bpftools + libbpf, use the latest version --- package/kernel/bpf-headers/Makefile | 1 - package/network/config/qosify/Makefile | 76 ++++++++ .../config/qosify/files/qosify-defaults.conf | 17 ++ .../network/config/qosify/files/qosify-status | 70 +++++++ .../network/config/qosify/files/qosify.conf | 48 +++++ .../config/qosify/files/qosify.hotplug | 2 + .../network/config/qosify/files/qosify.init | 171 ++++++++++++++++++ package/network/utils/bpftools/Makefile | 78 +++----- .../utils/bpftools/patches/001-cflags.patch | 10 + .../utils/bpftools/patches/002-includes.patch | 26 +++ ...pc-fix-EDEADLOCK-redefinition-errors.patch | 51 ------ 11 files changed, 449 insertions(+), 101 deletions(-) create mode 100644 package/network/config/qosify/Makefile create mode 100644 package/network/config/qosify/files/qosify-defaults.conf create mode 100644 package/network/config/qosify/files/qosify-status create mode 100644 package/network/config/qosify/files/qosify.conf create mode 100644 package/network/config/qosify/files/qosify.hotplug create mode 100644 package/network/config/qosify/files/qosify.init create mode 100644 package/network/utils/bpftools/patches/001-cflags.patch create mode 100644 package/network/utils/bpftools/patches/002-includes.patch delete mode 100644 package/network/utils/bpftools/patches/005-tools-arch-powerpc-fix-EDEADLOCK-redefinition-errors.patch diff --git a/package/kernel/bpf-headers/Makefile b/package/kernel/bpf-headers/Makefile index 4ffe5a7cc04158..6d25a7386f2e27 100644 --- a/package/kernel/bpf-headers/Makefile +++ b/package/kernel/bpf-headers/Makefile @@ -56,7 +56,6 @@ KERNEL_MAKE := \ LLVM=1 CC="$(CLANG)" LD="$(TARGET_CROSS)ld" \ HOSTCC="$(HOSTCC)" \ HOSTCXX="$(HOSTCXX)" \ - HOST_LOADLIBES="-L$(STAGING_DIR_HOST)/lib" \ KBUILD_HOSTLDLIBS="-L$(STAGING_DIR_HOST)/lib" \ CONFIG_SHELL="$(BASH)" \ INSTALL_HDR_PATH="$(PKG_BUILD_DIR)/user_headers" diff --git a/package/network/config/qosify/Makefile b/package/network/config/qosify/Makefile new file mode 100644 index 00000000000000..be0bc6657c620a --- /dev/null +++ b/package/network/config/qosify/Makefile @@ -0,0 +1,76 @@ +# +# Copyright (C) 2021 OpenWrt.org +# +# This is free software, licensed under the GNU General Public License v2. +# See /LICENSE for more information. +# + +include $(TOPDIR)/rules.mk +include $(INCLUDE_DIR)/kernel.mk + +PKG_NAME:=qosify +PKG_SOURCE_URL=$(PROJECT_GIT)/project/qosify.git +PKG_SOURCE_PROTO:=git +PKG_SOURCE_DATE:=2022-04-08 +PKG_SOURCE_VERSION:=ef82defaae26619e5b2ebddfdd86e9de61c399f1 +PKG_MIRROR_HASH:=8e4ca65d23a85aad774af51dc62cfaa4615111ffd2c7922258ac8f026a62b013 +PKG_RELEASE:=$(AUTORELEASE) + +PKG_LICENSE:=GPL-2.0 +PKG_MAINTAINER:=Felix Fietkau + +PKG_BUILD_DEPENDS:=bpf-headers +PKG_FLAGS:=nonshared + +include $(INCLUDE_DIR)/package.mk +include $(INCLUDE_DIR)/cmake.mk +include $(INCLUDE_DIR)/bpf.mk +include $(INCLUDE_DIR)/nls.mk + +define Package/qosify + SECTION:=utils + CATEGORY:=Base system + TITLE:=A simple QoS solution based eBPF + CAKE + DEPENDS:=+libbpf +libubox +libubus +libnl-tiny +kmod-sched-cake +kmod-sched-bpf +kmod-ifb +tc $(BPF_DEPENDS) +endef + +TARGET_CFLAGS += \ + -Wno-error=deprecated-declarations \ + -I$(STAGING_DIR)/usr/include/libnl-tiny \ + -I$(STAGING_DIR)/usr/include + +CMAKE_OPTIONS += \ + -DLIBNL_LIBS=-lnl-tiny + +define Build/Compile + $(call CompileBPF,$(PKG_BUILD_DIR)/qosify-bpf.c) + $(Build/Compile/Default) +endef + +define Package/qosify/conffiles +/etc/config/qosify +/etc/qosify/00-defaults.conf +endef + +define Package/qosify/install + $(INSTALL_DIR) \ + $(1)/lib/bpf \ + $(1)/usr/sbin \ + $(1)/etc/init.d \ + $(1)/etc/config \ + $(1)/etc/qosify \ + $(1)/etc/hotplug.d/net \ + $(1)/etc/hotplug.d/iface + $(INSTALL_DATA) $(PKG_BUILD_DIR)/qosify-bpf.o $(1)/lib/bpf + $(INSTALL_BIN) \ + $(PKG_INSTALL_DIR)/usr/bin/qosify \ + ./files/qosify-status \ + $(1)/usr/sbin/ + $(INSTALL_BIN) ./files/qosify.init $(1)/etc/init.d/qosify + $(INSTALL_DATA) ./files/qosify-defaults.conf $(1)/etc/qosify/00-defaults.conf + $(INSTALL_DATA) ./files/qosify.conf $(1)/etc/config/qosify + $(INSTALL_DATA) ./files/qosify.hotplug $(1)/etc/hotplug.d/net/10-qosify + $(INSTALL_DATA) ./files/qosify.hotplug $(1)/etc/hotplug.d/iface/10-qosify +endef + +$(eval $(call BuildPackage,qosify)) diff --git a/package/network/config/qosify/files/qosify-defaults.conf b/package/network/config/qosify/files/qosify-defaults.conf new file mode 100644 index 00000000000000..ff8a0bef5b172b --- /dev/null +++ b/package/network/config/qosify/files/qosify-defaults.conf @@ -0,0 +1,17 @@ +# DNS +tcp:53 voice +tcp:5353 voice +udp:53 voice +udp:5353 voice + +# NTP +udp:123 voice + +# SSH +tcp:22 +video + +# HTTP/QUIC +tcp:80 +besteffort +tcp:443 +besteffort +udp:80 +besteffort +udp:443 +besteffort diff --git a/package/network/config/qosify/files/qosify-status b/package/network/config/qosify/files/qosify-status new file mode 100644 index 00000000000000..dc582322681e0b --- /dev/null +++ b/package/network/config/qosify/files/qosify-status @@ -0,0 +1,70 @@ +#!/bin/sh +. /usr/share/libubox/jshn.sh + +dev_status() { + tc -s qdisc sh dev "$1" root + echo +} + +common_status() { + json_get_vars ifname ingress egress + + [ -n "$ifname" ] || return + + [ "$egress" -gt 0 ] && { + echo "egress status:" + dev_status "$ifname" + } + [ "$ingress" -gt 0 ] && { + echo "ingress status:" + dev_status "$(printf %.16s "ifb-$ifname")" + } +} + +is_active() { + json_get_vars active + + [ "${active:-0}" -gt 0 ] +} + +device_status() { + local name="$2" + + json_select "$name" + + if is_active; then + status="active" + else + status="not found" + fi + + echo "===== device $name: $status =====" + + is_active && common_status + + json_select .. +} + +interface_status() { + local name="$2" + + json_select "$name" + + if is_active; then + status="active" + elif ubus -S -t 0 wait_for "network.interface.$name"; then + status="down" + else + status="not found" + fi + + echo "===== interface $name: $status =====" + + is_active && common_status + + json_select .. +} + +json_load "$(ubus call qosify status)" +json_for_each_item device_status devices +json_for_each_item interface_status interfaces diff --git a/package/network/config/qosify/files/qosify.conf b/package/network/config/qosify/files/qosify.conf new file mode 100644 index 00000000000000..7a3d0d068add5f --- /dev/null +++ b/package/network/config/qosify/files/qosify.conf @@ -0,0 +1,48 @@ +config defaults + list defaults /etc/qosify/*.conf + option dscp_prio video + option dscp_icmp +besteffort + option dscp_default_udp besteffort + option prio_max_avg_pkt_len 500 + +config class besteffort + option ingress CS0 + option egress CS0 + +config class bulk + option ingress LE + option egress LE + +config class video + option ingress AF41 + option egress AF41 + +config class voice + option ingress CS6 + option egress CS6 + option bulk_trigger_pps 100 + option bulk_trigger_timeout 5 + option dscp_bulk CS0 + +config interface wan + option name wan + option disabled 1 + option bandwidth_up 100mbit + option bandwidth_down 100mbit + option overhead_type none + # defaults: + option ingress 1 + option egress 1 + option mode diffserv4 + option nat 1 + option host_isolate 1 + option autorate_ingress 0 + option ingress_options "" + option egress_options "" + option options "" + +config device wandev + option disabled 1 + option name wan + option bandwidth 100mbit + diff --git a/package/network/config/qosify/files/qosify.hotplug b/package/network/config/qosify/files/qosify.hotplug new file mode 100644 index 00000000000000..950812c03c2b08 --- /dev/null +++ b/package/network/config/qosify/files/qosify.hotplug @@ -0,0 +1,2 @@ +#!/bin/sh +ubus call qosify check_devices diff --git a/package/network/config/qosify/files/qosify.init b/package/network/config/qosify/files/qosify.init new file mode 100644 index 00000000000000..f676d92a52e4f8 --- /dev/null +++ b/package/network/config/qosify/files/qosify.init @@ -0,0 +1,171 @@ +#!/bin/sh /etc/rc.common +# Copyright (c) 2021 OpenWrt.org + +START=19 + +USE_PROCD=1 +PROG=/usr/sbin/qosify + +add_option() { + local type="$1" + local name="$2" + + config_get val "$cfg" "$name" + + [ -n "$val" ] && json_add_$type "$name" "$val" +} + +add_flow_config() { + local cfg="$1" + + add_option string dscp_prio + add_option string dscp_bulk + add_option int bulk_trigger_timeout + add_option int bulk_trigger_pps + add_option int prio_max_avg_pkt_len +} + +add_defaults() { + cfg="$1" + + json_add_boolean reset 1 + + config_get files "$cfg" defaults + json_add_array files + for i in $files; do + json_add_string "" "$i" + done + json_close_array + + add_flow_config "$cfg" + add_option int timeout + add_option string dscp_icmp + add_option string dscp_default_udp + add_option string dscp_default_tcp +} + +add_interface() { + local cfg="$1" + + config_get_bool disabled "$cfg" disabled 0 + [ "$disabled" -gt 0 ] && return + + config_get name "$cfg" name + json_add_object "$name" + + config_get bw "$cfg" bandwidth + + config_get bw_up "$cfg" bandwidth_up + bw_up="${bw_up:-$bw}" + [ -n "$bw_up" ] && json_add_string bandwidth_up "$bw_up" + + config_get bw_down "$cfg" bandwidth_down + bw_down="${bw_down:-$bw}" + [ -n "$bw_down" ] && json_add_string bandwidth_down "$bw_down" + + add_option string bandwidth + add_option boolean ingress + add_option boolean egress + add_option string mode + add_option boolean nat + add_option boolean host_isolate + add_option boolean autorate_ingress + add_option string ingress_options + add_option string egress_options + + config_get user_options "$cfg" options + + config_get otype "$cfg" overhead_type + options= + case "$otype" in + none);; + manual) + config_get overhead "$cfg" overhead + [ -n "$overhead" ] && append options "overhead $overhead" + + config_get encap "$cfg" overhead_encap + [ -n "$encap" ] && append options "$encap" + ;; + conservative|\ + pppoa-vcmux|\ + pppoa-llc|\ + pppoe-vcmux|\ + pppoe-llcsnap|\ + bridged-vcmux|\ + bridged-llcsnap|\ + ipoa-vcmux|\ + ipoa-llcsnap|\ + pppoe-ptm|\ + bridged-ptm|\ + docsis|\ + ethernet) + append options "$otype" + ;; + esac + + config_get mpu "$cfg" overhead_mpu + [ -n "$mpu" ] && append options "mpu $mpu" + + config_get ovlan "$cfg" overhead_vlan + [ "${ovlan:-0}" -ge 2 ] && append options "ether-vlan" + [ "${ovlan:-0}" -ge 1 ] && append options "ether-vlan" + + [ -n "$user_options" ] && append options "$user_options" + [ -n "$options" ] && json_add_string options "$options" + + json_close_object +} + +add_class() { + local cfg="$1" + + config_get value "$cfg" value + config_get ingress "$cfg" ingress + config_get egress "$cfg" egress + + json_add_object "$cfg" + json_add_string ingress "${ingress:-$value}" + json_add_string egress "${egress:-$value}" + add_flow_config "$cfg" + json_close_object +} + + +reload_service() { + json_init + + config_load qosify + + config_foreach add_defaults defaults + + json_add_object interfaces + config_foreach add_interface interface + json_close_object + + json_add_object classes + config_foreach add_class class + config_foreach add_class alias + json_close_object + + json_add_object devices + config_foreach add_interface device + json_close_object + + ubus call qosify config "$(json_dump)" +} + +service_triggers() { + procd_add_reload_trigger qosify +} + +start_service() { + procd_open_instance + procd_set_param command "$PROG" + procd_set_param respawn + procd_close_instance +} + +service_started() { + ubus -t 10 wait_for qosify + [ $? = 0 ] && reload_service +} diff --git a/package/network/utils/bpftools/Makefile b/package/network/utils/bpftools/Makefile index f044cc81f850bf..56422e79026638 100644 --- a/package/network/utils/bpftools/Makefile +++ b/package/network/utils/bpftools/Makefile @@ -8,12 +8,14 @@ include $(TOPDIR)/rules.mk PKG_NAME:=bpftools -PKG_VERSION:=5.11.16 PKG_RELEASE:=1 -PKG_SOURCE:=linux-$(PKG_VERSION).tar.xz -PKG_SOURCE_URL:=@KERNEL/linux/kernel/v5.x -PKG_HASH:=21163681d130cbce5a6be39019e2c69e44f284855ddd70b1a3bd039249540f43 +PKG_SOURCE_URL:=https://github.com/libbpf/bpftool +PKG_SOURCE_PROTO:=git +PKG_SOURCE_DATE:=2022-03-08 +PKG_SOURCE_VERSION:=04c465fd1f561f67796dc68bbfe1aa7cfa956c3c +PKG_MIRROR_HASH:=e22a954cd186f43228a96586bbdc120b11e6c87360ab88ae96ba37afb9c7cb58 +PKG_ABI_VERSION:=$(call abi_version_str,$(PKG_SOURCE_DATE)) PKG_MAINTAINER:=Tony Ambardar @@ -21,16 +23,6 @@ PKG_USE_MIPS16:=0 PKG_BUILD_PARALLEL:=1 PKG_INSTALL:=1 -LINUX_VERSION:=$(PKG_VERSION) -LINUX_TLD:=linux-$(LINUX_VERSION) - -BPF_FILES:= \ - kernel/bpf scripts tools/Makefile tools/bpf tools/perf/perf-sys.h \ - tools/arch tools/build tools/include tools/lib tools/scripts -TAR_OPTIONS+= \ - --transform="s;$(LINUX_TLD)/;$(PKG_NAME)-$(PKG_VERSION)/;" \ - $(addprefix $(LINUX_TLD)/,$(BPF_FILES)) - include $(INCLUDE_DIR)/package.mk include $(INCLUDE_DIR)/nls.mk @@ -77,7 +69,7 @@ define Package/libbpf TITLE:=libbpf - eBPF helper library VARIANT:=lib LICENSE:=LGPL-2.1 OR BSD-2-Clause - ABI_VERSION:=0 + ABI_VERSION:=$(PKG_ABI_VERSION) URL:=http://www.kernel.org DEPENDS:=+libelf endef @@ -93,55 +85,43 @@ ifneq ($(BUILD_VARIANT),lib) TARGET_LDFLAGS += -Wl,--gc-sections endif +ifeq ($(BUILD_VARIANT),full) + full:=1 +else + full:=0 +endif + MAKE_VARS = \ EXTRA_CFLAGS="$(TARGET_CFLAGS) $(TARGET_CPPFLAGS)" \ LDFLAGS="$(TARGET_LDFLAGS)" MAKE_FLAGS += \ - BPFTOOL_VERSION="$(LINUX_VERSION)" \ - FEATURES_DUMP="$(PKG_BUILD_DIR)/FEATURE-DUMP.openwrt" \ OUTPUT="$(PKG_BUILD_DIR)/" \ prefix="/usr" \ - $(if $(findstring c,$(OPENWRT_VERBOSE)),V=1,V='') + $(if $(findstring c,$(OPENWRT_VERBOSE)),V=1,V='') \ + LIBSUBDIR=lib \ + check_feat=0 \ + feature-clang-bpf-co-re=0 \ + feature-reallocarray=1 \ + feature-zlib=1 \ + feature-libbfd=$(full) \ + feature-libcap=0 \ + feature-disassembler-four-args=$(full) -ifeq ($(BUILD_VARIANT),full) - HAVE_LIBBFD:=1 - HAVE_LIBCAP:=0 - HAVE_CLANG:=0 - MAKE_PATH:=tools/bpf/bpftool -else ifeq ($(BUILD_VARIANT),minimal) - HAVE_LIBBFD:=0 - HAVE_LIBCAP:=0 - HAVE_CLANG:=0 - MAKE_PATH:=tools/bpf/bpftool -else ifeq ($(BUILD_VARIANT),lib) - HAVE_LIBBFD:=0 - HAVE_LIBCAP:=0 - HAVE_CLANG:=0 - MAKE_PATH:=tools/lib/bpf +ifeq ($(BUILD_VARIANT),lib) + MAKE_PATH = libbpf/src +else + MAKE_PATH = src endif -# Perform a "throw-away" make to create a FEATURE-DUMP.* file to edit later. -# The "//" in the make target is actually needed, very unPOSIXly. -define Build/Configure - +$(MAKE_VARS) $(MAKE) $(PKG_JOBS) -C $(PKG_BUILD_DIR)/tools/bpf/bpftool \ - $(MAKE_FLAGS) FEATURES_DUMP= $(PKG_BUILD_DIR)//libbpf/libbpf.a - (cd $(PKG_BUILD_DIR); cat FEATURE-DUMP.bpftool libbpf/FEATURE-DUMP.libbpf \ - | sort | uniq > FEATURE-DUMP.openwrt) - $(SED) 's/feature-libbfd=1/feature-libbfd=$(HAVE_LIBBFD)/' \ - -e 's/feature-libcap=1/feature-libcap=$(HAVE_LIBCAP)/' \ - -e 's/feature-clang-bpf-co-re=1/feature-clang-bpf-co-re=$(HAVE_CLANG)/' \ - $(PKG_BUILD_DIR)/FEATURE-DUMP.openwrt -endef - define Build/InstallDev/libbpf $(INSTALL_DIR) $(1)/usr/include/bpf $(CP) $(PKG_INSTALL_DIR)/usr/include/bpf/*.h $(1)/usr/include/bpf/ $(INSTALL_DIR) $(1)/usr/lib - $(CP) $(PKG_INSTALL_DIR)/usr/lib$(LIB_SUFFIX)/libbpf.{a,so*} \ + $(CP) $(PKG_INSTALL_DIR)/usr/lib/libbpf.{a,so*} \ $(1)/usr/lib/ $(INSTALL_DIR) $(1)/usr/lib/pkgconfig - $(CP) $(PKG_INSTALL_DIR)/usr/lib$(LIB_SUFFIX)/pkgconfig/libbpf.pc \ + $(CP) $(PKG_INSTALL_DIR)/usr/lib/pkgconfig/libbpf.pc \ $(1)/usr/lib/pkgconfig/ $(SED) 's,/usr/include,$$$${prefix}/include,g' \ $(1)/usr/lib/pkgconfig/libbpf.pc @@ -161,7 +141,7 @@ endef define Package/libbpf/install $(INSTALL_DIR) $(1)/usr/lib - $(CP) $(PKG_INSTALL_DIR)/usr/lib$(LIB_SUFFIX)/libbpf.so.* $(1)/usr/lib/ + $(CP) $(PKG_INSTALL_DIR)/usr/lib/libbpf.so.* $(1)/usr/lib/ endef $(eval $(call BuildPackage,libbpf)) diff --git a/package/network/utils/bpftools/patches/001-cflags.patch b/package/network/utils/bpftools/patches/001-cflags.patch new file mode 100644 index 00000000000000..48617e302bd22f --- /dev/null +++ b/package/network/utils/bpftools/patches/001-cflags.patch @@ -0,0 +1,10 @@ +--- a/libbpf/src/Makefile ++++ b/libbpf/src/Makefile +@@ -25,6 +25,7 @@ ALL_CFLAGS := $(INCLUDES) + + SHARED_CFLAGS += -fPIC -fvisibility=hidden -DSHARED + ++CFLAGS = $(EXTRA_CFLAGS) + CFLAGS ?= -g -O2 -Werror -Wall -std=gnu89 + ALL_CFLAGS += $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 + ALL_LDFLAGS += $(LDFLAGS) diff --git a/package/network/utils/bpftools/patches/002-includes.patch b/package/network/utils/bpftools/patches/002-includes.patch new file mode 100644 index 00000000000000..589d71c31e7999 --- /dev/null +++ b/package/network/utils/bpftools/patches/002-includes.patch @@ -0,0 +1,26 @@ +--- a/libbpf/include/linux/list.h ++++ b/libbpf/include/linux/list.h +@@ -3,6 +3,8 @@ + #ifndef __LINUX_LIST_H + #define __LINUX_LIST_H + ++#include ++ + #define LIST_HEAD_INIT(name) { &(name), &(name) } + #define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) +--- a/src/Makefile ++++ b/src/Makefile +@@ -73,10 +73,10 @@ CFLAGS += -W -Wall -Wextra -Wno-unused-p + CFLAGS += $(filter-out -Wswitch-enum -Wnested-externs,$(EXTRA_WARNINGS)) + CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ + -I$(if $(OUTPUT),$(OUTPUT),.) \ +- -I$(LIBBPF_INCLUDE) \ + -I$(srctree)/src/kernel/bpf/ \ + -I$(srctree)/include \ +- -I$(srctree)/include/uapi ++ -I$(srctree)/include/uapi \ ++ -I$(LIBBPF_INCLUDE) + ifneq ($(BPFTOOL_VERSION),) + CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' + endif diff --git a/package/network/utils/bpftools/patches/005-tools-arch-powerpc-fix-EDEADLOCK-redefinition-errors.patch b/package/network/utils/bpftools/patches/005-tools-arch-powerpc-fix-EDEADLOCK-redefinition-errors.patch deleted file mode 100644 index 996ffc43ee515b..00000000000000 --- a/package/network/utils/bpftools/patches/005-tools-arch-powerpc-fix-EDEADLOCK-redefinition-errors.patch +++ /dev/null @@ -1,51 +0,0 @@ -From afe3f4c765b17ced23811fe652c7f7adf7a0c0cf Mon Sep 17 00:00:00 2001 -From: Tony Ambardar -Date: Mon, 14 Sep 2020 23:05:26 -0700 -Subject: [PATCH] tools/arch/powerpc: fix EDEADLOCK redefinition errors in - errno.h - -A few archs like powerpc have different errno.h values for macros -EDEADLOCK and EDEADLK. In code including both libc and linux versions of -errno.h, this can result in multiple definitions of EDEADLOCK in the -include chain. Definitions to the same value (e.g. seen with mips) do -not raise warnings, but on powerpc there are redefinitions changing the -value, which raise warnings and errors (with "-Werror"). - -Guard against these redefinitions to avoid build errors like the following, -first seen cross-compiling libbpf v5.8.9 for powerpc using GCC 8.4.0 with -musl 1.1.24: - - In file included from ../../arch/powerpc/include/uapi/asm/errno.h:5, - from ../../include/linux/err.h:8, - from libbpf.c:29: - ../../include/uapi/asm-generic/errno.h:40: error: "EDEADLOCK" redefined [-Werror] - #define EDEADLOCK EDEADLK - - In file included from toolchain-powerpc_8540_gcc-8.4.0_musl/include/errno.h:10, - from libbpf.c:26: - toolchain-powerpc_8540_gcc-8.4.0_musl/include/bits/errno.h:58: note: this is the location of the previous definition - #define EDEADLOCK 58 - - cc1: all warnings being treated as errors - make[5]: *** [target-powerpc_8540_musl/bpftools-5.8.9/tools/build/Makefile.build:97: /home/kodidev/openwrt-project/build_dir/target-powerpc_8540_musl/bpftools-minimal/bpftools-5.8.9//libbpf/staticobjs/libbpf.o] Error 1 - -Fixes: 95f28190aa01 ("tools include arch: Grab a copy of errno.h for arch's - supported by perf") -Fixes: c3617f72036c ("UAPI: (Scripted) Disintegrate arch/powerpc/include/asm") - -Reported-by: Rosen Penev -Signed-off-by: Tony Ambardar ---- - tools/arch/powerpc/include/uapi/asm/errno.h | 1 + - 1 file changed, 1 insertion(+) - ---- a/tools/arch/powerpc/include/uapi/asm/errno.h -+++ b/tools/arch/powerpc/include/uapi/asm/errno.h -@@ -2,6 +2,7 @@ - #ifndef _ASM_POWERPC_ERRNO_H - #define _ASM_POWERPC_ERRNO_H - -+#undef EDEADLOCK - #include - - #undef EDEADLOCK From 61d68ec9b2430b1d6065515654f30761cf8abf56 Mon Sep 17 00:00:00 2001 From: lean Date: Thu, 15 Sep 2022 13:42:02 +0800 Subject: [PATCH 02/11] bridger: add bridge forwarding accelerator for target --- target/linux/rockchip/Makefile | 2 +- target/linux/x86/Makefile | 2 +- toolchain/Config.in | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/target/linux/rockchip/Makefile b/target/linux/rockchip/Makefile index f8d8f51db31646..6d3a36da96aebb 100644 --- a/target/linux/rockchip/Makefile +++ b/target/linux/rockchip/Makefile @@ -16,7 +16,7 @@ endef include $(INCLUDE_DIR)/target.mk -DEFAULT_PACKAGES += uboot-envtools partx-utils e2fsprogs mkf2fs kmod-gpio-button-hotplug \ +DEFAULT_PACKAGES += uboot-envtools partx-utils e2fsprogs mkf2fs kmod-gpio-button-hotplug bridger \ automount autocore-arm e2fsprogs ethtool haveged htop usb-modeswitch iperf3 hysteria \ luci-app-diskman luci-app-samba luci-app-unblockmusic diff --git a/target/linux/x86/Makefile b/target/linux/x86/Makefile index 7a8d7576c48e06..7fd4ce1e3a5958 100644 --- a/target/linux/x86/Makefile +++ b/target/linux/x86/Makefile @@ -17,7 +17,7 @@ KERNELNAME:=bzImage include $(INCLUDE_DIR)/target.mk -DEFAULT_PACKAGES += partx-utils mkf2fs e2fsprogs kmod-button-hotplug kmod-usb-hid kmod-mmc kmod-sdhci usbutils pciutils \ +DEFAULT_PACKAGES += partx-utils mkf2fs e2fsprogs kmod-button-hotplug kmod-usb-hid kmod-mmc kmod-sdhci bridger usbutils pciutils \ kmod-alx kmod-e1000e kmod-igb kmod-igc kmod-igbvf kmod-iavf kmod-bnx2x kmod-pcnet32 kmod-tulip kmod-via-velocity kmod-vmxnet3 kmod-i40e kmod-i40evf kmod-r8125 kmod-8139cp kmod-8139too kmod-fs-f2fs cfdisk \ htop lm-sensors iperf3 autocore-x86 automount autosamba luci-app-adbyby-plus luci-app-ipsec-vpnd luci-proto-bonding luci-app-diskman \ luci-app-unblockmusic luci-app-zerotier luci-app-xlnetacc ddns-scripts_aliyun ddns-scripts_dnspod ca-bundle luci-app-wireguard luci-app-ttyd \ diff --git a/toolchain/Config.in b/toolchain/Config.in index 2d29ec09e760a4..4a2f91a8b43076 100644 --- a/toolchain/Config.in +++ b/toolchain/Config.in @@ -41,7 +41,7 @@ menuconfig TARGET_OPTIONS prompt "BPF toolchain" if DEVEL default BPF_TOOLCHAIN_BUILD_LLVM if BUILDBOT default BPF_TOOLCHAIN_PREBUILT if HAS_PREBUILT_LLVM_TOOLCHAIN - default BPF_TOOLCHAIN_NONE + default BPF_TOOLCHAIN_BUILD_LLVM config BPF_TOOLCHAIN_NONE bool "None" @@ -324,7 +324,7 @@ config USE_LLVM_PREBUILT bool config USE_LLVM_BUILD - default y if !DEVEL && BUILDBOT + default y select HAS_BPF_TOOLCHAIN bool From b030e9103a2d1ef2fb746778fcfd0597167dec92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AA=B7=E9=AB=85=E5=A4=B4?= <74764072+DHDAXCW@users.noreply.github.com> Date: Thu, 15 Sep 2022 14:05:50 +0800 Subject: [PATCH 03/11] uboot-rockchip:fix r4se uboot sd card not bootable (#10117) * uboot-rockchip:fix r4se uboot sd card not bootable this will cause uboot to fail to load the sd card at startup, and uboot can recognize emmc * rockchip:fix r4se system loading prompt led * add the previous one * fix --- ...hip-rk3399-Add-support-for-FriendlyARM-NanoPi-R.patch | 5 ++++- .../arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4se.dts | 9 ++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/package/boot/uboot-rockchip/patches/305-rockchip-rk3399-Add-support-for-FriendlyARM-NanoPi-R.patch b/package/boot/uboot-rockchip/patches/305-rockchip-rk3399-Add-support-for-FriendlyARM-NanoPi-R.patch index 696ca42189e3ba..ca6f80958ff2bb 100644 --- a/package/boot/uboot-rockchip/patches/305-rockchip-rk3399-Add-support-for-FriendlyARM-NanoPi-R.patch +++ b/package/boot/uboot-rockchip/patches/305-rockchip-rk3399-Add-support-for-FriendlyARM-NanoPi-R.patch @@ -10,7 +10,7 @@ rk3399-puma-haikou.dtb \ --- /dev/null +++ b/arch/arm/dts/rk3399-nanopi-r4se.dts -@@ -0,0 +1,29 @@ +@@ -0,0 +1,32 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * FriendlyElec NanoPC-T4 board device tree source @@ -40,6 +40,9 @@ + non-removable; + status = "okay"; +}; ++&sdmmc { ++ pinctrl-0 = <&sdmmc_cd>; ++}; --- /dev/null +++ b/configs/nanopi-r4se-rk3399_defconfig @@ -0,0 +1,65 @@ diff --git a/target/linux/rockchip/files/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4se.dts b/target/linux/rockchip/files/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4se.dts index e80f2d1da783bd..10ce0604de54c7 100644 --- a/target/linux/rockchip/files/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4se.dts +++ b/target/linux/rockchip/files/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4se.dts @@ -19,6 +19,13 @@ model = "FriendlyElec NanoPi R4SE"; compatible = "friendlyarm,nanopi-r4se", "rockchip,rk3399"; + aliases { + led-boot = &sys_led; + led-failsafe = &sys_led; + led-running = &sys_led; + led-upgrade = &sys_led; + }; + /delete-node/ display-subsystem; gpio-leds { @@ -33,7 +40,7 @@ sys_led: led-sys { gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>; - label = "red:power"; + label = "red:sys"; default-state = "on"; }; From 58ccac858b4478befa50ee026721601f7995d5fe Mon Sep 17 00:00:00 2001 From: AmadeusGhost <42570690+AmadeusGhost@users.noreply.github.com> Date: Wed, 14 Sep 2022 23:15:19 +0800 Subject: [PATCH 04/11] Revert "rockchip: default smp tune on" This reverts commit 518cd6a5c5c3c3ef78b51a2edaed9527dc7d2b30. --- include/target.mk | 2 +- target/linux/rockchip/Makefile | 5 ++--- .../etc/uci-defaults/13_opkg_update | 19 ------------------- 3 files changed, 3 insertions(+), 23 deletions(-) delete mode 100644 target/linux/rockchip/armv8/base-files/etc/uci-defaults/13_opkg_update diff --git a/include/target.mk b/include/target.mk index 9cf84986da1ba8..b99f8253408957 100644 --- a/include/target.mk +++ b/include/target.mk @@ -46,7 +46,7 @@ DEFAULT_PACKAGES.router:=\ iptables-mod-tproxy iptables-mod-extra ipset ip-full default-settings luci luci-newapi \ ddns-scripts_aliyun ddns-scripts_dnspod luci-app-ddns luci-app-upnp luci-app-autoreboot \ luci-app-arpbind luci-app-filetransfer luci-app-vsftpd luci-app-ssr-plus luci-app-vlmcsd \ - luci-app-ramfree luci-app-accesscontrol luci-app-nlbwmon luci-app-turboacc luci-app-wol ca-certificates + luci-app-accesscontrol luci-app-nlbwmon luci-app-turboacc luci-app-wol curl ca-certificates ifneq ($(DUMP),) all: dumpinfo diff --git a/target/linux/rockchip/Makefile b/target/linux/rockchip/Makefile index 6d3a36da96aebb..a9df40df43f4c1 100644 --- a/target/linux/rockchip/Makefile +++ b/target/linux/rockchip/Makefile @@ -16,9 +16,8 @@ endef include $(INCLUDE_DIR)/target.mk -DEFAULT_PACKAGES += uboot-envtools partx-utils e2fsprogs mkf2fs kmod-gpio-button-hotplug bridger \ - automount autocore-arm e2fsprogs ethtool haveged htop usb-modeswitch iperf3 hysteria \ - luci-app-diskman luci-app-samba luci-app-unblockmusic +DEFAULT_PACKAGES += uboot-envtools partx-utils e2fsprogs mkf2fs kmod-gpio-button-hotplug \ + automount autocore-arm e2fsprogs ethtool haveged htop usb-modeswitch KERNELNAME:=Image dtbs diff --git a/target/linux/rockchip/armv8/base-files/etc/uci-defaults/13_opkg_update b/target/linux/rockchip/armv8/base-files/etc/uci-defaults/13_opkg_update deleted file mode 100644 index 8468c16215f00c..00000000000000 --- a/target/linux/rockchip/armv8/base-files/etc/uci-defaults/13_opkg_update +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh - -cat <<-EOF >/etc/opkg.conf -dest root / -dest ram /tmp -lists_dir ext /var/opkg-lists -option overlay_root /overlay -EOF - -cat <<-EOF >/etc/opkg/distfeeds.conf -src/gz openwrt_base https://downloads.immortalwrt.org/releases/21.02.1/packages/aarch64_generic/base -src/gz openwrt_luci https://downloads.immortalwrt.org/releases/packages-18.06-k5.4/aarch64_generic/luci -src/gz openwrt_packages https://downloads.immortalwrt.org/releases/21.02.1/packages/aarch64_generic/packages -src/gz openwrt_routing https://downloads.immortalwrt.org/releases/21.02.1/packages/aarch64_generic/routing -src/gz openwrt_telephony https://downloads.immortalwrt.org/releases/21.02.1/packages/aarch64_generic/telephony -EOF - -exit 0 - From a5dada15ce54d210a58cc1c4b20b45f2ebca7e36 Mon Sep 17 00:00:00 2001 From: David Bauer Date: Sat, 10 Sep 2022 01:01:57 +0200 Subject: [PATCH 05/11] rockchip: ensure NanoPi R4S has unique MAC address Ensure the MAC address for all NanoPi R4S boards is assigned unique for each board. FriendlyElec ship two versions of the R4S: The standard as well as the enterprise edition with only the enterprise edition including the EEPROM chip that stores the unique MAC address. In order to assign both board types unique MAC addresses, fall back on the same method used for the NanoPi R2S in case the EEPROM chip is not present by generating the board MAC from the SD card CID. [0] https://wiki.friendlyelec.com/wiki/index.php/NanoPi_R4S#Differences_Between_R4S_Standard_Version_.26_R4S_Enterprise_Version Signed-off-by: David Bauer --- .../armv8/base-files/etc/board.d/02_network | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/target/linux/rockchip/armv8/base-files/etc/board.d/02_network b/target/linux/rockchip/armv8/base-files/etc/board.d/02_network index 6b78675b5e72a4..267cf2ddc85686 100755 --- a/target/linux/rockchip/armv8/base-files/etc/board.d/02_network +++ b/target/linux/rockchip/armv8/base-files/etc/board.d/02_network @@ -37,13 +37,36 @@ rockchip_setup_interfaces() esac } -nanopi_r2s_generate_mac() +generate_mac_from_mmc_cid() { - local sd_hash=$(sha256sum /sys/class/block/mmcblk*/device/cid | head -n 1) + local mmc_dev=$1 + + local sd_hash=$(sha256sum /sys/class/block/$mmc_dev/device/cid) local mac_base=$(macaddr_canonicalize "$(echo "${sd_hash}" | dd bs=1 count=12 2>/dev/null)") echo "$(macaddr_unsetbit_mc "$(macaddr_setbit_la "${mac_base}")")" } +nanopi_r4s_get_mac() +{ + local interface=$1 + local eeprom_path="/sys/bus/i2c/devices/2-0051/eeprom" + local address + + if [ -f "$eeprom_path" ]; then + address=$(get_mac_binary "$eeprom_path" 0xfa) + if [ "$interface" = "lan" ]; then + address=$(macaddr_setbit_la "$address") + fi + else + address=$(generate_mac_from_mmc_cid mmcblk1) + if [ "$interface" = "lan" ]; then + address=$(macaddr_add "$address" 1) + fi + fi + + echo "$address" +} + rockchip_setup_macs() { local board="$1" @@ -61,17 +84,13 @@ rockchip_setup_macs() friendlyarm,nanopi-r2s|\ friendlyarm,nanopi-r5s|\ sharevdi,guangmiao-g4c) - wan_mac=$(nanopi_r2s_generate_mac) + wan_mac=$(generate_mac_from_mmc_cid mmcblk0) lan_mac=$(macaddr_add "$wan_mac" +1) ;; friendlyarm,nanopi-r4s|\ friendlyarm,nanopi-r4se) - if [ -f /sys/bus/i2c/devices/2-0051/eeprom ]; then - wan_mac=$(get_mac_binary "/sys/bus/i2c/devices/2-0051/eeprom" 0xfa) - else - wan_mac=$(nanopi_r2s_generate_mac) - fi - lan_mac=$(macaddr_setbit_la "$wan_mac") + wan_mac=$(nanopi_r4s_get_mac wan) + lan_mac=$(nanopi_r4s_get_mac lan) ;; xunlong,orangepi-r1-plus|\ xunlong,orangepi-r1-plus-lts) From 62ef304e672c3a685b0ec962f72b27c2476f05b2 Mon Sep 17 00:00:00 2001 From: Rui Salvaterra Date: Tue, 29 Mar 2022 17:10:48 +0100 Subject: [PATCH 06/11] kernel: add and enable MGLRU for Linux 5.15 Backport a preliminary version of Yu Zhao's multi-generational LRU, for improved memory management. Refresh the patches while at it. Signed-off-by: Rui Salvaterra --- target/linux/generic/config-5.15 | 5 + ...-x86-arm64-add-arch_has_hw_pte_young.patch | 169 +++ ...dd-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch | 111 ++ ...-02-mm-vmscan.c-refactor-shrink_node.patch | 224 ++++ ...-mm-multigenerational-lru-groundwork.patch | 996 ++++++++++++++ ...multigenerational-lru-mm_struct-list.patch | 760 +++++++++++ ...20-05-mm-multigenerational-lru-aging.patch | 1176 +++++++++++++++++ ...06-mm-multigenerational-lru-eviction.patch | 1002 ++++++++++++++ ...multigenerational-lru-user-interface.patch | 496 +++++++ ...-08-mm-multigenerational-lru-Kconfig.patch | 80 ++ ...-multigenerational-lru-documentation.patch | 161 +++ 11 files changed, 5180 insertions(+) create mode 100644 target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch create mode 100644 target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch create mode 100644 target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch create mode 100644 target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch create mode 100644 target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch create mode 100644 target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch create mode 100644 target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch create mode 100644 target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch create mode 100644 target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch create mode 100644 target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch diff --git a/target/linux/generic/config-5.15 b/target/linux/generic/config-5.15 index dc24a058e37354..7bcc22a3b6ea35 100644 --- a/target/linux/generic/config-5.15 +++ b/target/linux/generic/config-5.15 @@ -3178,6 +3178,9 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 # CONFIG_LPC_ICH is not set # CONFIG_LPC_SCH is not set # CONFIG_LP_CONSOLE is not set +CONFIG_LRU_GEN=y +CONFIG_LRU_GEN_ENABLED=y +# CONFIG_LRU_GEN_STATS is not set # CONFIG_LSI_ET1011C_PHY is not set CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity" CONFIG_LSM_MMAP_MIN_ADDR=65536 @@ -4352,6 +4355,7 @@ CONFIG_NMI_LOG_BUF_SHIFT=13 # CONFIG_NO_HZ is not set # CONFIG_NO_HZ_FULL is not set # CONFIG_NO_HZ_IDLE is not set +CONFIG_NR_LRU_GENS=7 # CONFIG_NS83820 is not set # CONFIG_NTB is not set # CONFIG_NTFS3_64BIT_CLUSTER is not set @@ -6427,6 +6431,7 @@ CONFIG_THIN_ARCHIVES=y # CONFIG_THUNDER_NIC_VF is not set # CONFIG_TICK_CPU_ACCOUNTING is not set CONFIG_TICK_ONESHOT=y +CONFIG_TIERS_PER_GEN=4 # CONFIG_TIFM_CORE is not set # CONFIG_TIGON3 is not set # CONFIG_TIMB_DMA is not set diff --git a/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch b/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch new file mode 100644 index 00000000000000..548d8e61b28e47 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch @@ -0,0 +1,169 @@ +From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Wed, 4 Aug 2021 01:31:34 -0600 +Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young() + +Some architectures automatically set the accessed bit in PTEs, e.g., +x86 and arm64 v8.2. On architectures that do not have this capability, +clearing the accessed bit in a PTE triggers a page fault following the +TLB miss of this PTE. + +Being aware of this capability can help make better decisions, i.e., +whether to limit the size of each batch of PTEs and the burst of +batches when clearing the accessed bit. + +Signed-off-by: Yu Zhao +Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a +--- + arch/arm64/include/asm/cpufeature.h | 5 +++++ + arch/arm64/include/asm/pgtable.h | 13 ++++++++----- + arch/arm64/kernel/cpufeature.c | 10 ++++++++++ + arch/arm64/tools/cpucaps | 1 + + arch/x86/include/asm/pgtable.h | 6 +++--- + include/linux/pgtable.h | 13 +++++++++++++ + mm/memory.c | 14 +------------- + 7 files changed, 41 insertions(+), 21 deletions(-) + +--- a/arch/arm64/include/asm/cpufeature.h ++++ b/arch/arm64/include/asm/cpufeature.h +@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r + cpus_have_const_cap(ARM64_HAS_TLB_RANGE); + } + ++static inline bool system_has_hw_af(void) ++{ ++ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF); ++} ++ + extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); + + static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru + * page after fork() + CoW for pfn mappings. We don't always have a + * hardware-managed access flag on arm64. + */ +-static inline bool arch_faults_on_old_pte(void) ++static inline bool arch_has_hw_pte_young(bool local) + { +- WARN_ON(preemptible()); ++ if (local) { ++ WARN_ON(preemptible()); ++ return cpu_has_hw_af(); ++ } + +- return !cpu_has_hw_af(); ++ return system_has_hw_af(); + } +-#define arch_faults_on_old_pte arch_faults_on_old_pte ++#define arch_has_hw_pte_young arch_has_hw_pte_young + + /* + * Experimentally, it's cheap to set the access flag in hardware and we +@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt + */ + static inline bool arch_wants_old_prefaulted_pte(void) + { +- return !arch_faults_on_old_pte(); ++ return arch_has_hw_pte_young(true); + } + #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte + +--- a/arch/arm64/kernel/cpufeature.c ++++ b/arch/arm64/kernel/cpufeature.c +@@ -2184,6 +2184,16 @@ static const struct arm64_cpu_capabiliti + .matches = has_hw_dbm, + .cpu_enable = cpu_enable_hw_dbm, + }, ++ { ++ .desc = "Hardware update of the Access flag", ++ .type = ARM64_CPUCAP_SYSTEM_FEATURE, ++ .capability = ARM64_HW_AF, ++ .sys_reg = SYS_ID_AA64MMFR1_EL1, ++ .sign = FTR_UNSIGNED, ++ .field_pos = ID_AA64MMFR1_HADBS_SHIFT, ++ .min_field_value = 1, ++ .matches = has_cpuid_feature, ++ }, + #endif + { + .desc = "CRC32 instructions", +--- a/arch/arm64/tools/cpucaps ++++ b/arch/arm64/tools/cpucaps +@@ -35,6 +35,7 @@ HAS_STAGE2_FWB + HAS_SYSREG_GIC_CPUIF + HAS_TLB_RANGE + HAS_VIRT_HOST_EXTN ++HW_AF + HW_DBM + KVM_PROTECTED_MODE + MISMATCHED_CACHE_TYPE +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c + return boot_cpu_has_bug(X86_BUG_L1TF); + } + +-#define arch_faults_on_old_pte arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) ++#define arch_has_hw_pte_young arch_has_hw_pte_young ++static inline bool arch_has_hw_pte_young(bool local) + { +- return false; ++ return true; + } + + #endif /* __ASSEMBLY__ */ +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #endif + ++#ifndef arch_has_hw_pte_young ++/* ++ * Return whether the accessed bit is supported by the local CPU or all CPUs. ++ * ++ * Those arches which have hw access flag feature need to implement their own ++ * helper. By default, "false" means pagefault will be hit on old pte. ++ */ ++static inline bool arch_has_hw_pte_young(bool local) ++{ ++ return false; ++} ++#endif ++ + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR + static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly = + 2; + #endif + +-#ifndef arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) +-{ +- /* +- * Those arches which don't have hw access flag feature need to +- * implement their own helper. By default, "true" means pagefault +- * will be hit on old pte. +- */ +- return true; +-} +-#endif +- + #ifndef arch_wants_old_prefaulted_pte + static inline bool arch_wants_old_prefaulted_pte(void) + { +@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct + * On architectures with software "accessed" bits, we would + * take a double page fault, so mark it accessed here. + */ +- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { ++ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) { + pte_t entry; + + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); diff --git a/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch b/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch new file mode 100644 index 00000000000000..785af275f50949 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch @@ -0,0 +1,111 @@ +From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Sat, 26 Sep 2020 21:17:18 -0600 +Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG + +Some architectures support the accessed bit on non-leaf PMD entries, +e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using +it as part of linear address translation [1]. As an optimization, page +table walkers who are interested in the accessed bit can skip the PTEs +under a non-leaf PMD entry if the accessed bit is cleared on this PMD +entry. + +Although an inline function may be preferable, this capability is +added as a configuration option to look consistent when used with the +existing macros. + +[1]: Intel 64 and IA-32 Architectures Software Developer's Manual + Volume 3 (June 2021), section 4.8 + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 +--- + arch/Kconfig | 9 +++++++++ + arch/x86/Kconfig | 1 + + arch/x86/include/asm/pgtable.h | 3 ++- + arch/x86/mm/pgtable.c | 5 ++++- + include/linux/pgtable.h | 4 ++-- + 5 files changed, 18 insertions(+), 4 deletions(-) + +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT + config ARCH_HAS_PARANOID_L1D_FLUSH + bool + ++config ARCH_HAS_NONLEAF_PMD_YOUNG ++ bool ++ depends on PGTABLE_LEVELS > 2 ++ help ++ Architectures that select this are able to set the accessed bit on ++ non-leaf PMD entries in addition to leaf PTE entries where pages are ++ mapped. For them, page table walkers that clear the accessed bit may ++ stop at non-leaf PMD entries if they do not see the accessed bit. ++ + source "kernel/gcov/Kconfig" + + source "scripts/gcc-plugins/Kconfig" +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -84,6 +84,7 @@ config X86 + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_DEVMAP if X86_64 + select ARCH_HAS_PTE_SPECIAL ++ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64 + select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 + select ARCH_HAS_COPY_MC if X86_64 + select ARCH_HAS_SET_MEMORY +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad + + static inline int pmd_bad(pmd_t pmd) + { +- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != ++ (_KERNPG_TABLE & ~_PAGE_ACCESSED); + } + + static inline unsigned long pages_to_mb(unsigned long npg) +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_ + return ret; + } + +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) + { +@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_ + + return ret; + } ++#endif ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) + { +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo + #endif + + #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo + BUILD_BUG(); + return 0; + } +-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ + #endif + + #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH diff --git a/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch b/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch new file mode 100644 index 00000000000000..f466f1105c0ba1 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch @@ -0,0 +1,224 @@ +From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Sun, 27 Sep 2020 20:49:08 -0600 +Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node() + +This patch refactors shrink_node(). This will make the upcoming +changes to mm/vmscan.c more readable. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04 +--- + mm/vmscan.c | 186 +++++++++++++++++++++++++++------------------------- + 1 file changed, 98 insertions(+), 88 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2562,6 +2562,103 @@ enum scan_balance { + SCAN_FILE, + }; + ++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long file; ++ struct lruvec *target_lruvec; ++ ++ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); ++ ++ /* ++ * Determine the scan balance between anon and file LRUs. ++ */ ++ spin_lock_irq(&target_lruvec->lru_lock); ++ sc->anon_cost = target_lruvec->anon_cost; ++ sc->file_cost = target_lruvec->file_cost; ++ spin_unlock_irq(&target_lruvec->lru_lock); ++ ++ /* ++ * Target desirable inactive:active list ratios for the anon ++ * and file LRU lists. ++ */ ++ if (!sc->force_deactivate) { ++ unsigned long refaults; ++ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_ANON); ++ if (refaults != target_lruvec->refaults[0] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) ++ sc->may_deactivate |= DEACTIVATE_ANON; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_ANON; ++ ++ /* ++ * When refaults are being observed, it means a new ++ * workingset is being established. Deactivate to get ++ * rid of any stale active pages quickly. ++ */ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_FILE); ++ if (refaults != target_lruvec->refaults[1] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) ++ sc->may_deactivate |= DEACTIVATE_FILE; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_FILE; ++ } else ++ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; ++ ++ /* ++ * If we have plenty of inactive file pages that aren't ++ * thrashing, try to reclaim those first before touching ++ * anonymous pages. ++ */ ++ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); ++ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) ++ sc->cache_trim_mode = 1; ++ else ++ sc->cache_trim_mode = 0; ++ ++ /* ++ * Prevent the reclaimer from falling into the cache trap: as ++ * cache pages start out inactive, every cache fault will tip ++ * the scan balance towards the file LRU. And as the file LRU ++ * shrinks, so does the window for rotation from references. ++ * This means we have a runaway feedback loop where a tiny ++ * thrashing file LRU becomes infinitely more attractive than ++ * anon pages. Try to detect this based on file LRU size. ++ */ ++ if (!cgroup_reclaim(sc)) { ++ unsigned long total_high_wmark = 0; ++ unsigned long free, anon; ++ int z; ++ ++ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); ++ file = node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE); ++ ++ for (z = 0; z < MAX_NR_ZONES; z++) { ++ struct zone *zone = &pgdat->node_zones[z]; ++ ++ if (!managed_zone(zone)) ++ continue; ++ ++ total_high_wmark += high_wmark_pages(zone); ++ } ++ ++ /* ++ * Consider anon: if that's low too, this isn't a ++ * runaway file reclaim problem, but rather just ++ * extreme pressure. Reclaim as per usual then. ++ */ ++ anon = node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ sc->file_is_tiny = ++ file + free <= total_high_wmark && ++ !(sc->may_deactivate & DEACTIVATE_ANON) && ++ anon >> sc->priority; ++ } ++} ++ + /* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined +@@ -3032,7 +3129,6 @@ static void shrink_node(pg_data_t *pgdat + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; +- unsigned long file; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + +@@ -3048,93 +3144,7 @@ again: + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + +- /* +- * Determine the scan balance between anon and file LRUs. +- */ +- spin_lock_irq(&target_lruvec->lru_lock); +- sc->anon_cost = target_lruvec->anon_cost; +- sc->file_cost = target_lruvec->file_cost; +- spin_unlock_irq(&target_lruvec->lru_lock); +- +- /* +- * Target desirable inactive:active list ratios for the anon +- * and file LRU lists. +- */ +- if (!sc->force_deactivate) { +- unsigned long refaults; +- +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_ANON); +- if (refaults != target_lruvec->refaults[0] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) +- sc->may_deactivate |= DEACTIVATE_ANON; +- else +- sc->may_deactivate &= ~DEACTIVATE_ANON; +- +- /* +- * When refaults are being observed, it means a new +- * workingset is being established. Deactivate to get +- * rid of any stale active pages quickly. +- */ +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_FILE); +- if (refaults != target_lruvec->refaults[1] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) +- sc->may_deactivate |= DEACTIVATE_FILE; +- else +- sc->may_deactivate &= ~DEACTIVATE_FILE; +- } else +- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; +- +- /* +- * If we have plenty of inactive file pages that aren't +- * thrashing, try to reclaim those first before touching +- * anonymous pages. +- */ +- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) +- sc->cache_trim_mode = 1; +- else +- sc->cache_trim_mode = 0; +- +- /* +- * Prevent the reclaimer from falling into the cache trap: as +- * cache pages start out inactive, every cache fault will tip +- * the scan balance towards the file LRU. And as the file LRU +- * shrinks, so does the window for rotation from references. +- * This means we have a runaway feedback loop where a tiny +- * thrashing file LRU becomes infinitely more attractive than +- * anon pages. Try to detect this based on file LRU size. +- */ +- if (!cgroup_reclaim(sc)) { +- unsigned long total_high_wmark = 0; +- unsigned long free, anon; +- int z; +- +- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); +- file = node_page_state(pgdat, NR_ACTIVE_FILE) + +- node_page_state(pgdat, NR_INACTIVE_FILE); +- +- for (z = 0; z < MAX_NR_ZONES; z++) { +- struct zone *zone = &pgdat->node_zones[z]; +- if (!managed_zone(zone)) +- continue; +- +- total_high_wmark += high_wmark_pages(zone); +- } +- +- /* +- * Consider anon: if that's low too, this isn't a +- * runaway file reclaim problem, but rather just +- * extreme pressure. Reclaim as per usual then. +- */ +- anon = node_page_state(pgdat, NR_INACTIVE_ANON); +- +- sc->file_is_tiny = +- file + free <= total_high_wmark && +- !(sc->may_deactivate & DEACTIVATE_ANON) && +- anon >> sc->priority; +- } ++ prepare_scan_count(pgdat, sc); + + shrink_node_memcgs(pgdat, sc); + diff --git a/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch b/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch new file mode 100644 index 00000000000000..146f510d287fa8 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch @@ -0,0 +1,996 @@ +From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 25 Jan 2021 21:12:33 -0700 +Subject: [PATCH 04/10] mm: multigenerational lru: groundwork + +For each lruvec, evictable pages are divided into multiple +generations. The youngest generation number is stored in +lrugen->max_seq for both anon and file types as they are aged on an +equal footing. The oldest generation numbers are stored in +lrugen->min_seq[] separately for anon and file types as clean file +pages can be evicted regardless of swap constraints. These three +variables are monotonically increasing. Generation numbers are +truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into +page->flags. The sliding window technique is used to prevent truncated +generation numbers from overlapping. Each truncated generation number +is an index to +lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. + +The framework comprises two conceptually independent components: the +aging, which produces young generations, and the eviction, which +consumes old generations. Both can be invoked independently from user +space for the purpose of working set estimation and proactive reclaim. + +The protection of hot pages and the selection of cold pages are based +on page access types and patterns. There are two access types: one via +page tables and the other via file descriptors. The protection of the +former type is by design stronger because: + 1) The uncertainty in determining the access patterns of the former + type is higher due to the coalesced nature of the accessed bit. + 2) The cost of evicting the former type is higher due to the TLB + flushes required and the likelihood of involving I/O. + 3) The penalty of under-protecting the former type is higher because + applications usually do not prepare themselves for major faults like + they do for blocked I/O. For example, client applications commonly + dedicate blocked I/O to separate threads to avoid UI janks that + negatively affect user experience. + +There are also two access patterns: one with temporal locality and the +other without. The latter pattern, e.g., random and sequential, needs +to be explicitly excluded to avoid weakening the protection of the +former pattern. Generally the former type follows the former pattern +unless MADV_SEQUENTIAL is specified and the latter type follows the +latter pattern unless outlying refaults have been observed. + +Upon faulting, a page is added to the youngest generation, which +provides the strongest protection as the eviction will not consider +this page before the aging has scanned it at least twice. The first +scan clears the accessed bit set during the initial fault. And the +second scan makes sure this page has not been used since the first +scan. A page from any other generations is brought back to the +youngest generation whenever the aging finds the accessed bit set on +any of the PTEs mapping this page. + +Unmapped pages are initially added to the oldest generation and then +conditionally protected by tiers. This is done later [PATCH 07/10]. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7 +--- + fs/fuse/dev.c | 3 +- + include/linux/cgroup.h | 15 +- + include/linux/mm.h | 36 ++++ + include/linux/mm_inline.h | 182 ++++++++++++++++++++ + include/linux/mmzone.h | 70 ++++++++ + include/linux/page-flags-layout.h | 19 ++- + include/linux/page-flags.h | 4 +- + include/linux/sched.h | 3 + + kernel/bounds.c | 3 + + kernel/cgroup/cgroup-internal.h | 1 - + mm/huge_memory.c | 3 +- + mm/memcontrol.c | 1 + + mm/memory.c | 7 + + mm/mm_init.c | 6 +- + mm/page_alloc.c | 1 + + mm/swap.c | 9 +- + mm/swapfile.c | 2 + + mm/vmscan.c | 268 ++++++++++++++++++++++++++++++ + 18 files changed, 618 insertions(+), 15 deletions(-) + +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -785,7 +785,8 @@ static int fuse_check_page(struct page * + 1 << PG_active | + 1 << PG_workingset | + 1 << PG_reclaim | +- 1 << PG_waiters))) { ++ 1 << PG_waiters | ++ LRU_GEN_MASK | LRU_REFS_MASK))) { + dump_page(page, "fuse: trying to steal weird page"); + return 1; + } +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr + css_put(&cgrp->self); + } + ++extern struct mutex cgroup_mutex; ++ ++static inline void cgroup_lock(void) ++{ ++ mutex_lock(&cgroup_mutex); ++} ++ ++static inline void cgroup_unlock(void) ++{ ++ mutex_unlock(&cgroup_mutex); ++} ++ + /** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for +@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr + * as locks used during the cgroup_subsys::attach() methods. + */ + #ifdef CONFIG_PROVE_RCU +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + #define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ +@@ -707,6 +718,8 @@ struct cgroup; + static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } + static inline void css_get(struct cgroup_subsys_state *css) {} + static inline void css_put(struct cgroup_subsys_state *css) {} ++static inline void cgroup_lock(void) {} ++static inline void cgroup_unlock(void) {} + static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } + static inline int cgroupstats_build(struct cgroupstats *stats, +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v + #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) + #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) ++#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) ++#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) + + /* + * Define the bit shifts to access each section. For non-existent +@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s + loff_t const holebegin, loff_t const holelen, int even_cows) { } + #endif + ++#ifdef CONFIG_LRU_GEN ++static inline void task_enter_nonseq_fault(void) ++{ ++ WARN_ON(current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 1; ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++ WARN_ON(!current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 0; ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return current->in_nonseq_fault; ++} ++#else ++static inline void task_enter_nonseq_fault(void) ++{ ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return false; ++} ++#endif /* CONFIG_LRU_GEN */ ++ + static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) + { +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag + return lru; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static inline bool lru_gen_enabled(void) ++{ ++#ifdef CONFIG_LRU_GEN_ENABLED ++ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); ++ ++ return static_branch_likely(&lru_gen_static_key); ++#else ++ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); ++ ++ return static_branch_unlikely(&lru_gen_static_key); ++#endif ++} ++ ++/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */ ++static inline int lru_gen_from_seq(unsigned long seq) ++{ ++ return seq % MAX_NR_GENS; ++} ++ ++/* The youngest and the second youngest generations are counted as active. */ ++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) ++{ ++ unsigned long max_seq = lruvec->evictable.max_seq; ++ ++ VM_BUG_ON(gen >= MAX_NR_GENS); ++ ++ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); ++} ++ ++/* Update the sizes of the multigenerational lru lists. */ ++static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, ++ int old_gen, int new_gen) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ enum lru_list lru = type * LRU_FILE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lockdep_assert_held(&lruvec->lru_lock); ++ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); ++ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); ++ VM_BUG_ON(old_gen == -1 && new_gen == -1); ++ ++ if (old_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[old_gen][type][zone], ++ lrugen->sizes[old_gen][type][zone] - delta); ++ if (new_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[new_gen][type][zone], ++ lrugen->sizes[new_gen][type][zone] + delta); ++ ++ if (old_gen < 0) { ++ if (lru_gen_is_active(lruvec, new_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, delta); ++ return; ++ } ++ ++ if (new_gen < 0) { ++ if (lru_gen_is_active(lruvec, old_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, -delta); ++ return; ++ } ++ ++ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { ++ update_lru_size(lruvec, lru, zone, -delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ ++ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); ++} ++ ++/* Add a page to one of the multigenerational lru lists. Return true on success. */ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ if (PageUnevictable(page) || !lrugen->enabled[type]) ++ return false; ++ /* ++ * If a page shouldn't be considered for eviction, i.e., a page mapped ++ * upon fault during which the accessed bit is set, add it to the ++ * youngest generation. ++ * ++ * If a page can't be evicted immediately, i.e., an anon page not in ++ * swap cache or a dirty page pending writeback, add it to the second ++ * oldest generation. ++ * ++ * If a page could be evicted immediately, e.g., a clean page, add it to ++ * the oldest generation. ++ */ ++ if (PageActive(page)) ++ gen = lru_gen_from_seq(lrugen->max_seq); ++ else if ((!type && !PageSwapCache(page)) || ++ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))) ++ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); ++ else ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page); ++ ++ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, -1, gen); ++ /* for rotate_reclaimable_page() */ ++ if (reclaiming) ++ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ else ++ list_add(&page->lru, &lrugen->lists[gen][type][zone]); ++ ++ return true; ++} ++ ++/* Delete a page from one of the multigenerational lru lists. Return true on success. */ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ if (!(new_flags & LRU_GEN_MASK)) ++ return false; ++ ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ ++ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ ++ new_flags &= ~LRU_GEN_MASK; ++ /* for shrink_page_list() */ ++ if (reclaiming) ++ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); ++ else if (lru_gen_is_active(lruvec, gen)) ++ new_flags |= BIT(PG_active); ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, gen, -1); ++ list_del(&page->lru); ++ ++ return true; ++} ++ ++#else ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static __always_inline void add_page_to_lru_list(struct page *page, + struct lruvec *lruvec) + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, false)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add(&page->lru, &lruvec->lists[lru]); + } +@@ -93,6 +269,9 @@ static __always_inline void add_page_to_ + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, true)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); + } +@@ -100,6 +279,9 @@ static __always_inline void add_page_to_ + static __always_inline void del_page_from_lru_list(struct page *page, + struct lruvec *lruvec) + { ++ if (lru_gen_del_page(page, lruvec, false)) ++ return; ++ + list_del(&page->lru); + update_lru_size(lruvec, page_lru(page), page_zonenum(page), + -thp_nr_pages(page)); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -294,6 +294,72 @@ enum lruvec_flags { + */ + }; + ++struct lruvec; ++ ++#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) ++#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) ++ ++#ifdef CONFIG_LRU_GEN ++ ++/* ++ * For each lruvec, evictable pages are divided into multiple generations. The ++ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are ++ * monotonically increasing. The sliding window technique is used to track at ++ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the ++ * window, AKA gen, indexes an array of per-type and per-zone lists for the ++ * corresponding generation. The counter in page->flags stores gen+1 while a ++ * page is on one of the multigenerational lru lists. Otherwise, it stores 0. ++ * ++ * After a page is faulted in, the aging must check the accessed bit at least ++ * twice before the eviction would consider it. The first check clears the ++ * accessed bit set during the initial fault. The second check makes sure this ++ * page hasn't been used since then. ++ */ ++#define MIN_NR_GENS 2 ++#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) ++ ++struct lrugen { ++ /* the aging increments the max generation number */ ++ unsigned long max_seq; ++ /* the eviction increments the min generation numbers */ ++ unsigned long min_seq[ANON_AND_FILE]; ++ /* the birth time of each generation in jiffies */ ++ unsigned long timestamps[MAX_NR_GENS]; ++ /* the multigenerational lru lists */ ++ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the sizes of the multigenerational lru lists in pages */ ++ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* whether the multigenerational lru is enabled */ ++ bool enabled[ANON_AND_FILE]; ++}; ++ ++#define MAX_BATCH_SIZE 8192 ++ ++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); ++void lru_gen_change_state(bool enable, bool main, bool swap); ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg); ++#endif ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) ++{ ++} ++ ++static inline void lru_gen_change_state(bool enable, bool main, bool swap) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++} ++#endif ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct lruvec { + struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ +@@ -311,6 +377,10 @@ struct lruvec { + unsigned long refaults[ANON_AND_FILE]; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; ++#ifdef CONFIG_LRU_GEN ++ /* unevictable pages are on LRU_UNEVICTABLE */ ++ struct lrugen evictable; ++#endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; + #endif +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -26,6 +26,14 @@ + + #define ZONES_WIDTH ZONES_SHIFT + ++#ifdef CONFIG_LRU_GEN ++/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ ++#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) ++#else ++#define LRU_GEN_WIDTH 0 ++#define LRU_REFS_WIDTH 0 ++#endif /* CONFIG_LRU_GEN */ ++ + #ifdef CONFIG_SPARSEMEM + #include + #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) +@@ -55,7 +63,8 @@ + #define SECTIONS_WIDTH 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ ++ <= BITS_PER_LONG - NR_PAGEFLAGS + #define NODES_WIDTH NODES_SHIFT + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) + #error "Vmemmap: No space for nodes field in page flags" +@@ -89,8 +98,8 @@ + #define LAST_CPUPID_SHIFT 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ +- <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS + #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT + #else + #define LAST_CPUPID_WIDTH 0 +@@ -100,8 +109,8 @@ + #define LAST_CPUPID_NOT_IN_PAGE_FLAGS + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ +- > BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS + #error "Not enough bits in page flags" + #endif + +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_active | \ +- 1UL << PG_unevictable | __PG_MLOCKED) ++ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) + + /* + * Flags checked when a page is prepped for return by the page allocator. +@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall + * alloc-free cycle to prevent from reusing the page. + */ + #define PAGE_FLAGS_CHECK_AT_PREP \ +- (PAGEFLAGS_MASK & ~__PG_HWPOISON) ++ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) + + #define PAGE_FLAGS_PRIVATE \ + (1UL << PG_private | 1UL << PG_private_2) +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -911,6 +911,9 @@ struct task_struct { + #ifdef CONFIG_MEMCG + unsigned in_user_fault:1; + #endif ++#ifdef CONFIG_LRU_GEN ++ unsigned in_nonseq_fault:1; ++#endif + #ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; + #endif +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -22,6 +22,9 @@ int main(void) + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + #endif + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); ++#ifdef CONFIG_LRU_GEN ++ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); ++#endif + /* End of constants */ + + return 0; +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -165,7 +165,6 @@ struct cgroup_mgctx { + #define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + extern struct cgroup_subsys *cgroup_subsys[]; + extern struct list_head cgroup_roots; +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc + #ifdef CONFIG_64BIT + (1L << PG_arch_2) | + #endif +- (1L << PG_dirty))); ++ (1L << PG_dirty) | ++ LRU_GEN_MASK | LRU_REFS_MASK)); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all + memcg->deferred_split_queue.split_queue_len = 0; + #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ lru_gen_init_memcg(memcg); + return memcg; + fail: + mem_cgroup_id_remove(memcg); +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are + unsigned int flags, struct pt_regs *regs) + { + vm_fault_t ret; ++ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); + + __set_current_state(TASK_RUNNING); + +@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are + if (flags & FAULT_FLAG_USER) + mem_cgroup_enter_user_fault(); + ++ if (nonseq_fault) ++ task_enter_nonseq_fault(); ++ + if (unlikely(is_vm_hugetlb_page(vma))) + ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + else + ret = __handle_mm_fault(vma, address, flags); + ++ if (nonseq_fault) ++ task_exit_nonseq_fault(); ++ + if (flags & FAULT_FLAG_USER) { + mem_cgroup_exit_user_fault(); + /* +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH +- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; ++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", +- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", ++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, ++ LRU_GEN_WIDTH, ++ LRU_REFS_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7411,6 +7411,7 @@ static void __meminit pgdat_init_interna + + pgdat_page_ext_init(pgdat); + lruvec_init(&pgdat->__lruvec); ++ lru_gen_init_state(NULL, &pgdat->__lruvec); + } + + static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page) + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + ++ /* see the comment in lru_gen_add_page() */ ++ if (lru_gen_enabled() && !PageUnevictable(page) && ++ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) ++ SetPageActive(page); ++ + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); +@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc + + static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) + { +- if (PageActive(page) && !PageUnevictable(page)) { ++ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + int nr_pages = thp_nr_pages(page); + + del_page_from_lru_list(page, lruvec); +@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p + */ + void deactivate_page(struct page *page) + { +- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { ++ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_change_state(false, false, true); + + out_dput: + filp_close(victim, NULL); +@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use + mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_change_state(true, false, true); + + error = 0; + goto out; +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg + return can_demote(pgdat->node_id, sc); + } + ++#ifdef CONFIG_LRU_GEN ++ ++/****************************************************************************** ++ * shorthand helpers ++ ******************************************************************************/ ++ ++#define for_each_gen_type_zone(gen, type, zone) \ ++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++static int page_lru_gen(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) ++{ ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) { ++ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; ++ ++ if (lruvec->pgdat != pgdat) ++ lruvec->pgdat = pgdat; ++ ++ return lruvec; ++ } ++#endif ++ return pgdat ? &pgdat->__lruvec : NULL; ++} ++ ++static int get_nr_gens(struct lruvec *lruvec, int type) ++{ ++ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; ++} ++ ++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) ++{ ++ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) && ++ get_nr_gens(lruvec, 0) <= MAX_NR_GENS; ++} ++ ++/****************************************************************************** ++ * state change ++ ******************************************************************************/ ++ ++#ifdef CONFIG_LRU_GEN_ENABLED ++DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); ++#else ++DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); ++#endif ++ ++static int lru_gen_nr_swapfiles; ++ ++static bool __maybe_unused state_is_valid(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ enum lru_list lru; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for_each_evictable_lru(lru) { ++ type = is_file_lru(lru); ++ ++ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) ++ return false; ++ } ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) ++ return false; ++ ++ /* unlikely but not a bug when reset_batch_size() is pending */ ++ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); ++ } ++ ++ return true; ++} ++ ++static bool fill_lists(struct lruvec *lruvec) ++{ ++ enum lru_list lru; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_evictable_lru(lru) { ++ int type = is_file_lru(lru); ++ bool active = is_active_lru(lru); ++ struct list_head *head = &lruvec->lists[lru]; ++ ++ if (!lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page) != active, page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ del_page_from_lru_list(page, lruvec); ++ success = lru_gen_add_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool drain_lists(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; ++ ++ if (lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ success = lru_gen_del_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ add_page_to_lru_list(page, lruvec); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++/* ++ * For file page tracking, we enable/disable it according to the main switch. ++ * For anon page tracking, we only enabled it when the main switch is on and ++ * there is at least one swapfile; we disable it when there are no swapfiles ++ * regardless of the value of the main switch. Otherwise, we will eventually ++ * reach the max size of the sliding window and have to call inc_min_seq(). ++ */ ++void lru_gen_change_state(bool enable, bool main, bool swap) ++{ ++ static DEFINE_MUTEX(state_mutex); ++ ++ struct mem_cgroup *memcg; ++ ++ mem_hotplug_begin(); ++ cgroup_lock(); ++ mutex_lock(&state_mutex); ++ ++ if (swap) { ++ if (enable) ++ swap = !lru_gen_nr_swapfiles++; ++ else ++ swap = !--lru_gen_nr_swapfiles; ++ } ++ ++ if (main && enable != lru_gen_enabled()) { ++ if (enable) ++ static_branch_enable(&lru_gen_static_key); ++ else ++ static_branch_disable(&lru_gen_static_key); ++ } else if (!swap || !lru_gen_enabled()) ++ goto unlock; ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ VM_BUG_ON(!state_is_valid(lruvec)); ++ ++ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lruvec->evictable.enabled[1] = lru_gen_enabled(); ++ ++ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++unlock: ++ mutex_unlock(&state_mutex); ++ cgroup_unlock(); ++ mem_hotplug_done(); ++} ++ ++/****************************************************************************** ++ * initialization ++ ******************************************************************************/ ++ ++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) ++{ ++ int i; ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lrugen->max_seq = MIN_NR_GENS + 1; ++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lrugen->enabled[1] = lru_gen_enabled(); ++ ++ for (i = 0; i <= MIN_NR_GENS + 1; i++) ++ lrugen->timestamps[i] = jiffies; ++ ++ for_each_gen_type_zone(gen, type, zone) ++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ lru_gen_init_state(memcg, lruvec); ++ } ++} ++#endif ++ ++static int __init init_lru_gen(void) ++{ ++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); ++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ ++ return 0; ++}; ++late_initcall(init_lru_gen); ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + unsigned long nr[NR_LRU_LISTS]; diff --git a/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch b/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch new file mode 100644 index 00000000000000..75fd39d99d4c54 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch @@ -0,0 +1,760 @@ +From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 5 Apr 2021 04:17:41 -0600 +Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list + +To scan PTEs for accessed pages, a mm_struct list is maintained for +each memcg. When multiple threads traverse the same memcg->mm_list, +each of them gets a unique mm_struct and therefore they can run +walk_page_range() concurrently to reach page tables of all processes +of this memcg. + +This infrastructure also provides the following optimizations: + 1) it allows walkers to skip processes that have been sleeping since + the last walk by tracking the usage of mm_struct between context + switches. + 2) it allows walkers to add interesting items they find during a + walk to a Bloom filter so that they can skip uninteresting items + during the next walk by testing whether an item is in this Bloom + filter. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8 +--- + fs/exec.c | 2 + + include/linux/memcontrol.h | 4 + + include/linux/mm_inline.h | 6 + + include/linux/mm_types.h | 75 +++++++++ + include/linux/mmzone.h | 63 +++++++ + kernel/exit.c | 1 + + kernel/fork.c | 9 + + kernel/sched/core.c | 1 + + mm/memcontrol.c | 25 +++ + mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++ + 10 files changed, 517 insertions(+) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m + active_mm = tsk->active_mm; + tsk->active_mm = mm; + tsk->mm = mm; ++ lru_gen_add_mm(mm); + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems for +@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m + if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + activate_mm(active_mm, mm); ++ lru_gen_activate_mm(mm); + if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + tsk->mm->vmacache_seqnum = 0; +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -348,6 +348,10 @@ struct mem_cgroup { + struct deferred_split deferred_split_queue; + #endif + ++#ifdef CONFIG_LRU_GEN ++ struct lru_gen_mm_list mm_list; ++#endif ++ + struct mem_cgroup_per_node *nodeinfo[]; + }; + +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig + return seq % MAX_NR_GENS; + } + ++/* Return a proper index regardless whether we keep stats for historical generations. */ ++static inline int lru_hist_from_seq(unsigned long seq) ++{ ++ return seq % NR_HIST_GENS; ++} ++ + /* The youngest and the second youngest generations are counted as active. */ + static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) + { +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -3,6 +3,7 @@ + #define _LINUX_MM_TYPES_H + + #include ++#include + + #include + #include +@@ -15,6 +16,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -580,6 +583,18 @@ struct mm_struct { + #ifdef CONFIG_IOMMU_SUPPORT + u32 pasid; + #endif ++#ifdef CONFIG_LRU_GEN ++ struct { ++ /* the node of a global or per-memcg mm_struct list */ ++ struct list_head list; ++#ifdef CONFIG_MEMCG ++ /* points to the memcg of the owner task above */ ++ struct mem_cgroup *memcg; ++#endif ++ /* whether this mm_struct has been used since the last walk */ ++ nodemask_t nodes; ++ } lrugen; ++#endif /* CONFIG_LRU_GEN */ + } __randomize_layout; + + /* +@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru + return (struct cpumask *)&mm->cpu_bitmap; + } + ++#ifdef CONFIG_LRU_GEN ++ ++struct lru_gen_mm_list { ++ /* a global or per-memcg mm_struct list */ ++ struct list_head fifo; ++ /* protects the list above */ ++ spinlock_t lock; ++}; ++ ++void lru_gen_add_mm(struct mm_struct *mm); ++void lru_gen_del_mm(struct mm_struct *mm); ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm); ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++ INIT_LIST_HEAD(&mm->lrugen.list); ++#ifdef CONFIG_MEMCG ++ mm->lrugen.memcg = NULL; ++#endif ++ nodes_clear(mm->lrugen.nodes); ++} ++ ++/* Track the usage of each mm_struct so that we can skip inactive ones. */ ++static inline void lru_gen_activate_mm(struct mm_struct *mm) ++{ ++ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ ++ VM_WARN_ON(list_empty(&mm->lrugen.list)); ++ ++ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes)) ++ nodes_setall(mm->lrugen.nodes); ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_add_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_del_mm(struct mm_struct *mm) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++} ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_activate_mm(struct mm_struct *mm) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -318,6 +318,13 @@ struct lruvec; + #define MIN_NR_GENS 2 + #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) + ++/* Whether to keep stats for historical generations. */ ++#ifdef CONFIG_LRU_GEN_STATS ++#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) ++#else ++#define NR_HIST_GENS 1U ++#endif ++ + struct lrugen { + /* the aging increments the max generation number */ + unsigned long max_seq; +@@ -333,13 +340,63 @@ struct lrugen { + bool enabled[ANON_AND_FILE]; + }; + ++enum { ++ MM_LEAF_TOTAL, /* total leaf entries */ ++ MM_LEAF_OLD, /* old leaf entries */ ++ MM_LEAF_YOUNG, /* young leaf entries */ ++ MM_NONLEAF_TOTAL, /* total non-leaf entries */ ++ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */ ++ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */ ++ NR_MM_STATS ++}; ++ ++/* mnemonic codes for the stats above */ ++#define MM_STAT_CODES "toydpc" ++ ++/* double buffering bloom filters */ ++#define NR_BLOOM_FILTERS 2 ++ ++struct lru_gen_mm_walk { ++ /* set to max_seq after each round of walk */ ++ unsigned long seq; ++ /* the next mm_struct on the list to walk */ ++ struct list_head *head; ++ /* the first mm_struct never walked before */ ++ struct list_head *tail; ++ /* to wait for the last walker to finish */ ++ struct wait_queue_head wait; ++ /* bloom filters flip after each round of walk */ ++ unsigned long *filters[NR_BLOOM_FILTERS]; ++ /* page table stats for debugging */ ++ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; ++ /* the number of concurrent walkers */ ++ int nr_walkers; ++}; ++ ++#define MIN_BATCH_SIZE 64 + #define MAX_BATCH_SIZE 8192 + ++struct mm_walk_args { ++ struct mem_cgroup *memcg; ++ unsigned long max_seq; ++ unsigned long start_pfn; ++ unsigned long end_pfn; ++ unsigned long next_addr; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)]; ++ int node_id; ++ int swappiness; ++ int batch_size; ++ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ int mm_stats[NR_MM_STATS]; ++ bool use_filter; ++}; ++ + void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); + void lru_gen_change_state(bool enable, bool main, bool swap); + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg); ++void lru_gen_free_memcg(struct mem_cgroup *memcg); + #endif + + #else /* !CONFIG_LRU_GEN */ +@@ -356,6 +413,10 @@ static inline void lru_gen_change_state( + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { + } ++ ++static inline void lru_gen_free_memcg(struct mem_cgroup *memcg) ++{ ++} + #endif + + #endif /* CONFIG_LRU_GEN */ +@@ -380,6 +441,8 @@ struct lruvec { + #ifdef CONFIG_LRU_GEN + /* unevictable pages are on LRU_UNEVICTABLE */ + struct lrugen evictable; ++ /* state for mm list and page table walks */ ++ struct lru_gen_mm_walk mm_walk; + #endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -422,6 +422,7 @@ assign_new_owner: + goto retry; + } + WRITE_ONCE(mm->owner, c); ++ lru_gen_migrate_mm(mm); + task_unlock(c); + put_task_struct(c); + } +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct + goto fail_nocontext; + + mm->user_ns = get_user_ns(user_ns); ++ lru_gen_init_mm(mm); + return mm; + + fail_nocontext: +@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str + } + if (mm->binfmt) + module_put(mm->binfmt->module); ++ lru_gen_del_mm(mm); + mmdrop(mm); + } + +@@ -2616,6 +2618,13 @@ pid_t kernel_clone(struct kernel_clone_a + get_task_struct(p); + } + ++ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { ++ /* lock the task to synchronize with memcg migration */ ++ task_lock(p); ++ lru_gen_add_mm(p->mm); ++ task_unlock(p); ++ } ++ + wake_up_new_task(p); + + /* forking complete and child started to run, tell ptracer */ +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ lru_gen_activate_mm(next->mm); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem + + static void mem_cgroup_free(struct mem_cgroup *memcg) + { ++ lru_gen_free_memcg(memcg); + memcg_wb_domain_exit(memcg); + __mem_cgroup_free(memcg); + } +@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void) + } + #endif + ++#ifdef CONFIG_LRU_GEN ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++ struct cgroup_subsys_state *css; ++ struct task_struct *task = NULL; ++ ++ cgroup_taskset_for_each_leader(task, css, tset) ++ break; ++ ++ if (!task) ++ return; ++ ++ task_lock(task); ++ if (task->mm && task->mm->owner == task) ++ lru_gen_migrate_mm(task->mm); ++ task_unlock(task); ++} ++#else ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) + { + if (value == PAGE_COUNTER_MAX) +@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys + .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, + .can_attach = mem_cgroup_can_attach, ++ .attach = mem_cgroup_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .post_attach = mem_cgroup_move_task, + .dfl_cftypes = memory_files, +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid( + } + + /****************************************************************************** ++ * mm_struct list ++ ******************************************************************************/ ++ ++static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) ++{ ++ static struct lru_gen_mm_list mm_list = { ++ .fifo = LIST_HEAD_INIT(mm_list.fifo), ++ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), ++ }; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ return &memcg->mm_list; ++#endif ++ return &mm_list; ++} ++ ++void lru_gen_add_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ ++ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm); ++#ifdef CONFIG_MEMCG ++ VM_BUG_ON_MM(mm->lrugen.memcg, mm); ++ mm->lrugen.memcg = memcg; ++#endif ++ spin_lock(&mm_list->lock); ++ ++ list_add_tail(&mm->lrugen.list, &mm_list->fifo); ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ if (lruvec->mm_walk.tail == &mm_list->fifo) ++ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev; ++ } ++ ++ spin_unlock(&mm_list->lock); ++} ++ ++void lru_gen_del_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct lru_gen_mm_list *mm_list; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (list_empty(&mm->lrugen.list)) ++ return; ++ ++#ifdef CONFIG_MEMCG ++ memcg = mm->lrugen.memcg; ++#endif ++ mm_list = get_mm_list(memcg); ++ ++ spin_lock(&mm_list->lock); ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ if (lruvec->mm_walk.tail == &mm->lrugen.list) ++ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next; ++ ++ if (lruvec->mm_walk.head != &mm->lrugen.list) ++ continue; ++ ++ lruvec->mm_walk.head = lruvec->mm_walk.head->next; ++ if (lruvec->mm_walk.head == &mm_list->fifo) ++ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1); ++ } ++ ++ list_del_init(&mm->lrugen.list); ++ ++ spin_unlock(&mm_list->lock); ++ ++#ifdef CONFIG_MEMCG ++ mem_cgroup_put(mm->lrugen.memcg); ++ mm->lrugen.memcg = NULL; ++#endif ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++ struct mem_cgroup *memcg; ++ ++ lockdep_assert_held(&mm->owner->alloc_lock); ++ ++ if (mem_cgroup_disabled()) ++ return; ++ ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_task(mm->owner); ++ rcu_read_unlock(); ++ if (memcg == mm->lrugen.memcg) ++ return; ++ ++ VM_BUG_ON_MM(!mm->lrugen.memcg, mm); ++ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm); ++ ++ lru_gen_del_mm(mm); ++ lru_gen_add_mm(mm); ++} ++#endif ++ ++#define BLOOM_FILTER_SHIFT 15 ++ ++static inline int filter_gen_from_seq(unsigned long seq) ++{ ++ return seq % NR_BLOOM_FILTERS; ++} ++ ++static void get_item_key(void *item, int *key) ++{ ++ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); ++ ++ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); ++ ++ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); ++ key[1] = hash >> BLOOM_FILTER_SHIFT; ++} ++ ++static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq) ++{ ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); ++ ++ filter = lruvec->mm_walk.filters[gen]; ++ if (filter) { ++ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); ++ return; ++ } ++ ++ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC); ++ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter); ++} ++ ++static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); ++ if (!filter) ++ return; ++ ++ get_item_key(item, key); ++ ++ if (!test_bit(key[0], filter)) ++ set_bit(key[0], filter); ++ if (!test_bit(key[1], filter)) ++ set_bit(key[1], filter); ++} ++ ++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); ++ if (!filter) ++ return false; ++ ++ get_item_key(item, key); ++ ++ return test_bit(key[0], filter) && test_bit(key[1], filter); ++} ++ ++static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args) ++{ ++ int i; ++ int hist = lru_hist_from_seq(args->max_seq); ++ ++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); ++ ++ for (i = 0; i < NR_MM_STATS; i++) { ++ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], ++ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]); ++ args->mm_stats[i] = 0; ++ } ++ ++ if (!last || NR_HIST_GENS == 1) ++ return; ++ ++ hist = lru_hist_from_seq(args->max_seq + 1); ++ for (i = 0; i < NR_MM_STATS; i++) ++ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0); ++} ++ ++static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args) ++{ ++ int type; ++ unsigned long size = 0; ++ ++ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes)) ++ return true; ++ ++ if (mm_is_oom_victim(mm)) ++ return true; ++ ++ for (type = !args->swappiness; type < ANON_AND_FILE; type++) { ++ size += type ? get_mm_counter(mm, MM_FILEPAGES) : ++ get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ } ++ ++ if (size < MIN_BATCH_SIZE) ++ return true; ++ ++ if (!mmget_not_zero(mm)) ++ return true; ++ ++ node_clear(args->node_id, mm->lrugen.nodes); ++ ++ return false; ++} ++ ++/* To support multiple walkers that concurrently walk an mm_struct list. */ ++static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args, ++ struct mm_struct **iter) ++{ ++ bool first = false; ++ bool last = true; ++ struct mm_struct *mm = NULL; ++ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk; ++ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg); ++ ++ if (*iter) ++ mmput_async(*iter); ++ else if (args->max_seq <= READ_ONCE(mm_walk->seq)) ++ return false; ++ ++ spin_lock(&mm_list->lock); ++ ++ VM_BUG_ON(args->max_seq > mm_walk->seq + 1); ++ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq); ++ VM_BUG_ON(*iter && !mm_walk->nr_walkers); ++ ++ if (args->max_seq <= mm_walk->seq) { ++ if (!*iter) ++ last = false; ++ goto done; ++ } ++ ++ if (mm_walk->head == &mm_list->fifo) { ++ VM_BUG_ON(mm_walk->nr_walkers); ++ mm_walk->head = mm_walk->head->next; ++ first = true; ++ } ++ ++ while (!mm && mm_walk->head != &mm_list->fifo) { ++ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list); ++ ++ mm_walk->head = mm_walk->head->next; ++ ++ if (mm_walk->tail == &mm->lrugen.list) { ++ mm_walk->tail = mm_walk->tail->next; ++ args->use_filter = false; ++ } ++ ++ if (should_skip_mm(mm, args)) ++ mm = NULL; ++ } ++ ++ if (mm_walk->head == &mm_list->fifo) ++ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1); ++done: ++ if (*iter && !mm) ++ mm_walk->nr_walkers--; ++ if (!*iter && mm) ++ mm_walk->nr_walkers++; ++ ++ if (mm_walk->nr_walkers) ++ last = false; ++ ++ if (mm && first) ++ clear_bloom_filter(lruvec, args->max_seq + 1); ++ ++ if (*iter || last) ++ reset_mm_stats(lruvec, last, args); ++ ++ spin_unlock(&mm_list->lock); ++ ++ *iter = mm; ++ ++ return last; ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou + int i; + int gen, type, zone; + struct lrugen *lrugen = &lruvec->evictable; ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; +@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++ ++ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) ++ spin_lock(&mm_list->lock); ++ ++ lruvec->mm_walk.seq = MIN_NR_GENS; ++ lruvec->mm_walk.head = &mm_list->fifo; ++ lruvec->mm_walk.tail = &mm_list->fifo; ++ init_waitqueue_head(&lruvec->mm_walk.wait); ++ ++ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) ++ spin_unlock(&mm_list->lock); + } + + #ifdef CONFIG_MEMCG +@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou + { + int nid; + ++ INIT_LIST_HEAD(&memcg->mm_list.fifo); ++ spin_lock_init(&memcg->mm_list.lock); ++ + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(nid, memcg); + + lru_gen_init_state(memcg, lruvec); + } + } ++ ++void lru_gen_free_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ int i; ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ for (i = 0; i < NR_BLOOM_FILTERS; i++) { ++ bitmap_free(lruvec->mm_walk.filters[i]); ++ lruvec->mm_walk.filters[i] = NULL; ++ } ++ } ++} + #endif + + static int __init init_lru_gen(void) + { + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); + + return 0; + }; diff --git a/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch b/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch new file mode 100644 index 00000000000000..e5622ccbbe6d2a --- /dev/null +++ b/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch @@ -0,0 +1,1176 @@ +From 8217cd2238c40cf77208aa27a7cc09879e685890 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 5 Apr 2021 04:35:07 -0600 +Subject: [PATCH 06/10] mm: multigenerational lru: aging + +The aging produces young generations. Given an lruvec, the aging +traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan +PTEs for accessed pages. Upon finding one, the aging updates its +generation number to max_seq (modulo MAX_NR_GENS). After each round of +traversal, the aging increments max_seq. The aging is due when +min_seq[] reaches max_seq-1. + +The aging uses the following optimizations when walking page tables: + 1) It skips non-leaf PMD entries that have the accessed bit cleared + when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. + 2) It does not zigzag between a PGD table and the same PMD or PTE + table spanning multiple VMAs. In other words, it finishes all the + VMAs within the range of the same PMD or PTE table before it returns + to this PGD table. This optimizes workloads that have large numbers + of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I3ae8abc3100d023cecb3a699d86020ae6fc10a45 +--- + include/linux/memcontrol.h | 3 + + include/linux/mmzone.h | 9 + + include/linux/oom.h | 16 + + include/linux/swap.h | 3 + + mm/memcontrol.c | 5 + + mm/oom_kill.c | 4 +- + mm/rmap.c | 8 + + mm/vmscan.c | 948 +++++++++++++++++++++++++++++++++++++ + 8 files changed, 994 insertions(+), 2 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -1367,10 +1367,13 @@ mem_cgroup_print_oom_meminfo(struct mem_ + + static inline void lock_page_memcg(struct page *page) + { ++ /* to match page_memcg_rcu() */ ++ rcu_read_lock(); + } + + static inline void unlock_page_memcg(struct page *page) + { ++ rcu_read_unlock(); + } + + static inline void mem_cgroup_handle_over_high(void) +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -295,6 +295,7 @@ enum lruvec_flags { + }; + + struct lruvec; ++struct page_vma_mapped_walk; + + #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) + #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +@@ -393,6 +394,7 @@ struct mm_walk_args { + + void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); + void lru_gen_change_state(bool enable, bool main, bool swap); ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg); +@@ -409,6 +411,10 @@ static inline void lru_gen_change_state( + { + } + ++static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++} ++ + #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { +@@ -1028,6 +1034,9 @@ typedef struct pglist_data { + + unsigned long flags; + ++#ifdef CONFIG_LRU_GEN ++ struct mm_walk_args mm_walk_args; ++#endif + ZONE_PADDING(_pad2_) + + /* Per-node vmstats */ +--- a/include/linux/oom.h ++++ b/include/linux/oom.h +@@ -57,6 +57,22 @@ struct oom_control { + extern struct mutex oom_lock; + extern struct mutex oom_adj_mutex; + ++#ifdef CONFIG_MMU ++extern struct task_struct *oom_reaper_list; ++extern struct wait_queue_head oom_reaper_wait; ++ ++static inline bool oom_reaping_in_progress(void) ++{ ++ /* racy check to see if oom reaping could be in progress */ ++ return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait); ++} ++#else ++static inline bool oom_reaping_in_progress(void) ++{ ++ return false; ++} ++#endif ++ + static inline void set_current_oom_origin(void) + { + current->signal->oom_flag_origin = true; +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -137,6 +137,9 @@ union swap_header { + */ + struct reclaim_state { + unsigned long reclaimed_slab; ++#ifdef CONFIG_LRU_GEN ++ struct mm_walk_args *mm_walk_args; ++#endif + }; + + #ifdef __KERNEL__ +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -1304,12 +1304,17 @@ void mem_cgroup_update_lru_size(struct l + *lru_size += nr_pages; + + size = *lru_size; ++#ifdef CONFIG_LRU_GEN ++ /* unlikely but not a bug when reset_batch_size() is pending */ ++ VM_WARN_ON(size + MAX_BATCH_SIZE < 0); ++#else + if (WARN_ONCE(size < 0, + "%s(%p, %d, %d): lru_size %ld\n", + __func__, lruvec, lru, nr_pages, size)) { + VM_BUG_ON(1); + *lru_size = 0; + } ++#endif + + if (nr_pages > 0) + *lru_size += nr_pages; +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struc + * victim (if that is possible) to help the OOM killer to move on. + */ + static struct task_struct *oom_reaper_th; +-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +-static struct task_struct *oom_reaper_list; ++DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); ++struct task_struct *oom_reaper_list; + static DEFINE_SPINLOCK(oom_reaper_lock); + + bool __oom_reap_task_mm(struct mm_struct *mm) +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -73,6 +73,7 @@ + #include + #include + #include ++#include + + #include + +@@ -790,6 +791,13 @@ static bool page_referenced_one(struct p + } + + if (pvmw.pte) { ++ /* the multigenerational lru exploits the spatial locality */ ++ if (lru_gen_enabled() && pte_young(*pvmw.pte) && ++ !(vma->vm_flags & VM_SEQ_READ)) { ++ lru_gen_look_around(&pvmw); ++ referenced++; ++ } ++ + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -51,6 +51,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -2887,6 +2889,15 @@ static bool can_age_anon_pages(struct pg + * shorthand helpers + ******************************************************************************/ + ++#define DEFINE_MAX_SEQ(lruvec) \ ++ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq) ++ ++#define DEFINE_MIN_SEQ(lruvec) \ ++ unsigned long min_seq[ANON_AND_FILE] = { \ ++ READ_ONCE((lruvec)->evictable.min_seq[0]), \ ++ READ_ONCE((lruvec)->evictable.min_seq[1]), \ ++ } ++ + #define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ +@@ -2899,6 +2910,12 @@ static int page_lru_gen(struct page *pag + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + } + ++static int get_swappiness(struct mem_cgroup *memcg) ++{ ++ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? ++ mem_cgroup_swappiness(memcg) : 0; ++} ++ + static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) + { + struct pglist_data *pgdat = NODE_DATA(nid); +@@ -3229,6 +3246,926 @@ done: + } + + /****************************************************************************** ++ * the aging ++ ******************************************************************************/ ++ ++static int page_update_gen(struct page *page, int gen) ++{ ++ unsigned long old_flags, new_flags; ++ ++ VM_BUG_ON(gen >= MAX_NR_GENS); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ ++ if (!(new_flags & LRU_GEN_MASK)) { ++ new_flags |= BIT(PG_referenced); ++ continue; ++ } ++ ++ new_flags &= ~LRU_GEN_MASK; ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int old_gen, new_gen; ++ unsigned long old_flags, new_flags; ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ old_gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page); ++ ++ new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ /* page_update_gen() has updated this page? */ ++ if (new_gen >= 0 && new_gen != old_gen) { ++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ return; ++ } ++ ++ new_gen = (old_gen + 1) % MAX_NR_GENS; ++ ++ new_flags &= ~LRU_GEN_MASK; ++ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ /* for end_page_writeback() */ ++ if (reclaiming) ++ new_flags |= BIT(PG_reclaim); ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, old_gen, new_gen); ++ if (reclaiming) ++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ else ++ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); ++} ++ ++static void update_batch_size(struct page *page, int old_gen, int new_gen, ++ struct mm_walk_args *args) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ ++ VM_BUG_ON(old_gen >= MAX_NR_GENS); ++ VM_BUG_ON(new_gen >= MAX_NR_GENS); ++ ++ args->batch_size++; ++ ++ args->nr_pages[old_gen][type][zone] -= delta; ++ args->nr_pages[new_gen][type][zone] += delta; ++} ++ ++static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args) ++{ ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ args->batch_size = 0; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ enum lru_list lru = type * LRU_FILE; ++ int delta = args->nr_pages[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ args->nr_pages[gen][type][zone] = 0; ++ WRITE_ONCE(lrugen->sizes[gen][type][zone], ++ lrugen->sizes[gen][type][zone] + delta); ++ ++ if (lru_gen_is_active(lruvec, gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, delta); ++ } ++} ++ ++static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk) ++{ ++ struct address_space *mapping; ++ struct vm_area_struct *vma = walk->vma; ++ struct mm_walk_args *args = walk->private; ++ ++ if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) || ++ (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ))) ++ return true; ++ ++ if (vma_is_anonymous(vma)) ++ return !args->swappiness; ++ ++ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) ++ return true; ++ ++ mapping = vma->vm_file->f_mapping; ++ if (!mapping->a_ops->writepage) ++ return true; ++ ++ return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping); ++} ++ ++/* ++ * Some userspace memory allocators create many single-page VMAs. So instead of ++ * returning back to the PGD table for each of such VMAs, we finish at least an ++ * entire PMD table and therefore avoid many zigzags. ++ */ ++static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size, ++ unsigned long *start, unsigned long *end) ++{ ++ unsigned long next = round_up(*end, size); ++ ++ VM_BUG_ON(mask & size); ++ VM_BUG_ON(*start >= *end); ++ VM_BUG_ON((next & mask) != (*start & mask)); ++ ++ while (walk->vma) { ++ if (next >= walk->vma->vm_end) { ++ walk->vma = walk->vma->vm_next; ++ continue; ++ } ++ ++ if ((next & mask) != (walk->vma->vm_start & mask)) ++ return false; ++ ++ if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) { ++ walk->vma = walk->vma->vm_next; ++ continue; ++ } ++ ++ *start = max(next, walk->vma->vm_start); ++ next = (next | ~mask) + 1; ++ /* rounded-up boundaries can wrap to 0 */ ++ *end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end; ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pte_t *pte; ++ spinlock_t *ptl; ++ unsigned long addr; ++ int worth = 0; ++ struct mm_walk_args *args = walk->private; ++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); ++ ++ VM_BUG_ON(pmd_leaf(*pmd)); ++ ++ pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl); ++ arch_enter_lazy_mmu_mode(); ++restart: ++ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ struct page *page; ++ unsigned long pfn = pte_pfn(pte[i]); ++ ++ args->mm_stats[MM_LEAF_TOTAL]++; ++ ++ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) ++ continue; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) ++ continue; ++ ++ if (!pte_young(pte[i])) { ++ args->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) ++ continue; ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != args->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != args->memcg) ++ continue; ++ ++ VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end); ++ if (!ptep_test_and_clear_young(walk->vma, addr, pte + i)) ++ continue; ++ ++ args->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ ++ worth++; ++ } ++ ++ if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end)) ++ goto restart; ++ ++ arch_leave_lazy_mmu_mode(); ++ pte_unmap_unlock(pte, ptl); ++ ++ return worth >= MIN_BATCH_SIZE / 2; ++} ++ ++/* ++ * We scan PMD entries in two passes. The first pass reaches to PTE tables and ++ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD ++ * entries and needs to take the PMD lock. ++ */ ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, ++ struct vm_area_struct *vma, struct mm_walk *walk) ++{ ++ int i; ++ pmd_t *pmd; ++ spinlock_t *ptl; ++ struct mm_walk_args *args = walk->private; ++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); ++ ++ VM_BUG_ON(pud_leaf(*pud)); ++ ++ start = (start & PUD_MASK) + offset * PMD_SIZE; ++ pmd = pmd_offset(pud, start); ++ ptl = pmd_lock(walk->mm, pmd); ++ arch_enter_lazy_mmu_mode(); ++ ++ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) { ++ struct page *page; ++ unsigned long pfn = pmd_pfn(pmd[i]); ++ unsigned long addr = start + i * PMD_SIZE; ++ ++ if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) ++ continue; ++ ++ if (WARN_ON_ONCE(pmd_devmap(pmd[i]))) ++ continue; ++ ++ if (!pmd_trans_huge(pmd[i])) { ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) ++ pmdp_test_and_clear_young(vma, addr, pmd + i); ++ continue; ++ } ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) ++ continue; ++ ++ page = pfn_to_page(pfn); ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ if (page_to_nid(page) != args->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != args->memcg) ++ continue; ++ ++ VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end); ++ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) ++ continue; ++ ++ args->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pmd_dirty(pmd[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ spin_unlock(ptl); ++ ++ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); ++} ++#else ++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, ++ struct vm_area_struct *vma, struct mm_walk *walk) ++{ ++} ++#endif ++ ++static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pmd_t *pmd; ++ unsigned long next; ++ unsigned long addr; ++ struct vm_area_struct *vma; ++ int offset = -1; ++ bool reset = false; ++ struct mm_walk_args *args = walk->private; ++ struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg); ++ ++ VM_BUG_ON(pud_leaf(*pud)); ++ ++ pmd = pmd_offset(pud, start & PUD_MASK); ++restart: ++ vma = walk->vma; ++ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { ++ pmd_t val = pmd_read_atomic(pmd + i); ++ ++ /* for pmd_read_atomic() */ ++ barrier(); ++ ++ next = pmd_addr_end(addr, end); ++ ++ if (!pmd_present(val)) { ++ args->mm_stats[MM_LEAF_TOTAL]++; ++ continue; ++ } ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ if (pmd_trans_huge(val)) { ++ unsigned long pfn = pmd_pfn(val); ++ ++ args->mm_stats[MM_LEAF_TOTAL]++; ++ ++ if (is_huge_zero_pmd(val)) ++ continue; ++ ++ if (!pmd_young(val)) { ++ args->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) ++ continue; ++ ++ if (offset < 0) ++ offset = i; ++ else if (i - offset >= MIN_BATCH_SIZE) { ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++ offset = i; ++ } ++ __set_bit(i - offset, args->bitmap); ++ reset = true; ++ continue; ++ } ++#endif ++ args->mm_stats[MM_NONLEAF_TOTAL]++; ++ ++#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG ++ if (!pmd_young(val)) ++ continue; ++ ++ if (offset < 0) ++ offset = i; ++ else if (i - offset >= MIN_BATCH_SIZE) { ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++ offset = i; ++ reset = false; ++ } ++ __set_bit(i - offset, args->bitmap); ++#endif ++ if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i)) ++ continue; ++ ++ args->mm_stats[MM_NONLEAF_PREV]++; ++ ++ if (!walk_pte_range(&val, addr, next, walk)) ++ continue; ++ ++ args->mm_stats[MM_NONLEAF_CUR]++; ++ ++ set_bloom_filter(lruvec, args->max_seq + 1, pmd + i); ++ } ++ ++ if (reset) { ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++ offset = -1; ++ reset = false; ++ } ++ ++ if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end)) ++ goto restart; ++ ++ if (offset >= 0) ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++} ++ ++static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pud_t *pud; ++ unsigned long addr; ++ unsigned long next; ++ struct mm_walk_args *args = walk->private; ++ ++ VM_BUG_ON(p4d_leaf(*p4d)); ++ ++ pud = pud_offset(p4d, start & P4D_MASK); ++restart: ++ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { ++ pud_t val = READ_ONCE(pud[i]); ++ ++ next = pud_addr_end(addr, end); ++ ++ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) ++ continue; ++ ++ walk_pmd_range(&val, addr, next, walk); ++ ++ if (args->batch_size >= MAX_BATCH_SIZE) { ++ end = (addr | ~PUD_MASK) + 1; ++ goto done; ++ } ++ } ++ ++ if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end)) ++ goto restart; ++ ++ end = round_up(end, P4D_SIZE); ++done: ++ /* rounded-up boundaries can wrap to 0 */ ++ args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0; ++ ++ return -EAGAIN; ++} ++ ++static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args) ++{ ++ static const struct mm_walk_ops mm_walk_ops = { ++ .test_walk = should_skip_vma, ++ .p4d_entry = walk_pud_range, ++ }; ++ ++ int err; ++ ++ args->next_addr = FIRST_USER_ADDRESS; ++ ++ do { ++ unsigned long start = args->next_addr; ++ unsigned long end = mm->highest_vm_end; ++ ++ err = -EBUSY; ++ ++ rcu_read_lock(); ++#ifdef CONFIG_MEMCG ++ if (args->memcg && atomic_read(&args->memcg->moving_account)) ++ goto contended; ++#endif ++ if (!mmap_read_trylock(mm)) ++ goto contended; ++ ++ err = walk_page_range(mm, start, end, &mm_walk_ops, args); ++ ++ mmap_read_unlock(mm); ++ ++ if (args->batch_size) { ++ spin_lock_irq(&lruvec->lru_lock); ++ reset_batch_size(lruvec, args); ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++contended: ++ rcu_read_unlock(); ++ ++ cond_resched(); ++ } while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm)); ++} ++ ++static struct mm_walk_args *alloc_mm_walk_args(void) ++{ ++ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) ++ return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL); ++ ++ return current->reclaim_state->mm_walk_args; ++} ++ ++static void free_mm_walk_args(struct mm_walk_args *args) ++{ ++ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) ++ kvfree(args); ++} ++ ++static bool inc_min_seq(struct lruvec *lruvec, int type) ++{ ++ int gen, zone; ++ int remaining = MAX_BATCH_SIZE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) ++ return true; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ page_inc_gen(page, lruvec, false); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++ ++ return true; ++} ++ ++static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) ++{ ++ int gen, type, zone; ++ bool success = false; ++ struct lrugen *lrugen = &lruvec->evictable; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) { ++ gen = lru_gen_from_seq(min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ goto next; ++ } ++ ++ min_seq[type]++; ++ } ++next: ++ ; ++ } ++ ++ min_seq[0] = min(min_seq[0], min_seq[1]); ++ if (swappiness) ++ min_seq[1] = max(min_seq[0], lrugen->min_seq[1]); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ if (min_seq[type] == lrugen->min_seq[type]) ++ continue; ++ ++ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); ++ success = true; ++ } ++ ++ return success; ++} ++ ++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq) ++{ ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ if (max_seq != lrugen->max_seq) ++ goto unlock; ++ ++ if (!try_to_inc_min_seq(lruvec, true)) { ++ for (type = ANON_AND_FILE - 1; type >= 0; type--) { ++ while (!inc_min_seq(lruvec, type)) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ } ++ } ++ ++ gen = lru_gen_from_seq(lrugen->max_seq - 1); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ enum lru_list lru = type * LRU_FILE; ++ long delta = lrugen->sizes[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ WARN_ON_ONCE(delta != (int)delta); ++ ++ update_lru_size(lruvec, lru, zone, delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); ++ } ++ } ++ ++ gen = lru_gen_from_seq(lrugen->max_seq + 1); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ enum lru_list lru = type * LRU_FILE; ++ long delta = lrugen->sizes[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ WARN_ON_ONCE(delta != (int)delta); ++ ++ update_lru_size(lruvec, lru, zone, -delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ } ++ ++ WRITE_ONCE(lrugen->timestamps[gen], jiffies); ++ /* make sure all preceding modifications appear first */ ++ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); ++unlock: ++ spin_unlock_irq(&lruvec->lru_lock); ++} ++ ++/* Main function used by the foreground, the background and the user-triggered aging. */ ++static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long max_seq, bool use_filter) ++{ ++ bool last; ++ struct mm_walk_args *args; ++ struct mm_struct *mm = NULL; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ int nid = pgdat->node_id; ++ ++ VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq)); ++ ++ /* ++ * If we are not from run_aging() and clearing the accessed bit may ++ * trigger page faults, then don't proceed to clearing all accessed ++ * PTEs. Instead, fallback to lru_gen_look_around(), which only clears a ++ * handful of accessed PTEs. This is less efficient but causes fewer ++ * page faults on CPUs that don't have the capability. ++ */ ++ if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) { ++ inc_max_seq(lruvec, max_seq); ++ return true; ++ } ++ ++ args = alloc_mm_walk_args(); ++ if (!args) ++ return false; ++ ++ args->memcg = memcg; ++ args->max_seq = max_seq; ++ args->start_pfn = pgdat->node_start_pfn; ++ args->end_pfn = pgdat_end_pfn(pgdat); ++ args->node_id = nid; ++ args->swappiness = swappiness; ++ args->use_filter = use_filter; ++ ++ do { ++ last = get_next_mm(lruvec, args, &mm); ++ if (mm) ++ walk_mm(lruvec, mm, args); ++ ++ cond_resched(); ++ } while (mm); ++ ++ free_mm_walk_args(args); ++ ++ if (!last) { ++ /* don't wait unless we may have trouble reclaiming */ ++ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2) ++ wait_event_killable(lruvec->mm_walk.wait, ++ max_seq < READ_ONCE(lrugen->max_seq)); ++ ++ return max_seq < READ_ONCE(lrugen->max_seq); ++ } ++ ++ VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq)); ++ ++ inc_max_seq(lruvec, max_seq); ++ /* either we see any waiters or they will see updated max_seq */ ++ if (wq_has_sleeper(&lruvec->mm_walk.wait)) ++ wake_up_all(&lruvec->mm_walk.wait); ++ ++ wakeup_flusher_threads(WB_REASON_VMSCAN); ++ ++ return true; ++} ++ ++static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long max_seq, unsigned long *min_seq, bool *low) ++{ ++ int gen, type, zone; ++ long max = 0; ++ long min = 0; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for (type = !swappiness; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ long size = 0; ++ ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone <= sc->reclaim_idx; zone++) ++ size += READ_ONCE(lrugen->sizes[gen][type][zone]); ++ ++ max += size; ++ if (type && max_seq - seq >= MIN_NR_GENS) ++ min += size; ++ } ++ } ++ ++ *low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE; ++ ++ return max > 0 ? max : 0; ++} ++ ++static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, ++ unsigned long min_ttl) ++{ ++ bool low; ++ long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ int swappiness = get_swappiness(memcg); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg)) ++ return false; ++ ++ if (min_ttl) { ++ int gen = lru_gen_from_seq(min_seq[1]); ++ unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]); ++ ++ if (time_is_after_jiffies(birth + min_ttl)) ++ return false; ++ } ++ ++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); ++ if (!nr_to_scan) ++ return false; ++ ++ nr_to_scan >>= sc->priority; ++ ++ if (!mem_cgroup_online(memcg)) ++ nr_to_scan++; ++ ++ if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim)) ++ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true); ++ ++ return true; ++} ++ ++/* Protect the working set accessed within the last N milliseconds. */ ++static unsigned long lru_gen_min_ttl __read_mostly; ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg; ++ bool success = false; ++ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); ++ ++ VM_BUG_ON(!current_is_kswapd()); ++ ++ if (!sc->force_deactivate) { ++ sc->force_deactivate = 1; ++ return; ++ } ++ ++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ ++ if (age_lruvec(lruvec, sc, min_ttl)) ++ success = true; ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ if (!success && mutex_trylock(&oom_lock)) { ++ struct oom_control oc = { ++ .gfp_mask = sc->gfp_mask, ++ .order = sc->order, ++ }; ++ ++ /* to avoid overkilling */ ++ if (!oom_reaping_in_progress()) ++ out_of_memory(&oc); ++ ++ mutex_unlock(&oom_lock); ++ } ++ ++ current->reclaim_state->mm_walk_args = NULL; ++} ++ ++/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. */ ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++ int i; ++ pte_t *pte; ++ struct page *page; ++ int old_gen, new_gen; ++ unsigned long start; ++ unsigned long end; ++ unsigned long addr; ++ struct mm_walk_args *args; ++ int worth = 0; ++ struct mem_cgroup *memcg = page_memcg(pvmw->page); ++ struct pglist_data *pgdat = page_pgdat(pvmw->page); ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ DEFINE_MAX_SEQ(lruvec); ++ ++ lockdep_assert_held(pvmw->ptl); ++ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page); ++ ++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; ++ if (!args) ++ return; ++ ++ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); ++ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; ++ ++ if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) { ++ if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2) ++ end = start + MIN_BATCH_SIZE * PAGE_SIZE; ++ else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2) ++ start = end - MIN_BATCH_SIZE * PAGE_SIZE; ++ else { ++ start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2; ++ end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2; ++ } ++ } ++ ++ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ new_gen = lru_gen_from_seq(max_seq); ++ ++ lock_page_memcg(pvmw->page); ++ arch_enter_lazy_mmu_mode(); ++ ++ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ unsigned long pfn = pte_pfn(pte[i]); ++ ++ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) ++ continue; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) ++ continue; ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) ++ continue; ++ ++ worth++; ++ ++ if (!pte_young(pte[i])) ++ continue; ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != pgdat->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != memcg) ++ continue; ++ ++ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end); ++ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) ++ continue; ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ __set_bit(i, args->bitmap); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ unlock_page_memcg(pvmw->page); ++ ++ if (worth >= MIN_BATCH_SIZE / 2) ++ set_bloom_filter(lruvec, max_seq, pvmw->pmd); ++ ++ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) ++ set_page_dirty(pte_page(pte[i])); ++ ++ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -3477,6 +4414,12 @@ static int __init init_lru_gen(void) + }; + late_initcall(init_lru_gen); + ++#else ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -4333,6 +5276,11 @@ static void age_active_anon(struct pglis + struct mem_cgroup *memcg; + struct lruvec *lruvec; + ++ if (lru_gen_enabled()) { ++ lru_gen_age_node(pgdat, sc); ++ return; ++ } ++ + if (!can_age_anon_pages(pgdat, sc)) + return; + diff --git a/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch b/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch new file mode 100644 index 00000000000000..09e4315dccb3c1 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch @@ -0,0 +1,1002 @@ +From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Tue, 29 Jun 2021 20:46:47 -0600 +Subject: [PATCH 07/10] mm: multigenerational lru: eviction + +The eviction consumes old generations. Given an lruvec, the eviction +scans pages on lrugen->lists indexed by anon and file min_seq[] +(modulo MAX_NR_GENS). It first tries to select a type based on the +values of min_seq[]. If they are equal, it selects the type that has +a lower refaulted %. The eviction sorts a page according to its +updated generation number if the aging has found this page accessed. +It also moves a page to the next generation if this page is from an +upper tier that has a higher refaulted % than the base tier. The +eviction increments min_seq[] of a selected type when it finds +lrugen->lists indexed by min_seq[] of this selected type are empty. + +Each generation is divided into multiple tiers. Tiers represent +different ranges of numbers of accesses from file descriptors only. +Pages accessed N times via file descriptors belong to tier +order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers, +and they require additional MAX_NR_TIERS-2 bits in page->flags. In +contrast to moving between generations which requires list operations, +moving between tiers only involves operations on page->flags and +therefore has a negligible cost. A feedback loop modeled after the PID +controller monitors refaulted % across all tiers and decides when to +protect pages from which tiers. + +Unmapped pages are initially added to the oldest generation and then +conditionally protected by tiers. Each tier keeps track of how many +pages from it have refaulted. Tier 0 is the base tier and pages from +it are evicted unconditionally because there are no better candidates. +Pages from an upper tier are either evicted or moved to the next +generation, depending on whether this upper tier has a higher +refaulted % than the base tier. This model has the following +advantages: + 1) It removes the cost in the buffered access path and reduces the + overall cost of protection because pages are conditionally protected + in the reclaim path. + 2) It takes mapped pages into account and avoids overprotecting + pages accessed multiple times via file descriptors. + 3 Additional tiers improve the protection of pages accessed more + than twice. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac +--- + include/linux/mm_inline.h | 10 + + include/linux/mmzone.h | 33 +++ + mm/swap.c | 42 +++ + mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++- + mm/workingset.c | 120 ++++++++- + 5 files changed, 757 insertions(+), 3 deletions(-) + +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi + return seq % NR_HIST_GENS; + } + ++/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ ++static inline int lru_tier_from_refs(int refs) ++{ ++ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); ++ ++ return order_base_2(refs + 1); ++} ++ + /* The youngest and the second youngest generations are counted as active. */ + static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) + { +@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru + gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + + new_flags &= ~LRU_GEN_MASK; ++ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for shrink_page_list() */ + if (reclaiming) + new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -319,6 +319,30 @@ struct page_vma_mapped_walk; + #define MIN_NR_GENS 2 + #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) + ++/* ++ * Each generation is divided into multiple tiers. Tiers represent different ++ * ranges of numbers of accesses from file descriptors, i.e., ++ * mark_page_accessed(). In contrast to moving between generations which ++ * requires the lru lock, moving between tiers only involves an atomic ++ * operation on page->flags and therefore has a negligible cost. ++ * ++ * The purposes of tiers are to: ++ * 1) estimate whether pages accessed multiple times via file descriptors are ++ * more active than pages accessed only via page tables by separating the two ++ * access types into upper tiers and the base tier, and comparing refaulted % ++ * across all tiers. ++ * 2) improve buffered io performance by deferring the protection of pages ++ * accessed multiple times until the eviction. That is the protection happens ++ * in the reclaim path, not the access path. ++ * ++ * Pages accessed N times via file descriptors belong to tier order_base_2(N). ++ * The base tier may be marked by PageReferenced(). All upper tiers are marked ++ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are ++ * used to support more than one upper tier. ++ */ ++#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) ++#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) ++ + /* Whether to keep stats for historical generations. */ + #ifdef CONFIG_LRU_GEN_STATS + #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) +@@ -337,6 +361,15 @@ struct lrugen { + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the sizes of the multigenerational lru lists in pages */ + unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the exponential moving average of refaulted */ ++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the exponential moving average of protected+evicted */ ++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the base tier isn't protected, hence the minus one */ ++ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; ++ /* incremented without holding the lru lock */ ++ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multigenerational lru is enabled */ + bool enabled[ANON_AND_FILE]; + }; +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st + local_unlock(&lru_pvecs.lock); + } + ++#ifdef CONFIG_LRU_GEN ++static void page_inc_refs(struct page *page) ++{ ++ unsigned long refs; ++ unsigned long old_flags, new_flags; ++ ++ if (PageUnevictable(page)) ++ return; ++ ++ /* see the comment on MAX_NR_TIERS */ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ ++ if (!(new_flags & BIT(PG_referenced))) { ++ new_flags |= BIT(PG_referenced); ++ continue; ++ } ++ ++ if (!(new_flags & BIT(PG_workingset))) { ++ new_flags |= BIT(PG_workingset); ++ continue; ++ } ++ ++ refs = new_flags & LRU_REFS_MASK; ++ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); ++ ++ new_flags &= ~LRU_REFS_MASK; ++ new_flags |= refs; ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++} ++#else ++static void page_inc_refs(struct page *page) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * Mark a page as having seen activity. + * +@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag + { + page = compound_head(page); + ++ if (lru_gen_enabled()) { ++ page_inc_refs(page); ++ return; ++ } ++ + if (!PageReferenced(page)) { + SetPageReferenced(page); + } else if (PageUnevictable(page)) { +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; +- mem_cgroup_swapout(page, swap); ++ ++ /* get a shadow entry before page_memcg() is cleared */ + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(page, target_memcg); ++ mem_cgroup_swapout(page, swap); + __delete_from_swap_cache(page, swap, shadow); + xa_unlock_irq(&mapping->i_pages); + put_swap_page(page, swap); +@@ -1410,6 +1412,11 @@ retry: + if (!sc->may_unmap && page_mapped(page)) + goto keep_locked; + ++ /* lru_gen_look_around() has updated this page? */ ++ if (lru_gen_enabled() && !ignore_references && ++ page_mapped(page) && PageReferenced(page)) ++ goto keep_locked; ++ + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + +@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t + unsigned long file; + struct lruvec *target_lruvec; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* +@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + } + ++static int page_lru_tier(struct page *page) ++{ ++ int refs; ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? ++ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; ++ ++ return lru_tier_from_refs(refs); ++} ++ + static int get_swappiness(struct mem_cgroup *memcg) + { + return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? +@@ -3246,6 +3267,91 @@ done: + } + + /****************************************************************************** ++ * refault feedback loop ++ ******************************************************************************/ ++ ++/* ++ * A feedback loop modeled after the PID controller. Currently supports the ++ * proportional (P) and the integral (I) terms; the derivative (D) term can be ++ * added if necessary. The setpoint (SP) is the desired position; the process ++ * variable (PV) is the measured position. The error is the difference between ++ * the SP and the PV. A positive error results in a positive control output ++ * correction, which, in our case, is to allow eviction. ++ * ++ * The P term is refaulted % of the current generation being evicted. The I ++ * term is the exponential moving average of refaulted % of previously evicted ++ * generations, using the smoothing factor 1/2. ++ * ++ * Our goal is to maintain proportional refaulted % across all tiers. ++ */ ++struct ctrl_pos { ++ unsigned long refaulted; ++ unsigned long total; ++ int gain; ++}; ++ ++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, ++ struct ctrl_pos *pos) ++{ ++ struct lrugen *lrugen = &lruvec->evictable; ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ pos->refaulted = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ pos->total = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ pos->total += lrugen->protected[hist][type][tier - 1]; ++ pos->gain = gain; ++} ++ ++static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) ++{ ++ int tier; ++ int hist = lru_hist_from_seq(gen); ++ struct lrugen *lrugen = &lruvec->evictable; ++ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); ++ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; ++ ++ if (!carryover && !clear) ++ return; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ if (carryover) { ++ unsigned long sum; ++ ++ sum = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); ++ ++ sum = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ sum += lrugen->protected[hist][type][tier - 1]; ++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); ++ } ++ ++ if (clear) { ++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); ++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); ++ if (tier) ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); ++ } ++ } ++} ++ ++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) ++{ ++ /* ++ * Allow eviction if the PV has a limited number of refaulted pages or a ++ * lower refaulted % than the SP. ++ */ ++ return pv->refaulted < MIN_BATCH_SIZE || ++ pv->refaulted * max(sp->total, 1UL) * sp->gain <= ++ sp->refaulted * max(pv->total, 1UL) * pv->gain; ++} ++ ++/****************************************************************************** + * the aging + ******************************************************************************/ + +@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page * + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + } while (new_flags != old_flags && + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); + +@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for end_page_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); +@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l + } + } + ++ reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +@@ -3824,6 +3933,8 @@ next: + if (min_seq[type] == lrugen->min_seq[type]) + continue; + ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } +@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l + } + } + ++ for (type = 0; type < ANON_AND_FILE; type++) ++ reset_ctrl_pos(lruvec, gen, type); ++ + WRITE_ONCE(lrugen->timestamps[gen], jiffies); + /* make sure all preceding modifications appear first */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); +@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma + } + + /****************************************************************************** ++ * the eviction ++ ******************************************************************************/ ++ ++static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) ++{ ++ bool success; ++ int gen = page_lru_gen(page); ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int tier = page_lru_tier(page); ++ int delta = thp_nr_pages(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); ++ ++ /* an mlocked page? */ ++ if (!page_evictable(page)) { ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ SetPageUnevictable(page); ++ add_page_to_lru_list(page, lruvec); ++ __count_vm_events(UNEVICTABLE_PGCULLED, delta); ++ return true; ++ } ++ ++ /* a lazy-free page that has been written into? */ ++ if (type && PageDirty(page) && PageAnon(page)) { ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ SetPageSwapBacked(page); ++ add_page_to_lru_list_tail(page, lruvec); ++ return true; ++ } ++ ++ /* page_update_gen() has updated this page? */ ++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { ++ list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ ++ /* protect this page if its tier has a higher refaulted % */ ++ if (tier > tier_idx) { ++ int hist = lru_hist_from_seq(gen); ++ ++ page_inc_gen(page, lruvec, false); ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], ++ lrugen->protected[hist][type][tier - 1] + delta); ++ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); ++ return true; ++ } ++ ++ /* mark this page for reclaim if it's pending writeback */ ++ if (PageWriteback(page) || (type && PageDirty(page))) { ++ page_inc_gen(page, lruvec, true); ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool success; ++ ++ if (!sc->may_unmap && page_mapped(page)) ++ return false; ++ ++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) ++ return false; ++ ++ if (!get_page_unless_zero(page)) ++ return false; ++ ++ if (!TestClearPageLRU(page)) { ++ put_page(page); ++ return false; ++ } ++ ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ ++ return true; ++} ++ ++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, ++ int type, int tier, struct list_head *list) ++{ ++ int gen, zone; ++ enum vm_event_item item; ++ int sorted = 0; ++ int scanned = 0; ++ int isolated = 0; ++ int remaining = MAX_BATCH_SIZE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ VM_BUG_ON(!list_empty(list)); ++ ++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) ++ return 0; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = sc->reclaim_idx; zone >= 0; zone--) { ++ LIST_HEAD(moved); ++ int skipped = 0; ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ int delta = thp_nr_pages(page); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ scanned += delta; ++ ++ if (sort_page(page, lruvec, tier)) ++ sorted += delta; ++ else if (isolate_page(page, lruvec, sc)) { ++ list_add(&page->lru, list); ++ isolated += delta; ++ } else { ++ list_move(&page->lru, &moved); ++ skipped += delta; ++ } ++ ++ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) ++ break; ++ } ++ ++ if (skipped) { ++ list_splice(&moved, head); ++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); ++ } ++ ++ if (!remaining || isolated >= MIN_BATCH_SIZE) ++ break; ++ } ++ ++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; ++ if (!cgroup_reclaim(sc)) { ++ __count_vm_events(item, isolated); ++ __count_vm_events(PGREFILL, sorted); ++ } ++ __count_memcg_events(memcg, item, isolated); ++ __count_memcg_events(memcg, PGREFILL, sorted); ++ __count_vm_events(PGSCAN_ANON + type, isolated); ++ ++ /* ++ * We may have trouble finding eligible pages due to reclaim_idx, ++ * may_unmap and may_writepage. Check `remaining` to make sure we won't ++ * be stuck if we aren't making enough progress. ++ */ ++ return isolated || !remaining ? scanned : 0; ++} ++ ++static int get_tier_idx(struct lruvec *lruvec, int type) ++{ ++ int tier; ++ struct ctrl_pos sp, pv; ++ ++ /* ++ * Ideally we don't want to evict upper tiers that have higher refaulted ++ * %. However, we need to leave a margin for the fluctuation in ++ * refaulted %. So we use a larger gain factor to make sure upper tiers ++ * are indeed more active. We choose 2 because the lowest upper tier ++ * would have twice of refaulted % of the base tier, according to their ++ * numbers of accesses. ++ */ ++ read_ctrl_pos(lruvec, type, 0, 1, &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, 2, &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ return tier - 1; ++} ++ ++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) ++{ ++ int type, tier; ++ struct ctrl_pos sp, pv; ++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; ++ ++ /* ++ * Compare refaulted % between the base tiers of anon and file to ++ * determine which type to evict. Also need to compare refaulted % of ++ * the upper tiers of the selected type with that of the base tier of ++ * the other type to determine which tier of the selected type to evict. ++ */ ++ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); ++ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); ++ type = positive_ctrl_err(&sp, &pv); ++ ++ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ *tier_idx = tier - 1; ++ ++ return type; ++} ++ ++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ int *type_scanned, struct list_head *list) ++{ ++ int i; ++ int type; ++ int scanned; ++ int tier = -1; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ /* ++ * Try to select a type based on generations and swappiness, and if that ++ * fails, fall back to get_type_to_scan(). When anon and file are both ++ * available from the same generation, swappiness 200 is interpreted as ++ * anon first and swappiness 1 is interpreted as file first. ++ */ ++ if (!swappiness) ++ type = 1; ++ else if (min_seq[0] < min_seq[1]) ++ type = 0; ++ else if (swappiness == 1) ++ type = 1; ++ else if (swappiness == 200) ++ type = 0; ++ else ++ type = get_type_to_scan(lruvec, swappiness, &tier); ++ ++ for (i = !swappiness; i < ANON_AND_FILE; i++) { ++ if (tier < 0) ++ tier = get_tier_idx(lruvec, type); ++ ++ scanned = scan_pages(lruvec, sc, type, tier, list); ++ if (scanned) ++ break; ++ ++ type = !type; ++ tier = -1; ++ } ++ ++ *type_scanned = type; ++ ++ return scanned; ++} ++ ++/* Main function used by the foreground, the background and the user-triggered eviction. */ ++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ int type; ++ int scanned; ++ int reclaimed; ++ LIST_HEAD(list); ++ struct page *page; ++ enum vm_event_item item; ++ struct reclaim_stat stat; ++ struct mm_walk_args *args; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); ++ ++ if (try_to_inc_min_seq(lruvec, swappiness)) ++ scanned++; ++ ++ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) ++ scanned = 0; ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ if (list_empty(&list)) ++ return scanned; ++ ++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); ++ /* ++ * We need to prevent rejected pages from being added back to the same ++ * lists they were isolated from. Otherwise we may risk looping on them ++ * forever. ++ */ ++ list_for_each_entry(page, &list, lru) { ++ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) ++ SetPageActive(page); ++ ++ ClearPageReferenced(page); ++ ClearPageWorkingset(page); ++ } ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ move_pages_to_lru(lruvec, &list); ++ ++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; ++ if (args && args->batch_size) ++ reset_batch_size(lruvec, args); ++ ++ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; ++ if (!cgroup_reclaim(sc)) ++ __count_vm_events(item, reclaimed); ++ __count_memcg_events(memcg, item, reclaimed); ++ __count_vm_events(PGSTEAL_ANON + type, reclaimed); ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_uncharge_list(&list); ++ free_unref_page_list(&list); ++ ++ sc->nr_reclaimed += reclaimed; ++ ++ return scanned; ++} ++ ++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ bool low; ++ long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ int priority = sc->priority; ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg) || ++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) ++ return 0; ++ ++ if (sc->nr_reclaimed >= sc->nr_to_reclaim) { ++ priority = DEF_PRIORITY; ++ sc->force_deactivate = 0; ++ } ++ ++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); ++ if (!nr_to_scan) ++ return 0; ++ ++ nr_to_scan >>= priority; ++ ++ if (!mem_cgroup_online(memcg)) ++ nr_to_scan++; ++ ++ if (!nr_to_scan) ++ return 0; ++ ++ if (current_is_kswapd()) { ++ /* leave the work to lru_gen_age_node() */ ++ if (max_seq - min_seq[1] < MIN_NR_GENS) ++ return 0; ++ ++ if (!low) ++ sc->force_deactivate = 0; ++ ++ return nr_to_scan; ++ } ++ ++ if (max_seq - min_seq[1] >= MIN_NR_GENS) ++ return nr_to_scan; ++ ++ /* move onto slab and other memcgs if we haven't tried them all */ ++ if (!sc->force_deactivate) { ++ sc->skipped_deactivate = 1; ++ return 0; ++ } ++ ++ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0; ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ long scanned = 0; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ lru_add_drain(); ++ ++ if (current_is_kswapd()) ++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; ++ ++ blk_start_plug(&plug); ++ ++ while (true) { ++ int delta; ++ int swappiness; ++ long nr_to_scan; ++ ++ if (sc->may_swap) ++ swappiness = get_swappiness(memcg); ++ else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) ++ swappiness = 1; ++ else ++ swappiness = 0; ++ ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); ++ if (!nr_to_scan) ++ break; ++ ++ delta = evict_pages(lruvec, sc, swappiness); ++ if (!delta) ++ break; ++ ++ scanned += delta; ++ if (scanned >= nr_to_scan) ++ break; ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++ ++ if (current_is_kswapd()) ++ current->reclaim_state->mm_walk_args = NULL; ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli + { + } + ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec + struct blk_plug plug; + bool scan_adjusted; + ++ if (lru_gen_enabled()) { ++ lru_gen_shrink_lruvec(lruvec, sc); ++ return; ++ } ++ + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ +@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem + struct lruvec *target_lruvec; + unsigned long refaults; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_ + static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) + { +- eviction >>= bucket_order; + eviction &= EVICTION_MASK; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; +@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, + + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); +- *evictionp = entry << bucket_order; ++ *evictionp = entry; + *workingsetp = workingset; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static int page_lru_refs(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); ++ ++ /* see the comment on MAX_NR_TIERS */ ++ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; ++} ++ ++/* Return a token to be stored in the shadow entry of a page being evicted. */ ++static void *lru_gen_eviction(struct page *page) ++{ ++ int hist, tier; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ int type = page_is_file_lru(page); ++ int refs = page_lru_refs(page); ++ int delta = thp_nr_pages(page); ++ bool workingset = PageWorkingset(page); ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ token = (min_seq << LRU_REFS_WIDTH) | refs; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_refs(refs + workingset); ++ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); ++ ++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); ++} ++ ++/* Count a refaulted page based on the token stored in its shadow entry. */ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++ int hist, tier, refs; ++ int memcg_id; ++ bool workingset; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ struct mem_cgroup *memcg; ++ struct pglist_data *pgdat; ++ int type = page_is_file_lru(page); ++ int delta = thp_nr_pages(page); ++ ++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); ++ if (page_pgdat(page) != pgdat) ++ return; ++ ++ rcu_read_lock(); ++ memcg = page_memcg_rcu(page); ++ if (mem_cgroup_id(memcg) != memcg_id) ++ goto unlock; ++ ++ refs = token & (BIT(LRU_REFS_WIDTH) - 1); ++ if (refs && !workingset) ++ goto unlock; ++ ++ token >>= LRU_REFS_WIDTH; ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) ++ goto unlock; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_refs(refs + workingset); ++ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); ++ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); ++ ++ /* ++ * Tiers don't offer any protection to pages accessed via page tables. ++ * That's what generations do. Tiers can't fully protect pages after ++ * their numbers of accesses has exceeded the max value. Conservatively ++ * count these two conditions as stalls even though they might not ++ * indicate any real memory pressure. ++ */ ++ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) { ++ SetPageWorkingset(page); ++ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); ++ } ++unlock: ++ rcu_read_unlock(); ++} ++ ++#else ++ ++static void *lru_gen_eviction(struct page *page) ++{ ++ return NULL; ++} ++ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + /** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged +@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + ++ if (lru_gen_enabled()) ++ return lru_gen_eviction(page); ++ + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); ++ eviction >>= bucket_order; + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); + } +@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag + bool workingset; + int memcgid; + ++ if (lru_gen_enabled()) { ++ lru_gen_refault(page, shadow); ++ return; ++ } ++ + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); ++ eviction <<= bucket_order; + + rcu_read_lock(); + /* diff --git a/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch b/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch new file mode 100644 index 00000000000000..a1a749fc38e845 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch @@ -0,0 +1,496 @@ +From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 25 Jan 2021 21:38:02 -0700 +Subject: [PATCH 08/10] mm: multigenerational lru: user interface + +Add /sys/kernel/mm/lru_gen/enabled to enable and disable the +multigenerational lru at runtime. + +Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a +given number of milliseconds. The OOM killer is invoked if this +working set cannot be kept in memory. + +Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and +invoke the aging and the eviction. This file has the following output: + memcg memcg_id memcg_path + node node_id + min_gen birth_time anon_size file_size + ... + max_gen birth_time anon_size file_size + +min_gen is the oldest generation number and max_gen is the youngest +generation number. birth_time is in milliseconds. anon_size and +file_size are in pages. + +This file takes the following input: + + memcg_id node_id max_gen [swappiness] [use_bloom_filter] + - memcg_id node_id min_gen [swappiness] [nr_to_reclaim] + +The first command line invokes the aging, which scans PTEs for +accessed pages and then creates the next generation max_gen+1. A swap +file and a non-zero swappiness, which overrides vm.swappiness, are +required to scan PTEs mapping anon pages. The second command line +invokes the eviction, which evicts generations less than or equal to +min_gen. min_gen should be less than max_gen-1 as max_gen and +max_gen-1 are not fully aged and therefore cannot be evicted. +Setting nr_to_reclaim to N limits the number of pages to evict. +Setting use_bloom_filter to 0 overrides the default behavior which +only scans PTE tables found populated. Multiple command lines are +supported, as is concatenation with delimiters "," and ";". + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3 +--- + include/linux/nodemask.h | 1 + + mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++ + 2 files changed, 416 insertions(+) + +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -485,6 +485,7 @@ static inline int num_node_state(enum no + #define first_online_node 0 + #define first_memory_node 0 + #define next_online_node(nid) (MAX_NUMNODES) ++#define next_memory_node(nid) (MAX_NUMNODES) + #define nr_node_ids 1U + #define nr_online_nodes 1U + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -53,6 +53,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -4882,6 +4884,413 @@ unlock: + } + + /****************************************************************************** ++ * sysfs interface ++ ******************************************************************************/ ++ ++static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); ++} ++ ++static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ unsigned int msecs; ++ ++ if (kstrtouint(buf, 10, &msecs)) ++ return -EINVAL; ++ ++ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( ++ min_ttl_ms, 0644, show_min_ttl, store_min_ttl ++); ++ ++static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled()); ++} ++ ++static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ bool enable; ++ ++ if (kstrtobool(buf, &enable)) ++ return -EINVAL; ++ ++ lru_gen_change_state(enable, true, false); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_enabled_attr = __ATTR( ++ enabled, 0644, show_enable, store_enable ++); ++ ++static struct attribute *lru_gen_attrs[] = { ++ &lru_gen_min_ttl_attr.attr, ++ &lru_gen_enabled_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group lru_gen_attr_group = { ++ .name = "lru_gen", ++ .attrs = lru_gen_attrs, ++}; ++ ++/****************************************************************************** ++ * debugfs interface ++ ******************************************************************************/ ++ ++static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct mem_cgroup *memcg; ++ loff_t nr_to_skip = *pos; ++ ++ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); ++ if (!m->private) ++ return ERR_PTR(-ENOMEM); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node_state(nid, N_MEMORY) { ++ if (!nr_to_skip--) ++ return get_lruvec(nid, memcg); ++ } ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ return NULL; ++} ++ ++static void lru_gen_seq_stop(struct seq_file *m, void *v) ++{ ++ if (!IS_ERR_OR_NULL(v)) ++ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); ++ ++ kvfree(m->private); ++ m->private = NULL; ++} ++ ++static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ int nid = lruvec_pgdat(v)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(v); ++ ++ ++*pos; ++ ++ nid = next_memory_node(nid); ++ if (nid == MAX_NUMNODES) { ++ memcg = mem_cgroup_iter(NULL, memcg, NULL); ++ if (!memcg) ++ return NULL; ++ ++ nid = first_memory_node; ++ } ++ ++ return get_lruvec(nid, memcg); ++} ++ ++static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, ++ unsigned long max_seq, unsigned long *min_seq, ++ unsigned long seq) ++{ ++ int i; ++ int type, tier; ++ int hist = lru_hist_from_seq(seq); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ seq_printf(m, " %10d", tier); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ unsigned long n[3] = {}; ++ ++ if (seq == max_seq) { ++ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); ++ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); ++ ++ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]); ++ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { ++ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); ++ ++ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]); ++ } else ++ seq_puts(m, " 0 0 0 "); ++ } ++ seq_putc(m, '\n'); ++ } ++ ++ seq_puts(m, " "); ++ for (i = 0; i < NR_MM_STATS; i++) { ++ if (seq == max_seq && NR_HIST_GENS == 1) ++ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), ++ toupper(MM_STAT_CODES[i])); ++ else if (seq != max_seq && NR_HIST_GENS > 1) ++ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), ++ MM_STAT_CODES[i]); ++ else ++ seq_puts(m, " 0 "); ++ } ++ seq_putc(m, '\n'); ++} ++ ++static int lru_gen_seq_show(struct seq_file *m, void *v) ++{ ++ unsigned long seq; ++ bool full = !debugfs_real_fops(m->file)->write; ++ struct lruvec *lruvec = v; ++ struct lrugen *lrugen = &lruvec->evictable; ++ int nid = lruvec_pgdat(lruvec)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (nid == first_memory_node) { ++ const char *path = memcg ? m->private : ""; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); ++#endif ++ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); ++ } ++ ++ seq_printf(m, " node %5d\n", nid); ++ ++ if (!full) ++ seq = min_seq[0]; ++ else if (max_seq >= MAX_NR_GENS) ++ seq = max_seq - MAX_NR_GENS + 1; ++ else ++ seq = 0; ++ ++ for (; seq <= max_seq; seq++) { ++ int gen, type, zone; ++ unsigned int msecs; ++ ++ gen = lru_gen_from_seq(seq); ++ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen])); ++ ++ seq_printf(m, " %10lu %10u", seq, msecs); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ long size = 0; ++ ++ if (seq < min_seq[type]) { ++ seq_puts(m, " -0 "); ++ continue; ++ } ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += READ_ONCE(lrugen->sizes[gen][type][zone]); ++ ++ seq_printf(m, " %10lu ", max(size, 0L)); ++ } ++ ++ seq_putc(m, '\n'); ++ ++ if (full) ++ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); ++ } ++ ++ return 0; ++} ++ ++static const struct seq_operations lru_gen_seq_ops = { ++ .start = lru_gen_seq_start, ++ .stop = lru_gen_seq_stop, ++ .next = lru_gen_seq_next, ++ .show = lru_gen_seq_show, ++}; ++ ++static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long seq, bool use_filter) ++{ ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq == max_seq) ++ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter); ++ ++ return seq > max_seq ? -EINVAL : 0; ++} ++ ++static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long seq, unsigned long nr_to_reclaim) ++{ ++ struct blk_plug plug; ++ int err = -EINTR; ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq >= max_seq - 1) ++ return -EINVAL; ++ ++ sc->nr_reclaimed = 0; ++ ++ blk_start_plug(&plug); ++ ++ while (!signal_pending(current)) { ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim || ++ !evict_pages(lruvec, sc, swappiness)) { ++ err = 0; ++ break; ++ } ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++ ++ return err; ++} ++ ++static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc, ++ int swappiness, unsigned long seq, unsigned long opt) ++{ ++ struct lruvec *lruvec; ++ int err = -EINVAL; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (!mem_cgroup_disabled()) { ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_id(memcg_id); ++#ifdef CONFIG_MEMCG ++ if (memcg && !css_tryget(&memcg->css)) ++ memcg = NULL; ++#endif ++ rcu_read_unlock(); ++ ++ if (!memcg) ++ goto done; ++ } ++ if (memcg_id != mem_cgroup_id(memcg)) ++ goto done; ++ ++ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) ++ goto done; ++ ++ lruvec = get_lruvec(nid, memcg); ++ ++ if (swappiness < 0) ++ swappiness = get_swappiness(memcg); ++ else if (swappiness > 200) ++ goto done; ++ ++ switch (cmd) { ++ case '+': ++ err = run_aging(lruvec, sc, swappiness, seq, opt); ++ break; ++ case '-': ++ err = run_eviction(lruvec, sc, swappiness, seq, opt); ++ break; ++ } ++done: ++ mem_cgroup_put(memcg); ++ ++ return err; ++} ++ ++static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, ++ size_t len, loff_t *pos) ++{ ++ void *buf; ++ char *cur, *next; ++ unsigned int flags; ++ int err = 0; ++ struct scan_control sc = { ++ .may_writepage = 1, ++ .may_unmap = 1, ++ .may_swap = 1, ++ .reclaim_idx = MAX_NR_ZONES - 1, ++ .gfp_mask = GFP_KERNEL, ++ }; ++ ++ buf = kvmalloc(len + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ ++ if (copy_from_user(buf, src, len)) { ++ kvfree(buf); ++ return -EFAULT; ++ } ++ ++ next = buf; ++ next[len] = '\0'; ++ ++ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args(); ++ if (!sc.reclaim_state.mm_walk_args) { ++ kvfree(buf); ++ return -ENOMEM; ++ } ++ ++ flags = memalloc_noreclaim_save(); ++ set_task_reclaim_state(current, &sc.reclaim_state); ++ ++ while ((cur = strsep(&next, ",;\n"))) { ++ int n; ++ int end; ++ char cmd; ++ unsigned int memcg_id; ++ unsigned int nid; ++ unsigned long seq; ++ unsigned int swappiness = -1; ++ unsigned long opt = -1; ++ ++ cur = skip_spaces(cur); ++ if (!*cur) ++ continue; ++ ++ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, ++ &seq, &end, &swappiness, &end, &opt, &end); ++ if (n < 4 || cur[end]) { ++ err = -EINVAL; ++ break; ++ } ++ ++ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt); ++ if (err) ++ break; ++ } ++ ++ set_task_reclaim_state(current, NULL); ++ memalloc_noreclaim_restore(flags); ++ ++ free_mm_walk_args(sc.reclaim_state.mm_walk_args); ++ kvfree(buf); ++ ++ return err ? : len; ++} ++ ++static int lru_gen_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &lru_gen_seq_ops); ++} ++ ++static const struct file_operations lru_gen_rw_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .write = lru_gen_seq_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static const struct file_operations lru_gen_ro_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++/****************************************************************************** + * initialization + ******************************************************************************/ + +@@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void) + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); + ++ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) ++ pr_err("lru_gen: failed to create sysfs group\n"); ++ ++ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); ++ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); ++ + return 0; + }; + late_initcall(init_lru_gen); diff --git a/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch b/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch new file mode 100644 index 00000000000000..4462549f99bd7a --- /dev/null +++ b/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch @@ -0,0 +1,80 @@ +From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 25 Jan 2021 21:47:24 -0700 +Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig + +Add configuration options for the multigenerational lru. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635 +--- + mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 59 insertions(+) + +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -899,4 +899,63 @@ config SECRETMEM + + source "mm/damon/Kconfig" + ++# the multigenerational lru { ++config LRU_GEN ++ bool "Multigenerational LRU" ++ depends on MMU ++ # the following options may leave not enough spare bits in page->flags ++ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP) ++ help ++ A high performance LRU implementation to heavily overcommit workloads ++ that are not IO bound. See Documentation/vm/multigen_lru.rst for ++ details. ++ ++ Warning: do not enable this option unless you plan to use it because ++ it introduces a small per-process and per-memcg and per-node memory ++ overhead. ++ ++config LRU_GEN_ENABLED ++ bool "Turn on by default" ++ depends on LRU_GEN ++ help ++ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option ++ changes it to 1. ++ ++ Warning: the default value is the fast path. See ++ Documentation/static-keys.txt for details. ++ ++config LRU_GEN_STATS ++ bool "Full stats for debugging" ++ depends on LRU_GEN ++ help ++ This option keeps full stats for each generation, which can be read ++ from /sys/kernel/debug/lru_gen_full. ++ ++ Warning: do not enable this option unless you plan to use it because ++ it introduces an additional small per-process and per-memcg and ++ per-node memory overhead. ++ ++config NR_LRU_GENS ++ int "Max number of generations" ++ depends on LRU_GEN ++ range 4 31 ++ default 7 ++ help ++ This will use order_base_2(N+1) spare bits from page flags. ++ ++ Warning: do not use numbers larger than necessary because each ++ generation introduces a small per-node and per-memcg memory overhead. ++ ++config TIERS_PER_GEN ++ int "Number of tiers per generation" ++ depends on LRU_GEN ++ range 2 5 ++ default 4 ++ help ++ This will use N-2 spare bits from page flags. ++ ++ Larger values generally offer better protection to active pages under ++ heavy buffered I/O workloads. ++# } ++ + endmenu diff --git a/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch b/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch new file mode 100644 index 00000000000000..f4716fb68d6aab --- /dev/null +++ b/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch @@ -0,0 +1,161 @@ +From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Tue, 2 Feb 2021 01:27:45 -0700 +Subject: [PATCH 10/10] mm: multigenerational lru: documentation + +Add Documentation/vm/multigen_lru.rst. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e +--- + Documentation/vm/index.rst | 1 + + Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++ + 2 files changed, 133 insertions(+) + create mode 100644 Documentation/vm/multigen_lru.rst + +--- a/Documentation/vm/index.rst ++++ b/Documentation/vm/index.rst +@@ -17,6 +17,7 @@ various features of the Linux memory man + + swap_numa + zswap ++ multigen_lru + + Kernel developers MM documentation + ================================== +--- /dev/null ++++ b/Documentation/vm/multigen_lru.rst +@@ -0,0 +1,132 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++===================== ++Multigenerational LRU ++===================== ++ ++Quick Start ++=========== ++Build Configurations ++-------------------- ++:Required: Set ``CONFIG_LRU_GEN=y``. ++ ++:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by ++ default. ++ ++Runtime Configurations ++---------------------- ++:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the ++ feature was not turned on by default. ++ ++:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to ++ protect the working set of ``N`` milliseconds. The OOM killer is ++ invoked if this working set cannot be kept in memory. ++ ++:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature ++ is turned on. This file has the following output: ++ ++:: ++ ++ memcg memcg_id memcg_path ++ node node_id ++ min_gen birth_time anon_size file_size ++ ... ++ max_gen birth_time anon_size file_size ++ ++``min_gen`` is the oldest generation number and ``max_gen`` is the ++youngest generation number. ``birth_time`` is in milliseconds. ++``anon_size`` and ``file_size`` are in pages. ++ ++Phones/Laptops/Workstations ++--------------------------- ++No additional configurations required. ++ ++Servers/Data Centers ++-------------------- ++:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a ++ larger number. ++ ++:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger ++ number. ++ ++:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``. ++ ++:Working set estimation: Write ``+ memcg_id node_id max_gen ++ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to ++ invoke the aging, which scans PTEs for accessed pages and then ++ creates the next generation ``max_gen+1``. A swap file and a non-zero ++ ``swappiness``, which overrides ``vm.swappiness``, are required to ++ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to ++ override the default behavior which only scans PTE tables found ++ populated. ++ ++:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness] ++ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the ++ eviction, which evicts generations less than or equal to ``min_gen``. ++ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and ++ ``max_gen-1`` are not fully aged and therefore cannot be evicted. ++ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple ++ command lines are supported, so does concatenation with delimiters ++ ``,`` and ``;``. ++ ++Framework ++========= ++For each ``lruvec``, evictable pages are divided into multiple ++generations. The youngest generation number is stored in ++``lrugen->max_seq`` for both anon and file types as they are aged on ++an equal footing. The oldest generation numbers are stored in ++``lrugen->min_seq[]`` separately for anon and file types as clean ++file pages can be evicted regardless of swap and writeback ++constraints. These three variables are monotonically increasing. ++Generation numbers are truncated into ++``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into ++``page->flags``. The sliding window technique is used to prevent ++truncated generation numbers from overlapping. Each truncated ++generation number is an index to an array of per-type and per-zone ++lists ``lrugen->lists``. ++ ++Each generation is divided into multiple tiers. Tiers represent ++different ranges of numbers of accesses from file descriptors only. ++Pages accessed ``N`` times via file descriptors belong to tier ++``order_base_2(N)``. Each generation contains at most ++``CONFIG_TIERS_PER_GEN`` tiers, and they require additional ++``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to ++moving between generations which requires list operations, moving ++between tiers only involves operations on ``page->flags`` and ++therefore has a negligible cost. A feedback loop modeled after the PID ++controller monitors refaulted % across all tiers and decides when to ++protect pages from which tiers. ++ ++The framework comprises two conceptually independent components: the ++aging and the eviction, which can be invoked separately from user ++space for the purpose of working set estimation and proactive reclaim. ++ ++Aging ++----- ++The aging produces young generations. Given an ``lruvec``, the aging ++traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()`` ++to scan PTEs for accessed pages (a ``mm_struct`` list is maintained ++for each ``memcg``). Upon finding one, the aging updates its ++generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``). ++After each round of traversal, the aging increments ``max_seq``. The ++aging is due when ``min_seq[]`` reaches ``max_seq-1``. ++ ++Eviction ++-------- ++The eviction consumes old generations. Given an ``lruvec``, the ++eviction scans pages on the per-zone lists indexed by anon and file ++``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to ++select a type based on the values of ``min_seq[]``. If they are ++equal, it selects the type that has a lower refaulted %. The eviction ++sorts a page according to its updated generation number if the aging ++has found this page accessed. It also moves a page to the next ++generation if this page is from an upper tier that has a higher ++refaulted % than the base tier. The eviction increments ``min_seq[]`` ++of a selected type when it finds all the per-zone lists indexed by ++``min_seq[]`` of this selected type are empty. ++ ++To-do List ++========== ++KVM Optimization ++---------------- ++Support shadow page table walk. From 0e7c277d030cff70ca4afb18422cb790e32bf63e Mon Sep 17 00:00:00 2001 From: lean Date: Fri, 16 Sep 2022 02:10:11 +0800 Subject: [PATCH 07/11] rockchip: fix NanoPi R4SE has unique MAC address when boot from emmc --- .../rockchip/armv8/base-files/etc/board.d/02_network | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/target/linux/rockchip/armv8/base-files/etc/board.d/02_network b/target/linux/rockchip/armv8/base-files/etc/board.d/02_network index 267cf2ddc85686..9cdcae422dd068 100755 --- a/target/linux/rockchip/armv8/base-files/etc/board.d/02_network +++ b/target/linux/rockchip/armv8/base-files/etc/board.d/02_network @@ -39,9 +39,7 @@ rockchip_setup_interfaces() generate_mac_from_mmc_cid() { - local mmc_dev=$1 - - local sd_hash=$(sha256sum /sys/class/block/$mmc_dev/device/cid) + local sd_hash=$(sha256sum /sys/class/block/mmcblk*/device/cid | head -n 1) local mac_base=$(macaddr_canonicalize "$(echo "${sd_hash}" | dd bs=1 count=12 2>/dev/null)") echo "$(macaddr_unsetbit_mc "$(macaddr_setbit_la "${mac_base}")")" } @@ -58,7 +56,7 @@ nanopi_r4s_get_mac() address=$(macaddr_setbit_la "$address") fi else - address=$(generate_mac_from_mmc_cid mmcblk1) + address=$(generate_mac_from_mmc_cid) if [ "$interface" = "lan" ]; then address=$(macaddr_add "$address" 1) fi @@ -84,7 +82,7 @@ rockchip_setup_macs() friendlyarm,nanopi-r2s|\ friendlyarm,nanopi-r5s|\ sharevdi,guangmiao-g4c) - wan_mac=$(generate_mac_from_mmc_cid mmcblk0) + wan_mac=$(generate_mac_from_mmc_cid) lan_mac=$(macaddr_add "$wan_mac" +1) ;; friendlyarm,nanopi-r4s|\ From 84c7916464379a82d130bd454b08d67ed7ecdca9 Mon Sep 17 00:00:00 2001 From: aakkll <94471752+aakkll@users.noreply.github.com> Date: Fri, 16 Sep 2022 02:10:53 +0800 Subject: [PATCH 08/11] kernel: bump 5.19 to 5.19.9 (#10125) Signed-off-by: aakkll <94471752+aakkll@users.noreply.github.com> Signed-off-by: aakkll <94471752+aakkll@users.noreply.github.com> --- include/kernel-5.19 | 4 ++-- .../613-netfilter_optional_tcp_window_check.patch | 4 ++-- target/linux/generic/pending-5.19/655-increase_skb_pad.patch | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/kernel-5.19 b/include/kernel-5.19 index 8f3128bb435137..bc8d3d27267e03 100644 --- a/include/kernel-5.19 +++ b/include/kernel-5.19 @@ -1,2 +1,2 @@ -LINUX_VERSION-5.19 = .8 -LINUX_KERNEL_HASH-5.19.8 = 616308795a952a6a39b4c74807c33916850eb7166d8ed7c9a87a1ba55d7487ce +LINUX_VERSION-5.19 = .9 +LINUX_KERNEL_HASH-5.19.9 = 0ad5b5986693adc1962be807bc3a64423a24b6a9da9df39b259d7e3bfd927f37 diff --git a/target/linux/generic/pending-5.19/613-netfilter_optional_tcp_window_check.patch b/target/linux/generic/pending-5.19/613-netfilter_optional_tcp_window_check.patch index 7fa3d900c29f65..328440f4fbd2a9 100644 --- a/target/linux/generic/pending-5.19/613-netfilter_optional_tcp_window_check.patch +++ b/target/linux/generic/pending-5.19/613-netfilter_optional_tcp_window_check.patch @@ -19,7 +19,7 @@ Signed-off-by: Christian 'Ansuel' Marangi /* * Get the required data from the packet. */ -@@ -1130,7 +1133,7 @@ int nf_conntrack_tcp_packet(struct nf_co +@@ -1161,7 +1164,7 @@ int nf_conntrack_tcp_packet(struct nf_co IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; @@ -28,7 +28,7 @@ Signed-off-by: Christian 'Ansuel' Marangi timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; else -@@ -1446,6 +1449,9 @@ void nf_conntrack_tcp_init_net(struct ne +@@ -1477,6 +1480,9 @@ void nf_conntrack_tcp_init_net(struct ne */ tn->tcp_be_liberal = 0; diff --git a/target/linux/generic/pending-5.19/655-increase_skb_pad.patch b/target/linux/generic/pending-5.19/655-increase_skb_pad.patch index 81367e1eeee20a..a6c5a35e19a904 100644 --- a/target/linux/generic/pending-5.19/655-increase_skb_pad.patch +++ b/target/linux/generic/pending-5.19/655-increase_skb_pad.patch @@ -9,7 +9,7 @@ Signed-off-by: Felix Fietkau --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h -@@ -3096,7 +3096,7 @@ static inline int pskb_network_may_pull( +@@ -3123,7 +3123,7 @@ static inline int pskb_network_may_pull( * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8) */ #ifndef NET_SKB_PAD From a9b138ae004efadb16d4048a79e70a96b6aa870a Mon Sep 17 00:00:00 2001 From: lean Date: Fri, 16 Sep 2022 02:19:40 +0800 Subject: [PATCH 09/11] x86: add Aquantia AQtion 5/2.5GbE USB-to-Ethernet by default --- target/linux/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/linux/x86/Makefile b/target/linux/x86/Makefile index 7fd4ce1e3a5958..13e5da9e47ada1 100644 --- a/target/linux/x86/Makefile +++ b/target/linux/x86/Makefile @@ -17,12 +17,12 @@ KERNELNAME:=bzImage include $(INCLUDE_DIR)/target.mk -DEFAULT_PACKAGES += partx-utils mkf2fs e2fsprogs kmod-button-hotplug kmod-usb-hid kmod-mmc kmod-sdhci bridger usbutils pciutils \ +DEFAULT_PACKAGES += partx-utils mkf2fs e2fsprogs kmod-button-hotplug kmod-usb-hid kmod-mmc kmod-sdhci usbutils pciutils \ kmod-alx kmod-e1000e kmod-igb kmod-igc kmod-igbvf kmod-iavf kmod-bnx2x kmod-pcnet32 kmod-tulip kmod-via-velocity kmod-vmxnet3 kmod-i40e kmod-i40evf kmod-r8125 kmod-8139cp kmod-8139too kmod-fs-f2fs cfdisk \ htop lm-sensors iperf3 autocore-x86 automount autosamba luci-app-adbyby-plus luci-app-ipsec-vpnd luci-proto-bonding luci-app-diskman \ luci-app-unblockmusic luci-app-zerotier luci-app-xlnetacc ddns-scripts_aliyun ddns-scripts_dnspod ca-bundle luci-app-wireguard luci-app-ttyd \ kmod-sound-hda-core kmod-sound-hda-codec-realtek kmod-sound-hda-codec-via kmod-sound-via82xx kmod-sound-hda-intel kmod-sound-hda-codec-hdmi kmod-sound-i8x0 kmod-usb-audio \ -kmod-usb-net kmod-usb-net-asix-ax88179 kmod-usb-net-rtl8150 kmod-usb-net-rtl8152-vendor kmod-mlx4-core kmod-mlx5-core kmod-drm-i915 kmod-i915-gvt +kmod-usb-net kmod-usb-net-asix-ax88179 kmod-usb-net-rtl8150 kmod-usb-net-rtl8152-vendor kmod-usb-net-aqc111 kmod-mlx4-core kmod-mlx5-core kmod-drm-i915 kmod-i915-gvt $(eval $(call BuildTarget)) From 0b9571cf20c9554b19e79f00af2a6d0b8fbaa5c6 Mon Sep 17 00:00:00 2001 From: lean Date: Fri, 16 Sep 2022 02:23:58 +0800 Subject: [PATCH 10/11] Config.in: disable LLVM default building --- toolchain/Config.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toolchain/Config.in b/toolchain/Config.in index 4a2f91a8b43076..2d29ec09e760a4 100644 --- a/toolchain/Config.in +++ b/toolchain/Config.in @@ -41,7 +41,7 @@ menuconfig TARGET_OPTIONS prompt "BPF toolchain" if DEVEL default BPF_TOOLCHAIN_BUILD_LLVM if BUILDBOT default BPF_TOOLCHAIN_PREBUILT if HAS_PREBUILT_LLVM_TOOLCHAIN - default BPF_TOOLCHAIN_BUILD_LLVM + default BPF_TOOLCHAIN_NONE config BPF_TOOLCHAIN_NONE bool "None" @@ -324,7 +324,7 @@ config USE_LLVM_PREBUILT bool config USE_LLVM_BUILD - default y + default y if !DEVEL && BUILDBOT select HAS_BPF_TOOLCHAIN bool From 83903ede7b63691a43173f94caa37b92d9b46975 Mon Sep 17 00:00:00 2001 From: lean Date: Fri, 16 Sep 2022 02:33:11 +0800 Subject: [PATCH 11/11] bcm53xx: PHICOMM K3 add k3screenctrl by default --- target/linux/bcm53xx/image/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/linux/bcm53xx/image/Makefile b/target/linux/bcm53xx/image/Makefile index d101ff95a7b89f..813de2f9ee1339 100644 --- a/target/linux/bcm53xx/image/Makefile +++ b/target/linux/bcm53xx/image/Makefile @@ -436,7 +436,7 @@ TARGET_DEVICES += smartrg_sr400ac define Device/phicomm_k3 DEVICE_VENDOR := PHICOMM DEVICE_MODEL := K3 - DEVICE_PACKAGES := $(BRCMFMAC_4366C0) $(USB3_PACKAGES) + DEVICE_PACKAGES := $(IEEE8021X) kmod-brcmfmac k3wifi $(USB3_PACKAGES) k3screenctrl IMAGES := trx endef TARGET_DEVICES += phicomm_k3 @@ -448,7 +448,7 @@ define Device/tenda_ac9 IMAGES := trx IMAGE/trx := append-rootfs | trx-serial endef -TARGET_DEVICES += tenda_ac9 +#TARGET_DEVICES += tenda_ac9 define Device/tplink_archer-c5-v2 DEVICE_VENDOR := TP-Link