From af82800cd2d9cfcae2de1b47a35c26c147b9cdd1 Mon Sep 17 00:00:00 2001
From: wangbo <wdjjwb@163.com>
Date: Thu, 12 Apr 2018 16:58:08 +0800
Subject: [PATCH 001/393] spi: imx: Update MODULE_DESCRIPTION to "SPI
 Controller driver"

Now i.MX SPI controller can work in Slave mode.
Update MODULE_DESCRIPTION to "SPI Controller driver".

Signed-off-by: wangbo <wang.bo116@zte.com.cn>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-imx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
index 6f57592a7f95..a056ee88a960 100644
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -1701,7 +1701,7 @@ static struct platform_driver spi_imx_driver = {
 };
 module_platform_driver(spi_imx_driver);
 
-MODULE_DESCRIPTION("SPI Master Controller driver");
+MODULE_DESCRIPTION("SPI Controller driver");
 MODULE_AUTHOR("Sascha Hauer, Pengutronix");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:" DRIVER_NAME);

From 10b4640833e95eeacaef8060bc1b35e636df3218 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Date: Fri, 13 Apr 2018 15:44:16 +0300
Subject: [PATCH 002/393] spi: sh-msiof: Fix bit field overflow writes to
 TSCR/RSCR

The change fixes a bit field overflow which allows to write to higher
bits while calculating SPI transfer clock and setting BRPS and BRDV
bit fields, the problem is reproduced if 'parent_rate' to 'spi_hz'
ratio is greater than 1024, for instance

  p->min_div      = 2,
  MSO rate        = 33333333,
  SPI device rate = 10000

results in

  k          = 5, i.e. BRDV = 0b100 or 1/32 prescaler output,
  BRPS       = 105,
  TSCR value = 0x6804, thus MSSEL and MSIMM bit fields are non-zero.

Fixes: 65d5665bb260 ("spi: sh-msiof: Update calculation of frequency dividing")
Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-sh-msiof.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c
index ae086aab57d5..8171eedbfc90 100644
--- a/drivers/spi/spi-sh-msiof.c
+++ b/drivers/spi/spi-sh-msiof.c
@@ -283,6 +283,7 @@ static void sh_msiof_spi_set_clk_regs(struct sh_msiof_spi_priv *p,
 	}
 
 	k = min_t(int, k, ARRAY_SIZE(sh_msiof_spi_div_table) - 1);
+	brps = min_t(int, brps, 32);
 
 	scr = sh_msiof_spi_div_table[k].brdv | SCR_BRPS(brps);
 	sh_msiof_write(p, TSCR, scr);

From 49530e6411789c1b9ea3ebc58e520c19d1c3752f Mon Sep 17 00:00:00 2001
From: sxauwsk <sxauwsk@163.com>
Date: Tue, 17 Apr 2018 04:01:27 +0800
Subject: [PATCH 003/393] spi: cadence: Add usleep_range() for
 cdns_spi_fill_tx_fifo()

In case of xspi work in busy condition, may send bytes failed.
once something wrong, spi controller did't work any more

My test found this situation appear in both of read/write process.
so when TX FIFO is full, add one byte delay before send data;

Signed-off-by: sxauwsk <sxauwsk@163.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-cadence.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/spi/spi-cadence.c b/drivers/spi/spi-cadence.c
index 5c9516ae4942..4a001634023e 100644
--- a/drivers/spi/spi-cadence.c
+++ b/drivers/spi/spi-cadence.c
@@ -313,6 +313,14 @@ static void cdns_spi_fill_tx_fifo(struct cdns_spi *xspi)
 
 	while ((trans_cnt < CDNS_SPI_FIFO_DEPTH) &&
 	       (xspi->tx_bytes > 0)) {
+
+		/* When xspi in busy condition, bytes may send failed,
+		 * then spi control did't work thoroughly, add one byte delay
+		 */
+		if (cdns_spi_read(xspi, CDNS_SPI_ISR) &
+		    CDNS_SPI_IXR_TXFULL)
+			usleep_range(10, 20);
+
 		if (xspi->txbuf)
 			cdns_spi_write(xspi, CDNS_SPI_TXD, *xspi->txbuf++);
 		else

From 15a1ff30d8f9bd83273d8712973b88663ad16265 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Fri, 20 Apr 2018 14:57:52 +0300
Subject: [PATCH 004/393] ARM: dts: r8a7790: Convert to new LVDS DT bindings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The internal LVDS encoder now has DT bindings separate from the DU. Port
the device tree over to the new model.

Fixes: c6a27fa41fab ("drm: rcar-du: Convert LVDS encoder code to bridge driver")
Fixes: 4bdb7aa7dcd0 ("ARM: dts: r8a7790: add soc node")
Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 arch/arm/boot/dts/r8a7790-lager.dts | 22 +++++++---
 arch/arm/boot/dts/r8a7790.dtsi      | 65 +++++++++++++++++++++++++----
 2 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/arch/arm/boot/dts/r8a7790-lager.dts b/arch/arm/boot/dts/r8a7790-lager.dts
index 063fdb65dc60..f07f9018c3e7 100644
--- a/arch/arm/boot/dts/r8a7790-lager.dts
+++ b/arch/arm/boot/dts/r8a7790-lager.dts
@@ -379,7 +379,7 @@ ports {
 				port@0 {
 					reg = <0>;
 					adv7511_in: endpoint {
-						remote-endpoint = <&du_out_lvds0>;
+						remote-endpoint = <&lvds0_out>;
 					};
 				};
 
@@ -467,10 +467,8 @@ &du {
 	status = "okay";
 
 	clocks = <&cpg CPG_MOD 724>, <&cpg CPG_MOD 723>, <&cpg CPG_MOD 722>,
-		 <&cpg CPG_MOD 726>, <&cpg CPG_MOD 725>,
 		 <&x13_clk>, <&x2_clk>;
-	clock-names = "du.0", "du.1", "du.2", "lvds.0", "lvds.1",
-		      "dclkin.0", "dclkin.1";
+	clock-names = "du.0", "du.1", "du.2", "dclkin.0", "dclkin.1";
 
 	ports {
 		port@0 {
@@ -478,12 +476,26 @@ endpoint {
 				remote-endpoint = <&adv7123_in>;
 			};
 		};
+	};
+};
+
+&lvds0 {
+	status = "okay";
+
+	ports {
 		port@1 {
 			endpoint {
 				remote-endpoint = <&adv7511_in>;
 			};
 		};
-		port@2 {
+	};
+};
+
+&lvds1 {
+	status = "okay";
+
+	ports {
+		port@1 {
 			lvds_connector: endpoint {
 			};
 		};
diff --git a/arch/arm/boot/dts/r8a7790.dtsi b/arch/arm/boot/dts/r8a7790.dtsi
index e4367cecad18..05a0fc23ac88 100644
--- a/arch/arm/boot/dts/r8a7790.dtsi
+++ b/arch/arm/boot/dts/r8a7790.dtsi
@@ -1627,18 +1627,13 @@ jpu: jpeg-codec@fe980000 {
 
 		du: display@feb00000 {
 			compatible = "renesas,du-r8a7790";
-			reg = <0 0xfeb00000 0 0x70000>,
-			      <0 0xfeb90000 0 0x1c>,
-			      <0 0xfeb94000 0 0x1c>;
-			reg-names = "du", "lvds.0", "lvds.1";
+			reg = <0 0xfeb00000 0 0x70000>;
 			interrupts = <GIC_SPI 256 IRQ_TYPE_LEVEL_HIGH>,
 				     <GIC_SPI 268 IRQ_TYPE_LEVEL_HIGH>,
 				     <GIC_SPI 269 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&cpg CPG_MOD 724>, <&cpg CPG_MOD 723>,
-				 <&cpg CPG_MOD 722>, <&cpg CPG_MOD 726>,
-				 <&cpg CPG_MOD 725>;
-			clock-names = "du.0", "du.1", "du.2", "lvds.0",
-				      "lvds.1";
+				 <&cpg CPG_MOD 722>;
+			clock-names = "du.0", "du.1", "du.2";
 			status = "disabled";
 
 			ports {
@@ -1653,11 +1648,65 @@ du_out_rgb: endpoint {
 				port@1 {
 					reg = <1>;
 					du_out_lvds0: endpoint {
+						remote-endpoint = <&lvds0_in>;
 					};
 				};
 				port@2 {
 					reg = <2>;
 					du_out_lvds1: endpoint {
+						remote-endpoint = <&lvds1_in>;
+					};
+				};
+			};
+		};
+
+		lvds0: lvds@feb90000 {
+			compatible = "renesas,r8a7790-lvds";
+			reg = <0 0xfeb90000 0 0x1c>;
+			clocks = <&cpg CPG_MOD 726>;
+			power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+			resets = <&cpg 726>;
+			status = "disabled";
+
+			ports {
+				#address-cells = <1>;
+				#size-cells = <0>;
+
+				port@0 {
+					reg = <0>;
+					lvds0_in: endpoint {
+						remote-endpoint = <&du_out_lvds0>;
+					};
+				};
+				port@1 {
+					reg = <1>;
+					lvds0_out: endpoint {
+					};
+				};
+			};
+		};
+
+		lvds1: lvds@feb94000 {
+			compatible = "renesas,r8a7790-lvds";
+			reg = <0 0xfeb94000 0 0x1c>;
+			clocks = <&cpg CPG_MOD 725>;
+			power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+			resets = <&cpg 725>;
+			status = "disabled";
+
+			ports {
+				#address-cells = <1>;
+				#size-cells = <0>;
+
+				port@0 {
+					reg = <0>;
+					lvds1_in: endpoint {
+						remote-endpoint = <&du_out_lvds1>;
+					};
+				};
+				port@1 {
+					reg = <1>;
+					lvds1_out: endpoint {
 					};
 				};
 			};

From e5c3f4707f3956a2f34b8c16daad07a16873c498 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Fri, 20 Apr 2018 14:57:53 +0300
Subject: [PATCH 005/393] ARM: dts: r8a7791: Convert to new LVDS DT bindings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The internal LVDS encoder now has DT bindings separate from the DU. Port
the device tree over to the new model.

Fixes: c6a27fa41fab ("drm: rcar-du: Convert LVDS encoder code to bridge driver")
Fixes: bb21803ea440 ("ARM: dts: r8a7791: add soc node")
Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 arch/arm/boot/dts/r8a7791-koelsch.dts | 12 ++++++---
 arch/arm/boot/dts/r8a7791-porter.dts  | 16 +++++++++---
 arch/arm/boot/dts/r8a7791.dtsi        | 36 ++++++++++++++++++++++-----
 3 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/arch/arm/boot/dts/r8a7791-koelsch.dts b/arch/arm/boot/dts/r8a7791-koelsch.dts
index f40321a1c917..9d7213a0b8b8 100644
--- a/arch/arm/boot/dts/r8a7791-koelsch.dts
+++ b/arch/arm/boot/dts/r8a7791-koelsch.dts
@@ -468,10 +468,9 @@ &du {
 	pinctrl-names = "default";
 	status = "okay";
 
-	clocks = <&cpg CPG_MOD 724>, <&cpg CPG_MOD 723>, <&cpg CPG_MOD 726>,
+	clocks = <&cpg CPG_MOD 724>, <&cpg CPG_MOD 723>,
 		 <&x13_clk>, <&x2_clk>;
-	clock-names = "du.0", "du.1", "lvds.0",
-		      "dclkin.0", "dclkin.1";
+	clock-names = "du.0", "du.1", "dclkin.0", "dclkin.1";
 
 	ports {
 		port@0 {
@@ -479,6 +478,13 @@ endpoint {
 				remote-endpoint = <&adv7511_in>;
 			};
 		};
+	};
+};
+
+&lvds0 {
+	status = "okay";
+
+	ports {
 		port@1 {
 			lvds_connector: endpoint {
 			};
diff --git a/arch/arm/boot/dts/r8a7791-porter.dts b/arch/arm/boot/dts/r8a7791-porter.dts
index c14e6fe9e4f6..ae9ed9ff53ef 100644
--- a/arch/arm/boot/dts/r8a7791-porter.dts
+++ b/arch/arm/boot/dts/r8a7791-porter.dts
@@ -441,10 +441,9 @@ &du {
 	pinctrl-names = "default";
 	status = "okay";
 
-	clocks = <&cpg CPG_MOD 724>, <&cpg CPG_MOD 723>, <&cpg CPG_MOD 726>,
+	clocks = <&cpg CPG_MOD 724>, <&cpg CPG_MOD 723>,
 		 <&x3_clk>, <&x16_clk>;
-	clock-names = "du.0", "du.1", "lvds.0",
-		      "dclkin.0", "dclkin.1";
+	clock-names = "du.0", "du.1", "dclkin.0", "dclkin.1";
 
 	ports {
 		port@0 {
@@ -455,6 +454,17 @@ endpoint {
 	};
 };
 
+&lvds0 {
+	status = "okay";
+
+	ports {
+		port@1 {
+			lvds_connector: endpoint {
+			};
+		};
+	};
+};
+
 &rcar_sound {
 	pinctrl-0 = <&ssi_pins &audio_clk_pins>;
 	pinctrl-names = "default";
diff --git a/arch/arm/boot/dts/r8a7791.dtsi b/arch/arm/boot/dts/r8a7791.dtsi
index f11dab71b03a..506b20885413 100644
--- a/arch/arm/boot/dts/r8a7791.dtsi
+++ b/arch/arm/boot/dts/r8a7791.dtsi
@@ -1633,15 +1633,12 @@ jpu: jpeg-codec@fe980000 {
 
 		du: display@feb00000 {
 			compatible = "renesas,du-r8a7791";
-			reg = <0 0xfeb00000 0 0x40000>,
-			      <0 0xfeb90000 0 0x1c>;
-			reg-names = "du", "lvds.0";
+			reg = <0 0xfeb00000 0 0x40000>;
 			interrupts = <GIC_SPI 256 IRQ_TYPE_LEVEL_HIGH>,
 				     <GIC_SPI 268 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&cpg CPG_MOD 724>,
-				 <&cpg CPG_MOD 723>,
-				 <&cpg CPG_MOD 726>;
-			clock-names = "du.0", "du.1", "lvds.0";
+				 <&cpg CPG_MOD 723>;
+			clock-names = "du.0", "du.1";
 			status = "disabled";
 
 			ports {
@@ -1656,6 +1653,33 @@ du_out_rgb: endpoint {
 				port@1 {
 					reg = <1>;
 					du_out_lvds0: endpoint {
+						remote-endpoint = <&lvds0_in>;
+					};
+				};
+			};
+		};
+
+		lvds0: lvds@feb90000 {
+			compatible = "renesas,r8a7791-lvds";
+			reg = <0 0xfeb90000 0 0x1c>;
+			clocks = <&cpg CPG_MOD 726>;
+			power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
+			resets = <&cpg 726>;
+			status = "disabled";
+
+			ports {
+				#address-cells = <1>;
+				#size-cells = <0>;
+
+				port@0 {
+					reg = <0>;
+					lvds0_in: endpoint {
+						remote-endpoint = <&du_out_lvds0>;
+					};
+				};
+				port@1 {
+					reg = <1>;
+					lvds0_out: endpoint {
 					};
 				};
 			};

From edb0c3affe5214a21d71ffb82ca92ed068e828df Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Fri, 20 Apr 2018 14:57:54 +0300
Subject: [PATCH 006/393] ARM: dts: r8a7793: Convert to new LVDS DT bindings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The internal LVDS encoder now has DT bindings separate from the DU. Port
the device tree over to the new model.

Fixes: c6a27fa41fab ("drm: rcar-du: Convert LVDS encoder code to bridge driver")
Fixes: bff8f8c2feb7 ("ARM: dts: r8a7793: add soc node")
Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 arch/arm/boot/dts/r8a7793-gose.dts | 10 +++++---
 arch/arm/boot/dts/r8a7793.dtsi     | 37 +++++++++++++++++++++++++-----
 2 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/arch/arm/boot/dts/r8a7793-gose.dts b/arch/arm/boot/dts/r8a7793-gose.dts
index 9ed6961f2d9a..96e117d8b2cc 100644
--- a/arch/arm/boot/dts/r8a7793-gose.dts
+++ b/arch/arm/boot/dts/r8a7793-gose.dts
@@ -447,10 +447,9 @@ &du {
 	pinctrl-names = "default";
 	status = "okay";
 
-	clocks = <&cpg CPG_MOD 724>, <&cpg CPG_MOD 723>, <&cpg CPG_MOD 726>,
+	clocks = <&cpg CPG_MOD 724>, <&cpg CPG_MOD 723>,
 		 <&x13_clk>, <&x2_clk>;
-	clock-names = "du.0", "du.1", "lvds.0",
-		      "dclkin.0", "dclkin.1";
+	clock-names = "du.0", "du.1", "dclkin.0", "dclkin.1";
 
 	ports {
 		port@0 {
@@ -458,6 +457,11 @@ endpoint {
 				remote-endpoint = <&adv7511_in>;
 			};
 		};
+	};
+};
+
+&lvds0 {
+	ports {
 		port@1 {
 			lvds_connector: endpoint {
 			};
diff --git a/arch/arm/boot/dts/r8a7793.dtsi b/arch/arm/boot/dts/r8a7793.dtsi
index f9c5a557107d..4f526030dc7c 100644
--- a/arch/arm/boot/dts/r8a7793.dtsi
+++ b/arch/arm/boot/dts/r8a7793.dtsi
@@ -1292,15 +1292,12 @@ gic: interrupt-controller@f1001000 {
 
 		du: display@feb00000 {
 			compatible = "renesas,du-r8a7793";
-			reg = <0 0xfeb00000 0 0x40000>,
-			      <0 0xfeb90000 0 0x1c>;
-			reg-names = "du", "lvds.0";
+			reg = <0 0xfeb00000 0 0x40000>;
 			interrupts = <GIC_SPI 256 IRQ_TYPE_LEVEL_HIGH>,
 				     <GIC_SPI 268 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&cpg CPG_MOD 724>,
-				 <&cpg CPG_MOD 723>,
-				 <&cpg CPG_MOD 726>;
-			clock-names = "du.0", "du.1", "lvds.0";
+				 <&cpg CPG_MOD 723>;
+			clock-names = "du.0", "du.1";
 			status = "disabled";
 
 			ports {
@@ -1315,6 +1312,34 @@ du_out_rgb: endpoint {
 				port@1 {
 					reg = <1>;
 					du_out_lvds0: endpoint {
+						remote-endpoint = <&lvds0_in>;
+					};
+				};
+			};
+		};
+
+		lvds0: lvds@feb90000 {
+			compatible = "renesas,r8a7793-lvds";
+			reg = <0 0xfeb90000 0 0x1c>;
+			clocks = <&cpg CPG_MOD 726>;
+			power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
+			resets = <&cpg 726>;
+
+			status = "disabled";
+
+			ports {
+				#address-cells = <1>;
+				#size-cells = <0>;
+
+				port@0 {
+					reg = <0>;
+					lvds0_in: endpoint {
+						remote-endpoint = <&du_out_lvds0>;
+					};
+				};
+				port@1 {
+					reg = <1>;
+					lvds0_out: endpoint {
 					};
 				};
 			};

From 30f548ba90b83759c1b187156d6874334a804791 Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Wed, 18 Apr 2018 14:54:07 +0530
Subject: [PATCH 007/393] ARM: dts: da850: get rid of skeleton.dtsi

skeleton.dtsi is deprecated. Drop its usage in da850.dtsi
and move the nodes and properties included by it directly
to keep the dtb same.

The memory node has been changed to get rid of warnings
(see below). It contains the memory base address as that is
fixed for DA850 SoCs. But the size needs to be added by
bootloader or a board specific dts.

This gets rid of the following W=1 warnings:

arch/arm/boot/dts/da850-enbw-cmc.dtb: Warning (unit_address_vs_reg): /memory: node has a reg or ranges property, but no unit name
arch/arm/boot/dts/da850-evm.dtb: Warning (unit_address_vs_reg): /memory: node has a reg or ranges property, but no unit name
arch/arm/boot/dts/da850-lego-ev3.dtb: Warning (unit_address_vs_reg): /memory: node has a reg or ranges property, but no unit name

Reviewed-by: David Lechner <david@lechnology.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/boot/dts/da850.dtsi | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/da850.dtsi b/arch/arm/boot/dts/da850.dtsi
index c66cf7895363..d82be6c419df 100644
--- a/arch/arm/boot/dts/da850.dtsi
+++ b/arch/arm/boot/dts/da850.dtsi
@@ -7,10 +7,19 @@
  * Free Software Foundation;  either version 2 of the  License, or (at your
  * option) any later version.
  */
-#include "skeleton.dtsi"
 #include <dt-bindings/interrupt-controller/irq.h>
 
 / {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	chosen { };
+	aliases { };
+
+	memory@c0000000 {
+		device_type = "memory";
+		reg = <0xc0000000 0x0>;
+	};
+
 	arm {
 		#address-cells = <1>;
 		#size-cells = <1>;

From 01de0be5c1a3a03d043fd28c2a6a6fef245c9ab8 Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Wed, 18 Apr 2018 14:54:08 +0530
Subject: [PATCH 008/393] ARM: dts: da850-lcdk: add unit name for memory node

Add unit name for memory node to squash the W=1 warning:

arch/arm/boot/dts/da850-lcdk.dtb: Warning (unit_address_vs_reg): /memory: node has a reg or ranges property, but no unit name

While at it, drop the device_type property from memory
node since its provided by da850.dtsi already.

Reviewed-by: David Lechner <david@lechnology.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/boot/dts/da850-lcdk.dts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/da850-lcdk.dts b/arch/arm/boot/dts/da850-lcdk.dts
index a1f4d6d5a569..0edf769ea95c 100644
--- a/arch/arm/boot/dts/da850-lcdk.dts
+++ b/arch/arm/boot/dts/da850-lcdk.dts
@@ -21,8 +21,8 @@ chosen {
 		stdout-path = "serial2:115200n8";
 	};
 
-	memory {
-		device_type = "memory";
+	memory@c0000000 {
+		/* 128 MB DDR2 SDRAM @ 0xc0000000 */
 		reg = <0xc0000000 0x08000000>;
 	};
 

From 94a82284ad4711b7f9fd78981fdc7a1cb645030b Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Tue, 17 Apr 2018 18:06:00 +0530
Subject: [PATCH 009/393] ARM: dts: da850: fix W=1 warnings with pinmux node

Remove unused #address-cells and #size-cells from pinmux
node. This fixes W=1 warnings of the type:

arch/arm/boot/dts/da850-lcdk.dtb: Warning (avoid_unnecessary_addr_size): /soc@1c00000/pinmux@14120: unnecessary #address-cells/#size-cells without "ranges" or child "reg" property

Tested on DA850 LCDK by checking output of:

/sys/kernel/debug/pinctrl/1c14120.pinmux-pinctrl-single/pins

before and after the change.

Reviewed-by: David Lechner <david@lechnology.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/boot/dts/da850.dtsi | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm/boot/dts/da850.dtsi b/arch/arm/boot/dts/da850.dtsi
index d82be6c419df..12010002dbdb 100644
--- a/arch/arm/boot/dts/da850.dtsi
+++ b/arch/arm/boot/dts/da850.dtsi
@@ -55,8 +55,6 @@ soc@1c00000 {
 		pmx_core: pinmux@14120 {
 			compatible = "pinctrl-single";
 			reg = <0x14120 0x50>;
-			#address-cells = <1>;
-			#size-cells = <0>;
 			#pinctrl-cells = <2>;
 			pinctrl-single,bit-per-mux;
 			pinctrl-single,register-width = <32>;

From efc4a13724b852ddaa3358402a8dec024ffbcb17 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 19 Apr 2018 19:53:32 +0300
Subject: [PATCH 010/393] spi: pxa2xx: Allow 64-bit DMA

Currently the 32-bit device address only is supported for DMA. However,
starting from Intel Sunrisepoint PCH the DMA address of the device FIFO
can be 64-bit.

Change the respective variable to be compatible with DMA engine
expectations, i.e. to phys_addr_t.

Fixes: 34cadd9c1bcb ("spi: pxa2xx: Add support for Intel Sunrisepoint")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@vger.kernel.org
---
 drivers/spi/spi-pxa2xx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/spi/spi-pxa2xx.h b/drivers/spi/spi-pxa2xx.h
index 513ec6c6e25b..0ae7defd3492 100644
--- a/drivers/spi/spi-pxa2xx.h
+++ b/drivers/spi/spi-pxa2xx.h
@@ -38,7 +38,7 @@ struct driver_data {
 
 	/* SSP register addresses */
 	void __iomem *ioaddr;
-	u32 ssdr_physical;
+	phys_addr_t ssdr_physical;
 
 	/* SSP masks*/
 	u32 dma_cr1;

From f4e5200fc0d7dad75c688e7ccc0652481a916df5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 12 Apr 2018 11:31:31 +0900
Subject: [PATCH 011/393] arm64: dts: uniphier: fix input delay value for
 legacy mode of eMMC

The property of the legacy mode for the eMMC PHY turned out to
be wrong.  Some eMMC devices are unstable due to the set-up/hold
timing violation.  Correct the delay value.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 arch/arm64/boot/dts/socionext/uniphier-ld11.dtsi | 2 +-
 arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi | 2 +-
 arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld11.dtsi b/arch/arm64/boot/dts/socionext/uniphier-ld11.dtsi
index e62bda1cf2d9..c32dd3419c87 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-ld11.dtsi
+++ b/arch/arm64/boot/dts/socionext/uniphier-ld11.dtsi
@@ -414,7 +414,7 @@ emmc: sdhc@5a000000 {
 			mmc-ddr-1_8v;
 			mmc-hs200-1_8v;
 			mmc-pwrseq = <&emmc_pwrseq>;
-			cdns,phy-input-delay-legacy = <4>;
+			cdns,phy-input-delay-legacy = <9>;
 			cdns,phy-input-delay-mmc-highspeed = <2>;
 			cdns,phy-input-delay-mmc-ddr = <3>;
 			cdns,phy-dll-delay-sdclk = <21>;
diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi
index 9efe20d07589..3a5ed789c056 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi
+++ b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi
@@ -519,7 +519,7 @@ emmc: sdhc@5a000000 {
 			mmc-ddr-1_8v;
 			mmc-hs200-1_8v;
 			mmc-pwrseq = <&emmc_pwrseq>;
-			cdns,phy-input-delay-legacy = <4>;
+			cdns,phy-input-delay-legacy = <9>;
 			cdns,phy-input-delay-mmc-highspeed = <2>;
 			cdns,phy-input-delay-mmc-ddr = <3>;
 			cdns,phy-dll-delay-sdclk = <21>;
diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
index 7c8f710d9bfa..e85d6ddea3c2 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
+++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
@@ -334,7 +334,7 @@ emmc: sdhc@5a000000 {
 			mmc-ddr-1_8v;
 			mmc-hs200-1_8v;
 			mmc-pwrseq = <&emmc_pwrseq>;
-			cdns,phy-input-delay-legacy = <4>;
+			cdns,phy-input-delay-legacy = <9>;
 			cdns,phy-input-delay-mmc-highspeed = <2>;
 			cdns,phy-input-delay-mmc-ddr = <3>;
 			cdns,phy-dll-delay-sdclk = <21>;

From 019fca287f93fa5e110ec33bcfadf16c04181ee5 Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Thu, 26 Apr 2018 12:05:12 +0530
Subject: [PATCH 012/393] dmaengine: Update email address for Vinod

Update the email address for DMAengine maintainer

Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 0a1410d5a621..820553103018 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4305,7 +4305,7 @@ F:	Documentation/driver-api/dma-buf.rst
 T:	git git://anongit.freedesktop.org/drm/drm-misc
 
 DMA GENERIC OFFLOAD ENGINE SUBSYSTEM
-M:	Vinod Koul <vinod.koul@intel.com>
+M:	Vinod Koul <vkoul@kernel.org>
 L:	dmaengine@vger.kernel.org
 Q:	https://patchwork.kernel.org/project/linux-dmaengine/list/
 S:	Maintained

From 72d4d3e3980702809509586d36015b7c3c51fad4 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Sat, 21 Apr 2018 13:43:48 +0200
Subject: [PATCH 013/393] netfilter: Fix handling simultaneous open in TCP
 conntrack

Dominique Martinet reported a TCP hang problem when simultaneous open was used.
The problem is that the tcp_conntracks state table is not smart enough
to handle the case. The state table could be fixed by introducing a new state,
but that would require more lines of code compared to this patch, due to the
required backward compatibility with ctnetlink.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Reported-by: Dominique Martinet <asmadeus@codewreck.org>
Tested-by: Dominique Martinet <asmadeus@codewreck.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_tcp.h |  3 +++
 net/netfilter/nf_conntrack_proto_tcp.c          | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_tcp.h b/include/uapi/linux/netfilter/nf_conntrack_tcp.h
index 74b91151d494..bcba72def817 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tcp.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tcp.h
@@ -46,6 +46,9 @@ enum tcp_conntrack {
 /* Marks possibility for expected RFC5961 challenge ACK */
 #define IP_CT_EXP_CHALLENGE_ACK 		0x40
 
+/* Simultaneous open initialized */
+#define IP_CT_TCP_SIMULTANEOUS_OPEN		0x80
+
 struct nf_ct_tcp_flags {
 	__u8 flags;
 	__u8 mask;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index e97cdc1cf98c..8e67910185a0 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -981,6 +981,17 @@ static int tcp_packet(struct nf_conn *ct,
 			return NF_ACCEPT; /* Don't change state */
 		}
 		break;
+	case TCP_CONNTRACK_SYN_SENT2:
+		/* tcp_conntracks table is not smart enough to handle
+		 * simultaneous open.
+		 */
+		ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
+		break;
+	case TCP_CONNTRACK_SYN_RECV:
+		if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
+		    ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
+			new_state = TCP_CONNTRACK_ESTABLISHED;
+		break;
 	case TCP_CONNTRACK_CLOSE:
 		if (index == TCP_RST_SET
 		    && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)

From dceb48d86b4871984b8ce9ad5057fb2c01aa33de Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 25 Apr 2018 13:38:47 +0200
Subject: [PATCH 014/393] netfilter: x_tables: check name length in
 find_match/target, too

ebtables uses find_match() rather than find_request_match in one case
(see bcf4934288402be3464110109a4dae3bd6fb3e93,
 "netfilter: ebtables: Fix extension lookup with identical name"), so
 extend the check on name length to those functions too.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 71325fef647d..cb7cb300c3bc 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -183,6 +183,9 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
 	struct xt_match *m;
 	int err = -ENOENT;
 
+	if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN)
+		return ERR_PTR(-EINVAL);
+
 	mutex_lock(&xt[af].mutex);
 	list_for_each_entry(m, &xt[af].match, list) {
 		if (strcmp(m->name, name) == 0) {
@@ -229,6 +232,9 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
 	struct xt_target *t;
 	int err = -ENOENT;
 
+	if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN)
+		return ERR_PTR(-EINVAL);
+
 	mutex_lock(&xt[af].mutex);
 	list_for_each_entry(t, &xt[af].target, list) {
 		if (strcmp(t->name, name) == 0) {

From 2f99aa31cd7a5d667f17dd2924c884c3d2c621ac Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 25 Apr 2018 15:11:07 +0200
Subject: [PATCH 015/393] netfilter: nf_tables: skip synchronize_rcu if
 transaction log is empty

After processing the transaction log, the remaining entries of the log
need to be released.

However, in some cases no entries remain, e.g. because the transaction
did not remove anything.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 04d4e3772584..785d7fcf1fe1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5761,7 +5761,7 @@ static void nft_chain_commit_update(struct nft_trans *trans)
 	}
 }
 
-static void nf_tables_commit_release(struct nft_trans *trans)
+static void nft_commit_release(struct nft_trans *trans)
 {
 	switch (trans->msg_type) {
 	case NFT_MSG_DELTABLE:
@@ -5790,6 +5790,21 @@ static void nf_tables_commit_release(struct nft_trans *trans)
 	kfree(trans);
 }
 
+static void nf_tables_commit_release(struct net *net)
+{
+	struct nft_trans *trans, *next;
+
+	if (list_empty(&net->nft.commit_list))
+		return;
+
+	synchronize_rcu();
+
+	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+		list_del(&trans->list);
+		nft_commit_release(trans);
+	}
+}
+
 static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 {
 	struct nft_trans *trans, *next;
@@ -5920,13 +5935,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		}
 	}
 
-	synchronize_rcu();
-
-	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
-		list_del(&trans->list);
-		nf_tables_commit_release(trans);
-	}
-
+	nf_tables_commit_release(net);
 	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
 
 	return 0;

From 84edb315cf766e7c8e09d473d63aaccd1637bbda Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@de.ibm.com>
Date: Mon, 23 Apr 2018 11:14:28 +0200
Subject: [PATCH 016/393] MAINTAINERS: update s390 zcrypt maintainers email
 address

Signed-off-by: Harald Freudenberger <freude@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index dd66ae9a847e..2ad33062c9e3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12239,7 +12239,7 @@ F:	Documentation/s390/vfio-ccw.txt
 F:	include/uapi/linux/vfio_ccw.h
 
 S390 ZCRYPT DRIVER
-M:	Harald Freudenberger <freude@de.ibm.com>
+M:	Harald Freudenberger <freude@linux.ibm.com>
 L:	linux-s390@vger.kernel.org
 W:	http://www.ibm.com/developerworks/linux/linux390/
 S:	Supported

From d14be68fd1be2b33e362594b831eef6f007febe5 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.ibm.com>
Date: Thu, 26 Apr 2018 14:31:52 +0200
Subject: [PATCH 017/393] s390: update defconfigs

Change the following to y:

arch/s390/configs/performance_defconfig:262:warning: symbol value 'm' invalid for NF_TABLES_IPV4
arch/s390/configs/performance_defconfig:264:warning: symbol value 'm' invalid for NF_TABLES_ARP
arch/s390/configs/performance_defconfig:285:warning: symbol value 'm' invalid for NF_TABLES_IPV6
arch/s390/configs/performance_defconfig:306:warning: symbol value 'm' invalid for NF_TABLES_BRIDGE

Signed-off-by: Sebastian Ott <sebott@linux.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/configs/debug_defconfig       | 9 ++++-----
 arch/s390/configs/performance_defconfig | 8 ++++----
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 6176fe9795ca..941d8cc6c9f5 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -261,9 +261,9 @@ CONFIG_IP_VS_NQ=m
 CONFIG_IP_VS_FTP=m
 CONFIG_IP_VS_PE_SIP=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_TABLES_IPV4=m
+CONFIG_NF_TABLES_IPV4=y
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_IP_NF_IPTABLES=m
 CONFIG_IP_NF_MATCH_AH=m
@@ -284,7 +284,7 @@ CONFIG_IP_NF_ARPTABLES=m
 CONFIG_IP_NF_ARPFILTER=m
 CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
-CONFIG_NF_TABLES_IPV6=m
+CONFIG_NF_TABLES_IPV6=y
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
@@ -305,7 +305,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_SECURITY=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_RDS=m
 CONFIG_RDS_RDMA=m
 CONFIG_RDS_TCP=m
@@ -604,7 +604,6 @@ CONFIG_DETECT_HUNG_TASK=y
 CONFIG_WQ_WATCHDOG=y
 CONFIG_PANIC_ON_OOPS=y
 CONFIG_DEBUG_TIMEKEEPING=y
-CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y
 CONFIG_PROVE_LOCKING=y
 CONFIG_LOCK_STAT=y
 CONFIG_DEBUG_LOCKDEP=y
diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig
index c105bcc6d7a6..eb6f75f24208 100644
--- a/arch/s390/configs/performance_defconfig
+++ b/arch/s390/configs/performance_defconfig
@@ -259,9 +259,9 @@ CONFIG_IP_VS_NQ=m
 CONFIG_IP_VS_FTP=m
 CONFIG_IP_VS_PE_SIP=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_TABLES_IPV4=m
+CONFIG_NF_TABLES_IPV4=y
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_IP_NF_IPTABLES=m
 CONFIG_IP_NF_MATCH_AH=m
@@ -282,7 +282,7 @@ CONFIG_IP_NF_ARPTABLES=m
 CONFIG_IP_NF_ARPFILTER=m
 CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
-CONFIG_NF_TABLES_IPV6=m
+CONFIG_NF_TABLES_IPV6=y
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
@@ -303,7 +303,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_SECURITY=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_RDS=m
 CONFIG_RDS_RDMA=m
 CONFIG_RDS_TCP=m

From 598d76562cc2329ef2fdd95475e287b60ba9463f Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.ibm.com>
Date: Wed, 25 Apr 2018 12:39:06 +0200
Subject: [PATCH 018/393] s390/kexec_file: add declaration of purgatory related
 globals

Fix the following sparse complaints:

arch/s390/purgatory/purgatory.c:18:5: warning: symbol 'kernel_entry' was not declared. Should it be static?
arch/s390/purgatory/purgatory.c:19:5: warning: symbol 'kernel_type' was not declared. Should it be static?
arch/s390/purgatory/purgatory.c:21:5: warning: symbol 'crash_start' was not declared. Should it be static?
arch/s390/purgatory/purgatory.c:22:5: warning: symbol 'crash_size' was not declared. Should it be static?

Signed-off-by: Sebastian Ott <sebott@linux.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/purgatory.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/s390/include/asm/purgatory.h b/arch/s390/include/asm/purgatory.h
index e297bcfc476f..6090670df51f 100644
--- a/arch/s390/include/asm/purgatory.h
+++ b/arch/s390/include/asm/purgatory.h
@@ -13,5 +13,11 @@
 
 int verify_sha256_digest(void);
 
+extern u64 kernel_entry;
+extern u64 kernel_type;
+
+extern u64 crash_start;
+extern u64 crash_size;
+
 #endif	/* __ASSEMBLY__ */
 #endif /* _S390_PURGATORY_H_ */

From d66a7355717ec903d455277a550d930ba13df4a8 Mon Sep 17 00:00:00 2001
From: Halil Pasic <pasic@linux.vnet.ibm.com>
Date: Tue, 24 Apr 2018 13:26:56 +0200
Subject: [PATCH 019/393] vfio: ccw: fix cleanup if cp_prefetch fails

If the translation of a channel program fails, we may end up attempting
to clean up (free, unpin) stuff that never got translated (and allocated,
pinned) in the first place.

By adjusting the lengths of the chains accordingly (so the element that
failed, and all subsequent elements are excluded) cleanup activities
based on false assumptions can be avoided.

Let's make sure cp_free works properly after cp_prefetch returns with an
error by setting ch_len of a ccw chain to the number of the translated
CCWs on that chain.

Cc: stable@vger.kernel.org #v4.12+
Acked-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20180423110113.59385-2-bjsdjshi@linux.vnet.ibm.com>
[CH: fixed typos]
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/vfio_ccw_cp.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
index 2c7550797ec2..dce92b2a895d 100644
--- a/drivers/s390/cio/vfio_ccw_cp.c
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -715,6 +715,10 @@ void cp_free(struct channel_program *cp)
  * and stores the result to ccwchain list. @cp must have been
  * initialized by a previous call with cp_init(). Otherwise, undefined
  * behavior occurs.
+ * For each chain composing the channel program:
+ * - On entry ch_len holds the count of CCWs to be translated.
+ * - On exit ch_len is adjusted to the count of successfully translated CCWs.
+ * This allows cp_free to find in ch_len the count of CCWs to free in a chain.
  *
  * The S/390 CCW Translation APIS (prefixed by 'cp_') are introduced
  * as helpers to do ccw chain translation inside the kernel. Basically
@@ -749,11 +753,18 @@ int cp_prefetch(struct channel_program *cp)
 		for (idx = 0; idx < len; idx++) {
 			ret = ccwchain_fetch_one(chain, idx, cp);
 			if (ret)
-				return ret;
+				goto out_err;
 		}
 	}
 
 	return 0;
+out_err:
+	/* Only cleanup the chain elements that were actually translated. */
+	chain->ch_len = idx;
+	list_for_each_entry_continue(chain, &cp->ccwchain_list, next) {
+		chain->ch_len = 0;
+	}
+	return ret;
 }
 
 /**

From e6914365fd280fce303a89b8a8d4529af5a2e0f9 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 12 Apr 2018 11:16:10 +0900
Subject: [PATCH 020/393] reset: uniphier: fix USB clock line for LD20

For LD20, the bit 5 of the offset 0x200c turned out to be a USB3
reset.  The hardware document says it is the GIO reset despite LD20
has no GIO bus, confusingly.

Also, fix confusing comments for PXs3.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/reset/reset-uniphier.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/reset/reset-uniphier.c b/drivers/reset/reset-uniphier.c
index 360e06b20c53..ac18f2f27881 100644
--- a/drivers/reset/reset-uniphier.c
+++ b/drivers/reset/reset-uniphier.c
@@ -110,7 +110,7 @@ static const struct uniphier_reset_data uniphier_ld20_sys_reset_data[] = {
 	UNIPHIER_RESETX(4, 0x200c, 2),		/* eMMC */
 	UNIPHIER_RESETX(6, 0x200c, 6),		/* Ether */
 	UNIPHIER_RESETX(8, 0x200c, 8),		/* STDMAC (HSC) */
-	UNIPHIER_RESETX(12, 0x200c, 5),		/* GIO (PCIe, USB3) */
+	UNIPHIER_RESETX(14, 0x200c, 5),		/* USB30 */
 	UNIPHIER_RESETX(16, 0x200c, 12),	/* USB30-PHY0 */
 	UNIPHIER_RESETX(17, 0x200c, 13),	/* USB30-PHY1 */
 	UNIPHIER_RESETX(18, 0x200c, 14),	/* USB30-PHY2 */
@@ -127,8 +127,8 @@ static const struct uniphier_reset_data uniphier_pxs3_sys_reset_data[] = {
 	UNIPHIER_RESETX(6, 0x200c, 9),		/* Ether0 */
 	UNIPHIER_RESETX(7, 0x200c, 10),		/* Ether1 */
 	UNIPHIER_RESETX(8, 0x200c, 12),		/* STDMAC */
-	UNIPHIER_RESETX(12, 0x200c, 4),		/* USB30 link (GIO0) */
-	UNIPHIER_RESETX(13, 0x200c, 5),		/* USB31 link (GIO1) */
+	UNIPHIER_RESETX(12, 0x200c, 4),		/* USB30 link */
+	UNIPHIER_RESETX(13, 0x200c, 5),		/* USB31 link */
 	UNIPHIER_RESETX(16, 0x200c, 16),	/* USB30-PHY0 */
 	UNIPHIER_RESETX(17, 0x200c, 18),	/* USB30-PHY1 */
 	UNIPHIER_RESETX(18, 0x200c, 20),	/* USB30-PHY2 */

From a057344806d035cb9ac991619fa07854e807562d Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Wed, 25 Apr 2018 13:07:31 +0200
Subject: [PATCH 021/393] ARM64: dts: marvell: armada-cp110: Add clocks for the
 xmdio node

The Marvell XSMI controller needs 3 clocks to operate correctly :
 - The MG clock (clk 5)
 - The MG Core clock (clk 6)
 - The GOP clock (clk 18)

 This commit adds them, to avoid system hangs when using these
 interfaces.

[gregory.clement: use the real first commit to fix and add the cc:stable
flag]
Fixes: f66b2aff46ea ("arm64: dts: marvell: add xmdio nodes for 7k/8k")
Cc: <stable@vger.kernel.org>
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
---
 arch/arm64/boot/dts/marvell/armada-cp110.dtsi | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/boot/dts/marvell/armada-cp110.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110.dtsi
index 48cad7919efa..ca22f9d100f5 100644
--- a/arch/arm64/boot/dts/marvell/armada-cp110.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-cp110.dtsi
@@ -141,6 +141,8 @@ CP110_LABEL(xmdio): mdio@12a600 {
 			#size-cells = <0>;
 			compatible = "marvell,xmdio";
 			reg = <0x12a600 0x10>;
+			clocks = <&CP110_LABEL(clk) 1 5>,
+				 <&CP110_LABEL(clk) 1 6>, <&CP110_LABEL(clk) 1 18>;
 			status = "disabled";
 		};
 

From f43194c1447c9536efb0859c2f3f46f6bf2b9154 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Wed, 25 Apr 2018 20:19:47 +0200
Subject: [PATCH 022/393] ARM64: dts: marvell: armada-cp110: Add mg_core_clk
 for ethernet node

Marvell PPv2.2 controller present on CP-110 need the extra "mg_core_clk"
clock to avoid system hangs when powering some network interfaces up.

This issue appeared after a recent clock rework on Armada 7K/8K platforms.

This commit adds the new clock and updates the documentation accordingly.

[gregory.clement: use the real first commit to fix and add the cc:stable
flag]
Fixes: e3af9f7c6ece ("RM64: dts: marvell: armada-cp110: Fix clock resources for various node")
Cc: <stable@vger.kernel.org>
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
---
 Documentation/devicetree/bindings/net/marvell-pp2.txt | 9 +++++----
 arch/arm64/boot/dts/marvell/armada-cp110.dtsi         | 5 +++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/marvell-pp2.txt b/Documentation/devicetree/bindings/net/marvell-pp2.txt
index 1814fa13f6ab..fc019df0d863 100644
--- a/Documentation/devicetree/bindings/net/marvell-pp2.txt
+++ b/Documentation/devicetree/bindings/net/marvell-pp2.txt
@@ -21,9 +21,10 @@ Required properties:
 	- main controller clock (for both armada-375-pp2 and armada-7k-pp2)
 	- GOP clock (for both armada-375-pp2 and armada-7k-pp2)
 	- MG clock (only for armada-7k-pp2)
+	- MG Core clock (only for armada-7k-pp2)
 	- AXI clock (only for armada-7k-pp2)
-- clock-names: names of used clocks, must be "pp_clk", "gop_clk", "mg_clk"
-  and "axi_clk" (the 2 latter only for armada-7k-pp2).
+- clock-names: names of used clocks, must be "pp_clk", "gop_clk", "mg_clk",
+  "mg_core_clk" and "axi_clk" (the 3 latter only for armada-7k-pp2).
 
 The ethernet ports are represented by subnodes. At least one port is
 required.
@@ -80,8 +81,8 @@ cpm_ethernet: ethernet@0 {
 	compatible = "marvell,armada-7k-pp22";
 	reg = <0x0 0x100000>, <0x129000 0xb000>;
 	clocks = <&cpm_syscon0 1 3>, <&cpm_syscon0 1 9>,
-		 <&cpm_syscon0 1 5>, <&cpm_syscon0 1 18>;
-	clock-names = "pp_clk", "gop_clk", "gp_clk", "axi_clk";
+		 <&cpm_syscon0 1 5>, <&cpm_syscon0 1 6>, <&cpm_syscon0 1 18>;
+	clock-names = "pp_clk", "gop_clk", "mg_clk", "mg_core_clk", "axi_clk";
 
 	eth0: eth0 {
 		interrupts = <ICU_GRP_NSR 39 IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm64/boot/dts/marvell/armada-cp110.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110.dtsi
index ca22f9d100f5..ed2f1237ea1e 100644
--- a/arch/arm64/boot/dts/marvell/armada-cp110.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-cp110.dtsi
@@ -38,9 +38,10 @@ CP110_LABEL(ethernet): ethernet@0 {
 			compatible = "marvell,armada-7k-pp22";
 			reg = <0x0 0x100000>, <0x129000 0xb000>;
 			clocks = <&CP110_LABEL(clk) 1 3>, <&CP110_LABEL(clk) 1 9>,
-				 <&CP110_LABEL(clk) 1 5>, <&CP110_LABEL(clk) 1 18>;
+				 <&CP110_LABEL(clk) 1 5>, <&CP110_LABEL(clk) 1 6>,
+				 <&CP110_LABEL(clk) 1 18>;
 			clock-names = "pp_clk", "gop_clk",
-				      "mg_clk", "axi_clk";
+				      "mg_clk", "mg_core_clk", "axi_clk";
 			marvell,system-controller = <&CP110_LABEL(syscon0)>;
 			status = "disabled";
 			dma-coherent;

From 8b82b66e68b32dd8e22a04db6a0494bfdd65156f Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 27 Apr 2018 13:25:18 +0900
Subject: [PATCH 023/393] arm64: dts: uniphier: stabilize ethernet of LD20
 reference board

Currently, the ethernet RGMII mode on the LD20 reference board is
unstable.

The default drive-strength of ethernet TX pins is too strong because
there is no dumping resistor on the TX lines on the board.

Weaken the drive-strength to make the ethernet more stable.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts b/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts
index 2c1a92fafbfb..440c2e6a638b 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts
+++ b/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts
@@ -67,3 +67,11 @@ ethphy: ethphy@0 {
 		reg = <0>;
 	};
 };
+
+&pinctrl_ether_rgmii {
+	tx {
+		pins = "RGMII_TXCLK", "RGMII_TXD0", "RGMII_TXD1",
+		       "RGMII_TXD2", "RGMII_TXD3", "RGMII_TXCTL";
+		drive-strength = <9>;
+	};
+};

From 40626a1bf657eef557fcee9e1b8ef5b4f5b56dcd Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Sun, 29 Apr 2018 08:08:24 -0700
Subject: [PATCH 024/393] hwmon: (k10temp) Fix reading critical temperature
 register

The HTC (Hardware Temperature Control) register has moved
for recent chips.

Cc: stable@vger.kernel.org # v4.16+
Tested-by: Gabriel Craciunescu <nix.or.die@gmail.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/k10temp.c | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index d2cc55e21374..34b5448b00be 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -63,10 +63,12 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
 #define  NB_CAP_HTC			0x00000400
 
 /*
- * For F15h M60h, functionality of REG_REPORTED_TEMPERATURE
- * has been moved to D0F0xBC_xD820_0CA4 [Reported Temperature
- * Control]
+ * For F15h M60h and M70h, REG_HARDWARE_THERMAL_CONTROL
+ * and REG_REPORTED_TEMPERATURE have been moved to
+ * D0F0xBC_xD820_0C64 [Hardware Temperature Control]
+ * D0F0xBC_xD820_0CA4 [Reported Temperature Control]
  */
+#define F15H_M60H_HARDWARE_TEMP_CTRL_OFFSET	0xd8200c64
 #define F15H_M60H_REPORTED_TEMP_CTRL_OFFSET	0xd8200ca4
 
 /* F17h M01h Access througn SMN */
@@ -74,6 +76,7 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
 
 struct k10temp_data {
 	struct pci_dev *pdev;
+	void (*read_htcreg)(struct pci_dev *pdev, u32 *regval);
 	void (*read_tempreg)(struct pci_dev *pdev, u32 *regval);
 	int temp_offset;
 	u32 temp_adjust_mask;
@@ -98,6 +101,11 @@ static const struct tctl_offset tctl_offset_table[] = {
 	{ 0x17, "AMD Ryzen Threadripper 1910", 10000 },
 };
 
+static void read_htcreg_pci(struct pci_dev *pdev, u32 *regval)
+{
+	pci_read_config_dword(pdev, REG_HARDWARE_THERMAL_CONTROL, regval);
+}
+
 static void read_tempreg_pci(struct pci_dev *pdev, u32 *regval)
 {
 	pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, regval);
@@ -114,6 +122,12 @@ static void amd_nb_index_read(struct pci_dev *pdev, unsigned int devfn,
 	mutex_unlock(&nb_smu_ind_mutex);
 }
 
+static void read_htcreg_nb_f15(struct pci_dev *pdev, u32 *regval)
+{
+	amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
+			  F15H_M60H_HARDWARE_TEMP_CTRL_OFFSET, regval);
+}
+
 static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval)
 {
 	amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
@@ -160,8 +174,7 @@ static ssize_t show_temp_crit(struct device *dev,
 	u32 regval;
 	int value;
 
-	pci_read_config_dword(data->pdev,
-			      REG_HARDWARE_THERMAL_CONTROL, &regval);
+	data->read_htcreg(data->pdev, &regval);
 	value = ((regval >> 16) & 0x7f) * 500 + 52000;
 	if (show_hyst)
 		value -= ((regval >> 24) & 0xf) * 500;
@@ -181,13 +194,18 @@ static umode_t k10temp_is_visible(struct kobject *kobj,
 	struct pci_dev *pdev = data->pdev;
 
 	if (index >= 2) {
-		u32 reg_caps, reg_htc;
+		u32 reg;
+
+		if (!data->read_htcreg)
+			return 0;
 
 		pci_read_config_dword(pdev, REG_NORTHBRIDGE_CAPABILITIES,
-				      &reg_caps);
-		pci_read_config_dword(pdev, REG_HARDWARE_THERMAL_CONTROL,
-				      &reg_htc);
-		if (!(reg_caps & NB_CAP_HTC) || !(reg_htc & HTC_ENABLE))
+				      &reg);
+		if (!(reg & NB_CAP_HTC))
+			return 0;
+
+		data->read_htcreg(data->pdev, &reg);
+		if (!(reg & HTC_ENABLE))
 			return 0;
 	}
 	return attr->mode;
@@ -268,11 +286,13 @@ static int k10temp_probe(struct pci_dev *pdev,
 
 	if (boot_cpu_data.x86 == 0x15 && (boot_cpu_data.x86_model == 0x60 ||
 					  boot_cpu_data.x86_model == 0x70)) {
+		data->read_htcreg = read_htcreg_nb_f15;
 		data->read_tempreg = read_tempreg_nb_f15;
 	} else if (boot_cpu_data.x86 == 0x17) {
 		data->temp_adjust_mask = 0x80000;
 		data->read_tempreg = read_tempreg_nb_f17;
 	} else {
+		data->read_htcreg = read_htcreg_pci;
 		data->read_tempreg = read_tempreg_pci;
 	}
 

From c8016fa215d2d50920a58de07da2d00f7cb982f3 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 22 Apr 2018 19:57:59 +0200
Subject: [PATCH 025/393] i2c: core: ACPI: Improve OpRegion read errors

When we get an error doing an ACPI SerialBus I2C OpRegion read log some
useful details, like the client address and which register is being
read.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core-acpi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index a9126b3cda61..3dc43a009f5d 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -446,7 +446,8 @@ static int acpi_gsb_i2c_read_bytes(struct i2c_client *client,
 
 	ret = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
 	if (ret < 0)
-		dev_err(&client->adapter->dev, "i2c read failed\n");
+		dev_err(&client->adapter->dev, "i2c read %d bytes from client@%#x starting at reg %#x failed, error: %d\n",
+			data_len, client->addr, cmd, ret);
 	else
 		memcpy(data, buffer, data_len);
 

From 7781edaed63e9396fc913e0899cb197562e6f1a0 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 22 Apr 2018 19:58:00 +0200
Subject: [PATCH 026/393] i2c: core: ACPI: Log device not acking errors at dbg
 loglevel

Unfortunately some DSDTs issue bogus i2c reads to non existing devices
resulting in -EREMOTEIO errors because the non existing device of course
does not ack.

This happens e.g. from the The Asus T100TA's _BIX method, the DSDT on
the T100TA defines 2 resources on the I2C1 bus:

        Name (EHID, ResourceTemplate ()
        {
            I2cSerialBusV2 (0x005B, ControllerInitiated, 0x00061A80,
                AddressingMode7Bit, "\\_SB.I2C1",
                0x00, ResourceConsumer, , Exclusive,
                )
        })
        OperationRegion (EHOR, GenericSerialBus, Zero, 0x0100)
        Field (EHOR, BufferAcc, NoLock, Preserve)
        {
            Connection (EHID),
            Offset (0x01),
            AccessAs (BufferAcc, AttribBytes (0x10)),
            ABCD,   8
        }

        Name (UMPC, ResourceTemplate ()
        {
            I2cSerialBusV2 (0x0066, ControllerInitiated, 0x00061A80,
                AddressingMode7Bit, "\\_SB.I2C1",
                0x00, ResourceConsumer, , Exclusive,
                )
        })

The _BIX method does a single read (on each BIX() call) from the EHID
device through the ABCD Field, only to completely ignore the result.
This read always fails as there is no i2c client at address 0x5b.

The _BIX method also does several reads from the UMPC device and actually
uses the results of those to provide battery information.

IIRC I've also seen some DSTDs which do an i2c read to detect if a device
is present, also leading to false positive errors being logged.

Esp. the _BIX use is problematic as the _BIX method gets called
periodically to monitor battery status.

This commit stops the logs from filling up with errors like these:

[   57.327858] i2c i2c-0: i2c read 16 bytes from client@0x5b starting at
               reg 0x1 failed, error: -121

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core-acpi.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index 3dc43a009f5d..7c3b4740b94b 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -445,11 +445,17 @@ static int acpi_gsb_i2c_read_bytes(struct i2c_client *client,
 	msgs[1].buf = buffer;
 
 	ret = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
-	if (ret < 0)
-		dev_err(&client->adapter->dev, "i2c read %d bytes from client@%#x starting at reg %#x failed, error: %d\n",
-			data_len, client->addr, cmd, ret);
-	else
+	if (ret < 0) {
+		/* Getting a NACK is unfortunately normal with some DSTDs */
+		if (ret == -EREMOTEIO)
+			dev_dbg(&client->adapter->dev, "i2c read %d bytes from client@%#x starting at reg %#x failed, error: %d\n",
+				data_len, client->addr, cmd, ret);
+		else
+			dev_err(&client->adapter->dev, "i2c read %d bytes from client@%#x starting at reg %#x failed, error: %d\n",
+				data_len, client->addr, cmd, ret);
+	} else {
 		memcpy(data, buffer, data_len);
+	}
 
 	kfree(buffer);
 	return ret;

From 51e9f12163223546bd3aa9f7af6817931f980da8 Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Tue, 24 Apr 2018 20:05:03 +0530
Subject: [PATCH 027/393] ARM: davinci: board-da830-evm: fix GPIO lookup for
 MMC/SD

The GPIO chip is called davinci_gpio.0 in legacy mode. Fix it, so that
mmc can correctly lookup the wp and cp gpios. Also fix the GPIO numbers
as they are not offsets within a bank.

Note that it is the gpio-davinci driver that sets the gpiochip label to
davinci_gpio.0.

Fixes: b5e1438cf98a ("ARM: davinci: da830-evm: use gpio descriptor for mmc pins")
Reported-by: David Lechner <david@lechnology.com>
Reviewed-by: David Lechner <david@lechnology.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/board-da830-evm.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mach-davinci/board-da830-evm.c b/arch/arm/mach-davinci/board-da830-evm.c
index 004f9c8de032..d1e8ce7b4bd2 100644
--- a/arch/arm/mach-davinci/board-da830-evm.c
+++ b/arch/arm/mach-davinci/board-da830-evm.c
@@ -205,12 +205,17 @@ static const short da830_evm_mmc_sd_pins[] = {
 	-1
 };
 
+#define DA830_MMCSD_WP_PIN		GPIO_TO_PIN(2, 1)
+#define DA830_MMCSD_CD_PIN		GPIO_TO_PIN(2, 2)
+
 static struct gpiod_lookup_table mmc_gpios_table = {
 	.dev_id = "da830-mmc.0",
 	.table = {
 		/* gpio chip 1 contains gpio range 32-63 */
-		GPIO_LOOKUP("davinci_gpio.1", 2, "cd", GPIO_ACTIVE_LOW),
-		GPIO_LOOKUP("davinci_gpio.1", 1, "wp", GPIO_ACTIVE_LOW),
+		GPIO_LOOKUP("davinci_gpio.0", DA830_MMCSD_CD_PIN, "cd",
+			    GPIO_ACTIVE_LOW),
+		GPIO_LOOKUP("davinci_gpio.0", DA830_MMCSD_WP_PIN, "wp",
+			    GPIO_ACTIVE_LOW),
 	},
 };
 

From 67c6b6ff221f807180aea6dd597246f87e1dd98a Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Tue, 24 Apr 2018 20:05:04 +0530
Subject: [PATCH 028/393] ARM: davinci: board-da850-evm: fix GPIO lookup for
 MMC/SD

The GPIO chip is called davinci_gpio.0 in legacy mode. Fix it, so that
mmc can correctly lookup the wp and cp gpios. Also fix the GPIO numbers
as they are not offsets within a bank.

Note that it is the gpio-davinci driver that sets the gpiochip label to
davinci_gpio.0.

Fixes: bdf0e8364fd3 ("ARM: davinci: da850-evm: use gpio descriptor for mmc pins")
Reviewed-by: David Lechner <david@lechnology.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/board-da850-evm.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c
index 3063478bcc36..158ed9a1483f 100644
--- a/arch/arm/mach-davinci/board-da850-evm.c
+++ b/arch/arm/mach-davinci/board-da850-evm.c
@@ -763,12 +763,17 @@ static const short da850_evm_mcasp_pins[] __initconst = {
 	-1
 };
 
+#define DA850_MMCSD_CD_PIN		GPIO_TO_PIN(4, 0)
+#define DA850_MMCSD_WP_PIN		GPIO_TO_PIN(4, 1)
+
 static struct gpiod_lookup_table mmc_gpios_table = {
 	.dev_id = "da830-mmc.0",
 	.table = {
 		/* gpio chip 2 contains gpio range 64-95 */
-		GPIO_LOOKUP("davinci_gpio.2", 0, "cd", GPIO_ACTIVE_LOW),
-		GPIO_LOOKUP("davinci_gpio.2", 1, "wp", GPIO_ACTIVE_LOW),
+		GPIO_LOOKUP("davinci_gpio.0", DA850_MMCSD_CD_PIN, "cd",
+			    GPIO_ACTIVE_LOW),
+		GPIO_LOOKUP("davinci_gpio.0", DA850_MMCSD_WP_PIN, "wp",
+			    GPIO_ACTIVE_LOW),
 	},
 };
 

From d45622c0eaa5992a1a2248cbe93e1ff7a2da7be4 Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Tue, 24 Apr 2018 20:05:06 +0530
Subject: [PATCH 029/393] ARM: davinci: board-omapl138-hawk: fix GPIO numbers
 for MMC/SD lookup

commit c4dc56be7e26 ("ARM: davinci: fix the GPIO lookup for omapl138-hawk")
fixed the GPIO chip name for look-up of MMC/SD CD and WP pins, but forgot
to change the GPIO numbers passed.

The GPIO numbers are not offsets from within a 32 GPIO bank. Fix the
GPIO numbers as well as remove the misleading comment.

Fixes: c4dc56be7e26 ("ARM: davinci: fix the GPIO lookup for omapl138-hawk")
Reviewed-by: David Lechner <david@lechnology.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/board-omapl138-hawk.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/arm/mach-davinci/board-omapl138-hawk.c b/arch/arm/mach-davinci/board-omapl138-hawk.c
index 0d32042b728f..be8b892a6ea7 100644
--- a/arch/arm/mach-davinci/board-omapl138-hawk.c
+++ b/arch/arm/mach-davinci/board-omapl138-hawk.c
@@ -123,12 +123,16 @@ static const short hawk_mmcsd0_pins[] = {
 	-1
 };
 
+#define DA850_HAWK_MMCSD_CD_PIN		GPIO_TO_PIN(3, 12)
+#define DA850_HAWK_MMCSD_WP_PIN		GPIO_TO_PIN(3, 13)
+
 static struct gpiod_lookup_table mmc_gpios_table = {
 	.dev_id = "da830-mmc.0",
 	.table = {
-		/* CD: gpio3_12: gpio60: chip 1 contains gpio range 32-63*/
-		GPIO_LOOKUP("davinci_gpio.0", 28, "cd", GPIO_ACTIVE_LOW),
-		GPIO_LOOKUP("davinci_gpio.0", 29, "wp", GPIO_ACTIVE_LOW),
+		GPIO_LOOKUP("davinci_gpio.0", DA850_HAWK_MMCSD_CD_PIN, "cd",
+			    GPIO_ACTIVE_LOW),
+		GPIO_LOOKUP("davinci_gpio.0", DA850_HAWK_MMCSD_WP_PIN, "wp",
+			    GPIO_ACTIVE_LOW),
 	},
 };
 

From 5c054de228dd6d97bf8e38962bd118953b66e5a0 Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Wed, 25 Apr 2018 14:53:23 +0530
Subject: [PATCH 030/393] ARM: davinci: board-dm355-evm: fix broken networking

Since commit 09f3756bb9a8 ("dm9000: Return an ERR_PTR() in all
error conditions of dm9000_parse_dt()"), passing either non-NULL
platform data or device-tree for dm9000 driver to probe is
mandatory.

DM335 board was using none, so networking failed to initialize.
Fix it by passing non-NULL (but empty) platform data.

Fixes: 09f3756bb9a8 ("dm9000: Return an ERR_PTR() in all error conditions of dm9000_parse_dt()")
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/board-dm355-evm.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm/mach-davinci/board-dm355-evm.c b/arch/arm/mach-davinci/board-dm355-evm.c
index cb30637d9eaf..762d29683a3a 100644
--- a/arch/arm/mach-davinci/board-dm355-evm.c
+++ b/arch/arm/mach-davinci/board-dm355-evm.c
@@ -19,6 +19,7 @@
 #include <linux/gpio.h>
 #include <linux/gpio/machine.h>
 #include <linux/clk.h>
+#include <linux/dm9000.h>
 #include <linux/videodev2.h>
 #include <media/i2c/tvp514x.h>
 #include <linux/spi/spi.h>
@@ -179,11 +180,16 @@ static struct resource dm355evm_dm9000_rsrc[] = {
 	},
 };
 
+static struct dm9000_plat_data dm335evm_dm9000_platdata;
+
 static struct platform_device dm355evm_dm9000 = {
 	.name		= "dm9000",
 	.id		= -1,
 	.resource	= dm355evm_dm9000_rsrc,
 	.num_resources	= ARRAY_SIZE(dm355evm_dm9000_rsrc),
+	.dev		= {
+		.platform_data = &dm335evm_dm9000_platdata,
+	},
 };
 
 static struct tvp514x_platform_data tvp5146_pdata = {

From 33e9572483031a79ad0a4468064675144d9269ec Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Fri, 9 Mar 2018 11:50:20 +0200
Subject: [PATCH 031/393] ARM: OMAP2+: powerdomain: use raw_smp_processor_id()
 for trace

smp_processor_id() checks preemption if CONFIG_DEBUG_PREEMPT is enabled,
causing a warning dump during boot:

[    5.042377] BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
[    5.050281] caller is pwrdm_set_next_pwrst+0x48/0x88
[    5.055330] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.14.24-g57341df0b4 #1

Use the raw_smp_processor_id() for the trace instead, this value does
not need to be perfectly correct. The alternative of disabling preempt
is too heavy weight operation to be applied in PM hot path for just
tracing purposes.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/mach-omap2/powerdomain.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mach-omap2/powerdomain.c b/arch/arm/mach-omap2/powerdomain.c
index 76eb6ec5f157..1e6a967cd2d5 100644
--- a/arch/arm/mach-omap2/powerdomain.c
+++ b/arch/arm/mach-omap2/powerdomain.c
@@ -188,7 +188,7 @@ static int _pwrdm_state_switch(struct powerdomain *pwrdm, int flag)
 				       ((prev & OMAP_POWERSTATE_MASK) << 0));
 			trace_power_domain_target_rcuidle(pwrdm->name,
 							  trace_state,
-							  smp_processor_id());
+							  raw_smp_processor_id());
 		}
 		break;
 	default:
@@ -518,7 +518,7 @@ int pwrdm_set_next_pwrst(struct powerdomain *pwrdm, u8 pwrst)
 	if (arch_pwrdm && arch_pwrdm->pwrdm_set_next_pwrst) {
 		/* Trace the pwrdm desired target state */
 		trace_power_domain_target_rcuidle(pwrdm->name, pwrst,
-						  smp_processor_id());
+						  raw_smp_processor_id());
 		/* Program the pwrdm desired target state */
 		ret = arch_pwrdm->pwrdm_set_next_pwrst(pwrdm, pwrst);
 	}

From 189822cbcbf3ea37c26a15612d8f922c440bc0e0 Mon Sep 17 00:00:00 2001
From: Adam Ford <aford173@gmail.com>
Date: Mon, 30 Apr 2018 18:24:34 -0500
Subject: [PATCH 032/393] ARM: dts: logicpd-som-lv: Fix WL127x Startup Issues

The VAUX3 rail from the PMIC powers a clock driver which clocks
the WL127x. This corrects a bug which did not correctly associate
the vin-supply with the proper power rail.

This also fixes a typo in the pinmuxing to properly configure the
interrupt pin.

Fixes: ab8dd3aed011 ("ARM: DTS: Add minimal Support for Logic PD
DM3730 SOM-LV")

Signed-off-by: Adam Ford <aford173@gmail.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/boot/dts/logicpd-som-lv.dtsi | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/logicpd-som-lv.dtsi b/arch/arm/boot/dts/logicpd-som-lv.dtsi
index b47cac23a04b..40eadbfb91ea 100644
--- a/arch/arm/boot/dts/logicpd-som-lv.dtsi
+++ b/arch/arm/boot/dts/logicpd-som-lv.dtsi
@@ -26,7 +26,7 @@ wl12xx_vmmc: wl12xx_vmmc {
 		gpio = <&gpio1 3 0>;   /* gpio_3 */
 		startup-delay-us = <70000>;
 		enable-active-high;
-		vin-supply = <&vmmc2>;
+		vin-supply = <&vaux3>;
 	};
 
 	/* HS USB Host PHY on PORT 1 */
@@ -213,7 +213,7 @@ OMAP3_WKUP_IOPAD(0x2a0e, PIN_OUTPUT | MUX_MODE4)	/* sys_boot2.gpio_4 */
 	};
 	wl127x_gpio: pinmux_wl127x_gpio_pin {
 		pinctrl-single,pins = <
-			OMAP3_WKUP_IOPAD(0x2a0c, PIN_INPUT | MUX_MODE4)		/* sys_boot0.gpio_2 */
+			OMAP3_WKUP_IOPAD(0x2a0a, PIN_INPUT | MUX_MODE4)		/* sys_boot0.gpio_2 */
 			OMAP3_WKUP_IOPAD(0x2a0c, PIN_OUTPUT | MUX_MODE4)	/* sys_boot1.gpio_3 */
 		>;
 	};
@@ -260,6 +260,11 @@ &mcspi1 {
 #include "twl4030.dtsi"
 #include "twl4030_omap3.dtsi"
 
+&vaux3 {
+	regulator-min-microvolt = <2800000>;
+	regulator-max-microvolt = <2800000>;
+};
+
 &twl {
 	twl_power: power {
 		compatible = "ti,twl4030-power-idle-osc-off", "ti,twl4030-power-idle";

From 95e59fc3c3fa3187a07a75f40b21637deb4bd12d Mon Sep 17 00:00:00 2001
From: Adam Ford <aford173@gmail.com>
Date: Tue, 1 May 2018 08:58:53 -0500
Subject: [PATCH 033/393] ARM: dts: logicpd-som-lv: Fix Audio Mute

The Audio has worked, but the mute pin has a weak pulldown which alows
some of the audio signal to pass very quietly.  This patch fixes
that so the mute pin is actively driven high for mute or low for normal
operation.

Fixes: ab8dd3aed011 ("ARM: DTS: Add minimal Support for Logic
PD DM3730 SOM-LV")

Signed-off-by: Adam Ford <aford173@gmail.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/boot/dts/logicpd-som-lv.dtsi | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/boot/dts/logicpd-som-lv.dtsi b/arch/arm/boot/dts/logicpd-som-lv.dtsi
index 40eadbfb91ea..6fa7bba3e801 100644
--- a/arch/arm/boot/dts/logicpd-som-lv.dtsi
+++ b/arch/arm/boot/dts/logicpd-som-lv.dtsi
@@ -82,6 +82,7 @@ twl: twl@48 {
 		twl_audio: audio {
 			compatible = "ti,twl4030-audio";
 			codec {
+				ti,hs_extmute_gpio = <&gpio2 25 GPIO_ACTIVE_HIGH>;
 			};
 		};
 	};
@@ -199,6 +200,7 @@ i2c1_pins: pinmux_i2c1_pins {
 		pinctrl-single,pins = <
 			OMAP3_CORE1_IOPAD(0x21ba, PIN_INPUT | MUX_MODE0)        /* i2c1_scl.i2c1_scl */
 			OMAP3_CORE1_IOPAD(0x21bc, PIN_INPUT | MUX_MODE0)        /* i2c1_sda.i2c1_sda */
+			OMAP3_CORE1_IOPAD(0x20ba, PIN_OUTPUT | MUX_MODE4)        /* gpmc_ncs6.gpio_57 */
 		>;
 	};
 };

From 30443b3104527c83102fa85347ae4bf21caaf77a Mon Sep 17 00:00:00 2001
From: Adam Ford <aford173@gmail.com>
Date: Mon, 30 Apr 2018 12:47:48 -0500
Subject: [PATCH 034/393] ARM: dts: logicpd-som-lv: Fix pinmux controller
 references

The pinmux controllers do not themselves need references to
'pinctrl-names' or 'pinctrl-0'

This patch removes some unnecessary typos.

Fixes: 89077c7145c3 ("ARM: dts: Add HSUSB2 EHCI Support to Logic PD DM37xx
SOM-LV")

Signed-off-by: Adam Ford <aford173@gmail.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/boot/dts/logicpd-som-lv.dtsi | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/arm/boot/dts/logicpd-som-lv.dtsi b/arch/arm/boot/dts/logicpd-som-lv.dtsi
index 6fa7bba3e801..efd8c3351e10 100644
--- a/arch/arm/boot/dts/logicpd-som-lv.dtsi
+++ b/arch/arm/boot/dts/logicpd-som-lv.dtsi
@@ -206,8 +206,6 @@ OMAP3_CORE1_IOPAD(0x20ba, PIN_OUTPUT | MUX_MODE4)        /* gpmc_ncs6.gpio_57 */
 };
 
 &omap3_pmx_wkup {
-	pinctrl-names = "default";
-	pinctrl-0 = <&hsusb2_reset_pin>;
 	hsusb2_reset_pin: pinmux_hsusb1_reset_pin {
 		pinctrl-single,pins = <
 			OMAP3_WKUP_IOPAD(0x2a0e, PIN_OUTPUT | MUX_MODE4)	/* sys_boot2.gpio_4 */
@@ -234,8 +232,6 @@ OMAP3_CORE1_IOPAD(0x21c4, PIN_INPUT | MUX_MODE0)	/* i2c3_sda */
 };
 
 &omap3_pmx_core2 {
-	pinctrl-names = "default";
-	pinctrl-0 = <&hsusb2_2_pins>;
 	hsusb2_2_pins: pinmux_hsusb2_2_pins {
 		pinctrl-single,pins = <
 			OMAP3630_CORE2_IOPAD(0x25f0, PIN_OUTPUT | MUX_MODE3)            /* etk_d10.hsusb2_clk */

From 5eb9a07a4ae1008b67d8bcd47bddb3dae97456b7 Mon Sep 17 00:00:00 2001
From: Kamal Dasu <kdasu.kdev@gmail.com>
Date: Thu, 26 Apr 2018 14:48:00 -0400
Subject: [PATCH 035/393] spi: bcm-qspi: Avoid setting MSPI_CDRAM_PCS for
 spi-nor master

Added fix for probing of spi-nor device non-zero chip selects. Set
MSPI_CDRAM_PCS (peripheral chip select) with spi master for MSPI
controller and not for MSPI/BSPI spi-nor master controller. Ensure
setting of cs bit in chip select register on chip select change.

Fixes: fa236a7ef24048 ("spi: bcm-qspi: Add Broadcom MSPI driver")
Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@vger.kernel.org
---
 drivers/spi/spi-bcm-qspi.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c
index 1596d35498c5..2946989c61a2 100644
--- a/drivers/spi/spi-bcm-qspi.c
+++ b/drivers/spi/spi-bcm-qspi.c
@@ -519,16 +519,19 @@ static void bcm_qspi_disable_bspi(struct bcm_qspi *qspi)
 
 static void bcm_qspi_chip_select(struct bcm_qspi *qspi, int cs)
 {
-	u32 data = 0;
+	u32 rd = 0;
+	u32 wr = 0;
 
-	if (qspi->curr_cs == cs)
-		return;
 	if (qspi->base[CHIP_SELECT]) {
-		data = bcm_qspi_read(qspi, CHIP_SELECT, 0);
-		data = (data & ~0xff) | (1 << cs);
-		bcm_qspi_write(qspi, CHIP_SELECT, 0, data);
+		rd = bcm_qspi_read(qspi, CHIP_SELECT, 0);
+		wr = (rd & ~0xff) | (1 << cs);
+		if (rd == wr)
+			return;
+		bcm_qspi_write(qspi, CHIP_SELECT, 0, wr);
 		usleep_range(10, 20);
 	}
+
+	dev_dbg(&qspi->pdev->dev, "using cs:%d\n", cs);
 	qspi->curr_cs = cs;
 }
 
@@ -755,8 +758,13 @@ static int write_to_hw(struct bcm_qspi *qspi, struct spi_device *spi)
 			dev_dbg(&qspi->pdev->dev, "WR %04x\n", val);
 		}
 		mspi_cdram = MSPI_CDRAM_CONT_BIT;
-		mspi_cdram |= (~(1 << spi->chip_select) &
-			       MSPI_CDRAM_PCS);
+
+		if (has_bspi(qspi))
+			mspi_cdram &= ~1;
+		else
+			mspi_cdram |= (~(1 << spi->chip_select) &
+				       MSPI_CDRAM_PCS);
+
 		mspi_cdram |= ((tp.trans->bits_per_word <= 8) ? 0 :
 				MSPI_CDRAM_BITSE_BIT);
 

From 602805fb618b018b7a41fbb3f93c1992b078b1ae Mon Sep 17 00:00:00 2001
From: Kamal Dasu <kdasu.kdev@gmail.com>
Date: Thu, 26 Apr 2018 14:48:01 -0400
Subject: [PATCH 036/393] spi: bcm-qspi: Always read and set
 BSPI_MAST_N_BOOT_CTRL

Always confirm the BSPI_MAST_N_BOOT_CTRL bit when enabling
or disabling BSPI transfers.

Fixes: 4e3b2d236fe00 ("spi: bcm-qspi: Add BSPI spi-nor flash controller driver")
Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@vger.kernel.org
---
 drivers/spi/spi-bcm-qspi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c
index 2946989c61a2..6573152ce893 100644
--- a/drivers/spi/spi-bcm-qspi.c
+++ b/drivers/spi/spi-bcm-qspi.c
@@ -490,7 +490,7 @@ static int bcm_qspi_bspi_set_mode(struct bcm_qspi *qspi,
 
 static void bcm_qspi_enable_bspi(struct bcm_qspi *qspi)
 {
-	if (!has_bspi(qspi) || (qspi->bspi_enabled))
+	if (!has_bspi(qspi))
 		return;
 
 	qspi->bspi_enabled = 1;
@@ -505,7 +505,7 @@ static void bcm_qspi_enable_bspi(struct bcm_qspi *qspi)
 
 static void bcm_qspi_disable_bspi(struct bcm_qspi *qspi)
 {
-	if (!has_bspi(qspi) || (!qspi->bspi_enabled))
+	if (!has_bspi(qspi))
 		return;
 
 	qspi->bspi_enabled = 0;

From f4b024271ae3e9786e5d6f1c05b01b57a74e1d6d Mon Sep 17 00:00:00 2001
From: Jim Gill <jgill@vmware.com>
Date: Fri, 20 Apr 2018 19:04:47 -0700
Subject: [PATCH 037/393] scsi: vmw-pvscsi: return DID_BUS_BUSY for
 adapter-initated aborts

The vmw_pvscsi driver returns DID_ABORT for commands aborted internally
by the adapter, leading to the filesystem going read-only. Change the
result to DID_BUS_BUSY, causing the kernel to retry the command.

Signed-off-by: Jim Gill <jgill@vmware.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/vmw_pvscsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/vmw_pvscsi.c b/drivers/scsi/vmw_pvscsi.c
index c374e3b5c678..777e5f1e52d1 100644
--- a/drivers/scsi/vmw_pvscsi.c
+++ b/drivers/scsi/vmw_pvscsi.c
@@ -609,7 +609,7 @@ static void pvscsi_complete_request(struct pvscsi_adapter *adapter,
 			break;
 
 		case BTSTAT_ABORTQUEUE:
-			cmd->result = (DID_ABORT << 16);
+			cmd->result = (DID_BUS_BUSY << 16);
 			break;
 
 		case BTSTAT_SCSIPARITY:

From 7d3af7d96af7b9f51e1ef67b6f4725f545737da2 Mon Sep 17 00:00:00 2001
From: Dave Carroll <david.carroll@microsemi.com>
Date: Wed, 25 Apr 2018 10:24:20 -0600
Subject: [PATCH 038/393] scsi: aacraid: Correct hba_send to include iu_type

commit b60710ec7d7a ("scsi: aacraid: enable sending of TMFs from
aac_hba_send()") allows aac_hba_send() to send scsi commands, and TMF
requests, but the existing code only updates the iu_type for scsi
commands. For TMF requests we are sending an unknown iu_type to
firmware, which causes a fault.

Include iu_type prior to determining the validity of the command

Reported-by: Noah Misner <nmisner@us.ibm.com>
Fixes: b60710ec7d7ab ("aacraid: enable sending of TMFs from aac_hba_send()")
Fixes: 423400e64d377 ("aacraid: Include HBA direct interface")
Tested-by: Noah Misner <nmisner@us.ibm.com>
cc: stable@vger.kernel.org
Signed-off-by: Dave Carroll <david.carroll@microsemi.com>
Reviewed-by: Raghava Aditya Renukunta <RaghavaAditya.Renukunta@microsemi.com>
Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/aacraid/commsup.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index 0156c9623c35..d62ddd63f4fe 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -724,6 +724,8 @@ int aac_hba_send(u8 command, struct fib *fibptr, fib_callback callback,
 	int wait;
 	unsigned long flags = 0;
 	unsigned long mflags = 0;
+	struct aac_hba_cmd_req *hbacmd = (struct aac_hba_cmd_req *)
+			fibptr->hw_fib_va;
 
 	fibptr->flags = (FIB_CONTEXT_FLAG | FIB_CONTEXT_FLAG_NATIVE_HBA);
 	if (callback) {
@@ -734,11 +736,9 @@ int aac_hba_send(u8 command, struct fib *fibptr, fib_callback callback,
 		wait = 1;
 
 
-	if (command == HBA_IU_TYPE_SCSI_CMD_REQ) {
-		struct aac_hba_cmd_req *hbacmd =
-			(struct aac_hba_cmd_req *)fibptr->hw_fib_va;
+	hbacmd->iu_type = command;
 
-		hbacmd->iu_type = command;
+	if (command == HBA_IU_TYPE_SCSI_CMD_REQ) {
 		/* bit1 of request_id must be 0 */
 		hbacmd->request_id =
 			cpu_to_le32((((u32)(fibptr - dev->fibs)) << 2) + 1);

From 9411ac07cd764be34bbd7ff09125a6b7b9175d4c Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Wed, 18 Apr 2018 15:02:46 +0530
Subject: [PATCH 039/393] ARM: davinci: fix GPIO lookup for I2C

The GPIO chip is called davinci_gpio.0 in legacy mode. Fix it, so that
I2C can correctly lookup the recovery gpios.

Note that it is the gpio-davinci driver that sets the gpiochip label to
davinci_gpio.0.

Also, the I2C device uses an id of 1 on DM644x and DM355.

While at it, convert to using GPIO_TO_PIN() for referring to GPIO pin
numbers, like it is done in rest of the board support files.

Fixes: e53537653791 ("i2c/ARM: davinci: Deep refactoring of I2C recovery")
Reviewed-by: David Lechner <david@lechnology.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/board-dm355-evm.c  |  9 ++++++---
 arch/arm/mach-davinci/board-dm644x-evm.c | 10 +++++++---
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/arch/arm/mach-davinci/board-dm355-evm.c b/arch/arm/mach-davinci/board-dm355-evm.c
index 762d29683a3a..23ab9e8bc04c 100644
--- a/arch/arm/mach-davinci/board-dm355-evm.c
+++ b/arch/arm/mach-davinci/board-dm355-evm.c
@@ -110,12 +110,15 @@ static struct platform_device davinci_nand_device = {
 	},
 };
 
+#define DM355_I2C_SDA_PIN	GPIO_TO_PIN(0, 15)
+#define DM355_I2C_SCL_PIN	GPIO_TO_PIN(0, 14)
+
 static struct gpiod_lookup_table i2c_recovery_gpiod_table = {
-	.dev_id = "i2c_davinci",
+	.dev_id = "i2c_davinci.1",
 	.table = {
-		GPIO_LOOKUP("davinci_gpio", 15, "sda",
+		GPIO_LOOKUP("davinci_gpio.0", DM355_I2C_SDA_PIN, "sda",
 			    GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN),
-		GPIO_LOOKUP("davinci_gpio", 14, "scl",
+		GPIO_LOOKUP("davinci_gpio.0", DM355_I2C_SCL_PIN, "scl",
 			    GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN),
 	},
 };
diff --git a/arch/arm/mach-davinci/board-dm644x-evm.c b/arch/arm/mach-davinci/board-dm644x-evm.c
index 95b55aae1366..509e64ab1994 100644
--- a/arch/arm/mach-davinci/board-dm644x-evm.c
+++ b/arch/arm/mach-davinci/board-dm644x-evm.c
@@ -17,6 +17,7 @@
 #include <linux/i2c.h>
 #include <linux/platform_data/pcf857x.h>
 #include <linux/platform_data/at24.h>
+#include <linux/platform_data/gpio-davinci.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/rawnand.h>
 #include <linux/mtd/partitions.h>
@@ -596,12 +597,15 @@ static struct i2c_board_info __initdata i2c_info[] =  {
 	},
 };
 
+#define DM644X_I2C_SDA_PIN	GPIO_TO_PIN(2, 12)
+#define DM644X_I2C_SCL_PIN	GPIO_TO_PIN(2, 11)
+
 static struct gpiod_lookup_table i2c_recovery_gpiod_table = {
-	.dev_id = "i2c_davinci",
+	.dev_id = "i2c_davinci.1",
 	.table = {
-		GPIO_LOOKUP("davinci_gpio", 44, "sda",
+		GPIO_LOOKUP("davinci_gpio.0", DM644X_I2C_SDA_PIN, "sda",
 			    GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN),
-		GPIO_LOOKUP("davinci_gpio", 43, "scl",
+		GPIO_LOOKUP("davinci_gpio.0", DM644X_I2C_SCL_PIN, "scl",
 			    GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN),
 	},
 };

From e76b6312391bdd62e31dc86cb65e478b07b7909e Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 2 May 2018 20:16:56 +0200
Subject: [PATCH 040/393] iov_iter: fix return type of __pipe_get_pages()

It returns -EFAULT and happens to be a helper for pipe_get_pages()
whose return type is ssize_t.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 lib/iov_iter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 970212670b6a..4d5bf40d399d 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1012,7 +1012,7 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_gap_alignment);
 
-static inline size_t __pipe_get_pages(struct iov_iter *i,
+static inline ssize_t __pipe_get_pages(struct iov_iter *i,
 				size_t maxsize,
 				struct page **pages,
 				int idx,

From d7760d638b140d53c6390a2fbee9b06460b43e9e Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 2 May 2018 20:16:57 +0200
Subject: [PATCH 041/393] iov_iter: fix memory leak in pipe_get_pages_alloc()

Make n signed to avoid leaking the pages array if __pipe_get_pages()
fails to allocate any pages.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 lib/iov_iter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 4d5bf40d399d..fdae394172fa 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1102,7 +1102,7 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 		   size_t *start)
 {
 	struct page **p;
-	size_t n;
+	ssize_t n;
 	int idx;
 	int npages;
 

From 9df50ba76ac1485b844beffa1f3f5d9659d9cdaf Mon Sep 17 00:00:00 2001
From: Bhadram Varka <vbhadram@nvidia.com>
Date: Wed, 2 May 2018 20:44:40 +0530
Subject: [PATCH 042/393] arm64: tegra: Make BCM89610 PHY interrupt as active
 low

Need to configure PHY interrupt as active low for P3310 Tegra186
platform otherwise it results in spurious interrupts.

This issue wasn't seen before because the generic PHY driver without
interrupt support was used.

Signed-off-by: Bhadram Varka <vbhadram@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi b/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi
index a8baad7b80df..13f57fff1477 100644
--- a/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra186-p3310.dtsi
@@ -46,7 +46,7 @@ phy: phy@0 {
 				compatible = "ethernet-phy-ieee802.3-c22";
 				reg = <0x0>;
 				interrupt-parent = <&gpio>;
-				interrupts = <TEGRA_MAIN_GPIO(M, 5) IRQ_TYPE_LEVEL_HIGH>;
+				interrupts = <TEGRA_MAIN_GPIO(M, 5) IRQ_TYPE_LEVEL_LOW>;
 			};
 		};
 	};

From 1aa7a5735a41418d8e01fa7c9565eb2657e2ea3f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 1 May 2018 15:55:51 +0200
Subject: [PATCH 043/393] x86/nospec: Simplify alternative_msr_write()

The macro is not type safe and I did look for why that "g" constraint for
the asm doesn't work: it's because the asm is more fundamentally wrong.

It does

        movl %[val], %%eax

but "val" isn't a 32-bit value, so then gcc will pass it in a register,
and generate code like

        movl %rsi, %eax

and gas will complain about a nonsensical 'mov' instruction (it's moving a
64-bit register to a 32-bit one).

Passing it through memory will just hide the real bug - gcc still thinks
the memory location is 64-bit, but the "movl" will only load the first 32
bits and it all happens to work because x86 is little-endian.

Convert it to a type safe inline function with a little trick which hands
the feature into the ALTERNATIVE macro.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f928ad9b143f..870acfc2fd34 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -241,15 +241,16 @@ static inline void vmexit_fill_RSB(void)
 #endif
 }
 
-#define alternative_msr_write(_msr, _val, _feature)		\
-	asm volatile(ALTERNATIVE("",				\
-				 "movl %[msr], %%ecx\n\t"	\
-				 "movl %[val], %%eax\n\t"	\
-				 "movl $0, %%edx\n\t"		\
-				 "wrmsr",			\
-				 _feature)			\
-		     : : [msr] "i" (_msr), [val] "i" (_val)	\
-		     : "eax", "ecx", "edx", "memory")
+static __always_inline
+void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
+{
+	asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
+		: : "c" (msr),
+		    "a" (val),
+		    "d" (val >> 32),
+		    [feature] "i" (feature)
+		: "memory");
+}
 
 static inline void indirect_branch_prediction_barrier(void)
 {

From 4a28bfe3267b68e22c663ac26185aa16c9b879ef Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:16 -0400
Subject: [PATCH 044/393] x86/bugs: Concentrate bug detection into a separate
 function

Combine the various logic which goes through all those
x86_cpu_id matching structures in one function.

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ce243f7d2d4e..9ed0a18ff6f6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -927,21 +927,27 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
 	{}
 };
 
-static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = 0;
 
+	if (x86_match_cpu(cpu_no_speculation))
+		return;
+
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+
 	if (x86_match_cpu(cpu_no_meltdown))
-		return false;
+		return;
 
 	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
 
 	/* Rogue Data Cache Load? No! */
 	if (ia32_cap & ARCH_CAP_RDCL_NO)
-		return false;
+		return;
 
-	return true;
+	setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
 }
 
 /*
@@ -992,12 +998,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 
-	if (!x86_match_cpu(cpu_no_speculation)) {
-		if (cpu_vulnerable_to_meltdown(c))
-			setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
-		setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
-		setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
-	}
+	cpu_set_bug_bits(c);
 
 	fpu__init_system(c);
 

From d1059518b4789cabe34bb4b714d07e6089c82ca1 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:17 -0400
Subject: [PATCH 045/393] x86/bugs: Concentrate bug reporting into a separate
 function

Those SysFS functions have a similar preamble, as such make common
code to handle them.

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/bugs.c | 48 ++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bfca937bdcc3..ad613f73d071 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -314,30 +314,48 @@ static void __init spectre_v2_select_mitigation(void)
 #undef pr_fmt
 
 #ifdef CONFIG_SYSFS
+
+ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+			char *buf, unsigned int bug)
+{
+	if (!boot_cpu_has_bug(bug))
+		return sprintf(buf, "Not affected\n");
+
+	switch (bug) {
+	case X86_BUG_CPU_MELTDOWN:
+		if (boot_cpu_has(X86_FEATURE_PTI))
+			return sprintf(buf, "Mitigation: PTI\n");
+
+		break;
+
+	case X86_BUG_SPECTRE_V1:
+		return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+
+	case X86_BUG_SPECTRE_V2:
+		return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+			       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
+			       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+			       spectre_v2_module_string());
+
+	default:
+		break;
+	}
+
+	return sprintf(buf, "Vulnerable\n");
+}
+
 ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
-		return sprintf(buf, "Not affected\n");
-	if (boot_cpu_has(X86_FEATURE_PTI))
-		return sprintf(buf, "Mitigation: PTI\n");
-	return sprintf(buf, "Vulnerable\n");
+	return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN);
 }
 
 ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
-		return sprintf(buf, "Not affected\n");
-	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);
 }
 
 ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
-		return sprintf(buf, "Not affected\n");
-
-	return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
-		       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
-		       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
-		       spectre_v2_module_string());
+	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
 }
 #endif

From 1b86883ccb8d5d9506529d42dbe1a5257cb30b18 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:18 -0400
Subject: [PATCH 046/393] x86/bugs: Read SPEC_CTRL MSR during boot and re-use
 reserved bits

The 336996-Speculative-Execution-Side-Channel-Mitigations.pdf refers to all
the other bits as reserved. The Intel SDM glossary defines reserved as
implementation specific - aka unknown.

As such at bootup this must be taken it into account and proper masking for
the bits in use applied.

A copy of this document is available at
https://bugzilla.kernel.org/show_bug.cgi?id=199511

[ tglx: Made x86_spec_ctrl_base __ro_after_init ]

Suggested-by: Jon Masters <jcm@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h | 24 ++++++++++++++++++++----
 arch/x86/kernel/cpu/bugs.c           | 28 ++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 870acfc2fd34..9ec3d4d448cd 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -217,6 +217,17 @@ enum spectre_v2_mitigation {
 	SPECTRE_V2_IBRS,
 };
 
+/*
+ * The Intel specification for the SPEC_CTRL MSR requires that we
+ * preserve any already set reserved bits at boot time (e.g. for
+ * future additions that this kernel is not currently aware of).
+ * We then set any additional mitigation bits that we want
+ * ourselves and always use this as the base for SPEC_CTRL.
+ * We also use this when handling guest entry/exit as below.
+ */
+extern void x86_spec_ctrl_set(u64);
+extern u64 x86_spec_ctrl_get_default(void);
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
@@ -254,8 +265,9 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
 
 static inline void indirect_branch_prediction_barrier(void)
 {
-	alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,
-			      X86_FEATURE_USE_IBPB);
+	u64 val = PRED_CMD_IBPB;
+
+	alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB);
 }
 
 /*
@@ -266,14 +278,18 @@ static inline void indirect_branch_prediction_barrier(void)
  */
 #define firmware_restrict_branch_speculation_start()			\
 do {									\
+	u64 val = x86_spec_ctrl_get_default() | SPEC_CTRL_IBRS;		\
+									\
 	preempt_disable();						\
-	alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS,	\
+	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
 			      X86_FEATURE_USE_IBRS_FW);			\
 } while (0)
 
 #define firmware_restrict_branch_speculation_end()			\
 do {									\
-	alternative_msr_write(MSR_IA32_SPEC_CTRL, 0,			\
+	u64 val = x86_spec_ctrl_get_default();				\
+									\
+	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
 			      X86_FEATURE_USE_IBRS_FW);			\
 	preempt_enable();						\
 } while (0)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ad613f73d071..6ed84f50d74a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -28,6 +28,12 @@
 
 static void __init spectre_v2_select_mitigation(void);
 
+/*
+ * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
+ * writes to SPEC_CTRL contain whatever reserved bits have been set.
+ */
+static u64 __ro_after_init x86_spec_ctrl_base;
+
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -37,6 +43,13 @@ void __init check_bugs(void)
 		print_cpu_info(&boot_cpu_data);
 	}
 
+	/*
+	 * Read the SPEC_CTRL MSR to account for reserved bits which may
+	 * have unknown values.
+	 */
+	if (boot_cpu_has(X86_FEATURE_IBRS))
+		rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
 	/* Select the proper spectre mitigation before patching alternatives */
 	spectre_v2_select_mitigation();
 
@@ -95,6 +108,21 @@ static const char *spectre_v2_strings[] = {
 
 static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
 
+void x86_spec_ctrl_set(u64 val)
+{
+	if (val & ~SPEC_CTRL_IBRS)
+		WARN_ONCE(1, "SPEC_CTRL MSR value 0x%16llx is unknown.\n", val);
+	else
+		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base | val);
+}
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_set);
+
+u64 x86_spec_ctrl_get_default(void)
+{
+	return x86_spec_ctrl_base;
+}
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
+
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
 

From 5cf687548705412da47c9cec342fd952d71ed3d5 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:19 -0400
Subject: [PATCH 047/393] x86/bugs, KVM: Support the combination of guest and
 host IBRS

A guest may modify the SPEC_CTRL MSR from the value used by the
kernel. Since the kernel doesn't use IBRS, this means a value of zero is
what is needed in the host.

But the 336996-Speculative-Execution-Side-Channel-Mitigations.pdf refers to
the other bits as reserved so the kernel should respect the boot time
SPEC_CTRL value and use that.

This allows to deal with future extensions to the SPEC_CTRL interface if
any at all.

Note: This uses wrmsrl() instead of native_wrmsl(). I does not make any
difference as paravirt will over-write the callq *0xfff.. with the wrmsrl
assembler code.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h | 10 ++++++++++
 arch/x86/kernel/cpu/bugs.c           | 18 ++++++++++++++++++
 arch/x86/kvm/svm.c                   |  6 ++----
 arch/x86/kvm/vmx.c                   |  6 ++----
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 9ec3d4d448cd..d1c2630922da 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -228,6 +228,16 @@ enum spectre_v2_mitigation {
 extern void x86_spec_ctrl_set(u64);
 extern u64 x86_spec_ctrl_get_default(void);
 
+/*
+ * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
+ * the guest has, while on VMEXIT we restore the host view. This
+ * would be easier if SPEC_CTRL were architecturally maskable or
+ * shadowable for guests but this is not (currently) the case.
+ * Takes the guest view of SPEC_CTRL MSR as a parameter.
+ */
+extern void x86_spec_ctrl_set_guest(u64);
+extern void x86_spec_ctrl_restore_host(u64);
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 6ed84f50d74a..38a8626c894c 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -123,6 +123,24 @@ u64 x86_spec_ctrl_get_default(void)
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
 
+void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
+{
+	if (!boot_cpu_has(X86_FEATURE_IBRS))
+		return;
+	if (x86_spec_ctrl_base != guest_spec_ctrl)
+		wrmsrl(MSR_IA32_SPEC_CTRL, guest_spec_ctrl);
+}
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_set_guest);
+
+void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
+{
+	if (!boot_cpu_has(X86_FEATURE_IBRS))
+		return;
+	if (x86_spec_ctrl_base != guest_spec_ctrl)
+		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+}
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_restore_host);
+
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1fc05e428aba..8bfc8fe26851 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5557,8 +5557,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	if (svm->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+	x86_spec_ctrl_set_guest(svm->spec_ctrl);
 
 	asm volatile (
 		"push %%" _ASM_BP "; \n\t"
@@ -5670,8 +5669,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	if (svm->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+	x86_spec_ctrl_restore_host(svm->spec_ctrl);
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c7668806163f..3091dc98a451 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9720,8 +9720,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	if (vmx->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+	x86_spec_ctrl_set_guest(vmx->spec_ctrl);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
 
@@ -9869,8 +9868,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	if (vmx->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+	x86_spec_ctrl_restore_host(vmx->spec_ctrl);
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();

From c456442cd3a59eeb1d60293c26cbe2ff2c4e42cf Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:20 -0400
Subject: [PATCH 048/393] x86/bugs: Expose /sys/../spec_store_bypass

Add the sysfs file for the new vulerability. It does not do much except
show the words 'Vulnerable' for recent x86 cores.

Intel cores prior to family 6 are known not to be vulnerable, and so are
some Atoms and some Xeon Phi.

It assumes that older Cyrix, Centaur, etc. cores are immune.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 .../ABI/testing/sysfs-devices-system-cpu      |  1 +
 arch/x86/include/asm/cpufeatures.h            |  1 +
 arch/x86/kernel/cpu/bugs.c                    |  5 ++++
 arch/x86/kernel/cpu/common.c                  | 23 +++++++++++++++++++
 drivers/base/cpu.c                            |  8 +++++++
 include/linux/cpu.h                           |  2 ++
 6 files changed, 40 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 025b7cf3768d..bd4975e132d3 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -478,6 +478,7 @@ What:		/sys/devices/system/cpu/vulnerabilities
 		/sys/devices/system/cpu/vulnerabilities/meltdown
 		/sys/devices/system/cpu/vulnerabilities/spectre_v1
 		/sys/devices/system/cpu/vulnerabilities/spectre_v2
+		/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
 Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 578793e97431..3493b58ef2cc 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -363,5 +363,6 @@
 #define X86_BUG_CPU_MELTDOWN		X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
 #define X86_BUG_SPECTRE_V1		X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
 #define X86_BUG_SPECTRE_V2		X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS	X86_BUG(17) /* CPU is affected by speculative store bypass attack */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 38a8626c894c..59a9f3aa9f7a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -404,4 +404,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
 }
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
+}
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9ed0a18ff6f6..6ff0cc441a59 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -927,10 +927,33 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
 	{}
 };
 
+static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_PINEVIEW	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_LINCROFT	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_PENWELL		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_CLOVERVIEW	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_CEDARVIEW	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT1	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT2	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MERRIFIELD	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_CORE_YONAH		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		},
+	{ X86_VENDOR_CENTAUR,	5,					},
+	{ X86_VENDOR_INTEL,	5,					},
+	{ X86_VENDOR_NSC,	5,					},
+	{ X86_VENDOR_ANY,	4,					},
+	{}
+};
+
 static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = 0;
 
+	if (!x86_match_cpu(cpu_no_spec_store_bypass))
+		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
 	if (x86_match_cpu(cpu_no_speculation))
 		return;
 
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 2da998baa75c..30cc9c877ebb 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -534,14 +534,22 @@ ssize_t __weak cpu_show_spectre_v2(struct device *dev,
 	return sprintf(buf, "Not affected\n");
 }
 
+ssize_t __weak cpu_show_spec_store_bypass(struct device *dev,
+					  struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
+static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_meltdown.attr,
 	&dev_attr_spectre_v1.attr,
 	&dev_attr_spectre_v2.attr,
+	&dev_attr_spec_store_bypass.attr,
 	NULL
 };
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 7b01bc11c692..a97a63eef59f 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -53,6 +53,8 @@ extern ssize_t cpu_show_spectre_v1(struct device *dev,
 				   struct device_attribute *attr, char *buf);
 extern ssize_t cpu_show_spectre_v2(struct device *dev,
 				   struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
+					  struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,

From 0cc5fa00b0a88dad140b4e5c2cead9951ad36822 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Sat, 28 Apr 2018 22:34:17 +0200
Subject: [PATCH 049/393] x86/cpufeatures: Add X86_FEATURE_RDS

Add the CPU feature bit CPUID.7.0.EDX[31] which indicates whether the CPU
supports Reduced Data Speculation.

[ tglx: Split it out from a later patch ]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 3493b58ef2cc..48305a1ddfd6 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -334,6 +334,7 @@
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_RDS			(18*32+31) /* Reduced Data Speculation */
 
 /*
  * BUG word(s)

From 24f7fc83b9204d20f878c57cb77d261ae825e033 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:21 -0400
Subject: [PATCH 050/393] x86/bugs: Provide boot parameters for the
 spec_store_bypass_disable mitigation

Contemporary high performance processors use a common industry-wide
optimization known as "Speculative Store Bypass" in which loads from
addresses to which a recent store has occurred may (speculatively) see an
older value. Intel refers to this feature as "Memory Disambiguation" which
is part of their "Smart Memory Access" capability.

Memory Disambiguation can expose a cache side-channel attack against such
speculatively read values. An attacker can create exploit code that allows
them to read memory outside of a sandbox environment (for example,
malicious JavaScript in a web page), or to perform more complex attacks
against code running within the same privilege level, e.g. via the stack.

As a first step to mitigate against such attacks, provide two boot command
line control knobs:

 nospec_store_bypass_disable
 spec_store_bypass_disable=[off,auto,on]

By default affected x86 processors will power on with Speculative
Store Bypass enabled. Hence the provided kernel parameters are written
from the point of view of whether to enable a mitigation or not.
The parameters are as follows:

 - auto - Kernel detects whether your CPU model contains an implementation
	  of Speculative Store Bypass and picks the most appropriate
	  mitigation.

 - on   - disable Speculative Store Bypass
 - off  - enable Speculative Store Bypass

[ tglx: Reordered the checks so that the whole evaluation is not done
  	when the CPU does not support RDS ]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  33 ++++++
 arch/x86/include/asm/cpufeatures.h            |   1 +
 arch/x86/include/asm/nospec-branch.h          |   6 +
 arch/x86/kernel/cpu/bugs.c                    | 103 ++++++++++++++++++
 4 files changed, 143 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 11fc28ecdb6d..f32112c55f7e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2680,6 +2680,9 @@
 			allow data leaks with this option, which is equivalent
 			to spectre_v2=off.
 
+	nospec_store_bypass_disable
+			[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
+
 	noxsave		[BUGS=X86] Disables x86 extended register state save
 			and restore using xsave. The kernel will fallback to
 			enabling legacy floating-point and sse state.
@@ -4025,6 +4028,36 @@
 			Not specifying this option is equivalent to
 			spectre_v2=auto.
 
+	spec_store_bypass_disable=
+			[HW] Control Speculative Store Bypass (SSB) Disable mitigation
+			(Speculative Store Bypass vulnerability)
+
+			Certain CPUs are vulnerable to an exploit against a
+			a common industry wide performance optimization known
+			as "Speculative Store Bypass" in which recent stores
+			to the same memory location may not be observed by
+			later loads during speculative execution. The idea
+			is that such stores are unlikely and that they can
+			be detected prior to instruction retirement at the
+			end of a particular speculation execution window.
+
+			In vulnerable processors, the speculatively forwarded
+			store can be used in a cache side channel attack, for
+			example to read memory to which the attacker does not
+			directly have access (e.g. inside sandboxed code).
+
+			This parameter controls whether the Speculative Store
+			Bypass optimization is used.
+
+			on     - Unconditionally disable Speculative Store Bypass
+			off    - Unconditionally enable Speculative Store Bypass
+			auto   - Kernel detects whether the CPU model contains an
+				 implementation of Speculative Store Bypass and
+				 picks the most appropriate mitigation
+
+			Not specifying this option is equivalent to
+			spec_store_bypass_disable=auto.
+
 	spia_io_base=	[HW,MTD]
 	spia_fio_base=
 	spia_pedr=
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 48305a1ddfd6..58689ed969bb 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -214,6 +214,7 @@
 
 #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
 #define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index d1c2630922da..7b9eacfc03f3 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -238,6 +238,12 @@ extern u64 x86_spec_ctrl_get_default(void);
 extern void x86_spec_ctrl_set_guest(u64);
 extern void x86_spec_ctrl_restore_host(u64);
 
+/* The Speculative Store Bypass disable variants */
+enum ssb_mitigation {
+	SPEC_STORE_BYPASS_NONE,
+	SPEC_STORE_BYPASS_DISABLE,
+};
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 59a9f3aa9f7a..2d1f1cb604aa 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -27,6 +27,7 @@
 #include <asm/intel-family.h>
 
 static void __init spectre_v2_select_mitigation(void);
+static void __init ssb_select_mitigation(void);
 
 /*
  * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
@@ -53,6 +54,12 @@ void __init check_bugs(void)
 	/* Select the proper spectre mitigation before patching alternatives */
 	spectre_v2_select_mitigation();
 
+	/*
+	 * Select proper mitigation for any exposure to the Speculative Store
+	 * Bypass vulnerability.
+	 */
+	ssb_select_mitigation();
+
 #ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
@@ -357,6 +364,99 @@ static void __init spectre_v2_select_mitigation(void)
 	}
 }
 
+#undef pr_fmt
+#define pr_fmt(fmt)	"Speculative Store Bypass: " fmt
+
+static enum ssb_mitigation ssb_mode = SPEC_STORE_BYPASS_NONE;
+
+/* The kernel command line selection */
+enum ssb_mitigation_cmd {
+	SPEC_STORE_BYPASS_CMD_NONE,
+	SPEC_STORE_BYPASS_CMD_AUTO,
+	SPEC_STORE_BYPASS_CMD_ON,
+};
+
+static const char *ssb_strings[] = {
+	[SPEC_STORE_BYPASS_NONE]	= "Vulnerable",
+	[SPEC_STORE_BYPASS_DISABLE]	= "Mitigation: Speculative Store Bypass disabled"
+};
+
+static const struct {
+	const char *option;
+	enum ssb_mitigation_cmd cmd;
+} ssb_mitigation_options[] = {
+	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
+	{ "on",		SPEC_STORE_BYPASS_CMD_ON },   /* Disable Speculative Store Bypass */
+	{ "off",	SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
+};
+
+static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
+{
+	enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+	char arg[20];
+	int ret, i;
+
+	if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
+		return SPEC_STORE_BYPASS_CMD_NONE;
+	} else {
+		ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
+					  arg, sizeof(arg));
+		if (ret < 0)
+			return SPEC_STORE_BYPASS_CMD_AUTO;
+
+		for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
+			if (!match_option(arg, ret, ssb_mitigation_options[i].option))
+				continue;
+
+			cmd = ssb_mitigation_options[i].cmd;
+			break;
+		}
+
+		if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
+			pr_err("unknown option (%s). Switching to AUTO select\n", arg);
+			return SPEC_STORE_BYPASS_CMD_AUTO;
+		}
+	}
+
+	return cmd;
+}
+
+static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
+{
+	enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
+	enum ssb_mitigation_cmd cmd;
+
+	if (!boot_cpu_has(X86_FEATURE_RDS))
+		return mode;
+
+	cmd = ssb_parse_cmdline();
+	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) &&
+	    (cmd == SPEC_STORE_BYPASS_CMD_NONE ||
+	     cmd == SPEC_STORE_BYPASS_CMD_AUTO))
+		return mode;
+
+	switch (cmd) {
+	case SPEC_STORE_BYPASS_CMD_AUTO:
+	case SPEC_STORE_BYPASS_CMD_ON:
+		mode = SPEC_STORE_BYPASS_DISABLE;
+		break;
+	case SPEC_STORE_BYPASS_CMD_NONE:
+		break;
+	}
+
+	if (mode != SPEC_STORE_BYPASS_NONE)
+		setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+	return mode;
+}
+
+static void ssb_select_mitigation()
+{
+	ssb_mode = __ssb_select_mitigation();
+
+	if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+		pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
 #undef pr_fmt
 
 #ifdef CONFIG_SYSFS
@@ -383,6 +483,9 @@ ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
 			       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
 			       spectre_v2_module_string());
 
+	case X86_BUG_SPEC_STORE_BYPASS:
+		return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+
 	default:
 		break;
 	}

From 772439717dbf703b39990be58d8d4e3e4ad0598a Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:22 -0400
Subject: [PATCH 051/393] x86/bugs/intel: Set proper CPU features and setup RDS

Intel CPUs expose methods to:

 - Detect whether RDS capability is available via CPUID.7.0.EDX[31],

 - The SPEC_CTRL MSR(0x48), bit 2 set to enable RDS.

 - MSR_IA32_ARCH_CAPABILITIES, Bit(4) no need to enable RRS.

With that in mind if spec_store_bypass_disable=[auto,on] is selected set at
boot-time the SPEC_CTRL MSR to enable RDS if the platform requires it.

Note that this does not fix the KVM case where the SPEC_CTRL is exposed to
guests which can muck with it, see patch titled :
 KVM/SVM/VMX/x86/spectre_v2: Support the combination of guest and host IBRS.

And for the firmware (IBRS to be set), see patch titled:
 x86/spectre_v2: Read SPEC_CTRL MSR during boot and re-use reserved bits

[ tglx: Distangled it from the intel implementation and kept the call order ]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/msr-index.h |  6 ++++++
 arch/x86/kernel/cpu/bugs.c       | 30 ++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/common.c     | 10 ++++++----
 arch/x86/kernel/cpu/cpu.h        |  2 ++
 arch/x86/kernel/cpu/intel.c      |  1 +
 5 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 53d5b1b9255e..21e1a6df9907 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -42,6 +42,7 @@
 #define MSR_IA32_SPEC_CTRL		0x00000048 /* Speculation Control */
 #define SPEC_CTRL_IBRS			(1 << 0)   /* Indirect Branch Restricted Speculation */
 #define SPEC_CTRL_STIBP			(1 << 1)   /* Single Thread Indirect Branch Predictors */
+#define SPEC_CTRL_RDS			(1 << 2)   /* Reduced Data Speculation */
 
 #define MSR_IA32_PRED_CMD		0x00000049 /* Prediction Command */
 #define PRED_CMD_IBPB			(1 << 0)   /* Indirect Branch Prediction Barrier */
@@ -68,6 +69,11 @@
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
+#define ARCH_CAP_RDS_NO			(1 << 4)   /*
+						    * Not susceptible to Speculative Store Bypass
+						    * attack, so no Reduced Data Speculation control
+						    * required.
+						    */
 
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 2d1f1cb604aa..17edf5bdc361 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -117,7 +117,7 @@ static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
 
 void x86_spec_ctrl_set(u64 val)
 {
-	if (val & ~SPEC_CTRL_IBRS)
+	if (val & ~(SPEC_CTRL_IBRS | SPEC_CTRL_RDS))
 		WARN_ONCE(1, "SPEC_CTRL MSR value 0x%16llx is unknown.\n", val);
 	else
 		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base | val);
@@ -444,8 +444,28 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 		break;
 	}
 
-	if (mode != SPEC_STORE_BYPASS_NONE)
+	/*
+	 * We have three CPU feature flags that are in play here:
+	 *  - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+	 *  - X86_FEATURE_RDS - CPU is able to turn off speculative store bypass
+	 *  - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
+	 */
+	if (mode != SPEC_STORE_BYPASS_NONE) {
 		setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+		/*
+		 * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
+		 * a completely different MSR and bit dependent on family.
+		 */
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_INTEL:
+			x86_spec_ctrl_base |= SPEC_CTRL_RDS;
+			x86_spec_ctrl_set(SPEC_CTRL_RDS);
+			break;
+		case X86_VENDOR_AMD:
+			break;
+		}
+	}
+
 	return mode;
 }
 
@@ -459,6 +479,12 @@ static void ssb_select_mitigation()
 
 #undef pr_fmt
 
+void x86_spec_ctrl_setup_ap(void)
+{
+	if (boot_cpu_has(X86_FEATURE_IBRS))
+		x86_spec_ctrl_set(x86_spec_ctrl_base & (SPEC_CTRL_IBRS | SPEC_CTRL_RDS));
+}
+
 #ifdef CONFIG_SYSFS
 
 ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6ff0cc441a59..d6dc71d616ea 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -951,7 +951,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = 0;
 
-	if (!x86_match_cpu(cpu_no_spec_store_bypass))
+	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+	if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
+	   !(ia32_cap & ARCH_CAP_RDS_NO))
 		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
 
 	if (x86_match_cpu(cpu_no_speculation))
@@ -963,9 +967,6 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 	if (x86_match_cpu(cpu_no_meltdown))
 		return;
 
-	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
-		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
-
 	/* Rogue Data Cache Load? No! */
 	if (ia32_cap & ARCH_CAP_RDCL_NO)
 		return;
@@ -1383,6 +1384,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
 #endif
 	mtrr_ap_init();
 	validate_apic_and_package_id(c);
+	x86_spec_ctrl_setup_ap();
 }
 
 static __init int setup_noclflush(char *arg)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index e806b11a99af..37672d299e35 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -50,4 +50,6 @@ extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
 
 unsigned int aperfmperf_get_khz(int cpu);
 
+extern void x86_spec_ctrl_setup_ap(void);
+
 #endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 60d1897041da..ef3f9c01c274 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -189,6 +189,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		setup_clear_cpu_cap(X86_FEATURE_STIBP);
 		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
 		setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
+		setup_clear_cpu_cap(X86_FEATURE_RDS);
 	}
 
 	/*

From 1115a859f33276fe8afb31c60cf9d8e657872558 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:23 -0400
Subject: [PATCH 052/393] x86/bugs: Whitelist allowed SPEC_CTRL MSR values

Intel and AMD SPEC_CTRL (0x48) MSR semantics may differ in the
future (or in fact use different MSRs for the same functionality).

As such a run-time mechanism is required to whitelist the appropriate MSR
values.

[ tglx: Made the variable __ro_after_init ]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/bugs.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 17edf5bdc361..d809d398675a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -35,6 +35,12 @@ static void __init ssb_select_mitigation(void);
  */
 static u64 __ro_after_init x86_spec_ctrl_base;
 
+/*
+ * The vendor and possibly platform specific bits which can be modified in
+ * x86_spec_ctrl_base.
+ */
+static u64 __ro_after_init x86_spec_ctrl_mask = ~SPEC_CTRL_IBRS;
+
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -117,7 +123,7 @@ static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
 
 void x86_spec_ctrl_set(u64 val)
 {
-	if (val & ~(SPEC_CTRL_IBRS | SPEC_CTRL_RDS))
+	if (val & x86_spec_ctrl_mask)
 		WARN_ONCE(1, "SPEC_CTRL MSR value 0x%16llx is unknown.\n", val);
 	else
 		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base | val);
@@ -459,6 +465,7 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_INTEL:
 			x86_spec_ctrl_base |= SPEC_CTRL_RDS;
+			x86_spec_ctrl_mask &= ~SPEC_CTRL_RDS;
 			x86_spec_ctrl_set(SPEC_CTRL_RDS);
 			break;
 		case X86_VENDOR_AMD:
@@ -482,7 +489,7 @@ static void ssb_select_mitigation()
 void x86_spec_ctrl_setup_ap(void)
 {
 	if (boot_cpu_has(X86_FEATURE_IBRS))
-		x86_spec_ctrl_set(x86_spec_ctrl_base & (SPEC_CTRL_IBRS | SPEC_CTRL_RDS));
+		x86_spec_ctrl_set(x86_spec_ctrl_base & ~x86_spec_ctrl_mask);
 }
 
 #ifdef CONFIG_SYSFS

From 764f3c21588a059cd783c6ba0734d4db2d72822d Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:24 -0400
Subject: [PATCH 053/393] x86/bugs/AMD: Add support to disable RDS on
 Fam[15,16,17]h if requested

AMD does not need the Speculative Store Bypass mitigation to be enabled.

The parameters for this are already available and can be done via MSR
C001_1020. Each family uses a different bit in that MSR for this.

[ tglx: Expose the bit mask via a variable and move the actual MSR fiddling
  	into the bugs code as that's the right thing to do and also required
	to prepare for dynamic enable/disable ]

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeatures.h   |  1 +
 arch/x86/include/asm/nospec-branch.h |  4 ++++
 arch/x86/kernel/cpu/amd.c            | 26 ++++++++++++++++++++++++++
 arch/x86/kernel/cpu/bugs.c           | 27 ++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/common.c         |  4 ++++
 5 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 58689ed969bb..b2464c1787df 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -215,6 +215,7 @@
 #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
 #define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
 #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
+#define X86_FEATURE_AMD_RDS		(7*32+24)  /* "" AMD RDS implementation */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 7b9eacfc03f3..3a1541cf32de 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -244,6 +244,10 @@ enum ssb_mitigation {
 	SPEC_STORE_BYPASS_DISABLE,
 };
 
+/* AMD specific Speculative Store Bypass MSR data */
+extern u64 x86_amd_ls_cfg_base;
+extern u64 x86_amd_ls_cfg_rds_mask;
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 12bc0a1139da..3d553fa5075d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -10,6 +10,7 @@
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
+#include <asm/nospec-branch.h>
 #include <asm/smp.h>
 #include <asm/pci-direct.h>
 #include <asm/delay.h>
@@ -554,6 +555,26 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
 		nodes_per_socket = ((value >> 3) & 7) + 1;
 	}
+
+	if (c->x86 >= 0x15 && c->x86 <= 0x17) {
+		unsigned int bit;
+
+		switch (c->x86) {
+		case 0x15: bit = 54; break;
+		case 0x16: bit = 33; break;
+		case 0x17: bit = 10; break;
+		default: return;
+		}
+		/*
+		 * Try to cache the base value so further operations can
+		 * avoid RMW. If that faults, do not enable RDS.
+		 */
+		if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+			setup_force_cpu_cap(X86_FEATURE_RDS);
+			setup_force_cpu_cap(X86_FEATURE_AMD_RDS);
+			x86_amd_ls_cfg_rds_mask = 1ULL << bit;
+		}
+	}
 }
 
 static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -898,6 +919,11 @@ static void init_amd(struct cpuinfo_x86 *c)
 	/* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
 	if (!cpu_has(c, X86_FEATURE_XENPV))
 		set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+	if (boot_cpu_has(X86_FEATURE_AMD_RDS)) {
+		set_cpu_cap(c, X86_FEATURE_RDS);
+		set_cpu_cap(c, X86_FEATURE_AMD_RDS);
+	}
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d809d398675a..86cdf635636d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -41,6 +41,13 @@ static u64 __ro_after_init x86_spec_ctrl_base;
  */
 static u64 __ro_after_init x86_spec_ctrl_mask = ~SPEC_CTRL_IBRS;
 
+/*
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_rds_mask is initialized in identify_boot_cpu().
+ */
+u64 __ro_after_init x86_amd_ls_cfg_base;
+u64 __ro_after_init x86_amd_ls_cfg_rds_mask;
+
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -52,7 +59,8 @@ void __init check_bugs(void)
 
 	/*
 	 * Read the SPEC_CTRL MSR to account for reserved bits which may
-	 * have unknown values.
+	 * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+	 * init code as it is not enumerated and depends on the family.
 	 */
 	if (boot_cpu_has(X86_FEATURE_IBRS))
 		rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
@@ -154,6 +162,14 @@ void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_restore_host);
 
+static void x86_amd_rds_enable(void)
+{
+	u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_rds_mask;
+
+	if (boot_cpu_has(X86_FEATURE_AMD_RDS))
+		wrmsrl(MSR_AMD64_LS_CFG, msrval);
+}
+
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
 
@@ -443,6 +459,11 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 
 	switch (cmd) {
 	case SPEC_STORE_BYPASS_CMD_AUTO:
+		/*
+		 * AMD platforms by default don't need SSB mitigation.
+		 */
+		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+			break;
 	case SPEC_STORE_BYPASS_CMD_ON:
 		mode = SPEC_STORE_BYPASS_DISABLE;
 		break;
@@ -469,6 +490,7 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 			x86_spec_ctrl_set(SPEC_CTRL_RDS);
 			break;
 		case X86_VENDOR_AMD:
+			x86_amd_rds_enable();
 			break;
 		}
 	}
@@ -490,6 +512,9 @@ void x86_spec_ctrl_setup_ap(void)
 {
 	if (boot_cpu_has(X86_FEATURE_IBRS))
 		x86_spec_ctrl_set(x86_spec_ctrl_base & ~x86_spec_ctrl_mask);
+
+	if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+		x86_amd_rds_enable();
 }
 
 #ifdef CONFIG_SYSFS
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d6dc71d616ea..e0517bcee446 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -943,6 +943,10 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
 	{ X86_VENDOR_CENTAUR,	5,					},
 	{ X86_VENDOR_INTEL,	5,					},
 	{ X86_VENDOR_NSC,	5,					},
+	{ X86_VENDOR_AMD,	0x12,					},
+	{ X86_VENDOR_AMD,	0x11,					},
+	{ X86_VENDOR_AMD,	0x10,					},
+	{ X86_VENDOR_AMD,	0xf,					},
 	{ X86_VENDOR_ANY,	4,					},
 	{}
 };

From da39556f66f5cfe8f9c989206974f1cb16ca5d7c Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 25 Apr 2018 22:04:25 -0400
Subject: [PATCH 054/393] x86/KVM/VMX: Expose SPEC_CTRL Bit(2) to the guest

Expose the CPUID.7.EDX[31] bit to the guest, and also guard against various
combinations of SPEC_CTRL MSR values.

The handling of the MSR (to take into account the host value of SPEC_CTRL
Bit(2)) is taken care of in patch:

  KVM/SVM/VMX/x86/spectre_v2: Support the combination of guest and host IBRS

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kvm/cpuid.c | 2 +-
 arch/x86/kvm/vmx.c   | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 82055b90a8b3..376ac9a2a2b9 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -407,7 +407,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
-		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(RDS) |
 		F(ARCH_CAPABILITIES);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3091dc98a451..4197942b6d9b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3524,7 +3524,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_RDS))
 			return 1;
 
 		msr_info->data = to_vmx(vcpu)->spec_ctrl;
@@ -3643,11 +3644,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_RDS))
 			return 1;
 
 		/* The STIBP bit doesn't fault even if it's not advertised */
-		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_RDS))
 			return 1;
 
 		vmx->spec_ctrl = data;

From 28a2775217b17208811fa43a9e96bd1fdf417b86 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 29 Apr 2018 15:01:37 +0200
Subject: [PATCH 055/393] x86/speculation: Create spec-ctrl.h to avoid include
 hell

Having everything in nospec-branch.h creates a hell of dependencies when
adding the prctl based switching mechanism. Move everything which is not
required in nospec-branch.h to spec-ctrl.h and fix up the includes in the
relevant files.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h | 14 --------------
 arch/x86/include/asm/spec-ctrl.h     | 21 +++++++++++++++++++++
 arch/x86/kernel/cpu/amd.c            |  2 +-
 arch/x86/kernel/cpu/bugs.c           |  2 +-
 arch/x86/kvm/svm.c                   |  2 +-
 arch/x86/kvm/vmx.c                   |  2 +-
 6 files changed, 25 insertions(+), 18 deletions(-)
 create mode 100644 arch/x86/include/asm/spec-ctrl.h

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 3a1541cf32de..1119f14bc883 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -228,26 +228,12 @@ enum spectre_v2_mitigation {
 extern void x86_spec_ctrl_set(u64);
 extern u64 x86_spec_ctrl_get_default(void);
 
-/*
- * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
- * the guest has, while on VMEXIT we restore the host view. This
- * would be easier if SPEC_CTRL were architecturally maskable or
- * shadowable for guests but this is not (currently) the case.
- * Takes the guest view of SPEC_CTRL MSR as a parameter.
- */
-extern void x86_spec_ctrl_set_guest(u64);
-extern void x86_spec_ctrl_restore_host(u64);
-
 /* The Speculative Store Bypass disable variants */
 enum ssb_mitigation {
 	SPEC_STORE_BYPASS_NONE,
 	SPEC_STORE_BYPASS_DISABLE,
 };
 
-/* AMD specific Speculative Store Bypass MSR data */
-extern u64 x86_amd_ls_cfg_base;
-extern u64 x86_amd_ls_cfg_rds_mask;
-
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
new file mode 100644
index 000000000000..3ad64420a06e
--- /dev/null
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPECCTRL_H_
+#define _ASM_X86_SPECCTRL_H_
+
+#include <asm/nospec-branch.h>
+
+/*
+ * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
+ * the guest has, while on VMEXIT we restore the host view. This
+ * would be easier if SPEC_CTRL were architecturally maskable or
+ * shadowable for guests but this is not (currently) the case.
+ * Takes the guest view of SPEC_CTRL MSR as a parameter.
+ */
+extern void x86_spec_ctrl_set_guest(u64);
+extern void x86_spec_ctrl_restore_host(u64);
+
+/* AMD specific Speculative Store Bypass MSR data */
+extern u64 x86_amd_ls_cfg_base;
+extern u64 x86_amd_ls_cfg_rds_mask;
+
+#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3d553fa5075d..18efc33a8d2e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -10,7 +10,7 @@
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 #include <asm/smp.h>
 #include <asm/pci-direct.h>
 #include <asm/delay.h>
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 86cdf635636d..ec171873167a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -13,7 +13,7 @@
 #include <linux/cpu.h>
 #include <linux/module.h>
 
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 #include <asm/cmdline.h>
 #include <asm/bugs.h>
 #include <asm/processor.h>
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8bfc8fe26851..437c1b371129 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -49,7 +49,7 @@
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 #include <asm/irq_remapping.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4197942b6d9b..16a111e44691 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,7 +51,7 @@
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 #include <asm/mmu_context.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 #include <asm/mshyperv.h>
 
 #include "trace.h"

From b617cfc858161140d69cc0b5cc211996b557a1c7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 29 Apr 2018 15:20:11 +0200
Subject: [PATCH 056/393] prctl: Add speculation control prctls

Add two new prctls to control aspects of speculation related vulnerabilites
and their mitigations to provide finer grained control over performance
impacting mitigations.

PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature
which is selected with arg2 of prctl(2). The return value uses bit 0-2 with
the following meaning:

Bit  Define           Description
0    PR_SPEC_PRCTL    Mitigation can be controlled per task by
                      PR_SET_SPECULATION_CTRL
1    PR_SPEC_ENABLE   The speculation feature is enabled, mitigation is
                      disabled
2    PR_SPEC_DISABLE  The speculation feature is disabled, mitigation is
                      enabled

If all bits are 0 the CPU is not affected by the speculation misfeature.

If PR_SPEC_PRCTL is set, then the per task control of the mitigation is
available. If not set, prctl(PR_SET_SPECULATION_CTRL) for the speculation
misfeature will fail.

PR_SET_SPECULATION_CTRL allows to control the speculation misfeature, which
is selected by arg2 of prctl(2) per task. arg3 is used to hand in the
control value, i.e. either PR_SPEC_ENABLE or PR_SPEC_DISABLE.

The common return values are:

EINVAL  prctl is not implemented by the architecture or the unused prctl()
        arguments are not 0
ENODEV  arg2 is selecting a not supported speculation misfeature

PR_SET_SPECULATION_CTRL has these additional return values:

ERANGE  arg3 is incorrect, i.e. it's not either PR_SPEC_ENABLE or PR_SPEC_DISABLE
ENXIO   prctl control of the selected speculation misfeature is disabled

The first supported controlable speculation misfeature is
PR_SPEC_STORE_BYPASS. Add the define so this can be shared between
architectures.

Based on an initial patch from Tim Chen and mostly rewritten.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 Documentation/userspace-api/index.rst     |  1 +
 Documentation/userspace-api/spec_ctrl.rst | 86 +++++++++++++++++++++++
 include/linux/nospec.h                    |  5 ++
 include/uapi/linux/prctl.h                | 11 +++
 kernel/sys.c                              | 22 ++++++
 5 files changed, 125 insertions(+)
 create mode 100644 Documentation/userspace-api/spec_ctrl.rst

diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index 7b2eb1b7d4ca..a3233da7fa88 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -19,6 +19,7 @@ place where this information is gathered.
    no_new_privs
    seccomp_filter
    unshare
+   spec_ctrl
 
 .. only::  subproject and html
 
diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst
new file mode 100644
index 000000000000..ddbebcd01208
--- /dev/null
+++ b/Documentation/userspace-api/spec_ctrl.rst
@@ -0,0 +1,86 @@
+===================
+Speculation Control
+===================
+
+Quite some CPUs have speculation related misfeatures which are in fact
+vulnerabilites causing data leaks in various forms even accross privilege
+domains.
+
+The kernel provides mitigation for such vulnerabilities in various
+forms. Some of these mitigations are compile time configurable and some on
+the kernel command line.
+
+There is also a class of mitigations which are very expensive, but they can
+be restricted to a certain set of processes or tasks in controlled
+environments. The mechanism to control these mitigations is via
+:manpage:`prctl(2)`.
+
+There are two prctl options which are related to this:
+
+ * PR_GET_SPECULATION_CTRL
+
+ * PR_SET_SPECULATION_CTRL
+
+PR_GET_SPECULATION_CTRL
+-----------------------
+
+PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature
+which is selected with arg2 of prctl(2). The return value uses bits 0-2 with
+the following meaning:
+
+==== ================ ===================================================
+Bit  Define           Description
+==== ================ ===================================================
+0    PR_SPEC_PRCTL    Mitigation can be controlled per task by
+                      PR_SET_SPECULATION_CTRL
+1    PR_SPEC_ENABLE   The speculation feature is enabled, mitigation is
+                      disabled
+2    PR_SPEC_DISABLE  The speculation feature is disabled, mitigation is
+                      enabled
+==== ================ ===================================================
+
+If all bits are 0 the CPU is not affected by the speculation misfeature.
+
+If PR_SPEC_PRCTL is set, then the per task control of the mitigation is
+available. If not set, prctl(PR_SET_SPECULATION_CTRL) for the speculation
+misfeature will fail.
+
+PR_SET_SPECULATION_CTRL
+-----------------------
+PR_SET_SPECULATION_CTRL allows to control the speculation misfeature, which
+is selected by arg2 of :manpage:`prctl(2)` per task. arg3 is used to hand
+in the control value, i.e. either PR_SPEC_ENABLE or PR_SPEC_DISABLE.
+
+Common error codes
+------------------
+======= =================================================================
+Value   Meaning
+======= =================================================================
+EINVAL  The prctl is not implemented by the architecture or unused
+        prctl(2) arguments are not 0
+
+ENODEV  arg2 is selecting a not supported speculation misfeature
+======= =================================================================
+
+PR_SET_SPECULATION_CTRL error codes
+-----------------------------------
+======= =================================================================
+Value   Meaning
+======= =================================================================
+0       Success
+
+ERANGE  arg3 is incorrect, i.e. it's neither PR_SPEC_ENABLE nor
+        PR_SPEC_DISABLE
+
+ENXIO   Control of the selected speculation misfeature is not possible.
+        See PR_GET_SPECULATION_CTRL.
+======= =================================================================
+
+Speculation misfeature controls
+-------------------------------
+- PR_SPEC_STORE_BYPASS: Speculative Store Bypass
+
+  Invocations:
+   * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);
+   * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
+   * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index e791ebc65c9c..700bb8a4e4ea 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -55,4 +55,9 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 									\
 	(typeof(_i)) (_i & _mask);					\
 })
+
+/* Speculation control prctl */
+int arch_prctl_spec_ctrl_get(unsigned long which);
+int arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl);
+
 #endif /* _LINUX_NOSPEC_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index af5f8c2df87a..ebf057ac1346 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -207,4 +207,15 @@ struct prctl_mm_map {
 # define PR_SVE_VL_LEN_MASK		0xffff
 # define PR_SVE_VL_INHERIT		(1 << 17) /* inherit across exec */
 
+/* Per task speculation control */
+#define PR_GET_SPECULATION_CTRL		52
+#define PR_SET_SPECULATION_CTRL		53
+/* Speculation control variants */
+# define PR_SPEC_STORE_BYPASS		0
+/* Return and control values for PR_SET/GET_SPECULATION_CTRL */
+# define PR_SPEC_NOT_AFFECTED		0
+# define PR_SPEC_PRCTL			(1UL << 0)
+# define PR_SPEC_ENABLE			(1UL << 1)
+# define PR_SPEC_DISABLE		(1UL << 2)
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index ad692183dfe9..b76dee23bdc9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -61,6 +61,8 @@
 #include <linux/uidgid.h>
 #include <linux/cred.h>
 
+#include <linux/nospec.h>
+
 #include <linux/kmsg_dump.h>
 /* Move somewhere else to avoid recompiling? */
 #include <generated/utsrelease.h>
@@ -2242,6 +2244,16 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data)
 	return 1;
 }
 
+int __weak arch_prctl_spec_ctrl_get(unsigned long which)
+{
+	return -EINVAL;
+}
+
+int __weak arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl)
+{
+	return -EINVAL;
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2450,6 +2462,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SVE_GET_VL:
 		error = SVE_GET_VL();
 		break;
+	case PR_GET_SPECULATION_CTRL:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_prctl_spec_ctrl_get(arg2);
+		break;
+	case PR_SET_SPECULATION_CTRL:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = arch_prctl_spec_ctrl_set(arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;

From 885f82bfbc6fefb6664ea27965c3ab9ac4194b8c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 29 Apr 2018 15:21:42 +0200
Subject: [PATCH 057/393] x86/process: Allow runtime control of Speculative
 Store Bypass

The Speculative Store Bypass vulnerability can be mitigated with the
Reduced Data Speculation (RDS) feature. To allow finer grained control of
this eventually expensive mitigation a per task mitigation control is
required.

Add a new TIF_RDS flag and put it into the group of TIF flags which are
evaluated for mismatch in switch_to(). If these bits differ in the previous
and the next task, then the slow path function __switch_to_xtra() is
invoked. Implement the TIF_RDS dependent mitigation control in the slow
path.

If the prctl for controlling Speculative Store Bypass is disabled or no
task uses the prctl then there is no overhead in the switch_to() fast
path.

Update the KVM related speculation control functions to take TID_RDS into
account as well.

Based on a patch from Tim Chen. Completely rewritten.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/msr-index.h   |  3 ++-
 arch/x86/include/asm/spec-ctrl.h   | 17 +++++++++++++++++
 arch/x86/include/asm/thread_info.h |  4 +++-
 arch/x86/kernel/cpu/bugs.c         | 26 +++++++++++++++++++++-----
 arch/x86/kernel/process.c          | 22 ++++++++++++++++++++++
 5 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 21e1a6df9907..810f50bb338d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -42,7 +42,8 @@
 #define MSR_IA32_SPEC_CTRL		0x00000048 /* Speculation Control */
 #define SPEC_CTRL_IBRS			(1 << 0)   /* Indirect Branch Restricted Speculation */
 #define SPEC_CTRL_STIBP			(1 << 1)   /* Single Thread Indirect Branch Predictors */
-#define SPEC_CTRL_RDS			(1 << 2)   /* Reduced Data Speculation */
+#define SPEC_CTRL_RDS_SHIFT		2	   /* Reduced Data Speculation bit */
+#define SPEC_CTRL_RDS			(1 << SPEC_CTRL_RDS_SHIFT)   /* Reduced Data Speculation */
 
 #define MSR_IA32_PRED_CMD		0x00000049 /* Prediction Command */
 #define PRED_CMD_IBPB			(1 << 0)   /* Indirect Branch Prediction Barrier */
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index 3ad64420a06e..45ef00ad5105 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_SPECCTRL_H_
 #define _ASM_X86_SPECCTRL_H_
 
+#include <linux/thread_info.h>
 #include <asm/nospec-branch.h>
 
 /*
@@ -18,4 +19,20 @@ extern void x86_spec_ctrl_restore_host(u64);
 extern u64 x86_amd_ls_cfg_base;
 extern u64 x86_amd_ls_cfg_rds_mask;
 
+/* The Intel SPEC CTRL MSR base value cache */
+extern u64 x86_spec_ctrl_base;
+
+static inline u64 rds_tif_to_spec_ctrl(u64 tifn)
+{
+	BUILD_BUG_ON(TIF_RDS < SPEC_CTRL_RDS_SHIFT);
+	return (tifn & _TIF_RDS) >> (TIF_RDS - SPEC_CTRL_RDS_SHIFT);
+}
+
+static inline u64 rds_tif_to_amd_ls_cfg(u64 tifn)
+{
+	return (tifn & _TIF_RDS) ? x86_amd_ls_cfg_rds_mask : 0ULL;
+}
+
+extern void speculative_store_bypass_update(void);
+
 #endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a5d9521bb2cb..e5c26cc59619 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,6 +79,7 @@ struct thread_info {
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
+#define TIF_RDS			5	/* Reduced data speculation */
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
@@ -105,6 +106,7 @@ struct thread_info {
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
+#define _TIF_RDS		(1 << TIF_RDS)
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
@@ -144,7 +146,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
+	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_RDS)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ec171873167a..2bc109d0f8ae 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -33,7 +33,7 @@ static void __init ssb_select_mitigation(void);
  * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
  * writes to SPEC_CTRL contain whatever reserved bits have been set.
  */
-static u64 __ro_after_init x86_spec_ctrl_base;
+u64 __ro_after_init x86_spec_ctrl_base;
 
 /*
  * The vendor and possibly platform specific bits which can be modified in
@@ -140,25 +140,41 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_set);
 
 u64 x86_spec_ctrl_get_default(void)
 {
-	return x86_spec_ctrl_base;
+	u64 msrval = x86_spec_ctrl_base;
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		msrval |= rds_tif_to_spec_ctrl(current_thread_info()->flags);
+	return msrval;
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
 
 void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
 {
+	u64 host = x86_spec_ctrl_base;
+
 	if (!boot_cpu_has(X86_FEATURE_IBRS))
 		return;
-	if (x86_spec_ctrl_base != guest_spec_ctrl)
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		host |= rds_tif_to_spec_ctrl(current_thread_info()->flags);
+
+	if (host != guest_spec_ctrl)
 		wrmsrl(MSR_IA32_SPEC_CTRL, guest_spec_ctrl);
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_set_guest);
 
 void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
 {
+	u64 host = x86_spec_ctrl_base;
+
 	if (!boot_cpu_has(X86_FEATURE_IBRS))
 		return;
-	if (x86_spec_ctrl_base != guest_spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		host |= rds_tif_to_spec_ctrl(current_thread_info()->flags);
+
+	if (host != guest_spec_ctrl)
+		wrmsrl(MSR_IA32_SPEC_CTRL, host);
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_restore_host);
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 03408b942adb..397342725046 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -38,6 +38,7 @@
 #include <asm/switch_to.h>
 #include <asm/desc.h>
 #include <asm/prctl.h>
+#include <asm/spec-ctrl.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -278,6 +279,24 @@ static inline void switch_to_bitmap(struct tss_struct *tss,
 	}
 }
 
+static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
+{
+	u64 msr;
+
+	if (static_cpu_has(X86_FEATURE_AMD_RDS)) {
+		msr = x86_amd_ls_cfg_base | rds_tif_to_amd_ls_cfg(tifn);
+		wrmsrl(MSR_AMD64_LS_CFG, msr);
+	} else {
+		msr = x86_spec_ctrl_base | rds_tif_to_spec_ctrl(tifn);
+		wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+	}
+}
+
+void speculative_store_bypass_update(void)
+{
+	__speculative_store_bypass_update(current_thread_info()->flags);
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss)
 {
@@ -309,6 +328,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 
 	if ((tifp ^ tifn) & _TIF_NOCPUID)
 		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+
+	if ((tifp ^ tifn) & _TIF_RDS)
+		__speculative_store_bypass_update(tifn);
 }
 
 /*

From a73ec77ee17ec556fe7f165d00314cb7c047b1ac Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 29 Apr 2018 15:26:40 +0200
Subject: [PATCH 058/393] x86/speculation: Add prctl for Speculative Store
 Bypass mitigation

Add prctl based control for Speculative Store Bypass mitigation and make it
the default mitigation for Intel and AMD.

Andi Kleen provided the following rationale (slightly redacted):

 There are multiple levels of impact of Speculative Store Bypass:

 1) JITed sandbox.
    It cannot invoke system calls, but can do PRIME+PROBE and may have call
    interfaces to other code

 2) Native code process.
    No protection inside the process at this level.

 3) Kernel.

 4) Between processes.

 The prctl tries to protect against case (1) doing attacks.

 If the untrusted code can do random system calls then control is already
 lost in a much worse way. So there needs to be system call protection in
 some way (using a JIT not allowing them or seccomp). Or rather if the
 process can subvert its environment somehow to do the prctl it can already
 execute arbitrary code, which is much worse than SSB.

 To put it differently, the point of the prctl is to not allow JITed code
 to read data it shouldn't read from its JITed sandbox. If it already has
 escaped its sandbox then it can already read everything it wants in its
 address space, and do much worse.

 The ability to control Speculative Store Bypass allows to enable the
 protection selectively without affecting overall system performance.

Based on an initial patch from Tim Chen. Completely rewritten.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 .../admin-guide/kernel-parameters.txt         |  6 +-
 arch/x86/include/asm/nospec-branch.h          |  1 +
 arch/x86/kernel/cpu/bugs.c                    | 83 ++++++++++++++++---
 3 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f32112c55f7e..a8d2ae1e335b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4053,7 +4053,11 @@
 			off    - Unconditionally enable Speculative Store Bypass
 			auto   - Kernel detects whether the CPU model contains an
 				 implementation of Speculative Store Bypass and
-				 picks the most appropriate mitigation
+				 picks the most appropriate mitigation.
+			prctl  - Control Speculative Store Bypass per thread
+				 via prctl. Speculative Store Bypass is enabled
+				 for a process by default. The state of the control
+				 is inherited on fork.
 
 			Not specifying this option is equivalent to
 			spec_store_bypass_disable=auto.
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 1119f14bc883..71ad01422655 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -232,6 +232,7 @@ extern u64 x86_spec_ctrl_get_default(void);
 enum ssb_mitigation {
 	SPEC_STORE_BYPASS_NONE,
 	SPEC_STORE_BYPASS_DISABLE,
+	SPEC_STORE_BYPASS_PRCTL,
 };
 
 extern char __indirect_thunk_start[];
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 2bc109d0f8ae..fc9187b6fae7 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -12,6 +12,8 @@
 #include <linux/utsname.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
 
 #include <asm/spec-ctrl.h>
 #include <asm/cmdline.h>
@@ -412,20 +414,23 @@ enum ssb_mitigation_cmd {
 	SPEC_STORE_BYPASS_CMD_NONE,
 	SPEC_STORE_BYPASS_CMD_AUTO,
 	SPEC_STORE_BYPASS_CMD_ON,
+	SPEC_STORE_BYPASS_CMD_PRCTL,
 };
 
 static const char *ssb_strings[] = {
 	[SPEC_STORE_BYPASS_NONE]	= "Vulnerable",
-	[SPEC_STORE_BYPASS_DISABLE]	= "Mitigation: Speculative Store Bypass disabled"
+	[SPEC_STORE_BYPASS_DISABLE]	= "Mitigation: Speculative Store Bypass disabled",
+	[SPEC_STORE_BYPASS_PRCTL]	= "Mitigation: Speculative Store Bypass disabled via prctl"
 };
 
 static const struct {
 	const char *option;
 	enum ssb_mitigation_cmd cmd;
 } ssb_mitigation_options[] = {
-	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
-	{ "on",		SPEC_STORE_BYPASS_CMD_ON },   /* Disable Speculative Store Bypass */
-	{ "off",	SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
+	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO },  /* Platform decides */
+	{ "on",		SPEC_STORE_BYPASS_CMD_ON },    /* Disable Speculative Store Bypass */
+	{ "off",	SPEC_STORE_BYPASS_CMD_NONE },  /* Don't touch Speculative Store Bypass */
+	{ "prctl",	SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */
 };
 
 static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
@@ -475,14 +480,15 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 
 	switch (cmd) {
 	case SPEC_STORE_BYPASS_CMD_AUTO:
-		/*
-		 * AMD platforms by default don't need SSB mitigation.
-		 */
-		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-			break;
+		/* Choose prctl as the default mode */
+		mode = SPEC_STORE_BYPASS_PRCTL;
+		break;
 	case SPEC_STORE_BYPASS_CMD_ON:
 		mode = SPEC_STORE_BYPASS_DISABLE;
 		break;
+	case SPEC_STORE_BYPASS_CMD_PRCTL:
+		mode = SPEC_STORE_BYPASS_PRCTL;
+		break;
 	case SPEC_STORE_BYPASS_CMD_NONE:
 		break;
 	}
@@ -493,7 +499,7 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 	 *  - X86_FEATURE_RDS - CPU is able to turn off speculative store bypass
 	 *  - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
 	 */
-	if (mode != SPEC_STORE_BYPASS_NONE) {
+	if (mode == SPEC_STORE_BYPASS_DISABLE) {
 		setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
 		/*
 		 * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
@@ -524,6 +530,63 @@ static void ssb_select_mitigation()
 
 #undef pr_fmt
 
+static int ssb_prctl_set(unsigned long ctrl)
+{
+	bool rds = !!test_tsk_thread_flag(current, TIF_RDS);
+
+	if (ssb_mode != SPEC_STORE_BYPASS_PRCTL)
+		return -ENXIO;
+
+	if (ctrl == PR_SPEC_ENABLE)
+		clear_tsk_thread_flag(current, TIF_RDS);
+	else
+		set_tsk_thread_flag(current, TIF_RDS);
+
+	if (rds != !!test_tsk_thread_flag(current, TIF_RDS))
+		speculative_store_bypass_update();
+
+	return 0;
+}
+
+static int ssb_prctl_get(void)
+{
+	switch (ssb_mode) {
+	case SPEC_STORE_BYPASS_DISABLE:
+		return PR_SPEC_DISABLE;
+	case SPEC_STORE_BYPASS_PRCTL:
+		if (test_tsk_thread_flag(current, TIF_RDS))
+			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+	default:
+		if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+			return PR_SPEC_ENABLE;
+		return PR_SPEC_NOT_AFFECTED;
+	}
+}
+
+int arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl)
+{
+	if (ctrl != PR_SPEC_ENABLE && ctrl != PR_SPEC_DISABLE)
+		return -ERANGE;
+
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssb_prctl_set(ctrl);
+	default:
+		return -ENODEV;
+	}
+}
+
+int arch_prctl_spec_ctrl_get(unsigned long which)
+{
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssb_prctl_get();
+	default:
+		return -ENODEV;
+	}
+}
+
 void x86_spec_ctrl_setup_ap(void)
 {
 	if (boot_cpu_has(X86_FEATURE_IBRS))

From 7bbf1373e228840bb0295a2ca26d548ef37f448e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 1 May 2018 15:19:04 -0700
Subject: [PATCH 059/393] nospec: Allow getting/setting on non-current task

Adjust arch_prctl_get/set_spec_ctrl() to operate on tasks other than
current.

This is needed both for /proc/$pid/status queries and for seccomp (since
thread-syncing can trigger seccomp in non-current threads).

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/bugs.c | 27 ++++++++++++++++-----------
 include/linux/nospec.h     |  7 +++++--
 kernel/sys.c               |  9 +++++----
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index fc9187b6fae7..e3afb610f2ad 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -530,31 +530,35 @@ static void ssb_select_mitigation()
 
 #undef pr_fmt
 
-static int ssb_prctl_set(unsigned long ctrl)
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
 {
-	bool rds = !!test_tsk_thread_flag(current, TIF_RDS);
+	bool rds = !!test_tsk_thread_flag(task, TIF_RDS);
 
 	if (ssb_mode != SPEC_STORE_BYPASS_PRCTL)
 		return -ENXIO;
 
 	if (ctrl == PR_SPEC_ENABLE)
-		clear_tsk_thread_flag(current, TIF_RDS);
+		clear_tsk_thread_flag(task, TIF_RDS);
 	else
-		set_tsk_thread_flag(current, TIF_RDS);
+		set_tsk_thread_flag(task, TIF_RDS);
 
-	if (rds != !!test_tsk_thread_flag(current, TIF_RDS))
+	/*
+	 * If being set on non-current task, delay setting the CPU
+	 * mitigation until it is next scheduled.
+	 */
+	if (task == current && rds != !!test_tsk_thread_flag(task, TIF_RDS))
 		speculative_store_bypass_update();
 
 	return 0;
 }
 
-static int ssb_prctl_get(void)
+static int ssb_prctl_get(struct task_struct *task)
 {
 	switch (ssb_mode) {
 	case SPEC_STORE_BYPASS_DISABLE:
 		return PR_SPEC_DISABLE;
 	case SPEC_STORE_BYPASS_PRCTL:
-		if (test_tsk_thread_flag(current, TIF_RDS))
+		if (test_tsk_thread_flag(task, TIF_RDS))
 			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
 		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
 	default:
@@ -564,24 +568,25 @@ static int ssb_prctl_get(void)
 	}
 }
 
-int arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl)
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+			     unsigned long ctrl)
 {
 	if (ctrl != PR_SPEC_ENABLE && ctrl != PR_SPEC_DISABLE)
 		return -ERANGE;
 
 	switch (which) {
 	case PR_SPEC_STORE_BYPASS:
-		return ssb_prctl_set(ctrl);
+		return ssb_prctl_set(task, ctrl);
 	default:
 		return -ENODEV;
 	}
 }
 
-int arch_prctl_spec_ctrl_get(unsigned long which)
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
 {
 	switch (which) {
 	case PR_SPEC_STORE_BYPASS:
-		return ssb_prctl_get();
+		return ssb_prctl_get(task);
 	default:
 		return -ENODEV;
 	}
diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index 700bb8a4e4ea..a908c954484d 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -7,6 +7,8 @@
 #define _LINUX_NOSPEC_H
 #include <asm/barrier.h>
 
+struct task_struct;
+
 /**
  * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
  * @index: array element index
@@ -57,7 +59,8 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 })
 
 /* Speculation control prctl */
-int arch_prctl_spec_ctrl_get(unsigned long which);
-int arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl);
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which);
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+			     unsigned long ctrl);
 
 #endif /* _LINUX_NOSPEC_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index b76dee23bdc9..b0eee418ee0d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2244,12 +2244,13 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data)
 	return 1;
 }
 
-int __weak arch_prctl_spec_ctrl_get(unsigned long which)
+int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
 {
 	return -EINVAL;
 }
 
-int __weak arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl)
+int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
+				    unsigned long ctrl)
 {
 	return -EINVAL;
 }
@@ -2465,12 +2466,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_SPECULATION_CTRL:
 		if (arg3 || arg4 || arg5)
 			return -EINVAL;
-		error = arch_prctl_spec_ctrl_get(arg2);
+		error = arch_prctl_spec_ctrl_get(me, arg2);
 		break;
 	case PR_SET_SPECULATION_CTRL:
 		if (arg4 || arg5)
 			return -EINVAL;
-		error = arch_prctl_spec_ctrl_set(arg2, arg3);
+		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
 		break;
 	default:
 		error = -EINVAL;

From fae1fa0fc6cca8beee3ab8ed71d54f9a78fa3f64 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 1 May 2018 15:31:45 -0700
Subject: [PATCH 060/393] proc: Provide details on speculation flaw mitigations

As done with seccomp and no_new_privs, also show speculation flaw
mitigation state in /proc/$pid/status.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/proc/array.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ae2c807fd719..303c155f9b04 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -85,6 +85,7 @@
 #include <linux/delayacct.h>
 #include <linux/seq_file.h>
 #include <linux/pid_namespace.h>
+#include <linux/prctl.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
 #include <linux/string_helpers.h>
@@ -335,6 +336,27 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 #ifdef CONFIG_SECCOMP
 	seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
 #endif
+	seq_printf(m, "\nSpeculation Store Bypass:\t");
+	switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
+	case -EINVAL:
+		seq_printf(m, "unknown");
+		break;
+	case PR_SPEC_NOT_AFFECTED:
+		seq_printf(m, "not vulnerable");
+		break;
+	case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+		seq_printf(m, "thread mitigated");
+		break;
+	case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+		seq_printf(m, "thread vulnerable");
+		break;
+	case PR_SPEC_DISABLE:
+		seq_printf(m, "globally mitigated");
+		break;
+	default:
+		seq_printf(m, "vulnerable");
+		break;
+	}
 	seq_putc(m, '\n');
 }
 

From 5c3070890d06ff82eecb808d02d2ca39169533ef Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 1 May 2018 15:07:31 -0700
Subject: [PATCH 061/393] seccomp: Enable speculation flaw mitigations

When speculation flaw mitigations are opt-in (via prctl), using seccomp
will automatically opt-in to these protections, since using seccomp
indicates at least some level of sandboxing is desired.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/seccomp.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dc77548167ef..9f34533046aa 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -19,6 +19,8 @@
 #include <linux/compat.h>
 #include <linux/coredump.h>
 #include <linux/kmemleak.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
 #include <linux/seccomp.h>
@@ -227,6 +229,19 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
 	return true;
 }
 
+/*
+ * If a given speculation mitigation is opt-in (prctl()-controlled),
+ * select it, by disabling speculation (enabling mitigation).
+ */
+static inline void spec_mitigate(struct task_struct *task,
+				 unsigned long which)
+{
+	int state = arch_prctl_spec_ctrl_get(task, which);
+
+	if (state > 0 && (state & PR_SPEC_PRCTL))
+		arch_prctl_spec_ctrl_set(task, which, PR_SPEC_DISABLE);
+}
+
 static inline void seccomp_assign_mode(struct task_struct *task,
 				       unsigned long seccomp_mode)
 {
@@ -238,6 +253,8 @@ static inline void seccomp_assign_mode(struct task_struct *task,
 	 * filter) is set.
 	 */
 	smp_mb__before_atomic();
+	/* Assume seccomp processes want speculation flaw mitigation. */
+	spec_mitigate(task, PR_SPEC_STORE_BYPASS);
 	set_tsk_thread_flag(task, TIF_SECCOMP);
 }
 

From baf64250b4a513bf4ac226fd938692dc1836f4f6 Mon Sep 17 00:00:00 2001
From: Janusz Krzysztofik <jmkrzyszt@gmail.com>
Date: Wed, 2 May 2018 20:32:03 +0200
Subject: [PATCH 062/393] ARM: OMAP1: ams-delta: fix deferred_fiq handler

The deferred_fiq handler used to limit hardware operations to IRQ
unmask only, relying on gpio-omap assigned handler performing the ACKs.
Since commit 80ac93c27441 ("gpio: omap: Fix lost edge interrupts") this
is no longer the case as handle_edge_irq() has been replaced with
handle_simmple_irq() which doesn't touch the hardware.

Add single ACK operation per each active IRQ pin to the handler. While
being at it, move unmask operation out of irq_counter loop so it is
also called only once for each active IRQ pin.

Fixes: 80ac93c27441 ("gpio: omap: Fix lost edge interrupts")
Signed-off-by: Janusz Krzysztofik <jmkrzyszt@gmail.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/mach-omap1/ams-delta-fiq.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/arch/arm/mach-omap1/ams-delta-fiq.c b/arch/arm/mach-omap1/ams-delta-fiq.c
index 793a24a53c52..d7ca9e2b40d2 100644
--- a/arch/arm/mach-omap1/ams-delta-fiq.c
+++ b/arch/arm/mach-omap1/ams-delta-fiq.c
@@ -58,22 +58,24 @@ static irqreturn_t deferred_fiq(int irq, void *dev_id)
 		irq_num = gpio_to_irq(gpio);
 		fiq_count = fiq_buffer[FIQ_CNT_INT_00 + gpio];
 
-		while (irq_counter[gpio] < fiq_count) {
-			if (gpio != AMS_DELTA_GPIO_PIN_KEYBRD_CLK) {
-				struct irq_data *d = irq_get_irq_data(irq_num);
+		if (irq_counter[gpio] < fiq_count &&
+				gpio != AMS_DELTA_GPIO_PIN_KEYBRD_CLK) {
+			struct irq_data *d = irq_get_irq_data(irq_num);
 
-				/*
-				 * It looks like handle_edge_irq() that
-				 * OMAP GPIO edge interrupts default to,
-				 * expects interrupt already unmasked.
-				 */
-				if (irq_chip && irq_chip->irq_unmask)
+			/*
+			 * handle_simple_irq() that OMAP GPIO edge
+			 * interrupts default to since commit 80ac93c27441
+			 * requires interrupt already acked and unmasked.
+			 */
+			if (irq_chip) {
+				if (irq_chip->irq_ack)
+					irq_chip->irq_ack(d);
+				if (irq_chip->irq_unmask)
 					irq_chip->irq_unmask(d);
 			}
-			generic_handle_irq(irq_num);
-
-			irq_counter[gpio]++;
 		}
+		for (; irq_counter[gpio] < fiq_count; irq_counter[gpio]++)
+			generic_handle_irq(irq_num);
 	}
 	return IRQ_HANDLED;
 }

From 647efef69de483f1dd7944ede31b4cae16acb124 Mon Sep 17 00:00:00 2001
From: Graeme Smecher <gsmecher@threespeedlogic.com>
Date: Wed, 2 May 2018 17:32:36 -0700
Subject: [PATCH 063/393] ARM: dts: correct missing "compatible" entry for
 ti81xx SoCs

The missing "compatible" entries are needed by drivers/clk/ti/clkctrl.c,
and without them the structures initialized in drivers/clk/ti/clk-814x.c
are not passed to configuration code. The result is a "not found from
clkctrl data" error message, although boot proceeds anyway.

The reason why the compatible is not found is because the board specific
files override the SoC compatible without including it. This did not
cause any issues until with the clkctrl nodes got introduced.

Very lightly tested on a (lurching) AM3874 design that's in the middle
of a kernel upgrade from TI's abandoned 2.6.37 tree.

Also tested on j5eco-evm and hp-t410 to verify the clkctrl clocks are
found.

Fixes: bb30465b5902 ("ARM: dts: dm814x: add clkctrl nodes")
Fixes: 80a06c0d8357 ("ARM: dts: dm816x: add clkctrl nodes")
Signed-off-by: Graeme Smecher <gsmecher@threespeedlogic.com>
[tony: updated to fix for 8168-evm, updated comments]
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/boot/dts/dm8148-evm.dts       | 2 +-
 arch/arm/boot/dts/dm8148-t410.dts      | 2 +-
 arch/arm/boot/dts/dm8168-evm.dts       | 2 +-
 arch/arm/boot/dts/dra62x-j5eco-evm.dts | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm/boot/dts/dm8148-evm.dts b/arch/arm/boot/dts/dm8148-evm.dts
index d6657b3bae84..85d7b5148b0a 100644
--- a/arch/arm/boot/dts/dm8148-evm.dts
+++ b/arch/arm/boot/dts/dm8148-evm.dts
@@ -10,7 +10,7 @@
 
 / {
 	model = "DM8148 EVM";
-	compatible = "ti,dm8148-evm", "ti,dm8148";
+	compatible = "ti,dm8148-evm", "ti,dm8148", "ti,dm814";
 
 	memory@80000000 {
 		device_type = "memory";
diff --git a/arch/arm/boot/dts/dm8148-t410.dts b/arch/arm/boot/dts/dm8148-t410.dts
index 63883b3479f9..6418f9cdbe83 100644
--- a/arch/arm/boot/dts/dm8148-t410.dts
+++ b/arch/arm/boot/dts/dm8148-t410.dts
@@ -9,7 +9,7 @@
 
 / {
 	model = "HP t410 Smart Zero Client";
-	compatible = "hp,t410", "ti,dm8148";
+	compatible = "hp,t410", "ti,dm8148", "ti,dm814";
 
 	memory@80000000 {
 		device_type = "memory";
diff --git a/arch/arm/boot/dts/dm8168-evm.dts b/arch/arm/boot/dts/dm8168-evm.dts
index c72a2132aa82..1d030d567307 100644
--- a/arch/arm/boot/dts/dm8168-evm.dts
+++ b/arch/arm/boot/dts/dm8168-evm.dts
@@ -10,7 +10,7 @@
 
 / {
 	model = "DM8168 EVM";
-	compatible = "ti,dm8168-evm", "ti,dm8168";
+	compatible = "ti,dm8168-evm", "ti,dm8168", "ti,dm816";
 
 	memory@80000000 {
 		device_type = "memory";
diff --git a/arch/arm/boot/dts/dra62x-j5eco-evm.dts b/arch/arm/boot/dts/dra62x-j5eco-evm.dts
index fee0547f7302..31b824ad5d29 100644
--- a/arch/arm/boot/dts/dra62x-j5eco-evm.dts
+++ b/arch/arm/boot/dts/dra62x-j5eco-evm.dts
@@ -10,7 +10,7 @@
 
 / {
 	model = "DRA62x J5 Eco EVM";
-	compatible = "ti,dra62x-j5eco-evm", "ti,dra62x", "ti,dm8148";
+	compatible = "ti,dra62x-j5eco-evm", "ti,dra62x", "ti,dm8148", "ti,dm814";
 
 	memory@80000000 {
 		device_type = "memory";

From bc519d9574618e47a0c788000fb78da95e18d953 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 3 May 2018 13:09:44 -0500
Subject: [PATCH 064/393] spi: bcm2835aux: ensure interrupts are enabled for
 shared handler

The BCM2835 AUX SPI has a shared interrupt line (with AUX UART).
Downstream fixes this with an AUX irqchip to demux the IRQ sources and a
DT change which breaks compatibility with older kernels. The AUX irqchip
was already rejected for upstream[1] and the DT change would break
working systems if the DTB is updated to a newer one. The latter issue
was brought to my attention by Alex Graf.

The root cause however is a bug in the shared handler. Shared handlers
must check that interrupts are actually enabled before servicing the
interrupt. Add a check that the TXEMPTY or IDLE interrupts are enabled.

[1] https://patchwork.kernel.org/patch/9781221/

Cc: Alexander Graf <agraf@suse.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Eric Anholt <eric@anholt.net>
Cc: Stefan Wahren <stefan.wahren@i2se.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Ray Jui <rjui@broadcom.com>
Cc: Scott Branden <sbranden@broadcom.com>
Cc: bcm-kernel-feedback-list@broadcom.com
Cc: linux-spi@vger.kernel.org
Cc: linux-rpi-kernel@lists.infradead.org
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-bcm2835aux.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/spi/spi-bcm2835aux.c b/drivers/spi/spi-bcm2835aux.c
index 1431cb98fe40..3094d818cf06 100644
--- a/drivers/spi/spi-bcm2835aux.c
+++ b/drivers/spi/spi-bcm2835aux.c
@@ -184,6 +184,11 @@ static irqreturn_t bcm2835aux_spi_interrupt(int irq, void *dev_id)
 	struct bcm2835aux_spi *bs = spi_master_get_devdata(master);
 	irqreturn_t ret = IRQ_NONE;
 
+	/* IRQ may be shared, so return if our interrupts are disabled */
+	if (!(bcm2835aux_rd(bs, BCM2835_AUX_SPI_CNTL1) &
+	      (BCM2835_AUX_SPI_CNTL1_TXEMPTY | BCM2835_AUX_SPI_CNTL1_IDLE)))
+		return ret;
+
 	/* check if we have data to read */
 	while (bs->rx_len &&
 	       (!(bcm2835aux_rd(bs, BCM2835_AUX_SPI_STAT) &

From 9bf4e370048d2bbae5262d0c6280e0142804a272 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Fri, 4 May 2018 01:55:37 +0300
Subject: [PATCH 065/393] ARM: dts: tegra20: Revert "Fix ULPI regression on
 Tegra20"

Commit 4c9a27a6c66d ("ARM: tegra: Fix ULPI regression on Tegra20") changed
"ulpi-link" clock from CDEV2 to PLL_P_OUT4. Turned out that PLL_P_OUT4 is
the parent of CDEV2 clock and original clock setup of "ulpi-link" was
correct. The reverted patch was fixing USB for one board and broke the
other, now Tegra's clk driver correctly sets parent for the CDEV2 clock
and hence patch could be reverted safely, restoring USB for all of the
boards.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Reviewed-by: Marcel Ziswiler <marcel@ziswiler.com>
Tested-by: Marcel Ziswiler <marcel@ziswiler.com>
Tested-by: Marc Dietrich <marvin24@gmx.de>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 arch/arm/boot/dts/tegra20.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/tegra20.dtsi b/arch/arm/boot/dts/tegra20.dtsi
index 0a7136462a1a..983dd5c14794 100644
--- a/arch/arm/boot/dts/tegra20.dtsi
+++ b/arch/arm/boot/dts/tegra20.dtsi
@@ -741,7 +741,7 @@ phy2: usb-phy@c5004000 {
 		phy_type = "ulpi";
 		clocks = <&tegra_car TEGRA20_CLK_USB2>,
 			 <&tegra_car TEGRA20_CLK_PLL_U>,
-			 <&tegra_car TEGRA20_CLK_PLL_P_OUT4>;
+			 <&tegra_car TEGRA20_CLK_CDEV2>;
 		clock-names = "reg", "pll_u", "ulpi-link";
 		resets = <&tegra_car 58>, <&tegra_car 22>;
 		reset-names = "usb", "utmi-pads";

From f9544b2b076ca90d887c5ae5d74fab4c21bb7c13 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 3 May 2018 15:03:30 -0700
Subject: [PATCH 066/393] x86/bugs: Make boot modes __ro_after_init

There's no reason for these to be changed after boot.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/bugs.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index e3afb610f2ad..f8d9be0e86b1 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -129,7 +129,8 @@ static const char *spectre_v2_strings[] = {
 #undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V2 : " fmt
 
-static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
+	SPECTRE_V2_NONE;
 
 void x86_spec_ctrl_set(u64 val)
 {
@@ -407,7 +408,7 @@ static void __init spectre_v2_select_mitigation(void)
 #undef pr_fmt
 #define pr_fmt(fmt)	"Speculative Store Bypass: " fmt
 
-static enum ssb_mitigation ssb_mode = SPEC_STORE_BYPASS_NONE;
+static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE;
 
 /* The kernel command line selection */
 enum ssb_mitigation_cmd {

From 356e4bfff2c5489e016fdb925adbf12a1e3950ee Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 3 May 2018 22:09:15 +0200
Subject: [PATCH 067/393] prctl: Add force disable speculation

For certain use cases it is desired to enforce mitigations so they cannot
be undone afterwards. That's important for loader stubs which want to
prevent a child from disabling the mitigation again. Will also be used for
seccomp(). The extra state preserving of the prctl state for SSB is a
preparatory step for EBPF dymanic speculation control.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/userspace-api/spec_ctrl.rst | 34 +++++++++++++---------
 arch/x86/kernel/cpu/bugs.c                | 35 ++++++++++++++++-------
 fs/proc/array.c                           |  3 ++
 include/linux/sched.h                     | 10 ++++++-
 include/uapi/linux/prctl.h                |  1 +
 5 files changed, 59 insertions(+), 24 deletions(-)

diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst
index ddbebcd01208..1b3690d30943 100644
--- a/Documentation/userspace-api/spec_ctrl.rst
+++ b/Documentation/userspace-api/spec_ctrl.rst
@@ -25,19 +25,21 @@ PR_GET_SPECULATION_CTRL
 -----------------------
 
 PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature
-which is selected with arg2 of prctl(2). The return value uses bits 0-2 with
+which is selected with arg2 of prctl(2). The return value uses bits 0-3 with
 the following meaning:
 
-==== ================ ===================================================
-Bit  Define           Description
-==== ================ ===================================================
-0    PR_SPEC_PRCTL    Mitigation can be controlled per task by
-                      PR_SET_SPECULATION_CTRL
-1    PR_SPEC_ENABLE   The speculation feature is enabled, mitigation is
-                      disabled
-2    PR_SPEC_DISABLE  The speculation feature is disabled, mitigation is
-                      enabled
-==== ================ ===================================================
+==== ===================== ===================================================
+Bit  Define                Description
+==== ===================== ===================================================
+0    PR_SPEC_PRCTL         Mitigation can be controlled per task by
+                           PR_SET_SPECULATION_CTRL
+1    PR_SPEC_ENABLE        The speculation feature is enabled, mitigation is
+                           disabled
+2    PR_SPEC_DISABLE       The speculation feature is disabled, mitigation is
+                           enabled
+3    PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A
+                           subsequent prctl(..., PR_SPEC_ENABLE) will fail.
+==== ===================== ===================================================
 
 If all bits are 0 the CPU is not affected by the speculation misfeature.
 
@@ -47,9 +49,11 @@ misfeature will fail.
 
 PR_SET_SPECULATION_CTRL
 -----------------------
+
 PR_SET_SPECULATION_CTRL allows to control the speculation misfeature, which
 is selected by arg2 of :manpage:`prctl(2)` per task. arg3 is used to hand
-in the control value, i.e. either PR_SPEC_ENABLE or PR_SPEC_DISABLE.
+in the control value, i.e. either PR_SPEC_ENABLE or PR_SPEC_DISABLE or
+PR_SPEC_FORCE_DISABLE.
 
 Common error codes
 ------------------
@@ -70,10 +74,13 @@ Value   Meaning
 0       Success
 
 ERANGE  arg3 is incorrect, i.e. it's neither PR_SPEC_ENABLE nor
-        PR_SPEC_DISABLE
+        PR_SPEC_DISABLE nor PR_SPEC_FORCE_DISABLE
 
 ENXIO   Control of the selected speculation misfeature is not possible.
         See PR_GET_SPECULATION_CTRL.
+
+EPERM   Speculation was disabled with PR_SPEC_FORCE_DISABLE and caller
+        tried to enable it again.
 ======= =================================================================
 
 Speculation misfeature controls
@@ -84,3 +91,4 @@ Speculation misfeature controls
    * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);
    * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
    * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
+   * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index f8d9be0e86b1..7e0f28160e5e 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -533,21 +533,37 @@ static void ssb_select_mitigation()
 
 static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
 {
-	bool rds = !!test_tsk_thread_flag(task, TIF_RDS);
+	bool update;
 
 	if (ssb_mode != SPEC_STORE_BYPASS_PRCTL)
 		return -ENXIO;
 
-	if (ctrl == PR_SPEC_ENABLE)
-		clear_tsk_thread_flag(task, TIF_RDS);
-	else
-		set_tsk_thread_flag(task, TIF_RDS);
+	switch (ctrl) {
+	case PR_SPEC_ENABLE:
+		/* If speculation is force disabled, enable is not allowed */
+		if (task_spec_ssb_force_disable(task))
+			return -EPERM;
+		task_clear_spec_ssb_disable(task);
+		update = test_and_clear_tsk_thread_flag(task, TIF_RDS);
+		break;
+	case PR_SPEC_DISABLE:
+		task_set_spec_ssb_disable(task);
+		update = !test_and_set_tsk_thread_flag(task, TIF_RDS);
+		break;
+	case PR_SPEC_FORCE_DISABLE:
+		task_set_spec_ssb_disable(task);
+		task_set_spec_ssb_force_disable(task);
+		update = !test_and_set_tsk_thread_flag(task, TIF_RDS);
+		break;
+	default:
+		return -ERANGE;
+	}
 
 	/*
 	 * If being set on non-current task, delay setting the CPU
 	 * mitigation until it is next scheduled.
 	 */
-	if (task == current && rds != !!test_tsk_thread_flag(task, TIF_RDS))
+	if (task == current && update)
 		speculative_store_bypass_update();
 
 	return 0;
@@ -559,7 +575,9 @@ static int ssb_prctl_get(struct task_struct *task)
 	case SPEC_STORE_BYPASS_DISABLE:
 		return PR_SPEC_DISABLE;
 	case SPEC_STORE_BYPASS_PRCTL:
-		if (test_tsk_thread_flag(task, TIF_RDS))
+		if (task_spec_ssb_force_disable(task))
+			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+		if (task_spec_ssb_disable(task))
 			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
 		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
 	default:
@@ -572,9 +590,6 @@ static int ssb_prctl_get(struct task_struct *task)
 int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
 			     unsigned long ctrl)
 {
-	if (ctrl != PR_SPEC_ENABLE && ctrl != PR_SPEC_DISABLE)
-		return -ERANGE;
-
 	switch (which) {
 	case PR_SPEC_STORE_BYPASS:
 		return ssb_prctl_set(task, ctrl);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 303c155f9b04..d178a0236514 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -344,6 +344,9 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 	case PR_SPEC_NOT_AFFECTED:
 		seq_printf(m, "not vulnerable");
 		break;
+	case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+		seq_printf(m, "thread force mitigated");
+		break;
 	case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
 		seq_printf(m, "thread mitigated");
 		break;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b3d697f3b573..e4218d4deba0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1393,7 +1393,8 @@ static inline bool is_percpu_thread(void)
 #define PFA_NO_NEW_PRIVS		0	/* May not gain new privileges. */
 #define PFA_SPREAD_PAGE			1	/* Spread page cache over cpuset */
 #define PFA_SPREAD_SLAB			2	/* Spread some slab caches over cpuset */
-
+#define PFA_SPEC_SSB_DISABLE		3	/* Speculative Store Bypass disabled */
+#define PFA_SPEC_SSB_FORCE_DISABLE	4	/* Speculative Store Bypass force disabled*/
 
 #define TASK_PFA_TEST(name, func)					\
 	static inline bool task_##func(struct task_struct *p)		\
@@ -1418,6 +1419,13 @@ TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
 TASK_PFA_SET(SPREAD_SLAB, spread_slab)
 TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
 
+TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
+TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
+TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
+
+TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
+TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
+
 static inline void
 current_restore_flags(unsigned long orig_flags, unsigned long flags)
 {
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index ebf057ac1346..db9f15f5db04 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -217,5 +217,6 @@ struct prctl_mm_map {
 # define PR_SPEC_PRCTL			(1UL << 0)
 # define PR_SPEC_ENABLE			(1UL << 1)
 # define PR_SPEC_DISABLE		(1UL << 2)
+# define PR_SPEC_FORCE_DISABLE		(1UL << 3)
 
 #endif /* _LINUX_PRCTL_H */

From b849a812f7eb92e96d1c8239b06581b2cfd8b275 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 4 May 2018 09:40:03 +0200
Subject: [PATCH 068/393] seccomp: Use PR_SPEC_FORCE_DISABLE

Use PR_SPEC_FORCE_DISABLE in seccomp() because seccomp does not allow to
widen restrictions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/seccomp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 9f34533046aa..2c819d65e15f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -239,7 +239,7 @@ static inline void spec_mitigate(struct task_struct *task,
 	int state = arch_prctl_spec_ctrl_get(task, which);
 
 	if (state > 0 && (state & PR_SPEC_PRCTL))
-		arch_prctl_spec_ctrl_set(task, which, PR_SPEC_DISABLE);
+		arch_prctl_spec_ctrl_set(task, which, PR_SPEC_FORCE_DISABLE);
 }
 
 static inline void seccomp_assign_mode(struct task_struct *task,

From 00a02d0c502a06d15e07b857f8ff921e3e402675 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 3 May 2018 14:56:12 -0700
Subject: [PATCH 069/393] seccomp: Add filter flag to opt-out of SSB mitigation

If a seccomp user is not interested in Speculative Store Bypass mitigation
by default, it can set the new SECCOMP_FILTER_FLAG_SPEC_ALLOW flag when
adding filters.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/seccomp.h                       |  5 +++--
 include/uapi/linux/seccomp.h                  |  5 +++--
 kernel/seccomp.c                              | 19 +++++++++-------
 tools/testing/selftests/seccomp/seccomp_bpf.c | 22 ++++++++++++++++---
 4 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index c723a5c4e3ff..e5320f6c8654 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -4,8 +4,9 @@
 
 #include <uapi/linux/seccomp.h>
 
-#define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC | \
-					 SECCOMP_FILTER_FLAG_LOG)
+#define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC	| \
+					 SECCOMP_FILTER_FLAG_LOG	| \
+					 SECCOMP_FILTER_FLAG_SPEC_ALLOW)
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 2a0bd9dd104d..9efc0e73d50b 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -17,8 +17,9 @@
 #define SECCOMP_GET_ACTION_AVAIL	2
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC	1
-#define SECCOMP_FILTER_FLAG_LOG		2
+#define SECCOMP_FILTER_FLAG_TSYNC	(1UL << 0)
+#define SECCOMP_FILTER_FLAG_LOG		(1UL << 1)
+#define SECCOMP_FILTER_FLAG_SPEC_ALLOW	(1UL << 2)
 
 /*
  * All BPF programs must return a 32-bit value.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 2c819d65e15f..53eb946120c1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -243,7 +243,8 @@ static inline void spec_mitigate(struct task_struct *task,
 }
 
 static inline void seccomp_assign_mode(struct task_struct *task,
-				       unsigned long seccomp_mode)
+				       unsigned long seccomp_mode,
+				       unsigned long flags)
 {
 	assert_spin_locked(&task->sighand->siglock);
 
@@ -253,8 +254,9 @@ static inline void seccomp_assign_mode(struct task_struct *task,
 	 * filter) is set.
 	 */
 	smp_mb__before_atomic();
-	/* Assume seccomp processes want speculation flaw mitigation. */
-	spec_mitigate(task, PR_SPEC_STORE_BYPASS);
+	/* Assume default seccomp processes want spec flaw mitigation. */
+	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
+		spec_mitigate(task, PR_SPEC_STORE_BYPASS);
 	set_tsk_thread_flag(task, TIF_SECCOMP);
 }
 
@@ -322,7 +324,7 @@ static inline pid_t seccomp_can_sync_threads(void)
  * without dropping the locks.
  *
  */
-static inline void seccomp_sync_threads(void)
+static inline void seccomp_sync_threads(unsigned long flags)
 {
 	struct task_struct *thread, *caller;
 
@@ -363,7 +365,8 @@ static inline void seccomp_sync_threads(void)
 		 * allow one thread to transition the other.
 		 */
 		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
-			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
+					    flags);
 	}
 }
 
@@ -486,7 +489,7 @@ static long seccomp_attach_filter(unsigned int flags,
 
 	/* Now that the new filter is in place, synchronize to all threads. */
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
-		seccomp_sync_threads();
+		seccomp_sync_threads(flags);
 
 	return 0;
 }
@@ -835,7 +838,7 @@ static long seccomp_set_mode_strict(void)
 #ifdef TIF_NOTSC
 	disable_TSC();
 #endif
-	seccomp_assign_mode(current, seccomp_mode);
+	seccomp_assign_mode(current, seccomp_mode, 0);
 	ret = 0;
 
 out:
@@ -893,7 +896,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	/* Do not free the successfully attached filter. */
 	prepared = NULL;
 
-	seccomp_assign_mode(current, seccomp_mode);
+	seccomp_assign_mode(current, seccomp_mode, flags);
 out:
 	spin_unlock_irq(&current->sighand->siglock);
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 168c66d74fc5..e1473234968d 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -134,11 +134,15 @@ struct seccomp_data {
 #endif
 
 #ifndef SECCOMP_FILTER_FLAG_TSYNC
-#define SECCOMP_FILTER_FLAG_TSYNC 1
+#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
 #endif
 
 #ifndef SECCOMP_FILTER_FLAG_LOG
-#define SECCOMP_FILTER_FLAG_LOG 2
+#define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
+#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
 #endif
 
 #ifndef PTRACE_SECCOMP_GET_METADATA
@@ -2072,14 +2076,26 @@ TEST(seccomp_syscall_mode_lock)
 TEST(detect_seccomp_filter_flags)
 {
 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
-				 SECCOMP_FILTER_FLAG_LOG };
+				 SECCOMP_FILTER_FLAG_LOG,
+				 SECCOMP_FILTER_FLAG_SPEC_ALLOW };
 	unsigned int flag, all_flags;
 	int i;
 	long ret;
 
 	/* Test detection of known-good filter flags */
 	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
+		int bits = 0;
+
 		flag = flags[i];
+		/* Make sure the flag is a single bit! */
+		while (flag) {
+			if (flag & 0x1)
+				bits ++;
+			flag >>= 1;
+		}
+		ASSERT_EQ(1, bits);
+		flag = flags[i];
+
 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
 		ASSERT_NE(ENOSYS, errno) {
 			TH_LOG("Kernel does not support seccomp syscall!");

From 8bf37d8c067bb7eb8e7c381bdadf9bd89182b6bc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 4 May 2018 15:12:06 +0200
Subject: [PATCH 070/393] seccomp: Move speculation migitation control to arch
 code

The migitation control is simpler to implement in architecture code as it
avoids the extra function call to check the mode. Aside of that having an
explicit seccomp enabled mode in the architecture mitigations would require
even more workarounds.

Move it into architecture code and provide a weak function in the seccomp
code. Remove the 'which' argument as this allows the architecture to decide
which mitigations are relevant for seccomp.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/bugs.c | 29 ++++++++++++++++++-----------
 include/linux/nospec.h     |  2 ++
 kernel/seccomp.c           | 15 ++-------------
 3 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 7e0f28160e5e..5dab4c3d26e7 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -569,6 +569,24 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
 	return 0;
 }
 
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+			     unsigned long ctrl)
+{
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssb_prctl_set(task, ctrl);
+	default:
+		return -ENODEV;
+	}
+}
+
+#ifdef CONFIG_SECCOMP
+void arch_seccomp_spec_mitigate(struct task_struct *task)
+{
+	ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+}
+#endif
+
 static int ssb_prctl_get(struct task_struct *task)
 {
 	switch (ssb_mode) {
@@ -587,17 +605,6 @@ static int ssb_prctl_get(struct task_struct *task)
 	}
 }
 
-int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
-			     unsigned long ctrl)
-{
-	switch (which) {
-	case PR_SPEC_STORE_BYPASS:
-		return ssb_prctl_set(task, ctrl);
-	default:
-		return -ENODEV;
-	}
-}
-
 int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
 {
 	switch (which) {
diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index a908c954484d..0c5ef54fd416 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -62,5 +62,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which);
 int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
 			     unsigned long ctrl);
+/* Speculation control for seccomp enforced mitigation */
+void arch_seccomp_spec_mitigate(struct task_struct *task);
 
 #endif /* _LINUX_NOSPEC_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 53eb946120c1..e691d9a6c58d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -229,18 +229,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
 	return true;
 }
 
-/*
- * If a given speculation mitigation is opt-in (prctl()-controlled),
- * select it, by disabling speculation (enabling mitigation).
- */
-static inline void spec_mitigate(struct task_struct *task,
-				 unsigned long which)
-{
-	int state = arch_prctl_spec_ctrl_get(task, which);
-
-	if (state > 0 && (state & PR_SPEC_PRCTL))
-		arch_prctl_spec_ctrl_set(task, which, PR_SPEC_FORCE_DISABLE);
-}
+void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
 
 static inline void seccomp_assign_mode(struct task_struct *task,
 				       unsigned long seccomp_mode,
@@ -256,7 +245,7 @@ static inline void seccomp_assign_mode(struct task_struct *task,
 	smp_mb__before_atomic();
 	/* Assume default seccomp processes want spec flaw mitigation. */
 	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
-		spec_mitigate(task, PR_SPEC_STORE_BYPASS);
+		arch_seccomp_spec_mitigate(task);
 	set_tsk_thread_flag(task, TIF_SECCOMP);
 }
 

From f21b53b20c754021935ea43364dbf53778eeba32 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 3 May 2018 14:37:54 -0700
Subject: [PATCH 071/393] x86/speculation: Make "seccomp" the default mode for
 Speculative Store Bypass

Unless explicitly opted out of, anything running under seccomp will have
SSB mitigations enabled. Choosing the "prctl" mode will disable this.

[ tglx: Adjusted it to the new arch_seccomp_spec_mitigate() mechanism ]

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 .../admin-guide/kernel-parameters.txt         | 26 +++++++++------
 arch/x86/include/asm/nospec-branch.h          |  1 +
 arch/x86/kernel/cpu/bugs.c                    | 32 +++++++++++++------
 3 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a8d2ae1e335b..f2040d46f095 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4049,19 +4049,27 @@
 			This parameter controls whether the Speculative Store
 			Bypass optimization is used.
 
-			on     - Unconditionally disable Speculative Store Bypass
-			off    - Unconditionally enable Speculative Store Bypass
-			auto   - Kernel detects whether the CPU model contains an
-				 implementation of Speculative Store Bypass and
-				 picks the most appropriate mitigation.
-			prctl  - Control Speculative Store Bypass per thread
-				 via prctl. Speculative Store Bypass is enabled
-				 for a process by default. The state of the control
-				 is inherited on fork.
+			on      - Unconditionally disable Speculative Store Bypass
+			off     - Unconditionally enable Speculative Store Bypass
+			auto    - Kernel detects whether the CPU model contains an
+				  implementation of Speculative Store Bypass and
+				  picks the most appropriate mitigation. If the
+				  CPU is not vulnerable, "off" is selected. If the
+				  CPU is vulnerable the default mitigation is
+				  architecture and Kconfig dependent. See below.
+			prctl   - Control Speculative Store Bypass per thread
+				  via prctl. Speculative Store Bypass is enabled
+				  for a process by default. The state of the control
+				  is inherited on fork.
+			seccomp - Same as "prctl" above, but all seccomp threads
+				  will disable SSB unless they explicitly opt out.
 
 			Not specifying this option is equivalent to
 			spec_store_bypass_disable=auto.
 
+			Default mitigations:
+			X86:	If CONFIG_SECCOMP=y "seccomp", otherwise "prctl"
+
 	spia_io_base=	[HW,MTD]
 	spia_fio_base=
 	spia_pedr=
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 71ad01422655..328ea3cb769f 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -233,6 +233,7 @@ enum ssb_mitigation {
 	SPEC_STORE_BYPASS_NONE,
 	SPEC_STORE_BYPASS_DISABLE,
 	SPEC_STORE_BYPASS_PRCTL,
+	SPEC_STORE_BYPASS_SECCOMP,
 };
 
 extern char __indirect_thunk_start[];
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 5dab4c3d26e7..563d8e54c863 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -416,22 +416,25 @@ enum ssb_mitigation_cmd {
 	SPEC_STORE_BYPASS_CMD_AUTO,
 	SPEC_STORE_BYPASS_CMD_ON,
 	SPEC_STORE_BYPASS_CMD_PRCTL,
+	SPEC_STORE_BYPASS_CMD_SECCOMP,
 };
 
 static const char *ssb_strings[] = {
 	[SPEC_STORE_BYPASS_NONE]	= "Vulnerable",
 	[SPEC_STORE_BYPASS_DISABLE]	= "Mitigation: Speculative Store Bypass disabled",
-	[SPEC_STORE_BYPASS_PRCTL]	= "Mitigation: Speculative Store Bypass disabled via prctl"
+	[SPEC_STORE_BYPASS_PRCTL]	= "Mitigation: Speculative Store Bypass disabled via prctl",
+	[SPEC_STORE_BYPASS_SECCOMP]	= "Mitigation: Speculative Store Bypass disabled via prctl and seccomp",
 };
 
 static const struct {
 	const char *option;
 	enum ssb_mitigation_cmd cmd;
 } ssb_mitigation_options[] = {
-	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO },  /* Platform decides */
-	{ "on",		SPEC_STORE_BYPASS_CMD_ON },    /* Disable Speculative Store Bypass */
-	{ "off",	SPEC_STORE_BYPASS_CMD_NONE },  /* Don't touch Speculative Store Bypass */
-	{ "prctl",	SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */
+	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO },    /* Platform decides */
+	{ "on",		SPEC_STORE_BYPASS_CMD_ON },      /* Disable Speculative Store Bypass */
+	{ "off",	SPEC_STORE_BYPASS_CMD_NONE },    /* Don't touch Speculative Store Bypass */
+	{ "prctl",	SPEC_STORE_BYPASS_CMD_PRCTL },   /* Disable Speculative Store Bypass via prctl */
+	{ "seccomp",	SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */
 };
 
 static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
@@ -481,8 +484,15 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 
 	switch (cmd) {
 	case SPEC_STORE_BYPASS_CMD_AUTO:
-		/* Choose prctl as the default mode */
-		mode = SPEC_STORE_BYPASS_PRCTL;
+	case SPEC_STORE_BYPASS_CMD_SECCOMP:
+		/*
+		 * Choose prctl+seccomp as the default mode if seccomp is
+		 * enabled.
+		 */
+		if (IS_ENABLED(CONFIG_SECCOMP))
+			mode = SPEC_STORE_BYPASS_SECCOMP;
+		else
+			mode = SPEC_STORE_BYPASS_PRCTL;
 		break;
 	case SPEC_STORE_BYPASS_CMD_ON:
 		mode = SPEC_STORE_BYPASS_DISABLE;
@@ -530,12 +540,14 @@ static void ssb_select_mitigation()
 }
 
 #undef pr_fmt
+#define pr_fmt(fmt)     "Speculation prctl: " fmt
 
 static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
 {
 	bool update;
 
-	if (ssb_mode != SPEC_STORE_BYPASS_PRCTL)
+	if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
+	    ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
 		return -ENXIO;
 
 	switch (ctrl) {
@@ -583,7 +595,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
 #ifdef CONFIG_SECCOMP
 void arch_seccomp_spec_mitigate(struct task_struct *task)
 {
-	ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+	if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
+		ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
 }
 #endif
 
@@ -592,6 +605,7 @@ static int ssb_prctl_get(struct task_struct *task)
 	switch (ssb_mode) {
 	case SPEC_STORE_BYPASS_DISABLE:
 		return PR_SPEC_DISABLE;
+	case SPEC_STORE_BYPASS_SECCOMP:
 	case SPEC_STORE_BYPASS_PRCTL:
 		if (task_spec_ssb_force_disable(task))
 			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;

From 388126a3e6c706eabf2359839753d550639c25ff Mon Sep 17 00:00:00 2001
From: Fabio Estevam <fabio.estevam@nxp.com>
Date: Tue, 24 Apr 2018 17:25:19 -0300
Subject: [PATCH 072/393] ARM: dts: imx7s: Pass the 'fsl,sec-era' property

Currently the following error is seen from the CAAM driver:

caam 30900000.caam: device ID = 0x0a16030000000000 (Era -524)

Pass the 'fsl,sec-era' property to properly describe the
era information.

This error happens because the 'fsl,sec-era' is not passed via
device tree.

The era information is used in various places inside drivers/crypto/caam,
so pass the correct version via device tree.

Fixes: 0eeabcad7da5 ("ARM: dts: imx7s: add CAAM device node")
Signed-off-by: Fabio Estevam <fabio.estevam@nxp.com>
Signed-off-by: Shawn Guo <shawnguo@kernel.org>
---
 arch/arm/boot/dts/imx7s.dtsi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/imx7s.dtsi b/arch/arm/boot/dts/imx7s.dtsi
index 4d42335c0dee..ce85b3ca1a55 100644
--- a/arch/arm/boot/dts/imx7s.dtsi
+++ b/arch/arm/boot/dts/imx7s.dtsi
@@ -868,6 +868,7 @@ sai3: sai@308c0000 {
 
 			crypto: caam@30900000 {
 				compatible = "fsl,sec-v4.0";
+				fsl,sec-era = <8>;
 				#address-cells = <1>;
 				#size-cells = <1>;
 				reg = <0x30900000 0x40000>;

From 6dd85fbb87d1d6b87a3b1f02ca28d7b2abd2e7ba Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 20 Apr 2018 16:49:46 +0200
Subject: [PATCH 073/393] s390: move expoline assembler macros to a header

To be able to use the expoline branches in different assembler
files move the associated macros from entry.S to a new header
nospec-insn.h.

While we are at it make the macros a bit nicer to use.

Cc: stable@vger.kernel.org # 4.16
Fixes: f19fbd5ed6 ("s390: introduce execute-trampolines for branches")
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/nospec-insn.h | 127 ++++++++++++++++++++++++++++
 arch/s390/kernel/entry.S            | 105 ++++++-----------------
 2 files changed, 151 insertions(+), 81 deletions(-)
 create mode 100644 arch/s390/include/asm/nospec-insn.h

diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h
new file mode 100644
index 000000000000..440689cbcf51
--- /dev/null
+++ b/arch/s390/include/asm/nospec-insn.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_NOSPEC_ASM_H
+#define _ASM_S390_NOSPEC_ASM_H
+
+#include <asm/dwarf.h>
+
+#ifdef __ASSEMBLY__
+
+#ifdef CONFIG_EXPOLINE
+
+/*
+ * The expoline macros are used to create thunks in the same format
+ * as gcc generates them. The 'comdat' section flag makes sure that
+ * the various thunks are merged into a single copy.
+ */
+	.macro __THUNK_PROLOG_NAME name
+	.pushsection .text.\name,"axG",@progbits,\name,comdat
+	.globl \name
+	.hidden \name
+	.type \name,@function
+\name:
+	CFI_STARTPROC
+	.endm
+
+	.macro __THUNK_EPILOG
+	CFI_ENDPROC
+	.popsection
+	.endm
+
+	.macro __THUNK_PROLOG_BR r1,r2
+	__THUNK_PROLOG_NAME __s390x_indirect_jump_r\r2\()use_r\r1
+	.endm
+
+	.macro __THUNK_BR r1,r2
+	jg	__s390x_indirect_jump_r\r2\()use_r\r1
+	.endm
+
+	.macro __THUNK_BRASL r1,r2,r3
+	brasl	\r1,__s390x_indirect_jump_r\r3\()use_r\r2
+	.endm
+
+	.macro	__DECODE_RR expand,reg,ruse
+	.set __decode_fail,1
+	.irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+	.ifc \reg,%r\r1
+	.irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+	.ifc \ruse,%r\r2
+	\expand \r1,\r2
+	.set __decode_fail,0
+	.endif
+	.endr
+	.endif
+	.endr
+	.if __decode_fail == 1
+	.error "__DECODE_RR failed"
+	.endif
+	.endm
+
+	.macro	__DECODE_RRR expand,rsave,rtarget,ruse
+	.set __decode_fail,1
+	.irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+	.ifc \rsave,%r\r1
+	.irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+	.ifc \rtarget,%r\r2
+	.irp r3,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+	.ifc \ruse,%r\r3
+	\expand \r1,\r2,\r3
+	.set __decode_fail,0
+	.endif
+	.endr
+	.endif
+	.endr
+	.endif
+	.endr
+	.if __decode_fail == 1
+	.error "__DECODE_RRR failed"
+	.endif
+	.endm
+
+	.macro __THUNK_EX_BR reg,ruse
+#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
+	exrl	0,555f
+	j	.
+#else
+	larl	\ruse,555f
+	ex	0,0(\ruse)
+	j	.
+#endif
+555:	br	\reg
+	.endm
+
+	.macro GEN_BR_THUNK reg,ruse=%r1
+	__DECODE_RR __THUNK_PROLOG_BR,\reg,\ruse
+	__THUNK_EX_BR \reg,\ruse
+	__THUNK_EPILOG
+	.endm
+
+	.macro BR_EX reg,ruse=%r1
+557:	__DECODE_RR __THUNK_BR,\reg,\ruse
+	.pushsection .s390_indirect_branches,"a",@progbits
+	.long	557b-.
+	.popsection
+	.endm
+
+	.macro BASR_EX rsave,rtarget,ruse=%r1
+559:	__DECODE_RRR __THUNK_BRASL,\rsave,\rtarget,\ruse
+	.pushsection .s390_indirect_branches,"a",@progbits
+	.long	559b-.
+	.popsection
+	.endm
+
+#else
+	.macro GEN_BR_THUNK reg,ruse=%r1
+	.endm
+
+	 .macro BR_EX reg,ruse=%r1
+	br	\reg
+	.endm
+
+	.macro BASR_EX rsave,rtarget,ruse=%r1
+	basr	\rsave,\rtarget
+	.endm
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_S390_NOSPEC_ASM_H */
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 3f22f139a041..f03402efab4b 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -28,6 +28,7 @@
 #include <asm/setup.h>
 #include <asm/nmi.h>
 #include <asm/export.h>
+#include <asm/nospec-insn.h>
 
 __PT_R0      =	__PT_GPRS
 __PT_R1      =	__PT_GPRS + 8
@@ -183,67 +184,9 @@ _LPP_OFFSET	= __LC_LPP
 		    "jnz .+8; .long 0xb2e8d000", 82
 	.endm
 
-#ifdef CONFIG_EXPOLINE
-
-	.macro GEN_BR_THUNK name,reg,tmp
-	.section .text.\name,"axG",@progbits,\name,comdat
-	.globl \name
-	.hidden \name
-	.type \name,@function
-\name:
-	CFI_STARTPROC
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
-	exrl	0,0f
-#else
-	larl	\tmp,0f
-	ex	0,0(\tmp)
-#endif
-	j	.
-0:	br	\reg
-	CFI_ENDPROC
-	.endm
-
-	GEN_BR_THUNK __s390x_indirect_jump_r1use_r9,%r9,%r1
-	GEN_BR_THUNK __s390x_indirect_jump_r1use_r14,%r14,%r1
-	GEN_BR_THUNK __s390x_indirect_jump_r11use_r14,%r14,%r11
-
-	.macro BASR_R14_R9
-0:	brasl	%r14,__s390x_indirect_jump_r1use_r9
-	.pushsection .s390_indirect_branches,"a",@progbits
-	.long	0b-.
-	.popsection
-	.endm
-
-	.macro BR_R1USE_R14
-0:	jg	__s390x_indirect_jump_r1use_r14
-	.pushsection .s390_indirect_branches,"a",@progbits
-	.long	0b-.
-	.popsection
-	.endm
-
-	.macro BR_R11USE_R14
-0:	jg	__s390x_indirect_jump_r11use_r14
-	.pushsection .s390_indirect_branches,"a",@progbits
-	.long	0b-.
-	.popsection
-	.endm
-
-#else	/* CONFIG_EXPOLINE */
-
-	.macro BASR_R14_R9
-	basr	%r14,%r9
-	.endm
-
-	.macro BR_R1USE_R14
-	br	%r14
-	.endm
-
-	.macro BR_R11USE_R14
-	br	%r14
-	.endm
-
-#endif /* CONFIG_EXPOLINE */
-
+	GEN_BR_THUNK %r9
+	GEN_BR_THUNK %r14
+	GEN_BR_THUNK %r14,%r11
 
 	.section .kprobes.text, "ax"
 .Ldummy:
@@ -260,7 +203,7 @@ _LPP_OFFSET	= __LC_LPP
 ENTRY(__bpon)
 	.globl __bpon
 	BPON
-	BR_R1USE_R14
+	BR_EX	%r14
 
 /*
  * Scheduler resume function, called by switch_to
@@ -284,7 +227,7 @@ ENTRY(__switch_to)
 	mvc	__LC_CURRENT_PID(4,%r0),0(%r3)	# store pid of next
 	lmg	%r6,%r15,__SF_GPRS(%r15)	# load gprs of next task
 	ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40
-	BR_R1USE_R14
+	BR_EX	%r14
 
 .L__critical_start:
 
@@ -351,7 +294,7 @@ sie_exit:
 	xgr	%r5,%r5
 	lmg	%r6,%r14,__SF_GPRS(%r15)	# restore kernel registers
 	lg	%r2,__SF_SIE_REASON(%r15)	# return exit reason code
-	BR_R1USE_R14
+	BR_EX	%r14
 .Lsie_fault:
 	lghi	%r14,-EFAULT
 	stg	%r14,__SF_SIE_REASON(%r15)	# set exit reason code
@@ -410,7 +353,7 @@ ENTRY(system_call)
 	lgf	%r9,0(%r8,%r10)			# get system call add.
 	TSTMSK	__TI_flags(%r12),_TIF_TRACE
 	jnz	.Lsysc_tracesys
-	BASR_R14_R9				# call sys_xxxx
+	BASR_EX	%r14,%r9			# call sys_xxxx
 	stg	%r2,__PT_R2(%r11)		# store return value
 
 .Lsysc_return:
@@ -595,7 +538,7 @@ ENTRY(system_call)
 	lmg	%r3,%r7,__PT_R3(%r11)
 	stg	%r7,STACK_FRAME_OVERHEAD(%r15)
 	lg	%r2,__PT_ORIG_GPR2(%r11)
-	BASR_R14_R9			# call sys_xxx
+	BASR_EX	%r14,%r9		# call sys_xxx
 	stg	%r2,__PT_R2(%r11)	# store return value
 .Lsysc_tracenogo:
 	TSTMSK	__TI_flags(%r12),_TIF_TRACE
@@ -619,7 +562,7 @@ ENTRY(ret_from_fork)
 	lmg	%r9,%r10,__PT_R9(%r11)	# load gprs
 ENTRY(kernel_thread_starter)
 	la	%r2,0(%r10)
-	BASR_R14_R9
+	BASR_EX	%r14,%r9
 	j	.Lsysc_tracenogo
 
 /*
@@ -701,7 +644,7 @@ ENTRY(pgm_check_handler)
 	je	.Lpgm_return
 	lgf	%r9,0(%r10,%r1)		# load address of handler routine
 	lgr	%r2,%r11		# pass pointer to pt_regs
-	BASR_R14_R9			# branch to interrupt-handler
+	BASR_EX	%r14,%r9		# branch to interrupt-handler
 .Lpgm_return:
 	LOCKDEP_SYS_EXIT
 	tm	__PT_PSW+1(%r11),0x01	# returning to user ?
@@ -1019,7 +962,7 @@ ENTRY(psw_idle)
 	stpt	__TIMER_IDLE_ENTER(%r2)
 .Lpsw_idle_lpsw:
 	lpswe	__SF_EMPTY(%r15)
-	BR_R1USE_R14
+	BR_EX	%r14
 .Lpsw_idle_end:
 
 /*
@@ -1061,7 +1004,7 @@ ENTRY(save_fpu_regs)
 .Lsave_fpu_regs_done:
 	oi	__LC_CPU_FLAGS+7,_CIF_FPU
 .Lsave_fpu_regs_exit:
-	BR_R1USE_R14
+	BR_EX	%r14
 .Lsave_fpu_regs_end:
 EXPORT_SYMBOL(save_fpu_regs)
 
@@ -1107,7 +1050,7 @@ load_fpu_regs:
 .Lload_fpu_regs_done:
 	ni	__LC_CPU_FLAGS+7,255-_CIF_FPU
 .Lload_fpu_regs_exit:
-	BR_R1USE_R14
+	BR_EX	%r14
 .Lload_fpu_regs_end:
 
 .L__critical_end:
@@ -1322,7 +1265,7 @@ cleanup_critical:
 	jl	0f
 	clg	%r9,BASED(.Lcleanup_table+104)	# .Lload_fpu_regs_end
 	jl	.Lcleanup_load_fpu_regs
-0:	BR_R11USE_R14
+0:	BR_EX	%r14
 
 	.align	8
 .Lcleanup_table:
@@ -1358,7 +1301,7 @@ cleanup_critical:
 	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
 	larl	%r9,sie_exit			# skip forward to sie_exit
-	BR_R11USE_R14
+	BR_EX	%r14
 #endif
 
 .Lcleanup_system_call:
@@ -1412,7 +1355,7 @@ cleanup_critical:
 	stg	%r15,56(%r11)		# r15 stack pointer
 	# set new psw address and exit
 	larl	%r9,.Lsysc_do_svc
-	BR_R11USE_R14
+	BR_EX	%r14,%r11
 .Lcleanup_system_call_insn:
 	.quad	system_call
 	.quad	.Lsysc_stmg
@@ -1424,7 +1367,7 @@ cleanup_critical:
 
 .Lcleanup_sysc_tif:
 	larl	%r9,.Lsysc_tif
-	BR_R11USE_R14
+	BR_EX	%r14,%r11
 
 .Lcleanup_sysc_restore:
 	# check if stpt has been executed
@@ -1441,14 +1384,14 @@ cleanup_critical:
 	mvc	0(64,%r11),__PT_R8(%r9)
 	lmg	%r0,%r7,__PT_R0(%r9)
 1:	lmg	%r8,%r9,__LC_RETURN_PSW
-	BR_R11USE_R14
+	BR_EX	%r14,%r11
 .Lcleanup_sysc_restore_insn:
 	.quad	.Lsysc_exit_timer
 	.quad	.Lsysc_done - 4
 
 .Lcleanup_io_tif:
 	larl	%r9,.Lio_tif
-	BR_R11USE_R14
+	BR_EX	%r14,%r11
 
 .Lcleanup_io_restore:
 	# check if stpt has been executed
@@ -1462,7 +1405,7 @@ cleanup_critical:
 	mvc	0(64,%r11),__PT_R8(%r9)
 	lmg	%r0,%r7,__PT_R0(%r9)
 1:	lmg	%r8,%r9,__LC_RETURN_PSW
-	BR_R11USE_R14
+	BR_EX	%r14,%r11
 .Lcleanup_io_restore_insn:
 	.quad	.Lio_exit_timer
 	.quad	.Lio_done - 4
@@ -1515,17 +1458,17 @@ cleanup_critical:
 	# prepare return psw
 	nihh	%r8,0xfcfd		# clear irq & wait state bits
 	lg	%r9,48(%r11)		# return from psw_idle
-	BR_R11USE_R14
+	BR_EX	%r14,%r11
 .Lcleanup_idle_insn:
 	.quad	.Lpsw_idle_lpsw
 
 .Lcleanup_save_fpu_regs:
 	larl	%r9,save_fpu_regs
-	BR_R11USE_R14
+	BR_EX	%r14,%r11
 
 .Lcleanup_load_fpu_regs:
 	larl	%r9,load_fpu_regs
-	BR_R11USE_R14
+	BR_EX	%r14,%r11
 
 /*
  * Integer constants

From 467a3bf219cee12259182c5cb4821f88fd518a51 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 23 Apr 2018 14:31:36 +0200
Subject: [PATCH 074/393] s390/crc32-vx: use expoline for indirect branches

The return from the crc32_le_vgfm_16/crc32c_le_vgfm_16 and the
crc32_be_vgfm_16 functions are done with "br %r14". These are indirect
branches as well and need to use execute trampolines for CONFIG_EXPOLINE=y.

Cc: stable@vger.kernel.org # 4.16
Fixes: f19fbd5ed6 ("s390: introduce execute-trampolines for branches")
Reviewed-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/crypto/crc32be-vx.S | 5 ++++-
 arch/s390/crypto/crc32le-vx.S | 4 +++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S
index e8077f0971f8..2bf01ba44107 100644
--- a/arch/s390/crypto/crc32be-vx.S
+++ b/arch/s390/crypto/crc32be-vx.S
@@ -13,6 +13,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/nospec-insn.h>
 #include <asm/vx-insn.h>
 
 /* Vector register range containing CRC-32 constants */
@@ -67,6 +68,8 @@
 
 .previous
 
+	GEN_BR_THUNK %r14
+
 .text
 /*
  * The CRC-32 function(s) use these calling conventions:
@@ -203,6 +206,6 @@ ENTRY(crc32_be_vgfm_16)
 
 .Ldone:
 	VLGVF	%r2,%v2,3
-	br	%r14
+	BR_EX	%r14
 
 .previous
diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S
index d8c67a58c0c5..7d6f568bd3ad 100644
--- a/arch/s390/crypto/crc32le-vx.S
+++ b/arch/s390/crypto/crc32le-vx.S
@@ -14,6 +14,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/nospec-insn.h>
 #include <asm/vx-insn.h>
 
 /* Vector register range containing CRC-32 constants */
@@ -76,6 +77,7 @@
 
 .previous
 
+	GEN_BR_THUNK %r14
 
 .text
 
@@ -264,6 +266,6 @@ crc32_le_vgfm_generic:
 
 .Ldone:
 	VLGVF	%r2,%v2,2
-	br	%r14
+	BR_EX	%r14
 
 .previous

From 97489e0663fa700d6e7febddc43b58df98d7bcda Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 23 Apr 2018 14:31:36 +0200
Subject: [PATCH 075/393] s390/lib: use expoline for indirect branches

The return from the memmove, memset, memcpy, __memset16, __memset32 and
__memset64 functions are done with "br %r14". These are indirect branches
as well and need to use execute trampolines for CONFIG_EXPOLINE=y.

Cc: stable@vger.kernel.org # 4.16
Fixes: f19fbd5ed6 ("s390: introduce execute-trampolines for branches")
Reviewed-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/lib/mem.S | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S
index 495c9c4bacc7..2311f15be9cf 100644
--- a/arch/s390/lib/mem.S
+++ b/arch/s390/lib/mem.S
@@ -7,6 +7,9 @@
 
 #include <linux/linkage.h>
 #include <asm/export.h>
+#include <asm/nospec-insn.h>
+
+	GEN_BR_THUNK %r14
 
 /*
  * void *memmove(void *dest, const void *src, size_t n)
@@ -33,14 +36,14 @@ ENTRY(memmove)
 .Lmemmove_forward_remainder:
 	larl	%r5,.Lmemmove_mvc
 	ex	%r4,0(%r5)
-	br	%r14
+	BR_EX	%r14
 .Lmemmove_reverse:
 	ic	%r0,0(%r4,%r3)
 	stc	%r0,0(%r4,%r1)
 	brctg	%r4,.Lmemmove_reverse
 	ic	%r0,0(%r4,%r3)
 	stc	%r0,0(%r4,%r1)
-	br	%r14
+	BR_EX	%r14
 .Lmemmove_mvc:
 	mvc	0(1,%r1),0(%r3)
 EXPORT_SYMBOL(memmove)
@@ -77,7 +80,7 @@ ENTRY(memset)
 .Lmemset_clear_remainder:
 	larl	%r3,.Lmemset_xc
 	ex	%r4,0(%r3)
-	br	%r14
+	BR_EX	%r14
 .Lmemset_fill:
 	cghi	%r4,1
 	lgr	%r1,%r2
@@ -95,10 +98,10 @@ ENTRY(memset)
 	stc	%r3,0(%r1)
 	larl	%r5,.Lmemset_mvc
 	ex	%r4,0(%r5)
-	br	%r14
+	BR_EX	%r14
 .Lmemset_fill_exit:
 	stc	%r3,0(%r1)
-	br	%r14
+	BR_EX	%r14
 .Lmemset_xc:
 	xc	0(1,%r1),0(%r1)
 .Lmemset_mvc:
@@ -121,7 +124,7 @@ ENTRY(memcpy)
 .Lmemcpy_remainder:
 	larl	%r5,.Lmemcpy_mvc
 	ex	%r4,0(%r5)
-	br	%r14
+	BR_EX	%r14
 .Lmemcpy_loop:
 	mvc	0(256,%r1),0(%r3)
 	la	%r1,256(%r1)
@@ -159,10 +162,10 @@ ENTRY(__memset\bits)
 	\insn	%r3,0(%r1)
 	larl	%r5,.L__memset_mvc\bits
 	ex	%r4,0(%r5)
-	br	%r14
+	BR_EX	%r14
 .L__memset_exit\bits:
 	\insn	%r3,0(%r2)
-	br	%r14
+	BR_EX	%r14
 .L__memset_mvc\bits:
 	mvc	\bytes(1,%r1),0(%r1)
 .endm

From bb765d1c331f62b59049d35607ed2e365802bef9 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Wed, 4 Apr 2018 21:03:21 +0200
Subject: [PATCH 076/393] tee: shm: fix use-after-free via temporarily dropped
 reference

Bump the file's refcount before moving the reference into the fd table,
not afterwards. The old code could drop the file's refcount to zero for a
short moment before calling get_file() via get_dma_buf().

This code can only be triggered on ARM systems that use Linaro's OP-TEE.

Fixes: 967c9cca2cc5 ("tee: generic TEE subsystem")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/tee_shm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 556960a1bab3..07d3be6f0780 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -360,9 +360,10 @@ int tee_shm_get_fd(struct tee_shm *shm)
 	if (!(shm->flags & TEE_SHM_DMA_BUF))
 		return -EINVAL;
 
+	get_dma_buf(shm->dmabuf);
 	fd = dma_buf_fd(shm->dmabuf, O_CLOEXEC);
-	if (fd >= 0)
-		get_dma_buf(shm->dmabuf);
+	if (fd < 0)
+		dma_buf_put(shm->dmabuf);
 	return fd;
 }
 

From ab9d3db5b320a052452b9cd035599ee3c84bbee9 Mon Sep 17 00:00:00 2001
From: Etienne Carriere <etienne.carriere@linaro.org>
Date: Sun, 29 Apr 2018 14:22:29 +0200
Subject: [PATCH 077/393] tee: check shm references are consistent in
 offset/size

This change prevents userland from referencing TEE shared memory
outside the area initially allocated by its owner. Prior this change an
application could not reference or access memory it did not own but
it could reference memory not explicitly allocated by owner but still
allocated to the owner due to the memory allocation granule.

Reported-by: Alexandre Jutras <alexandre.jutras@nxp.com>
Signed-off-by: Etienne Carriere <etienne.carriere@linaro.org>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/tee_core.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
index 0124a91c8d71..dd46b758852a 100644
--- a/drivers/tee/tee_core.c
+++ b/drivers/tee/tee_core.c
@@ -238,6 +238,17 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params,
 			if (IS_ERR(shm))
 				return PTR_ERR(shm);
 
+			/*
+			 * Ensure offset + size does not overflow offset
+			 * and does not overflow the size of the referred
+			 * shared memory object.
+			 */
+			if ((ip.a + ip.b) < ip.a ||
+			    (ip.a + ip.b) > shm->size) {
+				tee_shm_put(shm);
+				return -EINVAL;
+			}
+
 			params[n].u.memref.shm_offs = ip.a;
 			params[n].u.memref.size = ip.b;
 			params[n].u.memref.shm = shm;

From 41bd6adf3c5d3990d7a67639aff53d54483744fa Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 7 May 2018 08:28:17 -0700
Subject: [PATCH 078/393] Revert "ARM: dts: logicpd-som-lv: Fix pinmux
 controller references"

This reverts commit 30443b3104527c83102fa85347ae4bf21caaf77a.

Turns out this causes other issues as reported by Adam.

Signed-off-by: Tony Lingren <tony@atomide.com>
---
 arch/arm/boot/dts/logicpd-som-lv.dtsi | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm/boot/dts/logicpd-som-lv.dtsi b/arch/arm/boot/dts/logicpd-som-lv.dtsi
index efd8c3351e10..6fa7bba3e801 100644
--- a/arch/arm/boot/dts/logicpd-som-lv.dtsi
+++ b/arch/arm/boot/dts/logicpd-som-lv.dtsi
@@ -206,6 +206,8 @@ OMAP3_CORE1_IOPAD(0x20ba, PIN_OUTPUT | MUX_MODE4)        /* gpmc_ncs6.gpio_57 */
 };
 
 &omap3_pmx_wkup {
+	pinctrl-names = "default";
+	pinctrl-0 = <&hsusb2_reset_pin>;
 	hsusb2_reset_pin: pinmux_hsusb1_reset_pin {
 		pinctrl-single,pins = <
 			OMAP3_WKUP_IOPAD(0x2a0e, PIN_OUTPUT | MUX_MODE4)	/* sys_boot2.gpio_4 */
@@ -232,6 +234,8 @@ OMAP3_CORE1_IOPAD(0x21c4, PIN_INPUT | MUX_MODE0)	/* i2c3_sda */
 };
 
 &omap3_pmx_core2 {
+	pinctrl-names = "default";
+	pinctrl-0 = <&hsusb2_2_pins>;
 	hsusb2_2_pins: pinmux_hsusb2_2_pins {
 		pinctrl-single,pins = <
 			OMAP3630_CORE2_IOPAD(0x25f0, PIN_OUTPUT | MUX_MODE3)            /* etk_d10.hsusb2_clk */

From 675c7215aacf54242b2e8bc64bab698abbe764db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20P=C3=A9ron?= <peron.clem@gmail.com>
Date: Thu, 3 May 2018 17:32:07 +0200
Subject: [PATCH 079/393] ARM: dts: cygnus: fix irq type for arm global timer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As per ARM documentation
PPI(0) ID27 - global timer interrupt is rising-edge sensitive.

set IRQ triggering type to IRQ_TYPE_EDGE_RISING for ARM Global timers.

Fixes: c9ad7bc5fe3 ("ARM: dts: Enable Broadcom Cygnus SoC")
Signed-off-by: Clément Péron <peron.clem@gmail.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 arch/arm/boot/dts/bcm-cygnus.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/bcm-cygnus.dtsi b/arch/arm/boot/dts/bcm-cygnus.dtsi
index 699fdf94d139..9fe4f5a6379e 100644
--- a/arch/arm/boot/dts/bcm-cygnus.dtsi
+++ b/arch/arm/boot/dts/bcm-cygnus.dtsi
@@ -69,7 +69,7 @@ core {
 		timer@20200 {
 			compatible = "arm,cortex-a9-global-timer";
 			reg = <0x20200 0x100>;
-			interrupts = <GIC_PPI 11 IRQ_TYPE_LEVEL_HIGH>;
+			interrupts = <GIC_PPI 11 IRQ_TYPE_EDGE_RISING>;
 			clocks = <&periph_clk>;
 		};
 

From 23a4d7fd34856da8218c4cfc23dba7a6ec0a423a Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 25 Apr 2018 18:35:26 +0200
Subject: [PATCH 080/393] s390/ftrace: use expoline for indirect branches

The return from the ftrace_stub, _mcount, ftrace_caller and
return_to_handler functions is done with "br %r14" and "br %r1".
These are indirect branches as well and need to use execute
trampolines for CONFIG_EXPOLINE=y.

The ftrace_caller function is a special case as it returns to the
start of a function and may only use %r0 and %r1. For a pre z10
machine the standard execute trampoline uses a LARL + EX to do
this, but this requires *two* registers in the range %r1..%r15.
To get around this the 'br %r1' located in the lowcore is used,
then the EX instruction does not need an address register.
But the lowcore trick may only be used for pre z14 machines,
with noexec=on the mapping for the first page may not contain
instructions. The solution for that is an ALTERNATIVE in the
expoline THUNK generated by 'GEN_BR_THUNK %r1' to switch to
EXRL, this relies on the fact that a machine that supports
noexec=on has EXRL as well.

Cc: stable@vger.kernel.org # 4.16
Fixes: f19fbd5ed6 ("s390: introduce execute-trampolines for branches")
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/nospec-insn.h | 12 ++++++++++++
 arch/s390/kernel/asm-offsets.c      |  1 +
 arch/s390/kernel/mcount.S           | 14 +++++++++-----
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h
index 440689cbcf51..7d7640e1cf90 100644
--- a/arch/s390/include/asm/nospec-insn.h
+++ b/arch/s390/include/asm/nospec-insn.h
@@ -2,12 +2,16 @@
 #ifndef _ASM_S390_NOSPEC_ASM_H
 #define _ASM_S390_NOSPEC_ASM_H
 
+#include <asm/alternative-asm.h>
+#include <asm/asm-offsets.h>
 #include <asm/dwarf.h>
 
 #ifdef __ASSEMBLY__
 
 #ifdef CONFIG_EXPOLINE
 
+_LC_BR_R1 = __LC_BR_R1
+
 /*
  * The expoline macros are used to create thunks in the same format
  * as gcc generates them. The 'comdat' section flag makes sure that
@@ -78,13 +82,21 @@
 	.endm
 
 	.macro __THUNK_EX_BR reg,ruse
+	# Be very careful when adding instructions to this macro!
+	# The ALTERNATIVE replacement code has a .+10 which targets
+	# the "br \reg" after the code has been patched.
 #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
 	exrl	0,555f
 	j	.
 #else
+	.ifc \reg,%r1
+	ALTERNATIVE "ex %r0,_LC_BR_R1", ".insn ril,0xc60000000000,0,.+10", 35
+	j	.
+	.else
 	larl	\ruse,555f
 	ex	0,0(\ruse)
 	j	.
+	.endif
 #endif
 555:	br	\reg
 	.endm
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index eb2a5c0443cd..11aea745a2a6 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -181,6 +181,7 @@ int main(void)
 	OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags);
 	OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count);
 	OFFSET(__LC_GMAP, lowcore, gmap);
+	OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline);
 	/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
 	OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
 	/* hardware defined lowcore locations 0x1000 - 0x18ff */
diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S
index 82df7d80fab2..27110f3294ed 100644
--- a/arch/s390/kernel/mcount.S
+++ b/arch/s390/kernel/mcount.S
@@ -9,13 +9,17 @@
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
 #include <asm/ftrace.h>
+#include <asm/nospec-insn.h>
 #include <asm/ptrace.h>
 #include <asm/export.h>
 
+	GEN_BR_THUNK %r1
+	GEN_BR_THUNK %r14
+
 	.section .kprobes.text, "ax"
 
 ENTRY(ftrace_stub)
-	br	%r14
+	BR_EX	%r14
 
 #define STACK_FRAME_SIZE  (STACK_FRAME_OVERHEAD + __PT_SIZE)
 #define STACK_PTREGS	  (STACK_FRAME_OVERHEAD)
@@ -23,7 +27,7 @@ ENTRY(ftrace_stub)
 #define STACK_PTREGS_PSW  (STACK_PTREGS + __PT_PSW)
 
 ENTRY(_mcount)
-	br	%r14
+	BR_EX	%r14
 
 EXPORT_SYMBOL(_mcount)
 
@@ -53,7 +57,7 @@ ENTRY(ftrace_caller)
 #endif
 	lgr	%r3,%r14
 	la	%r5,STACK_PTREGS(%r15)
-	basr	%r14,%r1
+	BASR_EX	%r14,%r1
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 # The j instruction gets runtime patched to a nop instruction.
 # See ftrace_enable_ftrace_graph_caller.
@@ -68,7 +72,7 @@ ftrace_graph_caller_end:
 #endif
 	lg	%r1,(STACK_PTREGS_PSW+8)(%r15)
 	lmg	%r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15)
-	br	%r1
+	BR_EX	%r1
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
@@ -81,6 +85,6 @@ ENTRY(return_to_handler)
 	aghi	%r15,STACK_FRAME_OVERHEAD
 	lgr	%r14,%r2
 	lmg	%r2,%r5,32(%r15)
-	br	%r14
+	BR_EX	%r14
 
 #endif

From c50c84c3ac4d5db683904bdb3257798b6ef980ae Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 25 Apr 2018 18:41:30 +0200
Subject: [PATCH 081/393] s390/kernel: use expoline for indirect branches

The assember code in arch/s390/kernel uses a few more indirect branches
which need to be done with execute trampolines for CONFIG_EXPOLINE=y.

Cc: stable@vger.kernel.org # 4.16
Fixes: f19fbd5ed6 ("s390: introduce execute-trampolines for branches")
Reviewed-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/base.S   | 24 ++++++++++++++----------
 arch/s390/kernel/reipl.S  |  7 +++++--
 arch/s390/kernel/swsusp.S | 10 ++++++----
 3 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S
index f6c56009e822..b65874b0b412 100644
--- a/arch/s390/kernel/base.S
+++ b/arch/s390/kernel/base.S
@@ -9,18 +9,22 @@
 
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
+#include <asm/nospec-insn.h>
 #include <asm/ptrace.h>
 #include <asm/sigp.h>
 
+	GEN_BR_THUNK %r9
+	GEN_BR_THUNK %r14
+
 ENTRY(s390_base_mcck_handler)
 	basr	%r13,0
 0:	lg	%r15,__LC_PANIC_STACK	# load panic stack
 	aghi	%r15,-STACK_FRAME_OVERHEAD
 	larl	%r1,s390_base_mcck_handler_fn
-	lg	%r1,0(%r1)
-	ltgr	%r1,%r1
+	lg	%r9,0(%r1)
+	ltgr	%r9,%r9
 	jz	1f
-	basr	%r14,%r1
+	BASR_EX	%r14,%r9
 1:	la	%r1,4095
 	lmg	%r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)
 	lpswe	__LC_MCK_OLD_PSW
@@ -37,10 +41,10 @@ ENTRY(s390_base_ext_handler)
 	basr	%r13,0
 0:	aghi	%r15,-STACK_FRAME_OVERHEAD
 	larl	%r1,s390_base_ext_handler_fn
-	lg	%r1,0(%r1)
-	ltgr	%r1,%r1
+	lg	%r9,0(%r1)
+	ltgr	%r9,%r9
 	jz	1f
-	basr	%r14,%r1
+	BASR_EX	%r14,%r9
 1:	lmg	%r0,%r15,__LC_SAVE_AREA_ASYNC
 	ni	__LC_EXT_OLD_PSW+1,0xfd	# clear wait state bit
 	lpswe	__LC_EXT_OLD_PSW
@@ -57,10 +61,10 @@ ENTRY(s390_base_pgm_handler)
 	basr	%r13,0
 0:	aghi	%r15,-STACK_FRAME_OVERHEAD
 	larl	%r1,s390_base_pgm_handler_fn
-	lg	%r1,0(%r1)
-	ltgr	%r1,%r1
+	lg	%r9,0(%r1)
+	ltgr	%r9,%r9
 	jz	1f
-	basr	%r14,%r1
+	BASR_EX	%r14,%r9
 	lmg	%r0,%r15,__LC_SAVE_AREA_SYNC
 	lpswe	__LC_PGM_OLD_PSW
 1:	lpswe	disabled_wait_psw-0b(%r13)
@@ -117,7 +121,7 @@ ENTRY(diag308_reset)
 	larl	%r4,.Lcontinue_psw	# Restore PSW flags
 	lpswe	0(%r4)
 .Lcontinue:
-	br	%r14
+	BR_EX	%r14
 .align 16
 .Lrestart_psw:
 	.long	0x00080000,0x80000000 + .Lrestart_part2
diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S
index 73cc3750f0d3..7f14adf512c6 100644
--- a/arch/s390/kernel/reipl.S
+++ b/arch/s390/kernel/reipl.S
@@ -7,8 +7,11 @@
 
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
+#include <asm/nospec-insn.h>
 #include <asm/sigp.h>
 
+	GEN_BR_THUNK %r9
+
 #
 # Issue "store status" for the current CPU to its prefix page
 # and call passed function afterwards
@@ -67,9 +70,9 @@ ENTRY(store_status)
 	st	%r4,0(%r1)
 	st	%r5,4(%r1)
 	stg	%r2,8(%r1)
-	lgr	%r1,%r2
+	lgr	%r9,%r2
 	lgr	%r2,%r3
-	br	%r1
+	BR_EX	%r9
 
 	.section .bss
 	.align	8
diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S
index e99187149f17..a049a7b9d6e8 100644
--- a/arch/s390/kernel/swsusp.S
+++ b/arch/s390/kernel/swsusp.S
@@ -13,6 +13,7 @@
 #include <asm/ptrace.h>
 #include <asm/thread_info.h>
 #include <asm/asm-offsets.h>
+#include <asm/nospec-insn.h>
 #include <asm/sigp.h>
 
 /*
@@ -24,6 +25,8 @@
  * (see below) in the resume process.
  * This function runs with disabled interrupts.
  */
+	GEN_BR_THUNK %r14
+
 	.section .text
 ENTRY(swsusp_arch_suspend)
 	stmg	%r6,%r15,__SF_GPRS(%r15)
@@ -103,7 +106,7 @@ ENTRY(swsusp_arch_suspend)
 	spx	0x318(%r1)
 	lmg	%r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15)
 	lghi	%r2,0
-	br	%r14
+	BR_EX	%r14
 
 /*
  * Restore saved memory image to correct place and restore register context.
@@ -197,11 +200,10 @@ pgm_check_entry:
 	larl	%r15,init_thread_union
 	ahi	%r15,1<<(PAGE_SHIFT+THREAD_SIZE_ORDER)
 	larl	%r2,.Lpanic_string
-	larl	%r3,sclp_early_printk
 	lghi	%r1,0
 	sam31
 	sigp	%r1,%r0,SIGP_SET_ARCHITECTURE
-	basr	%r14,%r3
+	brasl	%r14,sclp_early_printk
 	larl	%r3,.Ldisabled_wait_31
 	lpsw	0(%r3)
 4:
@@ -267,7 +269,7 @@ restore_registers:
 	/* Return 0 */
 	lmg	%r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15)
 	lghi	%r2,0
-	br	%r14
+	BR_EX	%r14
 
 	.section .data..nosave,"aw",@progbits
 	.align	8

From 4253b0e0627ee3461e64c2495c616f1c8f6b127b Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 24 Apr 2018 08:23:54 +0200
Subject: [PATCH 082/393] s390: move spectre sysfs attribute code

The nospec-branch.c file is compiled without the gcc options to
generate expoline thunks. The return branch of the sysfs show
functions cpu_show_spectre_v1 and cpu_show_spectre_v2 is an indirect
branch as well. These need to be compiled with expolines.

Move the sysfs functions for spectre reporting to a separate file
and loose an '.' for one of the messages.

Cc: stable@vger.kernel.org # 4.16
Fixes: d424986f1d ("s390: add sysfs attributes for spectre")
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/Makefile        |  1 +
 arch/s390/kernel/nospec-branch.c | 19 -------------------
 arch/s390/kernel/nospec-sysfs.c  | 21 +++++++++++++++++++++
 3 files changed, 22 insertions(+), 19 deletions(-)
 create mode 100644 arch/s390/kernel/nospec-sysfs.c

diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 84ea6225efb4..f92dd8ed3884 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -65,6 +65,7 @@ obj-y	+= nospec-branch.o
 
 extra-y				+= head.o head64.o vmlinux.lds
 
+obj-$(CONFIG_SYSFS)		+= nospec-sysfs.o
 CFLAGS_REMOVE_nospec-branch.o	+= $(CC_FLAGS_EXPOLINE)
 
 obj-$(CONFIG_MODULES)		+= module.o
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index 46d49a11663f..834cf29f2599 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/module.h>
 #include <linux/device.h>
-#include <linux/cpu.h>
 #include <asm/nospec-branch.h>
 
 static int __init nobp_setup_early(char *str)
@@ -44,24 +43,6 @@ static int __init nospec_report(void)
 }
 arch_initcall(nospec_report);
 
-#ifdef CONFIG_SYSFS
-ssize_t cpu_show_spectre_v1(struct device *dev,
-			    struct device_attribute *attr, char *buf)
-{
-	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
-}
-
-ssize_t cpu_show_spectre_v2(struct device *dev,
-			    struct device_attribute *attr, char *buf)
-{
-	if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable)
-		return sprintf(buf, "Mitigation: execute trampolines\n");
-	if (__test_facility(82, S390_lowcore.alt_stfle_fac_list))
-		return sprintf(buf, "Mitigation: limited branch prediction.\n");
-	return sprintf(buf, "Vulnerable\n");
-}
-#endif
-
 #ifdef CONFIG_EXPOLINE
 
 int nospec_disable = IS_ENABLED(CONFIG_EXPOLINE_OFF);
diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c
new file mode 100644
index 000000000000..8affad5f18cb
--- /dev/null
+++ b/arch/s390/kernel/nospec-sysfs.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <asm/facility.h>
+#include <asm/nospec-branch.h>
+
+ssize_t cpu_show_spectre_v1(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable)
+		return sprintf(buf, "Mitigation: execute trampolines\n");
+	if (__test_facility(82, S390_lowcore.alt_stfle_fac_list))
+		return sprintf(buf, "Mitigation: limited branch prediction\n");
+	return sprintf(buf, "Vulnerable\n");
+}

From 9f18fff63cfd6f559daa1eaae60640372c65f84b Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 24 Apr 2018 11:18:49 +0200
Subject: [PATCH 083/393] s390: remove indirect branch from
 do_softirq_own_stack

The inline assembly to call __do_softirq on the irq stack uses
an indirect branch. This can be replaced with a normal relative
branch.

Cc: stable@vger.kernel.org # 4.16
Fixes: f19fbd5ed6 ("s390: introduce execute-trampolines for branches")
Reviewed-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/irq.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 94f2099bceb0..3d17c41074ca 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -176,10 +176,9 @@ void do_softirq_own_stack(void)
 		new -= STACK_FRAME_OVERHEAD;
 		((struct stack_frame *) new)->back_chain = old;
 		asm volatile("   la    15,0(%0)\n"
-			     "   basr  14,%2\n"
+			     "   brasl 14,__do_softirq\n"
 			     "   la    15,0(%1)\n"
-			     : : "a" (new), "a" (old),
-			         "a" (__do_softirq)
+			     : : "a" (new), "a" (old)
 			     : "0", "1", "2", "3", "4", "5", "14",
 			       "cc", "memory" );
 	} else {

From 6deaa3bbca804b2a3627fd685f75de64da7be535 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 24 Apr 2018 15:32:08 +0200
Subject: [PATCH 084/393] s390: extend expoline to BC instructions

The BPF JIT uses a 'b <disp>(%r<x>)' instruction in the definition
of the sk_load_word and sk_load_half functions.

Add support for branch-on-condition instructions contained in the
thunk code of an expoline.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/nospec-insn.h | 57 +++++++++++++++++++++++++++++
 arch/s390/kernel/nospec-branch.c    | 25 ++++++++++---
 2 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h
index 7d7640e1cf90..a01f81186e86 100644
--- a/arch/s390/include/asm/nospec-insn.h
+++ b/arch/s390/include/asm/nospec-insn.h
@@ -35,10 +35,18 @@ _LC_BR_R1 = __LC_BR_R1
 	__THUNK_PROLOG_NAME __s390x_indirect_jump_r\r2\()use_r\r1
 	.endm
 
+	.macro __THUNK_PROLOG_BC d0,r1,r2
+	__THUNK_PROLOG_NAME __s390x_indirect_branch_\d0\()_\r2\()use_\r1
+	.endm
+
 	.macro __THUNK_BR r1,r2
 	jg	__s390x_indirect_jump_r\r2\()use_r\r1
 	.endm
 
+	.macro __THUNK_BC d0,r1,r2
+	jg	__s390x_indirect_branch_\d0\()_\r2\()use_\r1
+	.endm
+
 	.macro __THUNK_BRASL r1,r2,r3
 	brasl	\r1,__s390x_indirect_jump_r\r3\()use_r\r2
 	.endm
@@ -81,6 +89,23 @@ _LC_BR_R1 = __LC_BR_R1
 	.endif
 	.endm
 
+	.macro	__DECODE_DRR expand,disp,reg,ruse
+	.set __decode_fail,1
+	.irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+	.ifc \reg,%r\r1
+	.irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+	.ifc \ruse,%r\r2
+	\expand \disp,\r1,\r2
+	.set __decode_fail,0
+	.endif
+	.endr
+	.endif
+	.endr
+	.if __decode_fail == 1
+	.error "__DECODE_DRR failed"
+	.endif
+	.endm
+
 	.macro __THUNK_EX_BR reg,ruse
 	# Be very careful when adding instructions to this macro!
 	# The ALTERNATIVE replacement code has a .+10 which targets
@@ -101,17 +126,42 @@ _LC_BR_R1 = __LC_BR_R1
 555:	br	\reg
 	.endm
 
+	.macro __THUNK_EX_BC disp,reg,ruse
+#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
+	exrl	0,556f
+	j	.
+#else
+	larl	\ruse,556f
+	ex	0,0(\ruse)
+	j	.
+#endif
+556:	b	\disp(\reg)
+	.endm
+
 	.macro GEN_BR_THUNK reg,ruse=%r1
 	__DECODE_RR __THUNK_PROLOG_BR,\reg,\ruse
 	__THUNK_EX_BR \reg,\ruse
 	__THUNK_EPILOG
 	.endm
 
+	.macro GEN_B_THUNK disp,reg,ruse=%r1
+	__DECODE_DRR __THUNK_PROLOG_BC,\disp,\reg,\ruse
+	__THUNK_EX_BC \disp,\reg,\ruse
+	__THUNK_EPILOG
+	.endm
+
 	.macro BR_EX reg,ruse=%r1
 557:	__DECODE_RR __THUNK_BR,\reg,\ruse
 	.pushsection .s390_indirect_branches,"a",@progbits
 	.long	557b-.
 	.popsection
+	.endm
+
+	 .macro B_EX disp,reg,ruse=%r1
+558:	__DECODE_DRR __THUNK_BC,\disp,\reg,\ruse
+	.pushsection .s390_indirect_branches,"a",@progbits
+	.long	558b-.
+	.popsection
 	.endm
 
 	.macro BASR_EX rsave,rtarget,ruse=%r1
@@ -123,10 +173,17 @@ _LC_BR_R1 = __LC_BR_R1
 
 #else
 	.macro GEN_BR_THUNK reg,ruse=%r1
+	.endm
+
+	.macro GEN_B_THUNK disp,reg,ruse=%r1
 	.endm
 
 	 .macro BR_EX reg,ruse=%r1
 	br	\reg
+	.endm
+
+	 .macro B_EX disp,reg,ruse=%r1
+	b	\disp(\reg)
 	.endm
 
 	.macro BASR_EX rsave,rtarget,ruse=%r1
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index 834cf29f2599..8ad6a7128b3a 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -93,7 +93,6 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end)
 	s32 *epo;
 
 	/* Second part of the instruction replace is always a nop */
-	memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x00, 0x00 }, 4);
 	for (epo = start; epo < end; epo++) {
 		instr = (u8 *) epo + *epo;
 		if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04)
@@ -114,18 +113,34 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end)
 			br = thunk + (*(int *)(thunk + 2)) * 2;
 		else
 			continue;
-		if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0)
+		/* Check for unconditional branch 0x07f? or 0x47f???? */
+		if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0)
 			continue;
+
+		memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x07, 0x00 }, 4);
 		switch (type) {
 		case BRCL_EXPOLINE:
-			/* brcl to thunk, replace with br + nop */
 			insnbuf[0] = br[0];
 			insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
+			if (br[0] == 0x47) {
+				/* brcl to b, replace with bc + nopr */
+				insnbuf[2] = br[2];
+				insnbuf[3] = br[3];
+			} else {
+				/* brcl to br, replace with bcr + nop */
+			}
 			break;
 		case BRASL_EXPOLINE:
-			/* brasl to thunk, replace with basr + nop */
-			insnbuf[0] = 0x0d;
 			insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
+			if (br[0] == 0x47) {
+				/* brasl to b, replace with bas + nopr */
+				insnbuf[0] = 0x4d;
+				insnbuf[2] = br[2];
+				insnbuf[3] = br[3];
+			} else {
+				/* brasl to br, replace with basr + nop */
+				insnbuf[0] = 0x0d;
+			}
 			break;
 		}
 

From de5cb6eb514ebe241e3edeb290cb41deb380b81d Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 23 Apr 2018 14:31:36 +0200
Subject: [PATCH 085/393] s390: use expoline thunks in the BPF JIT

The BPF JIT need safe guarding against spectre v2 in the sk_load_xxx
assembler stubs and the indirect branches generated by the JIT itself
need to be converted to expolines.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/net/bpf_jit.S      | 16 +++++----
 arch/s390/net/bpf_jit_comp.c | 63 ++++++++++++++++++++++++++++++++++--
 2 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/arch/s390/net/bpf_jit.S b/arch/s390/net/bpf_jit.S
index 25bb4643c4f4..9f794869c1b0 100644
--- a/arch/s390/net/bpf_jit.S
+++ b/arch/s390/net/bpf_jit.S
@@ -9,6 +9,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/nospec-insn.h>
 #include "bpf_jit.h"
 
 /*
@@ -54,7 +55,7 @@ ENTRY(sk_load_##NAME##_pos);						\
 	clg	%r3,STK_OFF_HLEN(%r15);	/* Offset + SIZE > hlen? */	\
 	jh	sk_load_##NAME##_slow;					\
 	LOAD	%r14,-SIZE(%r3,%r12);	/* Get data from skb */		\
-	b	OFF_OK(%r6);		/* Return */			\
+	B_EX	OFF_OK,%r6;		/* Return */			\
 									\
 sk_load_##NAME##_slow:;							\
 	lgr	%r2,%r7;		/* Arg1 = skb pointer */	\
@@ -64,11 +65,14 @@ sk_load_##NAME##_slow:;							\
 	brasl	%r14,skb_copy_bits;	/* Get data from skb */		\
 	LOAD	%r14,STK_OFF_TMP(%r15);	/* Load from temp bufffer */	\
 	ltgr	%r2,%r2;		/* Set cc to (%r2 != 0) */	\
-	br	%r6;			/* Return */
+	BR_EX	%r6;			/* Return */
 
 sk_load_common(word, 4, llgf)	/* r14 = *(u32 *) (skb->data+offset) */
 sk_load_common(half, 2, llgh)	/* r14 = *(u16 *) (skb->data+offset) */
 
+	GEN_BR_THUNK %r6
+	GEN_B_THUNK OFF_OK,%r6
+
 /*
  * Load 1 byte from SKB (optimized version)
  */
@@ -80,7 +84,7 @@ ENTRY(sk_load_byte_pos)
 	clg	%r3,STK_OFF_HLEN(%r15)	# Offset >= hlen?
 	jnl	sk_load_byte_slow
 	llgc	%r14,0(%r3,%r12)	# Get byte from skb
-	b	OFF_OK(%r6)		# Return OK
+	B_EX	OFF_OK,%r6		# Return OK
 
 sk_load_byte_slow:
 	lgr	%r2,%r7			# Arg1 = skb pointer
@@ -90,7 +94,7 @@ sk_load_byte_slow:
 	brasl	%r14,skb_copy_bits	# Get data from skb
 	llgc	%r14,STK_OFF_TMP(%r15)	# Load result from temp buffer
 	ltgr	%r2,%r2			# Set cc to (%r2 != 0)
-	br	%r6			# Return cc
+	BR_EX	%r6			# Return cc
 
 #define sk_negative_common(NAME, SIZE, LOAD)				\
 sk_load_##NAME##_slow_neg:;						\
@@ -104,7 +108,7 @@ sk_load_##NAME##_slow_neg:;						\
 	jz	bpf_error;						\
 	LOAD	%r14,0(%r2);		/* Get data from pointer */	\
 	xr	%r3,%r3;		/* Set cc to zero */		\
-	br	%r6;			/* Return cc */
+	BR_EX	%r6;			/* Return cc */
 
 sk_negative_common(word, 4, llgf)
 sk_negative_common(half, 2, llgh)
@@ -113,4 +117,4 @@ sk_negative_common(byte, 1, llgc)
 bpf_error:
 # force a return 0 from jit handler
 	ltgr	%r15,%r15	# Set condition code
-	br	%r6
+	BR_EX	%r6
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 78a19c93b380..dd2bcf0e7d00 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -25,6 +25,8 @@
 #include <linux/bpf.h>
 #include <asm/cacheflush.h>
 #include <asm/dis.h>
+#include <asm/facility.h>
+#include <asm/nospec-branch.h>
 #include <asm/set_memory.h>
 #include "bpf_jit.h"
 
@@ -41,6 +43,8 @@ struct bpf_jit {
 	int base_ip;		/* Base address for literal pool */
 	int ret0_ip;		/* Address of return 0 */
 	int exit_ip;		/* Address of exit */
+	int r1_thunk_ip;	/* Address of expoline thunk for 'br %r1' */
+	int r14_thunk_ip;	/* Address of expoline thunk for 'br %r14' */
 	int tail_call_start;	/* Tail call start offset */
 	int labels[1];		/* Labels for local jumps */
 };
@@ -250,6 +254,19 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
 	REG_SET_SEEN(b2);					\
 })
 
+#define EMIT6_PCREL_RILB(op, b, target)				\
+({								\
+	int rel = (target - jit->prg) / 2;			\
+	_EMIT6(op | reg_high(b) << 16 | rel >> 16, rel & 0xffff);	\
+	REG_SET_SEEN(b);					\
+})
+
+#define EMIT6_PCREL_RIL(op, target)				\
+({								\
+	int rel = (target - jit->prg) / 2;			\
+	_EMIT6(op | rel >> 16, rel & 0xffff);			\
+})
+
 #define _EMIT6_IMM(op, imm)					\
 ({								\
 	unsigned int __imm = (imm);				\
@@ -469,8 +486,45 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
 	EMIT4(0xb9040000, REG_2, BPF_REG_0);
 	/* Restore registers */
 	save_restore_regs(jit, REGS_RESTORE, stack_depth);
+	if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) {
+		jit->r14_thunk_ip = jit->prg;
+		/* Generate __s390_indirect_jump_r14 thunk */
+		if (test_facility(35)) {
+			/* exrl %r0,.+10 */
+			EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
+		} else {
+			/* larl %r1,.+14 */
+			EMIT6_PCREL_RILB(0xc0000000, REG_1, jit->prg + 14);
+			/* ex 0,0(%r1) */
+			EMIT4_DISP(0x44000000, REG_0, REG_1, 0);
+		}
+		/* j . */
+		EMIT4_PCREL(0xa7f40000, 0);
+	}
 	/* br %r14 */
 	_EMIT2(0x07fe);
+
+	if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable &&
+	    (jit->seen & SEEN_FUNC)) {
+		jit->r1_thunk_ip = jit->prg;
+		/* Generate __s390_indirect_jump_r1 thunk */
+		if (test_facility(35)) {
+			/* exrl %r0,.+10 */
+			EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
+			/* j . */
+			EMIT4_PCREL(0xa7f40000, 0);
+			/* br %r1 */
+			_EMIT2(0x07f1);
+		} else {
+			/* larl %r1,.+14 */
+			EMIT6_PCREL_RILB(0xc0000000, REG_1, jit->prg + 14);
+			/* ex 0,S390_lowcore.br_r1_tampoline */
+			EMIT4_DISP(0x44000000, REG_0, REG_0,
+				   offsetof(struct lowcore, br_r1_trampoline));
+			/* j . */
+			EMIT4_PCREL(0xa7f40000, 0);
+		}
+	}
 }
 
 /*
@@ -966,8 +1020,13 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		/* lg %w1,<d(imm)>(%l) */
 		EMIT6_DISP_LH(0xe3000000, 0x0004, REG_W1, REG_0, REG_L,
 			      EMIT_CONST_U64(func));
-		/* basr %r14,%w1 */
-		EMIT2(0x0d00, REG_14, REG_W1);
+		if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) {
+			/* brasl %r14,__s390_indirect_jump_r1 */
+			EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip);
+		} else {
+			/* basr %r14,%w1 */
+			EMIT2(0x0d00, REG_14, REG_W1);
+		}
 		/* lgr %b0,%r2: load return value into %b0 */
 		EMIT4(0xb9040000, BPF_REG_0, REG_2);
 		if ((jit->seen & SEEN_SKB) &&

From fa89adba1941e4f3b213399b81732a5c12fd9131 Mon Sep 17 00:00:00 2001
From: Jens Remus <jremus@linux.ibm.com>
Date: Thu, 3 May 2018 13:52:47 +0200
Subject: [PATCH 086/393] scsi: zfcp: fix infinite iteration on ERP ready list

zfcp_erp_adapter_reopen() schedules blocking of all of the adapter's
rports via zfcp_scsi_schedule_rports_block() and enqueues a reopen
adapter ERP action via zfcp_erp_action_enqueue(). Both are separately
processed asynchronously and concurrently.

Blocking of rports is done in a kworker by zfcp_scsi_rport_work(). It
calls zfcp_scsi_rport_block(), which then traces a DBF REC "scpdely" via
zfcp_dbf_rec_trig().  zfcp_dbf_rec_trig() acquires the DBF REC spin lock
and then iterates with list_for_each() over the adapter's ERP ready list
without holding the ERP lock. This opens a race window in which the
current list entry can be moved to another list, causing list_for_each()
to iterate forever on the wrong list, as the erp_ready_head is never
encountered as terminal condition.

Meanwhile the ERP action can be processed in the ERP thread by
zfcp_erp_thread(). It calls zfcp_erp_strategy(), which acquires the ERP
lock and then calls zfcp_erp_action_to_running() to move the ERP action
from the ready to the running list.  zfcp_erp_action_to_running() can
move the ERP action using list_move() just during the aforementioned
race window. It then traces a REC RUN "erator1" via zfcp_dbf_rec_run().
zfcp_dbf_rec_run() tries to acquire the DBF REC spin lock. If this is
held by the infinitely looping kworker, it effectively spins forever.

Example Sequence Diagram:

Process                ERP Thread             rport_work
-------------------    -------------------    -------------------
zfcp_erp_adapter_reopen()
zfcp_erp_adapter_block()
zfcp_scsi_schedule_rports_block()
lock ERP                                      zfcp_scsi_rport_work()
zfcp_erp_action_enqueue(ZFCP_ERP_ACTION_REOPEN_ADAPTER)
list_add_tail() on ready                      !(rport_task==RPORT_ADD)
wake_up() ERP thread                          zfcp_scsi_rport_block()
zfcp_dbf_rec_trig()    zfcp_erp_strategy()    zfcp_dbf_rec_trig()
unlock ERP                                    lock DBF REC
zfcp_erp_wait()        lock ERP
|                      zfcp_erp_action_to_running()
|                                             list_for_each() ready
|                      list_move()              current entry
|                        ready to running
|                      zfcp_dbf_rec_run()       endless loop over running
|                      zfcp_dbf_rec_run_lvl()
|                      lock DBF REC spins forever

Any adapter recovery can trigger this, such as setting the device offline
or reboot.

V4.9 commit 4eeaa4f3f1d6 ("zfcp: close window with unblocked rport
during rport gone") introduced additional tracing of (un)blocking of
rports. It missed that the adapter->erp_lock must be held when calling
zfcp_dbf_rec_trig().

This fix uses the approach formerly introduced by commit aa0fec62391c
("[SCSI] zfcp: Fix sparse warning by providing new entry in dbf") that got
later removed by commit ae0904f60fab ("[SCSI] zfcp: Redesign of the debug
tracing for recovery actions.").

Introduce zfcp_dbf_rec_trig_lock(), a wrapper for zfcp_dbf_rec_trig() that
acquires and releases the adapter->erp_lock for read.

Reported-by: Sebastian Ott <sebott@linux.ibm.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
Fixes: 4eeaa4f3f1d6 ("zfcp: close window with unblocked rport during rport gone")
Cc: <stable@vger.kernel.org> # 2.6.32+
Reviewed-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Signed-off-by: Steffen Maier <maier@linux.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/s390/scsi/zfcp_dbf.c  | 23 ++++++++++++++++++++++-
 drivers/s390/scsi/zfcp_ext.h  |  5 ++++-
 drivers/s390/scsi/zfcp_scsi.c | 14 +++++++-------
 3 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c
index a8b831000b2d..18c4f933e8b9 100644
--- a/drivers/s390/scsi/zfcp_dbf.c
+++ b/drivers/s390/scsi/zfcp_dbf.c
@@ -4,7 +4,7 @@
  *
  * Debug traces for zfcp.
  *
- * Copyright IBM Corp. 2002, 2017
+ * Copyright IBM Corp. 2002, 2018
  */
 
 #define KMSG_COMPONENT "zfcp"
@@ -308,6 +308,27 @@ void zfcp_dbf_rec_trig(char *tag, struct zfcp_adapter *adapter,
 	spin_unlock_irqrestore(&dbf->rec_lock, flags);
 }
 
+/**
+ * zfcp_dbf_rec_trig_lock - trace event related to triggered recovery with lock
+ * @tag: identifier for event
+ * @adapter: adapter on which the erp_action should run
+ * @port: remote port involved in the erp_action
+ * @sdev: scsi device involved in the erp_action
+ * @want: wanted erp_action
+ * @need: required erp_action
+ *
+ * The adapter->erp_lock must not be held.
+ */
+void zfcp_dbf_rec_trig_lock(char *tag, struct zfcp_adapter *adapter,
+			    struct zfcp_port *port, struct scsi_device *sdev,
+			    u8 want, u8 need)
+{
+	unsigned long flags;
+
+	read_lock_irqsave(&adapter->erp_lock, flags);
+	zfcp_dbf_rec_trig(tag, adapter, port, sdev, want, need);
+	read_unlock_irqrestore(&adapter->erp_lock, flags);
+}
 
 /**
  * zfcp_dbf_rec_run_lvl - trace event related to running recovery
diff --git a/drivers/s390/scsi/zfcp_ext.h b/drivers/s390/scsi/zfcp_ext.h
index bf8ea4df2bb8..e5eed8aac0ce 100644
--- a/drivers/s390/scsi/zfcp_ext.h
+++ b/drivers/s390/scsi/zfcp_ext.h
@@ -4,7 +4,7 @@
  *
  * External function declarations.
  *
- * Copyright IBM Corp. 2002, 2016
+ * Copyright IBM Corp. 2002, 2018
  */
 
 #ifndef ZFCP_EXT_H
@@ -35,6 +35,9 @@ extern int zfcp_dbf_adapter_register(struct zfcp_adapter *);
 extern void zfcp_dbf_adapter_unregister(struct zfcp_adapter *);
 extern void zfcp_dbf_rec_trig(char *, struct zfcp_adapter *,
 			      struct zfcp_port *, struct scsi_device *, u8, u8);
+extern void zfcp_dbf_rec_trig_lock(char *tag, struct zfcp_adapter *adapter,
+				   struct zfcp_port *port,
+				   struct scsi_device *sdev, u8 want, u8 need);
 extern void zfcp_dbf_rec_run(char *, struct zfcp_erp_action *);
 extern void zfcp_dbf_rec_run_lvl(int level, char *tag,
 				 struct zfcp_erp_action *erp);
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index 4d2ba5682493..22f9562f415c 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -4,7 +4,7 @@
  *
  * Interface to Linux SCSI midlayer.
  *
- * Copyright IBM Corp. 2002, 2017
+ * Copyright IBM Corp. 2002, 2018
  */
 
 #define KMSG_COMPONENT "zfcp"
@@ -618,9 +618,9 @@ static void zfcp_scsi_rport_register(struct zfcp_port *port)
 	ids.port_id = port->d_id;
 	ids.roles = FC_RPORT_ROLE_FCP_TARGET;
 
-	zfcp_dbf_rec_trig("scpaddy", port->adapter, port, NULL,
-			  ZFCP_PSEUDO_ERP_ACTION_RPORT_ADD,
-			  ZFCP_PSEUDO_ERP_ACTION_RPORT_ADD);
+	zfcp_dbf_rec_trig_lock("scpaddy", port->adapter, port, NULL,
+			       ZFCP_PSEUDO_ERP_ACTION_RPORT_ADD,
+			       ZFCP_PSEUDO_ERP_ACTION_RPORT_ADD);
 	rport = fc_remote_port_add(port->adapter->scsi_host, 0, &ids);
 	if (!rport) {
 		dev_err(&port->adapter->ccw_device->dev,
@@ -642,9 +642,9 @@ static void zfcp_scsi_rport_block(struct zfcp_port *port)
 	struct fc_rport *rport = port->rport;
 
 	if (rport) {
-		zfcp_dbf_rec_trig("scpdely", port->adapter, port, NULL,
-				  ZFCP_PSEUDO_ERP_ACTION_RPORT_DEL,
-				  ZFCP_PSEUDO_ERP_ACTION_RPORT_DEL);
+		zfcp_dbf_rec_trig_lock("scpdely", port->adapter, port, NULL,
+				       ZFCP_PSEUDO_ERP_ACTION_RPORT_DEL,
+				       ZFCP_PSEUDO_ERP_ACTION_RPORT_DEL);
 		fc_remote_port_delete(rport);
 		port->rport = NULL;
 	}

From a4995684a949cc1d28fbf09900c47c34b9427ecf Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 27 Apr 2018 11:16:09 -0700
Subject: [PATCH 087/393] netfilter: bridge: stp fix reference to uninitialized
 data

The destination mac (destmac) is only valid if EBT_DESTMAC flag
is set. Fix by changing the order of the comparison to look for
the flag first.

Reported-by: syzbot+5c06e318fc558cc27823@syzkaller.appspotmail.com
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_stp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 47ba98db145d..46c1fe7637ea 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -161,8 +161,8 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
 	/* Make sure the match only receives stp frames */
 	if (!par->nft_compat &&
 	    (!ether_addr_equal(e->destmac, eth_stp_addr) ||
-	     !is_broadcast_ether_addr(e->destmsk) ||
-	     !(e->bitmask & EBT_DESTMAC)))
+	     !(e->bitmask & EBT_DESTMAC) ||
+	     !is_broadcast_ether_addr(e->destmsk)))
 		return -EINVAL;
 
 	return 0;

From b8e9dc1c75714ceb53615743e1036f76e00f5a17 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 2 May 2018 14:07:42 +0200
Subject: [PATCH 088/393] netfilter: nf_tables: nft_compat: fix refcount leak
 on xt module

Taehee Yoo reported following bug:
    iptables-compat -I OUTPUT -m cpu --cpu 0
    iptables-compat -F
    lsmod |grep xt_cpu
    xt_cpu                 16384  1

Quote:
"When above command is given, a netlink message has two expressions that
are the cpu compat and the nft_counter.
The nft_expr_type_get() in the nf_tables_expr_parse() successes
first expression then, calls select_ops callback.
(allocates memory and holds module)
But, second nft_expr_type_get() in the nf_tables_expr_parse()
returns -EAGAIN because of request_module().
In that point, by the 'goto err1',
the 'module_put(info[i].ops->type->owner)' is called.
There is no release routine."

The core problem is that unlike all other expression,
nft_compat select_ops has side effects.

1. it allocates dynamic memory which holds an nft ops struct.
   In all other expressions, ops has static storage duration.
2. It grabs references to the xt module that it is supposed to
   invoke.

Depending on where things go wrong, error unwinding doesn't
always do the right thing.

In the above scenario, a new nft_compat_expr is created and
xt_cpu module gets loaded with a refcount of 1.

Due to to -EAGAIN, the netlink messages get re-parsed.
When that happens, nft_compat finds that xt_cpu is already present
and increments module refcount again.

This fixes the problem by making select_ops to have no visible
side effects and removes all extra module_get/put.

When select_ops creates a new nft_compat expression, the new
expression has a refcount of 0, and the xt module gets its refcount
incremented.

When error happens, the next call finds existing entry, but will no
longer increase the reference count -- the presence of existing
nft_xt means we already hold a module reference.

Because nft_xt_put is only called from nft_compat destroy hook,
it will never see the initial zero reference count.
->destroy can only be called after ->init(), and that will increase the
refcount.

Lastly, we now free nft_xt struct with kfree_rcu.
Else, we get use-after free in nf_tables_rule_destroy:

  while (expr != nft_expr_last(rule) && expr->ops) {
    nf_tables_expr_destroy(ctx, expr);
    expr = nft_expr_next(expr); // here

nft_expr_next() dereferences expr->ops. This is safe
for all users, as ops have static storage duration.
In nft_compat case however, its ->destroy callback can
free the memory that hold the ops structure.

Tested-by: Taehee Yoo <ap420073@gmail.com>
Reported-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 92 ++++++++++++++++++++++++--------------
 1 file changed, 58 insertions(+), 34 deletions(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 8e23726b9081..870d8c29dae9 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -27,14 +27,24 @@ struct nft_xt {
 	struct list_head	head;
 	struct nft_expr_ops	ops;
 	unsigned int		refcnt;
+
+	/* Unlike other expressions, ops doesn't have static storage duration.
+	 * nft core assumes they do.  We use kfree_rcu so that nft core can
+	 * can check expr->ops->size even after nft_compat->destroy() frees
+	 * the nft_xt struct that holds the ops structure.
+	 */
+	struct rcu_head		rcu_head;
 };
 
-static void nft_xt_put(struct nft_xt *xt)
+static bool nft_xt_put(struct nft_xt *xt)
 {
 	if (--xt->refcnt == 0) {
 		list_del(&xt->head);
-		kfree(xt);
+		kfree_rcu(xt, rcu_head);
+		return true;
 	}
+
+	return false;
 }
 
 static int nft_compat_chain_validate_dependency(const char *tablename,
@@ -226,6 +236,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	struct xt_target *target = expr->ops->data;
 	struct xt_tgchk_param par;
 	size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
+	struct nft_xt *nft_xt;
 	u16 proto = 0;
 	bool inv = false;
 	union nft_entry e = {};
@@ -236,25 +247,22 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	if (ctx->nla[NFTA_RULE_COMPAT]) {
 		ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv);
 		if (ret < 0)
-			goto err;
+			return ret;
 	}
 
 	nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
 
 	ret = xt_check_target(&par, size, proto, inv);
 	if (ret < 0)
-		goto err;
+		return ret;
 
 	/* The standard target cannot be used */
-	if (target->target == NULL) {
-		ret = -EINVAL;
-		goto err;
-	}
+	if (!target->target)
+		return -EINVAL;
 
+	nft_xt = container_of(expr->ops, struct nft_xt, ops);
+	nft_xt->refcnt++;
 	return 0;
-err:
-	module_put(target->me);
-	return ret;
 }
 
 static void
@@ -271,8 +279,8 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 
-	nft_xt_put(container_of(expr->ops, struct nft_xt, ops));
-	module_put(target->me);
+	if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops)))
+		module_put(target->me);
 }
 
 static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -411,6 +419,7 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	struct xt_match *match = expr->ops->data;
 	struct xt_mtchk_param par;
 	size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
+	struct nft_xt *nft_xt;
 	u16 proto = 0;
 	bool inv = false;
 	union nft_entry e = {};
@@ -421,19 +430,18 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	if (ctx->nla[NFTA_RULE_COMPAT]) {
 		ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv);
 		if (ret < 0)
-			goto err;
+			return ret;
 	}
 
 	nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
 
 	ret = xt_check_match(&par, size, proto, inv);
 	if (ret < 0)
-		goto err;
+		return ret;
 
+	nft_xt = container_of(expr->ops, struct nft_xt, ops);
+	nft_xt->refcnt++;
 	return 0;
-err:
-	module_put(match->me);
-	return ret;
 }
 
 static void
@@ -450,8 +458,8 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	if (par.match->destroy != NULL)
 		par.match->destroy(&par);
 
-	nft_xt_put(container_of(expr->ops, struct nft_xt, ops));
-	module_put(match->me);
+	if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops)))
+		module_put(match->me);
 }
 
 static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -654,13 +662,8 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	list_for_each_entry(nft_match, &nft_match_list, head) {
 		struct xt_match *match = nft_match->ops.data;
 
-		if (nft_match_cmp(match, mt_name, rev, family)) {
-			if (!try_module_get(match->me))
-				return ERR_PTR(-ENOENT);
-
-			nft_match->refcnt++;
+		if (nft_match_cmp(match, mt_name, rev, family))
 			return &nft_match->ops;
-		}
 	}
 
 	match = xt_request_find_match(family, mt_name, rev);
@@ -679,7 +682,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 		goto err;
 	}
 
-	nft_match->refcnt = 1;
+	nft_match->refcnt = 0;
 	nft_match->ops.type = &nft_match_type;
 	nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
 	nft_match->ops.eval = nft_match_eval;
@@ -739,13 +742,8 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	list_for_each_entry(nft_target, &nft_target_list, head) {
 		struct xt_target *target = nft_target->ops.data;
 
-		if (nft_target_cmp(target, tg_name, rev, family)) {
-			if (!try_module_get(target->me))
-				return ERR_PTR(-ENOENT);
-
-			nft_target->refcnt++;
+		if (nft_target_cmp(target, tg_name, rev, family))
 			return &nft_target->ops;
-		}
 	}
 
 	target = xt_request_find_target(family, tg_name, rev);
@@ -764,7 +762,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 		goto err;
 	}
 
-	nft_target->refcnt = 1;
+	nft_target->refcnt = 0;
 	nft_target->ops.type = &nft_target_type;
 	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
 	nft_target->ops.init = nft_target_init;
@@ -823,6 +821,32 @@ static int __init nft_compat_module_init(void)
 
 static void __exit nft_compat_module_exit(void)
 {
+	struct nft_xt *xt, *next;
+
+	/* list should be empty here, it can be non-empty only in case there
+	 * was an error that caused nft_xt expr to not be initialized fully
+	 * and noone else requested the same expression later.
+	 *
+	 * In this case, the lists contain 0-refcount entries that still
+	 * hold module reference.
+	 */
+	list_for_each_entry_safe(xt, next, &nft_target_list, head) {
+		struct xt_target *target = xt->ops.data;
+
+		if (WARN_ON_ONCE(xt->refcnt))
+			continue;
+		module_put(target->me);
+		kfree(xt);
+	}
+
+	list_for_each_entry_safe(xt, next, &nft_match_list, head) {
+		struct xt_match *match = xt->ops.data;
+
+		if (WARN_ON_ONCE(xt->refcnt))
+			continue;
+		module_put(match->me);
+		kfree(xt);
+	}
 	nfnetlink_subsys_unregister(&nfnl_compat_subsys);
 	nft_unregister_expr(&nft_target_type);
 	nft_unregister_expr(&nft_match_type);

From a050d345cef0dc6249263540da1e902bba617e43 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 3 May 2018 22:01:40 +0300
Subject: [PATCH 089/393] ipvs: fix refcount usage for conns in ops mode

Connections in One-packet scheduling mode (-o, --ops) are
removed with refcnt=0 because they are not hashed in conn table.
To avoid refcount_dec reporting this as error, change them to be
removed with refcount_dec_if_one as all other connections.

refcount_t hit zero at ip_vs_conn_put+0x31/0x40 [ip_vs]
in sh[15519], uid/euid: 497/497
WARNING: CPU: 0 PID: 15519 at ../kernel/panic.c:657
refcount_error_report+0x94/0x9e
Modules linked in: ip_vs_rr cirrus ttm sb_edac
edac_core drm_kms_helper crct10dif_pclmul crc32_pclmul
ghash_clmulni_intel pcbc mousedev drm aesni_intel aes_x86_64
crypto_simd glue_helper cryptd psmouse evdev input_leds led_class
intel_agp fb_sys_fops syscopyarea sysfillrect intel_rapl_perf mac_hid
intel_gtt serio_raw sysimgblt agpgart i2c_piix4 i2c_core ata_generic
pata_acpi floppy cfg80211 rfkill button loop macvlan ip_vs
nf_conntrack libcrc32c crc32c_generic ip_tables x_tables ipv6
crc_ccitt autofs4 ext4 crc16 mbcache jbd2 fscrypto ata_piix libata
atkbd libps2 scsi_mod crc32c_intel i8042 rtc_cmos serio af_packet
dm_mod dax fuse xen_netfront xen_blkfront
CPU: 0 PID: 15519 Comm: sh Tainted: G        W
4.15.17 #1-NixOS
Hardware name: Xen HVM domU, BIOS 4.2.amazon 08/24/2006
RIP: 0010:refcount_error_report+0x94/0x9e
RSP: 0000:ffffa344dde039c8 EFLAGS: 00010296
RAX: 0000000000000057 RBX: ffffffff92f20e06 RCX: 0000000000000006
RDX: 0000000000000007 RSI: 0000000000000086 RDI: ffffa344dde165c0
RBP: ffffa344dde03b08 R08: 0000000000000218 R09: 0000000000000004
R10: ffffffff93006a80 R11: 0000000000000001 R12: ffffa344d68cd100
R13: 00000000000001f1 R14: ffffffff92f12fb0 R15: 0000000000000004
FS:  00007fc9d2040fc0(0000) GS:ffffa344dde00000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000000262a000 CR3: 0000000016a0c004 CR4: 00000000001606f0
Call Trace:
 <IRQ>
 ex_handler_refcount+0x4e/0x80
 fixup_exception+0x33/0x40
 do_trap+0x83/0x140
 do_error_trap+0x83/0xf0
 ? ip_vs_conn_drop_conntrack+0x120/0x1a5 [ip_vs]
 ? ip_finish_output2+0x29c/0x390
 ? ip_finish_output2+0x1a2/0x390
 invalid_op+0x1b/0x40
RIP: 0010:ip_vs_conn_put+0x31/0x40 [ip_vs]
RSP: 0000:ffffa344dde03bb8 EFLAGS: 00010246
RAX: 0000000000000001 RBX: ffffa344df31cf00 RCX: ffffa344d7450198
RDX: 0000000000000003 RSI: 00000000fffffe01 RDI: ffffa344d7450140
RBP: 0000000000000002 R08: 0000000000000476 R09: 0000000000000000
R10: ffffa344dde03b28 R11: ffffa344df200000 R12: ffffa344d7d09000
R13: ffffa344def3a980 R14: ffffffffc04f6e20 R15: 0000000000000008
 ip_vs_in.part.29.constprop.36+0x34f/0x640 [ip_vs]
 ? ip_vs_conn_out_get+0xe0/0xe0 [ip_vs]
 ip_vs_remote_request4+0x47/0xa0 [ip_vs]
 ? ip_vs_in.part.29.constprop.36+0x640/0x640 [ip_vs]
 nf_hook_slow+0x43/0xc0
 ip_local_deliver+0xac/0xc0
 ? ip_rcv_finish+0x400/0x400
 ip_rcv+0x26c/0x380
 __netif_receive_skb_core+0x3a0/0xb10
 ? inet_gro_receive+0x23c/0x2b0
 ? netif_receive_skb_internal+0x24/0xb0
 netif_receive_skb_internal+0x24/0xb0
 napi_gro_receive+0xb8/0xe0
 xennet_poll+0x676/0xb40 [xen_netfront]
 net_rx_action+0x139/0x3a0
 __do_softirq+0xde/0x2b4
 irq_exit+0xae/0xb0
 xen_evtchn_do_upcall+0x2c/0x40
 xen_hvm_callback_vector+0x7d/0x90
 </IRQ>
RIP: 0033:0x7fc9d11c91f9
RSP: 002b:00007ffebe8a2ea0 EFLAGS: 00000202 ORIG_RAX:
ffffffffffffff0c
RAX: 00000000ffffffff RBX: 0000000002609808 RCX: 0000000000000054
RDX: 0000000000000001 RSI: 0000000002605440 RDI: 00000000025f940e
RBP: 00000000025f940e R08: 000000000260213d R09: 1999999999999999
R10: 000000000262a808 R11: 00000000025f942d R12: 00000000025f940e
R13: 00007fc9d1301e20 R14: 00000000025f9408 R15: 00007fc9d1302720
Code: 48 8b 95 80 00 00 00 41 55 49 8d 8c 24 e0 05 00
00 45 8b 84 24 38 04 00 00 41 89 c1 48 89 de 48 c7 c7 a8 2f f2 92 e8
7c fa ff ff <0f> 0b 58 5b 5d 41 5c 41 5d c3 0f 1f 44 00 00 55 48 89 e5
41 56

Reported-by: Net Filter <netfilternetfilter@gmail.com>
Fixes: b54ab92b84b6 ("netfilter: refcounter conversions")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_conn.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 370abbf6f421..75de46576f51 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -232,7 +232,10 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
 {
 	unsigned int hash;
-	bool ret;
+	bool ret = false;
+
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		return refcount_dec_if_one(&cp->refcnt);
 
 	hash = ip_vs_conn_hashkey_conn(cp);
 
@@ -240,15 +243,13 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
 	spin_lock(&cp->lock);
 
 	if (cp->flags & IP_VS_CONN_F_HASHED) {
-		ret = false;
 		/* Decrease refcnt and unlink conn only if we are last user */
 		if (refcount_dec_if_one(&cp->refcnt)) {
 			hlist_del_rcu(&cp->c_list);
 			cp->flags &= ~IP_VS_CONN_F_HASHED;
 			ret = true;
 		}
-	} else
-		ret = refcount_read(&cp->refcnt) ? false : true;
+	}
 
 	spin_unlock(&cp->lock);
 	ct_write_unlock_bh(hash);
@@ -454,12 +455,6 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
 }
 EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
 
-static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp)
-{
-	__ip_vs_conn_put(cp);
-	ip_vs_conn_expire(&cp->timer);
-}
-
 /*
  *      Put back the conn and restart its timer with its timeout
  */
@@ -478,7 +473,7 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
 	    (refcount_read(&cp->refcnt) == 1) &&
 	    !timer_pending(&cp->timer))
 		/* expire connection immediately */
-		__ip_vs_conn_put_notimer(cp);
+		ip_vs_conn_expire(&cp->timer);
 	else
 		__ip_vs_conn_put_timer(cp);
 }

From d5e032fc5697b6c0d6b4958bcacb981a08f8174e Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 3 May 2018 22:02:18 +0300
Subject: [PATCH 090/393] ipvs: fix stats update from local clients

Local clients are not properly synchronized on 32-bit CPUs when
updating stats (3.10+). Now it is possible estimation_timer (timer),
a stats reader, to interrupt the local client in the middle of
write_seqcount_{begin,end} sequence leading to loop (DEADLOCK).
The same interrupt can happen from received packet (SoftIRQ)
which updates the same per-CPU stats.

Fix it by disabling BH while updating stats.

Found with debug:

WARNING: inconsistent lock state
4.17.0-rc2-00105-g35cb6d7-dirty #2 Not tainted
--------------------------------
inconsistent {IN-SOFTIRQ-R} -> {SOFTIRQ-ON-W} usage.
ftp/2545 [HC0[0]:SC0[0]:HE1:SE1] takes:
86845479 (&syncp->seq#6){+.+-}, at: ip_vs_schedule+0x1c5/0x59e [ip_vs]
{IN-SOFTIRQ-R} state was registered at:
 lock_acquire+0x44/0x5b
 estimation_timer+0x1b3/0x341 [ip_vs]
 call_timer_fn+0x54/0xcd
 run_timer_softirq+0x10c/0x12b
 __do_softirq+0xc1/0x1a9
 do_softirq_own_stack+0x1d/0x23
 irq_exit+0x4a/0x64
 smp_apic_timer_interrupt+0x63/0x71
 apic_timer_interrupt+0x3a/0x40
 default_idle+0xa/0xc
 arch_cpu_idle+0x9/0xb
 default_idle_call+0x21/0x23
 do_idle+0xa0/0x167
 cpu_startup_entry+0x19/0x1b
 start_secondary+0x133/0x182
 startup_32_smp+0x164/0x168
irq event stamp: 42213

other info that might help us debug this:
Possible unsafe locking scenario:

      CPU0
      ----
 lock(&syncp->seq#6);
 <Interrupt>
   lock(&syncp->seq#6);

*** DEADLOCK ***

Fixes: ac69269a45e8 ("ipvs: do not disable bh for long time")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_core.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 5f6f73cf2174..0679dd101e72 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -119,6 +119,8 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 		struct ip_vs_cpu_stats *s;
 		struct ip_vs_service *svc;
 
+		local_bh_disable();
+
 		s = this_cpu_ptr(dest->stats.cpustats);
 		u64_stats_update_begin(&s->syncp);
 		s->cnt.inpkts++;
@@ -137,6 +139,8 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 		s->cnt.inpkts++;
 		s->cnt.inbytes += skb->len;
 		u64_stats_update_end(&s->syncp);
+
+		local_bh_enable();
 	}
 }
 
@@ -151,6 +155,8 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 		struct ip_vs_cpu_stats *s;
 		struct ip_vs_service *svc;
 
+		local_bh_disable();
+
 		s = this_cpu_ptr(dest->stats.cpustats);
 		u64_stats_update_begin(&s->syncp);
 		s->cnt.outpkts++;
@@ -169,6 +175,8 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 		s->cnt.outpkts++;
 		s->cnt.outbytes += skb->len;
 		u64_stats_update_end(&s->syncp);
+
+		local_bh_enable();
 	}
 }
 
@@ -179,6 +187,8 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 	struct netns_ipvs *ipvs = svc->ipvs;
 	struct ip_vs_cpu_stats *s;
 
+	local_bh_disable();
+
 	s = this_cpu_ptr(cp->dest->stats.cpustats);
 	u64_stats_update_begin(&s->syncp);
 	s->cnt.conns++;
@@ -193,6 +203,8 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 	u64_stats_update_begin(&s->syncp);
 	s->cnt.conns++;
 	u64_stats_update_end(&s->syncp);
+
+	local_bh_enable();
 }
 
 

From 25fd386e0bc065849db7400f579e82863ea44838 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 4 May 2018 18:16:06 +0200
Subject: [PATCH 091/393] netfilter: core: add missing __rcu annotation

removes following sparse error:
net/netfilter/core.c:598:30: warning: incorrect type in argument 1 (different address spaces)
net/netfilter/core.c:598:30:    expected struct nf_hook_entries **e
net/netfilter/core.c:598:30:    got struct nf_hook_entries [noderef] <asn:4>**<noident>

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 0f6b8172fb9a..206fb2c4c319 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -585,7 +585,8 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
 EXPORT_SYMBOL(nf_nat_decode_session_hook);
 #endif
 
-static void __net_init __netfilter_net_init(struct nf_hook_entries **e, int max)
+static void __net_init
+__netfilter_net_init(struct nf_hook_entries __rcu **e, int max)
 {
 	int h;
 

From 4e09fc873d92398001e267f7b60c36c963f825b3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 6 May 2018 00:45:43 +0200
Subject: [PATCH 092/393] netfilter: prefer nla_strlcpy for dealing with
 NLA_STRING attributes

fixes these warnings:
'nfnl_cthelper_create' at net/netfilter/nfnetlink_cthelper.c:237:2,
'nfnl_cthelper_new' at net/netfilter/nfnetlink_cthelper.c:450:9:
./include/linux/string.h:246:9: warning: '__builtin_strncpy' specified bound 16 equals destination size [-Wstringop-truncation]
  return __builtin_strncpy(p, q, size);
         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Moreover, strncpy assumes null-terminated source buffers, but thats
not the case here.
Unlike strlcpy, nla_strlcpy *does* pad the destination buffer
while also considering nla attribute size.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_acct.c     | 2 +-
 net/netfilter/nfnetlink_cthelper.c | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index b9505bcd3827..6ddf89183e7b 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -115,7 +115,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
 		nfacct->flags = flags;
 	}
 
-	strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+	nla_strlcpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
 
 	if (tb[NFACCT_BYTES]) {
 		atomic64_set(&nfacct->bytes,
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 4a4b293fb2e5..fa026b269b36 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -149,8 +149,8 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
 	    !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
 		return -EINVAL;
 
-	strncpy(expect_policy->name,
-		nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN);
+	nla_strlcpy(expect_policy->name,
+		    nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN);
 	expect_policy->max_expected =
 		ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
 	if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT)
@@ -234,7 +234,8 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
 	if (ret < 0)
 		goto err1;
 
-	strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
+	nla_strlcpy(helper->name,
+		    nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
 	size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
 	if (size > FIELD_SIZEOF(struct nf_conn_help, data)) {
 		ret = -ENOMEM;

From a44f6d82a471aa52fe218e43105fbe3c458fc5a6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 6 May 2018 00:46:16 +0200
Subject: [PATCH 093/393] netfilter: x_tables: add module alias for icmp
 matches

The icmp matches are implemented in ip_tables and ip6_tables,
respectively, so for normal iptables they are always available:
those modules are loaded once iptables calls getsockopt() to fetch
available module revisions.

In iptables-over-nftables case probing occurs via nfnetlink, so
these modules might not be loaded.  Add aliases so modprobe can load
these when icmp/icmp6 is requested.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ip_tables.c  | 1 +
 net/ipv6/netfilter/ip6_tables.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 44b308d93ec2..e85f35b89c49 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -34,6 +34,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("IPv4 packet filter");
+MODULE_ALIAS("ipt_icmp");
 
 void *ipt_alloc_initial_table(const struct xt_table *info)
 {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 65c9e1a58305..97f79dc943d7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -38,6 +38,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("IPv6 packet filter");
+MODULE_ALIAS("ip6t_icmp6");
 
 void *ip6t_alloc_initial_table(const struct xt_table *info)
 {

From 009240940e84c1c089af88b454f7e804a4c5bd1b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 6 May 2018 00:47:20 +0200
Subject: [PATCH 094/393] netfilter: nf_tables: don't assume chain stats are
 set when jumplabel is set

nft_chain_stats_replace() and all other spots assume ->stats can be
NULL, but nft_update_chain_stats does not.  It must do this check,
just because the jump label is set doesn't mean all basechains have stats
assigned.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_core.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index dfd0bf3810d2..942702a2776f 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -119,15 +119,22 @@ DEFINE_STATIC_KEY_FALSE(nft_counters_enabled);
 static noinline void nft_update_chain_stats(const struct nft_chain *chain,
 					    const struct nft_pktinfo *pkt)
 {
+	struct nft_base_chain *base_chain;
 	struct nft_stats *stats;
 
-	local_bh_disable();
-	stats = this_cpu_ptr(rcu_dereference(nft_base_chain(chain)->stats));
-	u64_stats_update_begin(&stats->syncp);
-	stats->pkts++;
-	stats->bytes += pkt->skb->len;
-	u64_stats_update_end(&stats->syncp);
-	local_bh_enable();
+	base_chain = nft_base_chain(chain);
+	if (!base_chain->stats)
+		return;
+
+	stats = this_cpu_ptr(rcu_dereference(base_chain->stats));
+	if (stats) {
+		local_bh_disable();
+		u64_stats_update_begin(&stats->syncp);
+		stats->pkts++;
+		stats->bytes += pkt->skb->len;
+		u64_stats_update_end(&stats->syncp);
+		local_bh_enable();
+	}
 }
 
 struct nft_jumpstack {

From 4bbaf2584b86b0772413edeac22ff448f36351b1 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.ibm.com>
Date: Thu, 3 May 2018 15:56:15 +0200
Subject: [PATCH 095/393] s390/cpum_sf: ensure sample frequency of perf event
 attributes is non-zero

Correct a trinity finding for the perf_event_open() system call with
a perf event attribute structure that uses a frequency but has the
sampling frequency set to zero.  This causes a FP divide exception during
the sample rate initialization for the hardware sampling facility.

Fixes: 8c069ff4bd606 ("s390/perf: add support for the CPU-Measurement Sampling Facility")
Cc: stable@vger.kernel.org # 3.14+
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Hendrik Brueckner <brueckner@linux.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/perf_cpum_sf.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 1c9ddd7aa5ec..0292d68e7dde 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -753,6 +753,10 @@ static int __hw_perf_event_init(struct perf_event *event)
 	 */
 	rate = 0;
 	if (attr->freq) {
+		if (!attr->sample_freq) {
+			err = -EINVAL;
+			goto out;
+		}
 		rate = freq_to_sample_rate(&si, attr->sample_freq);
 		rate = hw_limit_rate(&si, rate);
 		attr->freq = 0;

From 31c6085562a03124d3f6a5c43dd9888ac44495a5 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 27 Apr 2018 17:06:49 +0300
Subject: [PATCH 096/393] firmware: arm_scmi: Use after free in
 scmi_create_protocol_device()

We need to return here instead of setting up the freed sdev device as a
transport.

Fixes: 907b6d14911d ("firmware: arm_scmi: add per-protocol channels support using idr objects")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/driver.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 14b147135a0c..2455be8cbc4f 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -778,6 +778,7 @@ scmi_create_protocol_device(struct device_node *np, struct scmi_info *info,
 	if (scmi_mbox_chan_setup(info, &sdev->dev, prot_id)) {
 		dev_err(&sdev->dev, "failed to setup transport\n");
 		scmi_device_destroy(sdev);
+		return;
 	}
 
 	/* setup handle now as the transport is ready */

From 322579dcc865b94b47345ad1b6002ad167f85405 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 8 May 2018 14:21:56 -0700
Subject: [PATCH 097/393] libata: Blacklist some Sandisk SSDs for NCQ

Sandisk SSDs SD7SN6S256G and SD8SN8U256G are regularly locking up
regularly under sustained moderate load with NCQ enabled.  Blacklist
for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Dave Jones <davej@codemonkey.org.uk>
Cc: stable@vger.kernel.org
---
 drivers/ata/libata-core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 68596bd4cf06..a2498f5cfb28 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4493,6 +4493,10 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	/* https://bugzilla.kernel.org/show_bug.cgi?id=15573 */
 	{ "C300-CTFDDAC128MAG",	"0001",		ATA_HORKAGE_NONCQ, },
 
+	/* Some Sandisk SSDs lock up hard with NCQ enabled.  Reported on
+	   SD7SN6S256G and SD8SN8U256G */
+	{ "SanDisk SD[78]SN*G",	NULL,		ATA_HORKAGE_NONCQ, },
+
 	/* devices which puke on READ_NATIVE_MAX */
 	{ "HDS724040KLSA80",	"KFAOA20N",	ATA_HORKAGE_BROKEN_HPA, },
 	{ "WDC WD3200JD-00KLB0", "WD-WCAMR1130137", ATA_HORKAGE_BROKEN_HPA },

From 8bdf164744b2c7f63561846c01cff3db597f282d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 7 May 2018 15:22:35 +0200
Subject: [PATCH 098/393] netfilter: nft_compat: prepare for indirect info
 storage

Next patch will make it possible for *info to be stored in
a separate allocation instead of the expr private area.

This removes the 'expr priv area is info blob' assumption
from the match init/destroy/eval functions.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 47 +++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 870d8c29dae9..dec0afb0ffe0 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -324,11 +324,11 @@ static int nft_target_validate(const struct nft_ctx *ctx,
 	return 0;
 }
 
-static void nft_match_eval(const struct nft_expr *expr,
-			   struct nft_regs *regs,
-			   const struct nft_pktinfo *pkt)
+static void __nft_match_eval(const struct nft_expr *expr,
+			     struct nft_regs *regs,
+			     const struct nft_pktinfo *pkt,
+			     void *info)
 {
-	void *info = nft_expr_priv(expr);
 	struct xt_match *match = expr->ops->data;
 	struct sk_buff *skb = pkt->skb;
 	bool ret;
@@ -352,6 +352,13 @@ static void nft_match_eval(const struct nft_expr *expr,
 	}
 }
 
+static void nft_match_eval(const struct nft_expr *expr,
+			   struct nft_regs *regs,
+			   const struct nft_pktinfo *pkt)
+{
+	__nft_match_eval(expr, regs, pkt, nft_expr_priv(expr));
+}
+
 static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
 	[NFTA_MATCH_NAME]	= { .type = NLA_NUL_STRING },
 	[NFTA_MATCH_REV]	= { .type = NLA_U32 },
@@ -412,10 +419,10 @@ static void match_compat_from_user(struct xt_match *m, void *in, void *out)
 }
 
 static int
-nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
-		const struct nlattr * const tb[])
+__nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		 const struct nlattr * const tb[],
+		 void *info)
 {
-	void *info = nft_expr_priv(expr);
 	struct xt_match *match = expr->ops->data;
 	struct xt_mtchk_param par;
 	size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
@@ -444,11 +451,18 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	return 0;
 }
 
+static int
+nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+	       const struct nlattr * const tb[])
+{
+	return __nft_match_init(ctx, expr, tb, nft_expr_priv(expr));
+}
+
 static void
-nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+__nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		    void *info)
 {
 	struct xt_match *match = expr->ops->data;
-	void *info = nft_expr_priv(expr);
 	struct xt_mtdtor_param par;
 
 	par.net = ctx->net;
@@ -462,9 +476,15 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 		module_put(match->me);
 }
 
-static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static void
+nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+	__nft_match_destroy(ctx, expr, nft_expr_priv(expr));
+}
+
+static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr,
+			    void *info)
 {
-	void *info = nft_expr_priv(expr);
 	struct xt_match *match = expr->ops->data;
 
 	if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) ||
@@ -478,6 +498,11 @@ static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	return -1;
 }
 
+static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	return __nft_match_dump(skb, expr, nft_expr_priv(expr));
+}
+
 static int nft_match_validate(const struct nft_ctx *ctx,
 			      const struct nft_expr *expr,
 			      const struct nft_data **data)

From 732a8049f365f514d0607e03938491bf6cb0d620 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 7 May 2018 15:22:36 +0200
Subject: [PATCH 099/393] netfilter: nft_compat: fix handling of large
 matchinfo size

currently matchinfo gets stored in the expression, but some xt matches
are very large.

To handle those we either need to switch nft core to kvmalloc and increase
size limit, or allocate the info blob of large matches separately.

This does the latter, this limits the scope of the changes to
nft_compat.

I picked a threshold of 192, this allows most matches to work as before and
handle only few ones via separate alloation (cgroup, u32, sctp, rt).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 64 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index dec0afb0ffe0..1d99a1efdafc 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -36,6 +36,13 @@ struct nft_xt {
 	struct rcu_head		rcu_head;
 };
 
+/* Used for matches where *info is larger than X byte */
+#define NFT_MATCH_LARGE_THRESH	192
+
+struct nft_xt_match_priv {
+	void *info;
+};
+
 static bool nft_xt_put(struct nft_xt *xt)
 {
 	if (--xt->refcnt == 0) {
@@ -352,6 +359,15 @@ static void __nft_match_eval(const struct nft_expr *expr,
 	}
 }
 
+static void nft_match_large_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+
+	__nft_match_eval(expr, regs, pkt, priv->info);
+}
+
 static void nft_match_eval(const struct nft_expr *expr,
 			   struct nft_regs *regs,
 			   const struct nft_pktinfo *pkt)
@@ -458,6 +474,24 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	return __nft_match_init(ctx, expr, tb, nft_expr_priv(expr));
 }
 
+static int
+nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		     const struct nlattr * const tb[])
+{
+	struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+	struct xt_match *m = expr->ops->data;
+	int ret;
+
+	priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL);
+	if (!priv->info)
+		return -ENOMEM;
+
+	ret = __nft_match_init(ctx, expr, tb, priv->info);
+	if (ret)
+		kfree(priv->info);
+	return ret;
+}
+
 static void
 __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		    void *info)
@@ -482,6 +516,15 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	__nft_match_destroy(ctx, expr, nft_expr_priv(expr));
 }
 
+static void
+nft_match_large_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+	struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+
+	__nft_match_destroy(ctx, expr, priv->info);
+	kfree(priv->info);
+}
+
 static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr,
 			    void *info)
 {
@@ -503,6 +546,13 @@ static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	return __nft_match_dump(skb, expr, nft_expr_priv(expr));
 }
 
+static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e)
+{
+	struct nft_xt_match_priv *priv = nft_expr_priv(e);
+
+	return __nft_match_dump(skb, e, priv->info);
+}
+
 static int nft_match_validate(const struct nft_ctx *ctx,
 			      const struct nft_expr *expr,
 			      const struct nft_data **data)
@@ -670,6 +720,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 {
 	struct nft_xt *nft_match;
 	struct xt_match *match;
+	unsigned int matchsize;
 	char *mt_name;
 	u32 rev, family;
 	int err;
@@ -709,7 +760,6 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 
 	nft_match->refcnt = 0;
 	nft_match->ops.type = &nft_match_type;
-	nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
 	nft_match->ops.eval = nft_match_eval;
 	nft_match->ops.init = nft_match_init;
 	nft_match->ops.destroy = nft_match_destroy;
@@ -717,6 +767,18 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	nft_match->ops.validate = nft_match_validate;
 	nft_match->ops.data = match;
 
+	matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
+	if (matchsize > NFT_MATCH_LARGE_THRESH) {
+		matchsize = NFT_EXPR_SIZE(sizeof(struct nft_xt_match_priv));
+
+		nft_match->ops.eval = nft_match_large_eval;
+		nft_match->ops.init = nft_match_large_init;
+		nft_match->ops.destroy = nft_match_large_destroy;
+		nft_match->ops.dump = nft_match_large_dump;
+	}
+
+	nft_match->ops.size = matchsize;
+
 	list_add(&nft_match->head, &nft_match_list);
 
 	return &nft_match->ops;

From bb7b40aecbf778c0c83a5bd62b0f03ca9f49a618 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 8 May 2018 02:43:57 +0200
Subject: [PATCH 100/393] netfilter: nf_tables: bogus EBUSY in chain deletions

When removing a rule that jumps to chain and such chain in the same
batch, this bogusly hits EBUSY. Add activate and deactivate operations
to expression that can be called from the preparation and the
commit/abort phases.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  5 ++++
 net/netfilter/nf_tables_api.c     | 46 ++++++++++++++++++++++++++++---
 net/netfilter/nft_immediate.c     | 15 ++++++++--
 3 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index cd368d1b8cb8..a1e28dd5d0bf 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -170,6 +170,7 @@ struct nft_data_desc {
 int nft_data_init(const struct nft_ctx *ctx,
 		  struct nft_data *data, unsigned int size,
 		  struct nft_data_desc *desc, const struct nlattr *nla);
+void nft_data_hold(const struct nft_data *data, enum nft_data_types type);
 void nft_data_release(const struct nft_data *data, enum nft_data_types type);
 int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
 		  enum nft_data_types type, unsigned int len);
@@ -736,6 +737,10 @@ struct nft_expr_ops {
 	int				(*init)(const struct nft_ctx *ctx,
 						const struct nft_expr *expr,
 						const struct nlattr * const tb[]);
+	void				(*activate)(const struct nft_ctx *ctx,
+						    const struct nft_expr *expr);
+	void				(*deactivate)(const struct nft_ctx *ctx,
+						      const struct nft_expr *expr);
 	void				(*destroy)(const struct nft_ctx *ctx,
 						   const struct nft_expr *expr);
 	int				(*dump)(struct sk_buff *skb,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 785d7fcf1fe1..3806db31cbbf 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -214,6 +214,34 @@ static int nft_delchain(struct nft_ctx *ctx)
 	return err;
 }
 
+static void nft_rule_expr_activate(const struct nft_ctx *ctx,
+				   struct nft_rule *rule)
+{
+	struct nft_expr *expr;
+
+	expr = nft_expr_first(rule);
+	while (expr != nft_expr_last(rule) && expr->ops) {
+		if (expr->ops->activate)
+			expr->ops->activate(ctx, expr);
+
+		expr = nft_expr_next(expr);
+	}
+}
+
+static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
+				     struct nft_rule *rule)
+{
+	struct nft_expr *expr;
+
+	expr = nft_expr_first(rule);
+	while (expr != nft_expr_last(rule) && expr->ops) {
+		if (expr->ops->deactivate)
+			expr->ops->deactivate(ctx, expr);
+
+		expr = nft_expr_next(expr);
+	}
+}
+
 static int
 nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
 {
@@ -259,6 +287,7 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
 		nft_trans_destroy(trans);
 		return err;
 	}
+	nft_rule_expr_deactivate(ctx, rule);
 
 	return 0;
 }
@@ -2238,6 +2267,13 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
 	kfree(rule);
 }
 
+static void nf_tables_rule_release(const struct nft_ctx *ctx,
+				   struct nft_rule *rule)
+{
+	nft_rule_expr_deactivate(ctx, rule);
+	nf_tables_rule_destroy(ctx, rule);
+}
+
 #define NFT_RULE_MAXEXPRS	128
 
 static struct nft_expr_info *info;
@@ -2402,7 +2438,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	return 0;
 
 err2:
-	nf_tables_rule_destroy(&ctx, rule);
+	nf_tables_rule_release(&ctx, rule);
 err1:
 	for (i = 0; i < n; i++) {
 		if (info[i].ops != NULL)
@@ -4130,7 +4166,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
  *	NFT_GOTO verdicts. This function must be called on active data objects
  *	from the second phase of the commit protocol.
  */
-static void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
+void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
 {
 	if (type == NFT_DATA_VERDICT) {
 		switch (data->verdict.code) {
@@ -6015,10 +6051,12 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWRULE:
 			trans->ctx.chain->use--;
 			list_del_rcu(&nft_trans_rule(trans)->list);
+			nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans));
 			break;
 		case NFT_MSG_DELRULE:
 			trans->ctx.chain->use++;
 			nft_clear(trans->ctx.net, nft_trans_rule(trans));
+			nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans));
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWSET:
@@ -6594,7 +6632,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
 	list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
 		list_del(&rule->list);
 		ctx->chain->use--;
-		nf_tables_rule_destroy(ctx, rule);
+		nf_tables_rule_release(ctx, rule);
 	}
 	list_del(&ctx->chain->list);
 	ctx->table->use--;
@@ -6632,7 +6670,7 @@ static void __nft_release_tables(struct net *net)
 			list_for_each_entry_safe(rule, nr, &chain->rules, list) {
 				list_del(&rule->list);
 				chain->use--;
-				nf_tables_rule_destroy(&ctx, rule);
+				nf_tables_rule_release(&ctx, rule);
 			}
 		}
 		list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 4717d7796927..aa87ff8beae8 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -69,8 +69,16 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
 	return err;
 }
 
-static void nft_immediate_destroy(const struct nft_ctx *ctx,
-				  const struct nft_expr *expr)
+static void nft_immediate_activate(const struct nft_ctx *ctx,
+				   const struct nft_expr *expr)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+	return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg));
+}
+
+static void nft_immediate_deactivate(const struct nft_ctx *ctx,
+				     const struct nft_expr *expr)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
 
@@ -108,7 +116,8 @@ static const struct nft_expr_ops nft_imm_ops = {
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
 	.eval		= nft_immediate_eval,
 	.init		= nft_immediate_init,
-	.destroy	= nft_immediate_destroy,
+	.activate	= nft_immediate_activate,
+	.deactivate	= nft_immediate_deactivate,
 	.dump		= nft_immediate_dump,
 	.validate	= nft_immediate_validate,
 };

From 9533b292a7acc62c294ebcbd9e1f9f9d52adb10b Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Thu, 3 May 2018 20:29:19 -0700
Subject: [PATCH 101/393] IB: remove redundant INFINIBAND kconfig dependencies

INFINIBAND_ADDR_TRANS depends on INFINIBAND.  So there's no need for
options which depend INFINIBAND_ADDR_TRANS to also depend on INFINIBAND.
Remove the unnecessary INFINIBAND depends.

Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/ulp/srpt/Kconfig | 2 +-
 drivers/nvme/host/Kconfig           | 2 +-
 drivers/nvme/target/Kconfig         | 2 +-
 drivers/staging/lustre/lnet/Kconfig | 2 +-
 fs/cifs/Kconfig                     | 2 +-
 net/9p/Kconfig                      | 2 +-
 net/rds/Kconfig                     | 2 +-
 net/sunrpc/Kconfig                  | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/ulp/srpt/Kconfig b/drivers/infiniband/ulp/srpt/Kconfig
index fb8b7182f05e..25bf6955b6d0 100644
--- a/drivers/infiniband/ulp/srpt/Kconfig
+++ b/drivers/infiniband/ulp/srpt/Kconfig
@@ -1,6 +1,6 @@
 config INFINIBAND_SRPT
 	tristate "InfiniBand SCSI RDMA Protocol target support"
-	depends on INFINIBAND && INFINIBAND_ADDR_TRANS && TARGET_CORE
+	depends on INFINIBAND_ADDR_TRANS && TARGET_CORE
 	---help---
 
 	  Support for the SCSI RDMA Protocol (SRP) Target driver. The
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 88a8b5916624..dbb7464c018c 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -27,7 +27,7 @@ config NVME_FABRICS
 
 config NVME_RDMA
 	tristate "NVM Express over Fabrics RDMA host driver"
-	depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK
+	depends on INFINIBAND_ADDR_TRANS && BLOCK
 	select NVME_CORE
 	select NVME_FABRICS
 	select SG_POOL
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 3c7b61ddb0d1..7595664ee753 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -27,7 +27,7 @@ config NVME_TARGET_LOOP
 
 config NVME_TARGET_RDMA
 	tristate "NVMe over Fabrics RDMA target support"
-	depends on INFINIBAND && INFINIBAND_ADDR_TRANS
+	depends on INFINIBAND_ADDR_TRANS
 	depends on NVME_TARGET
 	select SGL_ALLOC
 	help
diff --git a/drivers/staging/lustre/lnet/Kconfig b/drivers/staging/lustre/lnet/Kconfig
index ad049e6f24e4..f3b1ad4bd3dc 100644
--- a/drivers/staging/lustre/lnet/Kconfig
+++ b/drivers/staging/lustre/lnet/Kconfig
@@ -34,7 +34,7 @@ config LNET_SELFTEST
 
 config LNET_XPRT_IB
 	tristate "LNET infiniband support"
-	depends on LNET && PCI && INFINIBAND && INFINIBAND_ADDR_TRANS
+	depends on LNET && PCI && INFINIBAND_ADDR_TRANS
 	default LNET && INFINIBAND
 	help
 	  This option allows the LNET users to use infiniband as an
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 5f132d59dfc2..d61e2de8d0eb 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -197,7 +197,7 @@ config CIFS_SMB311
 
 config CIFS_SMB_DIRECT
 	bool "SMB Direct support (Experimental)"
-	depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
+	depends on CIFS=m && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND_ADDR_TRANS=y
 	help
 	  Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1.
 	  SMB Direct allows transferring SMB packets over RDMA. If unsure,
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
index e6014e0e51f7..46c39f7da444 100644
--- a/net/9p/Kconfig
+++ b/net/9p/Kconfig
@@ -32,7 +32,7 @@ config NET_9P_XEN
 
 
 config NET_9P_RDMA
-	depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS
+	depends on INET && INFINIBAND_ADDR_TRANS
 	tristate "9P RDMA Transport (Experimental)"
 	help
 	  This builds support for an RDMA transport.
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index bffde4b46c5d..1a31502ee7db 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -8,7 +8,7 @@ config RDS
 
 config RDS_RDMA
 	tristate "RDS over Infiniband"
-	depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
+	depends on RDS && INFINIBAND_ADDR_TRANS
 	---help---
 	  Allow RDS to use Infiniband as a transport.
 	  This transport supports RDMA operations.
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index ac09ca803296..6358e5271070 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -50,7 +50,7 @@ config SUNRPC_DEBUG
 
 config SUNRPC_XPRT_RDMA
 	tristate "RPC-over-RDMA transport"
-	depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
+	depends on SUNRPC && INFINIBAND_ADDR_TRANS
 	default SUNRPC && INFINIBAND
 	select SG_POOL
 	help

From 27f70620faf02a897123eb36b9e614e8d540f751 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Thu, 3 May 2018 21:37:46 +0300
Subject: [PATCH 102/393] MAINTAINERS: Remove bouncing @mellanox.com addresses

Delete non-existent @mellanox.com addresses from MAINTAINERS file.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 MAINTAINERS | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 0a1410d5a621..1b52c7e7fc7d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5391,7 +5391,6 @@ S:	Maintained
 F:	drivers/iommu/exynos-iommu.c
 
 EZchip NPS platform support
-M:	Elad Kanfi <eladkan@mellanox.com>
 M:	Vineet Gupta <vgupta@synopsys.com>
 S:	Supported
 F:	arch/arc/plat-eznps
@@ -9012,7 +9011,6 @@ Q:	http://patchwork.ozlabs.org/project/netdev/list/
 F:	drivers/net/ethernet/mellanox/mlx5/core/en_*
 
 MELLANOX ETHERNET INNOVA DRIVER
-M:	Ilan Tayari <ilant@mellanox.com>
 R:	Boris Pismenny <borisp@mellanox.com>
 L:	netdev@vger.kernel.org
 S:	Supported
@@ -9022,7 +9020,6 @@ F:	drivers/net/ethernet/mellanox/mlx5/core/fpga/*
 F:	include/linux/mlx5/mlx5_ifc_fpga.h
 
 MELLANOX ETHERNET INNOVA IPSEC DRIVER
-M:	Ilan Tayari <ilant@mellanox.com>
 R:	Boris Pismenny <borisp@mellanox.com>
 L:	netdev@vger.kernel.org
 S:	Supported
@@ -9078,7 +9075,6 @@ F:	include/uapi/rdma/mlx4-abi.h
 
 MELLANOX MLX5 core VPI driver
 M:	Saeed Mahameed <saeedm@mellanox.com>
-M:	Matan Barak <matanb@mellanox.com>
 M:	Leon Romanovsky <leonro@mellanox.com>
 L:	netdev@vger.kernel.org
 L:	linux-rdma@vger.kernel.org
@@ -9089,7 +9085,6 @@ F:	drivers/net/ethernet/mellanox/mlx5/core/
 F:	include/linux/mlx5/
 
 MELLANOX MLX5 IB driver
-M:	Matan Barak <matanb@mellanox.com>
 M:	Leon Romanovsky <leonro@mellanox.com>
 L:	linux-rdma@vger.kernel.org
 W:	http://www.mellanox.com
@@ -9821,7 +9816,6 @@ F:	net/netfilter/xt_CONNSECMARK.c
 F:	net/netfilter/xt_SECMARK.c
 
 NETWORKING [TLS]
-M:	Ilya Lesokhin <ilyal@mellanox.com>
 M:	Aviad Yehezkel <aviadye@mellanox.com>
 M:	Dave Watson <davejwatson@fb.com>
 L:	netdev@vger.kernel.org

From f9e76ca3771bf23d2142a81a88ddd8f31f5c4c03 Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>
Date: Wed, 2 May 2018 06:42:51 -0700
Subject: [PATCH 103/393] IB/hfi1: Use after free race condition in send
 context error path

A pio send egress error can occur when the PSM library attempts to
to send a bad packet.  That issue is still being investigated.

The pio error interrupt handler then attempts to progress the recovery
of the errored pio send context.

Code inspection reveals that the handling lacks the necessary locking
if that recovery interleaves with a PSM close of the "context" object
contains the pio send context.

The lack of the locking can cause the recovery to access the already
freed pio send context object and incorrectly deduce that the pio
send context is actually a kernel pio send context as shown by the
NULL deref stack below:

[<ffffffff8143d78c>] _dev_info+0x6c/0x90
[<ffffffffc0613230>] sc_restart+0x70/0x1f0 [hfi1]
[<ffffffff816ab124>] ? __schedule+0x424/0x9b0
[<ffffffffc06133c5>] sc_halted+0x15/0x20 [hfi1]
[<ffffffff810aa3ba>] process_one_work+0x17a/0x440
[<ffffffff810ab086>] worker_thread+0x126/0x3c0
[<ffffffff810aaf60>] ? manage_workers.isra.24+0x2a0/0x2a0
[<ffffffff810b252f>] kthread+0xcf/0xe0
[<ffffffff810b2460>] ? insert_kthread_work+0x40/0x40
[<ffffffff816b8798>] ret_from_fork+0x58/0x90
[<ffffffff810b2460>] ? insert_kthread_work+0x40/0x40

This is the best case scenario and other scenarios can corrupt the
already freed memory.

Fix by adding the necessary locking in the pio send context error
handler.

Cc: <stable@vger.kernel.org> # 4.9.x
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hfi1/chip.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index e6a60fa59f2b..e6bdd0c1e80a 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -5944,6 +5944,7 @@ static void is_sendctxt_err_int(struct hfi1_devdata *dd,
 	u64 status;
 	u32 sw_index;
 	int i = 0;
+	unsigned long irq_flags;
 
 	sw_index = dd->hw_to_sw[hw_context];
 	if (sw_index >= dd->num_send_contexts) {
@@ -5953,10 +5954,12 @@ static void is_sendctxt_err_int(struct hfi1_devdata *dd,
 		return;
 	}
 	sci = &dd->send_contexts[sw_index];
+	spin_lock_irqsave(&dd->sc_lock, irq_flags);
 	sc = sci->sc;
 	if (!sc) {
 		dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
 			   sw_index, hw_context);
+		spin_unlock_irqrestore(&dd->sc_lock, irq_flags);
 		return;
 	}
 
@@ -5978,6 +5981,7 @@ static void is_sendctxt_err_int(struct hfi1_devdata *dd,
 	 */
 	if (sc->type != SC_USER)
 		queue_work(dd->pport->hfi1_wq, &sc->halt_work);
+	spin_unlock_irqrestore(&dd->sc_lock, irq_flags);
 
 	/*
 	 * Update the counters for the corresponding status bits.

From 18b0362e87dfa09e355093b897b9db854e360d28 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Mon, 7 May 2018 10:20:01 +0300
Subject: [PATCH 104/393] RDMA/mlx5: Don't assume that medium blueFlame
 register exists

User can leave system without medium BlueFlames registers,
however the code assumed that at least one such register exists.

This patch fixes that assumption.

Fixes: c1be5232d21d ("IB/mlx5: Fix micro UAR allocator")
Reported-by: Rohit Zambre <rzambre@uci.edu>
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/qp.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 87b7c1be2a11..2193dc1765fb 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -484,11 +484,6 @@ static int qp_has_rq(struct ib_qp_init_attr *attr)
 	return 1;
 }
 
-static int first_med_bfreg(void)
-{
-	return 1;
-}
-
 enum {
 	/* this is the first blue flame register in the array of bfregs assigned
 	 * to a processes. Since we do not use it for blue flame but rather
@@ -514,6 +509,12 @@ static int num_med_bfreg(struct mlx5_ib_dev *dev,
 	return n >= 0 ? n : 0;
 }
 
+static int first_med_bfreg(struct mlx5_ib_dev *dev,
+			   struct mlx5_bfreg_info *bfregi)
+{
+	return num_med_bfreg(dev, bfregi) ? 1 : -ENOMEM;
+}
+
 static int first_hi_bfreg(struct mlx5_ib_dev *dev,
 			  struct mlx5_bfreg_info *bfregi)
 {
@@ -541,10 +542,13 @@ static int alloc_high_class_bfreg(struct mlx5_ib_dev *dev,
 static int alloc_med_class_bfreg(struct mlx5_ib_dev *dev,
 				 struct mlx5_bfreg_info *bfregi)
 {
-	int minidx = first_med_bfreg();
+	int minidx = first_med_bfreg(dev, bfregi);
 	int i;
 
-	for (i = first_med_bfreg(); i < first_hi_bfreg(dev, bfregi); i++) {
+	if (minidx < 0)
+		return minidx;
+
+	for (i = minidx; i < first_hi_bfreg(dev, bfregi); i++) {
 		if (bfregi->count[i] < bfregi->count[minidx])
 			minidx = i;
 		if (!bfregi->count[minidx])

From 37da2a03c036538a5a79766d74bfcf5b873e5cad Mon Sep 17 00:00:00 2001
From: Daria Velikovsky <daria@mellanox.com>
Date: Mon, 7 May 2018 10:20:02 +0300
Subject: [PATCH 105/393] RDMA/mlx5: Use proper spec flow label type

Flow label is defined as u32 in the in ipv6 flow spec, but
used internally in the flow specs parsing as u8. That was
causing loss of part of flow_label value.

Fixes: 2d1e697e9b716 ('IB/mlx5: Add support to match inner packet fields')
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Daria Velikovsky <daria@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index a42c6b1cdb5a..cd1c342be6e3 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2416,7 +2416,7 @@ static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
 	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
 }
 
-static void set_flow_label(void *misc_c, void *misc_v, u8 mask, u8 val,
+static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
 			   bool inner)
 {
 	if (inner) {

From 5a7189d529cd146cd5838af97b32fcac4122b471 Mon Sep 17 00:00:00 2001
From: Mustafa Ismail <mustafa.ismail@intel.com>
Date: Mon, 7 May 2018 12:52:17 -0500
Subject: [PATCH 106/393] i40iw: Fix memory leak in error path of create QP

If i40iw_allocate_dma_mem fails when creating a QP, the
memory allocated for the QP structure using kzalloc is not
freed because iwqp->allocated_buffer is used to free the
memory and it is not setup until later. Fix this by setting
iwqp->allocated_buffer before allocating the dma memory.

Fixes: d37498417947 ("i40iw: add files for iwarp interface")
Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 40e4f5ab2b46..3ebe91438523 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -614,6 +614,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
 		return ERR_PTR(-ENOMEM);
 
 	iwqp = (struct i40iw_qp *)mem;
+	iwqp->allocated_buffer = mem;
 	qp = &iwqp->sc_qp;
 	qp->back_qp = (void *)iwqp;
 	qp->push_idx = I40IW_INVALID_PUSH_PAGE_INDEX;
@@ -642,7 +643,6 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
 		goto error;
 	}
 
-	iwqp->allocated_buffer = mem;
 	iwqp->iwdev = iwdev;
 	iwqp->iwpd = iwpd;
 	iwqp->ibqp.qp_num = qp_num;

From eeb1af4f53fa74fd41d288b113eebcdfca4d311c Mon Sep 17 00:00:00 2001
From: Mustafa Ismail <mustafa.ismail@intel.com>
Date: Mon, 7 May 2018 12:52:18 -0500
Subject: [PATCH 107/393] i40iw: Use correct address in dst_neigh_lookup for
 IPv6

Use of incorrect structure address for IPv6 neighbor lookup
causes connections to IPv6 addresses to fail. Fix this by
using correct address in call to dst_neigh_lookup.

Fixes: f27b4746f378 ("i40iw: add connection management code")
Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/i40iw/i40iw_cm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 4cfa8f4647e2..f7c6fd9ff6e2 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -2093,7 +2093,7 @@ static int i40iw_addr_resolve_neigh_ipv6(struct i40iw_device *iwdev,
 	if (netif_is_bond_slave(netdev))
 		netdev = netdev_master_upper_dev_get(netdev);
 
-	neigh = dst_neigh_lookup(dst, &dst_addr);
+	neigh = dst_neigh_lookup(dst, dst_addr.sin6_addr.in6_u.u6_addr32);
 
 	rcu_read_lock();
 	if (neigh) {

From 1661d3b0e2183ce90f6611641c350a5aa02aaa80 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Tue, 8 May 2018 11:02:02 +0200
Subject: [PATCH 108/393] nvmet,rxe: defer ip datagram sending to tasklet

This addresses 3 separate problems:

1. When using NVME over Fabrics we may end up sending IP
packets in interrupt context, we should defer this work
to a tasklet.

[   50.939957] WARNING: CPU: 3 PID: 0 at kernel/softirq.c:161 __local_bh_enable_ip+0x1f/0xa0
[   50.942602] CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G        W         4.17.0-rc3-ARCH+ #104
[   50.945466] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014
[   50.948163] RIP: 0010:__local_bh_enable_ip+0x1f/0xa0
[   50.949631] RSP: 0018:ffff88009c183900 EFLAGS: 00010006
[   50.951029] RAX: 0000000080010403 RBX: 0000000000000200 RCX: 0000000000000001
[   50.952636] RDX: 0000000000000000 RSI: 0000000000000200 RDI: ffffffff817e04ec
[   50.954278] RBP: ffff88009c183910 R08: 0000000000000001 R09: 0000000000000614
[   50.956000] R10: ffffea00021d5500 R11: 0000000000000001 R12: ffffffff817e04ec
[   50.957779] R13: 0000000000000000 R14: ffff88009566f400 R15: ffff8800956c7000
[   50.959402] FS:  0000000000000000(0000) GS:ffff88009c180000(0000) knlGS:0000000000000000
[   50.961552] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   50.963798] CR2: 000055c4ec0ccac0 CR3: 0000000002209001 CR4: 00000000000606e0
[   50.966121] Call Trace:
[   50.966845]  <IRQ>
[   50.967497]  __dev_queue_xmit+0x62d/0x690
[   50.968722]  dev_queue_xmit+0x10/0x20
[   50.969894]  neigh_resolve_output+0x173/0x190
[   50.971244]  ip_finish_output2+0x2b8/0x370
[   50.972527]  ip_finish_output+0x1d2/0x220
[   50.973785]  ? ip_finish_output+0x1d2/0x220
[   50.975010]  ip_output+0xd4/0x100
[   50.975903]  ip_local_out+0x3b/0x50
[   50.976823]  rxe_send+0x74/0x120
[   50.977702]  rxe_requester+0xe3b/0x10b0
[   50.978881]  ? ip_local_deliver_finish+0xd1/0xe0
[   50.980260]  rxe_do_task+0x85/0x100
[   50.981386]  rxe_run_task+0x2f/0x40
[   50.982470]  rxe_post_send+0x51a/0x550
[   50.983591]  nvmet_rdma_queue_response+0x10a/0x170
[   50.985024]  __nvmet_req_complete+0x95/0xa0
[   50.986287]  nvmet_req_complete+0x15/0x60
[   50.987469]  nvmet_bio_done+0x2d/0x40
[   50.988564]  bio_endio+0x12c/0x140
[   50.989654]  blk_update_request+0x185/0x2a0
[   50.990947]  blk_mq_end_request+0x1e/0x80
[   50.991997]  nvme_complete_rq+0x1cc/0x1e0
[   50.993171]  nvme_pci_complete_rq+0x117/0x120
[   50.994355]  __blk_mq_complete_request+0x15e/0x180
[   50.995988]  blk_mq_complete_request+0x6f/0xa0
[   50.997304]  nvme_process_cq+0xe0/0x1b0
[   50.998494]  nvme_irq+0x28/0x50
[   50.999572]  __handle_irq_event_percpu+0xa2/0x1c0
[   51.000986]  handle_irq_event_percpu+0x32/0x80
[   51.002356]  handle_irq_event+0x3c/0x60
[   51.003463]  handle_edge_irq+0x1c9/0x200
[   51.004473]  handle_irq+0x23/0x30
[   51.005363]  do_IRQ+0x46/0xd0
[   51.006182]  common_interrupt+0xf/0xf
[   51.007129]  </IRQ>

2. Work must always be offloaded to tasklet for rxe_post_send_kernel()
when using NVMEoF in order to solve lock ordering between neigh->ha_lock
seqlock and the nvme queue lock:

[   77.833783]  Possible interrupt unsafe locking scenario:
[   77.833783]
[   77.835831]        CPU0                    CPU1
[   77.837129]        ----                    ----
[   77.838313]   lock(&(&n->ha_lock)->seqcount);
[   77.839550]                                local_irq_disable();
[   77.841377]                                lock(&(&nvmeq->q_lock)->rlock);
[   77.843222]                                lock(&(&n->ha_lock)->seqcount);
[   77.845178]   <Interrupt>
[   77.846298]     lock(&(&nvmeq->q_lock)->rlock);
[   77.847986]
[   77.847986]  *** DEADLOCK ***

3. Same goes for the lock ordering between sch->q.lock and nvme queue lock:

[   47.634271]  Possible interrupt unsafe locking scenario:
[   47.634271]
[   47.636452]        CPU0                    CPU1
[   47.637861]        ----                    ----
[   47.639285]   lock(&(&sch->q.lock)->rlock);
[   47.640654]                                local_irq_disable();
[   47.642451]                                lock(&(&nvmeq->q_lock)->rlock);
[   47.644521]                                lock(&(&sch->q.lock)->rlock);
[   47.646480]   <Interrupt>
[   47.647263]     lock(&(&nvmeq->q_lock)->rlock);
[   47.648492]
[   47.648492]  *** DEADLOCK ***

Using NVMEoF after this patch seems to finally be stable, without it,
rxe eventually deadlocks the whole system and causes RCU stalls.

Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/sw/rxe/rxe_verbs.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 2cb52fd48cf1..73a00a1c06f6 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -761,7 +761,6 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, struct ib_send_wr *wr,
 	unsigned int mask;
 	unsigned int length = 0;
 	int i;
-	int must_sched;
 
 	while (wr) {
 		mask = wr_opcode_mask(wr->opcode, qp);
@@ -791,14 +790,7 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, struct ib_send_wr *wr,
 		wr = wr->next;
 	}
 
-	/*
-	 * Must sched in case of GSI QP because ib_send_mad() hold irq lock,
-	 * and the requester call ip_local_out_sk() that takes spin_lock_bh.
-	 */
-	must_sched = (qp_type(qp) == IB_QPT_GSI) ||
-			(queue_count(qp->sq.queue) > 1);
-
-	rxe_run_task(&qp->req.task, must_sched);
+	rxe_run_task(&qp->req.task, 1);
 	if (unlikely(qp->req.state == QP_STATE_ERROR))
 		rxe_run_task(&qp->comp.task, 1);
 

From ecaaf1e26a37ddf7cba4e425cf68ae7ce1869bc0 Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:03 +0800
Subject: [PATCH 109/393] RDMA/hns: Add rq inline flags judgement

It needs to set the rqie field of qp context by configured rq inline
flags. Besides, it need to decide whether posting inline rqwqe by
judged rq inline flags.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 25916e8522ed..162d42c6530a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -552,11 +552,15 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 		}
 
 		/* rq support inline data */
-		sge_list = hr_qp->rq_inl_buf.wqe_list[ind].sg_list;
-		hr_qp->rq_inl_buf.wqe_list[ind].sge_cnt = (u32)wr->num_sge;
-		for (i = 0; i < wr->num_sge; i++) {
-			sge_list[i].addr = (void *)(u64)wr->sg_list[i].addr;
-			sge_list[i].len = wr->sg_list[i].length;
+		if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+			sge_list = hr_qp->rq_inl_buf.wqe_list[ind].sg_list;
+			hr_qp->rq_inl_buf.wqe_list[ind].sge_cnt =
+							       (u32)wr->num_sge;
+			for (i = 0; i < wr->num_sge; i++) {
+				sge_list[i].addr =
+					       (void *)(u64)wr->sg_list[i].addr;
+				sge_list[i].len = wr->sg_list[i].length;
+			}
 		}
 
 		hr_qp->rq.wrid[ind] = wr->wr_id;
@@ -2169,6 +2173,7 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 				    struct hns_roce_v2_qp_context *context,
 				    struct hns_roce_v2_qp_context *qpc_mask)
 {
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 
 	/*
@@ -2281,7 +2286,8 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	context->rq_db_record_addr = hr_qp->rdb.dma >> 32;
 	qpc_mask->rq_db_record_addr = 0;
 
-	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 1);
+	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S,
+		    (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) ? 1 : 0);
 	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0);
 
 	roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M,

From 3a39bbecc88fa9a99a80de588c8f1fe16aba3446 Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:04 +0800
Subject: [PATCH 110/393] RDMA/hns: Bugfix for rq record db for kernel

When used rq record db for kernel, it needs to set the rdb_en of
hr_qp to 1 and configures the dma address of record rq db of qp
context.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_qp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index d4aad34c21e2..cdfb774e28ff 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -660,6 +660,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 				goto err_rq_sge_list;
 			}
 			*hr_qp->rdb.db_record = 0;
+			hr_qp->rdb_en = 1;
 		}
 
 		/* Allocate QP buf */

From f97a62c39423575c62649721657e5cc04f67c0ac Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:05 +0800
Subject: [PATCH 111/393] RDMA/hns: Load the RoCE dirver automatically

To enable the linux-kernel system to load the hns-roce-hw-v2 driver
automatically when hns-roce-hw-v2 is plugged in pci bus, it need to
create a MODULE_DEVICE_TABLE for expose the pci_table of
hns-roce-hw-v2 to user.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Reported-by: Zhou Wang <wangzhou1@hisilicon.com>
Tested-by: Xiaojun Tan <tanxiaojun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 162d42c6530a..50f55bd2c75f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -4709,6 +4709,8 @@ static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = {
 	{0, }
 };
 
+MODULE_DEVICE_TABLE(pci, hns_roce_hw_v2_pci_tbl);
+
 static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
 				  struct hnae3_handle *handle)
 {

From ad18e20ba2887e221e903d311f4c9a1586eacffb Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:06 +0800
Subject: [PATCH 112/393] RDMA/hns: Update convert function of endian format

Because the sys_image_guid of ib_device_attr structure is __be64, it
need to use cpu_to_be64 for converting.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 9d48bc07a9e6..96fb6a9ed93c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -199,7 +199,7 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
 
 	memset(props, 0, sizeof(*props));
 
-	props->sys_image_guid = cpu_to_be32(hr_dev->sys_image_guid);
+	props->sys_image_guid = cpu_to_be64(hr_dev->sys_image_guid);
 	props->max_mr_size = (u64)(~(0ULL));
 	props->page_size_cap = hr_dev->caps.page_size_cap;
 	props->vendor_id = hr_dev->vendor_id;

From 2349fdd483ea933b223f3eca53ed42835383d316 Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:07 +0800
Subject: [PATCH 113/393] RDMA/hns: Add return operation when configured global
 param fail

When configure global param function run fail, it should directly return
and the initial flow will stop.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 50f55bd2c75f..539b0caacb9f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1085,6 +1085,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 	if (ret) {
 		dev_err(hr_dev->dev, "Configure global param fail, ret = %d.\n",
 			ret);
+		return ret;
 	}
 
 	/* Get pf resource owned by every pf */

From 391bd5fc7de54a5cb866e8897d60ee1d76b8840a Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:08 +0800
Subject: [PATCH 114/393] RDMA/hns: Not support qp transition from reset to
 reset for hip06

Because hip06 hardware is not support for qp transition from
reset to reset state, it need to return errno when qp
transited from reset to reset. This patch fixes it.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_qp.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index cdfb774e28ff..baaf906f7c2e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -956,7 +956,14 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	}
 
 	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
-		ret = 0;
+		if (hr_dev->caps.min_wqes) {
+			ret = -EPERM;
+			dev_err(dev, "cur_state=%d new_state=%d\n", cur_state,
+				new_state);
+		} else {
+			ret = 0;
+		}
+
 		goto out;
 	}
 

From 778cc5a8b75eee62d330059a2655b515cda43278 Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:09 +0800
Subject: [PATCH 115/393] RDMA/hns: Fix the bug with rq sge

When received multiply rq sge, it should tag the
invalid lkey for the last non-zero length sge
when have some sges' length are zero. This patch
fixes it.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 539b0caacb9f..a40978bbc1d0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -547,8 +547,8 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 		}
 
 		if (i < hr_qp->rq.max_gs) {
-			dseg[i].lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY);
-			dseg[i].addr = 0;
+			dseg->lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY);
+			dseg->addr = 0;
 		}
 
 		/* rq support inline data */

From 90e7a4d5066240b75cdfd1bf8944ca36622153b1 Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:10 +0800
Subject: [PATCH 116/393] RDMA/hns: Set desc_dma_addr for zero when free cmq
 desc

In order to avoid illegal use for desc_dma_addr of ring,
it needs to set it zero when free cmq desc.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index a40978bbc1d0..46c3ab97e1ba 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -617,6 +617,8 @@ static void hns_roce_free_cmq_desc(struct hns_roce_dev *hr_dev,
 	dma_unmap_single(hr_dev->dev, ring->desc_dma_addr,
 			 ring->desc_num * sizeof(struct hns_roce_cmq_desc),
 			 DMA_BIDIRECTIONAL);
+
+	ring->desc_dma_addr = 0;
 	kfree(ring->desc);
 }
 

From 85e0274dc66430b0d0fad7ad01cbc0e0cbebf6dc Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:11 +0800
Subject: [PATCH 117/393] RDMA/hns: Enable inner_pa_vld filed of mpt

When enabled inner_pa_vld field of mpt, The pa0 and
pa1 will be valid and the hardware will use it
directly and not use base address of pbl. As a
result, it can reduce the delay.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 46c3ab97e1ba..7ecf0c911808 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1379,6 +1379,8 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr,
 
 	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S,
 		     mr->type == MR_TYPE_MR ? 0 : 1);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_INNER_PA_VLD_S,
+		     1);
 	mpt_entry->byte_12_mw_pa = cpu_to_le32(mpt_entry->byte_12_mw_pa);
 
 	mpt_entry->len_l = cpu_to_le32(lower_32_bits(mr->size));

From 79d442071a733057e4d9f28c90fbdb4f39d9fc23 Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:12 +0800
Subject: [PATCH 118/393] RDMA/hns: Set NULL for __internal_mr

This patch mainly configure value for __internal_mr of mr_free_pd.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 47e1b6ac1e1a..b3417a9a49de 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -722,6 +722,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 	free_mr->mr_free_pd = to_hr_pd(pd);
 	free_mr->mr_free_pd->ibpd.device  = &hr_dev->ib_dev;
 	free_mr->mr_free_pd->ibpd.uobject = NULL;
+	free_mr->mr_free_pd->ibpd.__internal_mr = NULL;
 	atomic_set(&free_mr->mr_free_pd->ibpd.usecnt, 0);
 
 	attr.qp_access_flags	= IB_ACCESS_REMOTE_WRITE;

From a0403be8af338c319b5176c1d2975d94a930a0bf Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Fri, 4 May 2018 10:57:13 +0800
Subject: [PATCH 119/393] RDMA/hns: Fix the bug with NULL pointer

When the last QP of eight QPs is not exist in
hns_roce_v1_mr_free_work_fn function, the
print for qpn of hr_qp may introduce a
calltrace for NULL pointer.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index b3417a9a49de..8013d69c5ac4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -1037,7 +1037,7 @@ static void hns_roce_v1_mr_free_work_fn(struct work_struct *work)
 
 	do {
 		ret = hns_roce_v1_poll_cq(&mr_free_cq->ib_cq, ne, wc);
-		if (ret < 0) {
+		if (ret < 0 && hr_qp) {
 			dev_err(dev,
 			   "(qp:0x%lx) starts, Poll cqe failed(%d) for mr 0x%x free! Remain %d cqe\n",
 			   hr_qp->qpn, ret, hr_mr->key, ne);

From a75895b1ebd944a7873cbf76d30de8720955f8b3 Mon Sep 17 00:00:00 2001
From: Andrew Boyer <andrew.boyer@dell.com>
Date: Mon, 7 May 2018 13:23:36 -0400
Subject: [PATCH 120/393] RDMA/i40iw: Avoid panic when objects are being
 created and destroyed

A panic occurs when there is a newly-registered element on the QP/CQ MR
list waiting to be attached, but a different MR is deregistered. The
current code only checks for whether the list is empty, not whether the
element being deregistered is actually on the list.

Fix the panic by adding a boolean to track if the object is on the list.

Fixes: d37498417947 ("i40iw: add files for iwarp interface")
Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/i40iw/i40iw_verbs.c | 11 +++++++++--
 drivers/infiniband/hw/i40iw/i40iw_verbs.h |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 3ebe91438523..68679ad4c6da 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -394,6 +394,7 @@ static struct i40iw_pbl *i40iw_get_pbl(unsigned long va,
 
 	list_for_each_entry(iwpbl, pbl_list, list) {
 		if (iwpbl->user_base == va) {
+			iwpbl->on_list = false;
 			list_del(&iwpbl->list);
 			return iwpbl;
 		}
@@ -1898,6 +1899,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
 			goto error;
 		spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags);
 		list_add_tail(&iwpbl->list, &ucontext->qp_reg_mem_list);
+		iwpbl->on_list = true;
 		spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
 		break;
 	case IW_MEMREG_TYPE_CQ:
@@ -1908,6 +1910,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
 
 		spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
 		list_add_tail(&iwpbl->list, &ucontext->cq_reg_mem_list);
+		iwpbl->on_list = true;
 		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
 		break;
 	case IW_MEMREG_TYPE_MEM:
@@ -2045,14 +2048,18 @@ static void i40iw_del_memlist(struct i40iw_mr *iwmr,
 	switch (iwmr->type) {
 	case IW_MEMREG_TYPE_CQ:
 		spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
-		if (!list_empty(&ucontext->cq_reg_mem_list))
+		if (iwpbl->on_list) {
+			iwpbl->on_list = false;
 			list_del(&iwpbl->list);
+		}
 		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
 		break;
 	case IW_MEMREG_TYPE_QP:
 		spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags);
-		if (!list_empty(&ucontext->qp_reg_mem_list))
+		if (iwpbl->on_list) {
+			iwpbl->on_list = false;
 			list_del(&iwpbl->list);
+		}
 		spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
 		break;
 	default:
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
index 9067443cd311..76cf173377ab 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
@@ -78,6 +78,7 @@ struct i40iw_pbl {
 	};
 
 	bool pbl_allocated;
+	bool on_list;
 	u64 user_base;
 	struct i40iw_pble_alloc pble_alloc;
 	struct i40iw_mr *iwmr;

From 9f7b16afab9b47de471f4ef6a0c6c337f0a53566 Mon Sep 17 00:00:00 2001
From: Andrew Boyer <andrew.boyer@dell.com>
Date: Mon, 7 May 2018 13:23:37 -0400
Subject: [PATCH 121/393] RDMA/i40iw: Avoid reference leaks when processing the
 AEQ

In this switch there is a reference held on the QP. 'continue' will grab
the next event without releasing the reference, causing a leak.

Change it to 'break' to drop the reference before grabbing the next event.

Fixes: 4e9042e647ff ("i40iw: add hw and utils files")
Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/i40iw/i40iw_hw.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c
index 6139836fb533..c9f62ca7643c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_hw.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c
@@ -331,7 +331,7 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
 		switch (info->ae_id) {
 		case I40IW_AE_LLP_FIN_RECEIVED:
 			if (qp->term_flags)
-				continue;
+				break;
 			if (atomic_inc_return(&iwqp->close_timer_started) == 1) {
 				iwqp->hw_tcp_state = I40IW_TCP_STATE_CLOSE_WAIT;
 				if ((iwqp->hw_tcp_state == I40IW_TCP_STATE_CLOSE_WAIT) &&
@@ -360,7 +360,7 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
 			break;
 		case I40IW_AE_LLP_CONNECTION_RESET:
 			if (atomic_read(&iwqp->close_timer_started))
-				continue;
+				break;
 			i40iw_cm_disconn(iwqp);
 			break;
 		case I40IW_AE_QP_SUSPEND_COMPLETE:

From 43731753c4b7d832775cf6b2301dd0447a5a1851 Mon Sep 17 00:00:00 2001
From: Andrew Boyer <andrew.boyer@dell.com>
Date: Mon, 7 May 2018 13:23:38 -0400
Subject: [PATCH 122/393] RDMA/i40iw: Avoid panic when reading back the IRQ
 affinity hint

The current code sets an affinity hint with a cpumask_t stored on the
stack. This value can then be accessed through /proc/irq/*/affinity_hint/,
causing a segfault or returning corrupt data.

Move the cpumask_t into struct i40iw_msix_vector so it is available later.

Backtrace:
BUG: unable to handle kernel paging request at ffffb16e600e7c90
IP: irq_affinity_hint_proc_show+0x60/0xf0
PGD 17c0c6d067
PUD 17c0c6e067
PMD 15d4a0e067
PTE 0

Oops: 0000 [#1] SMP
Modules linked in: ...
CPU: 3 PID: 172543 Comm: grep Tainted: G           OE   ... #1
Hardware name: ...
task: ffff9a5caee08000 task.stack: ffffb16e659d8000
RIP: 0010:irq_affinity_hint_proc_show+0x60/0xf0
RSP: 0018:ffffb16e659dbd20 EFLAGS: 00010086
RAX: 0000000000000246 RBX: ffffb16e659dbd20 RCX: 0000000000000000
RDX: ffffb16e600e7c90 RSI: 0000000000000003 RDI: 0000000000000046
RBP: ffffb16e659dbd88 R08: 0000000000000038 R09: 0000000000000001
R10: 0000000070803079 R11: 0000000000000000 R12: ffff9a59d1d97a00
R13: ffff9a5da47a6cd8 R14: ffff9a5da47a6c00 R15: ffff9a59d1d97a00
FS:  00007f946c31d740(0000) GS:ffff9a5dc1800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffb16e600e7c90 CR3: 00000016a4339000 CR4: 00000000007406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 seq_read+0x12d/0x430
 ? sched_clock_cpu+0x11/0xb0
 proc_reg_read+0x48/0x70
 __vfs_read+0x37/0x140
 ? security_file_permission+0xa0/0xc0
 vfs_read+0x96/0x140
 SyS_read+0x58/0xc0
 do_syscall_64+0x5a/0x190
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: 0033:0x7f946bbc97e0
RSP: 002b:00007ffdd0c4ae08 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 000000000096b000 RCX: 00007f946bbc97e0
RDX: 000000000096b000 RSI: 00007f946a2f0000 RDI: 0000000000000004
RBP: 0000000000001000 R08: 00007f946a2ef011 R09: 000000000000000a
R10: 0000000000001000 R11: 0000000000000246 R12: 00007f946a2f0000
R13: 0000000000000004 R14: 0000000000000000 R15: 00007f946a2f0000
Code: b9 08 00 00 00 49 89 c6 48 89 df 31 c0 4d 8d ae d8 00 00 00 f3 48 ab 4c 89 ef e8 6c 9a 56 00 49 8b 96 30 01 00 00 48 85 d2 74 3f <48> 8b 0a 48 89 4d 98 48 8b 4a 08 48 89 4d a0 48 8b 4a 10 48 89
RIP: irq_affinity_hint_proc_show+0x60/0xf0 RSP: ffffb16e659dbd20
CR2: ffffb16e600e7c90

Fixes: 8e06af711bf2 ("i40iw: add main, hdr, status")
Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/i40iw/i40iw.h      | 1 +
 drivers/infiniband/hw/i40iw/i40iw_main.c | 7 +++----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h
index d5d8c1be345a..2f2b4426ded7 100644
--- a/drivers/infiniband/hw/i40iw/i40iw.h
+++ b/drivers/infiniband/hw/i40iw/i40iw.h
@@ -207,6 +207,7 @@ struct i40iw_msix_vector {
 	u32 irq;
 	u32 cpu_affinity;
 	u32 ceq_id;
+	cpumask_t mask;
 };
 
 struct l2params_work {
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index 9cd0d3ef9057..05001e6da1f8 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -687,7 +687,6 @@ static enum i40iw_status_code i40iw_configure_ceq_vector(struct i40iw_device *iw
 							 struct i40iw_msix_vector *msix_vec)
 {
 	enum i40iw_status_code status;
-	cpumask_t mask;
 
 	if (iwdev->msix_shared && !ceq_id) {
 		tasklet_init(&iwdev->dpc_tasklet, i40iw_dpc, (unsigned long)iwdev);
@@ -697,9 +696,9 @@ static enum i40iw_status_code i40iw_configure_ceq_vector(struct i40iw_device *iw
 		status = request_irq(msix_vec->irq, i40iw_ceq_handler, 0, "CEQ", iwceq);
 	}
 
-	cpumask_clear(&mask);
-	cpumask_set_cpu(msix_vec->cpu_affinity, &mask);
-	irq_set_affinity_hint(msix_vec->irq, &mask);
+	cpumask_clear(&msix_vec->mask);
+	cpumask_set_cpu(msix_vec->cpu_affinity, &msix_vec->mask);
+	irq_set_affinity_hint(msix_vec->irq, &msix_vec->mask);
 
 	if (status) {
 		i40iw_pr_err("ceq irq config fail\n");

From 3d69191086fc87f202c79eb8873b9c82c2bb065a Mon Sep 17 00:00:00 2001
From: Christophe Jaillet <christophe.jaillet@wanadoo.fr>
Date: Tue, 8 May 2018 07:44:27 +0200
Subject: [PATCH 123/393] iw_cxgb4: Fix an error handling path in
 'c4iw_get_dma_mr()'

The error handling path of 'c4iw_get_dma_mr()' does not free resources
in the correct order.
If an error occures, it can leak 'mhp->wr_waitp'.

Fixes: a3f12da0e99a ("iw_cxgb4: allocate wait object for each memory object")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/cxgb4/mem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index e90f2fd8dc16..1445918e3239 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -489,10 +489,10 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
 err_dereg_mem:
 	dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
 		  mhp->attr.pbl_addr, mhp->dereg_skb, mhp->wr_waitp);
-err_free_wr_wait:
-	c4iw_put_wr_wait(mhp->wr_waitp);
 err_free_skb:
 	kfree_skb(mhp->dereg_skb);
+err_free_wr_wait:
+	c4iw_put_wr_wait(mhp->wr_waitp);
 err_free_mhp:
 	kfree(mhp);
 	return ERR_PTR(ret);

From 76aa3de7095f15af7300012cb29ea8ab93eec348 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Mon, 7 May 2018 12:08:37 +0200
Subject: [PATCH 124/393] eeprom: at24: fix retrieving the at24_chip_data
 structure

Commit feb2f19b1e8f ("eeprom: at24: move platform data processing into
a separate routine") introduced a bug where we incorrectly retireve the
at24_chip_data structure. Remove the unnecessary ampersand operator.

Fixes: feb2f19b1e8f ("eeprom: at24: move platform data processing into a separate routine")
Reported-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/misc/eeprom/at24.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c
index 0c125f207aea..33053b0d1fdf 100644
--- a/drivers/misc/eeprom/at24.c
+++ b/drivers/misc/eeprom/at24.c
@@ -518,7 +518,7 @@ static int at24_get_pdata(struct device *dev, struct at24_platform_data *pdata)
 	if (of_node && of_match_device(at24_of_match, dev))
 		cdata = of_device_get_match_data(dev);
 	else if (id)
-		cdata = (void *)&id->driver_data;
+		cdata = (void *)id->driver_data;
 	else
 		cdata = acpi_device_get_match_data(dev);
 

From 26aeb9daa02cd37178321cf915efd3d5eb8b0511 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 8 May 2018 19:42:40 -0700
Subject: [PATCH 125/393] nfp: bpf: allow zero-length capabilities

Some BPF capabilities carry no value, they simply indicate feature
is present.  Our capability parsing loop will exit early if last
capability is zero-length because it's looking for more than 8 bytes
of data (8B is our TLV header length).  Allow the last capability to
be zero-length.

This bug would lead to driver failing to probe with the following error
if the last capability FW advertises is zero-length:

    nfp: BPF capabilities left after parsing, parsed:92 total length:100
    nfp: invalid BPF capabilities at offset:92

Note the "parsed" and "length" values are 8 apart.

No shipping FW runs into this issue, but we can't guarantee that will
remain the case.

Fixes: 77a844ee650c ("nfp: bpf: prepare for parsing BPF FW capabilities")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 1dc424685f4e..35fb31f682af 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -335,7 +335,7 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app)
 		return PTR_ERR(mem) == -ENOENT ? 0 : PTR_ERR(mem);
 
 	start = mem;
-	while (mem - start + 8 < nfp_cpp_area_size(area)) {
+	while (mem - start + 8 <= nfp_cpp_area_size(area)) {
 		u8 __iomem *value;
 		u32 type, length;
 

From 9f65fb29374ee37856dbad847b4e121aab72b510 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 9 May 2018 21:41:38 +0200
Subject: [PATCH 126/393] x86/bugs: Rename _RDS to _SSBD

Intel collateral will reference the SSB mitigation bit in IA32_SPEC_CTL[2]
as SSBD (Speculative Store Bypass Disable).

Hence changing it.

It is unclear yet what the MSR_IA32_ARCH_CAPABILITIES (0x10a) Bit(4) name
is going to be. Following the rename it would be SSBD_NO but that rolls out
to Speculative Store Bypass Disable No.

Also fixed the missing space in X86_FEATURE_AMD_SSBD.

[ tglx: Fixup x86_amd_rds_enable() and rds_tif_to_amd_ls_cfg() as well ]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/cpufeatures.h |  4 ++--
 arch/x86/include/asm/msr-index.h   | 10 ++++-----
 arch/x86/include/asm/spec-ctrl.h   | 12 +++++-----
 arch/x86/include/asm/thread_info.h |  6 ++---
 arch/x86/kernel/cpu/amd.c          | 14 ++++++------
 arch/x86/kernel/cpu/bugs.c         | 36 +++++++++++++++---------------
 arch/x86/kernel/cpu/common.c       |  2 +-
 arch/x86/kernel/cpu/intel.c        |  2 +-
 arch/x86/kernel/process.c          |  8 +++----
 arch/x86/kvm/cpuid.c               |  2 +-
 arch/x86/kvm/vmx.c                 |  6 ++---
 11 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index b2464c1787df..4e1c747acbf8 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -215,7 +215,7 @@
 #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
 #define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
 #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
-#define X86_FEATURE_AMD_RDS		(7*32+24)  /* "" AMD RDS implementation */
+#define X86_FEATURE_AMD_SSBD		( 7*32+24)  /* "" AMD SSBD implementation */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
@@ -336,7 +336,7 @@
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
-#define X86_FEATURE_RDS			(18*32+31) /* Reduced Data Speculation */
+#define X86_FEATURE_SSBD		(18*32+31) /* Speculative Store Bypass Disable */
 
 /*
  * BUG word(s)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 810f50bb338d..0da3ca260b06 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -42,8 +42,8 @@
 #define MSR_IA32_SPEC_CTRL		0x00000048 /* Speculation Control */
 #define SPEC_CTRL_IBRS			(1 << 0)   /* Indirect Branch Restricted Speculation */
 #define SPEC_CTRL_STIBP			(1 << 1)   /* Single Thread Indirect Branch Predictors */
-#define SPEC_CTRL_RDS_SHIFT		2	   /* Reduced Data Speculation bit */
-#define SPEC_CTRL_RDS			(1 << SPEC_CTRL_RDS_SHIFT)   /* Reduced Data Speculation */
+#define SPEC_CTRL_SSBD_SHIFT		2	   /* Speculative Store Bypass Disable bit */
+#define SPEC_CTRL_SSBD			(1 << SPEC_CTRL_SSBD_SHIFT)   /* Speculative Store Bypass Disable */
 
 #define MSR_IA32_PRED_CMD		0x00000049 /* Prediction Command */
 #define PRED_CMD_IBPB			(1 << 0)   /* Indirect Branch Prediction Barrier */
@@ -70,10 +70,10 @@
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
-#define ARCH_CAP_RDS_NO			(1 << 4)   /*
+#define ARCH_CAP_SSBD_NO		(1 << 4)   /*
 						    * Not susceptible to Speculative Store Bypass
-						    * attack, so no Reduced Data Speculation control
-						    * required.
+						    * attack, so no Speculative Store Bypass
+						    * control required.
 						    */
 
 #define MSR_IA32_BBL_CR_CTL		0x00000119
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index 45ef00ad5105..dc21209790bf 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -17,20 +17,20 @@ extern void x86_spec_ctrl_restore_host(u64);
 
 /* AMD specific Speculative Store Bypass MSR data */
 extern u64 x86_amd_ls_cfg_base;
-extern u64 x86_amd_ls_cfg_rds_mask;
+extern u64 x86_amd_ls_cfg_ssbd_mask;
 
 /* The Intel SPEC CTRL MSR base value cache */
 extern u64 x86_spec_ctrl_base;
 
-static inline u64 rds_tif_to_spec_ctrl(u64 tifn)
+static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
 {
-	BUILD_BUG_ON(TIF_RDS < SPEC_CTRL_RDS_SHIFT);
-	return (tifn & _TIF_RDS) >> (TIF_RDS - SPEC_CTRL_RDS_SHIFT);
+	BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+	return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
 }
 
-static inline u64 rds_tif_to_amd_ls_cfg(u64 tifn)
+static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
 {
-	return (tifn & _TIF_RDS) ? x86_amd_ls_cfg_rds_mask : 0ULL;
+	return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
 }
 
 extern void speculative_store_bypass_update(void);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e5c26cc59619..2ff2a30a264f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,7 +79,7 @@ struct thread_info {
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
-#define TIF_RDS			5	/* Reduced data speculation */
+#define TIF_SSBD			5	/* Reduced data speculation */
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
@@ -106,7 +106,7 @@ struct thread_info {
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
-#define _TIF_RDS		(1 << TIF_RDS)
+#define _TIF_SSBD		(1 << TIF_SSBD)
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
@@ -146,7 +146,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_RDS)
+	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 18efc33a8d2e..7bde990b0385 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -567,12 +567,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		}
 		/*
 		 * Try to cache the base value so further operations can
-		 * avoid RMW. If that faults, do not enable RDS.
+		 * avoid RMW. If that faults, do not enable SSBD.
 		 */
 		if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
-			setup_force_cpu_cap(X86_FEATURE_RDS);
-			setup_force_cpu_cap(X86_FEATURE_AMD_RDS);
-			x86_amd_ls_cfg_rds_mask = 1ULL << bit;
+			setup_force_cpu_cap(X86_FEATURE_SSBD);
+			setup_force_cpu_cap(X86_FEATURE_AMD_SSBD);
+			x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
 		}
 	}
 }
@@ -920,9 +920,9 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (!cpu_has(c, X86_FEATURE_XENPV))
 		set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
 
-	if (boot_cpu_has(X86_FEATURE_AMD_RDS)) {
-		set_cpu_cap(c, X86_FEATURE_RDS);
-		set_cpu_cap(c, X86_FEATURE_AMD_RDS);
+	if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) {
+		set_cpu_cap(c, X86_FEATURE_SSBD);
+		set_cpu_cap(c, X86_FEATURE_AMD_SSBD);
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 563d8e54c863..09b116b7f3bf 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -45,10 +45,10 @@ static u64 __ro_after_init x86_spec_ctrl_mask = ~SPEC_CTRL_IBRS;
 
 /*
  * AMD specific MSR info for Speculative Store Bypass control.
- * x86_amd_ls_cfg_rds_mask is initialized in identify_boot_cpu().
+ * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu().
  */
 u64 __ro_after_init x86_amd_ls_cfg_base;
-u64 __ro_after_init x86_amd_ls_cfg_rds_mask;
+u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
 
 void __init check_bugs(void)
 {
@@ -146,7 +146,7 @@ u64 x86_spec_ctrl_get_default(void)
 	u64 msrval = x86_spec_ctrl_base;
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		msrval |= rds_tif_to_spec_ctrl(current_thread_info()->flags);
+		msrval |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
 	return msrval;
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
@@ -159,7 +159,7 @@ void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
 		return;
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		host |= rds_tif_to_spec_ctrl(current_thread_info()->flags);
+		host |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
 
 	if (host != guest_spec_ctrl)
 		wrmsrl(MSR_IA32_SPEC_CTRL, guest_spec_ctrl);
@@ -174,18 +174,18 @@ void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
 		return;
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		host |= rds_tif_to_spec_ctrl(current_thread_info()->flags);
+		host |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
 
 	if (host != guest_spec_ctrl)
 		wrmsrl(MSR_IA32_SPEC_CTRL, host);
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_restore_host);
 
-static void x86_amd_rds_enable(void)
+static void x86_amd_ssb_disable(void)
 {
-	u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_rds_mask;
+	u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
 
-	if (boot_cpu_has(X86_FEATURE_AMD_RDS))
+	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
 		wrmsrl(MSR_AMD64_LS_CFG, msrval);
 }
 
@@ -473,7 +473,7 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 	enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
 	enum ssb_mitigation_cmd cmd;
 
-	if (!boot_cpu_has(X86_FEATURE_RDS))
+	if (!boot_cpu_has(X86_FEATURE_SSBD))
 		return mode;
 
 	cmd = ssb_parse_cmdline();
@@ -507,7 +507,7 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 	/*
 	 * We have three CPU feature flags that are in play here:
 	 *  - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
-	 *  - X86_FEATURE_RDS - CPU is able to turn off speculative store bypass
+	 *  - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
 	 *  - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
 	 */
 	if (mode == SPEC_STORE_BYPASS_DISABLE) {
@@ -518,12 +518,12 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 		 */
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_INTEL:
-			x86_spec_ctrl_base |= SPEC_CTRL_RDS;
-			x86_spec_ctrl_mask &= ~SPEC_CTRL_RDS;
-			x86_spec_ctrl_set(SPEC_CTRL_RDS);
+			x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
+			x86_spec_ctrl_mask &= ~SPEC_CTRL_SSBD;
+			x86_spec_ctrl_set(SPEC_CTRL_SSBD);
 			break;
 		case X86_VENDOR_AMD:
-			x86_amd_rds_enable();
+			x86_amd_ssb_disable();
 			break;
 		}
 	}
@@ -556,16 +556,16 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
 		if (task_spec_ssb_force_disable(task))
 			return -EPERM;
 		task_clear_spec_ssb_disable(task);
-		update = test_and_clear_tsk_thread_flag(task, TIF_RDS);
+		update = test_and_clear_tsk_thread_flag(task, TIF_SSBD);
 		break;
 	case PR_SPEC_DISABLE:
 		task_set_spec_ssb_disable(task);
-		update = !test_and_set_tsk_thread_flag(task, TIF_RDS);
+		update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
 		break;
 	case PR_SPEC_FORCE_DISABLE:
 		task_set_spec_ssb_disable(task);
 		task_set_spec_ssb_force_disable(task);
-		update = !test_and_set_tsk_thread_flag(task, TIF_RDS);
+		update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
 		break;
 	default:
 		return -ERANGE;
@@ -635,7 +635,7 @@ void x86_spec_ctrl_setup_ap(void)
 		x86_spec_ctrl_set(x86_spec_ctrl_base & ~x86_spec_ctrl_mask);
 
 	if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
-		x86_amd_rds_enable();
+		x86_amd_ssb_disable();
 }
 
 #ifdef CONFIG_SYSFS
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e0517bcee446..9fbb388fadac 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -959,7 +959,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
 
 	if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
-	   !(ia32_cap & ARCH_CAP_RDS_NO))
+	   !(ia32_cap & ARCH_CAP_SSBD_NO))
 		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
 
 	if (x86_match_cpu(cpu_no_speculation))
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ef3f9c01c274..0eab6c89c8d9 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -189,7 +189,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		setup_clear_cpu_cap(X86_FEATURE_STIBP);
 		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
 		setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
-		setup_clear_cpu_cap(X86_FEATURE_RDS);
+		setup_clear_cpu_cap(X86_FEATURE_SSBD);
 	}
 
 	/*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 397342725046..b77a091bf3b8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -283,11 +283,11 @@ static __always_inline void __speculative_store_bypass_update(unsigned long tifn
 {
 	u64 msr;
 
-	if (static_cpu_has(X86_FEATURE_AMD_RDS)) {
-		msr = x86_amd_ls_cfg_base | rds_tif_to_amd_ls_cfg(tifn);
+	if (static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+		msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
 		wrmsrl(MSR_AMD64_LS_CFG, msr);
 	} else {
-		msr = x86_spec_ctrl_base | rds_tif_to_spec_ctrl(tifn);
+		msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
 		wrmsrl(MSR_IA32_SPEC_CTRL, msr);
 	}
 }
@@ -329,7 +329,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	if ((tifp ^ tifn) & _TIF_NOCPUID)
 		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
 
-	if ((tifp ^ tifn) & _TIF_RDS)
+	if ((tifp ^ tifn) & _TIF_SSBD)
 		__speculative_store_bypass_update(tifn);
 }
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 376ac9a2a2b9..865c9a769864 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -407,7 +407,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
-		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(RDS) |
+		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SSBD) |
 		F(ARCH_CAPABILITIES);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 16a111e44691..9b8d80bf3889 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3525,7 +3525,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_RDS))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SSBD))
 			return 1;
 
 		msr_info->data = to_vmx(vcpu)->spec_ctrl;
@@ -3645,11 +3645,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_RDS))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SSBD))
 			return 1;
 
 		/* The STIBP bit doesn't fault even if it's not advertised */
-		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_RDS))
+		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
 			return 1;
 
 		vmx->spec_ctrl = data;

From e96f46ee8587607a828f783daa6eb5b44d25004d Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 9 May 2018 21:41:38 +0200
Subject: [PATCH 127/393] proc: Use underscores for SSBD in 'status'

The style for the 'status' file is CamelCase or this. _.

Fixes: fae1fa0fc ("proc: Provide details on speculation flaw mitigations")
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/proc/array.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index d178a0236514..72391b3f6927 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -336,7 +336,7 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 #ifdef CONFIG_SECCOMP
 	seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
 #endif
-	seq_printf(m, "\nSpeculation Store Bypass:\t");
+	seq_printf(m, "\nSpeculation_Store_Bypass:\t");
 	switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
 	case -EINVAL:
 		seq_printf(m, "unknown");

From dd0792699c4058e63c0715d9a7c2d40226fcdddc Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 8 May 2018 15:43:45 +0200
Subject: [PATCH 128/393] Documentation/spec_ctrl: Do some minor cleanups

Fix some typos, improve formulations, end sentences with a fullstop.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/userspace-api/spec_ctrl.rst | 24 +++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst
index 1b3690d30943..32f3d55c54b7 100644
--- a/Documentation/userspace-api/spec_ctrl.rst
+++ b/Documentation/userspace-api/spec_ctrl.rst
@@ -2,13 +2,13 @@
 Speculation Control
 ===================
 
-Quite some CPUs have speculation related misfeatures which are in fact
-vulnerabilites causing data leaks in various forms even accross privilege
-domains.
+Quite some CPUs have speculation-related misfeatures which are in
+fact vulnerabilities causing data leaks in various forms even across
+privilege domains.
 
 The kernel provides mitigation for such vulnerabilities in various
-forms. Some of these mitigations are compile time configurable and some on
-the kernel command line.
+forms. Some of these mitigations are compile-time configurable and some
+can be supplied on the kernel command line.
 
 There is also a class of mitigations which are very expensive, but they can
 be restricted to a certain set of processes or tasks in controlled
@@ -32,18 +32,18 @@ the following meaning:
 Bit  Define                Description
 ==== ===================== ===================================================
 0    PR_SPEC_PRCTL         Mitigation can be controlled per task by
-                           PR_SET_SPECULATION_CTRL
+                           PR_SET_SPECULATION_CTRL.
 1    PR_SPEC_ENABLE        The speculation feature is enabled, mitigation is
-                           disabled
+                           disabled.
 2    PR_SPEC_DISABLE       The speculation feature is disabled, mitigation is
-                           enabled
+                           enabled.
 3    PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A
                            subsequent prctl(..., PR_SPEC_ENABLE) will fail.
 ==== ===================== ===================================================
 
 If all bits are 0 the CPU is not affected by the speculation misfeature.
 
-If PR_SPEC_PRCTL is set, then the per task control of the mitigation is
+If PR_SPEC_PRCTL is set, then the per-task control of the mitigation is
 available. If not set, prctl(PR_SET_SPECULATION_CTRL) for the speculation
 misfeature will fail.
 
@@ -61,9 +61,9 @@ Common error codes
 Value   Meaning
 ======= =================================================================
 EINVAL  The prctl is not implemented by the architecture or unused
-        prctl(2) arguments are not 0
+        prctl(2) arguments are not 0.
 
-ENODEV  arg2 is selecting a not supported speculation misfeature
+ENODEV  arg2 is selecting a not supported speculation misfeature.
 ======= =================================================================
 
 PR_SET_SPECULATION_CTRL error codes
@@ -74,7 +74,7 @@ Value   Meaning
 0       Success
 
 ERANGE  arg3 is incorrect, i.e. it's neither PR_SPEC_ENABLE nor
-        PR_SPEC_DISABLE nor PR_SPEC_FORCE_DISABLE
+        PR_SPEC_DISABLE nor PR_SPEC_FORCE_DISABLE.
 
 ENXIO   Control of the selected speculation misfeature is not possible.
         See PR_GET_SPECULATION_CTRL.

From 06cb616b1bca7080824acfedb3d4c898e7a64836 Mon Sep 17 00:00:00 2001
From: Alexander Monakov <amonakov@ispras.ru>
Date: Sat, 28 Apr 2018 16:56:06 +0300
Subject: [PATCH 129/393] i2c: designware: fix poll-after-enable regression

Not all revisions of DW I2C controller implement the enable status register.
On platforms where that's the case (e.g. BG2CD and SPEAr ARM SoCs), waiting
for enable will time out as reading the unimplemented register yields zero.

It was observed that reading the IC_ENABLE_STATUS register once suffices to
avoid getting it stuck on Bay Trail hardware, so replace polling with one
dummy read of the register.

Fixes: fba4adbbf670 ("i2c: designware: must wait for enable")
Signed-off-by: Alexander Monakov <amonakov@ispras.ru>
Tested-by: Ben Gardner <gardner.ben@gmail.com>
Acked-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
Cc: stable@kernel.org
---
 drivers/i2c/busses/i2c-designware-master.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c
index fd36c39ddf4e..0cdba29ae0a9 100644
--- a/drivers/i2c/busses/i2c-designware-master.c
+++ b/drivers/i2c/busses/i2c-designware-master.c
@@ -209,7 +209,10 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev)
 	i2c_dw_disable_int(dev);
 
 	/* Enable the adapter */
-	__i2c_dw_enable_and_wait(dev, true);
+	__i2c_dw_enable(dev, true);
+
+	/* Dummy read to avoid the register getting stuck on Bay Trail */
+	dw_readl(dev, DW_IC_ENABLE_STATUS);
 
 	/* Clear and enable interrupts */
 	dw_readl(dev, DW_IC_CLR_INTR);

From d66d8ff3d21667b41eddbe86b35ab411e40d8c5f Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 10 May 2018 22:47:18 +0200
Subject: [PATCH 130/393] x86/bugs: Fix __ssb_select_mitigation() return type

__ssb_select_mitigation() returns one of the members of enum ssb_mitigation,
not ssb_mitigation_cmd; fix the prototype to reflect that.

Fixes: 24f7fc83b9204 ("x86/bugs: Provide boot parameters for the spec_store_bypass_disable mitigation")
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/bugs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 09b116b7f3bf..3287a42d4df4 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -468,7 +468,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
 	return cmd;
 }
 
-static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
+static enum ssb_mitigation __init __ssb_select_mitigation(void)
 {
 	enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
 	enum ssb_mitigation_cmd cmd;

From 7bb4d366cba992904bffa4820d24e70a3de93e76 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 10 May 2018 22:47:32 +0200
Subject: [PATCH 131/393] x86/bugs: Make cpu_show_common() static

cpu_show_common() is not used outside of arch/x86/kernel/cpu/bugs.c, so
make it static.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/bugs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 3287a42d4df4..784753f0b41e 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -640,7 +640,7 @@ void x86_spec_ctrl_setup_ap(void)
 
 #ifdef CONFIG_SYSFS
 
-ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
 			char *buf, unsigned int bug)
 {
 	if (!boot_cpu_has_bug(bug))

From 3597683c9da602b0440c5f742d64fa5da79cc026 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 10 May 2018 10:09:34 -0700
Subject: [PATCH 132/393] tools: bpf: handle NULL return in
 bpf_prog_load_xattr()

bpf_object__open() can return error pointer as well as NULL.
Fix error handling in bpf_prog_load_xattr() (and indirectly
bpf_prog_load()).

Fixes: 6f6d33f3b3d0 ("bpf: selftests add sockmap tests")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5922443063f0..0f9f06df49bc 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2035,7 +2035,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 		return -EINVAL;
 
 	obj = bpf_object__open(attr->file);
-	if (IS_ERR(obj))
+	if (IS_ERR_OR_NULL(obj))
 		return -ENOENT;
 
 	bpf_object__for_each_program(prog, obj) {

From 6d3299aef7df7225ecff653feedafb5d1646998b Mon Sep 17 00:00:00 2001
From: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
Date: Mon, 7 May 2018 16:53:09 +0300
Subject: [PATCH 133/393] ARM: dts: imx51-zii-rdu1: fix touchscreen bindings

This fixes errors in RDU1 device tree that cause touch screens not
working.

Fixes: ceef0396f367 ("ARM: dts: imx: add ZII RDU1 board")
Signed-off-by: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
Reviewed-by: Fabio Estevam <fabio.estevam@nxp.com>
Signed-off-by: Shawn Guo <shawnguo@kernel.org>
---
 arch/arm/boot/dts/imx51-zii-rdu1.dts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm/boot/dts/imx51-zii-rdu1.dts b/arch/arm/boot/dts/imx51-zii-rdu1.dts
index 0c99ac04ad08..6464f2560e06 100644
--- a/arch/arm/boot/dts/imx51-zii-rdu1.dts
+++ b/arch/arm/boot/dts/imx51-zii-rdu1.dts
@@ -523,7 +523,7 @@ touchscreen@4c {
 	};
 
 	touchscreen@20 {
-		compatible = "syna,rmi4_i2c";
+		compatible = "syna,rmi4-i2c";
 		reg = <0x20>;
 		pinctrl-names = "default";
 		pinctrl-0 = <&pinctrl_ts>;
@@ -541,8 +541,8 @@ rmi4-f01@1 {
 
 		rmi4-f11@11 {
 			reg = <0x11>;
-			touch-inverted-y;
-			touch-swapped-x-y;
+			touchscreen-inverted-y;
+			touchscreen-swapped-x-y;
 			syna,sensor-type = <1>;
 		};
 	};

From ddc9cfb79c1096a0855839631c091aa7e9602052 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Thu, 26 Apr 2018 17:55:03 -0700
Subject: [PATCH 134/393] KVM: Extend MAX_IRQ_ROUTES to 4096 for all archs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our virtual machines make use of device assignment by configuring
12 NVMe disks for high I/O performance. Each NVMe device has 129
MSI-X Table entries:
Capabilities: [50] MSI-X: Enable+ Count=129 Masked-Vector table: BAR=0 offset=00002000
The windows virtual machines fail to boot since they will map the number of
MSI-table entries that the NVMe hardware reported to the bus to msi routing
table, this will exceed the 1024. This patch extends MAX_IRQ_ROUTES to 4096
for all archs, in the future this might be extended again if needed.

Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim KrÄmÃ¡Å™ <rkrcmar@redhat.com>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Tonny Lu <tonnylu@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6930c63126c7..6d6e79c59e68 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1045,13 +1045,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 
-#ifdef CONFIG_S390
-#define KVM_MAX_IRQ_ROUTES 4096 //FIXME: we can have more than that...
-#elif defined(CONFIG_ARM64)
-#define KVM_MAX_IRQ_ROUTES 4096
-#else
-#define KVM_MAX_IRQ_ROUTES 1024
-#endif
+#define KVM_MAX_IRQ_ROUTES 4096 /* might need extension/rework in the future */
 
 bool kvm_arch_can_set_irq_routing(struct kvm *kvm);
 int kvm_set_irq_routing(struct kvm *kvm,

From 6356ee0c9602004e0a3b4b2dad68ee2ee9385b17 Mon Sep 17 00:00:00 2001
From: Marian Rotariu <mrotariu@bitdefender.com>
Date: Mon, 30 Apr 2018 12:23:01 +0300
Subject: [PATCH 135/393] x86: Delay skip of emulated hypercall instruction

The IP increment should be done after the hypercall emulation, after
calling the various handlers. In this way, these handlers can accurately
identify the the IP of the VMCALL if they need it.

This patch keeps the same functionality for the Hyper-V handler which does
not use the return code of the standard kvm_skip_emulated_instruction()
call.

Signed-off-by: Marian Rotariu <mrotariu@bitdefender.com>
[Hyper-V hypercalls also need kvm_skip_emulated_instruction() - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/hyperv.c |  2 +-
 arch/x86/kvm/x86.c    | 19 +++++++++++--------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 98618e397342..14dd5e5010a2 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1265,7 +1265,7 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
 	struct kvm_run *run = vcpu->run;
 
 	kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 51ecd381793b..44bd4a23b59c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6671,12 +6671,13 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
-	int op_64_bit, r;
+	int op_64_bit;
 
-	r = kvm_skip_emulated_instruction(vcpu);
-
-	if (kvm_hv_hypercall_enabled(vcpu->kvm))
-		return kvm_hv_hypercall(vcpu);
+	if (kvm_hv_hypercall_enabled(vcpu->kvm)) {
+		if (!kvm_hv_hypercall(vcpu))
+			return 0;
+		goto out;
+	}
 
 	nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
@@ -6697,7 +6698,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 
 	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
 		ret = -KVM_EPERM;
-		goto out;
+		goto out_error;
 	}
 
 	switch (nr) {
@@ -6717,12 +6718,14 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		ret = -KVM_ENOSYS;
 		break;
 	}
-out:
+out_error:
 	if (!op_64_bit)
 		ret = (u32)ret;
 	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+
+out:
 	++vcpu->stat.hypercalls;
-	return r;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 

From 452a68d0ef341c4d544757e02154788227b2a08b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 7 May 2018 19:24:34 +0200
Subject: [PATCH 136/393] KVM: hyperv: idr_find needs RCU protection

Even though the eventfd is released after the KVM SRCU grace period
elapses, the conn_to_evt data structure itself is not; it uses RCU
internally, instead.  Fix the read-side critical section to happen
under rcu_read_lock/unlock; the result is still protected by
vcpu->kvm->srcu.

Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/hyperv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 14dd5e5010a2..5708e951a5c6 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1296,8 +1296,10 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
 	if (param & ~KVM_HYPERV_CONN_ID_MASK)
 		return HV_STATUS_INVALID_HYPERCALL_INPUT;
 
-	/* conn_to_evt is protected by vcpu->kvm->srcu */
+	/* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
+	rcu_read_lock();
 	eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+	rcu_read_unlock();
 	if (!eventfd)
 		return HV_STATUS_INVALID_PORT_ID;
 

From bcb2b94ae01009db26d1ad0811975405149b14f0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 18 Apr 2018 18:26:45 +0200
Subject: [PATCH 137/393] KVM: selftests: exit with 0 status code when tests
 cannot be run

Right now, skipped tests are returning a failure exit code if /dev/kvm does
not exists.  Consistently return a zero status code so that various scripts
over the interwebs do not complain.  Also return a zero status code if
the KVM_CAP_SYNC_REGS capability is not present, and hardcode in the
test the register kinds that are covered (rather than just using whatever
value of KVM_SYNC_X86_VALID_FIELDS is provided by the kernel headers).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/Makefile          |  2 +-
 .../testing/selftests/kvm/include/test_util.h |  1 +
 tools/testing/selftests/kvm/lib/kvm_util.c    | 16 ++++----
 tools/testing/selftests/kvm/sync_regs_test.c  | 40 ++++++++++++++-----
 .../selftests/kvm/vmx_tsc_adjust_test.c       |  4 +-
 5 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 2ddcc96ae456..d9d00319b07c 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -15,7 +15,7 @@ LIBKVM += $(LIBKVM_$(UNAME_M))
 
 INSTALL_HDR_PATH = $(top_srcdir)/usr
 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
-CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_HDR_PATH) -Iinclude -I$(<D)
+CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -I..
 
 # After inclusion, $(OUTPUT) is defined and
 # $(TEST_GEN_PROGS) starts with $(OUTPUT)/
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 7ab98e41324f..ac53730b30aa 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -19,6 +19,7 @@
 #include <errno.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include "kselftest.h"
 
 ssize_t test_write(int fd, const void *buf, size_t count);
 ssize_t test_read(int fd, void *buf, size_t count);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 2cedfda181d4..37e2a787d2fc 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -50,8 +50,8 @@ int kvm_check_cap(long cap)
 	int kvm_fd;
 
 	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-	TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
-		KVM_DEV_PATH, kvm_fd, errno);
+	if (kvm_fd < 0)
+		exit(KSFT_SKIP);
 
 	ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);
 	TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n"
@@ -91,8 +91,8 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 
 	vm->mode = mode;
 	kvm_fd = open(KVM_DEV_PATH, perm);
-	TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
-		KVM_DEV_PATH, kvm_fd, errno);
+	if (kvm_fd < 0)
+		exit(KSFT_SKIP);
 
 	/* Create VM. */
 	vm->fd = ioctl(kvm_fd, KVM_CREATE_VM, NULL);
@@ -418,8 +418,8 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
 
 	cpuid = allocate_kvm_cpuid2();
 	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-	TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
-		KVM_DEV_PATH, kvm_fd, errno);
+	if (kvm_fd < 0)
+		exit(KSFT_SKIP);
 
 	ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
 	TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
@@ -675,8 +675,8 @@ static int vcpu_mmap_sz(void)
 	int dev_fd, ret;
 
 	dev_fd = open(KVM_DEV_PATH, O_RDONLY);
-	TEST_ASSERT(dev_fd >= 0, "%s open %s failed, rc: %i errno: %i",
-		__func__, KVM_DEV_PATH, dev_fd, errno);
+	if (dev_fd < 0)
+		exit(KSFT_SKIP);
 
 	ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
 	TEST_ASSERT(ret >= sizeof(struct kvm_run),
diff --git a/tools/testing/selftests/kvm/sync_regs_test.c b/tools/testing/selftests/kvm/sync_regs_test.c
index 428e9473f5e2..eae1ece3c31b 100644
--- a/tools/testing/selftests/kvm/sync_regs_test.c
+++ b/tools/testing/selftests/kvm/sync_regs_test.c
@@ -85,6 +85,9 @@ static void compare_vcpu_events(struct kvm_vcpu_events *left,
 {
 }
 
+#define TEST_SYNC_FIELDS   (KVM_SYNC_X86_REGS|KVM_SYNC_X86_SREGS|KVM_SYNC_X86_EVENTS)
+#define INVALID_SYNC_FIELD 0x80000000
+
 int main(int argc, char *argv[])
 {
 	struct kvm_vm *vm;
@@ -98,9 +101,14 @@ int main(int argc, char *argv[])
 	setbuf(stdout, NULL);
 
 	cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
-	TEST_ASSERT((unsigned long)cap == KVM_SYNC_X86_VALID_FIELDS,
-		    "KVM_CAP_SYNC_REGS (0x%x) != KVM_SYNC_X86_VALID_FIELDS (0x%lx)\n",
-		    cap, KVM_SYNC_X86_VALID_FIELDS);
+	if ((cap & TEST_SYNC_FIELDS) != TEST_SYNC_FIELDS) {
+		fprintf(stderr, "KVM_CAP_SYNC_REGS not supported, skipping test\n");
+		exit(KSFT_SKIP);
+	}
+	if ((cap & INVALID_SYNC_FIELD) != 0) {
+		fprintf(stderr, "The \"invalid\" field is not invalid, skipping test\n");
+		exit(KSFT_SKIP);
+	}
 
 	/* Create VM */
 	vm = vm_create_default(VCPU_ID, guest_code);
@@ -108,7 +116,14 @@ int main(int argc, char *argv[])
 	run = vcpu_state(vm, VCPU_ID);
 
 	/* Request reading invalid register set from VCPU. */
-	run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS << 1;
+	run->kvm_valid_regs = INVALID_SYNC_FIELD;
+	rv = _vcpu_run(vm, VCPU_ID);
+	TEST_ASSERT(rv < 0 && errno == EINVAL,
+		    "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+		    rv);
+	vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+	run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
 	rv = _vcpu_run(vm, VCPU_ID);
 	TEST_ASSERT(rv < 0 && errno == EINVAL,
 		    "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
@@ -116,7 +131,14 @@ int main(int argc, char *argv[])
 	vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
 
 	/* Request setting invalid register set into VCPU. */
-	run->kvm_dirty_regs = KVM_SYNC_X86_VALID_FIELDS << 1;
+	run->kvm_dirty_regs = INVALID_SYNC_FIELD;
+	rv = _vcpu_run(vm, VCPU_ID);
+	TEST_ASSERT(rv < 0 && errno == EINVAL,
+		    "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+		    rv);
+	vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+	run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
 	rv = _vcpu_run(vm, VCPU_ID);
 	TEST_ASSERT(rv < 0 && errno == EINVAL,
 		    "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
@@ -125,7 +147,7 @@ int main(int argc, char *argv[])
 
 	/* Request and verify all valid register sets. */
 	/* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */
-	run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS;
+	run->kvm_valid_regs = TEST_SYNC_FIELDS;
 	rv = _vcpu_run(vm, VCPU_ID);
 	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
 		    "Unexpected exit reason: %u (%s),\n",
@@ -146,7 +168,7 @@ int main(int argc, char *argv[])
 	run->s.regs.sregs.apic_base = 1 << 11;
 	/* TODO run->s.regs.events.XYZ = ABC; */
 
-	run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS;
+	run->kvm_valid_regs = TEST_SYNC_FIELDS;
 	run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
 	rv = _vcpu_run(vm, VCPU_ID);
 	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
@@ -172,7 +194,7 @@ int main(int argc, char *argv[])
 	/* Clear kvm_dirty_regs bits, verify new s.regs values are
 	 * overwritten with existing guest values.
 	 */
-	run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS;
+	run->kvm_valid_regs = TEST_SYNC_FIELDS;
 	run->kvm_dirty_regs = 0;
 	run->s.regs.regs.r11 = 0xDEADBEEF;
 	rv = _vcpu_run(vm, VCPU_ID);
@@ -211,7 +233,7 @@ int main(int argc, char *argv[])
 	 * with kvm_sync_regs values.
 	 */
 	run->kvm_valid_regs = 0;
-	run->kvm_dirty_regs = KVM_SYNC_X86_VALID_FIELDS;
+	run->kvm_dirty_regs = TEST_SYNC_FIELDS;
 	run->s.regs.regs.r11 = 0xBBBB;
 	rv = _vcpu_run(vm, VCPU_ID);
 	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
diff --git a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c
index 8f7f62093add..aaa633263b2c 100644
--- a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c
+++ b/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c
@@ -189,8 +189,8 @@ int main(int argc, char *argv[])
 	struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
 
 	if (!(entry->ecx & CPUID_VMX)) {
-		printf("nested VMX not enabled, skipping test");
-		return 0;
+		fprintf(stderr, "nested VMX not enabled, skipping test\n");
+		exit(KSFT_SKIP);
 	}
 
 	vm = vm_create_default_vmx(VCPU_ID, (void *) l1_guest_code);

From c19986fea873f3c745122bf79013a872a190f212 Mon Sep 17 00:00:00 2001
From: Junaid Shahid <junaids@google.com>
Date: Fri, 4 May 2018 11:37:13 -0700
Subject: [PATCH 138/393] kvm: x86: Suppress CR3_PCID_INVD bit only when PCIDs
 are enabled

If the PCIDE bit is not set in CR4, then the MSb of CR3 is a reserved
bit. If the guest tries to set it, that should cause a #GP fault. So
mask out the bit only when the PCIDE bit is set.

Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 44bd4a23b59c..37dd9a9d050a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -843,7 +843,10 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 #ifdef CONFIG_X86_64
-	cr3 &= ~CR3_PCID_INVD;
+	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+	if (pcid_enabled)
+		cr3 &= ~CR3_PCID_INVD;
 #endif
 
 	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {

From 64f7a11586ab9262f00b8b6eceef6d8154921bd8 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 30 Apr 2018 10:01:06 -0700
Subject: [PATCH 139/393] KVM: vmx: update sec exec controls for UMIP iff
 emulating UMIP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update SECONDARY_EXEC_DESC for UMIP emulation if and only UMIP
is actually being emulated.  Skipping the VMCS update eliminates
unnecessary VMREAD/VMWRITE when UMIP is supported in hardware,
and on platforms that don't have SECONDARY_VM_EXEC_CONTROL.  The
latter case resolves a bug where KVM would fill the kernel log
with warnings due to failed VMWRITEs on older platforms.

Fixes: 0367f205a3b7 ("KVM: vmx: add support for emulating UMIP")
Cc: stable@vger.kernel.org #4.16
Reported-by: Paolo Zeppegno <pzeppegno@gmail.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Suggested-by: Radim KrÄmÃ¡Å™ <rkrcmar@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c7668806163f..3f1696570b41 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1494,6 +1494,12 @@ static inline bool cpu_has_vmx_vmfunc(void)
 		SECONDARY_EXEC_ENABLE_VMFUNC;
 }
 
+static bool vmx_umip_emulated(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_DESC;
+}
+
 static inline bool report_flexpriority(void)
 {
 	return flexpriority_enabled;
@@ -4761,14 +4767,16 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	else
 		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
 
-	if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
-		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
-			      SECONDARY_EXEC_DESC);
-		hw_cr4 &= ~X86_CR4_UMIP;
-	} else if (!is_guest_mode(vcpu) ||
-	           !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
-		vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+	if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
+		if (cr4 & X86_CR4_UMIP) {
+			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
 				SECONDARY_EXEC_DESC);
+			hw_cr4 &= ~X86_CR4_UMIP;
+		} else if (!is_guest_mode(vcpu) ||
+			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
+			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+					SECONDARY_EXEC_DESC);
+	}
 
 	if (cr4 & X86_CR4_VMXE) {
 		/*
@@ -9497,12 +9505,6 @@ static bool vmx_xsaves_supported(void)
 		SECONDARY_EXEC_XSAVES;
 }
 
-static bool vmx_umip_emulated(void)
-{
-	return vmcs_config.cpu_based_2nd_exec_ctrl &
-		SECONDARY_EXEC_DESC;
-}
-
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;

From 7f6df440b8623c441c42d070bf592e2d2c1fa9bb Mon Sep 17 00:00:00 2001
From: Haneen Mohammed <hamohammed.sa@gmail.com>
Date: Fri, 11 May 2018 07:15:42 +0300
Subject: [PATCH 140/393] drm: Match sysfs name in link removal to link
 creation

This patch matches the sysfs name used in the unlinking with the
linking function. Otherwise, remove_compat_control_link() fails to remove
sysfs created by create_compat_control_link() in drm_dev_register().

Fixes: 6449b088dd51 ("drm: Add fake controlD* symlinks for backwards
compat")
Cc: Dave Airlie <airlied@gmail.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Emil Velikov <emil.l.velikov@gmail.com>
Cc: David Herrmann <dh.herrmann@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Gustavo Padovan <gustavo@padovan.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Sean Paul <seanpaul@chromium.org>
Cc: David Airlie <airlied@linux.ie>
Cc: dri-devel@lists.freedesktop.org
Cc: <stable@vger.kernel.org> # v4.10+
Signed-off-by: Haneen Mohammed <hamohammed.sa@gmail.com>
[seanpaul added Fixes and Cc tags]
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20180511041542.GA4253@haneen-vb
---
 drivers/gpu/drm/drm_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index a1b9338736e3..c2c21d839727 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -716,7 +716,7 @@ static void remove_compat_control_link(struct drm_device *dev)
 	if (!minor)
 		return;
 
-	name = kasprintf(GFP_KERNEL, "controlD%d", minor->index);
+	name = kasprintf(GFP_KERNEL, "controlD%d", minor->index + 64);
 	if (!name)
 		return;
 

From 1e2e547a93a00ebc21582c06ca3c6cfea2a309ee Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 May 2018 08:23:01 -0400
Subject: [PATCH 141/393] do d_instantiate/unlock_new_inode combinations safely

For anything NFS-exported we do _not_ want to unlock new inode
before it has grown an alias; original set of fixes got the
ordering right, but missed the nasty complication in case of
lockdep being enabled - unlock_new_inode() does
	lockdep_annotate_inode_mutex_key(inode)
which can only be done before anyone gets a chance to touch
->i_mutex.  Unfortunately, flipping the order and doing
unlock_new_inode() before d_instantiate() opens a window when
mkdir can race with open-by-fhandle on a guessed fhandle, leading
to multiple aliases for a directory inode and all the breakage
that follows from that.

	Correct solution: a new primitive (d_instantiate_new())
combining these two in the right order - lockdep annotate, then
d_instantiate(), then the rest of unlock_new_inode().  All
combinations of d_instantiate() with unlock_new_inode() should
be converted to that.

Cc: stable@kernel.org	# 2.6.29 and later
Tested-by: Mike Marshall <hubcap@omnibond.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c       | 16 ++++------------
 fs/dcache.c            | 22 ++++++++++++++++++++++
 fs/ecryptfs/inode.c    |  3 +--
 fs/ext2/namei.c        |  6 ++----
 fs/ext4/namei.c        |  6 ++----
 fs/f2fs/namei.c        | 12 ++++--------
 fs/jffs2/dir.c         | 12 ++++--------
 fs/jfs/namei.c         | 12 ++++--------
 fs/nilfs2/namei.c      |  6 ++----
 fs/orangefs/namei.c    |  9 +++------
 fs/reiserfs/namei.c    | 12 ++++--------
 fs/udf/namei.c         |  6 ++----
 fs/ufs/namei.c         |  6 ++----
 include/linux/dcache.h |  1 +
 14 files changed, 57 insertions(+), 72 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e064c49c9a9a..9e97cbb4f006 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6575,8 +6575,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock_inode;
 	} else {
 		btrfs_update_inode(trans, root, inode);
-		unlock_new_inode(inode);
-		d_instantiate(dentry, inode);
+		d_instantiate_new(dentry, inode);
 	}
 
 out_unlock:
@@ -6652,8 +6651,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock_inode;
 
 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 
 out_unlock:
 	btrfs_end_transaction(trans);
@@ -6798,12 +6796,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	if (err)
 		goto out_fail_inode;
 
-	d_instantiate(dentry, inode);
-	/*
-	 * mkdir is special.  We're unlocking after we call d_instantiate
-	 * to avoid a race with nfsd calling d_instantiate.
-	 */
-	unlock_new_inode(inode);
+	d_instantiate_new(dentry, inode);
 	drop_on_err = 0;
 
 out_fail:
@@ -10246,8 +10239,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_unlock_inode;
 	}
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 
 out_unlock:
 	btrfs_end_transaction(trans);
diff --git a/fs/dcache.c b/fs/dcache.c
index 86d2de63461e..2acfc69878f5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1899,6 +1899,28 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
 }
 EXPORT_SYMBOL(d_instantiate);
 
+/*
+ * This should be equivalent to d_instantiate() + unlock_new_inode(),
+ * with lockdep-related part of unlock_new_inode() done before
+ * anything else.  Use that instead of open-coding d_instantiate()/
+ * unlock_new_inode() combinations.
+ */
+void d_instantiate_new(struct dentry *entry, struct inode *inode)
+{
+	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
+	BUG_ON(!inode);
+	lockdep_annotate_inode_mutex_key(inode);
+	security_d_instantiate(entry, inode);
+	spin_lock(&inode->i_lock);
+	__d_instantiate(entry, inode);
+	WARN_ON(!(inode->i_state & I_NEW));
+	inode->i_state &= ~I_NEW;
+	smp_mb();
+	wake_up_bit(&inode->i_state, __I_NEW);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(d_instantiate_new);
+
 /**
  * d_instantiate_no_diralias - instantiate a non-aliased dentry
  * @entry: dentry to complete
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 847904aa63a9..7bba8f2693b2 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -283,8 +283,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
 		iget_failed(ecryptfs_inode);
 		goto out;
 	}
-	unlock_new_inode(ecryptfs_inode);
-	d_instantiate(ecryptfs_dentry, ecryptfs_inode);
+	d_instantiate_new(ecryptfs_dentry, ecryptfs_inode);
 out:
 	return rc;
 }
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 55f7caadb093..152453a91877 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,8 +41,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
 {
 	int err = ext2_add_link(dentry, inode);
 	if (!err) {
-		unlock_new_inode(inode);
-		d_instantiate(dentry, inode);
+		d_instantiate_new(dentry, inode);
 		return 0;
 	}
 	inode_dec_link_count(inode);
@@ -255,8 +254,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 	if (err)
 		goto out_fail;
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 out:
 	return err;
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b1f21e3a0763..4a09063ce1d2 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2411,8 +2411,7 @@ static int ext4_add_nondir(handle_t *handle,
 	int err = ext4_add_entry(handle, dentry, inode);
 	if (!err) {
 		ext4_mark_inode_dirty(handle, inode);
-		unlock_new_inode(inode);
-		d_instantiate(dentry, inode);
+		d_instantiate_new(dentry, inode);
 		return 0;
 	}
 	drop_nlink(inode);
@@ -2651,8 +2650,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	err = ext4_mark_inode_dirty(handle, dir);
 	if (err)
 		goto out_clear_inode;
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index d5098efe577c..75e37fd720b2 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -294,8 +294,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 
 	alloc_nid_done(sbi, ino);
 
-	d_instantiate(dentry, inode);
-	unlock_new_inode(inode);
+	d_instantiate_new(dentry, inode);
 
 	if (IS_DIRSYNC(dir))
 		f2fs_sync_fs(sbi->sb, 1);
@@ -597,8 +596,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	err = page_symlink(inode, disk_link.name, disk_link.len);
 
 err_out:
-	d_instantiate(dentry, inode);
-	unlock_new_inode(inode);
+	d_instantiate_new(dentry, inode);
 
 	/*
 	 * Let's flush symlink data in order to avoid broken symlink as much as
@@ -661,8 +659,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 	alloc_nid_done(sbi, inode->i_ino);
 
-	d_instantiate(dentry, inode);
-	unlock_new_inode(inode);
+	d_instantiate_new(dentry, inode);
 
 	if (IS_DIRSYNC(dir))
 		f2fs_sync_fs(sbi->sb, 1);
@@ -713,8 +710,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 
 	alloc_nid_done(sbi, inode->i_ino);
 
-	d_instantiate(dentry, inode);
-	unlock_new_inode(inode);
+	d_instantiate_new(dentry, inode);
 
 	if (IS_DIRSYNC(dir))
 		f2fs_sync_fs(sbi->sb, 1);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 0a754f38462e..e5a6deb38e1e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -209,8 +209,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry,
 		  __func__, inode->i_ino, inode->i_mode, inode->i_nlink,
 		  f->inocache->pino_nlink, inode->i_mapping->nrpages);
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	return 0;
 
  fail:
@@ -430,8 +429,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	mutex_unlock(&dir_f->sem);
 	jffs2_complete_reservation(c);
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	return 0;
 
  fail:
@@ -575,8 +573,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode
 	mutex_unlock(&dir_f->sem);
 	jffs2_complete_reservation(c);
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	return 0;
 
  fail:
@@ -747,8 +744,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode
 	mutex_unlock(&dir_f->sem);
 	jffs2_complete_reservation(c);
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	return 0;
 
  fail:
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index b41596d71858..56c3fcbfe80e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -178,8 +178,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
 		unlock_new_inode(ip);
 		iput(ip);
 	} else {
-		unlock_new_inode(ip);
-		d_instantiate(dentry, ip);
+		d_instantiate_new(dentry, ip);
 	}
 
       out2:
@@ -313,8 +312,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
 		unlock_new_inode(ip);
 		iput(ip);
 	} else {
-		unlock_new_inode(ip);
-		d_instantiate(dentry, ip);
+		d_instantiate_new(dentry, ip);
 	}
 
       out2:
@@ -1059,8 +1057,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 		unlock_new_inode(ip);
 		iput(ip);
 	} else {
-		unlock_new_inode(ip);
-		d_instantiate(dentry, ip);
+		d_instantiate_new(dentry, ip);
 	}
 
       out2:
@@ -1447,8 +1444,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 		unlock_new_inode(ip);
 		iput(ip);
 	} else {
-		unlock_new_inode(ip);
-		d_instantiate(dentry, ip);
+		d_instantiate_new(dentry, ip);
 	}
 
       out1:
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1a2894aa0194..dd52d3f82e8d 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -46,8 +46,7 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
 	int err = nilfs_add_link(dentry, inode);
 
 	if (!err) {
-		d_instantiate(dentry, inode);
-		unlock_new_inode(inode);
+		d_instantiate_new(dentry, inode);
 		return 0;
 	}
 	inode_dec_link_count(inode);
@@ -243,8 +242,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		goto out_fail;
 
 	nilfs_mark_inode_dirty(inode);
-	d_instantiate(dentry, inode);
-	unlock_new_inode(inode);
+	d_instantiate_new(dentry, inode);
 out:
 	if (!err)
 		err = nilfs_transaction_commit(dir->i_sb);
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 6e3134e6d98a..1b5707c44c3f 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -75,8 +75,7 @@ static int orangefs_create(struct inode *dir,
 		     get_khandle_from_ino(inode),
 		     dentry);
 
-	d_instantiate(dentry, inode);
-	unlock_new_inode(inode);
+	d_instantiate_new(dentry, inode);
 	orangefs_set_timeout(dentry);
 	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 	ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
@@ -332,8 +331,7 @@ static int orangefs_symlink(struct inode *dir,
 		     "Assigned symlink inode new number of %pU\n",
 		     get_khandle_from_ino(inode));
 
-	d_instantiate(dentry, inode);
-	unlock_new_inode(inode);
+	d_instantiate_new(dentry, inode);
 	orangefs_set_timeout(dentry);
 	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 	ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
@@ -402,8 +400,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 		     "Assigned dir inode new number of %pU\n",
 		     get_khandle_from_ino(inode));
 
-	d_instantiate(dentry, inode);
-	unlock_new_inode(inode);
+	d_instantiate_new(dentry, inode);
 	orangefs_set_timeout(dentry);
 	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 	ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index bd39a998843d..5089dac02660 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -687,8 +687,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
 	reiserfs_update_inode_transaction(inode);
 	reiserfs_update_inode_transaction(dir);
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	retval = journal_end(&th);
 
 out_failed:
@@ -771,8 +770,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
 		goto out_failed;
 	}
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	retval = journal_end(&th);
 
 out_failed:
@@ -871,8 +869,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 	/* the above add_entry did not update dir's stat data */
 	reiserfs_update_sd(&th, dir);
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	retval = journal_end(&th);
 out_failed:
 	reiserfs_write_unlock(dir->i_sb);
@@ -1187,8 +1184,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
 		goto out_failed;
 	}
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	retval = journal_end(&th);
 out_failed:
 	reiserfs_write_unlock(parent_dir->i_sb);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 0458dd47e105..c586026508db 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -622,8 +622,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 
 	return 0;
 }
@@ -733,8 +732,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	inc_nlink(dir);
 	dir->i_ctime = dir->i_mtime = current_time(dir);
 	mark_inode_dirty(dir);
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 32545cd00ceb..d5f43ba76c59 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -39,8 +39,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
 	int err = ufs_add_link(dentry, inode);
 	if (!err) {
-		unlock_new_inode(inode);
-		d_instantiate(dentry, inode);
+		d_instantiate_new(dentry, inode);
 		return 0;
 	}
 	inode_dec_link_count(inode);
@@ -193,8 +192,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 	if (err)
 		goto out_fail;
 
-	unlock_new_inode(inode);
-	d_instantiate(dentry, inode);
+	d_instantiate_new(dentry, inode);
 	return 0;
 
 out_fail:
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 94acbde17bb1..66c6e17e61e5 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -224,6 +224,7 @@ extern seqlock_t rename_lock;
  * These are the low-level FS interfaces to the dcache..
  */
 extern void d_instantiate(struct dentry *, struct inode *);
+extern void d_instantiate_new(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
 extern int d_instantiate_no_diralias(struct dentry *, struct inode *);

From 79f546a696bff2590169fb5684e23d65f4d9f591 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 11 May 2018 11:20:57 +1000
Subject: [PATCH 142/393] fs: don't scan the inode cache before SB_BORN is set

We recently had an oops reported on a 4.14 kernel in
xfs_reclaim_inodes_count() where sb->s_fs_info pointed to garbage
and so the m_perag_tree lookup walked into lala land.  It produces
an oops down this path during the failed mount:

  radix_tree_gang_lookup_tag+0xc4/0x130
  xfs_perag_get_tag+0x37/0xf0
  xfs_reclaim_inodes_count+0x32/0x40
  xfs_fs_nr_cached_objects+0x11/0x20
  super_cache_count+0x35/0xc0
  shrink_slab.part.66+0xb1/0x370
  shrink_node+0x7e/0x1a0
  try_to_free_pages+0x199/0x470
  __alloc_pages_slowpath+0x3a1/0xd20
  __alloc_pages_nodemask+0x1c3/0x200
  cache_grow_begin+0x20b/0x2e0
  fallback_alloc+0x160/0x200
  kmem_cache_alloc+0x111/0x4e0

The problem is that the superblock shrinker is running before the
filesystem structures it depends on have been fully set up. i.e.
the shrinker is registered in sget(), before ->fill_super() has been
called, and the shrinker can call into the filesystem before
fill_super() does it's setup work. Essentially we are exposed to
both use-after-free and use-before-initialisation bugs here.

To fix this, add a check for the SB_BORN flag in super_cache_count.
In general, this flag is not set until ->fs_mount() completes
successfully, so we know that it is set after the filesystem
setup has completed. This matches the trylock_super() behaviour
which will not let super_cache_scan() run if SB_BORN is not set, and
hence will not allow the superblock shrinker from entering the
filesystem while it is being set up or after it has failed setup
and is being torn down.

Cc: stable@kernel.org
Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 122c402049a2..4b5b562176d0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -121,13 +121,23 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	sb = container_of(shrink, struct super_block, s_shrink);
 
 	/*
-	 * Don't call trylock_super as it is a potential
-	 * scalability bottleneck. The counts could get updated
-	 * between super_cache_count and super_cache_scan anyway.
-	 * Call to super_cache_count with shrinker_rwsem held
-	 * ensures the safety of call to list_lru_shrink_count() and
-	 * s_op->nr_cached_objects().
+	 * We don't call trylock_super() here as it is a scalability bottleneck,
+	 * so we're exposed to partial setup state. The shrinker rwsem does not
+	 * protect filesystem operations backing list_lru_shrink_count() or
+	 * s_op->nr_cached_objects(). Counts can change between
+	 * super_cache_count and super_cache_scan, so we really don't need locks
+	 * here.
+	 *
+	 * However, if we are currently mounting the superblock, the underlying
+	 * filesystem might be in a state of partial construction and hence it
+	 * is dangerous to access it.  trylock_super() uses a SB_BORN check to
+	 * avoid this situation, so do the same here. The memory barrier is
+	 * matched with the one in mount_fs() as we don't hold locks here.
 	 */
+	if (!(sb->s_flags & SB_BORN))
+		return 0;
+	smp_rmb();
+
 	if (sb->s_op && sb->s_op->nr_cached_objects)
 		total_objects = sb->s_op->nr_cached_objects(sb, sc);
 
@@ -1272,6 +1282,14 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	sb = root->d_sb;
 	BUG_ON(!sb);
 	WARN_ON(!sb->s_bdi);
+
+	/*
+	 * Write barrier is for super_cache_count(). We place it before setting
+	 * SB_BORN as the data dependency between the two functions is the
+	 * superblock structure contents that we just set up, not the SB_BORN
+	 * flag.
+	 */
+	smp_wmb();
 	sb->s_flags |= SB_BORN;
 
 	error = security_sb_kern_mount(sb, flags, secdata);

From ffed645e3be0e32f8e9ab068d257aee8d0fe8eec Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 11 May 2018 16:50:35 -0400
Subject: [PATCH 143/393] x86/bugs: Fix the parameters alignment and missing
 void

Fixes: 7bb4d366c ("x86/bugs: Make cpu_show_common() static")
Fixes: 24f7fc83b ("x86/bugs: Provide boot parameters for the spec_store_bypass_disable mitigation")
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/bugs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 784753f0b41e..bab05913f864 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -531,7 +531,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
 	return mode;
 }
 
-static void ssb_select_mitigation()
+static void ssb_select_mitigation(void)
 {
 	ssb_mode = __ssb_select_mitigation();
 
@@ -641,7 +641,7 @@ void x86_spec_ctrl_setup_ap(void)
 #ifdef CONFIG_SYSFS
 
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
-			char *buf, unsigned int bug)
+			       char *buf, unsigned int bug)
 {
 	if (!boot_cpu_has_bug(bug))
 		return sprintf(buf, "Not affected\n");

From 3f12888dfae2a48741c4caa9214885b3aaf350f9 Mon Sep 17 00:00:00 2001
From: Wenwen Wang <wang6495@umn.edu>
Date: Sat, 5 May 2018 13:38:03 -0500
Subject: [PATCH 144/393] ALSA: control: fix a redundant-copy issue

In snd_ctl_elem_add_compat(), the fields of the struct 'data' need to be
copied from the corresponding fields of the struct 'data32' in userspace.
This is achieved by invoking copy_from_user() and get_user() functions. The
problem here is that the 'type' field is copied twice. One is by
copy_from_user() and one is by get_user(). Given that the 'type' field is
not used between the two copies, the second copy is *completely* redundant
and should be removed for better performance and cleanup. Also, these two
copies can cause inconsistent data: as the struct 'data32' resides in
userspace and a malicious userspace process can race to change the 'type'
field between the two copies to cause inconsistent data. Depending on how
the data is used in the future, such an inconsistency may cause potential
security risks.

For above reasons, we should take out the second copy.

Signed-off-by: Wenwen Wang <wang6495@umn.edu>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/control_compat.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sound/core/control_compat.c b/sound/core/control_compat.c
index a848836a5de0..507fd5210c1c 100644
--- a/sound/core/control_compat.c
+++ b/sound/core/control_compat.c
@@ -396,8 +396,7 @@ static int snd_ctl_elem_add_compat(struct snd_ctl_file *file,
 	if (copy_from_user(&data->id, &data32->id, sizeof(data->id)) ||
 	    copy_from_user(&data->type, &data32->type, 3 * sizeof(u32)))
 		goto error;
-	if (get_user(data->owner, &data32->owner) ||
-	    get_user(data->type, &data32->type))
+	if (get_user(data->owner, &data32->owner))
 		goto error;
 	switch (data->type) {
 	case SNDRV_CTL_ELEM_TYPE_BOOLEAN:

From c8beccc19b92f5172994c0732db689c08f4f98e5 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 8 May 2018 09:27:46 +0200
Subject: [PATCH 145/393] ALSA: hda: Add Lenovo C50 All in one to the
 power_save blacklist

Power-saving is causing loud plops on the Lenovo C50 All in one, add it
to the blacklist.

BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1572975
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/hda_intel.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index b0c8c79848a9..a0c93b9c9a28 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2210,6 +2210,8 @@ static struct snd_pci_quirk power_save_blacklist[] = {
 	SND_PCI_QUIRK(0x1849, 0x0c0c, "Asrock B85M-ITX", 0),
 	/* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
 	SND_PCI_QUIRK(0x1043, 0x8733, "Asus Prime X370-Pro", 0),
+	/* https://bugzilla.redhat.com/show_bug.cgi?id=1572975 */
+	SND_PCI_QUIRK(0x17aa, 0x36a7, "Lenovo C50 All in one", 0),
 	/* https://bugzilla.kernel.org/show_bug.cgi?id=198611 */
 	SND_PCI_QUIRK(0x17aa, 0x2227, "Lenovo X1 Carbon 3rd Gen", 0),
 	{}

From f9bc6b2dd9cf025f827f471769e1d88b527bfb91 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 4 May 2018 13:01:32 -0700
Subject: [PATCH 146/393] x86/amd_nb: Add support for Raven Ridge CPUs

Add Raven Ridge root bridge and data fabric PCI IDs.
This is required for amd_pci_dev_to_node_id() and amd_smn_read().

Cc: stable@vger.kernel.org # v4.16+
Tested-by: Gabriel Craciunescu <nix.or.die@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 arch/x86/kernel/amd_nb.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index c88e0b127810..b481b95bd8f6 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -14,8 +14,11 @@
 #include <asm/amd_nb.h>
 
 #define PCI_DEVICE_ID_AMD_17H_ROOT	0x1450
+#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT	0x15d0
 #define PCI_DEVICE_ID_AMD_17H_DF_F3	0x1463
 #define PCI_DEVICE_ID_AMD_17H_DF_F4	0x1464
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
 
 /* Protect the PCI config register pairs used for SMN and DF indirect access. */
 static DEFINE_MUTEX(smn_mutex);
@@ -24,6 +27,7 @@ static u32 *flush_words;
 
 static const struct pci_device_id amd_root_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
 	{}
 };
 
@@ -39,6 +43,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
 	{}
 };
@@ -51,6 +56,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
 	{}
 };

From 3b031622f598481970400519bd5abc2a16708282 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 4 May 2018 13:01:33 -0700
Subject: [PATCH 147/393] hwmon: (k10temp) Use API function to access System
 Management Network

The SMN (System Management Network) on Family 17h AMD CPUs is also accessed
from other drivers, specifically EDAC. Accessing it directly is racy.
On top of that, accessing the SMN through root bridge 00:00 is wrong on
multi-die CPUs and may result in reading the temperature from the wrong
die. Use available API functions to fix the problem.

For this to work, add dependency on AMD_NB. Also change the Raven Ridge
PCI device ID to point to Data Fabric Function 3, since this ID is used
by the API functions to find the CPU node.

Cc: stable@vger.kernel.org # v4.16+
Tested-by: Gabriel Craciunescu <nix.or.die@gmail.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/Kconfig   |  2 +-
 drivers/hwmon/k10temp.c | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index f249a4428458..6ec307c93ece 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -272,7 +272,7 @@ config SENSORS_K8TEMP
 
 config SENSORS_K10TEMP
 	tristate "AMD Family 10h+ temperature sensor"
-	depends on X86 && PCI
+	depends on X86 && PCI && AMD_NB
 	help
 	  If you say yes here you get support for the temperature
 	  sensor(s) inside your CPU. Supported are later revisions of
diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index 34b5448b00be..3b73dee6fdc6 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <asm/amd_nb.h>
 #include <asm/processor.h>
 
 MODULE_DESCRIPTION("AMD Family 10h+ CPU core temperature monitor");
@@ -40,8 +41,8 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
 #define PCI_DEVICE_ID_AMD_17H_DF_F3	0x1463
 #endif
 
-#ifndef PCI_DEVICE_ID_AMD_17H_RR_NB
-#define PCI_DEVICE_ID_AMD_17H_RR_NB	0x15d0
+#ifndef PCI_DEVICE_ID_AMD_17H_M10H_DF_F3
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3	0x15eb
 #endif
 
 /* CPUID function 0x80000001, ebx */
@@ -136,8 +137,8 @@ static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval)
 
 static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval)
 {
-	amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0x60,
-			  F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
+	amd_smn_read(amd_pci_dev_to_node_id(pdev),
+		     F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
 }
 
 static ssize_t temp1_input_show(struct device *dev,
@@ -322,7 +323,7 @@ static const struct pci_device_id k10temp_id_table[] = {
 	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
 	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
 	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
-	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_NB) },
+	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
 	{}
 };
 MODULE_DEVICE_TABLE(pci, k10temp_id_table);

From 21493316a3c4598f308d5a9fa31cc74639c4caff Mon Sep 17 00:00:00 2001
From: Federico Cuello <fedux@fedux.com.ar>
Date: Wed, 9 May 2018 00:13:38 +0200
Subject: [PATCH 148/393] ALSA: usb: mixer: volume quirk for CM102-A+/102S+

Currently it's not possible to set volume lower than 26% (it just mutes).

Also fixes this warning:

  Warning! Unlikely big volume range (=9472), cval->res is probably wrong.
  [13] FU [PCM Playback Volume] ch = 2, val = -9473/-1/1

, and volume works fine for full range.

Signed-off-by: Federico Cuello <fedux@fedux.com.ar>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/mixer.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index 344d7b069d59..bb5ab7a7dfa5 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -967,6 +967,14 @@ static void volume_control_quirks(struct usb_mixer_elem_info *cval,
 		}
 		break;
 
+	case USB_ID(0x0d8c, 0x0103):
+		if (!strcmp(kctl->id.name, "PCM Playback Volume")) {
+			usb_audio_info(chip,
+				 "set volume quirk for CM102-A+/102S+\n");
+			cval->min = -256;
+		}
+		break;
+
 	case USB_ID(0x0471, 0x0101):
 	case USB_ID(0x0471, 0x0104):
 	case USB_ID(0x0471, 0x0105):

From 2f0d520a1a73555ac51c19cd494493f60b4c1cea Mon Sep 17 00:00:00 2001
From: Jeremy Soller <jeremy@system76.com>
Date: Mon, 7 May 2018 09:28:45 -0600
Subject: [PATCH 149/393] ALSA: hda/realtek - Clevo P950ER ALC1220 Fixup

This adds support for the P950ER, which has the same required fixup as
the P950HR, but has a different PCI ID.

Signed-off-by: Jeremy Soller <jeremy@system76.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 2dd34dd77447..01a6643fc7d4 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -2363,6 +2363,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = {
 	SND_PCI_QUIRK_VENDOR(0x1462, "MSI", ALC882_FIXUP_GPIO3),
 	SND_PCI_QUIRK(0x147b, 0x107a, "Abit AW9D-MAX", ALC882_FIXUP_ABIT_AW9D_MAX),
 	SND_PCI_QUIRK(0x1558, 0x9501, "Clevo P950HR", ALC1220_FIXUP_CLEVO_P950),
+	SND_PCI_QUIRK(0x1558, 0x95e2, "Clevo P950ER", ALC1220_FIXUP_CLEVO_P950),
 	SND_PCI_QUIRK_VENDOR(0x1558, "Clevo laptop", ALC882_FIXUP_EAPD),
 	SND_PCI_QUIRK(0x161f, 0x2054, "Medion laptop", ALC883_FIXUP_EAPD),
 	SND_PCI_QUIRK(0x17aa, 0x3a0d, "Lenovo Y530", ALC882_FIXUP_LENOVO_Y530),

From a466ef76b815b86748d9870ef2a430af7b39c710 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 9 May 2018 19:42:20 +0900
Subject: [PATCH 150/393] x86/kexec: Avoid double free_page() upon
 do_kexec_load() failure

>From ff82bedd3e12f0d3353282054ae48c3bd8c72012 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 9 May 2018 12:12:39 +0900
Subject: [PATCH v3] x86/kexec: avoid double free_page() upon do_kexec_load() failure.

syzbot is reporting crashes after memory allocation failure inside
do_kexec_load() [1]. This is because free_transition_pgtable() is called
by both init_transition_pgtable() and machine_kexec_cleanup() when memory
allocation failed inside init_transition_pgtable().

Regarding 32bit code, machine_kexec_free_page_tables() is called by both
machine_kexec_alloc_page_tables() and machine_kexec_cleanup() when memory
allocation failed inside machine_kexec_alloc_page_tables().

Fix this by leaving the error handling to machine_kexec_cleanup()
(and optionally setting NULL after free_page()).

[1] https://syzkaller.appspot.com/bug?id=91e52396168cf2bdd572fe1e1bc0bc645c1c6b40

Fixes: f5deb79679af6eb4 ("x86: kexec: Use one page table in x86_64 machine_kexec")
Fixes: 92be3d6bdf2cb349 ("kexec/i386: allocate page table pages dynamically")
Reported-by: syzbot <syzbot+d96f60296ef613fe1d69@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: prudo@linux.vnet.ibm.com
Cc: Huang Ying <ying.huang@intel.com>
Cc: syzkaller-bugs@googlegroups.com
Cc: takahiro.akashi@linaro.org
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: akpm@linux-foundation.org
Cc: dyoung@redhat.com
Cc: kirill.shutemov@linux.intel.com
Link: https://lkml.kernel.org/r/201805091942.DGG12448.tMFVFSJFQOOLHO@I-love.SAKURA.ne.jp
---
 arch/x86/kernel/machine_kexec_32.c | 6 +++++-
 arch/x86/kernel/machine_kexec_64.c | 5 ++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 60cdec6628b0..d1ab07ec8c9a 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -57,12 +57,17 @@ static void load_segments(void)
 static void machine_kexec_free_page_tables(struct kimage *image)
 {
 	free_page((unsigned long)image->arch.pgd);
+	image->arch.pgd = NULL;
 #ifdef CONFIG_X86_PAE
 	free_page((unsigned long)image->arch.pmd0);
+	image->arch.pmd0 = NULL;
 	free_page((unsigned long)image->arch.pmd1);
+	image->arch.pmd1 = NULL;
 #endif
 	free_page((unsigned long)image->arch.pte0);
+	image->arch.pte0 = NULL;
 	free_page((unsigned long)image->arch.pte1);
+	image->arch.pte1 = NULL;
 }
 
 static int machine_kexec_alloc_page_tables(struct kimage *image)
@@ -79,7 +84,6 @@ static int machine_kexec_alloc_page_tables(struct kimage *image)
 	    !image->arch.pmd0 || !image->arch.pmd1 ||
 #endif
 	    !image->arch.pte0 || !image->arch.pte1) {
-		machine_kexec_free_page_tables(image);
 		return -ENOMEM;
 	}
 	return 0;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index a5e55d832d0a..6010449ca6d2 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -39,9 +39,13 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
 static void free_transition_pgtable(struct kimage *image)
 {
 	free_page((unsigned long)image->arch.p4d);
+	image->arch.p4d = NULL;
 	free_page((unsigned long)image->arch.pud);
+	image->arch.pud = NULL;
 	free_page((unsigned long)image->arch.pmd);
+	image->arch.pmd = NULL;
 	free_page((unsigned long)image->arch.pte);
+	image->arch.pte = NULL;
 }
 
 static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
@@ -91,7 +95,6 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
 	return 0;
 err:
-	free_transition_pgtable(image);
 	return result;
 }
 

From ee6a7354a3629f9b65bc18dbe393503e9440d6f5 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 9 May 2018 21:58:15 +0900
Subject: [PATCH 151/393] kprobes/x86: Prohibit probing on exception masking
 instructions

Since MOV SS and POP SS instructions will delay the exceptions until the
next instruction is executed, single-stepping on it by kprobes must be
prohibited.

However, kprobes usually executes those instructions directly on trampoline
buffer (a.k.a. kprobe-booster), except for the kprobes which has
post_handler. Thus if kprobe user probes MOV SS with post_handler, it will
do single-stepping on the MOV SS.

This means it is safe that if it is used via ftrace or perf/bpf since those
don't use the post_handler.

Anyway, since the stack switching is a rare case, it is safer just
rejecting kprobes on such instructions.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Cc: Francis Deslauriers <francis.deslauriers@efficios.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "David S . Miller" <davem@davemloft.net>
Link: https://lkml.kernel.org/r/152587069574.17316.3311695234863248641.stgit@devbox
---
 arch/x86/include/asm/insn.h    | 18 ++++++++++++++++++
 arch/x86/kernel/kprobes/core.c |  4 ++++
 2 files changed, 22 insertions(+)

diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index b3e32b010ab1..c2c01f84df75 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -208,4 +208,22 @@ static inline int insn_offset_immediate(struct insn *insn)
 	return insn_offset_displacement(insn) + insn->displacement.nbytes;
 }
 
+#define POP_SS_OPCODE 0x1f
+#define MOV_SREG_OPCODE 0x8e
+
+/*
+ * Intel SDM Vol.3A 6.8.3 states;
+ * "Any single-step trap that would be delivered following the MOV to SS
+ * instruction or POP to SS instruction (because EFLAGS.TF is 1) is
+ * suppressed."
+ * This function returns true if @insn is MOV SS or POP SS. On these
+ * instructions, single stepping is suppressed.
+ */
+static inline int insn_masking_exception(struct insn *insn)
+{
+	return insn->opcode.bytes[0] == POP_SS_OPCODE ||
+		(insn->opcode.bytes[0] == MOV_SREG_OPCODE &&
+		 X86_MODRM_REG(insn->modrm.bytes[0]) == 2);
+}
+
 #endif /* _ASM_X86_INSN_H */
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 0715f827607c..6f4d42377fe5 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -370,6 +370,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
 	if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 		return 0;
 
+	/* We should not singlestep on the exception masking instructions */
+	if (insn_masking_exception(insn))
+		return 0;
+
 #ifdef CONFIG_X86_64
 	/* Only x86_64 has RIP relative instructions */
 	if (insn_rip_relative(insn)) {

From 13ebe18c94f5b0665c01ae7fad2717ae959f4212 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 9 May 2018 21:58:45 +0900
Subject: [PATCH 152/393] uprobes/x86: Prohibit probing on MOV SS instruction

Since MOV SS and POP SS instructions will delay the exceptions until the
next instruction is executed, single-stepping on it by uprobes must be
prohibited.

uprobe already rejects probing on POP SS (0x1f), but allows probing on MOV
SS (0x8e and reg == 2).  This checks the target instruction and if it is
MOV SS or POP SS, returns -ENOTSUPP to reject probing.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Cc: Francis Deslauriers <francis.deslauriers@efficios.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "David S . Miller" <davem@davemloft.net>
Link: https://lkml.kernel.org/r/152587072544.17316.5950935243917346341.stgit@devbox
---
 arch/x86/kernel/uprobes.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 85c7ef23d99f..c84bb5396958 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -299,6 +299,10 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
 	if (is_prefix_bad(insn))
 		return -ENOTSUPP;
 
+	/* We should not singlestep on the exception masking instructions */
+	if (insn_masking_exception(insn))
+		return -ENOTSUPP;
+
 	if (x86_64)
 		good_insns = good_insns_64;
 	else

From b127125d9db23e4856156a7c909a3c8e18b69f99 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 25 Apr 2018 10:28:38 -0400
Subject: [PATCH 153/393] fix breakage caused by d_find_alias() semantics
 change

"VFS: don't keep disconnected dentries on d_anon" had a non-trivial
side-effect - d_unhashed() now returns true for those dentries,
making d_find_alias() skip them altogether.  For most of its callers
that's fine - we really want a connected alias there.  However,
there is a codepath where we relied upon picking such aliases
if nothing else could be found - selinux delayed initialization
of contexts for inodes on already mounted filesystems used to
rely upon that.

Cc: stable@kernel.org # f1ee616214cb "VFS: don't keep disconnected dentries on d_anon"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 security/selinux/hooks.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4cafe6a19167..398d165f884e 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1568,8 +1568,15 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 			/* Called from d_instantiate or d_splice_alias. */
 			dentry = dget(opt_dentry);
 		} else {
-			/* Called from selinux_complete_init, try to find a dentry. */
+			/*
+			 * Called from selinux_complete_init, try to find a dentry.
+			 * Some filesystems really want a connected one, so try
+			 * that first.  We could split SECURITY_FS_USE_XATTR in
+			 * two, depending upon that...
+			 */
 			dentry = d_find_alias(inode);
+			if (!dentry)
+				dentry = d_find_any_alias(inode);
 		}
 		if (!dentry) {
 			/*
@@ -1674,14 +1681,19 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 		if ((sbsec->flags & SE_SBGENFS) && !S_ISLNK(inode->i_mode)) {
 			/* We must have a dentry to determine the label on
 			 * procfs inodes */
-			if (opt_dentry)
+			if (opt_dentry) {
 				/* Called from d_instantiate or
 				 * d_splice_alias. */
 				dentry = dget(opt_dentry);
-			else
+			} else {
 				/* Called from selinux_complete_init, try to
-				 * find a dentry. */
+				 * find a dentry.  Some filesystems really want
+				 * a connected one, so try that first.
+				 */
 				dentry = d_find_alias(inode);
+				if (!dentry)
+					dentry = d_find_any_alias(inode);
+			}
 			/*
 			 * This can be hit on boot when a file is accessed
 			 * before the policy is loaded.  When we load policy we

From b1ae32dbab50ed19cfc16d225b0fb0114fb13025 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sun, 13 May 2018 12:32:22 -0700
Subject: [PATCH 154/393] x86/cpufeature: Guard asm_volatile_goto usage for BPF
 compilation

Workaround for the sake of BPF compilation which utilizes kernel
headers, but clang does not support ASM GOTO and fails the build.

Fixes: d0266046ad54 ("x86: Remove FAST_FEATURE_TESTS")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: daniel@iogearbox.net
Cc: peterz@infradead.org
Cc: netdev@vger.kernel.org
Cc: bp@alien8.de
Cc: yhs@fb.com
Cc: kernel-team@fb.com
Cc: torvalds@linux-foundation.org
Cc: davem@davemloft.net
Link: https://lkml.kernel.org/r/20180513193222.1997938-1-ast@kernel.org
---
 arch/x86/include/asm/cpufeature.h | 15 +++++++++++++++
 samples/bpf/Makefile              |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index b27da9602a6d..aced6c9290d6 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -140,6 +140,20 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
 
 #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
 
+#if defined(__clang__) && !defined(CC_HAVE_ASM_GOTO)
+
+/*
+ * Workaround for the sake of BPF compilation which utilizes kernel
+ * headers, but clang does not support ASM GOTO and fails the build.
+ */
+#ifndef __BPF_TRACING__
+#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments"
+#endif
+
+#define static_cpu_has(bit)            boot_cpu_has(bit)
+
+#else
+
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
  * These will statically patch the target code for additional
@@ -195,6 +209,7 @@ static __always_inline __pure bool _static_cpu_has(u16 bit)
 		boot_cpu_has(bit) :				\
 		_static_cpu_has(bit)				\
 )
+#endif
 
 #define cpu_has_bug(c, bit)		cpu_has(c, (bit))
 #define set_cpu_bug(c, bit)		set_cpu_cap(c, (bit))
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4d6a6edd4bf6..092947676143 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -255,7 +255,7 @@ $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
 $(obj)/%.o: $(src)/%.c
 	$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
 		-I$(srctree)/tools/testing/selftests/bpf/ \
-		-D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
+		-D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \
 		-D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \
 		-Wno-gnu-variable-sized-type-not-at-end \
 		-Wno-address-of-packed-member -Wno-tautological-compare \

From f0dfd7a2b35b02030949100247d851b793cb275f Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 9 May 2018 13:22:56 +0100
Subject: [PATCH 155/393] netfilter: nf_tables: fix memory leak on error exit
 return

Currently the -EBUSY error return path is not free'ing resources
allocated earlier, leaving a memory leak. Fix this by exiting via the
error exit label err5 that performs the necessary resource clean
up.

Detected by CoverityScan, CID#1432975 ("Resource leak")

Fixes: 9744a6fcefcb ("netfilter: nf_tables: check if same extensions are set when adding elements")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3806db31cbbf..91e80aa852d6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4080,8 +4080,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
 			    nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
 			    nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
-			    nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF))
-				return -EBUSY;
+			    nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) {
+				err = -EBUSY;
+				goto err5;
+			}
 			if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
 			     nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
 			     memcmp(nft_set_ext_data(ext),

From b84bbaf7a6c8cca24f8acf25a2c8e46913a947ba Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 11 May 2018 13:24:25 -0400
Subject: [PATCH 156/393] packet: in packet_snd start writing at link layer
 allocation

Packet sockets allow construction of packets shorter than
dev->hard_header_len to accommodate protocols with variable length
link layer headers. These packets are padded to dev->hard_header_len,
because some device drivers interpret that as a minimum packet size.

packet_snd reserves dev->hard_header_len bytes on allocation.
SOCK_DGRAM sockets call skb_push in dev_hard_header() to ensure that
link layer headers are stored in the reserved range. SOCK_RAW sockets
do the same in tpacket_snd, but not in packet_snd.

Syzbot was able to send a zero byte packet to a device with massive
116B link layer header, causing padding to cross over into skb_shinfo.
Fix this by writing from the start of the llheader reserved range also
in the case of packet_snd/SOCK_RAW.

Update skb_set_network_header to the new offset. This also corrects
it for SOCK_DGRAM, where it incorrectly double counted reserve due to
the skb_push in dev_hard_header.

Fixes: 9ed988cd5915 ("packet: validate variable length ll headers")
Reported-by: syzbot+71d74a5406d02057d559@syzkaller.appspotmail.com
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 01f3515cada0..e9422fe45179 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2903,13 +2903,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	if (skb == NULL)
 		goto out_unlock;
 
-	skb_set_network_header(skb, reserve);
+	skb_reset_network_header(skb);
 
 	err = -EINVAL;
 	if (sock->type == SOCK_DGRAM) {
 		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
 		if (unlikely(offset < 0))
 			goto out_free;
+	} else if (reserve) {
+		skb_push(skb, reserve);
 	}
 
 	/* Returns -EFAULT on error */

From 55c82617c3e82210b7471e9334e8fc5df6a9961f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 12 May 2018 12:16:50 +0200
Subject: [PATCH 157/393] 3c59x: convert to generic DMA API

This driver supports EISA devices in addition to PCI devices, and relied
on the legacy behavior of the pci_dma* shims to pass on a NULL pointer
to the DMA API, and the DMA API being able to handle that.  When the
NULL forwarding broke the EISA support got broken.  Fix this by converting
to the DMA API instead of the legacy PCI shims.

Fixes: 4167b2ad ("PCI: Remove NULL device handling from PCI DMA API")
Reported-by: tedheadster <tedheadster@gmail.com>
Tested-by: tedheadster <tedheadster@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/3com/3c59x.c | 104 +++++++++++++++---------------
 1 file changed, 51 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index 36c8950dbd2d..176861bd2252 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -1212,9 +1212,9 @@ static int vortex_probe1(struct device *gendev, void __iomem *ioaddr, int irq,
 	vp->mii.reg_num_mask = 0x1f;
 
 	/* Makes sure rings are at least 16 byte aligned. */
-	vp->rx_ring = pci_alloc_consistent(pdev, sizeof(struct boom_rx_desc) * RX_RING_SIZE
+	vp->rx_ring = dma_alloc_coherent(gendev, sizeof(struct boom_rx_desc) * RX_RING_SIZE
 					   + sizeof(struct boom_tx_desc) * TX_RING_SIZE,
-					   &vp->rx_ring_dma);
+					   &vp->rx_ring_dma, GFP_KERNEL);
 	retval = -ENOMEM;
 	if (!vp->rx_ring)
 		goto free_device;
@@ -1476,11 +1476,10 @@ static int vortex_probe1(struct device *gendev, void __iomem *ioaddr, int irq,
 		return 0;
 
 free_ring:
-	pci_free_consistent(pdev,
-						sizeof(struct boom_rx_desc) * RX_RING_SIZE
-							+ sizeof(struct boom_tx_desc) * TX_RING_SIZE,
-						vp->rx_ring,
-						vp->rx_ring_dma);
+	dma_free_coherent(&pdev->dev,
+		sizeof(struct boom_rx_desc) * RX_RING_SIZE +
+		sizeof(struct boom_tx_desc) * TX_RING_SIZE,
+		vp->rx_ring, vp->rx_ring_dma);
 free_device:
 	free_netdev(dev);
 	pr_err(PFX "vortex_probe1 fails.  Returns %d\n", retval);
@@ -1751,9 +1750,9 @@ vortex_open(struct net_device *dev)
 				break;			/* Bad news!  */
 
 			skb_reserve(skb, NET_IP_ALIGN);	/* Align IP on 16 byte boundaries */
-			dma = pci_map_single(VORTEX_PCI(vp), skb->data,
-					     PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
-			if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma))
+			dma = dma_map_single(vp->gendev, skb->data,
+					     PKT_BUF_SZ, DMA_FROM_DEVICE);
+			if (dma_mapping_error(vp->gendev, dma))
 				break;
 			vp->rx_ring[i].addr = cpu_to_le32(dma);
 		}
@@ -2067,9 +2066,9 @@ vortex_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (vp->bus_master) {
 		/* Set the bus-master controller to transfer the packet. */
 		int len = (skb->len + 3) & ~3;
-		vp->tx_skb_dma = pci_map_single(VORTEX_PCI(vp), skb->data, len,
-						PCI_DMA_TODEVICE);
-		if (dma_mapping_error(&VORTEX_PCI(vp)->dev, vp->tx_skb_dma)) {
+		vp->tx_skb_dma = dma_map_single(vp->gendev, skb->data, len,
+						DMA_TO_DEVICE);
+		if (dma_mapping_error(vp->gendev, vp->tx_skb_dma)) {
 			dev_kfree_skb_any(skb);
 			dev->stats.tx_dropped++;
 			return NETDEV_TX_OK;
@@ -2168,9 +2167,9 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			vp->tx_ring[entry].status = cpu_to_le32(skb->len | TxIntrUploaded | AddTCPChksum | AddUDPChksum);
 
 	if (!skb_shinfo(skb)->nr_frags) {
-		dma_addr = pci_map_single(VORTEX_PCI(vp), skb->data, skb->len,
-					  PCI_DMA_TODEVICE);
-		if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma_addr))
+		dma_addr = dma_map_single(vp->gendev, skb->data, skb->len,
+					  DMA_TO_DEVICE);
+		if (dma_mapping_error(vp->gendev, dma_addr))
 			goto out_dma_err;
 
 		vp->tx_ring[entry].frag[0].addr = cpu_to_le32(dma_addr);
@@ -2178,9 +2177,9 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	} else {
 		int i;
 
-		dma_addr = pci_map_single(VORTEX_PCI(vp), skb->data,
-					  skb_headlen(skb), PCI_DMA_TODEVICE);
-		if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma_addr))
+		dma_addr = dma_map_single(vp->gendev, skb->data,
+					  skb_headlen(skb), DMA_TO_DEVICE);
+		if (dma_mapping_error(vp->gendev, dma_addr))
 			goto out_dma_err;
 
 		vp->tx_ring[entry].frag[0].addr = cpu_to_le32(dma_addr);
@@ -2189,21 +2188,21 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
-			dma_addr = skb_frag_dma_map(&VORTEX_PCI(vp)->dev, frag,
+			dma_addr = skb_frag_dma_map(vp->gendev, frag,
 						    0,
 						    frag->size,
 						    DMA_TO_DEVICE);
-			if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma_addr)) {
+			if (dma_mapping_error(vp->gendev, dma_addr)) {
 				for(i = i-1; i >= 0; i--)
-					dma_unmap_page(&VORTEX_PCI(vp)->dev,
+					dma_unmap_page(vp->gendev,
 						       le32_to_cpu(vp->tx_ring[entry].frag[i+1].addr),
 						       le32_to_cpu(vp->tx_ring[entry].frag[i+1].length),
 						       DMA_TO_DEVICE);
 
-				pci_unmap_single(VORTEX_PCI(vp),
+				dma_unmap_single(vp->gendev,
 						 le32_to_cpu(vp->tx_ring[entry].frag[0].addr),
 						 le32_to_cpu(vp->tx_ring[entry].frag[0].length),
-						 PCI_DMA_TODEVICE);
+						 DMA_TO_DEVICE);
 
 				goto out_dma_err;
 			}
@@ -2218,8 +2217,8 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 #else
-	dma_addr = pci_map_single(VORTEX_PCI(vp), skb->data, skb->len, PCI_DMA_TODEVICE);
-	if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma_addr))
+	dma_addr = dma_map_single(vp->gendev, skb->data, skb->len, DMA_TO_DEVICE);
+	if (dma_mapping_error(vp->gendev, dma_addr))
 		goto out_dma_err;
 	vp->tx_ring[entry].addr = cpu_to_le32(dma_addr);
 	vp->tx_ring[entry].length = cpu_to_le32(skb->len | LAST_FRAG);
@@ -2254,7 +2253,7 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 out:
 	return NETDEV_TX_OK;
 out_dma_err:
-	dev_err(&VORTEX_PCI(vp)->dev, "Error mapping dma buffer\n");
+	dev_err(vp->gendev, "Error mapping dma buffer\n");
 	goto out;
 }
 
@@ -2322,7 +2321,7 @@ vortex_interrupt(int irq, void *dev_id)
 		if (status & DMADone) {
 			if (ioread16(ioaddr + Wn7_MasterStatus) & 0x1000) {
 				iowrite16(0x1000, ioaddr + Wn7_MasterStatus); /* Ack the event. */
-				pci_unmap_single(VORTEX_PCI(vp), vp->tx_skb_dma, (vp->tx_skb->len + 3) & ~3, PCI_DMA_TODEVICE);
+				dma_unmap_single(vp->gendev, vp->tx_skb_dma, (vp->tx_skb->len + 3) & ~3, DMA_TO_DEVICE);
 				pkts_compl++;
 				bytes_compl += vp->tx_skb->len;
 				dev_kfree_skb_irq(vp->tx_skb); /* Release the transferred buffer */
@@ -2459,19 +2458,19 @@ boomerang_interrupt(int irq, void *dev_id)
 					struct sk_buff *skb = vp->tx_skbuff[entry];
 #if DO_ZEROCOPY
 					int i;
-					pci_unmap_single(VORTEX_PCI(vp),
+					dma_unmap_single(vp->gendev,
 							le32_to_cpu(vp->tx_ring[entry].frag[0].addr),
 							le32_to_cpu(vp->tx_ring[entry].frag[0].length)&0xFFF,
-							PCI_DMA_TODEVICE);
+							DMA_TO_DEVICE);
 
 					for (i=1; i<=skb_shinfo(skb)->nr_frags; i++)
-							pci_unmap_page(VORTEX_PCI(vp),
+							dma_unmap_page(vp->gendev,
 											 le32_to_cpu(vp->tx_ring[entry].frag[i].addr),
 											 le32_to_cpu(vp->tx_ring[entry].frag[i].length)&0xFFF,
-											 PCI_DMA_TODEVICE);
+											 DMA_TO_DEVICE);
 #else
-					pci_unmap_single(VORTEX_PCI(vp),
-						le32_to_cpu(vp->tx_ring[entry].addr), skb->len, PCI_DMA_TODEVICE);
+					dma_unmap_single(vp->gendev,
+						le32_to_cpu(vp->tx_ring[entry].addr), skb->len, DMA_TO_DEVICE);
 #endif
 					pkts_compl++;
 					bytes_compl += skb->len;
@@ -2561,14 +2560,14 @@ static int vortex_rx(struct net_device *dev)
 				/* 'skb_put()' points to the start of sk_buff data area. */
 				if (vp->bus_master &&
 					! (ioread16(ioaddr + Wn7_MasterStatus) & 0x8000)) {
-					dma_addr_t dma = pci_map_single(VORTEX_PCI(vp), skb_put(skb, pkt_len),
-									   pkt_len, PCI_DMA_FROMDEVICE);
+					dma_addr_t dma = dma_map_single(vp->gendev, skb_put(skb, pkt_len),
+									   pkt_len, DMA_FROM_DEVICE);
 					iowrite32(dma, ioaddr + Wn7_MasterAddr);
 					iowrite16((skb->len + 3) & ~3, ioaddr + Wn7_MasterLen);
 					iowrite16(StartDMAUp, ioaddr + EL3_CMD);
 					while (ioread16(ioaddr + Wn7_MasterStatus) & 0x8000)
 						;
-					pci_unmap_single(VORTEX_PCI(vp), dma, pkt_len, PCI_DMA_FROMDEVICE);
+					dma_unmap_single(vp->gendev, dma, pkt_len, DMA_FROM_DEVICE);
 				} else {
 					ioread32_rep(ioaddr + RX_FIFO,
 					             skb_put(skb, pkt_len),
@@ -2635,11 +2634,11 @@ boomerang_rx(struct net_device *dev)
 			if (pkt_len < rx_copybreak &&
 			    (skb = netdev_alloc_skb(dev, pkt_len + 2)) != NULL) {
 				skb_reserve(skb, 2);	/* Align IP on 16 byte boundaries */
-				pci_dma_sync_single_for_cpu(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				dma_sync_single_for_cpu(vp->gendev, dma, PKT_BUF_SZ, DMA_FROM_DEVICE);
 				/* 'skb_put()' points to the start of sk_buff data area. */
 				skb_put_data(skb, vp->rx_skbuff[entry]->data,
 					     pkt_len);
-				pci_dma_sync_single_for_device(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				dma_sync_single_for_device(vp->gendev, dma, PKT_BUF_SZ, DMA_FROM_DEVICE);
 				vp->rx_copy++;
 			} else {
 				/* Pre-allocate the replacement skb.  If it or its
@@ -2651,9 +2650,9 @@ boomerang_rx(struct net_device *dev)
 					dev->stats.rx_dropped++;
 					goto clear_complete;
 				}
-				newdma = pci_map_single(VORTEX_PCI(vp), newskb->data,
-							PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
-				if (dma_mapping_error(&VORTEX_PCI(vp)->dev, newdma)) {
+				newdma = dma_map_single(vp->gendev, newskb->data,
+							PKT_BUF_SZ, DMA_FROM_DEVICE);
+				if (dma_mapping_error(vp->gendev, newdma)) {
 					dev->stats.rx_dropped++;
 					consume_skb(newskb);
 					goto clear_complete;
@@ -2664,7 +2663,7 @@ boomerang_rx(struct net_device *dev)
 				vp->rx_skbuff[entry] = newskb;
 				vp->rx_ring[entry].addr = cpu_to_le32(newdma);
 				skb_put(skb, pkt_len);
-				pci_unmap_single(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				dma_unmap_single(vp->gendev, dma, PKT_BUF_SZ, DMA_FROM_DEVICE);
 				vp->rx_nocopy++;
 			}
 			skb->protocol = eth_type_trans(skb, dev);
@@ -2761,8 +2760,8 @@ vortex_close(struct net_device *dev)
 	if (vp->full_bus_master_rx) { /* Free Boomerang bus master Rx buffers. */
 		for (i = 0; i < RX_RING_SIZE; i++)
 			if (vp->rx_skbuff[i]) {
-				pci_unmap_single(	VORTEX_PCI(vp), le32_to_cpu(vp->rx_ring[i].addr),
-									PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				dma_unmap_single(vp->gendev, le32_to_cpu(vp->rx_ring[i].addr),
+									PKT_BUF_SZ, DMA_FROM_DEVICE);
 				dev_kfree_skb(vp->rx_skbuff[i]);
 				vp->rx_skbuff[i] = NULL;
 			}
@@ -2775,12 +2774,12 @@ vortex_close(struct net_device *dev)
 				int k;
 
 				for (k=0; k<=skb_shinfo(skb)->nr_frags; k++)
-						pci_unmap_single(VORTEX_PCI(vp),
+						dma_unmap_single(vp->gendev,
 										 le32_to_cpu(vp->tx_ring[i].frag[k].addr),
 										 le32_to_cpu(vp->tx_ring[i].frag[k].length)&0xFFF,
-										 PCI_DMA_TODEVICE);
+										 DMA_TO_DEVICE);
 #else
-				pci_unmap_single(VORTEX_PCI(vp), le32_to_cpu(vp->tx_ring[i].addr), skb->len, PCI_DMA_TODEVICE);
+				dma_unmap_single(vp->gendev, le32_to_cpu(vp->tx_ring[i].addr), skb->len, DMA_TO_DEVICE);
 #endif
 				dev_kfree_skb(skb);
 				vp->tx_skbuff[i] = NULL;
@@ -3288,11 +3287,10 @@ static void vortex_remove_one(struct pci_dev *pdev)
 
 	pci_iounmap(pdev, vp->ioaddr);
 
-	pci_free_consistent(pdev,
-						sizeof(struct boom_rx_desc) * RX_RING_SIZE
-							+ sizeof(struct boom_tx_desc) * TX_RING_SIZE,
-						vp->rx_ring,
-						vp->rx_ring_dma);
+	dma_free_coherent(&pdev->dev,
+			sizeof(struct boom_rx_desc) * RX_RING_SIZE +
+			sizeof(struct boom_tx_desc) * TX_RING_SIZE,
+			vp->rx_ring, vp->rx_ring_dma);
 
 	pci_release_regions(pdev);
 

From 91dfd02b23006e7cc557bcb3a40aeb740f66fb52 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Sun, 13 May 2018 20:54:06 +0300
Subject: [PATCH 158/393] qede: Fix ref-cnt usage count

Rebooting while qedr is loaded with a VLAN interface present
results in unregister_netdevice waiting for the usage count
to become free.
The fix is that rdma devices should be removed before unregistering
the netdevice, to assure all references to ndev are decreased.

Fixes: cee9fbd8e2e9 ("qede: Add qedr framework")
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: Michal Kalderon <michal.kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index a01e7d6e5442..f6655e251bbd 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1066,13 +1066,12 @@ static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode)
 
 	DP_INFO(edev, "Starting qede_remove\n");
 
+	qede_rdma_dev_remove(edev);
 	unregister_netdev(ndev);
 	cancel_delayed_work_sync(&edev->sp_task);
 
 	qede_ptp_disable(edev);
 
-	qede_rdma_dev_remove(edev);
-
 	edev->ops->common->set_power_state(cdev, PCI_D0);
 
 	pci_set_drvdata(pdev, NULL);

From 0b3225ab9407f557a8e20f23f37aa7236c10a9b1 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Fri, 4 May 2018 07:59:58 +0200
Subject: [PATCH 159/393] efi: Avoid potential crashes, fix the 'struct
 efi_pci_io_protocol_32' definition for mixed mode

Mixed mode allows a kernel built for x86_64 to interact with 32-bit
EFI firmware, but requires us to define all struct definitions carefully
when it comes to pointer sizes.

'struct efi_pci_io_protocol_32' currently uses a 'void *' for the
'romimage' field, which will be interpreted as a 64-bit field
on such kernels, potentially resulting in bogus memory references
and subsequent crashes.

Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: <stable@vger.kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20180504060003.19618-13-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/eboot.c | 6 ++++--
 include/linux/efi.h              | 8 ++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 47d3efff6805..09f36c0d9d4f 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -163,7 +163,8 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
 	if (status != EFI_SUCCESS)
 		goto free_struct;
 
-	memcpy(rom->romdata, pci->romimage, pci->romsize);
+	memcpy(rom->romdata, (void *)(unsigned long)pci->romimage,
+	       pci->romsize);
 	return status;
 
 free_struct:
@@ -269,7 +270,8 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom)
 	if (status != EFI_SUCCESS)
 		goto free_struct;
 
-	memcpy(rom->romdata, pci->romimage, pci->romsize);
+	memcpy(rom->romdata, (void *)(unsigned long)pci->romimage,
+	       pci->romsize);
 	return status;
 
 free_struct:
diff --git a/include/linux/efi.h b/include/linux/efi.h
index f1b7d68ac460..3016d8c456bc 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -395,8 +395,8 @@ typedef struct {
 	u32 attributes;
 	u32 get_bar_attributes;
 	u32 set_bar_attributes;
-	uint64_t romsize;
-	void *romimage;
+	u64 romsize;
+	u32 romimage;
 } efi_pci_io_protocol_32;
 
 typedef struct {
@@ -415,8 +415,8 @@ typedef struct {
 	u64 attributes;
 	u64 get_bar_attributes;
 	u64 set_bar_attributes;
-	uint64_t romsize;
-	void *romimage;
+	u64 romsize;
+	u64 romimage;
 } efi_pci_io_protocol_64;
 
 typedef struct {

From 0d463d8449010347b2bd8723cf32298bd07cfc57 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Sun, 6 May 2018 16:55:24 +0200
Subject: [PATCH 160/393] arm64: dts: exynos: Fix interrupt type for I2S1
 device on Exynos5433

All interrupts from SoC internal modules are level triggered, so fix
incorrect trigger type for I2S1 device on Exynos5433 SoCs.

This fixes following kernel warning:

WARNING: CPU: 2 PID: 1 at drivers/irqchip/irq-gic.c:1016 gic_irq_domain_translate+0xb0/0xb8
Modules linked in:
CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.16.0-rc7-next-20180329 #646
Hardware name: Samsung TM2 board (DT)
pstate: 20000005 (nzCv daif -PAN -UAO)
pc : gic_irq_domain_translate+0xb0/0xb8
lr : irq_create_fwspec_mapping+0x64/0x328
sp : ffff0000098b38d0
...
Call trace:
 gic_irq_domain_translate+0xb0/0xb8
 irq_create_of_mapping+0x78/0xa0
 of_irq_get+0x6c/0xa0
 of_irq_to_resource+0x38/0x108
 of_irq_to_resource_table+0x50/0x78
 of_device_alloc+0x118/0x1b8
 of_platform_device_create_pdata+0x54/0xe0
 of_platform_bus_create+0x118/0x340
 of_platform_bus_create+0x17c/0x340
 of_platform_populate+0x74/0xd8
 of_platform_default_populate_init+0xb0/0xcc
 do_one_initcall+0x50/0x158
 kernel_init_freeable+0x184/0x22c
 kernel_init+0x10/0x108
 ret_from_fork+0x10/0x18
---[ end trace 6decb2b3078d73f0 ]---

Fixes: d8d579c316e8 ("ARM: dts: exynos: Add I2S1 device node to exynos5433")
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 arch/arm64/boot/dts/exynos/exynos5433.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/exynos/exynos5433.dtsi b/arch/arm64/boot/dts/exynos/exynos5433.dtsi
index c0231d077fa6..1ad8677f6a0a 100644
--- a/arch/arm64/boot/dts/exynos/exynos5433.dtsi
+++ b/arch/arm64/boot/dts/exynos/exynos5433.dtsi
@@ -1317,7 +1317,7 @@ i2s1: i2s@14d60000 {
 			reg = <0x14d60000 0x100>;
 			dmas = <&pdma0 31 &pdma0 30>;
 			dma-names = "tx", "rx";
-			interrupts = <GIC_SPI 435 IRQ_TYPE_NONE>;
+			interrupts = <GIC_SPI 435 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&cmu_peric CLK_PCLK_I2S1>,
 				 <&cmu_peric CLK_PCLK_I2S1>,
 				 <&cmu_peric CLK_SCLK_I2S1>;

From 4fe875e4bd3cae85ae6f6eaf77f63fabe613b66e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 14 May 2018 10:15:54 +0200
Subject: [PATCH 161/393] objtool, kprobes/x86: Sync the latest <asm/insn.h>
 header with tools/objtool/arch/x86/include/asm/insn.h

The following commit:

  ee6a7354a362: kprobes/x86: Prohibit probing on exception masking instructions

Modified <asm/insn.h>, adding the insn_masking_exception() function.

Sync the tooling version of the header to it, to fix this warning:

  Warning: synced file at 'tools/objtool/arch/x86/include/asm/insn.h' differs from latest kernel version at 'arch/x86/include/asm/insn.h'

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Cc: Francis Deslauriers <francis.deslauriers@efficios.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "David S . Miller" <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/arch/x86/include/asm/insn.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tools/objtool/arch/x86/include/asm/insn.h b/tools/objtool/arch/x86/include/asm/insn.h
index b3e32b010ab1..c2c01f84df75 100644
--- a/tools/objtool/arch/x86/include/asm/insn.h
+++ b/tools/objtool/arch/x86/include/asm/insn.h
@@ -208,4 +208,22 @@ static inline int insn_offset_immediate(struct insn *insn)
 	return insn_offset_displacement(insn) + insn->displacement.nbytes;
 }
 
+#define POP_SS_OPCODE 0x1f
+#define MOV_SREG_OPCODE 0x8e
+
+/*
+ * Intel SDM Vol.3A 6.8.3 states;
+ * "Any single-step trap that would be delivered following the MOV to SS
+ * instruction or POP to SS instruction (because EFLAGS.TF is 1) is
+ * suppressed."
+ * This function returns true if @insn is MOV SS or POP SS. On these
+ * instructions, single stepping is suppressed.
+ */
+static inline int insn_masking_exception(struct insn *insn)
+{
+	return insn->opcode.bytes[0] == POP_SS_OPCODE ||
+		(insn->opcode.bytes[0] == MOV_SREG_OPCODE &&
+		 X86_MODRM_REG(insn->modrm.bytes[0]) == 2);
+}
+
 #endif /* _ASM_X86_INSN_H */

From 0afd0d9e0e7879d666c1df2fa1bea4d8716909fe Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 9 May 2018 22:39:14 -0500
Subject: [PATCH 162/393] objtool: Fix "noreturn" detection for recursive
 sibling calls

Objtool has some crude logic for detecting static "noreturn" functions
(aka "dead ends").  This is necessary for being able to correctly follow
GCC code flow when such functions are called.

It's remotely possible for two functions to call each other via sibling
calls.  If they don't have RET instructions, objtool's noreturn
detection logic goes into a recursive loop:

  drivers/char/ipmi/ipmi_ssif.o: warning: objtool: return_hosed_msg()+0x0: infinite recursion (objtool bug!)
  drivers/char/ipmi/ipmi_ssif.o: warning: objtool: deliver_recv_msg()+0x0: infinite recursion (objtool bug!)

Instead of reporting an error in this case, consider the functions to be
non-dead-ends.

Reported-and-tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: damian <damian.tometzki@icloud.com>
Link: http://lkml.kernel.org/r/7cc156408c5781a1f62085d352ced1fe39fe2f91.1525923412.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/check.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 5409f6f6c48d..264522d4e4af 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -190,9 +190,13 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func,
 					continue;
 
 				if (recursion == 5) {
-					WARN_FUNC("infinite recursion (objtool bug!)",
-						  dest->sec, dest->offset);
-					return -1;
+					/*
+					 * Infinite recursion: two functions
+					 * have sibling calls to each other.
+					 * This is a very rare case.  It means
+					 * they aren't dead ends.
+					 */
+					return 0;
 				}
 
 				return __dead_end_function(file, dest_func,

From 13810435b9a7014fb92eb715f77da488f3b65b99 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 9 May 2018 22:39:15 -0500
Subject: [PATCH 163/393] objtool: Support GCC 8's cold subfunctions

GCC 8 moves a lot of unlikely code out of line to "cold" subfunctions in
.text.unlikely.  Properly detect the new subfunctions and treat them as
extensions of the original functions.

This fixes a bunch of warnings like:

  kernel/cgroup/cgroup.o: warning: objtool: parse_cgroup_root_flags()+0x33: sibling call from callable instruction with modified stack frame
  kernel/cgroup/cgroup.o: warning: objtool: cgroup_addrm_files()+0x290: sibling call from callable instruction with modified stack frame
  kernel/cgroup/cgroup.o: warning: objtool: cgroup_apply_control_enable()+0x25b: sibling call from callable instruction with modified stack frame
  kernel/cgroup/cgroup.o: warning: objtool: rebind_subsystems()+0x325: sibling call from callable instruction with modified stack frame

Reported-and-tested-by: damian <damian.tometzki@icloud.com>
Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/0965e7fcfc5f31a276f0c7f298ff770c19b68706.1525923412.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/check.c | 93 ++++++++++++++++++++++++-------------------
 tools/objtool/elf.c   | 42 ++++++++++++++++++-
 tools/objtool/elf.h   |  2 +
 3 files changed, 93 insertions(+), 44 deletions(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 264522d4e4af..14daf6a27d9f 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -59,6 +59,31 @@ static struct instruction *next_insn_same_sec(struct objtool_file *file,
 	return next;
 }
 
+static struct instruction *next_insn_same_func(struct objtool_file *file,
+					       struct instruction *insn)
+{
+	struct instruction *next = list_next_entry(insn, list);
+	struct symbol *func = insn->func;
+
+	if (!func)
+		return NULL;
+
+	if (&next->list != &file->insn_list && next->func == func)
+		return next;
+
+	/* Check if we're already in the subfunction: */
+	if (func == func->cfunc)
+		return NULL;
+
+	/* Move to the subfunction: */
+	return find_insn(file, func->cfunc->sec, func->cfunc->offset);
+}
+
+#define func_for_each_insn_all(file, func, insn)			\
+	for (insn = find_insn(file, func->sec, func->offset);		\
+	     insn;							\
+	     insn = next_insn_same_func(file, insn))
+
 #define func_for_each_insn(file, func, insn)				\
 	for (insn = find_insn(file, func->sec, func->offset);		\
 	     insn && &insn->list != &file->insn_list &&			\
@@ -149,10 +174,14 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func,
 			if (!strcmp(func->name, global_noreturns[i]))
 				return 1;
 
-	if (!func->sec)
+	if (!func->len)
 		return 0;
 
-	func_for_each_insn(file, func, insn) {
+	insn = find_insn(file, func->sec, func->offset);
+	if (!insn->func)
+		return 0;
+
+	func_for_each_insn_all(file, func, insn) {
 		empty = false;
 
 		if (insn->type == INSN_RETURN)
@@ -167,28 +196,17 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func,
 	 * case, the function's dead-end status depends on whether the target
 	 * of the sibling call returns.
 	 */
-	func_for_each_insn(file, func, insn) {
-		if (insn->sec != func->sec ||
-		    insn->offset >= func->offset + func->len)
-			break;
-
+	func_for_each_insn_all(file, func, insn) {
 		if (insn->type == INSN_JUMP_UNCONDITIONAL) {
 			struct instruction *dest = insn->jump_dest;
-			struct symbol *dest_func;
 
 			if (!dest)
 				/* sibling call to another file */
 				return 0;
 
-			if (dest->sec != func->sec ||
-			    dest->offset < func->offset ||
-			    dest->offset >= func->offset + func->len) {
-				/* local sibling call */
-				dest_func = find_symbol_by_offset(dest->sec,
-								  dest->offset);
-				if (!dest_func)
-					continue;
+			if (dest->func && dest->func->pfunc != insn->func->pfunc) {
 
+				/* local sibling call */
 				if (recursion == 5) {
 					/*
 					 * Infinite recursion: two functions
@@ -199,7 +217,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func,
 					return 0;
 				}
 
-				return __dead_end_function(file, dest_func,
+				return __dead_end_function(file, dest->func,
 							   recursion + 1);
 			}
 		}
@@ -426,7 +444,7 @@ static void add_ignores(struct objtool_file *file)
 			if (!ignore_func(file, func))
 				continue;
 
-			func_for_each_insn(file, func, insn)
+			func_for_each_insn_all(file, func, insn)
 				insn->ignore = true;
 		}
 	}
@@ -786,9 +804,8 @@ static int add_special_section_alts(struct objtool_file *file)
 	return ret;
 }
 
-static int add_switch_table(struct objtool_file *file, struct symbol *func,
-			    struct instruction *insn, struct rela *table,
-			    struct rela *next_table)
+static int add_switch_table(struct objtool_file *file, struct instruction *insn,
+			    struct rela *table, struct rela *next_table)
 {
 	struct rela *rela = table;
 	struct instruction *alt_insn;
@@ -798,18 +815,13 @@ static int add_switch_table(struct objtool_file *file, struct symbol *func,
 		if (rela == next_table)
 			break;
 
-		if (rela->sym->sec != insn->sec ||
-		    rela->addend <= func->offset ||
-		    rela->addend >= func->offset + func->len)
+		alt_insn = find_insn(file, rela->sym->sec, rela->addend);
+		if (!alt_insn)
 			break;
 
-		alt_insn = find_insn(file, insn->sec, rela->addend);
-		if (!alt_insn) {
-			WARN("%s: can't find instruction at %s+0x%x",
-			     file->rodata->rela->name, insn->sec->name,
-			     rela->addend);
-			return -1;
-		}
+		/* Make sure the jmp dest is in the function or subfunction: */
+		if (alt_insn->func->pfunc != insn->func->pfunc)
+			break;
 
 		alt = malloc(sizeof(*alt));
 		if (!alt) {
@@ -947,7 +959,7 @@ static int add_func_switch_tables(struct objtool_file *file,
 	struct rela *rela, *prev_rela = NULL;
 	int ret;
 
-	func_for_each_insn(file, func, insn) {
+	func_for_each_insn_all(file, func, insn) {
 		if (!last)
 			last = insn;
 
@@ -978,8 +990,7 @@ static int add_func_switch_tables(struct objtool_file *file,
 		 * the beginning of another switch table in the same function.
 		 */
 		if (prev_jump) {
-			ret = add_switch_table(file, func, prev_jump, prev_rela,
-					       rela);
+			ret = add_switch_table(file, prev_jump, prev_rela, rela);
 			if (ret)
 				return ret;
 		}
@@ -989,7 +1000,7 @@ static int add_func_switch_tables(struct objtool_file *file,
 	}
 
 	if (prev_jump) {
-		ret = add_switch_table(file, func, prev_jump, prev_rela, NULL);
+		ret = add_switch_table(file, prev_jump, prev_rela, NULL);
 		if (ret)
 			return ret;
 	}
@@ -1753,15 +1764,13 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 	while (1) {
 		next_insn = next_insn_same_sec(file, insn);
 
-
-		if (file->c_file && func && insn->func && func != insn->func) {
+		if (file->c_file && func && insn->func && func != insn->func->pfunc) {
 			WARN("%s() falls through to next function %s()",
 			     func->name, insn->func->name);
 			return 1;
 		}
 
-		if (insn->func)
-			func = insn->func;
+		func = insn->func ? insn->func->pfunc : NULL;
 
 		if (func && insn->ignore) {
 			WARN_FUNC("BUG: why am I validating an ignored function?",
@@ -1782,7 +1791,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 
 				i = insn;
 				save_insn = NULL;
-				func_for_each_insn_continue_reverse(file, func, i) {
+				func_for_each_insn_continue_reverse(file, insn->func, i) {
 					if (i->save) {
 						save_insn = i;
 						break;
@@ -1869,7 +1878,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 		case INSN_JUMP_UNCONDITIONAL:
 			if (insn->jump_dest &&
 			    (!func || !insn->jump_dest->func ||
-			     func == insn->jump_dest->func)) {
+			     insn->jump_dest->func->pfunc == func)) {
 				ret = validate_branch(file, insn->jump_dest,
 						      state);
 				if (ret)
@@ -2064,7 +2073,7 @@ static int validate_functions(struct objtool_file *file)
 
 	for_each_sec(file, sec) {
 		list_for_each_entry(func, &sec->symbol_list, list) {
-			if (func->type != STT_FUNC)
+			if (func->type != STT_FUNC || func->pfunc != func)
 				continue;
 
 			insn = find_insn(file, sec, func->offset);
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index c1c338661699..4e60e105583e 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -79,6 +79,19 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset)
 	return NULL;
 }
 
+struct symbol *find_symbol_by_name(struct elf *elf, const char *name)
+{
+	struct section *sec;
+	struct symbol *sym;
+
+	list_for_each_entry(sec, &elf->sections, list)
+		list_for_each_entry(sym, &sec->symbol_list, list)
+			if (!strcmp(sym->name, name))
+				return sym;
+
+	return NULL;
+}
+
 struct symbol *find_symbol_containing(struct section *sec, unsigned long offset)
 {
 	struct symbol *sym;
@@ -203,10 +216,11 @@ static int read_sections(struct elf *elf)
 
 static int read_symbols(struct elf *elf)
 {
-	struct section *symtab;
-	struct symbol *sym;
+	struct section *symtab, *sec;
+	struct symbol *sym, *pfunc;
 	struct list_head *entry, *tmp;
 	int symbols_nr, i;
+	char *coldstr;
 
 	symtab = find_section_by_name(elf, ".symtab");
 	if (!symtab) {
@@ -281,6 +295,30 @@ static int read_symbols(struct elf *elf)
 		hash_add(sym->sec->symbol_hash, &sym->hash, sym->idx);
 	}
 
+	/* Create parent/child links for any cold subfunctions */
+	list_for_each_entry(sec, &elf->sections, list) {
+		list_for_each_entry(sym, &sec->symbol_list, list) {
+			if (sym->type != STT_FUNC)
+				continue;
+			sym->pfunc = sym->cfunc = sym;
+			coldstr = strstr(sym->name, ".cold.");
+			if (coldstr) {
+				coldstr[0] = '\0';
+				pfunc = find_symbol_by_name(elf, sym->name);
+				coldstr[0] = '.';
+
+				if (!pfunc) {
+					WARN("%s(): can't find parent function",
+					     sym->name);
+					goto err;
+				}
+
+				sym->pfunc = pfunc;
+				pfunc->cfunc = sym;
+			}
+		}
+	}
+
 	return 0;
 
 err:
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index d86e2ff14466..de5cd2ddded9 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -61,6 +61,7 @@ struct symbol {
 	unsigned char bind, type;
 	unsigned long offset;
 	unsigned int len;
+	struct symbol *pfunc, *cfunc;
 };
 
 struct rela {
@@ -86,6 +87,7 @@ struct elf {
 struct elf *elf_open(const char *name, int flags);
 struct section *find_section_by_name(struct elf *elf, const char *name);
 struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset);
+struct symbol *find_symbol_by_name(struct elf *elf, const char *name);
 struct symbol *find_symbol_containing(struct section *sec, unsigned long offset);
 struct rela *find_rela_by_dest(struct section *sec, unsigned long offset);
 struct rela *find_rela_by_dest_range(struct section *sec, unsigned long offset,

From fd35c88b74170d9335530d9abf271d5d73eb5401 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 10 May 2018 17:48:49 -0500
Subject: [PATCH 164/393] objtool: Support GCC 8 switch tables

With GCC 8, some issues were found with the objtool switch table
detection.

1) In the .rodata section, immediately after the switch table, there can
   be another object which contains a pointer to the function which had
   the switch statement.  In this case objtool wrongly considers the
   function pointer to be part of the switch table.  Fix it by:

   a) making sure there are no pointers to the beginning of the
      function; and

   b) making sure there are no gaps in the switch table.

   Only the former was needed, the latter adds additional protection for
   future optimizations.

2) In find_switch_table(), case 1 and case 2 are missing the check to
   ensure that the .rodata switch table data is anonymous, i.e. that it
   isn't already associated with an ELF symbol.  Fix it by adding the
   same find_symbol_containing() check which is used for case 3.

This fixes the following warnings with GCC 8:

  drivers/block/virtio_blk.o: warning: objtool: virtio_queue_rq()+0x0: stack state mismatch: cfa1=7+8 cfa2=7+72
  net/ipv6/icmp.o: warning: objtool: icmpv6_rcv()+0x0: stack state mismatch: cfa1=7+8 cfa2=7+64
  drivers/usb/core/quirks.o: warning: objtool: quirks_param_set()+0x0: stack state mismatch: cfa1=7+8 cfa2=7+48
  drivers/mtd/nand/raw/nand_hynix.o: warning: objtool: hynix_nand_decode_id()+0x0: stack state mismatch: cfa1=7+8 cfa2=7+24
  drivers/mtd/nand/raw/nand_samsung.o: warning: objtool: samsung_nand_decode_id()+0x0: stack state mismatch: cfa1=7+8 cfa2=7+32
  drivers/gpu/drm/nouveau/nvkm/subdev/top/gk104.o: warning: objtool: gk104_top_oneinit()+0x0: stack state mismatch: cfa1=7+8 cfa2=7+64

Reported-by: Arnd Bergmann <arnd@arndb.de>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: damian <damian.tometzki@icloud.com>
Link: http://lkml.kernel.org/r/20180510224849.xwi34d6tzheb5wgw@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/check.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 14daf6a27d9f..9bb04fddd3c8 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -810,17 +810,28 @@ static int add_switch_table(struct objtool_file *file, struct instruction *insn,
 	struct rela *rela = table;
 	struct instruction *alt_insn;
 	struct alternative *alt;
+	struct symbol *pfunc = insn->func->pfunc;
+	unsigned int prev_offset = 0;
 
 	list_for_each_entry_from(rela, &file->rodata->rela->rela_list, list) {
 		if (rela == next_table)
 			break;
 
+		/* Make sure the switch table entries are consecutive: */
+		if (prev_offset && rela->offset != prev_offset + 8)
+			break;
+
+		/* Detect function pointers from contiguous objects: */
+		if (rela->sym->sec == pfunc->sec &&
+		    rela->addend == pfunc->offset)
+			break;
+
 		alt_insn = find_insn(file, rela->sym->sec, rela->addend);
 		if (!alt_insn)
 			break;
 
 		/* Make sure the jmp dest is in the function or subfunction: */
-		if (alt_insn->func->pfunc != insn->func->pfunc)
+		if (alt_insn->func->pfunc != pfunc)
 			break;
 
 		alt = malloc(sizeof(*alt));
@@ -831,6 +842,13 @@ static int add_switch_table(struct objtool_file *file, struct instruction *insn,
 
 		alt->insn = alt_insn;
 		list_add_tail(&alt->list, &insn->alts);
+		prev_offset = rela->offset;
+	}
+
+	if (!prev_offset) {
+		WARN_FUNC("can't find switch jump table",
+			  insn->sec, insn->offset);
+		return -1;
 	}
 
 	return 0;
@@ -887,7 +905,9 @@ static struct rela *find_switch_table(struct objtool_file *file,
 	struct instruction *orig_insn = insn;
 
 	text_rela = find_rela_by_dest_range(insn->sec, insn->offset, insn->len);
-	if (text_rela && text_rela->sym == file->rodata->sym) {
+	if (text_rela && text_rela->sym == file->rodata->sym &&
+	    !find_symbol_containing(file->rodata, text_rela->addend)) {
+
 		/* case 1 */
 		rodata_rela = find_rela_by_dest(file->rodata,
 						text_rela->addend);

From 5f2b745f5e1304f438f9b2cd03ebc8120b6e0d3b Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Sun, 13 May 2018 17:33:57 -0400
Subject: [PATCH 165/393] x86/cpu: Make alternative_msr_write work for 32-bit
 code

Cast val and (val >> 32) to (u32), so that they fit in a
general-purpose register in both 32-bit and 64-bit code.

[ tglx: Made it u32 instead of uintptr_t ]

Fixes: c65732e4f721 ("x86/cpu: Restore CPUID_8000_0008_EBX reload")
Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/nospec-branch.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 328ea3cb769f..bc258e644e5e 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -265,8 +265,8 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
 {
 	asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
 		: : "c" (msr),
-		    "a" (val),
-		    "d" (val >> 32),
+		    "a" ((u32)val),
+		    "d" ((u32)(val >> 32)),
 		    [feature] "i" (feature)
 		: "memory");
 }

From 2278446e2b7cd33ad894b32e7eb63afc7db6c86e Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Mon, 14 May 2018 11:57:23 +0300
Subject: [PATCH 166/393] xhci: Fix USB3 NULL pointer dereference at logical
 disconnect.

Hub driver will try to disable a USB3 device twice at logical disconnect,
racing with xhci_free_dev() callback from the first port disable.

This can be triggered with "udisksctl power-off --block-device <disk>"
or by writing "1" to the "remove" sysfs file for a USB3 device
in 4.17-rc4.

USB3 devices don't have a similar disabled link state as USB2 devices,
and use a U3 suspended link state instead. In this state the port
is still enabled and connected.

hub_port_connect() first disconnects the device, then later it notices
that device is still enabled (due to U3 states) it will try to disable
the port again (set to U3).

The xhci_free_dev() called during device disable is async, so checking
for existing xhci->devs[i] when setting link state to U3 the second time
was successful, even if device was being freed.

The regression was caused by, and whole thing revealed by,
Commit 44a182b9d177 ("xhci: Fix use-after-free in xhci_free_virt_device")
which sets xhci->devs[i]->udev to NULL before xhci_virt_dev() returned.
and causes a NULL pointer dereference the second time we try to set U3.

Fix this by checking xhci->devs[i]->udev exists before setting link state.

The original patch went to stable so this fix needs to be applied there as
well.

Fixes: 44a182b9d177 ("xhci: Fix use-after-free in xhci_free_virt_device")
Cc: <stable@vger.kernel.org>
Reported-by: Jordan Glover <Golden_Miller83@protonmail.ch>
Tested-by: Jordan Glover <Golden_Miller83@protonmail.ch>
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/xhci-hub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index 72ebbc908e19..32cd52ca8318 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c
@@ -354,7 +354,7 @@ int xhci_find_slot_id_by_port(struct usb_hcd *hcd, struct xhci_hcd *xhci,
 
 	slot_id = 0;
 	for (i = 0; i < MAX_HC_SLOTS; i++) {
-		if (!xhci->devs[i])
+		if (!xhci->devs[i] || !xhci->devs[i]->udev)
 			continue;
 		speed = xhci->devs[i]->udev->speed;
 		if (((speed >= USB_SPEED_SUPER) == (hcd->speed >= HCD_USB3))

From 4a09f0210c8b1221aae8afda8bd3a603fece0986 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 9 May 2018 11:18:22 +0200
Subject: [PATCH 167/393] x86/boot/64/clang: Use fixup_pointer() to access
 '__supported_pte_mask'

Clang builds with defconfig started crashing after the following
commit:

  fb43d6cb91ef ("x86/mm: Do not auto-massage page protections")

This was caused by introducing a new global access in __startup_64().

Code in __startup_64() can be relocated during execution, but the compiler
doesn't have to generate PC-relative relocations when accessing globals
from that function. Clang actually does not generate them, which leads
to boot-time crashes. To work around this problem, every global pointer
must be adjusted using fixup_pointer().

Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: dvyukov@google.com
Cc: kirill.shutemov@linux.intel.com
Cc: linux-mm@kvack.org
Cc: md@google.com
Cc: mka@chromium.org
Fixes: fb43d6cb91ef ("x86/mm: Do not auto-massage page protections")
Link: http://lkml.kernel.org/r/20180509091822.191810-1-glider@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/head64.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0c408f8c4ed4..2d29e47c056e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -104,6 +104,12 @@ static bool __head check_la57_support(unsigned long physaddr)
 }
 #endif
 
+/* Code in __startup_64() can be relocated during execution, but the compiler
+ * doesn't have to generate PC-relative relocations when accessing globals from
+ * that function. Clang actually does not generate them, which leads to
+ * boot-time crashes. To work around this problem, every global pointer must
+ * be adjusted using fixup_pointer().
+ */
 unsigned long __head __startup_64(unsigned long physaddr,
 				  struct boot_params *bp)
 {
@@ -113,6 +119,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	p4dval_t *p4d;
 	pudval_t *pud;
 	pmdval_t *pmd, pmd_entry;
+	pteval_t *mask_ptr;
 	bool la57;
 	int i;
 	unsigned int *next_pgt_ptr;
@@ -196,7 +203,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
 
 	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
 	/* Filter out unsupported __PAGE_KERNEL_* bits: */
-	pmd_entry &= __supported_pte_mask;
+	mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
+	pmd_entry &= *mask_ptr;
 	pmd_entry += sme_get_me_mask();
 	pmd_entry +=  physaddr;
 

From 0fb96620dce351608aa82eed5942e2f58b07beda Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 14 May 2018 10:56:23 +0200
Subject: [PATCH 168/393] x86/pkeys/selftests: Adjust the self-test to fresh
 distros that export the pkeys ABI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ubuntu 18.04 started exporting pkeys details in header files, resulting
in build failures and warnings in the pkeys self-tests:

  protection_keys.c:232:0: warning: "SEGV_BNDERR" redefined
  protection_keys.c:387:5: error: conflicting types for ‘pkey_get’
  protection_keys.c:409:5: error: conflicting types for ‘pkey_set’
  ...

Fix these namespace conflicts and double definitions, plus also
clean up the ABI definitions to make it all a bit more readable ...

Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: akpm@linux-foundation.org
Cc: dave.hansen@intel.com
Cc: linux-mm@kvack.org
Cc: linuxram@us.ibm.com
Cc: mpe@ellerman.id.au
Cc: shakeelb@google.com
Cc: shuah@kernel.org
Link: http://lkml.kernel.org/r/20180514085623.GB7094@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 67 ++++++++++++-------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index f15aa5a76fe3..bbe80a5c31c7 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -191,26 +191,30 @@ void lots_o_noops_around_write(int *write_to_me)
 #ifdef __i386__
 
 #ifndef SYS_mprotect_key
-# define SYS_mprotect_key 380
+# define SYS_mprotect_key	380
 #endif
+
 #ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc	 381
-# define SYS_pkey_free	 382
+# define SYS_pkey_alloc		381
+# define SYS_pkey_free		382
 #endif
-#define REG_IP_IDX REG_EIP
-#define si_pkey_offset 0x14
+
+#define REG_IP_IDX		REG_EIP
+#define si_pkey_offset		0x14
 
 #else
 
 #ifndef SYS_mprotect_key
-# define SYS_mprotect_key 329
+# define SYS_mprotect_key	329
 #endif
+
 #ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc	 330
-# define SYS_pkey_free	 331
+# define SYS_pkey_alloc		330
+# define SYS_pkey_free		331
 #endif
-#define REG_IP_IDX REG_RIP
-#define si_pkey_offset 0x20
+
+#define REG_IP_IDX		REG_RIP
+#define si_pkey_offset		0x20
 
 #endif
 
@@ -225,8 +229,14 @@ void dump_mem(void *dumpme, int len_bytes)
 	}
 }
 
-#define SEGV_BNDERR     3  /* failed address bound checks */
-#define SEGV_PKUERR     4
+/* Failed address bound checks: */
+#ifndef SEGV_BNDERR
+# define SEGV_BNDERR		3
+#endif
+
+#ifndef SEGV_PKUERR
+# define SEGV_PKUERR		4
+#endif
 
 static char *si_code_str(int si_code)
 {
@@ -393,10 +403,15 @@ pid_t fork_lazy_child(void)
 	return forkret;
 }
 
-#define PKEY_DISABLE_ACCESS    0x1
-#define PKEY_DISABLE_WRITE     0x2
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS	0x1
+#endif
 
-u32 pkey_get(int pkey, unsigned long flags)
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE	0x2
+#endif
+
+static u32 hw_pkey_get(int pkey, unsigned long flags)
 {
 	u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
 	u32 pkru = __rdpkru();
@@ -418,7 +433,7 @@ u32 pkey_get(int pkey, unsigned long flags)
 	return masked_pkru;
 }
 
-int pkey_set(int pkey, unsigned long rights, unsigned long flags)
+static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
 {
 	u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
 	u32 old_pkru = __rdpkru();
@@ -452,15 +467,15 @@ void pkey_disable_set(int pkey, int flags)
 		pkey, flags);
 	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
 
-	pkey_rights = pkey_get(pkey, syscall_flags);
+	pkey_rights = hw_pkey_get(pkey, syscall_flags);
 
-	dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
+	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 			pkey, pkey, pkey_rights);
 	pkey_assert(pkey_rights >= 0);
 
 	pkey_rights |= flags;
 
-	ret = pkey_set(pkey, pkey_rights, syscall_flags);
+	ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
 	assert(!ret);
 	/*pkru and flags have the same format */
 	shadow_pkru |= flags << (pkey * 2);
@@ -468,8 +483,8 @@ void pkey_disable_set(int pkey, int flags)
 
 	pkey_assert(ret >= 0);
 
-	pkey_rights = pkey_get(pkey, syscall_flags);
-	dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
+	pkey_rights = hw_pkey_get(pkey, syscall_flags);
+	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 			pkey, pkey, pkey_rights);
 
 	dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
@@ -483,24 +498,24 @@ void pkey_disable_clear(int pkey, int flags)
 {
 	unsigned long syscall_flags = 0;
 	int ret;
-	int pkey_rights = pkey_get(pkey, syscall_flags);
+	int pkey_rights = hw_pkey_get(pkey, syscall_flags);
 	u32 orig_pkru = rdpkru();
 
 	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
 
-	dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
+	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 			pkey, pkey, pkey_rights);
 	pkey_assert(pkey_rights >= 0);
 
 	pkey_rights |= flags;
 
-	ret = pkey_set(pkey, pkey_rights, 0);
+	ret = hw_pkey_set(pkey, pkey_rights, 0);
 	/* pkru and flags have the same format */
 	shadow_pkru &= ~(flags << (pkey * 2));
 	pkey_assert(ret >= 0);
 
-	pkey_rights = pkey_get(pkey, syscall_flags);
-	dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
+	pkey_rights = hw_pkey_get(pkey, syscall_flags);
+	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 			pkey, pkey, pkey_rights);
 
 	dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());

From 73bb4d6cd192b8629c5125aaada9892d9fc986b6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 14 May 2018 10:59:08 +0200
Subject: [PATCH 169/393] x86/mpx/selftests: Adjust the self-test to fresh
 distros that export the MPX ABI

Fix this warning:

  mpx-mini-test.c:422:0: warning: "SEGV_BNDERR" redefined

Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: akpm@linux-foundation.org
Cc: dave.hansen@intel.com
Cc: linux-mm@kvack.org
Cc: linuxram@us.ibm.com
Cc: mpe@ellerman.id.au
Cc: shakeelb@google.com
Cc: shuah@kernel.org
Link: http://lkml.kernel.org/r/20180514085908.GA12798@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/mpx-mini-test.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c
index 9c0325e1ea68..50f7e9272481 100644
--- a/tools/testing/selftests/x86/mpx-mini-test.c
+++ b/tools/testing/selftests/x86/mpx-mini-test.c
@@ -368,6 +368,11 @@ static int expected_bnd_index = -1;
 uint64_t shadow_plb[NR_MPX_BOUNDS_REGISTERS][2]; /* shadow MPX bound registers */
 unsigned long shadow_map[NR_MPX_BOUNDS_REGISTERS];
 
+/* Failed address bound checks: */
+#ifndef SEGV_BNDERR
+# define SEGV_BNDERR	3
+#endif
+
 /*
  * The kernel is supposed to provide some information about the bounds
  * exception in the siginfo.  It should match what we have in the bounds
@@ -419,8 +424,6 @@ void handler(int signum, siginfo_t *si, void *vucontext)
 		br_count++;
 		dprintf1("#BR 0x%jx (total seen: %d)\n", status, br_count);
 
-#define SEGV_BNDERR     3  /* failed address bound checks */
-
 		dprintf2("Saw a #BR! status 0x%jx at %016lx br_reason: %jx\n",
 				status, ip, br_reason);
 		dprintf2("si_signo: %d\n", si->si_signo);

From 59c2a7226fc5130032021c99f05ad5c0a56551cd Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 8 May 2018 10:28:35 -0700
Subject: [PATCH 170/393] x86/selftests: Add mov_to_ss test

This exercises a nasty corner case of the x86 ISA.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/67e08b69817171da8026e0eb3af0214b06b4d74f.1525800455.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/Makefile      |   2 +-
 tools/testing/selftests/x86/mov_ss_trap.c | 285 ++++++++++++++++++++++
 2 files changed, 286 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/mov_ss_trap.c

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index d744991c0f4f..39f66bc29b82 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -11,7 +11,7 @@ CAN_BUILD_X86_64 := $(shell ./check_cc.sh $(CC) trivial_64bit_program.c)
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
 			check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
-			protection_keys test_vdso test_vsyscall
+			protection_keys test_vdso test_vsyscall mov_ss_trap
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
 			vdso_restorer
diff --git a/tools/testing/selftests/x86/mov_ss_trap.c b/tools/testing/selftests/x86/mov_ss_trap.c
new file mode 100644
index 000000000000..3c3a022654f3
--- /dev/null
+++ b/tools/testing/selftests/x86/mov_ss_trap.c
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * mov_ss_trap.c: Exercise the bizarre side effects of a watchpoint on MOV SS
+ *
+ * This does MOV SS from a watchpointed address followed by various
+ * types of kernel entries.  A MOV SS that hits a watchpoint will queue
+ * up a #DB trap but will not actually deliver that trap.  The trap
+ * will be delivered after the next instruction instead.  The CPU's logic
+ * seems to be:
+ *
+ *  - Any fault: drop the pending #DB trap.
+ *  - INT $N, INT3, INTO, SYSCALL, SYSENTER: enter the kernel and then
+ *    deliver #DB.
+ *  - ICEBP: enter the kernel but do not deliver the watchpoint trap
+ *  - breakpoint: only one #DB is delivered (phew!)
+ *
+ * There are plenty of ways for a kernel to handle this incorrectly.  This
+ * test tries to exercise all the cases.
+ *
+ * This should mostly cover CVE-2018-1087 and CVE-2018-8897.
+ */
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/user.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <err.h>
+#include <string.h>
+#include <setjmp.h>
+#include <sys/prctl.h>
+
+#define X86_EFLAGS_RF (1UL << 16)
+
+#if __x86_64__
+# define REG_IP REG_RIP
+#else
+# define REG_IP REG_EIP
+#endif
+
+unsigned short ss;
+extern unsigned char breakpoint_insn[];
+sigjmp_buf jmpbuf;
+static unsigned char altstack_data[SIGSTKSZ];
+
+static void enable_watchpoint(void)
+{
+	pid_t parent = getpid();
+	int status;
+
+	pid_t child = fork();
+	if (child < 0)
+		err(1, "fork");
+
+	if (child) {
+		if (waitpid(child, &status, 0) != child)
+			err(1, "waitpid for child");
+	} else {
+		unsigned long dr0, dr1, dr7;
+
+		dr0 = (unsigned long)&ss;
+		dr1 = (unsigned long)breakpoint_insn;
+		dr7 = ((1UL << 1) |	/* G0 */
+		       (3UL << 16) |	/* RW0 = read or write */
+		       (1UL << 18) |	/* LEN0 = 2 bytes */
+		       (1UL << 3));	/* G1, RW1 = insn */
+
+		if (ptrace(PTRACE_ATTACH, parent, NULL, NULL) != 0)
+			err(1, "PTRACE_ATTACH");
+
+		if (waitpid(parent, &status, 0) != parent)
+			err(1, "waitpid for child");
+
+		if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[0]), dr0) != 0)
+			err(1, "PTRACE_POKEUSER DR0");
+
+		if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[1]), dr1) != 0)
+			err(1, "PTRACE_POKEUSER DR1");
+
+		if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[7]), dr7) != 0)
+			err(1, "PTRACE_POKEUSER DR7");
+
+		printf("\tDR0 = %lx, DR1 = %lx, DR7 = %lx\n", dr0, dr1, dr7);
+
+		if (ptrace(PTRACE_DETACH, parent, NULL, NULL) != 0)
+			err(1, "PTRACE_DETACH");
+
+		exit(0);
+	}
+}
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static char const * const signames[] = {
+	[SIGSEGV] = "SIGSEGV",
+	[SIGBUS] = "SIBGUS",
+	[SIGTRAP] = "SIGTRAP",
+	[SIGILL] = "SIGILL",
+};
+
+static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
+{
+	ucontext_t *ctx = ctx_void;
+
+	printf("\tGot SIGTRAP with RIP=%lx, EFLAGS.RF=%d\n",
+	       (unsigned long)ctx->uc_mcontext.gregs[REG_IP],
+	       !!(ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_RF));
+}
+
+static void handle_and_return(int sig, siginfo_t *si, void *ctx_void)
+{
+	ucontext_t *ctx = ctx_void;
+
+	printf("\tGot %s with RIP=%lx\n", signames[sig],
+	       (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
+}
+
+static void handle_and_longjmp(int sig, siginfo_t *si, void *ctx_void)
+{
+	ucontext_t *ctx = ctx_void;
+
+	printf("\tGot %s with RIP=%lx\n", signames[sig],
+	       (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
+
+	siglongjmp(jmpbuf, 1);
+}
+
+int main()
+{
+	unsigned long nr;
+
+	asm volatile ("mov %%ss, %[ss]" : [ss] "=m" (ss));
+	printf("\tSS = 0x%hx, &SS = 0x%p\n", ss, &ss);
+
+	if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0) == 0)
+		printf("\tPR_SET_PTRACER_ANY succeeded\n");
+
+	printf("\tSet up a watchpoint\n");
+	sethandler(SIGTRAP, sigtrap, 0);
+	enable_watchpoint();
+
+	printf("[RUN]\tRead from watched memory (should get SIGTRAP)\n");
+	asm volatile ("mov %[ss], %[tmp]" : [tmp] "=r" (nr) : [ss] "m" (ss));
+
+	printf("[RUN]\tMOV SS; INT3\n");
+	asm volatile ("mov %[ss], %%ss; int3" :: [ss] "m" (ss));
+
+	printf("[RUN]\tMOV SS; INT 3\n");
+	asm volatile ("mov %[ss], %%ss; .byte 0xcd, 0x3" :: [ss] "m" (ss));
+
+	printf("[RUN]\tMOV SS; CS CS INT3\n");
+	asm volatile ("mov %[ss], %%ss; .byte 0x2e, 0x2e; int3" :: [ss] "m" (ss));
+
+	printf("[RUN]\tMOV SS; CSx14 INT3\n");
+	asm volatile ("mov %[ss], %%ss; .fill 14,1,0x2e; int3" :: [ss] "m" (ss));
+
+	printf("[RUN]\tMOV SS; INT 4\n");
+	sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
+	asm volatile ("mov %[ss], %%ss; int $4" :: [ss] "m" (ss));
+
+#ifdef __i386__
+	printf("[RUN]\tMOV SS; INTO\n");
+	sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
+	nr = -1;
+	asm volatile ("add $1, %[tmp]; mov %[ss], %%ss; into"
+		      : [tmp] "+r" (nr) : [ss] "m" (ss));
+#endif
+
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		printf("[RUN]\tMOV SS; ICEBP\n");
+
+		/* Some emulators (e.g. QEMU TCG) don't emulate ICEBP. */
+		sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
+
+		asm volatile ("mov %[ss], %%ss; .byte 0xf1" :: [ss] "m" (ss));
+	}
+
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		printf("[RUN]\tMOV SS; CLI\n");
+		sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+		asm volatile ("mov %[ss], %%ss; cli" :: [ss] "m" (ss));
+	}
+
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		printf("[RUN]\tMOV SS; #PF\n");
+		sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+		asm volatile ("mov %[ss], %%ss; mov (-1), %[tmp]"
+			      : [tmp] "=r" (nr) : [ss] "m" (ss));
+	}
+
+	/*
+	 * INT $1: if #DB has DPL=3 and there isn't special handling,
+	 * then the kernel will die.
+	 */
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		printf("[RUN]\tMOV SS; INT 1\n");
+		sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+		asm volatile ("mov %[ss], %%ss; int $1" :: [ss] "m" (ss));
+	}
+
+#ifdef __x86_64__
+	/*
+	 * In principle, we should test 32-bit SYSCALL as well, but
+	 * the calling convention is so unpredictable that it's
+	 * not obviously worth the effort.
+	 */
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		printf("[RUN]\tMOV SS; SYSCALL\n");
+		sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
+		nr = SYS_getpid;
+		/*
+		 * Toggle the high bit of RSP to make it noncanonical to
+		 * strengthen this test on non-SMAP systems.
+		 */
+		asm volatile ("btc $63, %%rsp\n\t"
+			      "mov %[ss], %%ss; syscall\n\t"
+			      "btc $63, %%rsp"
+			      : "+a" (nr) : [ss] "m" (ss)
+			      : "rcx"
+#ifdef __x86_64__
+				, "r11"
+#endif
+			);
+	}
+#endif
+
+	printf("[RUN]\tMOV SS; breakpointed NOP\n");
+	asm volatile ("mov %[ss], %%ss; breakpoint_insn: nop" :: [ss] "m" (ss));
+
+	/*
+	 * Invoking SYSENTER directly breaks all the rules.  Just handle
+	 * the SIGSEGV.
+	 */
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		printf("[RUN]\tMOV SS; SYSENTER\n");
+		stack_t stack = {
+			.ss_sp = altstack_data,
+			.ss_size = SIGSTKSZ,
+		};
+		if (sigaltstack(&stack, NULL) != 0)
+			err(1, "sigaltstack");
+		sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND | SA_ONSTACK);
+		nr = SYS_getpid;
+		asm volatile ("mov %[ss], %%ss; SYSENTER" : "+a" (nr)
+			      : [ss] "m" (ss) : "flags", "rcx"
+#ifdef __x86_64__
+				, "r11"
+#endif
+			);
+
+		/* We're unreachable here.  SYSENTER forgets RIP. */
+	}
+
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		printf("[RUN]\tMOV SS; INT $0x80\n");
+		sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+		nr = 20;	/* compat getpid */
+		asm volatile ("mov %[ss], %%ss; int $0x80"
+			      : "+a" (nr) : [ss] "m" (ss)
+			      : "flags"
+#ifdef __x86_64__
+				, "r8", "r9", "r10", "r11"
+#endif
+			);
+	}
+
+	printf("[OK]\tI aten't dead\n");
+	return 0;
+}

From 55556b0b2016806b2e16a20b62d143383983a34a Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:38 -0700
Subject: [PATCH 171/393] x86/pkeys/selftests: Give better unexpected fault
 error messages

do_not_expect_pk_fault() is a helper that we call when we do not expect
a PK fault to have occurred.  But, it is a function, which means that
it obscures the line numbers from pkey_assert().  It also gives no
details.

Replace it with an implementation that gives nice line numbers and
also lets callers pass in a more descriptive message about what
happened that caused the unexpected fault.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171338.55D13B64@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index bbe80a5c31c7..a0f0a732784b 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -954,10 +954,11 @@ void expected_pk_fault(int pkey)
 	last_si_pkey = -1;
 }
 
-void do_not_expect_pk_fault(void)
-{
-	pkey_assert(last_pkru_faults == pkru_faults);
-}
+#define do_not_expect_pk_fault(msg)	do {			\
+	if (last_pkru_faults != pkru_faults)			\
+		dprintf0("unexpected PK fault: %s\n", msg);	\
+	pkey_assert(last_pkru_faults == pkru_faults);		\
+} while (0)
 
 int test_fds[10] = { -1 };
 int nr_test_fds;
@@ -1243,7 +1244,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey)
 	pkey_assert(ret != -1);
 	/* Now access from the current task, and expect NO exception: */
 	peek_result = read_ptr(plain_ptr);
-	do_not_expect_pk_fault();
+	do_not_expect_pk_fault("read plain pointer after ptrace");
 
 	ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
 	pkey_assert(ret != -1);
@@ -1287,7 +1288,7 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
 	 */
 	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
 	lots_o_noops_around_write(&scratch);
-	do_not_expect_pk_fault();
+	do_not_expect_pk_fault("executing on PROT_EXEC memory");
 	ptr_contents = read_ptr(p1);
 	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
 	expected_pk_fault(pkey);

From 86b9eea230edf4c67d4d4a70fba9b74505867a25 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:40 -0700
Subject: [PATCH 172/393] x86/pkeys/selftests: Stop using assert()

If we use assert(), the program "crashes".  That can be scary to users,
so stop doing it.  Just exit with a >0 exit code instead.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171340.E63EF7DA@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index a0f0a732784b..8537a7cfe1cc 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -72,10 +72,9 @@ extern void abort_hooks(void);
 				test_nr, iteration_nr);	\
 		dprintf0("errno at assert: %d", errno);	\
 		abort_hooks();			\
-		assert(condition);		\
+		exit(__LINE__);			\
 	}					\
 } while (0)
-#define raw_assert(cond) assert(cond)
 
 void cat_into_file(char *str, char *file)
 {
@@ -87,12 +86,17 @@ void cat_into_file(char *str, char *file)
 	 * these need to be raw because they are called under
 	 * pkey_assert()
 	 */
-	raw_assert(fd >= 0);
+	if (fd < 0) {
+		fprintf(stderr, "error opening '%s'\n", str);
+		perror("error: ");
+		exit(__LINE__);
+	}
+
 	ret = write(fd, str, strlen(str));
 	if (ret != strlen(str)) {
 		perror("write to file failed");
 		fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
-		raw_assert(0);
+		exit(__LINE__);
 	}
 	close(fd);
 }

From a50093d60464dd51d1ae0c2267b0abe9e1de77f3 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:42 -0700
Subject: [PATCH 173/393] x86/pkeys/selftests: Remove dead debugging code, fix
 dprint_in_signal

There is some noisy debug code at the end of the signal handler.  It was
disabled by an early, unconditional "return".  However, that return also
hid a dprint_in_signal=0, which kept dprint_in_signal=1 and effectively
locked us into permanent dprint_in_signal=1 behavior.

Remove the return and the dead code, fixing dprint_in_signal.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171342.846B9B2E@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index 8537a7cfe1cc..5f5aedb80e7b 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -325,22 +325,6 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
 	dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
 	pkru_faults++;
 	dprintf1("<<<<==================================================\n");
-	return;
-	if (trapno == 14) {
-		fprintf(stderr,
-			"ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n",
-			trapno, ip);
-		fprintf(stderr, "si_addr %p\n", si->si_addr);
-		fprintf(stderr, "REG_ERR: %lx\n",
-				(unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
-		exit(1);
-	} else {
-		fprintf(stderr, "unexpected trap %d! at 0x%lx\n", trapno, ip);
-		fprintf(stderr, "si_addr %p\n", si->si_addr);
-		fprintf(stderr, "REG_ERR: %lx\n",
-				(unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
-		exit(2);
-	}
 	dprint_in_signal = 0;
 }
 

From caf9eb6b4c82fc6cbd03697052ff22d97b0c377b Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:44 -0700
Subject: [PATCH 174/393] x86/pkeys/selftests: Avoid printf-in-signal deadlocks

printf() and friends are unusable in signal handlers.  They deadlock.
The pkey selftest does not do any normal printing in signal handlers,
only extra debugging.  So, just print the format string so we get
*some* output when debugging.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171344.C53FD2F3@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/pkey-helpers.h | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h
index b3cb7670e026..254e5436bdd9 100644
--- a/tools/testing/selftests/x86/pkey-helpers.h
+++ b/tools/testing/selftests/x86/pkey-helpers.h
@@ -26,30 +26,26 @@ static inline void sigsafe_printf(const char *format, ...)
 {
 	va_list ap;
 
-	va_start(ap, format);
 	if (!dprint_in_signal) {
+		va_start(ap, format);
 		vprintf(format, ap);
+		va_end(ap);
 	} else {
 		int ret;
-		int len = vsnprintf(dprint_in_signal_buffer,
-				    DPRINT_IN_SIGNAL_BUF_SIZE,
-				    format, ap);
 		/*
-		 * len is amount that would have been printed,
-		 * but actual write is truncated at BUF_SIZE.
+		 * No printf() functions are signal-safe.
+		 * They deadlock easily. Write the format
+		 * string to get some output, even if
+		 * incomplete.
 		 */
-		if (len > DPRINT_IN_SIGNAL_BUF_SIZE)
-			len = DPRINT_IN_SIGNAL_BUF_SIZE;
-		ret = write(1, dprint_in_signal_buffer, len);
+		ret = write(1, format, strlen(format));
 		if (ret < 0)
-			abort();
+			exit(1);
 	}
-	va_end(ap);
 }
 #define dprintf_level(level, args...) do {	\
 	if (level <= DEBUG_LEVEL)		\
 		sigsafe_printf(args);		\
-	fflush(NULL);				\
 } while (0)
 #define dprintf0(args...) dprintf_level(0, args)
 #define dprintf1(args...) dprintf_level(1, args)

From 7e7fd67ca39335a49619729821efb7cbdd674eb0 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:46 -0700
Subject: [PATCH 175/393] x86/pkeys/selftests: Allow faults on unknown keys

The exec-only pkey is allocated inside the kernel and userspace
is not told what it is.  So, allow PK faults to occur that have
an unknown key.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171345.7FC7DA00@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index 5f5aedb80e7b..7d95acd2aec3 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -921,13 +921,21 @@ void *malloc_pkey(long size, int prot, u16 pkey)
 }
 
 int last_pkru_faults;
+#define UNKNOWN_PKEY -2
 void expected_pk_fault(int pkey)
 {
 	dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
 			__func__, last_pkru_faults, pkru_faults);
 	dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
 	pkey_assert(last_pkru_faults + 1 == pkru_faults);
-	pkey_assert(last_si_pkey == pkey);
+
+       /*
+	* For exec-only memory, we do not know the pkey in
+	* advance, so skip this check.
+	*/
+	if (pkey != UNKNOWN_PKEY)
+		pkey_assert(last_si_pkey == pkey);
+
 	/*
 	 * The signal handler shold have cleared out PKRU to let the
 	 * test program continue.  We now have to restore it.

From 3fcd2b2d928904cbf30b01e2c5e4f1dd2f9ab262 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:47 -0700
Subject: [PATCH 176/393] x86/pkeys/selftests: Factor out "instruction page"

We currently have an execute-only test, but it is for
the explicit mprotect_pkey() interface.  We will soon
add a test for the implicit mprotect(PROT_EXEC)
enterface.  We need this code in both tests.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171347.C64AB733@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index 7d95acd2aec3..083ebd51b44e 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -1253,12 +1253,9 @@ void test_ptrace_of_child(int *ptr, u16 pkey)
 	free(plain_ptr_unaligned);
 }
 
-void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
+void *get_pointer_to_instructions(void)
 {
 	void *p1;
-	int scratch;
-	int ptr_contents;
-	int ret;
 
 	p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
 	dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
@@ -1268,7 +1265,23 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
 	/* Point 'p1' at the *second* page of the function: */
 	p1 += PAGE_SIZE;
 
+	/*
+	 * Try to ensure we fault this in on next touch to ensure
+	 * we get an instruction fault as opposed to a data one
+	 */
 	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+
+	return p1;
+}
+
+void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
+{
+	void *p1;
+	int scratch;
+	int ptr_contents;
+	int ret;
+
+	p1 = get_pointer_to_instructions();
 	lots_o_noops_around_write(&scratch);
 	ptr_contents = read_ptr(p1);
 	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);

From 6af17cf89e99b64cf1f660bf848755442ab2f047 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:48 -0700
Subject: [PATCH 177/393] x86/pkeys/selftests: Add PROT_EXEC test

Under the covers, implement executable-only memory with
protection keys when userspace calls mprotect(PROT_EXEC).

But, we did not have a selftest for that.  Now we do.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171348.9EEE4BEF@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index 083ebd51b44e..8cb4fe58f381 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -1303,6 +1303,49 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
 	expected_pk_fault(pkey);
 }
 
+void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
+{
+	void *p1;
+	int scratch;
+	int ptr_contents;
+	int ret;
+
+	dprintf1("%s() start\n", __func__);
+
+	p1 = get_pointer_to_instructions();
+	lots_o_noops_around_write(&scratch);
+	ptr_contents = read_ptr(p1);
+	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+	/* Use a *normal* mprotect(), not mprotect_pkey(): */
+	ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
+	pkey_assert(!ret);
+
+	dprintf2("pkru: %x\n", rdpkru());
+
+	/* Make sure this is an *instruction* fault */
+	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+	lots_o_noops_around_write(&scratch);
+	do_not_expect_pk_fault("executing on PROT_EXEC memory");
+	ptr_contents = read_ptr(p1);
+	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+	expected_pk_fault(UNKNOWN_PKEY);
+
+	/*
+	 * Put the memory back to non-PROT_EXEC.  Should clear the
+	 * exec-only pkey off the VMA and allow it to be readable
+	 * again.  Go to PROT_NONE first to check for a kernel bug
+	 * that did not clear the pkey when doing PROT_NONE.
+	 */
+	ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
+	pkey_assert(!ret);
+
+	ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
+	pkey_assert(!ret);
+	ptr_contents = read_ptr(p1);
+	do_not_expect_pk_fault("plain read on recently PROT_EXEC area");
+}
+
 void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
 {
 	int size = PAGE_SIZE;
@@ -1327,6 +1370,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = {
 	test_kernel_gup_of_access_disabled_region,
 	test_kernel_gup_write_to_write_disabled_region,
 	test_executing_on_unreadable_memory,
+	test_implicit_mprotect_exec_only_memory,
 	test_ptrace_of_child,
 	test_pkey_syscalls_on_non_allocated_pkey,
 	test_pkey_syscalls_bad_args,

From f50b4878329ab61d8e05796f655adeb6f5fb57c6 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:50 -0700
Subject: [PATCH 178/393] x86/pkeys/selftests: Fix pkey exhaustion test
 off-by-one

In our "exhaust all pkeys" test, we make sure that there
is the expected number available.  Turns out that the
test did not cover the execute-only key, but discussed
it anyway.  It did *not* discuss the test-allocated
key.

Now that we have a test for the mprotect(PROT_EXEC) case,
this off-by-one issue showed itself.  Correct the off-by-
one and add the explanation for the case we missed.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171350.E1656B95@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index 8cb4fe58f381..55a4e349a45e 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -1163,12 +1163,15 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
 	pkey_assert(i < NR_PKEYS*2);
 
 	/*
-	 * There are 16 pkeys supported in hardware.  One is taken
-	 * up for the default (0) and another can be taken up by
-	 * an execute-only mapping.  Ensure that we can allocate
-	 * at least 14 (16-2).
+	 * There are 16 pkeys supported in hardware.  Three are
+	 * allocated by the time we get here:
+	 *   1. The default key (0)
+	 *   2. One possibly consumed by an execute-only mapping.
+	 *   3. One allocated by the test code and passed in via
+	 *      'pkey' to this function.
+	 * Ensure that we can allocate at least another 13 (16-3).
 	 */
-	pkey_assert(i >= NR_PKEYS-2);
+	pkey_assert(i >= NR_PKEYS-3);
 
 	for (i = 0; i < nr_allocated_pkeys; i++) {
 		err = sys_pkey_free(allocated_pkeys[i]);

From 0a0b152083cfc44ec1bb599b57b7aab41327f998 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:51 -0700
Subject: [PATCH 179/393] x86/pkeys: Override pkey when moving away from
 PROT_EXEC

I got a bug report that the following code (roughly) was
causing a SIGSEGV:

	mprotect(ptr, size, PROT_EXEC);
	mprotect(ptr, size, PROT_NONE);
	mprotect(ptr, size, PROT_READ);
	*ptr = 100;

The problem is hit when the mprotect(PROT_EXEC)
is implicitly assigned a protection key to the VMA, and made
that key ACCESS_DENY|WRITE_DENY.  The PROT_NONE mprotect()
failed to remove the protection key, and the PROT_NONE->
PROT_READ left the PTE usable, but the pkey still in place
and left the memory inaccessible.

To fix this, we ensure that we always "override" the pkee
at mprotect() if the VMA does not have execute-only
permissions, but the VMA has the execute-only pkey.

We had a check for PROT_READ/WRITE, but it did not work
for PROT_NONE.  This entirely removes the PROT_* checks,
which ensures that PROT_NONE now works.

Reported-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org
Fixes: 62b5f7d013f ("mm/core, x86/mm/pkeys: Add execute-only protection keys support")
Link: http://lkml.kernel.org/r/20180509171351.084C5A71@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pkeys.h | 12 +++++++++++-
 arch/x86/mm/pkeys.c          | 21 +++++++++++----------
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index a0ba1ffda0df..39cd292805a9 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_PKEYS_H
 #define _ASM_X86_PKEYS_H
 
+#define ARCH_DEFAULT_PKEY	0
+
 #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
 
 extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
@@ -15,7 +17,7 @@ extern int __execute_only_pkey(struct mm_struct *mm);
 static inline int execute_only_pkey(struct mm_struct *mm)
 {
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
-		return 0;
+		return ARCH_DEFAULT_PKEY;
 
 	return __execute_only_pkey(mm);
 }
@@ -56,6 +58,14 @@ bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
 		return false;
 	if (pkey >= arch_max_pkey())
 		return false;
+	/*
+	 * The exec-only pkey is set in the allocation map, but
+	 * is not available to any of the user interfaces like
+	 * mprotect_pkey().
+	 */
+	if (pkey == mm->context.execute_only_pkey)
+		return false;
+
 	return mm_pkey_allocation_map(mm) & (1U << pkey);
 }
 
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index d7bc0eea20a5..6e98e0a7c923 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -94,26 +94,27 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
 	 */
 	if (pkey != -1)
 		return pkey;
-	/*
-	 * Look for a protection-key-drive execute-only mapping
-	 * which is now being given permissions that are not
-	 * execute-only.  Move it back to the default pkey.
-	 */
-	if (vma_is_pkey_exec_only(vma) &&
-	    (prot & (PROT_READ|PROT_WRITE))) {
-		return 0;
-	}
+
 	/*
 	 * The mapping is execute-only.  Go try to get the
 	 * execute-only protection key.  If we fail to do that,
 	 * fall through as if we do not have execute-only
-	 * support.
+	 * support in this mm.
 	 */
 	if (prot == PROT_EXEC) {
 		pkey = execute_only_pkey(vma->vm_mm);
 		if (pkey > 0)
 			return pkey;
+	} else if (vma_is_pkey_exec_only(vma)) {
+		/*
+		 * Protections are *not* PROT_EXEC, but the mapping
+		 * is using the exec-only pkey.  This mapping was
+		 * PROT_EXEC and will no longer be.  Move back to
+		 * the default pkey.
+		 */
+		return ARCH_DEFAULT_PKEY;
 	}
+
 	/*
 	 * This is a vanilla, non-pkey mprotect (or we failed to
 	 * setup execute-only), inherit the pkey from the VMA we

From 3d64f4ed15c3c53dba4c514bf59c334464dee373 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:52 -0700
Subject: [PATCH 180/393] x86/pkeys/selftests: Fix pointer math

We dump out the entire area of the siginfo where the si_pkey_ptr is
supposed to be.  But, we do some math on the poitner, which is a u32.
We intended to do byte math, not u32 math on the pointer.

Cast it over to a u8* so it works.

Also, move this block of code to below th si_code check.  It doesn't
hurt anything, but the si_pkey field is gibberish for other signal
types.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171352.9BE09819@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index 55a4e349a45e..ee8176358d12 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -303,13 +303,6 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
 		dump_mem(pkru_ptr - 128, 256);
 	pkey_assert(*pkru_ptr);
 
-	si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
-	dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
-	dump_mem(si_pkey_ptr - 8, 24);
-	siginfo_pkey = *si_pkey_ptr;
-	pkey_assert(siginfo_pkey < NR_PKEYS);
-	last_si_pkey = siginfo_pkey;
-
 	if ((si->si_code == SEGV_MAPERR) ||
 	    (si->si_code == SEGV_ACCERR) ||
 	    (si->si_code == SEGV_BNDERR)) {
@@ -317,6 +310,13 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
 		exit(4);
 	}
 
+	si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
+	dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
+	dump_mem((u8 *)si_pkey_ptr - 8, 24);
+	siginfo_pkey = *si_pkey_ptr;
+	pkey_assert(siginfo_pkey < NR_PKEYS);
+	last_si_pkey = siginfo_pkey;
+
 	dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
 	/* need __rdpkru() version so we do not do shadow_pkru checking */
 	dprintf1("signal pkru from  pkru: %08x\n", __rdpkru());

From acb25d761d6f2f64e785ccefc71e54f244f1eda4 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:54 -0700
Subject: [PATCH 181/393] x86/pkeys/selftests: Save off 'prot' for allocations

This makes it possible to to tell what 'prot' a given allocation
is supposed to have.  That way, if we want to change just the
pkey, we know what 'prot' to pass to mprotect_pkey().

Also, keep a record of the most recent allocation so the tests
can easily find it.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171354.AA23E228@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index ee8176358d12..986ed38a2b25 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -677,10 +677,12 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
 struct pkey_malloc_record {
 	void *ptr;
 	long size;
+	int prot;
 };
 struct pkey_malloc_record *pkey_malloc_records;
+struct pkey_malloc_record *pkey_last_malloc_record;
 long nr_pkey_malloc_records;
-void record_pkey_malloc(void *ptr, long size)
+void record_pkey_malloc(void *ptr, long size, int prot)
 {
 	long i;
 	struct pkey_malloc_record *rec = NULL;
@@ -712,6 +714,8 @@ void record_pkey_malloc(void *ptr, long size)
 		(int)(rec - pkey_malloc_records), rec, ptr, size);
 	rec->ptr = ptr;
 	rec->size = size;
+	rec->prot = prot;
+	pkey_last_malloc_record = rec;
 	nr_pkey_malloc_records++;
 }
 
@@ -756,7 +760,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
 	pkey_assert(ptr != (void *)-1);
 	ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
 	pkey_assert(!ret);
-	record_pkey_malloc(ptr, size);
+	record_pkey_malloc(ptr, size, prot);
 	rdpkru();
 
 	dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
@@ -777,7 +781,7 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
 	size = ALIGN_UP(size, HPAGE_SIZE * 2);
 	ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
 	pkey_assert(ptr != (void *)-1);
-	record_pkey_malloc(ptr, size);
+	record_pkey_malloc(ptr, size, prot);
 	mprotect_pkey(ptr, size, prot, pkey);
 
 	dprintf1("unaligned ptr: %p\n", ptr);
@@ -850,7 +854,7 @@ void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
 	pkey_assert(ptr != (void *)-1);
 	mprotect_pkey(ptr, size, prot, pkey);
 
-	record_pkey_malloc(ptr, size);
+	record_pkey_malloc(ptr, size, prot);
 
 	dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
 	return ptr;
@@ -872,7 +876,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
 
 	mprotect_pkey(ptr, size, prot, pkey);
 
-	record_pkey_malloc(ptr, size);
+	record_pkey_malloc(ptr, size, prot);
 
 	dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
 	close(fd);

From 3488a600d90bcaf061b104dbcfbdc8d99b398312 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:56 -0700
Subject: [PATCH 182/393] x86/pkeys/selftests: Add a test for pkey 0

Protection key 0 is the default key for all memory and will
not normally come back from pkey_alloc().  But, you might
still want pass it to mprotect_pkey().

This check ensures that you can use pkey 0.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180509171356.9E40B254@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index 986ed38a2b25..460b4bdf4c1e 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -1184,6 +1184,35 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
 	}
 }
 
+/*
+ * pkey 0 is special.  It is allocated by default, so you do not
+ * have to call pkey_alloc() to use it first.  Make sure that it
+ * is usable.
+ */
+void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
+{
+	long size;
+	int prot;
+
+	assert(pkey_last_malloc_record);
+	size = pkey_last_malloc_record->size;
+	/*
+	 * This is a bit of a hack.  But mprotect() requires
+	 * huge-page-aligned sizes when operating on hugetlbfs.
+	 * So, make sure that we use something that's a multiple
+	 * of a huge page when we can.
+	 */
+	if (size >= HPAGE_SIZE)
+		size = HPAGE_SIZE;
+	prot = pkey_last_malloc_record->prot;
+
+	/* Use pkey 0 */
+	mprotect_pkey(ptr, size, prot, 0);
+
+	/* Make sure that we can set it back to the original pkey. */
+	mprotect_pkey(ptr, size, prot, pkey);
+}
+
 void test_ptrace_of_child(int *ptr, u16 pkey)
 {
 	__attribute__((__unused__)) int peek_result;
@@ -1378,6 +1407,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = {
 	test_kernel_gup_write_to_write_disabled_region,
 	test_executing_on_unreadable_memory,
 	test_implicit_mprotect_exec_only_memory,
+	test_mprotect_with_pkey_0,
 	test_ptrace_of_child,
 	test_pkey_syscalls_on_non_allocated_pkey,
 	test_pkey_syscalls_bad_args,

From 2fa9d1cfaf0e02f8abef0757002bff12dfcfa4e6 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 9 May 2018 10:13:58 -0700
Subject: [PATCH 183/393] x86/pkeys: Do not special case protection key 0

mm_pkey_is_allocated() treats pkey 0 as unallocated.  That is
inconsistent with the manpages, and also inconsistent with
mm->context.pkey_allocation_map.  Stop special casing it and only
disallow values that are actually bad (< 0).

The end-user visible effect of this is that you can now use
mprotect_pkey() to set pkey=0.

This is a bit nicer than what Ram proposed[1] because it is simpler
and removes special-casing for pkey 0.  On the other hand, it does
allow applications to pkey_free() pkey-0, but that's just a silly
thing to do, so we are not going to protect against it.

The scenario that could happen is similar to what happens if you free
any other pkey that is in use: it might get reallocated later and used
to protect some other data.  The most likely scenario is that pkey-0
comes back from pkey_alloc(), an access-disable or write-disable bit
is set in PKRU for it, and the next stack access will SIGSEGV.  It's
not horribly different from if you mprotect()'d your stack or heap to
be unreadable or unwritable, which is generally very foolish, but also
not explicitly prevented by the kernel.

1. http://lkml.kernel.org/r/1522112702-27853-1-git-send-email-linuxram@us.ibm.com

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>p
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellermen <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org
Fixes: 58ab9a088dda ("x86/pkeys: Check against max pkey to avoid overflows")
Link: http://lkml.kernel.org/r/20180509171358.47FD785E@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h | 2 +-
 arch/x86/include/asm/pkeys.h       | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 57e3785d0d26..cf9911b5a53c 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -193,7 +193,7 @@ static inline int init_new_context(struct task_struct *tsk,
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
-		/* pkey 0 is the default and always allocated */
+		/* pkey 0 is the default and allocated implicitly */
 		mm->context.pkey_allocation_map = 0x1;
 		/* -1 means unallocated or invalid */
 		mm->context.execute_only_pkey = -1;
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 39cd292805a9..851c04b7a092 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -51,10 +51,10 @@ bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
 {
 	/*
 	 * "Allocated" pkeys are those that have been returned
-	 * from pkey_alloc().  pkey 0 is special, and never
-	 * returned from pkey_alloc().
+	 * from pkey_alloc() or pkey 0 which is allocated
+	 * implicitly when the mm is created.
 	 */
-	if (pkey <= 0)
+	if (pkey < 0)
 		return false;
 	if (pkey >= arch_max_pkey())
 		return false;

From 20943f984967477c906522112d2b6b5a29f94684 Mon Sep 17 00:00:00 2001
From: Matthew Auld <matthew.auld@intel.com>
Date: Wed, 2 May 2018 20:50:21 +0100
Subject: [PATCH 184/393] drm/i915/userptr: reject zero user_size

Operating on a zero sized GEM userptr object will lead to explosions.

Fixes: 5cc9ed4b9a7a ("drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl")
Testcase: igt/gem_userptr_blits/input-checking
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180502195021.30900-1-matthew.auld@intel.com
(cherry picked from commit c11c7bfd213495784b22ef82a69b6489f8d0092f)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_userptr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index d596a8302ca3..854bd51b9478 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -778,6 +778,9 @@ i915_gem_userptr_ioctl(struct drm_device *dev,
 			    I915_USERPTR_UNSYNCHRONIZED))
 		return -EINVAL;
 
+	if (!args->user_size)
+		return -EINVAL;
+
 	if (offset_in_page(args->user_ptr | args->user_size))
 		return -EINVAL;
 

From cd078bf95df29d827daca0274a9a13821c11eedb Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 11 May 2018 13:11:45 +0100
Subject: [PATCH 185/393] drm/i915/execlists: Use rmb() to order CSB reads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We assume that the CSB is written using the normal ringbuffer
coherency protocols, as outlined in kernel/events/ring_buffer.c:

    *   (HW)                              (DRIVER)
    *
    *   if (LOAD ->data_tail) {            LOAD ->data_head
    *                      (A)             smp_rmb()       (C)
    *      STORE $data                     LOAD $data
    *      smp_wmb()       (B)             smp_mb()        (D)
    *      STORE ->data_head               STORE ->data_tail
    *   }

So we assume that the HW fulfils its ordering requirements (B), and so
we should use a complimentary rmb (C) to ensure that our read of its
WRITE pointer is completed before we start accessing the data.

The final mb (D) is implied by the uncached mmio we perform to inform
the HW of our READ pointer.

References: https://bugs.freedesktop.org/show_bug.cgi?id=105064
References: https://bugs.freedesktop.org/show_bug.cgi?id=105888
References: https://bugs.freedesktop.org/show_bug.cgi?id=106185
Fixes: 767a983ab255 ("drm/i915/execlists: Read the context-status HEAD from the HWSP")
References: 61bf9719fa17 ("drm/i915/cnl: Use mmio access to context status buffer")
Suggested-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Rafael Antognolli <rafael.antognolli@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Timo Aaltonen <tjaalton@ubuntu.com>
Tested-by: Timo Aaltonen <tjaalton@ubuntu.com>
Acked-by: Michel Thierry <michel.thierry@intel.com>
Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180511121147.31915-1-chris@chris-wilson.co.uk
(cherry picked from commit 77dfedb5be03779f9a5d83e323a1b36e32090105)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e3a5f673ff67..8704f7f8d072 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -884,6 +884,7 @@ static void execlists_submission_tasklet(unsigned long data)
 
 			head = execlists->csb_head;
 			tail = READ_ONCE(buf[write_idx]);
+			rmb(); /* Hopefully paired with a wmb() in HW */
 		}
 		GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
 			  engine->name,

From b61f7dcf4eb2653e870c9079b02d11a0834cfe39 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 27 Apr 2018 20:46:22 +0100
Subject: [PATCH 186/393] afs: Fix directory page locking

The afs directory loading code (primarily afs_read_dir()) locks all the
pages that hold a directory's content blob to defend against
getdents/getdents races and getdents/lookup races where the competitors
issue conflicting reads on the same data.  As the reads will complete
consecutively, they may retrieve different versions of the data and
one may overwrite the data that the other is busy parsing.

Fix this by not locking the pages at all, but rather by turning the
validation lock into an rwsem and getting an exclusive lock on it whilst
reading the data or validating the attributes and a shared lock whilst
parsing the data.  Sharing the attribute validation lock should be fine as
the data fetch will retrieve the attributes also.

The individual page locks aren't needed at all as the only place they're
being used is to serialise data loading.

Without this patch, the:

 	if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
		...
	}

part of afs_read_dir() may be skipped, leaving the pages unlocked when we
hit the success: clause - in which case we try to unlock the not-locked
pages, leading to the following oops:

  page:ffffe38b405b4300 count:3 mapcount:0 mapping:ffff98156c83a978 index:0x0
  flags: 0xfffe000001004(referenced|private)
  raw: 000fffe000001004 ffff98156c83a978 0000000000000000 00000003ffffffff
  raw: dead000000000100 dead000000000200 0000000000000001 ffff98156b27c000
  page dumped because: VM_BUG_ON_PAGE(!PageLocked(page))
  page->mem_cgroup:ffff98156b27c000
  ------------[ cut here ]------------
  kernel BUG at mm/filemap.c:1205!
  ...
  RIP: 0010:unlock_page+0x43/0x50
  ...
  Call Trace:
   afs_dir_iterate+0x789/0x8f0 [kafs]
   ? _cond_resched+0x15/0x30
   ? kmem_cache_alloc_trace+0x166/0x1d0
   ? afs_do_lookup+0x69/0x490 [kafs]
   ? afs_do_lookup+0x101/0x490 [kafs]
   ? key_default_cmp+0x20/0x20
   ? request_key+0x3c/0x80
   ? afs_lookup+0xf1/0x340 [kafs]
   ? __lookup_slow+0x97/0x150
   ? lookup_slow+0x35/0x50
   ? walk_component+0x1bf/0x490
   ? path_lookupat.isra.52+0x75/0x200
   ? filename_lookup.part.66+0xa0/0x170
   ? afs_end_vnode_operation+0x41/0x60 [kafs]
   ? __check_object_size+0x9c/0x171
   ? strncpy_from_user+0x4a/0x170
   ? vfs_statx+0x73/0xe0
   ? __do_sys_newlstat+0x39/0x70
   ? __x64_sys_getdents+0xc9/0x140
   ? __x64_sys_getdents+0x140/0x140
   ? do_syscall_64+0x5b/0x160
   ? entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: f3ddee8dc4e2 ("afs: Fix directory handling")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c      | 38 ++++++++++++++++++--------------------
 fs/afs/inode.c    |  6 +++---
 fs/afs/internal.h |  2 +-
 fs/afs/super.c    |  2 +-
 4 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5889f70d4d27..2853acd64482 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -180,6 +180,7 @@ static int afs_dir_open(struct inode *inode, struct file *file)
  * get reclaimed during the iteration.
  */
 static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
+	__acquires(&dvnode->validate_lock)
 {
 	struct afs_read *req;
 	loff_t i_size;
@@ -261,18 +262,21 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
 	/* If we're going to reload, we need to lock all the pages to prevent
 	 * races.
 	 */
+	ret = -ERESTARTSYS;
+	if (down_read_killable(&dvnode->validate_lock) < 0)
+		goto error;
+
+	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+		goto success;
+
+	up_read(&dvnode->validate_lock);
+	if (down_write_killable(&dvnode->validate_lock) < 0)
+		goto error;
+
 	if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
-		ret = -ERESTARTSYS;
-		for (i = 0; i < req->nr_pages; i++)
-			if (lock_page_killable(req->pages[i]) < 0)
-				goto error_unlock;
-
-		if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-			goto success;
-
 		ret = afs_fetch_data(dvnode, key, req);
 		if (ret < 0)
-			goto error_unlock_all;
+			goto error_unlock;
 
 		task_io_account_read(PAGE_SIZE * req->nr_pages);
 
@@ -284,33 +288,26 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
 		for (i = 0; i < req->nr_pages; i++)
 			if (!afs_dir_check_page(dvnode, req->pages[i],
 						req->actual_len))
-				goto error_unlock_all;
+				goto error_unlock;
 
 		// TODO: Trim excess pages
 
 		set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags);
 	}
 
+	downgrade_write(&dvnode->validate_lock);
 success:
-	i = req->nr_pages;
-	while (i > 0)
-		unlock_page(req->pages[--i]);
 	return req;
 
-error_unlock_all:
-	i = req->nr_pages;
 error_unlock:
-	while (i > 0)
-		unlock_page(req->pages[--i]);
+	up_write(&dvnode->validate_lock);
 error:
 	afs_put_read(req);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 
 content_has_grown:
-	i = req->nr_pages;
-	while (i > 0)
-		unlock_page(req->pages[--i]);
+	up_write(&dvnode->validate_lock);
 	afs_put_read(req);
 	goto retry;
 }
@@ -473,6 +470,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
 	}
 
 out:
+	up_read(&dvnode->validate_lock);
 	afs_put_read(req);
 	_leave(" = %d", ret);
 	return ret;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 06194cfe9724..e855c6e5cf28 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -415,7 +415,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	if (valid)
 		goto valid;
 
-	mutex_lock(&vnode->validate_lock);
+	down_write(&vnode->validate_lock);
 
 	/* if the promise has expired, we need to check the server again to get
 	 * a new promise - note that if the (parent) directory's metadata was
@@ -444,13 +444,13 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	 * different */
 	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
 		afs_zap_data(vnode);
-	mutex_unlock(&vnode->validate_lock);
+	up_write(&vnode->validate_lock);
 valid:
 	_leave(" = 0");
 	return 0;
 
 error_unlock:
-	mutex_unlock(&vnode->validate_lock);
+	up_write(&vnode->validate_lock);
 	_leave(" = %d", ret);
 	return ret;
 }
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index f8086ec95e24..468be1e0dffb 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -494,7 +494,7 @@ struct afs_vnode {
 #endif
 	struct afs_permits __rcu *permit_cache;	/* cache of permits so far obtained */
 	struct mutex		io_lock;	/* Lock for serialising I/O on this mutex */
-	struct mutex		validate_lock;	/* lock for validating this vnode */
+	struct rw_semaphore	validate_lock;	/* lock for validating this vnode */
 	spinlock_t		wb_lock;	/* lock for wb_keys */
 	spinlock_t		lock;		/* waitqueue/flags lock */
 	unsigned long		flags;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 65081ec3c36e..b02838cd9525 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -590,7 +590,7 @@ static void afs_i_init_once(void *_vnode)
 	memset(vnode, 0, sizeof(*vnode));
 	inode_init_once(&vnode->vfs_inode);
 	mutex_init(&vnode->io_lock);
-	mutex_init(&vnode->validate_lock);
+	init_rwsem(&vnode->validate_lock);
 	spin_lock_init(&vnode->wb_lock);
 	spin_lock_init(&vnode->lock);
 	INIT_LIST_HEAD(&vnode->wb_keys);

From 01fd79e6de74a447c5657913a335d9ce6508cdb1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 9 May 2018 22:03:18 +0100
Subject: [PATCH 187/393] afs: Fix address list parsing

The parsing of port specifiers in the address list obtained from the DNS
resolution upcall doesn't work as in4_pton() and in6_pton() will fail on
encountering an unexpected delimiter (in this case, the '+' marking the
port number).  However, in*_pton() can't be given multiple specifiers.

Fix this by finding the delimiter in advance and not relying on in*_pton()
to find the end of the address for us.

Fixes: 8b2a464ced77 ("afs: Add an address list concept")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/addr_list.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 3bedfed608a2..7587fb665ff1 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -121,7 +121,7 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
 	p = text;
 	do {
 		struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs];
-		char tdelim = delim;
+		const char *q, *stop;
 
 		if (*p == delim) {
 			p++;
@@ -130,28 +130,33 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
 
 		if (*p == '[') {
 			p++;
-			tdelim = ']';
+			q = memchr(p, ']', end - p);
+		} else {
+			for (q = p; q < end; q++)
+				if (*q == '+' || *q == delim)
+					break;
 		}
 
-		if (in4_pton(p, end - p,
+		if (in4_pton(p, q - p,
 			     (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
-			     tdelim, &p)) {
+			     -1, &stop)) {
 			srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
 			srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
 			srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
-		} else if (in6_pton(p, end - p,
+		} else if (in6_pton(p, q - p,
 				    srx->transport.sin6.sin6_addr.s6_addr,
-				    tdelim, &p)) {
+				    -1, &stop)) {
 			/* Nothing to do */
 		} else {
 			goto bad_address;
 		}
 
-		if (tdelim == ']') {
-			if (p == end || *p != ']')
-				goto bad_address;
+		if (stop != q)
+			goto bad_address;
+
+		p = q;
+		if (q < end && *q == ']')
 			p++;
-		}
 
 		if (p < end) {
 			if (*p == '+') {

From f2686b09269ec1a6f23028b5675d87c3b4579a4c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 14:12:50 +0100
Subject: [PATCH 188/393] afs: Fix giving up callbacks on server destruction

When a server record is destroyed, we want to send a message to the server
telling it that we're giving up all the callbacks it has promised us.

Apply two fixes to this:

 (1) Only send the FS.GiveUpAllCallBacks message if we actually got a
     callback from that server.  We assume this to be the case if we
     performed at least one successful FS operation on that server.

 (2) Send it to the address last used for that server rather than always
     picking the first address in the list (which might be unreachable).

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/internal.h | 1 +
 fs/afs/rxrpc.c    | 6 +++++-
 fs/afs/server.c   | 8 +++++---
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 468be1e0dffb..8de04b29bec1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -396,6 +396,7 @@ struct afs_server {
 #define AFS_SERVER_FL_PROBED	5		/* The fileserver has been probed */
 #define AFS_SERVER_FL_PROBING	6		/* Fileserver is being probed */
 #define AFS_SERVER_FL_NO_IBULK	7		/* Fileserver doesn't support FS.InlineBulkStatus */
+#define AFS_SERVER_FL_MAY_HAVE_CB 8		/* May have callbacks on this fileserver */
 	atomic_t		usage;
 	u32			addr_version;	/* Address list version */
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 5c6263972ec9..1f6235a6e9ae 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -482,8 +482,12 @@ static void afs_deliver_to_call(struct afs_call *call)
 		state = READ_ONCE(call->state);
 		switch (ret) {
 		case 0:
-			if (state == AFS_CALL_CL_PROC_REPLY)
+			if (state == AFS_CALL_CL_PROC_REPLY) {
+				if (call->cbi)
+					set_bit(AFS_SERVER_FL_MAY_HAVE_CB,
+						&call->cbi->server->flags);
 				goto call_complete;
+			}
 			ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY);
 			goto done;
 		case -EINPROGRESS:
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 629c74986cff..2c5cff60e34d 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -395,14 +395,16 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
 	struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
 	struct afs_addr_cursor ac = {
 		.alist	= alist,
-		.addr	= &alist->addrs[0],
 		.start	= alist->index,
-		.index	= alist->index,
+		.index	= 0,
+		.addr	= &alist->addrs[alist->index],
 		.error	= 0,
 	};
 	_enter("%p", server);
 
-	afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+	if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
+		afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+
 	call_rcu(&server->rcu, afs_server_rcu);
 	afs_dec_servers_outstanding(net);
 }

From d4a96bec7a7362834ef5c31d7b2cc9bf36eb0570 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 08:43:04 +0100
Subject: [PATCH 189/393] afs: Fix refcounting in callback registration

The refcounting on afs_cb_interest struct objects in
afs_register_server_cb_interest() is wrong as it uses the server list
entry's call back interest pointer without regard for the fact that it
might be replaced at any time and the object thrown away.

Fix this by:

 (1) Put a lock on the afs_server_list struct that can be used to
     mediate access to the callback interest pointers in the servers array.

 (2) Keep a ref on the callback interest that we get from the entry.

 (3) Dropping the old reference held by vnode->cb_interest if we replace
     the pointer.

Fixes: c435ee34551e ("afs: Overhaul the callback handling")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/callback.c    | 64 ++++++++++++++++++++++++++++++--------------
 fs/afs/internal.h    |  7 +++--
 fs/afs/rotate.c      |  4 +--
 fs/afs/server_list.c |  7 +++--
 4 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index abd9a84f4e88..09332945d322 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -23,36 +23,55 @@
 /*
  * Set up an interest-in-callbacks record for a volume on a server and
  * register it with the server.
- * - Called with volume->server_sem held.
+ * - Called with vnode->io_lock held.
  */
 int afs_register_server_cb_interest(struct afs_vnode *vnode,
-				    struct afs_server_entry *entry)
+				    struct afs_server_list *slist,
+				    unsigned int index)
 {
-	struct afs_cb_interest *cbi = entry->cb_interest, *vcbi, *new, *x;
+	struct afs_server_entry *entry = &slist->servers[index];
+	struct afs_cb_interest *cbi, *vcbi, *new, *old;
 	struct afs_server *server = entry->server;
 
 again:
+	if (vnode->cb_interest &&
+	    likely(vnode->cb_interest == entry->cb_interest))
+		return 0;
+
+	read_lock(&slist->lock);
+	cbi = afs_get_cb_interest(entry->cb_interest);
+	read_unlock(&slist->lock);
+
 	vcbi = vnode->cb_interest;
 	if (vcbi) {
-		if (vcbi == cbi)
-			return 0;
-
-		if (cbi && vcbi->server == cbi->server) {
-			write_seqlock(&vnode->cb_lock);
-			vnode->cb_interest = afs_get_cb_interest(cbi);
-			write_sequnlock(&vnode->cb_lock);
+		if (vcbi == cbi) {
 			afs_put_cb_interest(afs_v2net(vnode), cbi);
 			return 0;
 		}
 
+		/* Use a new interest in the server list for the same server
+		 * rather than an old one that's still attached to a vnode.
+		 */
+		if (cbi && vcbi->server == cbi->server) {
+			write_seqlock(&vnode->cb_lock);
+			old = vnode->cb_interest;
+			vnode->cb_interest = cbi;
+			write_sequnlock(&vnode->cb_lock);
+			afs_put_cb_interest(afs_v2net(vnode), old);
+			return 0;
+		}
+
+		/* Re-use the one attached to the vnode. */
 		if (!cbi && vcbi->server == server) {
-			afs_get_cb_interest(vcbi);
-			x = cmpxchg(&entry->cb_interest, cbi, vcbi);
-			if (x != cbi) {
-				cbi = x;
-				afs_put_cb_interest(afs_v2net(vnode), vcbi);
+			write_lock(&slist->lock);
+			if (entry->cb_interest) {
+				write_unlock(&slist->lock);
+				afs_put_cb_interest(afs_v2net(vnode), cbi);
 				goto again;
 			}
+
+			entry->cb_interest = cbi;
+			write_unlock(&slist->lock);
 			return 0;
 		}
 	}
@@ -72,13 +91,16 @@ int afs_register_server_cb_interest(struct afs_vnode *vnode,
 		list_add_tail(&new->cb_link, &server->cb_interests);
 		write_unlock(&server->cb_break_lock);
 
-		x = cmpxchg(&entry->cb_interest, cbi, new);
-		if (x == cbi) {
+		write_lock(&slist->lock);
+		if (!entry->cb_interest) {
+			entry->cb_interest = afs_get_cb_interest(new);
 			cbi = new;
+			new = NULL;
 		} else {
-			cbi = x;
-			afs_put_cb_interest(afs_v2net(vnode), new);
+			cbi = afs_get_cb_interest(entry->cb_interest);
 		}
+		write_unlock(&slist->lock);
+		afs_put_cb_interest(afs_v2net(vnode), new);
 	}
 
 	ASSERT(cbi);
@@ -88,11 +110,13 @@ int afs_register_server_cb_interest(struct afs_vnode *vnode,
 	 */
 	write_seqlock(&vnode->cb_lock);
 
-	vnode->cb_interest = afs_get_cb_interest(cbi);
+	old = vnode->cb_interest;
+	vnode->cb_interest = cbi;
 	vnode->cb_s_break = cbi->server->cb_s_break;
 	clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 
 	write_sequnlock(&vnode->cb_lock);
+	afs_put_cb_interest(afs_v2net(vnode), old);
 	return 0;
 }
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8de04b29bec1..e75e57e13320 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -434,6 +434,7 @@ struct afs_server_list {
 	unsigned short		index;		/* Server currently in use */
 	unsigned short		vnovol_mask;	/* Servers to be skipped due to VNOVOL */
 	unsigned int		seq;		/* Set to ->servers_seq when installed */
+	rwlock_t		lock;
 	struct afs_server_entry	servers[];
 };
 
@@ -649,13 +650,15 @@ extern void afs_init_callback_state(struct afs_server *);
 extern void afs_break_callback(struct afs_vnode *);
 extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*);
 
-extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *);
+extern int afs_register_server_cb_interest(struct afs_vnode *,
+					   struct afs_server_list *, unsigned int);
 extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *);
 extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *);
 
 static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi)
 {
-	refcount_inc(&cbi->usage);
+	if (cbi)
+		refcount_inc(&cbi->usage);
 	return cbi;
 }
 
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index ac0feac9d746..4a26d51b2968 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -350,8 +350,8 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 	 * break request before we've finished decoding the reply and
 	 * installing the vnode.
 	 */
-	fc->ac.error = afs_register_server_cb_interest(
-		vnode, &fc->server_list->servers[fc->index]);
+	fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list,
+						       fc->index);
 	if (fc->ac.error < 0)
 		goto failed;
 
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index 0f8dc4c8f07c..8a5760aa5832 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -49,6 +49,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
 		goto error;
 
 	refcount_set(&slist->usage, 1);
+	rwlock_init(&slist->lock);
 
 	/* Make sure a records exists for each server in the list. */
 	for (i = 0; i < vldb->nr_servers; i++) {
@@ -64,9 +65,11 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
 			goto error_2;
 		}
 
-		/* Insertion-sort by server pointer */
+		/* Insertion-sort by UUID */
 		for (j = 0; j < slist->nr_servers; j++)
-			if (slist->servers[j].server >= server)
+			if (memcmp(&slist->servers[j].server->uuid,
+				   &server->uuid,
+				   sizeof(server->uuid)) >= 0)
 				break;
 		if (j < slist->nr_servers) {
 			if (slist->servers[j].server == server) {

From ec5a3b4b507efca903d848518dcf2ebf7b04b466 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 14:22:38 +0100
Subject: [PATCH 190/393] afs: Fix server rotation's handling of fileserver
 probe failure

The server rotation algorithm just gives up if it fails to probe a
fileserver.  Fix this by rotating to the next fileserver instead.

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/rotate.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 4a26d51b2968..84584dcced72 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -369,8 +369,16 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 	if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) {
 		fc->ac.alist = afs_get_addrlist(alist);
 
-		if (!afs_probe_fileserver(fc))
-			goto failed;
+		if (!afs_probe_fileserver(fc)) {
+			switch (fc->ac.error) {
+			case -ENOMEM:
+			case -ERESTARTSYS:
+			case -EINTR:
+				goto failed;
+			default:
+				goto next_server;
+			}
+		}
 	}
 
 	if (!fc->ac.alist)

From ea739a287f4f16d6250bea779a1026ead79695f2 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Thu, 10 May 2018 19:20:54 +0100
Subject: [PATCH 191/393] mtd: Fix comparison in map_word_andequal()

Commit 9e343e87d2c4 ("mtd: cfi: convert inline functions to macros")
changed map_word_andequal() into a macro, but also changed the right
hand side of the comparison from val3 to val2.  Change it back to use
val3 on the right hand side.

Thankfully this did not cause a regression because all callers
currently pass the same argument for val2 and val3.

Fixes: 9e343e87d2c4 ("mtd: cfi: convert inline functions to macros")
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 include/linux/mtd/map.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index b5b43f94f311..01b990e4b228 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -312,7 +312,7 @@ void map_destroy(struct mtd_info *mtd);
 ({									\
 	int i, ret = 1;							\
 	for (i = 0; i < map_words(map); i++) {				\
-		if (((val1).x[i] & (val2).x[i]) != (val2).x[i]) {	\
+		if (((val1).x[i] & (val2).x[i]) != (val3).x[i]) {	\
 			ret = 0;					\
 			break;						\
 		}							\

From 90d617633368ab97a2c7571c6e66dad54f39228d Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Wed, 9 May 2018 09:13:58 +0200
Subject: [PATCH 192/393] mtd: rawnand: marvell: Fix read logic for layouts
 with ->nchunks > 2

The code is doing monolithic reads for all chunks except the last one
which is wrong since a monolithic read will issue the
READ0+ADDRS+READ_START sequence. It not only takes longer because it
forces the NAND chip to reload the page content into its internal
cache, but by doing that we also reset the column pointer to 0, which
means we'll always read the first chunk instead of moving to the next
one.

Rework the code to do a monolithic read only for the first chunk,
then switch to naked reads for all intermediate chunks and finally
issue a last naked read for the last chunk.

Fixes: 02f26ecf8c77 mtd: nand: add reworked Marvell NAND controller driver
Cc: stable@vger.kernel.org
Reported-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Tested-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/marvell_nand.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/mtd/nand/raw/marvell_nand.c b/drivers/mtd/nand/raw/marvell_nand.c
index db5ec4e8bde9..ebb1d141b900 100644
--- a/drivers/mtd/nand/raw/marvell_nand.c
+++ b/drivers/mtd/nand/raw/marvell_nand.c
@@ -1194,11 +1194,13 @@ static void marvell_nfc_hw_ecc_bch_read_chunk(struct nand_chip *chip, int chunk,
 				  NDCB0_CMD2(NAND_CMD_READSTART);
 
 	/*
-	 * Trigger the naked read operation only on the last chunk.
-	 * Otherwise, use monolithic read.
+	 * Trigger the monolithic read on the first chunk, then naked read on
+	 * intermediate chunks and finally a last naked read on the last chunk.
 	 */
-	if (lt->nchunks == 1 || (chunk < lt->nchunks - 1))
+	if (chunk == 0)
 		nfc_op.ndcb[0] |= NDCB0_CMD_XTYPE(XTYPE_MONOLITHIC_RW);
+	else if (chunk < lt->nchunks - 1)
+		nfc_op.ndcb[0] |= NDCB0_CMD_XTYPE(XTYPE_NAKED_RW);
 	else
 		nfc_op.ndcb[0] |= NDCB0_CMD_XTYPE(XTYPE_LAST_NAKED_RW);
 

From 684b0f68cf1c1cf4a40834818653491c5cad4435 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 21:51:47 +0100
Subject: [PATCH 193/393] afs: Fix AFSFetchStatus decoder to provide OpenAFS
 compatibility

The OpenAFS server's RXAFS_InlineBulkStatus implementation has a bug
whereby if an error occurs on one of the vnodes being queried, then the
errorCode field is set correctly in the corresponding status, but the
interfaceVersion field is left unset.

Fix kAFS to deal with this by evaluating the AFSFetchStatus blob against
the following cases when called from FS.InlineBulkStatus delivery:

 (1) If InterfaceVersion == 0 then:

     (a) If errorCode != 0 then it indicates the abort code for the
         corresponding vnode.

     (b) If errorCode == 0 then the status record is invalid.

 (2) If InterfaceVersion == 1 then:

     (a) If errorCode != 0 then it indicates the abort code for the
         corresponding vnode.

     (b) If errorCode == 0 then the status record is valid and can be
     	 parsed.

 (3) If InterfaceVersion is anything else then the status record is
     invalid.

Fixes: dd9fbcb8e103 ("afs: Rearrange status mapping")
Reported-by: Jeffrey Altman <jaltman@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/fsclient.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index efacdb7c1dee..d16f26c6cdbe 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -134,6 +134,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
 				     struct afs_read *read_req)
 {
 	const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp;
+	bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus);
 	u64 data_version, size;
 	u32 type, abort_code;
 	u8 flags = 0;
@@ -142,13 +143,32 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
 	if (vnode)
 		write_seqlock(&vnode->cb_lock);
 
+	abort_code = ntohl(xdr->abort_code);
+
 	if (xdr->if_version != htonl(AFS_FSTATUS_VERSION)) {
+		if (xdr->if_version == htonl(0) &&
+		    abort_code != 0 &&
+		    inline_error) {
+			/* The OpenAFS fileserver has a bug in FS.InlineBulkStatus
+			 * whereby it doesn't set the interface version in the error
+			 * case.
+			 */
+			status->abort_code = abort_code;
+			ret = 0;
+			goto out;
+		}
+
 		pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version));
 		goto bad;
 	}
 
+	if (abort_code != 0 && inline_error) {
+		status->abort_code = abort_code;
+		ret = 0;
+		goto out;
+	}
+
 	type = ntohl(xdr->type);
-	abort_code = ntohl(xdr->abort_code);
 	switch (type) {
 	case AFS_FTYPE_FILE:
 	case AFS_FTYPE_DIR:
@@ -165,13 +185,6 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
 		}
 		status->type = type;
 		break;
-	case AFS_FTYPE_INVALID:
-		if (abort_code != 0) {
-			status->abort_code = abort_code;
-			ret = 0;
-			goto out;
-		}
-		/* Fall through */
 	default:
 		goto bad;
 	}

From 3d9fa91161387ee629e7a07c47934d119910c8ae Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 May 2018 22:55:59 +0100
Subject: [PATCH 194/393] afs: Fix VNOVOL handling in address rotation

If a volume location record lists multiple file servers for a volume, then
it's possible that due to a misconfiguration or a changing configuration
that one of the file servers doesn't know about it yet and will abort
VNOVOL.  Currently, the rotation algorithm will stop with EREMOTEIO.

Fix this by moving on to try the next server if VNOVOL is returned.  Once
all the servers have been tried and the record rechecked, the algorithm
will stop with EREMOTEIO or ENOMEDIUM.

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/rotate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 84584dcced72..e065bc0768e6 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -179,7 +179,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 			 */
 			if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
 				fc->ac.error = -EREMOTEIO;
-				goto failed;
+				goto next_server;
 			}
 
 			write_lock(&vnode->volume->servers_lock);
@@ -201,7 +201,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 			 */
 			if (vnode->volume->servers == fc->server_list) {
 				fc->ac.error = -EREMOTEIO;
-				goto failed;
+				goto next_server;
 			}
 
 			/* Try again */

From 001ab5a67ee5d191c64aebf4b4ef8c7a0dcfd2bc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 May 2018 23:21:35 +0100
Subject: [PATCH 195/393] afs: Fix the handling of CB.InitCallBackState3 to
 find the server by UUID

Fix the handling of the CB.InitCallBackState3 service call to find the
record of a server that we're using by looking it up by the UUID passed as
the parameter rather than by its address (of which it might have many, and
which may change).

Fixes: c35eccb1f614 ("[AFS]: Implement the CB.InitCallBackState3 operation.")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/cmservice.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 357de908df3a..bcd13397bd59 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -341,7 +341,6 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
  */
 static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 {
-	struct sockaddr_rxrpc srx;
 	struct afs_server *server;
 	struct afs_uuid *r;
 	unsigned loop;
@@ -398,8 +397,9 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
-	server = afs_find_server(call->net, &srx);
+	rcu_read_lock();
+	server = afs_find_server_by_uuid(call->net, call->request);
+	rcu_read_unlock();
 	if (!server)
 		return -ENOTCONN;
 	call->cm_server = server;

From 3709a399c15e4273d9a94b123374f12e5664318c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 May 2018 22:59:42 +0100
Subject: [PATCH 196/393] afs: Add a tracepoint to record callbacks from
 unlisted servers

Add a tracepoint to record callbacks from servers for which we don't have a
record.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/cmservice.c         | 12 ++++++++---
 include/trace/events/afs.h | 42 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index bcd13397bd59..9f13375f49b8 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -287,8 +287,10 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 	 * vnodes to operate upon */
 	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
 	server = afs_find_server(call->net, &srx);
-	if (!server)
+	if (!server) {
+		trace_afs_cm_no_server(call, &srx);
 		return -ENOTCONN;
+	}
 	call->cm_server = server;
 
 	return afs_queue_call_work(call);
@@ -329,8 +331,10 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
 	server = afs_find_server(call->net, &srx);
-	if (!server)
+	if (!server) {
+		trace_afs_cm_no_server(call, &srx);
 		return -ENOTCONN;
+	}
 	call->cm_server = server;
 
 	return afs_queue_call_work(call);
@@ -400,8 +404,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 	rcu_read_lock();
 	server = afs_find_server_by_uuid(call->net, call->request);
 	rcu_read_unlock();
-	if (!server)
+	if (!server) {
+		trace_afs_cm_no_server_u(call, call->request);
 		return -ENOTCONN;
+	}
 	call->cm_server = server;
 
 	return afs_queue_call_work(call);
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index f0820554caa9..d0a341bc4540 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -575,6 +575,48 @@ TRACE_EVENT(afs_protocol_error,
 		      __entry->call, __entry->error, __entry->where)
 	    );
 
+TRACE_EVENT(afs_cm_no_server,
+	    TP_PROTO(struct afs_call *call, struct sockaddr_rxrpc *srx),
+
+	    TP_ARGS(call, srx),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,			call	)
+		    __field(unsigned int,			op_id	)
+		    __field_struct(struct sockaddr_rxrpc,	srx	)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->call = call->debug_id;
+		    __entry->op_id = call->operation_ID;
+		    memcpy(&__entry->srx, srx, sizeof(__entry->srx));
+			   ),
+
+	    TP_printk("c=%08x op=%u %pISpc",
+		      __entry->call, __entry->op_id, &__entry->srx.transport)
+	    );
+
+TRACE_EVENT(afs_cm_no_server_u,
+	    TP_PROTO(struct afs_call *call, const uuid_t *uuid),
+
+	    TP_ARGS(call, uuid),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,			call	)
+		    __field(unsigned int,			op_id	)
+		    __field_struct(uuid_t,			uuid	)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->call = call->debug_id;
+		    __entry->op_id = call->operation_ID;
+		    memcpy(&__entry->uuid, uuid, sizeof(__entry->uuid));
+			   ),
+
+	    TP_printk("c=%08x op=%u %pU",
+		      __entry->call, __entry->op_id, &__entry->uuid)
+	    );
+
 #endif /* _TRACE_AFS_H */
 
 /* This part must be outside protection */

From a86b06d1ccd218a6a50d6a3a88fbd2abcd0eaa94 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 May 2018 23:45:40 +0100
Subject: [PATCH 197/393] afs: Fix the handling of an unfound server in CM
 operations

If the client cache manager operations that need the server record
(CB.Callback, CB.InitCallBackState, and CB.InitCallBackState3) can't find
the server record, they abort the call from the file server with
RX_CALL_DEAD when they should return okay.

Fixes: c35eccb1f614 ("[AFS]: Implement the CB.InitCallBackState3 operation.")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/cmservice.c | 34 ++++++++++++----------------------
 fs/afs/rxrpc.c     |  5 -----
 2 files changed, 12 insertions(+), 27 deletions(-)

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 9f13375f49b8..b44491410af3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -143,8 +143,8 @@ static void afs_cm_destructor(struct afs_call *call)
 	 * received.  The step number here must match the final number in
 	 * afs_deliver_cb_callback().
 	 */
-	if (call->unmarshall == 5) {
-		ASSERT(call->cm_server && call->count && call->request);
+	if (call->cm_server && call->unmarshall == 5) {
+		ASSERT(call->count && call->request);
 		afs_break_callbacks(call->cm_server, call->count, call->request);
 	}
 
@@ -168,7 +168,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
 	 * yet */
 	afs_send_empty_reply(call);
 
-	afs_break_callbacks(call->cm_server, call->count, call->request);
+	if (call->cm_server)
+		afs_break_callbacks(call->cm_server, call->count, call->request);
 	afs_put_call(call);
 	_leave("");
 }
@@ -180,7 +181,6 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 {
 	struct afs_callback_break *cb;
 	struct sockaddr_rxrpc srx;
-	struct afs_server *server;
 	__be32 *bp;
 	int ret, loop;
 
@@ -286,12 +286,9 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
 	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
-	server = afs_find_server(call->net, &srx);
-	if (!server) {
+	call->cm_server = afs_find_server(call->net, &srx);
+	if (!call->cm_server)
 		trace_afs_cm_no_server(call, &srx);
-		return -ENOTCONN;
-	}
-	call->cm_server = server;
 
 	return afs_queue_call_work(call);
 }
@@ -305,7 +302,8 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 
 	_enter("{%p}", call->cm_server);
 
-	afs_init_callback_state(call->cm_server);
+	if (call->cm_server)
+		afs_init_callback_state(call->cm_server);
 	afs_send_empty_reply(call);
 	afs_put_call(call);
 	_leave("");
@@ -317,7 +315,6 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 {
 	struct sockaddr_rxrpc srx;
-	struct afs_server *server;
 	int ret;
 
 	_enter("");
@@ -330,12 +327,9 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	server = afs_find_server(call->net, &srx);
-	if (!server) {
+	call->cm_server = afs_find_server(call->net, &srx);
+	if (!call->cm_server)
 		trace_afs_cm_no_server(call, &srx);
-		return -ENOTCONN;
-	}
-	call->cm_server = server;
 
 	return afs_queue_call_work(call);
 }
@@ -345,7 +339,6 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
  */
 static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 {
-	struct afs_server *server;
 	struct afs_uuid *r;
 	unsigned loop;
 	__be32 *b;
@@ -402,13 +395,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
 	rcu_read_lock();
-	server = afs_find_server_by_uuid(call->net, call->request);
+	call->cm_server = afs_find_server_by_uuid(call->net, call->request);
 	rcu_read_unlock();
-	if (!server) {
+	if (!call->cm_server)
 		trace_afs_cm_no_server_u(call, call->request);
-		return -ENOTCONN;
-	}
-	call->cm_server = server;
 
 	return afs_queue_call_work(call);
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 1f6235a6e9ae..d0eee5d32c94 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -497,11 +497,6 @@ static void afs_deliver_to_call(struct afs_call *call)
 		case -ECONNABORTED:
 			ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
 			goto done;
-		case -ENOTCONN:
-			abort_code = RX_CALL_DEAD;
-			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
-						abort_code, ret, "KNC");
-			goto local_abort;
 		case -ENOTSUPP:
 			abort_code = RXGEN_OPCODE;
 			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,

From f9c1bba3d392843f046d2ee27b4dfcec989d8a4b Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.dionne@auristor.com>
Date: Fri, 11 May 2018 21:35:06 -0300
Subject: [PATCH 198/393] afs: Fix afs_find_server search loop

The code that looks up servers by addresses makes the assumption
that the list of addresses for a server is sorted.  It exits the
loop if it finds that the target address is larger than the
current candidate.  As the list is not currently sorted, this
can lead to a failure to find a matching server, which can cause
callbacks from that server to be ignored.

Remove the early exit case so that the complete list is searched.

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Signed-off-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/server.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/fs/afs/server.c b/fs/afs/server.c
index 2c5cff60e34d..3af4625e2f8c 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -67,12 +67,6 @@ struct afs_server *afs_find_server(struct afs_net *net,
 							      sizeof(struct in6_addr));
 					if (diff == 0)
 						goto found;
-					if (diff < 0) {
-						// TODO: Sort the list
-						//if (i == alist->nr_ipv4)
-						//	goto not_found;
-						break;
-					}
 				}
 			}
 		} else {
@@ -87,17 +81,10 @@ struct afs_server *afs_find_server(struct afs_net *net,
 							(u32 __force)b->sin6_addr.s6_addr32[3]);
 					if (diff == 0)
 						goto found;
-					if (diff < 0) {
-						// TODO: Sort the list
-						//if (i == 0)
-						//	goto not_found;
-						break;
-					}
 				}
 			}
 		}
 
-	//not_found:
 		server = NULL;
 	found:
 		if (server && !atomic_inc_not_zero(&server->usage))

From 68251f0a6818f3be19b1471f36c956ca97c1427d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 12 May 2018 22:31:33 +0100
Subject: [PATCH 199/393] afs: Fix whole-volume callback handling

It's possible for an AFS file server to issue a whole-volume notification
that callbacks on all the vnodes in the file have been broken.  This is
done for R/O and backup volumes (which don't have per-file callbacks) and
for things like a volume being taken offline.

Fix callback handling to detect whole-volume notifications, to track it
across operations and to check it during inode validation.

Fixes: c435ee34551e ("afs: Overhaul the callback handling")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/callback.c | 28 +++++++++++++++++++++-------
 fs/afs/dir.c      | 18 +++++++++---------
 fs/afs/file.c     |  2 +-
 fs/afs/flock.c    |  6 +++---
 fs/afs/fsclient.c |  2 +-
 fs/afs/inode.c    | 13 ++++++++-----
 fs/afs/internal.h | 15 +++++++++++++++
 fs/afs/security.c |  7 +++----
 fs/afs/super.c    |  2 +-
 fs/afs/write.c    |  2 +-
 10 files changed, 63 insertions(+), 32 deletions(-)

diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 09332945d322..571437dcb252 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -113,6 +113,7 @@ int afs_register_server_cb_interest(struct afs_vnode *vnode,
 	old = vnode->cb_interest;
 	vnode->cb_interest = cbi;
 	vnode->cb_s_break = cbi->server->cb_s_break;
+	vnode->cb_v_break = vnode->volume->cb_v_break;
 	clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 
 	write_sequnlock(&vnode->cb_lock);
@@ -195,13 +196,24 @@ static void afs_break_one_callback(struct afs_server *server,
 		if (cbi->vid != fid->vid)
 			continue;
 
-		data.volume = NULL;
-		data.fid = *fid;
-		inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data);
-		if (inode) {
-			vnode = AFS_FS_I(inode);
-			afs_break_callback(vnode);
-			iput(inode);
+		if (fid->vnode == 0 && fid->unique == 0) {
+			/* The callback break applies to an entire volume. */
+			struct afs_super_info *as = AFS_FS_S(cbi->sb);
+			struct afs_volume *volume = as->volume;
+
+			write_lock(&volume->cb_break_lock);
+			volume->cb_v_break++;
+			write_unlock(&volume->cb_break_lock);
+		} else {
+			data.volume = NULL;
+			data.fid = *fid;
+			inode = ilookup5_nowait(cbi->sb, fid->vnode,
+						afs_iget5_test, &data);
+			if (inode) {
+				vnode = AFS_FS_I(inode);
+				afs_break_callback(vnode);
+				iput(inode);
+			}
 		}
 	}
 
@@ -219,6 +231,8 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
 	ASSERT(server != NULL);
 	ASSERTCMP(count, <=, AFSCBMAX);
 
+	/* TODO: Sort the callback break list by volume ID */
+
 	for (; count > 0; callbacks++, count--) {
 		_debug("- Fid { vl=%08x n=%u u=%u }  CB { v=%u x=%u t=%u }",
 		       callbacks->fid.vid,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2853acd64482..7d623008157f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1141,7 +1141,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
 				      &newfid, &newstatus, &newcb);
 		}
@@ -1211,7 +1211,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_remove(&fc, dentry->d_name.name, true,
 				      data_version);
 		}
@@ -1314,7 +1314,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_remove(&fc, dentry->d_name.name, false,
 				      data_version);
 		}
@@ -1371,7 +1371,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
 				      &newfid, &newstatus, &newcb);
 		}
@@ -1441,8 +1441,8 @@ static int afs_link(struct dentry *from, struct inode *dir,
 		}
 
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
-			fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+			fc.cb_break_2 = afs_calc_vnode_cb_break(vnode);
 			afs_fs_link(&fc, vnode, dentry->d_name.name, data_version);
 		}
 
@@ -1510,7 +1510,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_symlink(&fc, dentry->d_name.name,
 				       content, data_version,
 				       &newfid, &newstatus);
@@ -1586,8 +1586,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			}
 		}
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break;
-			fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(orig_dvnode);
+			fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode);
 			afs_fs_rename(&fc, old_dentry->d_name.name,
 				      new_dvnode, new_dentry->d_name.name,
 				      orig_data_version, new_data_version);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index c24c08016dd9..7d4f26198573 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -238,7 +238,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_fetch_data(&fc, desc);
 		}
 
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 7a0e017070ec..dc62d15a964b 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -86,7 +86,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_set_lock(&fc, type);
 		}
 
@@ -117,7 +117,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_current_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_extend_lock(&fc);
 		}
 
@@ -148,7 +148,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_current_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_release_lock(&fc);
 		}
 
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index d16f26c6cdbe..b273e1d60478 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -261,7 +261,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call,
 
 	write_seqlock(&vnode->cb_lock);
 
-	if (call->cb_break == (vnode->cb_break + cbi->server->cb_s_break)) {
+	if (call->cb_break == afs_cb_break_sum(vnode, cbi)) {
 		vnode->cb_version	= ntohl(*bp++);
 		cb_expiry		= ntohl(*bp++);
 		vnode->cb_type		= ntohl(*bp++);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e855c6e5cf28..479b7fdda124 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -108,7 +108,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_fetch_file_status(&fc, NULL, new_inode);
 		}
 
@@ -393,15 +393,18 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	read_seqlock_excl(&vnode->cb_lock);
 
 	if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
-		if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) {
+		if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break ||
+		    vnode->cb_v_break != vnode->volume->cb_v_break) {
 			vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
+			vnode->cb_v_break = vnode->volume->cb_v_break;
+			valid = false;
 		} else if (vnode->status.type == AFS_FTYPE_DIR &&
 			   test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) &&
 			   vnode->cb_expires_at - 10 > now) {
-				valid = true;
+			valid = true;
 		} else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
 			   vnode->cb_expires_at - 10 > now) {
-				valid = true;
+			valid = true;
 		}
 	} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
 		valid = true;
@@ -574,7 +577,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_setattr(&fc, attr);
 		}
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index e75e57e13320..e3f8a46663db 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -461,6 +461,9 @@ struct afs_volume {
 	rwlock_t		servers_lock;	/* Lock for ->servers */
 	unsigned int		servers_seq;	/* Incremented each time ->servers changes */
 
+	unsigned		cb_v_break;	/* Break-everything counter. */
+	rwlock_t		cb_break_lock;
+
 	afs_voltype_t		type;		/* type of volume */
 	short			error;
 	char			type_force;	/* force volume type (suppress R/O -> R/W) */
@@ -521,6 +524,7 @@ struct afs_vnode {
 	/* outstanding callback notification on this file */
 	struct afs_cb_interest	*cb_interest;	/* Server on which this resides */
 	unsigned int		cb_s_break;	/* Mass break counter on ->server */
+	unsigned int		cb_v_break;	/* Mass break counter on ->volume */
 	unsigned int		cb_break;	/* Break counter on vnode */
 	seqlock_t		cb_lock;	/* Lock for ->cb_interest, ->status, ->cb_*break */
 
@@ -662,6 +666,17 @@ static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest
 	return cbi;
 }
 
+static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
+{
+	return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break;
+}
+
+static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode,
+					    struct afs_cb_interest *cbi)
+{
+	return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break;
+}
+
 /*
  * cell.c
  */
diff --git a/fs/afs/security.c b/fs/afs/security.c
index cea2fff313dc..1992b0ffa543 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -147,8 +147,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
 					break;
 				}
 
-				if (cb_break != (vnode->cb_break +
-						 vnode->cb_interest->server->cb_s_break)) {
+				if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) {
 					changed = true;
 					break;
 				}
@@ -178,7 +177,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
 		}
 	}
 
-	if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break))
+	if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest))
 		goto someone_else_changed_it;
 
 	/* We need a ref on any permits list we want to copy as we'll have to
@@ -257,7 +256,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
 
 	spin_lock(&vnode->lock);
 	zap = rcu_access_pointer(vnode->permit_cache);
-	if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) &&
+	if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) &&
 	    zap == permits)
 		rcu_assign_pointer(vnode->permit_cache, replacement);
 	else
diff --git a/fs/afs/super.c b/fs/afs/super.c
index b02838cd9525..9e5d7966621c 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -688,7 +688,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		fc.flags |= AFS_FS_CURSOR_NO_VSLEEP;
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_get_volume_status(&fc, &vs);
 		}
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c164698dc304..8b39e6ebb40b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -351,7 +351,7 @@ static int afs_store_data(struct address_space *mapping,
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_store_data(&fc, mapping, first, last, offset, to);
 		}
 

From 428edade4e6c70e5b51fcd4188d944fbb744d84c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 12 May 2018 00:28:58 +0100
Subject: [PATCH 200/393] afs: Fix CB.CallBack handling

The handling of CB.CallBack messages sent by the fileserver to the client
is broken in that they are currently being processed after the reply has
been transmitted.

This is not what the fileserver expects, however.  It holds up change
visibility until the reply comes so as to maintain cache coherency, and so
expects the client to have to refetch the state on the affected files.

Fix CB.CallBack handling to perform the callback break before sending the
reply.

The fileserver is free to hold up status fetches issued by other threads on
the same client that occur in reponse to the callback until any pending
changes have been committed.

Fixes: d001648ec7cf ("rxrpc: Don't expose skbs to in-kernel users [ver #2]")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/cmservice.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index b44491410af3..c332c95a6940 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -133,21 +133,10 @@ bool afs_cm_incoming_call(struct afs_call *call)
 }
 
 /*
- * clean up a cache manager call
+ * Clean up a cache manager call.
  */
 static void afs_cm_destructor(struct afs_call *call)
 {
-	_enter("");
-
-	/* Break the callbacks here so that we do it after the final ACK is
-	 * received.  The step number here must match the final number in
-	 * afs_deliver_cb_callback().
-	 */
-	if (call->cm_server && call->unmarshall == 5) {
-		ASSERT(call->count && call->request);
-		afs_break_callbacks(call->cm_server, call->count, call->request);
-	}
-
 	kfree(call->buffer);
 	call->buffer = NULL;
 }
@@ -161,15 +150,14 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
 
 	_enter("");
 
-	/* be sure to send the reply *before* attempting to spam the AFS server
-	 * with FSFetchStatus requests on the vnodes with broken callbacks lest
-	 * the AFS server get into a vicious cycle of trying to break further
-	 * callbacks because it hadn't received completion of the CBCallBack op
-	 * yet */
-	afs_send_empty_reply(call);
-
+	/* We need to break the callbacks before sending the reply as the
+	 * server holds up change visibility till it receives our reply so as
+	 * to maintain cache coherency.
+	 */
 	if (call->cm_server)
 		afs_break_callbacks(call->cm_server, call->count, call->request);
+
+	afs_send_empty_reply(call);
 	afs_put_call(call);
 	_leave("");
 }
@@ -267,15 +255,6 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 
 		call->offset = 0;
 		call->unmarshall++;
-
-		/* Record that the message was unmarshalled successfully so
-		 * that the call destructor can know do the callback breaking
-		 * work, even if the final ACK isn't received.
-		 *
-		 * If the step number changes, then afs_cm_destructor() must be
-		 * updated also.
-		 */
-		call->unmarshall++;
 	case 5:
 		break;
 	}

From 4776cab43fd3111618112737a257dc3ef368eddd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 23:10:40 +0100
Subject: [PATCH 201/393] afs: Fix the non-encryption of calls

Some AFS servers refuse to accept unencrypted traffic, so can't be accessed
with kAFS.  Set the AF_RXRPC security level to encrypt client calls to deal
with this.

Note that incoming service calls are set by the remote client and so aren't
affected by this.

This requires an AF_RXRPC patch to pass the value set by setsockopt to calls
begun by the kernel.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/rxrpc.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d0eee5d32c94..08735948f15d 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -41,6 +41,7 @@ int afs_open_socket(struct afs_net *net)
 {
 	struct sockaddr_rxrpc srx;
 	struct socket *socket;
+	unsigned int min_level;
 	int ret;
 
 	_enter("");
@@ -60,6 +61,12 @@ int afs_open_socket(struct afs_net *net)
 	srx.transport.sin6.sin6_family	= AF_INET6;
 	srx.transport.sin6.sin6_port	= htons(AFS_CM_PORT);
 
+	min_level = RXRPC_SECURITY_ENCRYPT;
+	ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL,
+				(void *)&min_level, sizeof(min_level));
+	if (ret < 0)
+		goto error_2;
+
 	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
 	if (ret == -EADDRINUSE) {
 		srx.transport.sin6.sin6_port = 0;

From 6f2f0b394b54e2b159ef969a0b5274e9bbf82ff2 Mon Sep 17 00:00:00 2001
From: Robbie Ko <robbieko@synology.com>
Date: Mon, 14 May 2018 10:51:34 +0800
Subject: [PATCH 202/393] Btrfs: send, fix invalid access to commit roots due
 to concurrent snapshotting

[BUG]
btrfs incremental send BUG happens when creating a snapshot of snapshot
that is being used by send.

[REASON]
The problem can happen if while we are doing a send one of the snapshots
used (parent or send) is snapshotted, because snapshoting implies COWing
the root of the source subvolume/snapshot.

1. When doing an incremental send, the send process will get the commit
   roots from the parent and send snapshots, and add references to them
   through extent_buffer_get().

2. When a snapshot/subvolume is snapshotted, its root node is COWed
   (transaction.c:create_pending_snapshot()).

3. COWing releases the space used by the node immediately, through:

   __btrfs_cow_block()
   --btrfs_free_tree_block()
   ----btrfs_add_free_space(bytenr of node)

4. Because send doesn't hold a transaction open, it's possible that
   the transaction used to create the snapshot commits, switches the
   commit root and the old space used by the previous root node gets
   assigned to some other node allocation. Allocation of a new node will
   use the existing extent buffer found in memory, which we previously
   got a reference through extent_buffer_get(), and allow the extent
   buffer's content (pages) to be modified:

   btrfs_alloc_tree_block
   --btrfs_reserve_extent
   ----find_free_extent (get bytenr of old node)
   --btrfs_init_new_buffer (use bytenr of old node)
   ----btrfs_find_create_tree_block
   ------alloc_extent_buffer
   --------find_extent_buffer (get old node)

5. So send can access invalid memory content and have unpredictable
   behaviour.

[FIX]
So we fix the problem by copying the commit roots of the send and
parent snapshots and use those copies.

CallTrace looks like this:
 ------------[ cut here ]------------
 kernel BUG at fs/btrfs/ctree.c:1861!
 invalid opcode: 0000 [#1] SMP
 CPU: 6 PID: 24235 Comm: btrfs Tainted: P           O 3.10.105 #23721
 ffff88046652d680 ti: ffff88041b720000 task.ti: ffff88041b720000
 RIP: 0010:[<ffffffffa08dd0e8>] read_node_slot+0x108/0x110 [btrfs]
 RSP: 0018:ffff88041b723b68  EFLAGS: 00010246
 RAX: ffff88043ca6b000 RBX: ffff88041b723c50 RCX: ffff880000000000
 RDX: 000000000000004c RSI: ffff880314b133f8 RDI: ffff880458b24000
 RBP: 0000000000000000 R08: 0000000000000001 R09: ffff88041b723c66
 R10: 0000000000000001 R11: 0000000000001000 R12: ffff8803f3e48890
 R13: ffff8803f3e48880 R14: ffff880466351800 R15: 0000000000000001
 FS:  00007f8c321dc8c0(0000) GS:ffff88047fcc0000(0000)
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 R2: 00007efd1006d000 CR3: 0000000213a24000 CR4: 00000000003407e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Stack:
 ffff88041b723c50 ffff8803f3e48880 ffff8803f3e48890 ffff8803f3e48880
 ffff880466351800 0000000000000001 ffffffffa08dd9d7 ffff88041b723c50
 ffff8803f3e48880 ffff88041b723c66 ffffffffa08dde85 a9ff88042d2c4400
 Call Trace:
 [<ffffffffa08dd9d7>] ? tree_move_down.isra.33+0x27/0x50 [btrfs]
 [<ffffffffa08dde85>] ? tree_advance+0xb5/0xc0 [btrfs]
 [<ffffffffa08e83d4>] ? btrfs_compare_trees+0x2d4/0x760 [btrfs]
 [<ffffffffa0982050>] ? finish_inode_if_needed+0x870/0x870 [btrfs]
 [<ffffffffa09841ea>] ? btrfs_ioctl_send+0xeda/0x1050 [btrfs]
 [<ffffffffa094bd3d>] ? btrfs_ioctl+0x1e3d/0x33f0 [btrfs]
 [<ffffffff81111133>] ? handle_pte_fault+0x373/0x990
 [<ffffffff8153a096>] ? atomic_notifier_call_chain+0x16/0x20
 [<ffffffff81063256>] ? set_task_cpu+0xb6/0x1d0
 [<ffffffff811122c3>] ? handle_mm_fault+0x143/0x2a0
 [<ffffffff81539cc0>] ? __do_page_fault+0x1d0/0x500
 [<ffffffff81062f07>] ? check_preempt_curr+0x57/0x90
 [<ffffffff8115075a>] ? do_vfs_ioctl+0x4aa/0x990
 [<ffffffff81034f83>] ? do_fork+0x113/0x3b0
 [<ffffffff812dd7d7>] ? trace_hardirqs_off_thunk+0x3a/0x6c
 [<ffffffff81150cc8>] ? SyS_ioctl+0x88/0xa0
 [<ffffffff8153e422>] ? system_call_fastpath+0x16/0x1b
 ---[ end trace 29576629ee80b2e1 ]---

Fixes: 7069830a9e38 ("Btrfs: add btrfs_compare_trees function")
CC: stable@vger.kernel.org # 3.6+
Signed-off-by: Robbie Ko <robbieko@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fd44835b386..63488f0b850f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5414,12 +5414,24 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 	down_read(&fs_info->commit_root_sem);
 	left_level = btrfs_header_level(left_root->commit_root);
 	left_root_level = left_level;
-	left_path->nodes[left_level] = left_root->commit_root;
+	left_path->nodes[left_level] =
+			btrfs_clone_extent_buffer(left_root->commit_root);
+	if (!left_path->nodes[left_level]) {
+		up_read(&fs_info->commit_root_sem);
+		ret = -ENOMEM;
+		goto out;
+	}
 	extent_buffer_get(left_path->nodes[left_level]);
 
 	right_level = btrfs_header_level(right_root->commit_root);
 	right_root_level = right_level;
-	right_path->nodes[right_level] = right_root->commit_root;
+	right_path->nodes[right_level] =
+			btrfs_clone_extent_buffer(right_root->commit_root);
+	if (!right_path->nodes[right_level]) {
+		up_read(&fs_info->commit_root_sem);
+		ret = -ENOMEM;
+		goto out;
+	}
 	extent_buffer_get(right_path->nodes[right_level]);
 	up_read(&fs_info->commit_root_sem);
 

From 9a8fca62aacc1599fea8e813d01e1955513e4fad Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 11 May 2018 16:42:42 +0100
Subject: [PATCH 203/393] Btrfs: fix xattr loss after power failure

If a file has xattrs, we fsync it, to ensure we clear the flags
BTRFS_INODE_NEEDS_FULL_SYNC and BTRFS_INODE_COPY_EVERYTHING from its
inode, the current transaction commits and then we fsync it (without
either of those bits being set in its inode), we end up not logging
all its xattrs. This results in deleting all xattrs when replying the
log after a power failure.

Trivial reproducer

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foobar
  $ setfattr -n user.xa -v qwerty /mnt/foobar
  $ xfs_io -c "fsync" /mnt/foobar

  $ sync

  $ xfs_io -c "pwrite -S 0xab 0 64K" /mnt/foobar
  $ xfs_io -c "fsync" /mnt/foobar
  <power failure>

  $ mount /dev/sdb /mnt
  $ getfattr --absolute-names --dump /mnt/foobar
  <empty output>
  $

So fix this by making sure all xattrs are logged if we log a file's inode
item and neither the flags BTRFS_INODE_NEEDS_FULL_SYNC nor
BTRFS_INODE_COPY_EVERYTHING were set in the inode.

Fixes: 36283bf777d9 ("Btrfs: fix fsync xattr loss in the fast fsync path")
Cc: <stable@vger.kernel.org> # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 43758e30aa7a..c1509547c762 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4827,6 +4827,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct extent_map_tree *em_tree = &inode->extent_tree;
 	u64 logged_isize = 0;
 	bool need_log_inode_item = true;
+	bool xattrs_logged = false;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -5128,6 +5129,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
 	if (err)
 		goto out_unlock;
+	xattrs_logged = true;
 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
 		btrfs_release_path(path);
 		btrfs_release_path(dst_path);
@@ -5140,6 +5142,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	btrfs_release_path(dst_path);
 	if (need_log_inode_item) {
 		err = log_inode_item(trans, log, dst_path, inode);
+		if (!err && !xattrs_logged) {
+			err = btrfs_log_all_xattrs(trans, root, inode, path,
+						   dst_path);
+			btrfs_release_path(path);
+		}
 		if (err)
 			goto out_unlock;
 	}

From ebc3dd688cd988754a304147753b13e58de1b5a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Gl=C3=B6ckner?= <dg@emlix.com>
Date: Mon, 14 May 2018 09:40:05 -0500
Subject: [PATCH 204/393] usb: musb: fix remote wakeup racing with suspend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It has been observed that writing 0xF2 to the power register while it
reads as 0xF4 results in the register having the value 0xF0, i.e. clearing
RESUME and setting SUSPENDM in one go does not work. It might also violate
the USB spec to transition directly from resume to suspend, especially
when not taking T_DRSMDN into account. But this is what happens when a
remote wakeup occurs between SetPortFeature USB_PORT_FEAT_SUSPEND on the
root hub and musb_bus_suspend being called.

This commit returns -EBUSY when musb_bus_suspend is called while remote
wakeup is signalled and thus avoids to reset the RESUME bit. Ignoring
this error when musb_port_suspend is called from musb_hub_control is ok.

Signed-off-by: Daniel Glöckner <dg@emlix.com>
Signed-off-by: Bin Liu <b-liu@ti.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/musb/musb_host.c    |  5 ++++-
 drivers/usb/musb/musb_host.h    |  7 +++++--
 drivers/usb/musb/musb_virthub.c | 25 +++++++++++++++----------
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/drivers/usb/musb/musb_host.c b/drivers/usb/musb/musb_host.c
index e7f99d55922a..15a42cee0a9c 100644
--- a/drivers/usb/musb/musb_host.c
+++ b/drivers/usb/musb/musb_host.c
@@ -2524,8 +2524,11 @@ static int musb_bus_suspend(struct usb_hcd *hcd)
 {
 	struct musb	*musb = hcd_to_musb(hcd);
 	u8		devctl;
+	int		ret;
 
-	musb_port_suspend(musb, true);
+	ret = musb_port_suspend(musb, true);
+	if (ret)
+		return ret;
 
 	if (!is_host_active(musb))
 		return 0;
diff --git a/drivers/usb/musb/musb_host.h b/drivers/usb/musb/musb_host.h
index 72392bbcd0a4..2999845632ce 100644
--- a/drivers/usb/musb/musb_host.h
+++ b/drivers/usb/musb/musb_host.h
@@ -67,7 +67,7 @@ extern void musb_host_rx(struct musb *, u8);
 extern void musb_root_disconnect(struct musb *musb);
 extern void musb_host_resume_root_hub(struct musb *musb);
 extern void musb_host_poke_root_hub(struct musb *musb);
-extern void musb_port_suspend(struct musb *musb, bool do_suspend);
+extern int musb_port_suspend(struct musb *musb, bool do_suspend);
 extern void musb_port_reset(struct musb *musb, bool do_reset);
 extern void musb_host_finish_resume(struct work_struct *work);
 #else
@@ -99,7 +99,10 @@ static inline void musb_root_disconnect(struct musb *musb)	{}
 static inline void musb_host_resume_root_hub(struct musb *musb)	{}
 static inline void musb_host_poll_rh_status(struct musb *musb)	{}
 static inline void musb_host_poke_root_hub(struct musb *musb)	{}
-static inline void musb_port_suspend(struct musb *musb, bool do_suspend) {}
+static inline int musb_port_suspend(struct musb *musb, bool do_suspend)
+{
+	return 0;
+}
 static inline void musb_port_reset(struct musb *musb, bool do_reset) {}
 static inline void musb_host_finish_resume(struct work_struct *work) {}
 #endif
diff --git a/drivers/usb/musb/musb_virthub.c b/drivers/usb/musb/musb_virthub.c
index 5165d2b07ade..2f8dd9826e94 100644
--- a/drivers/usb/musb/musb_virthub.c
+++ b/drivers/usb/musb/musb_virthub.c
@@ -48,14 +48,14 @@ void musb_host_finish_resume(struct work_struct *work)
 	spin_unlock_irqrestore(&musb->lock, flags);
 }
 
-void musb_port_suspend(struct musb *musb, bool do_suspend)
+int musb_port_suspend(struct musb *musb, bool do_suspend)
 {
 	struct usb_otg	*otg = musb->xceiv->otg;
 	u8		power;
 	void __iomem	*mbase = musb->mregs;
 
 	if (!is_host_active(musb))
-		return;
+		return 0;
 
 	/* NOTE:  this doesn't necessarily put PHY into low power mode,
 	 * turning off its clock; that's a function of PHY integration and
@@ -66,16 +66,20 @@ void musb_port_suspend(struct musb *musb, bool do_suspend)
 	if (do_suspend) {
 		int retries = 10000;
 
-		power &= ~MUSB_POWER_RESUME;
-		power |= MUSB_POWER_SUSPENDM;
-		musb_writeb(mbase, MUSB_POWER, power);
+		if (power & MUSB_POWER_RESUME)
+			return -EBUSY;
 
-		/* Needed for OPT A tests */
-		power = musb_readb(mbase, MUSB_POWER);
-		while (power & MUSB_POWER_SUSPENDM) {
+		if (!(power & MUSB_POWER_SUSPENDM)) {
+			power |= MUSB_POWER_SUSPENDM;
+			musb_writeb(mbase, MUSB_POWER, power);
+
+			/* Needed for OPT A tests */
 			power = musb_readb(mbase, MUSB_POWER);
-			if (retries-- < 1)
-				break;
+			while (power & MUSB_POWER_SUSPENDM) {
+				power = musb_readb(mbase, MUSB_POWER);
+				if (retries-- < 1)
+					break;
+			}
 		}
 
 		musb_dbg(musb, "Root port suspended, power %02x", power);
@@ -111,6 +115,7 @@ void musb_port_suspend(struct musb *musb, bool do_suspend)
 		schedule_delayed_work(&musb->finish_resume_work,
 				      msecs_to_jiffies(USB_RESUME_TIMEOUT));
 	}
+	return 0;
 }
 
 void musb_port_reset(struct musb *musb, bool do_reset)

From 76936e9a6df17b89481bd2655c8684291afbe656 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Cami?= <fcami@fedoraproject.org>
Date: Sun, 13 May 2018 20:11:15 +0200
Subject: [PATCH 205/393] libata: Apply NOLPM quirk for SAMSUNG PM830 CXM13D1Q.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this patch the drive errors out regularly:

[    1.090154] ata1.00: ATA-8: SAMSUNG SSD PM830 mSATA 256GB,
CXM13D1Q, max UDMA/133
(...)
[  345.154996] ata1.00: exception Emask 0x40 SAct 0x0 SErr 0xc0800 action 0x6
[  345.155006] ata1.00: irq_stat 0x40000001
[  345.155013] ata1: SError: { HostInt CommWake 10B8B }
[  345.155018] ata1.00: failed command: SET FEATURES
[  345.155032] ata1.00: cmd ef/05:e1:00:00:00/00:00:00:00:00/40 tag 7
                        res 51/04:e1:00:00:00/00:00:00:00:00/40 Emask 0x41 (internal error)
[  345.155038] ata1.00: status: { DRDY ERR }
[  345.155042] ata1.00: error: { ABRT }
[  345.155051] ata1: hard resetting link
[  345.465661] ata1: SATA link up 6.0 Gbps (SStatus 133 SControl 300)
[  345.466955] ata1.00: configured for UDMA/133
[  345.467085] ata1: EH complete

Signed-off-by: François Cami <fcami@fedoraproject.org>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index a2498f5cfb28..7ed2f009911a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4553,8 +4553,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 						ATA_HORKAGE_ZERO_AFTER_TRIM |
 						ATA_HORKAGE_NOLPM, },
 
-	/* This specific Samsung model/firmware-rev does not handle LPM well */
+	/* These specific Samsung models/firmware-revs do not handle LPM well */
 	{ "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_HORKAGE_NOLPM, },
+	{ "SAMSUNG SSD PM830 mSATA *",  "CXM13D1Q", ATA_HORKAGE_NOLPM, },
 
 	/* Sandisk devices which are known to not handle LPM well */
 	{ "SanDisk SD7UB3Q*G1001",	NULL,	ATA_HORKAGE_NOLPM, },

From 9954b80b8c0e8abc98e17bba0fccd9876211ceaa Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Thu, 10 May 2018 14:24:20 +0100
Subject: [PATCH 206/393] ARM: keystone: fix platform_domain_notifier array
 overrun

platform_domain_notifier contains a variable sized array, which the
pm_clk_notify() notifier treats as a NULL terminated array:

     for (con_id = clknb->con_ids; *con_id; con_id++)
             pm_clk_add(dev, *con_id);

Omitting the initialiser for con_ids means that the array is zero
sized, and there is no NULL terminator.  This leads to pm_clk_notify()
overrunning into what ever structure follows, which may not be NULL.
This leads to an oops:

Unable to handle kernel NULL pointer dereference at virtual address 0000008c
pgd = c0003000
[0000008c] *pgd=80000800004003c, *pmd=00000000c
Internal error: Oops: 206 [#1] PREEMPT SMP ARM
Modules linked in:c
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.16.0+ #9
Hardware name: Keystone
PC is at strlen+0x0/0x34
LR is at kstrdup+0x18/0x54
pc : [<c0623340>]    lr : [<c0111d6c>]    psr: 20000013
sp : eec73dc0  ip : eed780c0  fp : 00000001
r10: 00000000  r9 : 00000000  r8 : eed71e10
r7 : 0000008c  r6 : 0000008c  r5 : 014000c0  r4 : c03a6ff4
r3 : c09445d0  r2 : 00000000  r1 : 014000c0  r0 : 0000008c
Flags: nzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user
Control: 30c5387d  Table: 00003000  DAC: fffffffd
Process swapper/0 (pid: 1, stack limit = 0xeec72210)
Stack: (0xeec73dc0 to 0xeec74000)
...
[<c0623340>] (strlen) from [<c0111d6c>] (kstrdup+0x18/0x54)
[<c0111d6c>] (kstrdup) from [<c03a6ff4>] (__pm_clk_add+0x58/0x120)
[<c03a6ff4>] (__pm_clk_add) from [<c03a731c>] (pm_clk_notify+0x64/0xa8)
[<c03a731c>] (pm_clk_notify) from [<c004614c>] (notifier_call_chain+0x44/0x84)
[<c004614c>] (notifier_call_chain) from [<c0046320>] (__blocking_notifier_call_chain+0x48/0x60)
[<c0046320>] (__blocking_notifier_call_chain) from [<c0046350>] (blocking_notifier_call_chain+0x18/0x20)
[<c0046350>] (blocking_notifier_call_chain) from [<c0390234>] (device_add+0x36c/0x534)
[<c0390234>] (device_add) from [<c047fc00>] (of_platform_device_create_pdata+0x70/0xa4)
[<c047fc00>] (of_platform_device_create_pdata) from [<c047fea0>] (of_platform_bus_create+0xf0/0x1ec)
[<c047fea0>] (of_platform_bus_create) from [<c047fff8>] (of_platform_populate+0x5c/0xac)
[<c047fff8>] (of_platform_populate) from [<c08b1f04>] (of_platform_default_populate_init+0x8c/0xa8)
[<c08b1f04>] (of_platform_default_populate_init) from [<c000a78c>] (do_one_initcall+0x3c/0x164)
[<c000a78c>] (do_one_initcall) from [<c087bd9c>] (kernel_init_freeable+0x10c/0x1d0)
[<c087bd9c>] (kernel_init_freeable) from [<c0628db0>] (kernel_init+0x8/0xf0)
[<c0628db0>] (kernel_init) from [<c00090d8>] (ret_from_fork+0x14/0x3c)
Exception stack(0xeec73fb0 to 0xeec73ff8)
3fa0:                                     00000000 00000000 00000000 00000000
3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
3fe0: 00000000 00000000 00000000 00000000 00000013 00000000
Code: e3520000 1afffff7 e12fff1e c0801730 (e5d02000)
---[ end trace cafa8f148e262e80 ]---

Fix this by adding the necessary initialiser.

Fixes: fc20ffe1213b ("ARM: keystone: add PM domain support for clock management")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Acked-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 arch/arm/mach-keystone/pm_domain.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/mach-keystone/pm_domain.c b/arch/arm/mach-keystone/pm_domain.c
index fe57e2692629..abca83d22ff3 100644
--- a/arch/arm/mach-keystone/pm_domain.c
+++ b/arch/arm/mach-keystone/pm_domain.c
@@ -29,6 +29,7 @@ static struct dev_pm_domain keystone_pm_domain = {
 
 static struct pm_clk_notifier_block platform_domain_notifier = {
 	.pm_domain = &keystone_pm_domain,
+	.con_ids = { NULL },
 };
 
 static const struct of_device_id of_keystone_table[] = {

From b196d88aba8ac72b775137854121097f4c4c6862 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 11 May 2018 10:49:25 +0800
Subject: [PATCH 207/393] tun: fix use after free for ptr_ring

We used to initialize ptr_ring during TUNSETIFF, this is because its
size depends on the tx_queue_len of netdevice. And we try to clean it
up when socket were detached from netdevice. A race were spotted when
trying to do uninit during a read which will lead a use after free for
pointer ring. Solving this by always initialize a zero size ptr_ring
in open() and do resizing during TUNSETIFF, and then we can safely do
cleanup during close(). With this, there's no need for the workaround
that was introduced by commit 4df0bfc79904 ("tun: fix a memory leak
for tfile->tx_array").

Reported-by: syzbot+e8b902c3c3fadf0a9dba@syzkaller.appspotmail.com
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Fixes: 1576d9860599 ("tun: switch to use skb array for tx")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ef33950a45d9..9fbbb328b95b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -681,15 +681,6 @@ static void tun_queue_purge(struct tun_file *tfile)
 	skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
-static void tun_cleanup_tx_ring(struct tun_file *tfile)
-{
-	if (tfile->tx_ring.queue) {
-		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
-		xdp_rxq_info_unreg(&tfile->xdp_rxq);
-		memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
-	}
-}
-
 static void __tun_detach(struct tun_file *tfile, bool clean)
 {
 	struct tun_file *ntfile;
@@ -736,7 +727,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 			    tun->dev->reg_state == NETREG_REGISTERED)
 				unregister_netdevice(tun->dev);
 		}
-		tun_cleanup_tx_ring(tfile);
+		if (tun)
+			xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
 	}
 }
@@ -783,14 +775,14 @@ static void tun_detach_all(struct net_device *dev)
 		tun_napi_del(tun, tfile);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
+		xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
-		tun_cleanup_tx_ring(tfile);
 	}
 	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
 		tun_enable_queue(tfile);
 		tun_queue_purge(tfile);
+		xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
-		tun_cleanup_tx_ring(tfile);
 	}
 	BUG_ON(tun->numdisabled != 0);
 
@@ -834,7 +826,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 	}
 
 	if (!tfile->detached &&
-	    ptr_ring_init(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL)) {
+	    ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
+			    GFP_KERNEL, tun_ptr_free)) {
 		err = -ENOMEM;
 		goto out;
 	}
@@ -3219,6 +3212,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 					    &tun_proto, 0);
 	if (!tfile)
 		return -ENOMEM;
+	if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
+		sk_free(&tfile->sk);
+		return -ENOMEM;
+	}
+
 	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
@@ -3239,8 +3237,6 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 
 	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
 
-	memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
-
 	return 0;
 }
 
@@ -3249,6 +3245,7 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 	struct tun_file *tfile = file->private_data;
 
 	tun_detach(tfile, true);
+	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 
 	return 0;
 }

From 0f8db8cc73df60b3de9a5eebd8f117b56eff5b03 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Fri, 11 May 2018 20:15:11 +0300
Subject: [PATCH 208/393] selinux: add AF_UNSPEC and INADDR_ANY checks to
 selinux_socket_bind()

Commit d452930fd3b9 ("selinux: Add SCTP support") breaks compatibility
with the old programs that can pass sockaddr_in structure with AF_UNSPEC
and INADDR_ANY to bind(). As a result, bind() returns EAFNOSUPPORT error.
This was found with LTP/asapi_01 test.

Similar to commit 29c486df6a20 ("net: ipv4: relax AF_INET check in
bind()"), which relaxed AF_INET check for compatibility, add AF_UNSPEC
case to AF_INET and make sure that the address is INADDR_ANY.

Fixes: d452930fd3b9 ("selinux: Add SCTP support")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/selinux/hooks.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 21b377aef69a..16df6cca9a1b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4568,6 +4568,7 @@ static int selinux_socket_post_create(struct socket *sock, int family,
 static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
 {
 	struct sock *sk = sock->sk;
+	struct sk_security_struct *sksec = sk->sk_security;
 	u16 family;
 	int err;
 
@@ -4579,11 +4580,11 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 	family = sk->sk_family;
 	if (family == PF_INET || family == PF_INET6) {
 		char *addrp;
-		struct sk_security_struct *sksec = sk->sk_security;
 		struct common_audit_data ad;
 		struct lsm_network_audit net = {0,};
 		struct sockaddr_in *addr4 = NULL;
 		struct sockaddr_in6 *addr6 = NULL;
+		u16 family_sa = address->sa_family;
 		unsigned short snum;
 		u32 sid, node_perm;
 
@@ -4593,11 +4594,20 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 		 * need to check address->sa_family as it is possible to have
 		 * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
 		 */
-		switch (address->sa_family) {
+		switch (family_sa) {
+		case AF_UNSPEC:
 		case AF_INET:
 			if (addrlen < sizeof(struct sockaddr_in))
 				return -EINVAL;
 			addr4 = (struct sockaddr_in *)address;
+			if (family_sa == AF_UNSPEC) {
+				/* see __inet_bind(), we only want to allow
+				 * AF_UNSPEC if the address is INADDR_ANY
+				 */
+				if (addr4->sin_addr.s_addr != htonl(INADDR_ANY))
+					goto err_af;
+				family_sa = AF_INET;
+			}
 			snum = ntohs(addr4->sin_port);
 			addrp = (char *)&addr4->sin_addr.s_addr;
 			break;
@@ -4609,13 +4619,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 			addrp = (char *)&addr6->sin6_addr.s6_addr;
 			break;
 		default:
-			/* Note that SCTP services expect -EINVAL, whereas
-			 * others expect -EAFNOSUPPORT.
-			 */
-			if (sksec->sclass == SECCLASS_SCTP_SOCKET)
-				return -EINVAL;
-			else
-				return -EAFNOSUPPORT;
+			goto err_af;
 		}
 
 		if (snum) {
@@ -4673,7 +4677,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 		ad.u.net->sport = htons(snum);
 		ad.u.net->family = family;
 
-		if (address->sa_family == AF_INET)
+		if (family_sa == AF_INET)
 			ad.u.net->v4info.saddr = addr4->sin_addr.s_addr;
 		else
 			ad.u.net->v6info.saddr = addr6->sin6_addr;
@@ -4686,6 +4690,11 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 	}
 out:
 	return err;
+err_af:
+	/* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. */
+	if (sksec->sclass == SECCLASS_SCTP_SOCKET)
+		return -EINVAL;
+	return -EAFNOSUPPORT;
 }
 
 /* This supports connect(2) and SCTP connect services such as sctp_connectx(3)

From 88b7d370bb4b1280717ebdacd6748456f9ba484f Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Fri, 11 May 2018 20:15:12 +0300
Subject: [PATCH 209/393] selinux: fix address family in bind() and connect()
 to match address/port

Since sctp_bindx() and sctp_connectx() can have multiple addresses,
sk_family can differ from sa_family. Therefore, selinux_socket_bind()
and selinux_socket_connect_helper(), which process sockaddr structure
(address and port), should use the address family from that structure
too, and not from the socket one.

The initialization of the data for the audit record is moved above,
in selinux_socket_bind(), so that there is no duplicate changes and
code.

Fixes: d452930fd3b9 ("selinux: Add SCTP support")
Suggested-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/selinux/hooks.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 16df6cca9a1b..f5f2d6a582f0 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4622,6 +4622,11 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 			goto err_af;
 		}
 
+		ad.type = LSM_AUDIT_DATA_NET;
+		ad.u.net = &net;
+		ad.u.net->sport = htons(snum);
+		ad.u.net->family = family_sa;
+
 		if (snum) {
 			int low, high;
 
@@ -4633,10 +4638,6 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 						      snum, &sid);
 				if (err)
 					goto out;
-				ad.type = LSM_AUDIT_DATA_NET;
-				ad.u.net = &net;
-				ad.u.net->sport = htons(snum);
-				ad.u.net->family = family;
 				err = avc_has_perm(&selinux_state,
 						   sksec->sid, sid,
 						   sksec->sclass,
@@ -4668,15 +4669,10 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 			break;
 		}
 
-		err = sel_netnode_sid(addrp, family, &sid);
+		err = sel_netnode_sid(addrp, family_sa, &sid);
 		if (err)
 			goto out;
 
-		ad.type = LSM_AUDIT_DATA_NET;
-		ad.u.net = &net;
-		ad.u.net->sport = htons(snum);
-		ad.u.net->family = family;
-
 		if (family_sa == AF_INET)
 			ad.u.net->v4info.saddr = addr4->sin_addr.s_addr;
 		else
@@ -4772,7 +4768,7 @@ static int selinux_socket_connect_helper(struct socket *sock,
 		ad.type = LSM_AUDIT_DATA_NET;
 		ad.u.net = &net;
 		ad.u.net->dport = htons(snum);
-		ad.u.net->family = sk->sk_family;
+		ad.u.net->family = address->sa_family;
 		err = avc_has_perm(&selinux_state,
 				   sksec->sid, sid, sksec->sclass, perm, &ad);
 		if (err)

From 4152dc91b5932e7fe49a5afed62a068b2f31d196 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Fri, 11 May 2018 20:15:13 +0300
Subject: [PATCH 210/393] selinux: correctly handle sa_family cases in
 selinux_sctp_bind_connect()

Allow to pass the socket address structure with AF_UNSPEC family for
compatibility purposes. selinux_socket_bind() will further check it
for INADDR_ANY and selinux_socket_connect_helper() should return
EINVAL.

For a bad address family return EINVAL instead of AFNOSUPPORT error,
i.e. what is expected from SCTP protocol in such case.

Fixes: d452930fd3b9 ("selinux: Add SCTP support")
Suggested-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/selinux/hooks.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index f5f2d6a582f0..efeb1db8f61d 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -5269,6 +5269,7 @@ static int selinux_sctp_bind_connect(struct sock *sk, int optname,
 	while (walk_size < addrlen) {
 		addr = addr_buf;
 		switch (addr->sa_family) {
+		case AF_UNSPEC:
 		case AF_INET:
 			len = sizeof(struct sockaddr_in);
 			break;
@@ -5276,7 +5277,7 @@ static int selinux_sctp_bind_connect(struct sock *sk, int optname,
 			len = sizeof(struct sockaddr_in6);
 			break;
 		default:
-			return -EAFNOSUPPORT;
+			return -EINVAL;
 		}
 
 		err = -EINVAL;

From 7b34c0fb1b51a6377752fc971e57577eeb102d60 Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Tue, 8 May 2018 14:06:15 -0700
Subject: [PATCH 211/393] ACPICA: Add deferred package support for the Load and
 loadTable operators

Completes the support and fixes a regression introduced in
version 20180209.

The regression caused package objects that were loaded by the Load and
loadTable operators. This created an error message like the following:

[    0.251922] ACPI Error: No pointer back to namespace node in package
00000000fd2a44cd (20180313/dsargs-303)

Link: https://bugzilla.kernel.org/show_bug.cgi?id=199413
Fixes: 5a8361f7ecce (ACPICA: Integrate package handling with module-level code)
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Erik Schmauss <erik.schmauss@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpica/acnamesp.h |  4 ++
 drivers/acpi/acpica/exconfig.c | 14 +++++++
 drivers/acpi/acpica/nsinit.c   | 76 +++++++++++++++++++++++++---------
 3 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/drivers/acpi/acpica/acnamesp.h b/drivers/acpi/acpica/acnamesp.h
index 514aaf948ea9..3825df923480 100644
--- a/drivers/acpi/acpica/acnamesp.h
+++ b/drivers/acpi/acpica/acnamesp.h
@@ -56,6 +56,10 @@ acpi_status acpi_ns_initialize_objects(void);
 
 acpi_status acpi_ns_initialize_devices(u32 flags);
 
+acpi_status
+acpi_ns_init_one_package(acpi_handle obj_handle,
+			 u32 level, void *context, void **return_value);
+
 /*
  * nsload -  Namespace loading
  */
diff --git a/drivers/acpi/acpica/exconfig.c b/drivers/acpi/acpica/exconfig.c
index 99d92cb32803..f85c6f3271f6 100644
--- a/drivers/acpi/acpica/exconfig.c
+++ b/drivers/acpi/acpica/exconfig.c
@@ -174,6 +174,13 @@ acpi_ex_load_table_op(struct acpi_walk_state *walk_state,
 		return_ACPI_STATUS(status);
 	}
 
+	/* Complete the initialization/resolution of package objects */
+
+	status = acpi_ns_walk_namespace(ACPI_TYPE_PACKAGE, ACPI_ROOT_OBJECT,
+					ACPI_UINT32_MAX, 0,
+					acpi_ns_init_one_package, NULL, NULL,
+					NULL);
+
 	/* Parameter Data (optional) */
 
 	if (parameter_node) {
@@ -430,6 +437,13 @@ acpi_ex_load_op(union acpi_operand_object *obj_desc,
 		return_ACPI_STATUS(status);
 	}
 
+	/* Complete the initialization/resolution of package objects */
+
+	status = acpi_ns_walk_namespace(ACPI_TYPE_PACKAGE, ACPI_ROOT_OBJECT,
+					ACPI_UINT32_MAX, 0,
+					acpi_ns_init_one_package, NULL, NULL,
+					NULL);
+
 	/* Store the ddb_handle into the Target operand */
 
 	status = acpi_ex_store(ddb_handle, target, walk_state);
diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c
index 77f2b5f4948a..d77257d1c827 100644
--- a/drivers/acpi/acpica/nsinit.c
+++ b/drivers/acpi/acpica/nsinit.c
@@ -240,6 +240,58 @@ acpi_status acpi_ns_initialize_devices(u32 flags)
 	return_ACPI_STATUS(status);
 }
 
+/*******************************************************************************
+ *
+ * FUNCTION:    acpi_ns_init_one_package
+ *
+ * PARAMETERS:  obj_handle      - Node
+ *              level           - Current nesting level
+ *              context         - Not used
+ *              return_value    - Not used
+ *
+ * RETURN:      Status
+ *
+ * DESCRIPTION: Callback from acpi_walk_namespace. Invoked for every package
+ *              within the namespace. Used during dynamic load of an SSDT.
+ *
+ ******************************************************************************/
+
+acpi_status
+acpi_ns_init_one_package(acpi_handle obj_handle,
+			 u32 level, void *context, void **return_value)
+{
+	acpi_status status;
+	union acpi_operand_object *obj_desc;
+	struct acpi_namespace_node *node =
+	    (struct acpi_namespace_node *)obj_handle;
+
+	obj_desc = acpi_ns_get_attached_object(node);
+	if (!obj_desc) {
+		return (AE_OK);
+	}
+
+	/* Exit if package is already initialized */
+
+	if (obj_desc->package.flags & AOPOBJ_DATA_VALID) {
+		return (AE_OK);
+	}
+
+	status = acpi_ds_get_package_arguments(obj_desc);
+	if (ACPI_FAILURE(status)) {
+		return (AE_OK);
+	}
+
+	status =
+	    acpi_ut_walk_package_tree(obj_desc, NULL,
+				      acpi_ds_init_package_element, NULL);
+	if (ACPI_FAILURE(status)) {
+		return (AE_OK);
+	}
+
+	obj_desc->package.flags |= AOPOBJ_DATA_VALID;
+	return (AE_OK);
+}
+
 /*******************************************************************************
  *
  * FUNCTION:    acpi_ns_init_one_object
@@ -360,27 +412,11 @@ acpi_ns_init_one_object(acpi_handle obj_handle,
 
 	case ACPI_TYPE_PACKAGE:
 
+		/* Complete the initialization/resolution of the package object */
+
 		info->package_init++;
-		status = acpi_ds_get_package_arguments(obj_desc);
-		if (ACPI_FAILURE(status)) {
-			break;
-		}
-
-		ACPI_DEBUG_PRINT_RAW((ACPI_DB_PARSE,
-				      "%s: Completing resolution of Package elements\n",
-				      ACPI_GET_FUNCTION_NAME));
-
-		/*
-		 * Resolve all named references in package objects (and all
-		 * sub-packages). This action has been deferred until the entire
-		 * namespace has been loaded, in order to support external and
-		 * forward references from individual package elements (05/2017).
-		 */
-		status = acpi_ut_walk_package_tree(obj_desc, NULL,
-						   acpi_ds_init_package_element,
-						   NULL);
-
-		obj_desc->package.flags |= AOPOBJ_DATA_VALID;
+		status =
+		    acpi_ns_init_one_package(obj_handle, level, NULL, NULL);
 		break;
 
 	default:

From 0cf442c6bcf572e04f5690340d5b8e62afcee2ca Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 24 Apr 2018 17:45:06 +0200
Subject: [PATCH 212/393] cpufreq: armada-37xx: driver relies on cpufreq-dt

Armada-37xx driver registers a cpufreq-dt driver. Not having
CONFIG_CPUFREQ_DT selected leads to a silent abort during the probe.
Prevent that situation by having the former depending on the latter.

Fixes: 92ce45fb875d7 (cpufreq: Add DVFS support for Armada 37xx)
Cc: 4.16+ <stable@vger.kernel.org> # 4.16+
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig.arm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index de55c7d57438..96b35b8b3606 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -20,7 +20,7 @@ config ACPI_CPPC_CPUFREQ
 
 config ARM_ARMADA_37XX_CPUFREQ
 	tristate "Armada 37xx CPUFreq support"
-	depends on ARCH_MVEBU
+	depends on ARCH_MVEBU && CPUFREQ_DT
 	help
 	  This adds the CPUFreq driver support for Marvell Armada 37xx SoCs.
 	  The Armada 37xx PMU supports 4 frequency and VDD levels.

From 57f6f99fdad9984801cde05c1db68fe39b474a10 Mon Sep 17 00:00:00 2001
From: Tarick Bedeir <tarick@google.com>
Date: Sun, 13 May 2018 16:38:45 -0700
Subject: [PATCH 213/393] net/mlx4_core: Fix error handling in
 mlx4_init_port_info.

Avoid exiting the function with a lingering sysfs file (if the first
call to device_create_file() fails while the second succeeds), and avoid
calling devlink_port_unregister() twice.

In other words, either mlx4_init_port_info() succeeds and returns zero, or
it fails, returns non-zero, and requires no cleanup.

Fixes: 096335b3f983 ("mlx4_core: Allow dynamic MTU configuration for IB ports")
Signed-off-by: Tarick Bedeir <tarick@google.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 211578ffc70d..60172a38c4a4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -2929,6 +2929,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 		mlx4_err(dev, "Failed to create file for port %d\n", port);
 		devlink_port_unregister(&info->devlink_port);
 		info->port = -1;
+		return err;
 	}
 
 	sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port);
@@ -2950,9 +2951,10 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 				   &info->port_attr);
 		devlink_port_unregister(&info->devlink_port);
 		info->port = -1;
+		return err;
 	}
 
-	return err;
+	return 0;
 }
 
 static void mlx4_cleanup_port_info(struct mlx4_port_info *info)

From d49baa7e12ee70c0a7b821d088a770c94c02e494 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 May 2018 17:01:30 -0700
Subject: [PATCH 214/393] net/smc: check for missing nlattrs in SMC_PNETID
 messages

It's possible to crash the kernel in several different ways by sending
messages to the SMC_PNETID generic netlink family that are missing the
expected attributes:

- Missing SMC_PNETID_NAME => null pointer dereference when comparing
  names.
- Missing SMC_PNETID_ETHNAME => null pointer dereference accessing
  smc_pnetentry::ndev.
- Missing SMC_PNETID_IBNAME => null pointer dereference accessing
  smc_pnetentry::smcibdev.
- Missing SMC_PNETID_IBPORT => out of bounds array access to
  smc_ib_device::pattr[-1].

Fix it by validating that all expected attributes are present and that
SMC_PNETID_IBPORT is nonzero.

Reported-by: syzbot+5cd61039dc9b8bfa6e47@syzkaller.appspotmail.com
Fixes: 6812baabf24d ("smc: establish pnet table management")
Cc: <stable@vger.kernel.org> # v4.11+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_pnet.c | 71 ++++++++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 31 deletions(-)

diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 74568cdbca70..d7b88b2d1b22 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -245,40 +245,45 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
 static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
 			       struct nlattr *tb[])
 {
-	char *string, *ibname = NULL;
-	int rc = 0;
+	char *string, *ibname;
+	int rc;
 
 	memset(pnetelem, 0, sizeof(*pnetelem));
 	INIT_LIST_HEAD(&pnetelem->list);
-	if (tb[SMC_PNETID_NAME]) {
-		string = (char *)nla_data(tb[SMC_PNETID_NAME]);
-		if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
-			rc = -EINVAL;
-			goto error;
-		}
-	}
-	if (tb[SMC_PNETID_ETHNAME]) {
-		string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
-		pnetelem->ndev = dev_get_by_name(net, string);
-		if (!pnetelem->ndev)
-			return -ENOENT;
-	}
-	if (tb[SMC_PNETID_IBNAME]) {
-		ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
-		ibname = strim(ibname);
-		pnetelem->smcibdev = smc_pnet_find_ib(ibname);
-		if (!pnetelem->smcibdev) {
-			rc = -ENOENT;
-			goto error;
-		}
-	}
-	if (tb[SMC_PNETID_IBPORT]) {
-		pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
-		if (pnetelem->ib_port > SMC_MAX_PORTS) {
-			rc = -EINVAL;
-			goto error;
-		}
-	}
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_NAME])
+		goto error;
+	string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+	if (!smc_pnetid_valid(string, pnetelem->pnet_name))
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_ETHNAME])
+		goto error;
+	rc = -ENOENT;
+	string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+	pnetelem->ndev = dev_get_by_name(net, string);
+	if (!pnetelem->ndev)
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_IBNAME])
+		goto error;
+	rc = -ENOENT;
+	ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+	ibname = strim(ibname);
+	pnetelem->smcibdev = smc_pnet_find_ib(ibname);
+	if (!pnetelem->smcibdev)
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_IBPORT])
+		goto error;
+	pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+	if (pnetelem->ib_port < 1 || pnetelem->ib_port > SMC_MAX_PORTS)
+		goto error;
+
 	return 0;
 
 error:
@@ -307,6 +312,8 @@ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
 	void *hdr;
 	int rc;
 
+	if (!info->attrs[SMC_PNETID_NAME])
+		return -EINVAL;
 	pnetelem = smc_pnet_find_pnetid(
 				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
 	if (!pnetelem)
@@ -359,6 +366,8 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
 
 static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
 {
+	if (!info->attrs[SMC_PNETID_NAME])
+		return -EINVAL;
 	return smc_pnet_remove_by_pnetid(
 				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
 }

From 45dd9b0666a162f8e4be76096716670cf1741f0e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 9 May 2018 14:36:09 -0400
Subject: [PATCH 215/393] tracing/x86/xen: Remove zero data size trace events
 trace_xen_mmu_flush_tlb{_all}

Doing an audit of trace events, I discovered two trace events in the xen
subsystem that use a hack to create zero data size trace events. This is not
what trace events are for. Trace events add memory footprint overhead, and
if all you need to do is see if a function is hit or not, simply make that
function noinline and use function tracer filtering.

Worse yet, the hack used was:

 __array(char, x, 0)

Which creates a static string of zero in length. There's assumptions about
such constructs in ftrace that this is a dynamic string that is nul
terminated. This is not the case with these tracepoints and can cause
problems in various parts of ftrace.

Nuke the trace events!

Link: http://lkml.kernel.org/r/20180509144605.5a220327@gandalf.local.home

Cc: stable@vger.kernel.org
Fixes: 95a7d76897c1e ("xen/mmu: Use Xen specific TLB flush instead of the generic one.")
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/xen/mmu.c         |  4 +---
 arch/x86/xen/mmu_pv.c      |  4 +---
 include/trace/events/xen.h | 16 ----------------
 3 files changed, 2 insertions(+), 22 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index d33e7dbe3129..2d76106788a3 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,13 +42,11 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 }
 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
 
-static void xen_flush_tlb_all(void)
+static noinline void xen_flush_tlb_all(void)
 {
 	struct mmuext_op *op;
 	struct multicall_space mcs;
 
-	trace_xen_mmu_flush_tlb_all(0);
-
 	preempt_disable();
 
 	mcs = xen_mc_entry(sizeof(*op));
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 486c0a34d00b..2c30cabfda90 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1310,13 +1310,11 @@ unsigned long xen_read_cr2_direct(void)
 	return this_cpu_read(xen_vcpu_info.arch.cr2);
 }
 
-static void xen_flush_tlb(void)
+static noinline void xen_flush_tlb(void)
 {
 	struct mmuext_op *op;
 	struct multicall_space mcs;
 
-	trace_xen_mmu_flush_tlb(0);
-
 	preempt_disable();
 
 	mcs = xen_mc_entry(sizeof(*op));
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index 7dd8f34c37df..fdcf88bcf0ea 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -352,22 +352,6 @@ DECLARE_EVENT_CLASS(xen_mmu_pgd,
 DEFINE_XEN_MMU_PGD_EVENT(xen_mmu_pgd_pin);
 DEFINE_XEN_MMU_PGD_EVENT(xen_mmu_pgd_unpin);
 
-TRACE_EVENT(xen_mmu_flush_tlb_all,
-	    TP_PROTO(int x),
-	    TP_ARGS(x),
-	    TP_STRUCT__entry(__array(char, x, 0)),
-	    TP_fast_assign((void)x),
-	    TP_printk("%s", "")
-	);
-
-TRACE_EVENT(xen_mmu_flush_tlb,
-	    TP_PROTO(int x),
-	    TP_ARGS(x),
-	    TP_STRUCT__entry(__array(char, x, 0)),
-	    TP_fast_assign((void)x),
-	    TP_printk("%s", "")
-	);
-
 TRACE_EVENT(xen_mmu_flush_tlb_one_user,
 	    TP_PROTO(unsigned long addr),
 	    TP_ARGS(addr),

From 55a2aa08b3af519a9693f99cdf7fa6d8b62d9f65 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Fri, 27 Apr 2018 09:28:34 +1000
Subject: [PATCH 216/393] MIPS: c-r4k: Fix data corruption related to cache
 coherence

When DMA will be performed to a MIPS32 1004K CPS, the L1-cache for the
range needs to be flushed and invalidated first.
The code currently takes one of two approaches.
1/ If the range is less than the size of the dcache, then HIT type
   requests flush/invalidate cache lines for the particular addresses.
   HIT-type requests a globalised by the CPS so this is safe on SMP.

2/ If the range is larger than the size of dcache, then INDEX type
   requests flush/invalidate the whole cache. INDEX type requests affect
   the local cache only. CPS does not propagate them in any way. So this
   invalidation is not safe on SMP CPS systems.

Data corruption due to '2' can quite easily be demonstrated by
repeatedly "echo 3 > /proc/sys/vm/drop_caches" and then sha1sum a file
that is several times the size of available memory. Dropping caches
means that large contiguous extents (large than dcache) are more likely.

This was not a problem before Linux-4.8 because option 2 was never used
if CONFIG_MIPS_CPS was defined. The commit which removed that apparently
didn't appreciate the full consequence of the change.

We could, in theory, globalize the INDEX based flush by sending an IPI
to other cores. These cache invalidation routines can be called with
interrupts disabled and synchronous IPI require interrupts to be
enabled. Asynchronous IPI may not trigger writeback soon enough. So we
cannot use IPI in practice.

We can already test if IPI would be needed for an INDEX operation with
r4k_op_needs_ipi(R4K_INDEX). If this is true then we mustn't try the
INDEX approach as we cannot use IPI. If this is false (e.g. when there
is only one core and hence one L1 cache) then it is safe to use the
INDEX approach without IPI.

This patch avoids options 2 if r4k_op_needs_ipi(R4K_INDEX), and so
eliminates the corruption.

Fixes: c00ab4896ed5 ("MIPS: Remove cpu_has_safe_index_cacheops")
Signed-off-by: NeilBrown <neil@brown.name>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: linux-mips@linux-mips.org
Cc: <stable@vger.kernel.org> # 4.8+
Patchwork: https://patchwork.linux-mips.org/patch/19259/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/mm/c-r4k.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index 6f534b209971..e12dfa48b478 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -851,9 +851,12 @@ static void r4k_dma_cache_wback_inv(unsigned long addr, unsigned long size)
 	/*
 	 * Either no secondary cache or the available caches don't have the
 	 * subset property so we have to flush the primary caches
-	 * explicitly
+	 * explicitly.
+	 * If we would need IPI to perform an INDEX-type operation, then
+	 * we have to use the HIT-type alternative as IPI cannot be used
+	 * here due to interrupts possibly being disabled.
 	 */
-	if (size >= dcache_size) {
+	if (!r4k_op_needs_ipi(R4K_INDEX) && size >= dcache_size) {
 		r4k_blast_dcache();
 	} else {
 		R4600_HIT_CACHEOP_WAR_IMPL;
@@ -890,7 +893,7 @@ static void r4k_dma_cache_inv(unsigned long addr, unsigned long size)
 		return;
 	}
 
-	if (size >= dcache_size) {
+	if (!r4k_op_needs_ipi(R4K_INDEX) && size >= dcache_size) {
 		r4k_blast_dcache();
 	} else {
 		R4600_HIT_CACHEOP_WAR_IMPL;

From c60128ce97674fd05adb8b5ae79eb6745a03192e Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Wed, 28 Mar 2018 17:38:12 +0200
Subject: [PATCH 217/393] MIPS: Fix build with DEBUG_ZBOOT and MACH_JZ4770

The debug definitions were missing for MACH_JZ4770, resulting in a build
failure when DEBUG_ZBOOT was set.

Since the UART addresses are the same across all Ingenic SoCs, we just
use a #ifdef CONFIG_MACH_INGENIC instead of checking for individual
Ingenic SoCs.

Additionally, I added a #define for the UART0 address in-code and
dropped the <asm/mach-jz4740/base.h> include, for the reason that this
include file is slowly being phased out as the whole platform is being
moved to devicetree.

Fixes: 9be5f3e92ed5 ("MIPS: ingenic: Initial JZ4770 support")
Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: <stable@vger.kernel.org> # 4.16
Patchwork: https://patchwork.linux-mips.org/patch/18957/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/boot/compressed/uart-16550.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/mips/boot/compressed/uart-16550.c b/arch/mips/boot/compressed/uart-16550.c
index b3043c08f769..aee8d7b8f091 100644
--- a/arch/mips/boot/compressed/uart-16550.c
+++ b/arch/mips/boot/compressed/uart-16550.c
@@ -18,9 +18,9 @@
 #define PORT(offset) (CKSEG1ADDR(AR7_REGS_UART0) + (4 * offset))
 #endif
 
-#if defined(CONFIG_MACH_JZ4740) || defined(CONFIG_MACH_JZ4780)
-#include <asm/mach-jz4740/base.h>
-#define PORT(offset) (CKSEG1ADDR(JZ4740_UART0_BASE_ADDR) + (4 * offset))
+#ifdef CONFIG_MACH_INGENIC
+#define INGENIC_UART0_BASE_ADDR	0x10030000
+#define PORT(offset) (CKSEG1ADDR(INGENIC_UART0_BASE_ADDR) + (4 * offset))
 #endif
 
 #ifdef CONFIG_CPU_XLR

From 71e909c0cdad28a1df1fa14442929e68615dee45 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Mon, 30 Apr 2018 15:56:47 +0100
Subject: [PATCH 218/393] MIPS: ptrace: Expose FIR register through FP regset

Correct commit 7aeb753b5353 ("MIPS: Implement task_user_regset_view.")
and expose the FIR register using the unused 4 bytes at the end of the
NT_PRFPREG regset.  Without that register included clients cannot use
the PTRACE_GETREGSET request to retrieve the complete FPU register set
and have to resort to one of the older interfaces, either PTRACE_PEEKUSR
or PTRACE_GETFPREGS, to retrieve the missing piece of data.  Also the
register is irreversibly missing from core dumps.

This register is architecturally hardwired and read-only so the write
path does not matter.  Ignore data supplied on writes then.

Fixes: 7aeb753b5353 ("MIPS: Implement task_user_regset_view.")
Signed-off-by: James Hogan <jhogan@kernel.org>
Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: <stable@vger.kernel.org> # 3.13+
Patchwork: https://patchwork.linux-mips.org/patch/19273/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/kernel/ptrace.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 0b23b1ad99e6..aede42990f08 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -463,7 +463,7 @@ static int fpr_get_msa(struct task_struct *target,
 /*
  * Copy the floating-point context to the supplied NT_PRFPREG buffer.
  * Choose the appropriate helper for general registers, and then copy
- * the FCSR register separately.
+ * the FCSR and FIR registers separately.
  */
 static int fpr_get(struct task_struct *target,
 		   const struct user_regset *regset,
@@ -471,6 +471,7 @@ static int fpr_get(struct task_struct *target,
 		   void *kbuf, void __user *ubuf)
 {
 	const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t);
+	const int fir_pos = fcr31_pos + sizeof(u32);
 	int err;
 
 	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
@@ -483,6 +484,12 @@ static int fpr_get(struct task_struct *target,
 	err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
 				  &target->thread.fpu.fcr31,
 				  fcr31_pos, fcr31_pos + sizeof(u32));
+	if (err)
+		return err;
+
+	err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &boot_cpu_data.fpu_id,
+				  fir_pos, fir_pos + sizeof(u32));
 
 	return err;
 }
@@ -531,7 +538,8 @@ static int fpr_set_msa(struct task_struct *target,
 /*
  * Copy the supplied NT_PRFPREG buffer to the floating-point context.
  * Choose the appropriate helper for general registers, and then copy
- * the FCSR register separately.
+ * the FCSR register separately.  Ignore the incoming FIR register
+ * contents though, as the register is read-only.
  *
  * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0',
  * which is supposed to have been guaranteed by the kernel before
@@ -545,6 +553,7 @@ static int fpr_set(struct task_struct *target,
 		   const void *kbuf, const void __user *ubuf)
 {
 	const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t);
+	const int fir_pos = fcr31_pos + sizeof(u32);
 	u32 fcr31;
 	int err;
 
@@ -572,6 +581,11 @@ static int fpr_set(struct task_struct *target,
 		ptrace_setfcr31(target, fcr31);
 	}
 
+	if (count > 0)
+		err = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+						fir_pos,
+						fir_pos + sizeof(u32));
+
 	return err;
 }
 

From ba3696e94d9d590d9a7e55f68e81c25dba515191 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 14 May 2018 18:23:50 +0100
Subject: [PATCH 219/393] KVM: Fix spelling mistake: "cop_unsuable" ->
 "cop_unusable"

Trivial fix to spelling mistake in debugfs_entries text.

Fixes: 669e846e6c4e ("KVM/MIPS32: MIPS arch specific APIs for KVM")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kernel-janitors@vger.kernel.org
Cc: <stable@vger.kernel.org> # 3.10+
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/kvm/mips.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 2549fdd27ee1..0f725e9cee8f 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -45,7 +45,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "cache",	  VCPU_STAT(cache_exits),	 KVM_STAT_VCPU },
 	{ "signal",	  VCPU_STAT(signal_exits),	 KVM_STAT_VCPU },
 	{ "interrupt",	  VCPU_STAT(int_exits),		 KVM_STAT_VCPU },
-	{ "cop_unsuable", VCPU_STAT(cop_unusable_exits), KVM_STAT_VCPU },
+	{ "cop_unusable", VCPU_STAT(cop_unusable_exits), KVM_STAT_VCPU },
 	{ "tlbmod",	  VCPU_STAT(tlbmod_exits),	 KVM_STAT_VCPU },
 	{ "tlbmiss_ld",	  VCPU_STAT(tlbmiss_ld_exits),	 KVM_STAT_VCPU },
 	{ "tlbmiss_st",	  VCPU_STAT(tlbmiss_st_exits),	 KVM_STAT_VCPU },

From a5a92abbce56c41ff121db41a33b9c0a0ff39365 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Wed, 25 Apr 2018 23:10:35 +0200
Subject: [PATCH 220/393] MIPS: xilfpga: Stop generating useless dtb.o

A dtb.o is generated from nexys4ddr.dts but this is never used since it
has been moved to mips/generic with commit b35565bb16a5 ("MIPS: generic:
Add support for MIPSfpga").

Fixes: b35565bb16a5 ("MIPS: generic: Add support for MIPSfpga")
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: <stable@vger.kernel.org> # 4.15+
Patchwork: https://patchwork.linux-mips.org/patch/19244/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/boot/dts/xilfpga/Makefile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/mips/boot/dts/xilfpga/Makefile b/arch/mips/boot/dts/xilfpga/Makefile
index 9987e0e378c5..69ca00590b8d 100644
--- a/arch/mips/boot/dts/xilfpga/Makefile
+++ b/arch/mips/boot/dts/xilfpga/Makefile
@@ -1,4 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
 dtb-$(CONFIG_FIT_IMAGE_FDT_XILFPGA)	+= nexys4ddr.dtb
-
-obj-y				+= $(patsubst %.dtb, %.dtb.o, $(dtb-y))

From 947bc875116042d5375446aa29bc1073c2d38977 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Wed, 25 Apr 2018 23:10:36 +0200
Subject: [PATCH 221/393] MIPS: xilfpga: Actually include FDT in fitImage

Commit b35565bb16a5 ("MIPS: generic: Add support for MIPSfpga") added
and its.S file for xilfpga but forgot to add it to
arch/mips/generic/Platform so it is never used.

Fixes: b35565bb16a5 ("MIPS: generic: Add support for MIPSfpga")
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: <stable@vger.kernel.org> # 4.15+
Patchwork: https://patchwork.linux-mips.org/patch/19245/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/generic/Platform | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform
index b51432dd10b6..0dd0d5d460a5 100644
--- a/arch/mips/generic/Platform
+++ b/arch/mips/generic/Platform
@@ -16,3 +16,4 @@ all-$(CONFIG_MIPS_GENERIC)	:= vmlinux.gz.itb
 its-y					:= vmlinux.its.S
 its-$(CONFIG_FIT_IMAGE_FDT_BOSTON)	+= board-boston.its.S
 its-$(CONFIG_FIT_IMAGE_FDT_NI169445)	+= board-ni169445.its.S
+its-$(CONFIG_FIT_IMAGE_FDT_XILFPGA)	+= board-xilfpga.its.S

From 9a3a92ccfe3620743d4ae57c987dc8e9c5f88996 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Mon, 14 May 2018 16:49:43 +0100
Subject: [PATCH 222/393] MIPS: Fix ptrace(2) PTRACE_PEEKUSR and PTRACE_POKEUSR
 accesses to o32 FGRs

Check the TIF_32BIT_FPREGS task setting of the tracee rather than the
tracer in determining the layout of floating-point general registers in
the floating-point context, correcting access to odd-numbered registers
for o32 tracees where the setting disagrees between the two processes.

Fixes: 597ce1723e0f ("MIPS: Support for 64-bit FP with O32 binaries")
Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: <stable@vger.kernel.org> # 3.14+
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/kernel/ptrace.c   | 4 ++--
 arch/mips/kernel/ptrace32.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index aede42990f08..8d098b9f395c 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -807,7 +807,7 @@ long arch_ptrace(struct task_struct *child, long request,
 			fregs = get_fpu_regs(child);
 
 #ifdef CONFIG_32BIT
-			if (test_thread_flag(TIF_32BIT_FPREGS)) {
+			if (test_tsk_thread_flag(child, TIF_32BIT_FPREGS)) {
 				/*
 				 * The odd registers are actually the high
 				 * order bits of the values stored in the even
@@ -902,7 +902,7 @@ long arch_ptrace(struct task_struct *child, long request,
 
 			init_fp_ctx(child);
 #ifdef CONFIG_32BIT
-			if (test_thread_flag(TIF_32BIT_FPREGS)) {
+			if (test_tsk_thread_flag(child, TIF_32BIT_FPREGS)) {
 				/*
 				 * The odd registers are actually the high
 				 * order bits of the values stored in the even
diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c
index 2b9260f92ccd..656a137c1fe2 100644
--- a/arch/mips/kernel/ptrace32.c
+++ b/arch/mips/kernel/ptrace32.c
@@ -99,7 +99,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 				break;
 			}
 			fregs = get_fpu_regs(child);
-			if (test_thread_flag(TIF_32BIT_FPREGS)) {
+			if (test_tsk_thread_flag(child, TIF_32BIT_FPREGS)) {
 				/*
 				 * The odd registers are actually the high
 				 * order bits of the values stored in the even
@@ -212,7 +212,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 				       sizeof(child->thread.fpu));
 				child->thread.fpu.fcr31 = 0;
 			}
-			if (test_thread_flag(TIF_32BIT_FPREGS)) {
+			if (test_tsk_thread_flag(child, TIF_32BIT_FPREGS)) {
 				/*
 				 * The odd registers are actually the high
 				 * order bits of the values stored in the even

From 849a742c59a3d597473c0232f9c2506c69eeef14 Mon Sep 17 00:00:00 2001
From: Kumar Sanghvi <kumaras@chelsio.com>
Date: Mon, 14 May 2018 16:27:34 +0530
Subject: [PATCH 223/393] cxgb4: Correct ntuple mask validation for hash
 filters

Earlier code of doing bitwise AND with field width bits was wrong.
Instead, simplify code to calculate ntuple_mask based on supplied
fields and then compare with mask configured in hw - which is the
correct and simpler way to validate ntuple mask.

Fixes: 3eb8b62d5a26 ("cxgb4: add support to create hash-filters via tc-flower offload")
Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 88 +++++++------------
 1 file changed, 30 insertions(+), 58 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index db92f1858060..b76447baccaf 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -836,7 +836,7 @@ bool is_filter_exact_match(struct adapter *adap,
 {
 	struct tp_params *tp = &adap->params.tp;
 	u64 hash_filter_mask = tp->hash_filter_mask;
-	u32 mask;
+	u64 ntuple_mask = 0;
 
 	if (!is_hashfilter(adap))
 		return false;
@@ -865,73 +865,45 @@ bool is_filter_exact_match(struct adapter *adap,
 	if (!fs->val.fport || fs->mask.fport != 0xffff)
 		return false;
 
-	if (tp->fcoe_shift >= 0) {
-		mask = (hash_filter_mask >> tp->fcoe_shift) & FT_FCOE_W;
-		if (mask && !fs->mask.fcoe)
-			return false;
-	}
+	/* calculate tuple mask and compare with mask configured in hw */
+	if (tp->fcoe_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.fcoe << tp->fcoe_shift;
 
-	if (tp->port_shift >= 0) {
-		mask = (hash_filter_mask >> tp->port_shift) & FT_PORT_W;
-		if (mask && !fs->mask.iport)
-			return false;
-	}
+	if (tp->port_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.iport << tp->port_shift;
 
 	if (tp->vnic_shift >= 0) {
-		mask = (hash_filter_mask >> tp->vnic_shift) & FT_VNIC_ID_W;
-
-		if ((adap->params.tp.ingress_config & VNIC_F)) {
-			if (mask && !fs->mask.pfvf_vld)
-				return false;
-		} else {
-			if (mask && !fs->mask.ovlan_vld)
-				return false;
-		}
+		if ((adap->params.tp.ingress_config & VNIC_F))
+			ntuple_mask |= (u64)fs->mask.pfvf_vld << tp->vnic_shift;
+		else
+			ntuple_mask |= (u64)fs->mask.ovlan_vld <<
+				tp->vnic_shift;
 	}
 
-	if (tp->vlan_shift >= 0) {
-		mask = (hash_filter_mask >> tp->vlan_shift) & FT_VLAN_W;
-		if (mask && !fs->mask.ivlan)
-			return false;
-	}
+	if (tp->vlan_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.ivlan << tp->vlan_shift;
 
-	if (tp->tos_shift >= 0) {
-		mask = (hash_filter_mask >> tp->tos_shift) & FT_TOS_W;
-		if (mask && !fs->mask.tos)
-			return false;
-	}
+	if (tp->tos_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.tos << tp->tos_shift;
 
-	if (tp->protocol_shift >= 0) {
-		mask = (hash_filter_mask >> tp->protocol_shift) & FT_PROTOCOL_W;
-		if (mask && !fs->mask.proto)
-			return false;
-	}
+	if (tp->protocol_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.proto << tp->protocol_shift;
 
-	if (tp->ethertype_shift >= 0) {
-		mask = (hash_filter_mask >> tp->ethertype_shift) &
-			FT_ETHERTYPE_W;
-		if (mask && !fs->mask.ethtype)
-			return false;
-	}
+	if (tp->ethertype_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.ethtype << tp->ethertype_shift;
 
-	if (tp->macmatch_shift >= 0) {
-		mask = (hash_filter_mask >> tp->macmatch_shift) & FT_MACMATCH_W;
-		if (mask && !fs->mask.macidx)
-			return false;
-	}
+	if (tp->macmatch_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.macidx << tp->macmatch_shift;
+
+	if (tp->matchtype_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.matchtype << tp->matchtype_shift;
+
+	if (tp->frag_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.frag << tp->frag_shift;
+
+	if (ntuple_mask != hash_filter_mask)
+		return false;
 
-	if (tp->matchtype_shift >= 0) {
-		mask = (hash_filter_mask >> tp->matchtype_shift) &
-			FT_MPSHITTYPE_W;
-		if (mask && !fs->mask.matchtype)
-			return false;
-	}
-	if (tp->frag_shift >= 0) {
-		mask = (hash_filter_mask >> tp->frag_shift) &
-			FT_FRAGMENTATION_W;
-		if (mask && !fs->mask.frag)
-			return false;
-	}
 	return true;
 }
 

From 61aeecea40afb2b89933e27cd4adb10fc2e75cfd Mon Sep 17 00:00:00 2001
From: "hpreg@vmware.com" <hpreg@vmware.com>
Date: Mon, 14 May 2018 08:14:34 -0400
Subject: [PATCH 224/393] vmxnet3: set the DMA mask before the first DMA map
 operation

The DMA mask must be set before, not after, the first DMA map operation, or
the first DMA map operation could in theory fail on some systems.

Fixes: b0eb57cb97e78 ("VMXNET3: Add support for virtual IOMMU")
Signed-off-by: Regis Duchesne <hpreg@vmware.com>
Acked-by: Ronak Doshi <doshir@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 50 +++++++++++++++----------------
 drivers/net/vmxnet3/vmxnet3_int.h |  8 +++--
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 9ebe2a689966..ed58685c353f 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2688,7 +2688,7 @@ vmxnet3_set_mac_addr(struct net_device *netdev, void *p)
 /* ==================== initialization and cleanup routines ============ */
 
 static int
-vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
+vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter)
 {
 	int err;
 	unsigned long mmio_start, mmio_len;
@@ -2700,30 +2700,12 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
 		return err;
 	}
 
-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
-		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
-			dev_err(&pdev->dev,
-				"pci_set_consistent_dma_mask failed\n");
-			err = -EIO;
-			goto err_set_mask;
-		}
-		*dma64 = true;
-	} else {
-		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
-			dev_err(&pdev->dev,
-				"pci_set_dma_mask failed\n");
-			err = -EIO;
-			goto err_set_mask;
-		}
-		*dma64 = false;
-	}
-
 	err = pci_request_selected_regions(pdev, (1 << 2) - 1,
 					   vmxnet3_driver_name);
 	if (err) {
 		dev_err(&pdev->dev,
 			"Failed to request region for adapter: error %d\n", err);
-		goto err_set_mask;
+		goto err_enable_device;
 	}
 
 	pci_set_master(pdev);
@@ -2751,7 +2733,7 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
 	iounmap(adapter->hw_addr0);
 err_ioremap:
 	pci_release_selected_regions(pdev, (1 << 2) - 1);
-err_set_mask:
+err_enable_device:
 	pci_disable_device(pdev);
 	return err;
 }
@@ -3254,7 +3236,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 #endif
 	};
 	int err;
-	bool dma64 = false; /* stupid gcc */
+	bool dma64;
 	u32 ver;
 	struct net_device *netdev;
 	struct vmxnet3_adapter *adapter;
@@ -3300,6 +3282,24 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	adapter->rx_ring_size = VMXNET3_DEF_RX_RING_SIZE;
 	adapter->rx_ring2_size = VMXNET3_DEF_RX_RING2_SIZE;
 
+	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
+		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
+			dev_err(&pdev->dev,
+				"pci_set_consistent_dma_mask failed\n");
+			err = -EIO;
+			goto err_set_mask;
+		}
+		dma64 = true;
+	} else {
+		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
+			dev_err(&pdev->dev,
+				"pci_set_dma_mask failed\n");
+			err = -EIO;
+			goto err_set_mask;
+		}
+		dma64 = false;
+	}
+
 	spin_lock_init(&adapter->cmd_lock);
 	adapter->adapter_pa = dma_map_single(&adapter->pdev->dev, adapter,
 					     sizeof(struct vmxnet3_adapter),
@@ -3307,7 +3307,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	if (dma_mapping_error(&adapter->pdev->dev, adapter->adapter_pa)) {
 		dev_err(&pdev->dev, "Failed to map dma\n");
 		err = -EFAULT;
-		goto err_dma_map;
+		goto err_set_mask;
 	}
 	adapter->shared = dma_alloc_coherent(
 				&adapter->pdev->dev,
@@ -3358,7 +3358,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	}
 #endif /* VMXNET3_RSS */
 
-	err = vmxnet3_alloc_pci_resources(adapter, &dma64);
+	err = vmxnet3_alloc_pci_resources(adapter);
 	if (err < 0)
 		goto err_alloc_pci;
 
@@ -3504,7 +3504,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 err_alloc_shared:
 	dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa,
 			 sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE);
-err_dma_map:
+err_set_mask:
 	free_netdev(netdev);
 	return err;
 }
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index a3326463b71f..b379bed5f51d 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,12 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.4.14.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.4.15.0-k"
 
-/* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01040e00
+/* Each byte of this 32-bit integer encodes a version number in
+ * VMXNET3_DRIVER_VERSION_STRING.
+ */
+#define VMXNET3_DRIVER_VERSION_NUM      0x01040f00
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */

From f3002c1374fb2367c9d8dbb28852791ef90d2bac Mon Sep 17 00:00:00 2001
From: "hpreg@vmware.com" <hpreg@vmware.com>
Date: Mon, 14 May 2018 08:14:49 -0400
Subject: [PATCH 225/393] vmxnet3: use DMA memory barriers where required

The gen bits must be read first from (resp. written last to) DMA memory.
The proper way to enforce this on Linux is to call dma_rmb() (resp.
dma_wmb()).

Signed-off-by: Regis Duchesne <hpreg@vmware.com>
Acked-by: Ronak Doshi <doshir@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 22 ++++++++++++++++++++++
 drivers/net/vmxnet3/vmxnet3_int.h |  4 ++--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index ed58685c353f..27a9bb8c9611 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -369,6 +369,11 @@ vmxnet3_tq_tx_complete(struct vmxnet3_tx_queue *tq,
 
 	gdesc = tq->comp_ring.base + tq->comp_ring.next2proc;
 	while (VMXNET3_TCD_GET_GEN(&gdesc->tcd) == tq->comp_ring.gen) {
+		/* Prevent any &gdesc->tcd field from being (speculatively)
+		 * read before (&gdesc->tcd)->gen is read.
+		 */
+		dma_rmb();
+
 		completed += vmxnet3_unmap_pkt(VMXNET3_TCD_GET_TXIDX(
 					       &gdesc->tcd), tq, adapter->pdev,
 					       adapter);
@@ -1103,6 +1108,11 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
 		gdesc->txd.tci = skb_vlan_tag_get(skb);
 	}
 
+	/* Ensure that the write to (&gdesc->txd)->gen will be observed after
+	 * all other writes to &gdesc->txd.
+	 */
+	dma_wmb();
+
 	/* finally flips the GEN bit of the SOP desc. */
 	gdesc->dword[2] = cpu_to_le32(le32_to_cpu(gdesc->dword[2]) ^
 						  VMXNET3_TXD_GEN);
@@ -1298,6 +1308,12 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			 */
 			break;
 		}
+
+		/* Prevent any rcd field from being (speculatively) read before
+		 * rcd->gen is read.
+		 */
+		dma_rmb();
+
 		BUG_ON(rcd->rqID != rq->qid && rcd->rqID != rq->qid2 &&
 		       rcd->rqID != rq->dataRingQid);
 		idx = rcd->rxdIdx;
@@ -1528,6 +1544,12 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 		ring->next2comp = idx;
 		num_to_alloc = vmxnet3_cmd_ring_desc_avail(ring);
 		ring = rq->rx_ring + ring_idx;
+
+		/* Ensure that the writes to rxd->gen bits will be observed
+		 * after all other writes to rxd objects.
+		 */
+		dma_wmb();
+
 		while (num_to_alloc) {
 			vmxnet3_getRxDesc(rxd, &ring->base[ring->next2fill].rxd,
 					  &rxCmdDesc);
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index b379bed5f51d..a2c554f8a61b 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,12 +69,12 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.4.15.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.4.16.0-k"
 
 /* Each byte of this 32-bit integer encodes a version number in
  * VMXNET3_DRIVER_VERSION_STRING.
  */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01040f00
+#define VMXNET3_DRIVER_VERSION_NUM      0x01041000
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */

From 125966db1fea12575ee21b2b3fc95a59032406f0 Mon Sep 17 00:00:00 2001
From: Prasanna Kumar Kalever <prasanna.kalever@redhat.com>
Date: Thu, 10 May 2018 19:12:18 +0530
Subject: [PATCH 226/393] scsi: target: tcmu: fix error resetting
 qfull_time_out to default

Problem:

$ cat /sys/kernel/config/target/core/user_0/block/attrib/qfull_time_out
-1

$ echo "-1" > /sys/kernel/config/target/core/user_0/block/attrib/qfull_time_out
-bash: echo: write error: Invalid argument

Fix:

This patch will help reset qfull_time_out to its default
i.e. qfull_time_out=-1.

Signed-off-by: Prasanna Kumar Kalever <prasanna.kalever@redhat.com>
Acked-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_user.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 4ad89ea71a70..4f26bdc3d1dc 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -2121,6 +2121,8 @@ static ssize_t tcmu_qfull_time_out_store(struct config_item *item,
 
 	if (val >= 0) {
 		udev->qfull_time_out = val * MSEC_PER_SEC;
+	} else if (val == -1) {
+		udev->qfull_time_out = val;
 	} else {
 		printk(KERN_ERR "Invalid qfull timeout value %d\n", val);
 		return -EINVAL;

From a406b0a0693eafc6f6b3a633d25749370bf40d8c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 13 May 2018 17:10:52 -0700
Subject: [PATCH 227/393] scsi: core: clean up generated file
 scsi_devinfo_tbl.c

"make clean" should remove the generated file "scsi_devinfo_tbl.c", so
list it in the clean-files variable so that the file gets cleaned up.

Fixes: 345e29608b4b ("scsi: scsi: Export blacklist flags to sysfs")
Cc: Hannes Reinecke <hare@suse.de>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index e29f9b8fd66d..56c940394729 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -182,7 +182,7 @@ zalon7xx-objs	:= zalon.o ncr53c8xx.o
 NCR_Q720_mod-objs	:= NCR_Q720.o ncr53c8xx.o
 
 # Files generated that shall be removed upon make clean
-clean-files :=	53c700_d.h 53c700_u.h
+clean-files :=	53c700_d.h 53c700_u.h scsi_devinfo_tbl.c
 
 $(obj)/53c700.o $(MODVERDIR)/$(obj)/53c700.ver: $(obj)/53c700_d.h
 

From 76ef6b28ea4f81c3d511866a9b31392caa833126 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 15 May 2018 13:38:15 +1000
Subject: [PATCH 228/393] drm: set FMODE_UNSIGNED_OFFSET for drm files

Since we have the ttm and gem vma managers using a subset
of the file address space for objects, and these start at
0x100000000 they will overflow the new mmap checks.

I've checked all the mmap routines I could see for any
bad behaviour but overall most people use GEM/TTM VMA
managers even the legacy drivers have a hashtable.

Reported-and-Tested-by: Arthur Marsh (amarsh04 on #radeon)
Fixes: be83bbf8068 (mmap: introduce sane default mmap limits)
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/drm_file.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index e394799979a6..6d9b9453707c 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -212,6 +212,7 @@ static int drm_open_helper(struct file *filp, struct drm_minor *minor)
 		return -ENOMEM;
 
 	filp->private_data = priv;
+	filp->f_mode |= FMODE_UNSIGNED_OFFSET;
 	priv->filp = filp;
 	priv->pid = get_pid(task_pid(current));
 	priv->minor = minor;

From c99f0802e42fcd38e84ee4d306691805ebed204f Mon Sep 17 00:00:00 2001
From: Jorge Sanjuan <jorge.sanjuan@codethink.co.uk>
Date: Fri, 11 May 2018 16:25:35 +0100
Subject: [PATCH 229/393] ALSA: usb-audio: Use Class Specific EP for UAC3
 devices.

bmAtributes offset doesn't exist in the UAC3 CS_EP descriptor.
Hence, checking for pitch control as if it was UAC2 doesn't make
any sense. Use the defined UAC3 offsets instead.

Fixes: 9a2fe9b801f5 ("ALSA: usb: initial USB Audio Device Class 3.0 support")
Signed-off-by: Jorge Sanjuan <jorge.sanjuan@codethink.co.uk>
Reviewed-by: Ruslan Bilovol <ruslan.bilovol@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/stream.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/sound/usb/stream.c b/sound/usb/stream.c
index 956be9f7c72a..5ed334575fc7 100644
--- a/sound/usb/stream.c
+++ b/sound/usb/stream.c
@@ -576,7 +576,7 @@ static int parse_uac_endpoint_attributes(struct snd_usb_audio *chip,
 
 	if (protocol == UAC_VERSION_1) {
 		attributes = csep->bmAttributes;
-	} else {
+	} else if (protocol == UAC_VERSION_2) {
 		struct uac2_iso_endpoint_descriptor *csep2 =
 			(struct uac2_iso_endpoint_descriptor *) csep;
 
@@ -585,6 +585,13 @@ static int parse_uac_endpoint_attributes(struct snd_usb_audio *chip,
 		/* emulate the endpoint attributes of a v1 device */
 		if (csep2->bmControls & UAC2_CONTROL_PITCH)
 			attributes |= UAC_EP_CS_ATTR_PITCH_CONTROL;
+	} else { /* UAC_VERSION_3 */
+		struct uac3_iso_endpoint_descriptor *csep3 =
+			(struct uac3_iso_endpoint_descriptor *) csep;
+
+		/* emulate the endpoint attributes of a v1 device */
+		if (le32_to_cpu(csep3->bmControls) & UAC2_CONTROL_PITCH)
+			attributes |= UAC_EP_CS_ATTR_PITCH_CONTROL;
 	}
 
 	return attributes;

From 6f5ec2993b1f39aed12fa6fd56e8dc2272ee8a33 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 14 May 2018 08:53:24 -0500
Subject: [PATCH 230/393] objtool: Detect RIP-relative switch table references

Typically a switch table can be found by detecting a .rodata access
followed an indirect jump:

    1969:	4a 8b 0c e5 00 00 00 	mov    0x0(,%r12,8),%rcx
    1970:	00
			196d: R_X86_64_32S	.rodata+0x438
    1971:	e9 00 00 00 00       	jmpq   1976 <dispc_runtime_suspend+0xb6a>
			1972: R_X86_64_PC32	__x86_indirect_thunk_rcx-0x4

Randy Dunlap reported a case (seen with GCC 4.8) where the .rodata
access uses RIP-relative addressing:

    19bd:	48 8b 3d 00 00 00 00 	mov    0x0(%rip),%rdi        # 19c4 <dispc_runtime_suspend+0xbb8>
			19c0: R_X86_64_PC32	.rodata+0x45c
    19c4:	e9 00 00 00 00       	jmpq   19c9 <dispc_runtime_suspend+0xbbd>
			19c5: R_X86_64_PC32	__x86_indirect_thunk_rdi-0x4

In this case the relocation addend needs to be adjusted accordingly in
order to find the location of the switch table.

The fix is for case 3 (as described in the comments), but also make the
existing case 1 & 2 checks more precise by only adjusting the addend for
R_X86_64_PC32 relocations.

This fixes the following warnings:

  drivers/video/fbdev/omap2/omapfb/dss/dispc.o: warning: objtool: dispc_runtime_suspend()+0xbb8: sibling call from callable instruction with modified stack frame
  drivers/video/fbdev/omap2/omapfb/dss/dispc.o: warning: objtool: dispc_runtime_resume()+0xcc5: sibling call from callable instruction with modified stack frame

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/b6098294fd67afb69af8c47c9883d7a68bf0f8ea.1526305958.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/check.c | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 9bb04fddd3c8..f4bbce838433 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -903,24 +903,24 @@ static struct rela *find_switch_table(struct objtool_file *file,
 {
 	struct rela *text_rela, *rodata_rela;
 	struct instruction *orig_insn = insn;
+	unsigned long table_offset;
 
+	/* case 1 & 2 */
 	text_rela = find_rela_by_dest_range(insn->sec, insn->offset, insn->len);
 	if (text_rela && text_rela->sym == file->rodata->sym &&
 	    !find_symbol_containing(file->rodata, text_rela->addend)) {
 
-		/* case 1 */
-		rodata_rela = find_rela_by_dest(file->rodata,
-						text_rela->addend);
-		if (rodata_rela)
-			return rodata_rela;
+		table_offset = text_rela->addend;
+		if (text_rela->type == R_X86_64_PC32) {
+			/* case 2 */
+			table_offset += 4;
+			file->ignore_unreachables = true;
+		}
 
-		/* case 2 */
-		rodata_rela = find_rela_by_dest(file->rodata,
-						text_rela->addend + 4);
+		rodata_rela = find_rela_by_dest(file->rodata, table_offset);
 		if (!rodata_rela)
 			return NULL;
 
-		file->ignore_unreachables = true;
 		return rodata_rela;
 	}
 
@@ -954,18 +954,21 @@ static struct rela *find_switch_table(struct objtool_file *file,
 		if (!text_rela || text_rela->sym != file->rodata->sym)
 			continue;
 
+		table_offset = text_rela->addend;
+		if (text_rela->type == R_X86_64_PC32)
+			table_offset += 4;
+
 		/*
 		 * Make sure the .rodata address isn't associated with a
 		 * symbol.  gcc jump tables are anonymous data.
 		 */
-		if (find_symbol_containing(file->rodata, text_rela->addend))
+		if (find_symbol_containing(file->rodata, table_offset))
 			continue;
 
-		rodata_rela = find_rela_by_dest(file->rodata, text_rela->addend);
-		if (!rodata_rela)
-			continue;
-
-		return rodata_rela;
+		/* mov [rodata addr], %reg */
+		rodata_rela = find_rela_by_dest(file->rodata, table_offset);
+		if (rodata_rela)
+			return rodata_rela;
 	}
 
 	return NULL;

From e521813468f786271a87e78e8644243bead48fad Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Wed, 2 May 2018 08:48:43 +0200
Subject: [PATCH 231/393] s390/qdio: fix access to uninitialized qdio_q fields

Ever since CQ/QAOB support was added, calling qdio_free() straight after
qdio_alloc() results in qdio_release_memory() accessing uninitialized
memory (ie. q->u.out.use_cq and q->u.out.aobs). Followed by a
kmem_cache_free() on the random AOB addresses.

For older kernels that don't have 6e30c549f6ca, the same applies if
qdio_establish() fails in the DEV_STATE_ONLINE check.

While initializing q->u.out.use_cq would be enough to fix this
particular bug, the more future-proof change is to just zero-alloc the
whole struct.

Fixes: 104ea556ee7f ("qdio: support asynchronous delivery of storage blocks")
Cc: <stable@vger.kernel.org> #v3.2+
Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/qdio_setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c
index 439991d71b14..22ddb49e0f50 100644
--- a/drivers/s390/cio/qdio_setup.c
+++ b/drivers/s390/cio/qdio_setup.c
@@ -141,7 +141,7 @@ static int __qdio_allocate_qs(struct qdio_q **irq_ptr_qs, int nr_queues)
 	int i;
 
 	for (i = 0; i < nr_queues; i++) {
-		q = kmem_cache_alloc(qdio_q_cache, GFP_KERNEL);
+		q = kmem_cache_zalloc(qdio_q_cache, GFP_KERNEL);
 		if (!q)
 			return -ENOMEM;
 

From 2e68adcd2fb21b7188ba449f0fab3bee2910e500 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Wed, 2 May 2018 08:28:34 +0200
Subject: [PATCH 232/393] s390/qdio: don't release memory in qdio_setup_irq()

Calling qdio_release_memory() on error is just plain wrong. It frees
the main qdio_irq struct, when following code still uses it.

Also, no other error path in qdio_establish() does this. So trust
callers to clean up via qdio_free() if some step of the QDIO
initialization fails.

Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.")
Cc: <stable@vger.kernel.org> #v2.6.27+
Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/qdio_setup.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c
index 22ddb49e0f50..4c14ce428e92 100644
--- a/drivers/s390/cio/qdio_setup.c
+++ b/drivers/s390/cio/qdio_setup.c
@@ -456,7 +456,6 @@ int qdio_setup_irq(struct qdio_initialize *init_data)
 {
 	struct ciw *ciw;
 	struct qdio_irq *irq_ptr = init_data->cdev->private->qdio_data;
-	int rc;
 
 	memset(&irq_ptr->qib, 0, sizeof(irq_ptr->qib));
 	memset(&irq_ptr->siga_flag, 0, sizeof(irq_ptr->siga_flag));
@@ -493,16 +492,14 @@ int qdio_setup_irq(struct qdio_initialize *init_data)
 	ciw = ccw_device_get_ciw(init_data->cdev, CIW_TYPE_EQUEUE);
 	if (!ciw) {
 		DBF_ERROR("%4x NO EQ", irq_ptr->schid.sch_no);
-		rc = -EINVAL;
-		goto out_err;
+		return -EINVAL;
 	}
 	irq_ptr->equeue = *ciw;
 
 	ciw = ccw_device_get_ciw(init_data->cdev, CIW_TYPE_AQUEUE);
 	if (!ciw) {
 		DBF_ERROR("%4x NO AQ", irq_ptr->schid.sch_no);
-		rc = -EINVAL;
-		goto out_err;
+		return -EINVAL;
 	}
 	irq_ptr->aqueue = *ciw;
 
@@ -512,9 +509,6 @@ int qdio_setup_irq(struct qdio_initialize *init_data)
 	init_data->cdev->handler = qdio_int_handler;
 	spin_unlock_irq(get_ccwdev_lock(irq_ptr->cdev));
 	return 0;
-out_err:
-	qdio_release_memory(irq_ptr);
-	return rc;
 }
 
 void qdio_print_subchannel_info(struct qdio_irq *irq_ptr,

From de9a8634f1cb4560a35696d472cc7f1383d9b866 Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Wed, 9 May 2018 21:46:29 +0200
Subject: [PATCH 233/393] i2c: pmcmsp: return message count on master_xfer
 success

Returning zero is wrong in this case.

Signed-off-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
Fixes: 1b144df1d7d6 ("i2c: New PMC MSP71xx TWI bus driver")
---
 drivers/i2c/busses/i2c-pmcmsp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i2c/busses/i2c-pmcmsp.c b/drivers/i2c/busses/i2c-pmcmsp.c
index 2aa0e83174c5..ec27e27e8d06 100644
--- a/drivers/i2c/busses/i2c-pmcmsp.c
+++ b/drivers/i2c/busses/i2c-pmcmsp.c
@@ -567,7 +567,7 @@ static int pmcmsptwi_master_xfer(struct i2c_adapter *adap,
 		return -1;
 	}
 
-	return 0;
+	return num;
 }
 
 static u32 pmcmsptwi_i2c_func(struct i2c_adapter *adapter)

From 12d9bbc5a7f347eaa65ff2a9d34995cadc05eb1b Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Wed, 9 May 2018 21:46:30 +0200
Subject: [PATCH 234/393] i2c: pmcmsp: fix error return from master_xfer

Returning -1 (-EPERM) is not appropriate here, go with -EIO.

Signed-off-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
Fixes: 1b144df1d7d6 ("i2c: New PMC MSP71xx TWI bus driver")
---
 drivers/i2c/busses/i2c-pmcmsp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i2c/busses/i2c-pmcmsp.c b/drivers/i2c/busses/i2c-pmcmsp.c
index ec27e27e8d06..dae8ac618a52 100644
--- a/drivers/i2c/busses/i2c-pmcmsp.c
+++ b/drivers/i2c/busses/i2c-pmcmsp.c
@@ -564,7 +564,7 @@ static int pmcmsptwi_master_xfer(struct i2c_adapter *adap,
 		 * TODO: We could potentially loop and retry in the case
 		 * of MSP_TWI_XFER_TIMEOUT.
 		 */
-		return -1;
+		return -EIO;
 	}
 
 	return num;

From 35cd67a0caf767aba472452865dcb4471fcce2b1 Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Wed, 9 May 2018 21:47:48 +0200
Subject: [PATCH 235/393] i2c: viperboard: return message count on master_xfer
 success

Returning zero is wrong in this case.

Signed-off-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
Fixes: 174a13aa8669 ("i2c: Add viperboard i2c master driver")
---
 drivers/i2c/busses/i2c-viperboard.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i2c/busses/i2c-viperboard.c b/drivers/i2c/busses/i2c-viperboard.c
index e4be86b3de9a..7235c7302bb7 100644
--- a/drivers/i2c/busses/i2c-viperboard.c
+++ b/drivers/i2c/busses/i2c-viperboard.c
@@ -337,7 +337,7 @@ static int vprbrd_i2c_xfer(struct i2c_adapter *i2c, struct i2c_msg *msgs,
 		}
 		mutex_unlock(&vb->lock);
 	}
-	return 0;
+	return num;
 error:
 	mutex_unlock(&vb->lock);
 	return error;

From 28b68acc4a88dcf91fd1dcf2577371dc9bf574cc Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Wed, 11 Apr 2018 18:13:30 -0600
Subject: [PATCH 236/393] usbip: usbip_host: refine probe and disconnect debug
 msgs to be useful

Refine probe and disconnect debug msgs to be useful and say what is
in progress.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c
index dd8ef36ab10e..7813c1862941 100644
--- a/drivers/usb/usbip/stub_dev.c
+++ b/drivers/usb/usbip/stub_dev.c
@@ -302,7 +302,7 @@ static int stub_probe(struct usb_device *udev)
 	struct bus_id_priv *busid_priv;
 	int rc;
 
-	dev_dbg(&udev->dev, "Enter\n");
+	dev_dbg(&udev->dev, "Enter probe\n");
 
 	/* check we should claim or not by busid_table */
 	busid_priv = get_busid_priv(udev_busid);
@@ -404,7 +404,7 @@ static void stub_disconnect(struct usb_device *udev)
 	struct bus_id_priv *busid_priv;
 	int rc;
 
-	dev_dbg(&udev->dev, "Enter\n");
+	dev_dbg(&udev->dev, "Enter disconnect\n");
 
 	busid_priv = get_busid_priv(udev_busid);
 	if (!busid_priv) {

From 1e180f167d4e413afccbbb4a421b48b2de832549 Mon Sep 17 00:00:00 2001
From: "Shuah Khan (Samsung OSG)" <shuah@kernel.org>
Date: Mon, 30 Apr 2018 16:17:19 -0600
Subject: [PATCH 237/393] usbip: usbip_host: delete device from busid_table
 after rebind

Device is left in the busid_table after unbind and rebind. Rebind
initiates usb bus scan and the original driver claims the device.
After rescan the device should be deleted from the busid_table as
it no longer belongs to usbip_host.

Fix it to delete the device after device_attach() succeeds.

Signed-off-by: Shuah Khan (Samsung OSG) <shuah@kernel.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_main.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
index d41d0cdeec0f..fb46bd62d538 100644
--- a/drivers/usb/usbip/stub_main.c
+++ b/drivers/usb/usbip/stub_main.c
@@ -186,6 +186,9 @@ static ssize_t rebind_store(struct device_driver *dev, const char *buf,
 	if (!bid)
 		return -ENODEV;
 
+	/* mark the device for deletion so probe ignores it during rescan */
+	bid->status = STUB_BUSID_OTHER;
+
 	/* device_attach() callers should hold parent lock for USB */
 	if (bid->udev->dev.parent)
 		device_lock(bid->udev->dev.parent);
@@ -197,6 +200,9 @@ static ssize_t rebind_store(struct device_driver *dev, const char *buf,
 		return ret;
 	}
 
+	/* delete device from busid_table */
+	del_match_busid((char *) buf);
+
 	return count;
 }
 

From 7510df3f29d44685bab7b1918b61a8ccd57126a9 Mon Sep 17 00:00:00 2001
From: "Shuah Khan (Samsung OSG)" <shuah@kernel.org>
Date: Mon, 30 Apr 2018 16:17:20 -0600
Subject: [PATCH 238/393] usbip: usbip_host: run rebind from exit when module
 is removed

After removing usbip_host module, devices it releases are left without
a driver. For example, when a keyboard or a mass storage device are
bound to usbip_host when it is removed, these devices are no longer
bound to any driver.

Fix it to run device_attach() from the module exit routine to restore
the devices to their original drivers. This includes cleanup changes
and moving device_attach() code to a common routine to be called from
rebind_store() and usbip_host_exit().

Signed-off-by: Shuah Khan (Samsung OSG) <shuah@kernel.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_dev.c  |  6 +---
 drivers/usb/usbip/stub_main.c | 60 +++++++++++++++++++++++++++++------
 2 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c
index 7813c1862941..9d0425113c4b 100644
--- a/drivers/usb/usbip/stub_dev.c
+++ b/drivers/usb/usbip/stub_dev.c
@@ -448,12 +448,8 @@ static void stub_disconnect(struct usb_device *udev)
 	busid_priv->sdev = NULL;
 	stub_device_free(sdev);
 
-	if (busid_priv->status == STUB_BUSID_ALLOC) {
+	if (busid_priv->status == STUB_BUSID_ALLOC)
 		busid_priv->status = STUB_BUSID_ADDED;
-	} else {
-		busid_priv->status = STUB_BUSID_OTHER;
-		del_match_busid((char *)udev_busid);
-	}
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
index fb46bd62d538..587b9bc10042 100644
--- a/drivers/usb/usbip/stub_main.c
+++ b/drivers/usb/usbip/stub_main.c
@@ -14,6 +14,7 @@
 #define DRIVER_DESC "USB/IP Host Driver"
 
 struct kmem_cache *stub_priv_cache;
+
 /*
  * busid_tables defines matching busids that usbip can grab. A user can change
  * dynamically what device is locally used and what device is exported to a
@@ -169,6 +170,51 @@ static ssize_t match_busid_store(struct device_driver *dev, const char *buf,
 }
 static DRIVER_ATTR_RW(match_busid);
 
+static int do_rebind(char *busid, struct bus_id_priv *busid_priv)
+{
+	int ret;
+
+	/* device_attach() callers should hold parent lock for USB */
+	if (busid_priv->udev->dev.parent)
+		device_lock(busid_priv->udev->dev.parent);
+	ret = device_attach(&busid_priv->udev->dev);
+	if (busid_priv->udev->dev.parent)
+		device_unlock(busid_priv->udev->dev.parent);
+	if (ret < 0) {
+		dev_err(&busid_priv->udev->dev, "rebind failed\n");
+		return ret;
+	}
+	return 0;
+}
+
+static void stub_device_rebind(void)
+{
+#if IS_MODULE(CONFIG_USBIP_HOST)
+	struct bus_id_priv *busid_priv;
+	int i;
+
+	/* update status to STUB_BUSID_OTHER so probe ignores the device */
+	spin_lock(&busid_table_lock);
+	for (i = 0; i < MAX_BUSID; i++) {
+		if (busid_table[i].name[0] &&
+		    busid_table[i].shutdown_busid) {
+			busid_priv = &(busid_table[i]);
+			busid_priv->status = STUB_BUSID_OTHER;
+		}
+	}
+	spin_unlock(&busid_table_lock);
+
+	/* now run rebind */
+	for (i = 0; i < MAX_BUSID; i++) {
+		if (busid_table[i].name[0] &&
+		    busid_table[i].shutdown_busid) {
+			busid_priv = &(busid_table[i]);
+			do_rebind(busid_table[i].name, busid_priv);
+		}
+	}
+#endif
+}
+
 static ssize_t rebind_store(struct device_driver *dev, const char *buf,
 				 size_t count)
 {
@@ -189,16 +235,9 @@ static ssize_t rebind_store(struct device_driver *dev, const char *buf,
 	/* mark the device for deletion so probe ignores it during rescan */
 	bid->status = STUB_BUSID_OTHER;
 
-	/* device_attach() callers should hold parent lock for USB */
-	if (bid->udev->dev.parent)
-		device_lock(bid->udev->dev.parent);
-	ret = device_attach(&bid->udev->dev);
-	if (bid->udev->dev.parent)
-		device_unlock(bid->udev->dev.parent);
-	if (ret < 0) {
-		dev_err(&bid->udev->dev, "rebind failed\n");
+	ret = do_rebind((char *) buf, bid);
+	if (ret < 0)
 		return ret;
-	}
 
 	/* delete device from busid_table */
 	del_match_busid((char *) buf);
@@ -323,6 +362,9 @@ static void __exit usbip_host_exit(void)
 	 */
 	usb_deregister_device_driver(&stub_driver);
 
+	/* initiate scan to attach devices */
+	stub_device_rebind();
+
 	kmem_cache_destroy(stub_priv_cache);
 }
 

From 22076557b07c12086eeb16b8ce2b0b735f7a27e7 Mon Sep 17 00:00:00 2001
From: "Shuah Khan (Samsung OSG)" <shuah@kernel.org>
Date: Mon, 14 May 2018 20:49:58 -0600
Subject: [PATCH 239/393] usbip: usbip_host: fix NULL-ptr deref and
 use-after-free errors

usbip_host updates device status without holding lock from stub probe,
disconnect and rebind code paths. When multiple requests to import a
device are received, these unprotected code paths step all over each
other and drive fails with NULL-ptr deref and use-after-free errors.

The driver uses a table lock to protect the busid array for adding and
deleting busids to the table. However, the probe, disconnect and rebind
paths get the busid table entry and update the status without holding
the busid table lock. Add a new finer grain lock to protect the busid
entry. This new lock will be held to search and update the busid entry
fields from get_busid_idx(), add_match_busid() and del_match_busid().

match_busid_show() does the same to access the busid entry fields.

get_busid_priv() changed to return the pointer to the busid entry holding
the busid lock. stub_probe(), stub_disconnect() and stub_device_rebind()
call put_busid_priv() to release the busid lock before returning. This
changes fixes the unprotected code paths eliminating the race conditions
in updating the busid entries.

Reported-by: Jakub Jirasek
Signed-off-by: Shuah Khan (Samsung OSG) <shuah@kernel.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub.h      |  2 ++
 drivers/usb/usbip/stub_dev.c  | 33 ++++++++++++++++++++---------
 drivers/usb/usbip/stub_main.c | 40 ++++++++++++++++++++++++++++++-----
 3 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/drivers/usb/usbip/stub.h b/drivers/usb/usbip/stub.h
index 14a72357800a..35618ceb2791 100644
--- a/drivers/usb/usbip/stub.h
+++ b/drivers/usb/usbip/stub.h
@@ -73,6 +73,7 @@ struct bus_id_priv {
 	struct stub_device *sdev;
 	struct usb_device *udev;
 	char shutdown_busid;
+	spinlock_t busid_lock;
 };
 
 /* stub_priv is allocated from stub_priv_cache */
@@ -83,6 +84,7 @@ extern struct usb_device_driver stub_driver;
 
 /* stub_main.c */
 struct bus_id_priv *get_busid_priv(const char *busid);
+void put_busid_priv(struct bus_id_priv *bid);
 int del_match_busid(char *busid);
 void stub_device_cleanup_urbs(struct stub_device *sdev);
 
diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c
index 9d0425113c4b..c0d6ff1baa72 100644
--- a/drivers/usb/usbip/stub_dev.c
+++ b/drivers/usb/usbip/stub_dev.c
@@ -300,7 +300,7 @@ static int stub_probe(struct usb_device *udev)
 	struct stub_device *sdev = NULL;
 	const char *udev_busid = dev_name(&udev->dev);
 	struct bus_id_priv *busid_priv;
-	int rc;
+	int rc = 0;
 
 	dev_dbg(&udev->dev, "Enter probe\n");
 
@@ -317,13 +317,15 @@ static int stub_probe(struct usb_device *udev)
 		 * other matched drivers by the driver core.
 		 * See driver_probe_device() in driver/base/dd.c
 		 */
-		return -ENODEV;
+		rc = -ENODEV;
+		goto call_put_busid_priv;
 	}
 
 	if (udev->descriptor.bDeviceClass == USB_CLASS_HUB) {
 		dev_dbg(&udev->dev, "%s is a usb hub device... skip!\n",
 			 udev_busid);
-		return -ENODEV;
+		rc = -ENODEV;
+		goto call_put_busid_priv;
 	}
 
 	if (!strcmp(udev->bus->bus_name, "vhci_hcd")) {
@@ -331,13 +333,16 @@ static int stub_probe(struct usb_device *udev)
 			"%s is attached on vhci_hcd... skip!\n",
 			udev_busid);
 
-		return -ENODEV;
+		rc = -ENODEV;
+		goto call_put_busid_priv;
 	}
 
 	/* ok, this is my device */
 	sdev = stub_device_alloc(udev);
-	if (!sdev)
-		return -ENOMEM;
+	if (!sdev) {
+		rc = -ENOMEM;
+		goto call_put_busid_priv;
+	}
 
 	dev_info(&udev->dev,
 		"usbip-host: register new device (bus %u dev %u)\n",
@@ -369,7 +374,9 @@ static int stub_probe(struct usb_device *udev)
 	}
 	busid_priv->status = STUB_BUSID_ALLOC;
 
-	return 0;
+	rc = 0;
+	goto call_put_busid_priv;
+
 err_files:
 	usb_hub_release_port(udev->parent, udev->portnum,
 			     (struct usb_dev_state *) udev);
@@ -379,6 +386,9 @@ static int stub_probe(struct usb_device *udev)
 
 	busid_priv->sdev = NULL;
 	stub_device_free(sdev);
+
+call_put_busid_priv:
+	put_busid_priv(busid_priv);
 	return rc;
 }
 
@@ -417,7 +427,7 @@ static void stub_disconnect(struct usb_device *udev)
 	/* get stub_device */
 	if (!sdev) {
 		dev_err(&udev->dev, "could not get device");
-		return;
+		goto call_put_busid_priv;
 	}
 
 	dev_set_drvdata(&udev->dev, NULL);
@@ -432,12 +442,12 @@ static void stub_disconnect(struct usb_device *udev)
 				  (struct usb_dev_state *) udev);
 	if (rc) {
 		dev_dbg(&udev->dev, "unable to release port\n");
-		return;
+		goto call_put_busid_priv;
 	}
 
 	/* If usb reset is called from event handler */
 	if (usbip_in_eh(current))
-		return;
+		goto call_put_busid_priv;
 
 	/* shutdown the current connection */
 	shutdown_busid(busid_priv);
@@ -450,6 +460,9 @@ static void stub_disconnect(struct usb_device *udev)
 
 	if (busid_priv->status == STUB_BUSID_ALLOC)
 		busid_priv->status = STUB_BUSID_ADDED;
+
+call_put_busid_priv:
+	put_busid_priv(busid_priv);
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
index 587b9bc10042..41c7b9de2a92 100644
--- a/drivers/usb/usbip/stub_main.c
+++ b/drivers/usb/usbip/stub_main.c
@@ -26,6 +26,8 @@ static spinlock_t busid_table_lock;
 
 static void init_busid_table(void)
 {
+	int i;
+
 	/*
 	 * This also sets the bus_table[i].status to
 	 * STUB_BUSID_OTHER, which is 0.
@@ -33,6 +35,9 @@ static void init_busid_table(void)
 	memset(busid_table, 0, sizeof(busid_table));
 
 	spin_lock_init(&busid_table_lock);
+
+	for (i = 0; i < MAX_BUSID; i++)
+		spin_lock_init(&busid_table[i].busid_lock);
 }
 
 /*
@@ -44,15 +49,20 @@ static int get_busid_idx(const char *busid)
 	int i;
 	int idx = -1;
 
-	for (i = 0; i < MAX_BUSID; i++)
+	for (i = 0; i < MAX_BUSID; i++) {
+		spin_lock(&busid_table[i].busid_lock);
 		if (busid_table[i].name[0])
 			if (!strncmp(busid_table[i].name, busid, BUSID_SIZE)) {
 				idx = i;
+				spin_unlock(&busid_table[i].busid_lock);
 				break;
 			}
+		spin_unlock(&busid_table[i].busid_lock);
+	}
 	return idx;
 }
 
+/* Returns holding busid_lock. Should call put_busid_priv() to unlock */
 struct bus_id_priv *get_busid_priv(const char *busid)
 {
 	int idx;
@@ -60,13 +70,21 @@ struct bus_id_priv *get_busid_priv(const char *busid)
 
 	spin_lock(&busid_table_lock);
 	idx = get_busid_idx(busid);
-	if (idx >= 0)
+	if (idx >= 0) {
 		bid = &(busid_table[idx]);
+		/* get busid_lock before returning */
+		spin_lock(&bid->busid_lock);
+	}
 	spin_unlock(&busid_table_lock);
 
 	return bid;
 }
 
+void put_busid_priv(struct bus_id_priv *bid)
+{
+	spin_unlock(&bid->busid_lock);
+}
+
 static int add_match_busid(char *busid)
 {
 	int i;
@@ -79,15 +97,19 @@ static int add_match_busid(char *busid)
 		goto out;
 	}
 
-	for (i = 0; i < MAX_BUSID; i++)
+	for (i = 0; i < MAX_BUSID; i++) {
+		spin_lock(&busid_table[i].busid_lock);
 		if (!busid_table[i].name[0]) {
 			strlcpy(busid_table[i].name, busid, BUSID_SIZE);
 			if ((busid_table[i].status != STUB_BUSID_ALLOC) &&
 			    (busid_table[i].status != STUB_BUSID_REMOV))
 				busid_table[i].status = STUB_BUSID_ADDED;
 			ret = 0;
+			spin_unlock(&busid_table[i].busid_lock);
 			break;
 		}
+		spin_unlock(&busid_table[i].busid_lock);
+	}
 
 out:
 	spin_unlock(&busid_table_lock);
@@ -108,6 +130,8 @@ int del_match_busid(char *busid)
 	/* found */
 	ret = 0;
 
+	spin_lock(&busid_table[idx].busid_lock);
+
 	if (busid_table[idx].status == STUB_BUSID_OTHER)
 		memset(busid_table[idx].name, 0, BUSID_SIZE);
 
@@ -115,6 +139,7 @@ int del_match_busid(char *busid)
 	    (busid_table[idx].status != STUB_BUSID_ADDED))
 		busid_table[idx].status = STUB_BUSID_REMOV;
 
+	spin_unlock(&busid_table[idx].busid_lock);
 out:
 	spin_unlock(&busid_table_lock);
 
@@ -127,9 +152,12 @@ static ssize_t match_busid_show(struct device_driver *drv, char *buf)
 	char *out = buf;
 
 	spin_lock(&busid_table_lock);
-	for (i = 0; i < MAX_BUSID; i++)
+	for (i = 0; i < MAX_BUSID; i++) {
+		spin_lock(&busid_table[i].busid_lock);
 		if (busid_table[i].name[0])
 			out += sprintf(out, "%s ", busid_table[i].name);
+		spin_unlock(&busid_table[i].busid_lock);
+	}
 	spin_unlock(&busid_table_lock);
 	out += sprintf(out, "\n");
 
@@ -204,7 +232,7 @@ static void stub_device_rebind(void)
 	}
 	spin_unlock(&busid_table_lock);
 
-	/* now run rebind */
+	/* now run rebind - no need to hold locks. driver files are removed */
 	for (i = 0; i < MAX_BUSID; i++) {
 		if (busid_table[i].name[0] &&
 		    busid_table[i].shutdown_busid) {
@@ -234,6 +262,8 @@ static ssize_t rebind_store(struct device_driver *dev, const char *buf,
 
 	/* mark the device for deletion so probe ignores it during rescan */
 	bid->status = STUB_BUSID_OTHER;
+	/* release the busid lock */
+	put_busid_priv(bid);
 
 	ret = do_rebind((char *) buf, bid);
 	if (ret < 0)

From 73d4337ed9ceddef4b2f0e226634d5f985aa2d1c Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Fri, 11 May 2018 20:51:34 +0530
Subject: [PATCH 240/393] ARM: davinci: dm646x: fix timer interrupt generation

commit b38434145b34 ("ARM: davinci: irqs: Correct McASP1 TX interrupt
definition for DM646x") inadvertently removed priority setting for
timer0_12 (bottom half of timer0). This timer is used as clockevent.

When INTPRIn register setting for an interrupt is left at 0, it is
mapped to FIQ by the AINTC causing the timer interrupt to not get
generated.

Fix it by including an entry for timer0_12 in interrupt priority map
array. While at it, move the clockevent comment to the right place.

Fixes: b38434145b34 ("ARM: davinci: irqs: Correct McASP1 TX interrupt definition for DM646x")
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/dm646x.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm/mach-davinci/dm646x.c b/arch/arm/mach-davinci/dm646x.c
index 109ab1fa0d2c..c32ca27ab343 100644
--- a/arch/arm/mach-davinci/dm646x.c
+++ b/arch/arm/mach-davinci/dm646x.c
@@ -488,7 +488,8 @@ static u8 dm646x_default_priorities[DAVINCI_N_AINTC_IRQ] = {
 	[IRQ_DM646X_MCASP0TXINT]        = 7,
 	[IRQ_DM646X_MCASP0RXINT]        = 7,
 	[IRQ_DM646X_RESERVED_3]         = 7,
-	[IRQ_DM646X_MCASP1TXINT]        = 7,    /* clockevent */
+	[IRQ_DM646X_MCASP1TXINT]        = 7,
+	[IRQ_TINT0_TINT12]              = 7,    /* clockevent */
 	[IRQ_TINT0_TINT34]              = 7,    /* clocksource */
 	[IRQ_TINT1_TINT12]              = 7,    /* DSP timer */
 	[IRQ_TINT1_TINT34]              = 7,    /* system tick */

From 7d46899d57f8b61eb28701d9a4043b71e3392c26 Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Fri, 11 May 2018 20:51:35 +0530
Subject: [PATCH 241/393] ARM: davinci: board-dm646x-evm: pass correct I2C
 adapter id for VPIF

commit a16cb91ad9c4 ("[media] media: vpif: use a configurable
i2c_adapter_id for vpif display") removed hardcoded I2C adaptor
setting in VPIF driver, but missed updating platform data passed
from DM646x board.

Fix it.

Fixes: a16cb91ad9c4 ("[media] media: vpif: use a configurable i2c_adapter_id for vpif display")
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/board-dm646x-evm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/mach-davinci/board-dm646x-evm.c b/arch/arm/mach-davinci/board-dm646x-evm.c
index 2d37f5b0e1f5..8c28fa7157e0 100644
--- a/arch/arm/mach-davinci/board-dm646x-evm.c
+++ b/arch/arm/mach-davinci/board-dm646x-evm.c
@@ -532,6 +532,7 @@ static struct vpif_display_config dm646x_vpif_display_config = {
 	.set_clock	= set_vpif_clock,
 	.subdevinfo	= dm646x_vpif_subdev,
 	.subdev_count	= ARRAY_SIZE(dm646x_vpif_subdev),
+	.i2c_adapter_id = 1,
 	.chan_config[0] = {
 		.outputs = dm6467_ch0_outputs,
 		.output_count = ARRAY_SIZE(dm6467_ch0_outputs),
@@ -674,6 +675,7 @@ static struct vpif_capture_config dm646x_vpif_capture_cfg = {
 	.setup_input_channel_mode = setup_vpif_input_channel_mode,
 	.subdev_info = vpif_capture_sdev_info,
 	.subdev_count = ARRAY_SIZE(vpif_capture_sdev_info),
+	.i2c_adapter_id = 1,
 	.chan_config[0] = {
 		.inputs = dm6467_ch0_inputs,
 		.input_count = ARRAY_SIZE(dm6467_ch0_inputs),

From bb7298a7e87cf3430eb62be8746e5d7a07ca9d7c Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Fri, 11 May 2018 20:51:36 +0530
Subject: [PATCH 242/393] ARM: davinci: board-dm646x-evm: set VPIF capture card
 name

VPIF capture driver expects card name to be set since it
uses it without checking for NULL. The commit which
introduced VPIF display and capture support added card
name only for display, not for capture.

Set it in platform data to probe driver successfully.

While at it, also fix the display card name to something more
appropriate.

Fixes: 85609c1ccda6 ("DaVinci: DM646x - platform changes for vpif capture and display drivers")
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/board-dm646x-evm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm/mach-davinci/board-dm646x-evm.c b/arch/arm/mach-davinci/board-dm646x-evm.c
index 8c28fa7157e0..a3c0d1e87647 100644
--- a/arch/arm/mach-davinci/board-dm646x-evm.c
+++ b/arch/arm/mach-davinci/board-dm646x-evm.c
@@ -537,7 +537,7 @@ static struct vpif_display_config dm646x_vpif_display_config = {
 		.outputs = dm6467_ch0_outputs,
 		.output_count = ARRAY_SIZE(dm6467_ch0_outputs),
 	},
-	.card_name	= "DM646x EVM",
+	.card_name	= "DM646x EVM Video Display",
 };
 
 /**
@@ -696,6 +696,7 @@ static struct vpif_capture_config dm646x_vpif_capture_cfg = {
 			.fid_pol = 0,
 		},
 	},
+	.card_name = "DM646x EVM Video Capture",
 };
 
 static void __init evm_init_video(void)

From 4c27625b7a67eb9006963ed2bcf8e53b259b43af Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Sat, 5 May 2018 04:02:32 -0700
Subject: [PATCH 243/393] KVM: X86: Lower the default timer frequency limit to
 200us
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Anthoine reported:
 The period used by Windows change over time but it can be 1
 milliseconds or less. I saw the limit_periodic_timer_frequency
 print so 500 microseconds is sometimes reached.

As suggested by Paolo, lower the default timer frequency limit to a
smaller interval of 200 us (5000 Hz) to leave some headroom. This
is required due to Windows 10 changing the scheduler tick limit
from 1024 Hz to 2048 Hz.

Reported-by: Anthoine Bourgeois <anthoine.bourgeois@blade-group.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Anthoine Bourgeois <anthoine.bourgeois@blade-group.com>
Cc: Darren Kenny <darren.kenny@oracle.com>
Cc: Jan Kiszka <jan.kiszka@web.de>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 37dd9a9d050a..59371de5d722 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -114,7 +114,7 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 static bool __read_mostly report_ignored_msrs = true;
 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
 
-unsigned int min_timer_period_us = 500;
+unsigned int min_timer_period_us = 200;
 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 
 static bool __read_mostly kvmclock_periodic_sync = true;

From 72cb0d893343cd33e6ab62cf26f2625d5d3532c9 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 9 Apr 2018 13:58:13 -0700
Subject: [PATCH 244/393] drm/vc4: Fix leak of the file_priv that stored the
 perfmon.

Signed-off-by: Eric Anholt <eric@anholt.net>
Fixes: 65101d8c9108 ("drm/vc4: Expose performance counters to userspace")
Link: https://patchwork.freedesktop.org/patch/msgid/20180409205813.7077-1-eric@anholt.net
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
---
 drivers/gpu/drm/vc4/vc4_drv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 94b99c90425a..7c95ed5c5cac 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -130,6 +130,7 @@ static void vc4_close(struct drm_device *dev, struct drm_file *file)
 	struct vc4_file *vc4file = file->driver_priv;
 
 	vc4_perfmon_close_file(vc4file);
+	kfree(vc4file);
 }
 
 static const struct vm_operations_struct vc4_vm_ops = {

From 401dca8cbd14fc4b32d93499dcd12a1711a73ecc Mon Sep 17 00:00:00 2001
From: Philippe Bergheaud <felix@linux.ibm.com>
Date: Mon, 14 May 2018 10:27:35 +0200
Subject: [PATCH 245/393] cxl: Set the PBCQ Tunnel BAR register when enabling
 capi mode

Skiboot used to set the default Tunnel BAR register value when capi
mode was enabled. This approach was ok for the cxl driver, but
prevented other drivers from choosing different values.

Skiboot versions > 5.11 will not set the default value any longer.
This patch modifies the cxl driver to set/reset the Tunnel BAR
register when entering/exiting the cxl mode, with
pnv_pci_set_tunnel_bar().

That should work with old skiboot (since we are re-writing the value
already set) and new skiboot.

mpe: The tunnel support was only merged into Linux recently, in commit
d6a90bb83b50 ("powerpc/powernv: Enable tunneled operations")
(v4.17-rc1), so with new skiboot kernels between that commit and this
will not work correctly.

Fixes: d6a90bb83b50 ("powerpc/powernv: Enable tunneled operations")
Signed-off-by: Philippe Bergheaud <felix@linux.ibm.com>
Reviewed-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
Acked-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/pci.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 83f1d08058fc..4eb893284af2 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1742,6 +1742,10 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev)
 	/* Required for devices using CAPP DMA mode, harmless for others */
 	pci_set_master(dev);
 
+	if (cxl_is_power9())
+		if (pnv_pci_set_tunnel_bar(dev, 0x00020000E0000000ull, 1))
+			dev_info(&dev->dev, "Tunneled operations unsupported\n");
+
 	if ((rc = pnv_phb_to_cxl_mode(dev, adapter->native->sl_ops->capi_mode)))
 		goto err;
 
@@ -1768,6 +1772,9 @@ static void cxl_deconfigure_adapter(struct cxl *adapter)
 {
 	struct pci_dev *pdev = to_pci_dev(adapter->dev.parent);
 
+	if (cxl_is_power9())
+		pnv_pci_set_tunnel_bar(pdev, 0x00020000E0000000ull, 0);
+
 	cxl_native_release_psl_err_irq(adapter);
 	cxl_unmap_adapter_regs(adapter);
 

From 497a0790e2c604366b9e35dcb41310319e9bca13 Mon Sep 17 00:00:00 2001
From: Philippe Bergheaud <felix@linux.ibm.com>
Date: Mon, 14 May 2018 10:27:36 +0200
Subject: [PATCH 246/393] cxl: Report the tunneled operations status

Failure to synchronize the tunneled operations does not prevent
the initialization of the cxl card. This patch reports the tunneled
operations status via /sys.

Signed-off-by: Philippe Bergheaud <felix@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 Documentation/ABI/testing/sysfs-class-cxl |  8 ++++++++
 drivers/misc/cxl/cxl.h                    |  1 +
 drivers/misc/cxl/pci.c                    |  7 ++++++-
 drivers/misc/cxl/sysfs.c                  | 10 ++++++++++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl
index 640f65e79ef1..8e69345c37cc 100644
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -244,3 +244,11 @@ Description:    read only
                 Returns 1 if the psl timebase register is synchronized
                 with the core timebase register, 0 otherwise.
 Users:          https://github.com/ibm-capi/libcxl
+
+What:           /sys/class/cxl/<card>/tunneled_ops_supported
+Date:           May 2018
+Contact:        linuxppc-dev@lists.ozlabs.org
+Description:    read only
+                Returns 1 if tunneled operations are supported in capi mode,
+                0 otherwise.
+Users:          https://github.com/ibm-capi/libcxl
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index a4c9c8297a6d..918d4fb742d1 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -717,6 +717,7 @@ struct cxl {
 	bool perst_select_user;
 	bool perst_same_image;
 	bool psl_timebase_synced;
+	bool tunneled_ops_supported;
 
 	/*
 	 * number of contexts mapped on to this card. Possible values are:
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 4eb893284af2..4d6736f9d463 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1742,9 +1742,14 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev)
 	/* Required for devices using CAPP DMA mode, harmless for others */
 	pci_set_master(dev);
 
-	if (cxl_is_power9())
+	adapter->tunneled_ops_supported = false;
+
+	if (cxl_is_power9()) {
 		if (pnv_pci_set_tunnel_bar(dev, 0x00020000E0000000ull, 1))
 			dev_info(&dev->dev, "Tunneled operations unsupported\n");
+		else
+			adapter->tunneled_ops_supported = true;
+	}
 
 	if ((rc = pnv_phb_to_cxl_mode(dev, adapter->native->sl_ops->capi_mode)))
 		goto err;
diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c
index 95285b7f636f..4b5a4c5d3c01 100644
--- a/drivers/misc/cxl/sysfs.c
+++ b/drivers/misc/cxl/sysfs.c
@@ -78,6 +78,15 @@ static ssize_t psl_timebase_synced_show(struct device *device,
 	return scnprintf(buf, PAGE_SIZE, "%i\n", adapter->psl_timebase_synced);
 }
 
+static ssize_t tunneled_ops_supported_show(struct device *device,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct cxl *adapter = to_cxl_adapter(device);
+
+	return scnprintf(buf, PAGE_SIZE, "%i\n", adapter->tunneled_ops_supported);
+}
+
 static ssize_t reset_adapter_store(struct device *device,
 				   struct device_attribute *attr,
 				   const char *buf, size_t count)
@@ -183,6 +192,7 @@ static struct device_attribute adapter_attrs[] = {
 	__ATTR_RO(base_image),
 	__ATTR_RO(image_loaded),
 	__ATTR_RO(psl_timebase_synced),
+	__ATTR_RO(tunneled_ops_supported),
 	__ATTR_RW(load_image_on_perst),
 	__ATTR_RW(perst_reloads_same_image),
 	__ATTR(reset, S_IWUSR, NULL, reset_adapter_store),

From 388d4359680b56dba82fe2ffca05871e9fd2b73e Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 11 May 2018 15:20:12 +0100
Subject: [PATCH 247/393] KVM: arm/arm64: Properly protect VGIC locks from IRQs

As Jan reported [1], lockdep complains about the VGIC not being bullet
proof. This seems to be due to two issues:
- When commit 006df0f34930 ("KVM: arm/arm64: Support calling
  vgic_update_irq_pending from irq context") promoted irq_lock and
  ap_list_lock to _irqsave, we forgot two instances of irq_lock.
  lockdeps seems to pick those up.
- If a lock is _irqsave, any other locks we take inside them should be
  _irqsafe as well. So the lpi_list_lock needs to be promoted also.

This fixes both issues by simply making the remaining instances of those
locks _irqsave.
One irq_lock is addressed in a separate patch, to simplify backporting.

[1] http://lists.infradead.org/pipermail/linux-arm-kernel/2018-May/575718.html

Cc: stable@vger.kernel.org
Fixes: 006df0f34930 ("KVM: arm/arm64: Support calling vgic_update_irq_pending from irq context")
Reported-by: Jan Glauber <jan.glauber@caviumnetworks.com>
Acked-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/arm/vgic/vgic-debug.c |  5 +++--
 virt/kvm/arm/vgic/vgic-its.c   | 10 ++++++----
 virt/kvm/arm/vgic/vgic.c       | 22 ++++++++++++++--------
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
index 10b38178cff2..4ffc0b5e6105 100644
--- a/virt/kvm/arm/vgic/vgic-debug.c
+++ b/virt/kvm/arm/vgic/vgic-debug.c
@@ -211,6 +211,7 @@ static int vgic_debug_show(struct seq_file *s, void *v)
 	struct vgic_state_iter *iter = (struct vgic_state_iter *)v;
 	struct vgic_irq *irq;
 	struct kvm_vcpu *vcpu = NULL;
+	unsigned long flags;
 
 	if (iter->dist_id == 0) {
 		print_dist_state(s, &kvm->arch.vgic);
@@ -227,9 +228,9 @@ static int vgic_debug_show(struct seq_file *s, void *v)
 		irq = &kvm->arch.vgic.spis[iter->intid - VGIC_NR_PRIVATE_IRQS];
 	}
 
-	spin_lock(&irq->irq_lock);
+	spin_lock_irqsave(&irq->irq_lock, flags);
 	print_irq_state(s, irq, vcpu);
-	spin_unlock(&irq->irq_lock);
+	spin_unlock_irqrestore(&irq->irq_lock, flags);
 
 	return 0;
 }
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index a8f07243aa9f..41abf92f2699 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -52,6 +52,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+	unsigned long flags;
 	int ret;
 
 	/* In this case there is no put, since we keep the reference. */
@@ -71,7 +72,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
 	irq->intid = intid;
 	irq->target_vcpu = vcpu;
 
-	spin_lock(&dist->lpi_list_lock);
+	spin_lock_irqsave(&dist->lpi_list_lock, flags);
 
 	/*
 	 * There could be a race with another vgic_add_lpi(), so we need to
@@ -99,7 +100,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
 	dist->lpi_list_count++;
 
 out_unlock:
-	spin_unlock(&dist->lpi_list_lock);
+	spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 
 	/*
 	 * We "cache" the configuration table entries in our struct vgic_irq's.
@@ -315,6 +316,7 @@ static int vgic_copy_lpi_list(struct kvm_vcpu *vcpu, u32 **intid_ptr)
 {
 	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 	struct vgic_irq *irq;
+	unsigned long flags;
 	u32 *intids;
 	int irq_count, i = 0;
 
@@ -330,7 +332,7 @@ static int vgic_copy_lpi_list(struct kvm_vcpu *vcpu, u32 **intid_ptr)
 	if (!intids)
 		return -ENOMEM;
 
-	spin_lock(&dist->lpi_list_lock);
+	spin_lock_irqsave(&dist->lpi_list_lock, flags);
 	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
 		if (i == irq_count)
 			break;
@@ -339,7 +341,7 @@ static int vgic_copy_lpi_list(struct kvm_vcpu *vcpu, u32 **intid_ptr)
 			continue;
 		intids[i++] = irq->intid;
 	}
-	spin_unlock(&dist->lpi_list_lock);
+	spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 
 	*intid_ptr = intids;
 	return i;
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 97bfba8d9a59..33c8325c8f35 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -43,9 +43,13 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
  * kvm->lock (mutex)
  *   its->cmd_lock (mutex)
  *     its->its_lock (mutex)
- *       vgic_cpu->ap_list_lock
- *         kvm->lpi_list_lock
- *           vgic_irq->irq_lock
+ *       vgic_cpu->ap_list_lock		must be taken with IRQs disabled
+ *         kvm->lpi_list_lock		must be taken with IRQs disabled
+ *           vgic_irq->irq_lock		must be taken with IRQs disabled
+ *
+ * As the ap_list_lock might be taken from the timer interrupt handler,
+ * we have to disable IRQs before taking this lock and everything lower
+ * than it.
  *
  * If you need to take multiple locks, always take the upper lock first,
  * then the lower ones, e.g. first take the its_lock, then the irq_lock.
@@ -72,8 +76,9 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_irq *irq = NULL;
+	unsigned long flags;
 
-	spin_lock(&dist->lpi_list_lock);
+	spin_lock_irqsave(&dist->lpi_list_lock, flags);
 
 	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
 		if (irq->intid != intid)
@@ -89,7 +94,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
 	irq = NULL;
 
 out_unlock:
-	spin_unlock(&dist->lpi_list_lock);
+	spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 
 	return irq;
 }
@@ -134,19 +139,20 @@ static void vgic_irq_release(struct kref *ref)
 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
+	unsigned long flags;
 
 	if (irq->intid < VGIC_MIN_LPI)
 		return;
 
-	spin_lock(&dist->lpi_list_lock);
+	spin_lock_irqsave(&dist->lpi_list_lock, flags);
 	if (!kref_put(&irq->refcount, vgic_irq_release)) {
-		spin_unlock(&dist->lpi_list_lock);
+		spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 		return;
 	};
 
 	list_del(&irq->lpi_list);
 	dist->lpi_list_count--;
-	spin_unlock(&dist->lpi_list_lock);
+	spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 
 	kfree(irq);
 }

From 9c4188762f7fee032abf8451fd9865a9abfc5516 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 11 May 2018 15:20:13 +0100
Subject: [PATCH 248/393] KVM: arm/arm64: VGIC/ITS: Promote irq_lock() in
 update_affinity

Apparently the development of update_affinity() overlapped with the
promotion of irq_lock to be _irqsave, so the patch didn't convert this
lock over. This will make lockdep complain.

Fix this by disabling IRQs around the lock.

Cc: stable@vger.kernel.org
Fixes: 08c9fd042117 ("KVM: arm/arm64: vITS: Add a helper to update the affinity of an LPI")
Reported-by: Jan Glauber <jan.glauber@caviumnetworks.com>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 41abf92f2699..51a80b600632 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -350,10 +350,11 @@ static int vgic_copy_lpi_list(struct kvm_vcpu *vcpu, u32 **intid_ptr)
 static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
 {
 	int ret = 0;
+	unsigned long flags;
 
-	spin_lock(&irq->irq_lock);
+	spin_lock_irqsave(&irq->irq_lock, flags);
 	irq->target_vcpu = vcpu;
-	spin_unlock(&irq->irq_lock);
+	spin_unlock_irqrestore(&irq->irq_lock, flags);
 
 	if (irq->hw) {
 		struct its_vlpi_map map;

From bf308242ab98b5d1648c3663e753556bef9bec01 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 11 May 2018 15:20:14 +0100
Subject: [PATCH 249/393] KVM: arm/arm64: VGIC/ITS: protect kvm_read_guest()
 calls with SRCU lock

kvm_read_guest() will eventually look up in kvm_memslots(), which requires
either to hold the kvm->slots_lock or to be inside a kvm->srcu critical
section.
In contrast to x86 and s390 we don't take the SRCU lock on every guest
exit, so we have to do it individually for each kvm_read_guest() call.

Provide a wrapper which does that and use that everywhere.

Note that ending the SRCU critical section before returning from the
kvm_read_guest() wrapper is safe, because the data has been *copied*, so
we don't need to rely on valid references to the memslot anymore.

Cc: Stable <stable@vger.kernel.org> # 4.8+
Reported-by: Jan Glauber <jan.glauber@caviumnetworks.com>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm/include/asm/kvm_mmu.h   | 16 ++++++++++++++++
 arch/arm64/include/asm/kvm_mmu.h | 16 ++++++++++++++++
 virt/kvm/arm/vgic/vgic-its.c     | 15 ++++++++-------
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 707a1f06dc5d..f675162663f0 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -309,6 +309,22 @@ static inline unsigned int kvm_get_vmid_bits(void)
 	return 8;
 }
 
+/*
+ * We are not in the kvm->srcu critical section most of the time, so we take
+ * the SRCU read lock here. Since we copy the data from the user page, we
+ * can immediately drop the lock again.
+ */
+static inline int kvm_read_guest_lock(struct kvm *kvm,
+				      gpa_t gpa, void *data, unsigned long len)
+{
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int ret = kvm_read_guest(kvm, gpa, data, len);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return ret;
+}
+
 static inline void *kvm_get_hyp_vector(void)
 {
 	return kvm_ksym_ref(__kvm_hyp_vector);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 082110993647..6128992c2ded 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -360,6 +360,22 @@ static inline unsigned int kvm_get_vmid_bits(void)
 	return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
 }
 
+/*
+ * We are not in the kvm->srcu critical section most of the time, so we take
+ * the SRCU read lock here. Since we copy the data from the user page, we
+ * can immediately drop the lock again.
+ */
+static inline int kvm_read_guest_lock(struct kvm *kvm,
+				      gpa_t gpa, void *data, unsigned long len)
+{
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int ret = kvm_read_guest(kvm, gpa, data, len);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return ret;
+}
+
 #ifdef CONFIG_KVM_INDIRECT_VECTORS
 /*
  * EL2 vectors can be mapped and rerouted in a number of ways,
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 51a80b600632..7cb060e01a76 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -281,8 +281,8 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
 	int ret;
 	unsigned long flags;
 
-	ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
-			     &prop, 1);
+	ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
+				  &prop, 1);
 
 	if (ret)
 		return ret;
@@ -444,8 +444,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
 		 * this very same byte in the last iteration. Reuse that.
 		 */
 		if (byte_offset != last_byte_offset) {
-			ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset,
-					     &pendmask, 1);
+			ret = kvm_read_guest_lock(vcpu->kvm,
+						  pendbase + byte_offset,
+						  &pendmask, 1);
 			if (ret) {
 				kfree(intids);
 				return ret;
@@ -789,7 +790,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
 		return false;
 
 	/* Each 1st level entry is represented by a 64-bit value. */
-	if (kvm_read_guest(its->dev->kvm,
+	if (kvm_read_guest_lock(its->dev->kvm,
 			   BASER_ADDRESS(baser) + index * sizeof(indirect_ptr),
 			   &indirect_ptr, sizeof(indirect_ptr)))
 		return false;
@@ -1370,8 +1371,8 @@ static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its)
 	cbaser = CBASER_ADDRESS(its->cbaser);
 
 	while (its->cwriter != its->creadr) {
-		int ret = kvm_read_guest(kvm, cbaser + its->creadr,
-					 cmd_buf, ITS_CMD_SIZE);
+		int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr,
+					      cmd_buf, ITS_CMD_SIZE);
 		/*
 		 * If kvm_read_guest() fails, this could be due to the guest
 		 * programming a bogus value in CBASER or something else going

From 711702b57cc3c50b84bd648de0f1ca0a378805be Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 11 May 2018 15:20:15 +0100
Subject: [PATCH 250/393] KVM: arm/arm64: VGIC/ITS save/restore: protect
 kvm_read_guest() calls

kvm_read_guest() will eventually look up in kvm_memslots(), which requires
either to hold the kvm->slots_lock or to be inside a kvm->srcu critical
section.
In contrast to x86 and s390 we don't take the SRCU lock on every guest
exit, so we have to do it individually for each kvm_read_guest() call.
Use the newly introduced wrapper for that.

Cc: Stable <stable@vger.kernel.org> # 4.12+
Reported-by: Jan Glauber <jan.glauber@caviumnetworks.com>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 4 ++--
 virt/kvm/arm/vgic/vgic-v3.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 7cb060e01a76..4ed79c939fb4 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -1897,7 +1897,7 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, int esz,
 		int next_offset;
 		size_t byte_offset;
 
-		ret = kvm_read_guest(kvm, gpa, entry, esz);
+		ret = kvm_read_guest_lock(kvm, gpa, entry, esz);
 		if (ret)
 			return ret;
 
@@ -2267,7 +2267,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
 	int ret;
 
 	BUG_ON(esz > sizeof(val));
-	ret = kvm_read_guest(kvm, gpa, &val, esz);
+	ret = kvm_read_guest_lock(kvm, gpa, &val, esz);
 	if (ret)
 		return ret;
 	val = le64_to_cpu(val);
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index c7423f3768e5..bdcf8e7a6161 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -344,7 +344,7 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
 	bit_nr = irq->intid % BITS_PER_BYTE;
 	ptr = pendbase + byte_offset;
 
-	ret = kvm_read_guest(kvm, ptr, &val, 1);
+	ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
 	if (ret)
 		return ret;
 
@@ -397,7 +397,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
 		ptr = pendbase + byte_offset;
 
 		if (byte_offset != last_byte_offset) {
-			ret = kvm_read_guest(kvm, ptr, &val, 1);
+			ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
 			if (ret)
 				return ret;
 			last_byte_offset = byte_offset;

From 9f825e74d761c13b0cfaa5f65344d64ff970e252 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Mon, 14 May 2018 12:49:37 +0200
Subject: [PATCH 251/393] mtd: rawnand: Fix return type of __DIVIDE() when
 called with 32-bit

The __DIVIDE() macro checks whether it is called with a 32-bit or 64-bit
dividend, to select the appropriate divide-and-round-up routine.
As the check uses the ternary operator, the result will always be
promoted to a type that can hold both results, i.e. unsigned long long.

When using this result in a division on a 32-bit system, this may lead
to link errors like:

    ERROR: "__udivdi3" [drivers/mtd/nand/raw/nand.ko] undefined!

Fix this by casting the result of the division to the type of the
dividend.

Fixes: 8878b126df769831 ("mtd: nand: add ->exec_op() implementation")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 include/linux/mtd/rawnand.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 5dad59b31244..17c919436f48 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -867,12 +867,18 @@ struct nand_op_instr {
  * tBERS (during an erase) which all of them are u64 values that cannot be
  * divided by usual kernel macros and must be handled with the special
  * DIV_ROUND_UP_ULL() macro.
+ *
+ * Cast to type of dividend is needed here to guarantee that the result won't
+ * be an unsigned long long when the dividend is an unsigned long (or smaller),
+ * which is what the compiler does when it sees ternary operator with 2
+ * different return types (picks the largest type to make sure there's no
+ * loss).
  */
-#define __DIVIDE(dividend, divisor) ({					\
-	sizeof(dividend) == sizeof(u32) ?				\
-		DIV_ROUND_UP(dividend, divisor) :			\
-		DIV_ROUND_UP_ULL(dividend, divisor);			\
-		})
+#define __DIVIDE(dividend, divisor) ({						\
+	(__typeof__(dividend))(sizeof(dividend) <= sizeof(unsigned long) ?	\
+			       DIV_ROUND_UP(dividend, divisor) :		\
+			       DIV_ROUND_UP_ULL(dividend, divisor)); 		\
+	})
 #define PSEC_TO_NSEC(x) __DIVIDE(x, 1000)
 #define PSEC_TO_MSEC(x) __DIVIDE(x, 1000000000)
 

From 5596fe34495cf0f645f417eb928ef224df3e3cb4 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Tue, 15 May 2018 19:52:50 +0000
Subject: [PATCH 252/393] tick/broadcast: Use for_each_cpu() specially on UP
 kernels

for_each_cpu() unintuitively reports CPU0 as set independent of the actual
cpumask content on UP kernels. This causes an unexpected PIT interrupt
storm on a UP kernel running in an SMP virtual machine on Hyper-V, and as
a result, the virtual machine can suffer from a strange random delay of 1~20
minutes during boot-up, and sometimes it can hang forever.

Protect if by checking whether the cpumask is empty before entering the
for_each_cpu() loop.

[ tglx: Use !IS_ENABLED(CONFIG_SMP) instead of #ifdeffery ]

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: stable@vger.kernel.org
Cc: Rakib Mullick <rakib.mullick@gmail.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Link: https://lkml.kernel.org/r/KL1P15301MB000678289FE55BA365B3279ABF990@KL1P15301MB0006.APCP153.PROD.OUTLOOK.COM
Link: https://lkml.kernel.org/r/KL1P15301MB0006FA63BC22BEB64902EAA0BF930@KL1P15301MB0006.APCP153.PROD.OUTLOOK.COM
---
 kernel/time/tick-broadcast.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b398c2ea69b2..aa2094d5dd27 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -612,6 +612,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 	now = ktime_get();
 	/* Find all expired events */
 	for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
+		/*
+		 * Required for !SMP because for_each_cpu() reports
+		 * unconditionally CPU0 as set on UP kernels.
+		 */
+		if (!IS_ENABLED(CONFIG_SMP) &&
+		    cpumask_empty(tick_broadcast_oneshot_mask))
+			break;
+
 		td = &per_cpu(tick_cpu_device, cpu);
 		if (td->evtdev->next_event <= now) {
 			cpumask_set_cpu(cpu, tmpmask);

From 2e5be528ab0182ad4b42b9feea3b80f85f37179b Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Wed, 18 Apr 2018 14:49:08 +0200
Subject: [PATCH 253/393] clk: imx6ull: use OSC clock during AXI rate change

On i.MX6 ULL using PLL3 seems to cause a freeze when setting
the parent to IMX6UL_CLK_PLL3_USB_OTG. This only seems to appear
since commit 6f9575e55632 ("clk: imx: Add CLK_IS_CRITICAL flag
for busy divider and busy mux"), probably because the clock is
now forced to be on.

Fixes: 6f9575e55632("clk: imx: Add CLK_IS_CRITICAL flag for busy divider and busy mux")
Signed-off-by: Stefan Agner <stefan@agner.ch>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/imx/clk-imx6ul.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clk/imx/clk-imx6ul.c b/drivers/clk/imx/clk-imx6ul.c
index 114ecbb94ec5..12320118f8de 100644
--- a/drivers/clk/imx/clk-imx6ul.c
+++ b/drivers/clk/imx/clk-imx6ul.c
@@ -464,7 +464,7 @@ static void __init imx6ul_clocks_init(struct device_node *ccm_node)
 	clk_set_rate(clks[IMX6UL_CLK_AHB], 99000000);
 
 	/* Change periph_pre clock to pll2_bus to adjust AXI rate to 264MHz */
-	clk_set_parent(clks[IMX6UL_CLK_PERIPH_CLK2_SEL], clks[IMX6UL_CLK_PLL3_USB_OTG]);
+	clk_set_parent(clks[IMX6UL_CLK_PERIPH_CLK2_SEL], clks[IMX6UL_CLK_OSC]);
 	clk_set_parent(clks[IMX6UL_CLK_PERIPH], clks[IMX6UL_CLK_PERIPH_CLK2]);
 	clk_set_parent(clks[IMX6UL_CLK_PERIPH_PRE], clks[IMX6UL_CLK_PLL2_BUS]);
 	clk_set_parent(clks[IMX6UL_CLK_PERIPH], clks[IMX6UL_CLK_PERIPH_PRE]);

From 9a160601f3fb6ffa196b87d1b5643646be486405 Mon Sep 17 00:00:00 2001
From: Gabriel Fernandez <gabriel.fernandez@st.com>
Date: Thu, 3 May 2018 08:40:09 +0200
Subject: [PATCH 254/393] clk: stm32: fix: stm32 clock drivers are not compiled
 by default

Clock driver is mandatory if the machine is selected.
Then don't use 'bool' and 'depends on' commands, but 'def_bool'
with the machine(s).

Fixes: da32d3539fca ("clk: stm32: add configuration flags for each of the stm32 drivers")
Signed-off-by: Gabriel Fernandez <gabriel.fernandez@st.com>
Acked-by: Alexandre TORGUE <alexandre.torgue@st.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/Kconfig | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig
index 41492e980ef4..34968a381d0f 100644
--- a/drivers/clk/Kconfig
+++ b/drivers/clk/Kconfig
@@ -266,15 +266,13 @@ config COMMON_CLK_STM32MP157
 	  Support for stm32mp157 SoC family clocks
 
 config COMMON_CLK_STM32F
-	bool "Clock driver for stm32f4 and stm32f7 SoC families"
-	depends on MACH_STM32F429 || MACH_STM32F469 || MACH_STM32F746
+	def_bool COMMON_CLK && (MACH_STM32F429 || MACH_STM32F469 || MACH_STM32F746)
 	help
 	---help---
 	  Support for stm32f4 and stm32f7 SoC families clocks
 
 config COMMON_CLK_STM32H7
-	bool "Clock driver for stm32h7 SoC family"
-	depends on MACH_STM32H743
+	def_bool COMMON_CLK && MACH_STM32H743
 	help
 	---help---
 	  Support for stm32h7 SoC family clocks

From 8e907ed4882714fd13cfe670681fc6cb5284c780 Mon Sep 17 00:00:00 2001
From: Lidong Chen <jemmy858585@gmail.com>
Date: Tue, 8 May 2018 16:50:16 +0800
Subject: [PATCH 255/393] IB/umem: Use the correct mm during ib_umem_release

User-space may invoke ibv_reg_mr and ibv_dereg_mr in different threads.

If ibv_dereg_mr is called after the thread which invoked ibv_reg_mr has
exited, get_pid_task will return NULL and ib_umem_release will not
decrease mm->pinned_vm.

Instead of using threads to locate the mm, use the overall tgid from the
ib_ucontext struct instead. This matches the behavior of ODP and
disassociate in handling the mm of the process that called ibv_reg_mr.

Cc: <stable@vger.kernel.org>
Fixes: 87773dd56d54 ("IB: ib_umem_release() should decrement mm->pinned_vm from ib_umem_get")
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/umem.c | 7 +------
 include/rdma/ib_umem.h         | 1 -
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 9a4e899d94b3..2b6c9b516070 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -119,7 +119,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	umem->length     = size;
 	umem->address    = addr;
 	umem->page_shift = PAGE_SHIFT;
-	umem->pid	 = get_task_pid(current, PIDTYPE_PID);
 	/*
 	 * We ask for writable memory if any of the following
 	 * access flags are set.  "Local write" and "remote write"
@@ -132,7 +131,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 		 IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
 
 	if (access & IB_ACCESS_ON_DEMAND) {
-		put_pid(umem->pid);
 		ret = ib_umem_odp_get(context, umem, access);
 		if (ret) {
 			kfree(umem);
@@ -148,7 +146,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 
 	page_list = (struct page **) __get_free_page(GFP_KERNEL);
 	if (!page_list) {
-		put_pid(umem->pid);
 		kfree(umem);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -231,7 +228,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	if (ret < 0) {
 		if (need_release)
 			__ib_umem_release(context->device, umem, 0);
-		put_pid(umem->pid);
 		kfree(umem);
 	} else
 		current->mm->pinned_vm = locked;
@@ -274,8 +270,7 @@ void ib_umem_release(struct ib_umem *umem)
 
 	__ib_umem_release(umem->context->device, umem, 1);
 
-	task = get_pid_task(umem->pid, PIDTYPE_PID);
-	put_pid(umem->pid);
+	task = get_pid_task(umem->context->tgid, PIDTYPE_PID);
 	if (!task)
 		goto out;
 	mm = get_task_mm(task);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 23159dd5be18..a1fd63871d17 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -48,7 +48,6 @@ struct ib_umem {
 	int                     writable;
 	int                     hugetlb;
 	struct work_struct	work;
-	struct pid             *pid;
 	struct mm_struct       *mm;
 	unsigned long		diff;
 	struct ib_umem_odp     *odp_data;

From 91ba9f28a3de97761c2b5fd5df5d88421268e507 Mon Sep 17 00:00:00 2001
From: Deepak Rawat <drawat@vmware.com>
Date: Tue, 15 May 2018 15:39:09 +0200
Subject: [PATCH 256/393] drm/vmwgfx: Set dmabuf_size when vmw_dmabuf_init is
 successful

SOU primary plane prepare_fb hook depends upon dmabuf_size to pin up BO
(and not call a new vmw_dmabuf_init) when a new fb size is same as
current fb. This was changed in a recent commit which is causing
page_flip to fail on VM with low display memory and multi-mon failure
when cycle monitors from secondary display.

Cc: <stable@vger.kernel.org> # 4.14, 4.16
Fixes: 20fb5a635a0c ("drm/vmwgfx: Unpin the screen object backup buffer when not used")
Signed-off-by: Deepak Rawat <drawat@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
index 648f8127f65a..3d667e903beb 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
@@ -482,6 +482,8 @@ vmw_sou_primary_plane_prepare_fb(struct drm_plane *plane,
 		return ret;
 	}
 
+	vps->dmabuf_size = size;
+
 	/*
 	 * TTM already thinks the buffer is pinned, but make sure the
 	 * pin_count is upped.

From b579f924a90f42fa561afd8201514fc216b71949 Mon Sep 17 00:00:00 2001
From: Michel Thierry <michel.thierry@intel.com>
Date: Mon, 14 May 2018 09:54:45 -0700
Subject: [PATCH 257/393] drm/i915/gen9: Add WaClearHIZ_WM_CHICKEN3 for bxt and
 glk

Factor in clear values wherever required while updating destination
min/max.

References: HSDES#1604444184
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
Cc: mesa-dev@lists.freedesktop.org
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Oscar Mateo <oscar.mateo@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180510200708.18097-1-michel.thierry@intel.com
Cc: stable@vger.kernel.org
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180514165445.9198-1-michel.thierry@intel.com
(backported from commit 0c79f9cb77eae28d48a4f9fc1b3341aacbbd260c)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_reg.h        | 3 +++
 drivers/gpu/drm/i915/intel_engine_cs.c | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index e6a8c0ee7df1..8a69a9275e28 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7326,6 +7326,9 @@ enum {
 #define SLICE_ECO_CHICKEN0			_MMIO(0x7308)
 #define   PIXEL_MASK_CAMMING_DISABLE		(1 << 14)
 
+#define GEN9_WM_CHICKEN3			_MMIO(0x5588)
+#define   GEN9_FACTOR_IN_CLR_VAL_HIZ		(1 << 9)
+
 /* WaCatErrorRejectionIssue */
 #define GEN7_SQ_CHICKEN_MBCUNIT_CONFIG		_MMIO(0x9030)
 #define  GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB	(1<<11)
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 4ba139c27fba..f7c25828d3bb 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1149,6 +1149,10 @@ static int gen9_init_workarounds(struct intel_engine_cs *engine)
 	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 
+	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
+	if (IS_GEN9_LP(dev_priv))
+		WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
+
 	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
 	ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG);
 	if (ret)

From d7d760efad70c7a030725499bf9f342f04af24dd Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 15 May 2018 17:49:50 -0400
Subject: [PATCH 258/393] locking/rwsem: Add a new RWSEM_ANONYMOUSLY_OWNED flag

There are use cases where a rwsem can be acquired by one task, but
released by another task. In thess cases, optimistic spinning may need
to be disabled.  One example will be the filesystem freeze/thaw code
where the task that freezes the filesystem will acquire a write lock
on a rwsem and then un-owns it before returning to userspace. Later on,
another task will come along, acquire the ownership, thaw the filesystem
and release the rwsem.

Bit 0 of the owner field was used to designate that it is a reader
owned rwsem. It is now repurposed to mean that the owner of the rwsem
is not known. If only bit 0 is set, the rwsem is reader owned. If bit
0 and other bits are set, it is writer owned with an unknown owner.
One such value for the latter case is (-1L). So we can set owner to 1 for
reader-owned, -1 for writer-owned. The owner is unknown in both cases.

To handle transfer of rwsem ownership, the higher level code should
set the owner field to -1 to indicate a write-locked rwsem with unknown
owner.  Optimistic spinning will be disabled in this case.

Once the higher level code figures who the new owner is, it can then
set the owner field accordingly.

Tested-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jan Kara <jack@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Theodore Y. Ts'o <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-fsdevel@vger.kernel.org
Link: http://lkml.kernel.org/r/1526420991-21213-2-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 17 +++++++----------
 kernel/locking/rwsem.c      |  2 --
 kernel/locking/rwsem.h      | 30 +++++++++++++++++++++---------
 3 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index e795908f3607..604d247ea8c3 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -357,11 +357,8 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 
 	rcu_read_lock();
 	owner = READ_ONCE(sem->owner);
-	if (!rwsem_owner_is_writer(owner)) {
-		/*
-		 * Don't spin if the rwsem is readers owned.
-		 */
-		ret = !rwsem_owner_is_reader(owner);
+	if (!owner || !is_rwsem_owner_spinnable(owner)) {
+		ret = !owner;	/* !owner is spinnable */
 		goto done;
 	}
 
@@ -382,11 +379,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 {
 	struct task_struct *owner = READ_ONCE(sem->owner);
 
-	if (!rwsem_owner_is_writer(owner))
-		goto out;
+	if (!is_rwsem_owner_spinnable(owner))
+		return false;
 
 	rcu_read_lock();
-	while (sem->owner == owner) {
+	while (owner && (READ_ONCE(sem->owner) == owner)) {
 		/*
 		 * Ensure we emit the owner->on_cpu, dereference _after_
 		 * checking sem->owner still matches owner, if that fails,
@@ -408,12 +405,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 		cpu_relax();
 	}
 	rcu_read_unlock();
-out:
+
 	/*
 	 * If there is a new owner or the owner is not set, we continue
 	 * spinning.
 	 */
-	return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
+	return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
 }
 
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 30465a2f2b6c..bc1e507be9ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -221,5 +221,3 @@ void up_read_non_owner(struct rw_semaphore *sem)
 EXPORT_SYMBOL(up_read_non_owner);
 
 #endif
-
-
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a17cba8d94bb..b9d0e72aa80f 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,20 +1,24 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * The owner field of the rw_semaphore structure will be set to
- * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
  * the owner field when it unlocks. A reader, on the other hand, will
  * not touch the owner field when it unlocks.
  *
- * In essence, the owner field now has the following 3 states:
+ * In essence, the owner field now has the following 4 states:
  *  1) 0
  *     - lock is free or the owner hasn't set the field yet
  *  2) RWSEM_READER_OWNED
  *     - lock is currently or previously owned by readers (lock is free
  *       or not set by owner yet)
- *  3) Other non-zero value
- *     - a writer owns the lock
+ *  3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well
+ *     - lock is owned by an anonymous writer, so spinning on the lock
+ *       owner should be disabled.
+ *  4) Other non-zero value
+ *     - a writer owns the lock and other writers can spin on the lock owner.
  */
-#define RWSEM_READER_OWNED	((struct task_struct *)1UL)
+#define RWSEM_ANONYMOUSLY_OWNED	(1UL << 0)
+#define RWSEM_READER_OWNED	((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED)
 
 #ifdef CONFIG_DEBUG_RWSEMS
 # define DEBUG_RWSEMS_WARN_ON(c)	DEBUG_LOCKS_WARN_ON(c)
@@ -51,14 +55,22 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 		WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
 }
 
-static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+/*
+ * Return true if the a rwsem waiter can spin on the rwsem's owner
+ * and steal the lock, i.e. the lock is not anonymously owned.
+ * N.B. !owner is considered spinnable.
+ */
+static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
 {
-	return owner && owner != RWSEM_READER_OWNED;
+	return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
 }
 
-static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+/*
+ * Return true if rwsem is owned by an anonymous writer or readers.
+ */
+static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
 {
-	return owner == RWSEM_READER_OWNED;
+	return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
 }
 #else
 static inline void rwsem_set_owner(struct rw_semaphore *sem)

From 5a817641f68a6399a5fac8b7d2da67a73698ffed Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 15 May 2018 17:49:51 -0400
Subject: [PATCH 259/393] locking/percpu-rwsem: Annotate rwsem ownership
 transfer by setting RWSEM_OWNER_UNKNOWN

The filesystem freezing code needs to transfer ownership of a rwsem
embedded in a percpu-rwsem from the task that does the freezing to
another one that does the thawing by calling percpu_rwsem_release()
after freezing and percpu_rwsem_acquire() before thawing.

However, the new rwsem debug code runs afoul with this scheme by warning
that the task that releases the rwsem isn't the one that acquires it,
as reported by Amir Goldstein:

  DEBUG_LOCKS_WARN_ON(sem->owner != get_current())
  WARNING: CPU: 1 PID: 1401 at /home/amir/build/src/linux/kernel/locking/rwsem.c:133 up_write+0x59/0x79

  Call Trace:
   percpu_up_write+0x1f/0x28
   thaw_super_locked+0xdf/0x120
   do_vfs_ioctl+0x270/0x5f1
   ksys_ioctl+0x52/0x71
   __x64_sys_ioctl+0x16/0x19
   do_syscall_64+0x5d/0x167
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

To work properly with the rwsem debug code, we need to annotate that the
rwsem ownership is unknown during the tranfer period until a brave soul
comes forward to acquire the ownership. During that period, optimistic
spinning will be disabled.

Reported-by: Amir Goldstein <amir73il@gmail.com>
Tested-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jan Kara <jack@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Theodore Y. Ts'o <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-fsdevel@vger.kernel.org
Link: http://lkml.kernel.org/r/1526420991-21213-3-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/percpu-rwsem.h | 6 +++++-
 include/linux/rwsem.h        | 6 ++++++
 kernel/locking/rwsem-xadd.c  | 2 ++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index b1f37a89e368..79b99d653e03 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -133,7 +133,7 @@ static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 	lock_release(&sem->rw_sem.dep_map, 1, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
-		sem->rw_sem.owner = NULL;
+		sem->rw_sem.owner = RWSEM_OWNER_UNKNOWN;
 #endif
 }
 
@@ -141,6 +141,10 @@ static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
 	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+	if (!read)
+		sem->rw_sem.owner = current;
+#endif
 }
 
 #endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 56707d5ff6ad..ab93b6eae696 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -44,6 +44,12 @@ struct rw_semaphore {
 #endif
 };
 
+/*
+ * Setting bit 0 of the owner field with other non-zero bits will indicate
+ * that the rwsem is writer-owned with an unknown owner.
+ */
+#define RWSEM_OWNER_UNKNOWN	((struct task_struct *)-1L)
+
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 604d247ea8c3..a90336779375 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -352,6 +352,8 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 	struct task_struct *owner;
 	bool ret = true;
 
+	BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
+
 	if (need_resched())
 		return false;
 

From 5c9b0b1c49881c680d4a56b9d9e03dfb3160fd4d Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 16 May 2018 11:01:28 +0300
Subject: [PATCH 260/393] x86/boot/compressed/64: Set up GOT for
 paging_prepare() and cleanup_trampoline()

Eric and Hugh have reported instant reboot due to my recent changes in
decompression code.

The root cause is that I didn't realize that we need to adjust GOT to be
able to run C code that early.

The problem is only visible with an older toolchain. Binutils >= 2.24 is
able to eliminate GOT references by replacing them with RIP-relative
address loads:

  https://sourceware.org/git/gitweb.cgi?p=binutils-gdb.git;a=commitdiff;h=80d873266dec

We need to adjust GOT two times:

 - before calling paging_prepare() using the initial load address
 - before calling C code from the relocated kernel

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: 194a9749c73d ("x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G")
Link: http://lkml.kernel.org/r/20180516080131.27913-2-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/head_64.S | 68 ++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 13 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fca012baba19..d17af6a4bfc9 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -305,6 +305,25 @@ ENTRY(startup_64)
 	/* Set up the stack */
 	leaq	boot_stack_end(%rbx), %rsp
 
+	/*
+	 * paging_prepare() and cleanup_trampoline() below can have GOT
+	 * references. Adjust the table with address we are running at.
+	 *
+	 * Zero RAX for adjust_got: the GOT was not adjusted before;
+	 * there's no adjustment to undo.
+	 */
+	xorq	%rax, %rax
+
+	/*
+	 * Calculate the address the binary is loaded at and use it as
+	 * a GOT adjustment.
+	 */
+	call	1f
+1:	popq	%rdi
+	subq	$1b, %rdi
+
+	call	adjust_got
+
 	/*
 	 * At this point we are in long mode with 4-level paging enabled,
 	 * but we might want to enable 5-level paging or vice versa.
@@ -381,6 +400,21 @@ trampoline_return:
 	pushq	$0
 	popfq
 
+	/*
+	 * Previously we've adjusted the GOT with address the binary was
+	 * loaded at. Now we need to re-adjust for relocation address.
+	 *
+	 * Calculate the address the binary is loaded at, so that we can
+	 * undo the previous GOT adjustment.
+	 */
+	call	1f
+1:	popq	%rax
+	subq	$1b, %rax
+
+	/* The new adjustment is the relocation address */
+	movq	%rbx, %rdi
+	call	adjust_got
+
 /*
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
@@ -481,19 +515,6 @@ relocated:
 	shrq	$3, %rcx
 	rep	stosq
 
-/*
- * Adjust our own GOT
- */
-	leaq	_got(%rip), %rdx
-	leaq	_egot(%rip), %rcx
-1:
-	cmpq	%rcx, %rdx
-	jae	2f
-	addq	%rbx, (%rdx)
-	addq	$8, %rdx
-	jmp	1b
-2:
-	
 /*
  * Do the extraction, and jump to the new kernel..
  */
@@ -512,6 +533,27 @@ relocated:
  */
 	jmp	*%rax
 
+/*
+ * Adjust the global offset table
+ *
+ * RAX is the previous adjustment of the table to undo (use 0 if it's the
+ * first time we touch GOT).
+ * RDI is the new adjustment to apply.
+ */
+adjust_got:
+	/* Walk through the GOT adding the address to the entries */
+	leaq	_got(%rip), %rdx
+	leaq	_egot(%rip), %rcx
+1:
+	cmpq	%rcx, %rdx
+	jae	2f
+	subq	%rax, (%rdx)	/* Undo previous adjustment */
+	addq	%rdi, (%rdx)	/* Apply the new adjustment */
+	addq	$8, %rdx
+	jmp	1b
+2:
+	ret
+
 	.code32
 /*
  * This is the 32-bit trampoline that will be copied over to low memory.

From 589bb62be316401603453c7d2d3c60ad8b9c3cf3 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 16 May 2018 11:01:29 +0300
Subject: [PATCH 261/393] x86/boot/compressed/64: Fix moving page table out of
 trampoline memory

cleanup_trampoline() relocates the top-level page table out of
trampoline memory. We use 'top_pgtable' as our new top-level page table.

But if the 'top_pgtable' would be referenced from C in a usual way,
the address of the table will be calculated relative to RIP.
After kernel gets relocated, the address will be in the middle of
decompression buffer and the page table may get overwritten.
This leads to a crash.

We calculate the address of other page tables relative to the relocation
address. It makes them safe. We should do the same for 'top_pgtable'.

Calculate the address of 'top_pgtable' in assembly and pass down to
cleanup_trampoline().

Move the page table to .pgtable section where the rest of page tables
are. The section is @nobits so we save 4k in kernel image.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: e9d0e6330eb8 ("x86/boot/compressed/64: Prepare new top-level page table for trampoline")
Link: http://lkml.kernel.org/r/20180516080131.27913-3-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/head_64.S    | 11 +++++++++++
 arch/x86/boot/compressed/pgtable_64.c | 14 +++-----------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d17af6a4bfc9..8169e8b7a4dc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -389,10 +389,14 @@ trampoline_return:
 	/*
 	 * cleanup_trampoline() would restore trampoline memory.
 	 *
+	 * RDI is address of the page table to use instead of page table
+	 * in trampoline memory (if required).
+	 *
 	 * RSI holds real mode data and needs to be preserved across
 	 * this function call.
 	 */
 	pushq	%rsi
+	leaq	top_pgtable(%rbx), %rdi
 	call	cleanup_trampoline
 	popq	%rsi
 
@@ -691,3 +695,10 @@ boot_stack_end:
 	.balign 4096
 pgtable:
 	.fill BOOT_PGT_SIZE, 1, 0
+
+/*
+ * The page table is going to be used instead of page table in the trampoline
+ * memory.
+ */
+top_pgtable:
+	.fill PAGE_SIZE, 1, 0
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 32af1cbcd903..a362fa0b849c 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -22,14 +22,6 @@ struct paging_config {
 /* Buffer to preserve trampoline memory */
 static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
 
-/*
- * The page table is going to be used instead of page table in the trampoline
- * memory.
- *
- * It must not be in BSS as BSS is cleared after cleanup_trampoline().
- */
-static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
-
 /*
  * Trampoline address will be printed by extract_kernel() for debugging
  * purposes.
@@ -134,7 +126,7 @@ struct paging_config paging_prepare(void)
 	return paging_config;
 }
 
-void cleanup_trampoline(void)
+void cleanup_trampoline(void *pgtable)
 {
 	void *trampoline_pgtable;
 
@@ -145,8 +137,8 @@ void cleanup_trampoline(void)
 	 * if it's there.
 	 */
 	if ((void *)__native_read_cr3() == trampoline_pgtable) {
-		memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE);
-		native_write_cr3((unsigned long)top_pgtable);
+		memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
+		native_write_cr3((unsigned long)pgtable);
 	}
 
 	/* Restore trampoline memory */

From 85f4f12d51397f1648e1f4350f77e24039b82d61 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 15 May 2018 22:24:52 -0400
Subject: [PATCH 262/393] vsprintf: Replace memory barrier with static_key for
 random_ptr_key update

Reviewing Tobin's patches for getting pointers out early before
entropy has been established, I noticed that there's a lone smp_mb() in
the code. As with most lone memory barriers, this one appears to be
incorrectly used.

We currently basically have this:

	get_random_bytes(&ptr_key, sizeof(ptr_key));
	/*
	 * have_filled_random_ptr_key==true is dependent on get_random_bytes().
	 * ptr_to_id() needs to see have_filled_random_ptr_key==true
	 * after get_random_bytes() returns.
	 */
	smp_mb();
	WRITE_ONCE(have_filled_random_ptr_key, true);

And later we have:

	if (unlikely(!have_filled_random_ptr_key))
		return string(buf, end, "(ptrval)", spec);

/* Missing memory barrier here. */

	hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);

As the CPU can perform speculative loads, we could have a situation
with the following:

	CPU0				CPU1
	----				----
				   load ptr_key = 0
   store ptr_key = random
   smp_mb()
   store have_filled_random_ptr_key

				   load have_filled_random_ptr_key = true

				    BAD BAD BAD! (you're so bad!)

Because nothing prevents CPU1 from loading ptr_key before loading
have_filled_random_ptr_key.

But this race is very unlikely, but we can't keep an incorrect smp_mb() in
place. Instead, replace the have_filled_random_ptr_key with a static_branch
not_filled_random_ptr_key, that is initialized to true and changed to false
when we get enough entropy. If the update happens in early boot, the
static_key is updated immediately, otherwise it will have to wait till
entropy is filled and this happens in an interrupt handler which can't
enable a static_key, as that requires a preemptible context. In that case, a
work_queue is used to enable it, as entropy already took too long to
establish in the first place waiting a little more shouldn't hurt anything.

The benefit of using the static key is that the unlikely branch in
vsprintf() now becomes a nop.

Link: http://lkml.kernel.org/r/20180515100558.21df515e@gandalf.local.home

Cc: stable@vger.kernel.org
Fixes: ad67b74d2469d ("printk: hash addresses printed with %p")
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 lib/vsprintf.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 30c0cb8cc9bc..23920c5ff728 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1669,19 +1669,22 @@ char *pointer_string(char *buf, char *end, const void *ptr,
 	return number(buf, end, (unsigned long int)ptr, spec);
 }
 
-static bool have_filled_random_ptr_key __read_mostly;
+static DEFINE_STATIC_KEY_TRUE(not_filled_random_ptr_key);
 static siphash_key_t ptr_key __read_mostly;
 
+static void enable_ptr_key_workfn(struct work_struct *work)
+{
+	get_random_bytes(&ptr_key, sizeof(ptr_key));
+	/* Needs to run from preemptible context */
+	static_branch_disable(&not_filled_random_ptr_key);
+}
+
+static DECLARE_WORK(enable_ptr_key_work, enable_ptr_key_workfn);
+
 static void fill_random_ptr_key(struct random_ready_callback *unused)
 {
-	get_random_bytes(&ptr_key, sizeof(ptr_key));
-	/*
-	 * have_filled_random_ptr_key==true is dependent on get_random_bytes().
-	 * ptr_to_id() needs to see have_filled_random_ptr_key==true
-	 * after get_random_bytes() returns.
-	 */
-	smp_mb();
-	WRITE_ONCE(have_filled_random_ptr_key, true);
+	/* This may be in an interrupt handler. */
+	queue_work(system_unbound_wq, &enable_ptr_key_work);
 }
 
 static struct random_ready_callback random_ready = {
@@ -1695,7 +1698,8 @@ static int __init initialize_ptr_random(void)
 	if (!ret) {
 		return 0;
 	} else if (ret == -EALREADY) {
-		fill_random_ptr_key(&random_ready);
+		/* This is in preemptible context */
+		enable_ptr_key_workfn(&enable_ptr_key_work);
 		return 0;
 	}
 
@@ -1709,7 +1713,7 @@ static char *ptr_to_id(char *buf, char *end, void *ptr, struct printf_spec spec)
 	unsigned long hashval;
 	const int default_width = 2 * sizeof(ptr);
 
-	if (unlikely(!have_filled_random_ptr_key)) {
+	if (static_branch_unlikely(&not_filled_random_ptr_key)) {
 		spec.field_width = default_width;
 		/* string length must be less than default_width */
 		return string(buf, end, "(ptrval)", spec);

From 2b6207291b7b277a5df9d1aab44b56815a292dba Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 16 May 2018 17:00:26 +0300
Subject: [PATCH 263/393] drm/dumb-buffers: Integer overflow in
 drm_mode_create_ioctl()

There is a comment here which says that DIV_ROUND_UP() and that's where
the problem comes from.  Say you pick:

	args->bpp = UINT_MAX - 7;
	args->width = 4;
	args->height = 1;

The integer overflow in DIV_ROUND_UP() means "cpp" is UINT_MAX / 8 and
because of how we picked args->width that means cpp < UINT_MAX / 4.

I've fixed it by preventing the integer overflow in DIV_ROUND_UP().  I
removed the check for !cpp because it's not possible after this change.
I also changed all the 0xffffffffU references to U32_MAX.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20180516140026.GA19340@mwanda
---
 drivers/gpu/drm/drm_dumb_buffers.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_dumb_buffers.c b/drivers/gpu/drm/drm_dumb_buffers.c
index 39ac15ce4702..9e2ae02f31e0 100644
--- a/drivers/gpu/drm/drm_dumb_buffers.c
+++ b/drivers/gpu/drm/drm_dumb_buffers.c
@@ -65,12 +65,13 @@ int drm_mode_create_dumb_ioctl(struct drm_device *dev,
 		return -EINVAL;
 
 	/* overflow checks for 32bit size calculations */
-	/* NOTE: DIV_ROUND_UP() can overflow */
+	if (args->bpp > U32_MAX - 8)
+		return -EINVAL;
 	cpp = DIV_ROUND_UP(args->bpp, 8);
-	if (!cpp || cpp > 0xffffffffU / args->width)
+	if (cpp > U32_MAX / args->width)
 		return -EINVAL;
 	stride = cpp * args->width;
-	if (args->height > 0xffffffffU / stride)
+	if (args->height > U32_MAX / stride)
 		return -EINVAL;
 
 	/* test for wrap-around */

From ab452c3ce7bacb27ffe2fc0144aecd0c399e1e24 Mon Sep 17 00:00:00 2001
From: Keefe Liu <liuqifa@huawei.com>
Date: Mon, 14 May 2018 19:38:09 +0800
Subject: [PATCH 264/393] ipvlan: call netdevice notifier when master mac
 address changed

When master device's mac has been changed, the commit
32c10bbfe914 ("ipvlan: always use the current L2 addr of the
master") makes the IPVlan devices's mac changed also, but it
doesn't do related works such as flush the IPVlan devices's
arp table.

Signed-off-by: Keefe Liu <liuqifa@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ipvlan/ipvlan_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 450eec264a5e..4377c26f714d 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -792,8 +792,10 @@ static int ipvlan_device_event(struct notifier_block *unused,
 		break;
 
 	case NETDEV_CHANGEADDR:
-		list_for_each_entry(ipvlan, &port->ipvlans, pnode)
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
 			ether_addr_copy(ipvlan->dev->dev_addr, dev->dev_addr);
+			call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev);
+		}
 		break;
 
 	case NETDEV_PRE_TYPE_CHANGE:

From e3ca34880652250f524022ad89e516f8ba9a805b Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 14 May 2018 15:38:10 -0700
Subject: [PATCH 265/393] net/mlx5: Fix build break when CONFIG_SMP=n
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid using the kernel's irq_descriptor and return IRQ vector affinity
directly from the driver.

This fixes the following build break when CONFIG_SMP=n

include/linux/mlx5/driver.h: In function ‘mlx5_get_vector_affinity_hint’:
include/linux/mlx5/driver.h:1299:13: error:
        ‘struct irq_desc’ has no member named ‘affinity_hint’

Fixes: 6082d9c9c94a ("net/mlx5: Fix mlx5_get_vector_affinity function")
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
CC: Randy Dunlap <rdunlap@infradead.org>
CC: Guenter Roeck <linux@roeck-us.net>
CC: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Israel Rukshin <israelr@mellanox.com>
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2a156c5dfadd..d703774982ca 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1286,17 +1286,7 @@ enum {
 static inline const struct cpumask *
 mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector)
 {
-	struct irq_desc *desc;
-	unsigned int irq;
-	int eqn;
-	int err;
-
-	err = mlx5_vector2eqn(dev, vector, &eqn, &irq);
-	if (err)
-		return NULL;
-
-	desc = irq_to_desc(irq);
-	return desc->affinity_hint;
+	return dev->priv.irq_info[vector].mask;
 }
 
 #endif /* MLX5_DRIVER_H */

From 7f582b248d0a86bae5788c548d7bb5bca6f7691a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 May 2018 21:14:26 -0700
Subject: [PATCH 266/393] tcp: purge write queue in tcp_connect_init()

syzkaller found a reliable way to crash the host, hitting a BUG()
in __tcp_retransmit_skb()

Malicous MSG_FASTOPEN is the root cause. We need to purge write queue
in tcp_connect_init() at the point we init snd_una/write_seq.

This patch also replaces the BUG() by a less intrusive WARN_ON_ONCE()

kernel BUG at net/ipv4/tcp_output.c:2837!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 5276 Comm: syz-executor0 Not tainted 4.17.0-rc3+ #51
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:__tcp_retransmit_skb+0x2992/0x2eb0 net/ipv4/tcp_output.c:2837
RSP: 0000:ffff8801dae06ff8 EFLAGS: 00010206
RAX: ffff8801b9fe61c0 RBX: 00000000ffc18a16 RCX: ffffffff864e1a49
RDX: 0000000000000100 RSI: ffffffff864e2e12 RDI: 0000000000000005
RBP: ffff8801dae073a0 R08: ffff8801b9fe61c0 R09: ffffed0039c40dd2
R10: ffffed0039c40dd2 R11: ffff8801ce206e93 R12: 00000000421eeaad
R13: ffff8801ce206d4e R14: ffff8801ce206cc0 R15: ffff8801cd4f4a80
FS:  0000000000000000(0000) GS:ffff8801dae00000(0063) knlGS:00000000096bc900
CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
CR2: 0000000020000000 CR3: 00000001c47b6000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <IRQ>
 tcp_retransmit_skb+0x2e/0x250 net/ipv4/tcp_output.c:2923
 tcp_retransmit_timer+0xc50/0x3060 net/ipv4/tcp_timer.c:488
 tcp_write_timer_handler+0x339/0x960 net/ipv4/tcp_timer.c:573
 tcp_write_timer+0x111/0x1d0 net/ipv4/tcp_timer.c:593
 call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
 expire_timers kernel/time/timer.c:1363 [inline]
 __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
 invoke_softirq kernel/softirq.c:365 [inline]
 irq_exit+0x1d1/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:525 [inline]
 smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863

Fixes: cf60af03ca4e ("net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN)")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 383cac0ff0ec..d07e34f8e309 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2833,8 +2833,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		return -EBUSY;
 
 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
-		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
-			BUG();
+		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
+			WARN_ON_ONCE(1);
+			return -EINVAL;
+		}
 		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
 			return -ENOMEM;
 	}
@@ -3342,6 +3344,7 @@ static void tcp_connect_init(struct sock *sk)
 	sock_reset_flag(sk, SOCK_DONE);
 	tp->snd_wnd = 0;
 	tcp_init_wl(tp, 0);
+	tcp_write_queue_purge(sk);
 	tp->snd_una = tp->write_seq;
 	tp->snd_sml = tp->write_seq;
 	tp->snd_up = tp->write_seq;

From e1b505a60366399d735312ca38b0a6753a684123 Mon Sep 17 00:00:00 2001
From: Markus Niebel <Markus.Niebel@tqs.de>
Date: Tue, 15 May 2018 10:18:56 +0200
Subject: [PATCH 267/393] net: phy: micrel: add 125MHz reference clock
 workaround

The micrel KSZ9031 phy has a optional clock pin (CLK125_NDO) which can be
used as reference clock for the MAC unit. The clock signal must meet the
RGMII requirements to ensure the correct data transmission between the
MAC and the PHY. The KSZ9031 phy does not fulfill the duty cycle
requirement if the phy is configured as slave. For a complete
describtion look at the errata sheets: DS80000691D or DS80000692D.

The errata sheet recommends to force the phy into master mode whenever
there is a 1000Base-T link-up as work around. Only set the
"micrel,force-master" property if you use the phy reference clock provided
by CLK125_NDO pin as MAC reference clock in your application.

Attenation, this workaround is only usable if the link partner can
be configured to slave mode for 1000Base-T.

Signed-off-by: Markus Niebel <Markus.Niebel@tqs.de>
[m.felsch@pengutronix.de: fix dt-binding documentation]
[m.felsch@pengutronix.de: use already existing result var for read/write]
[m.felsch@pengutronix.de: add error handling]
[m.felsch@pengutronix.de: add more comments]
Signed-off-by: Marco Felsch <m.felsch@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../bindings/net/micrel-ksz90x1.txt           |  7 +++++
 drivers/net/phy/micrel.c                      | 31 +++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
index 42a248301615..e22d8cfea687 100644
--- a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
+++ b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
@@ -57,6 +57,13 @@ KSZ9031:
       - txd2-skew-ps : Skew control of TX data 2 pad
       - txd3-skew-ps : Skew control of TX data 3 pad
 
+    - micrel,force-master:
+        Boolean, force phy to master mode. Only set this option if the phy
+        reference clock provided at CLK125_NDO pin is used as MAC reference
+        clock because the clock jitter in slave mode is to high (errata#2).
+        Attention: The link partner must be configurable as slave otherwise
+        no link will be established.
+
 Examples:
 
 	mdio {
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index f41b224a9cdb..ab195f0916d6 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -573,9 +573,40 @@ static int ksz9031_config_init(struct phy_device *phydev)
 		ksz9031_of_load_skew_values(phydev, of_node,
 				MII_KSZ9031RN_TX_DATA_PAD_SKEW, 4,
 				tx_data_skews, 4);
+
+		/* Silicon Errata Sheet (DS80000691D or DS80000692D):
+		 * When the device links in the 1000BASE-T slave mode only,
+		 * the optional 125MHz reference output clock (CLK125_NDO)
+		 * has wide duty cycle variation.
+		 *
+		 * The optional CLK125_NDO clock does not meet the RGMII
+		 * 45/55 percent (min/max) duty cycle requirement and therefore
+		 * cannot be used directly by the MAC side for clocking
+		 * applications that have setup/hold time requirements on
+		 * rising and falling clock edges.
+		 *
+		 * Workaround:
+		 * Force the phy to be the master to receive a stable clock
+		 * which meets the duty cycle requirement.
+		 */
+		if (of_property_read_bool(of_node, "micrel,force-master")) {
+			result = phy_read(phydev, MII_CTRL1000);
+			if (result < 0)
+				goto err_force_master;
+
+			/* enable master mode, config & prefer master */
+			result |= CTL1000_ENABLE_MASTER | CTL1000_AS_MASTER;
+			result = phy_write(phydev, MII_CTRL1000, result);
+			if (result < 0)
+				goto err_force_master;
+		}
 	}
 
 	return ksz9031_center_flp_timing(phydev);
+
+err_force_master:
+	phydev_err(phydev, "failed to force the phy to master mode\n");
+	return result;
 }
 
 #define KSZ8873MLL_GLOBAL_CONTROL_4	0x06

From c171654caa875919be3c533d3518da8be5be966e Mon Sep 17 00:00:00 2001
From: "Shuah Khan (Samsung OSG)" <shuah@kernel.org>
Date: Tue, 15 May 2018 17:57:23 -0600
Subject: [PATCH 268/393] usbip: usbip_host: fix bad unlock balance during
 stub_probe()

stub_probe() calls put_busid_priv() in an error path when device isn't
found in the busid_table. Fix it by making put_busid_priv() safe to be
called with null struct bus_id_priv pointer.

This problem happens when "usbip bind" is run without loading usbip_host
driver and then running modprobe. The first failed bind attempt unbinds
the device from the original driver and when usbip_host is modprobed,
stub_probe() runs and doesn't find the device in its busid table and calls
put_busid_priv(0 with null bus_id_priv pointer.

usbip-host 3-10.2: 3-10.2 is not in match_busid table...  skip!

[  367.359679] =====================================
[  367.359681] WARNING: bad unlock balance detected!
[  367.359683] 4.17.0-rc4+ #5 Not tainted
[  367.359685] -------------------------------------
[  367.359688] modprobe/2768 is trying to release lock (
[  367.359689]
==================================================================
[  367.359696] BUG: KASAN: null-ptr-deref in print_unlock_imbalance_bug+0x99/0x110
[  367.359699] Read of size 8 at addr 0000000000000058 by task modprobe/2768

[  367.359705] CPU: 4 PID: 2768 Comm: modprobe Not tainted 4.17.0-rc4+ #5

Fixes: 22076557b07c ("usbip: usbip_host: fix NULL-ptr deref and use-after-free errors") in usb-linus
Signed-off-by: Shuah Khan (Samsung OSG) <shuah@kernel.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
index 41c7b9de2a92..bf8a5feb0ee9 100644
--- a/drivers/usb/usbip/stub_main.c
+++ b/drivers/usb/usbip/stub_main.c
@@ -82,7 +82,8 @@ struct bus_id_priv *get_busid_priv(const char *busid)
 
 void put_busid_priv(struct bus_id_priv *bid)
 {
-	spin_unlock(&bid->busid_lock);
+	if (bid)
+		spin_unlock(&bid->busid_lock);
 }
 
 static int add_match_busid(char *busid)

From 43a5e00f38fe8933a1c716bfe5b30e97f749d94b Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 May 2018 16:01:23 -0700
Subject: [PATCH 269/393] net: dsa: bcm_sf2: Fix RX_CLS_LOC_ANY overwrite for
 last rule

When we let the kernel pick up a rule location with RX_CLS_LOC_ANY, we
would be able to overwrite the last rules because of a number of issues.

The IPv4 code path would not be checking that rule_index is within
bounds, and it would also only be allowed to pick up rules from range
0..126 instead of the full 0..127 range. This would lead us to allow
overwriting the last rule when we let the kernel pick-up the location.

Fixes: 3306145866b6 ("net: dsa: bcm_sf2: Move IPv4 CFP processing to specific functions")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 23b45da784cb..9e04786e3139 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -354,10 +354,13 @@ static int bcm_sf2_cfp_ipv4_rule_set(struct bcm_sf2_priv *priv, int port,
 	/* Locate the first rule available */
 	if (fs->location == RX_CLS_LOC_ANY)
 		rule_index = find_first_zero_bit(priv->cfp.used,
-						 bcm_sf2_cfp_rule_size(priv));
+						 priv->num_cfp_rules);
 	else
 		rule_index = fs->location;
 
+	if (rule_index > bcm_sf2_cfp_rule_size(priv))
+		return -ENOSPC;
+
 	layout = &udf_tcpip4_layout;
 	/* We only use one UDF slice for now */
 	slice_num = bcm_sf2_get_slice_number(layout, 0);

From 6c05561c541843b2bec2189f680bed6d20afc25b Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 May 2018 16:01:24 -0700
Subject: [PATCH 270/393] net: dsa: bcm_sf2: Fix IPv6 rules and chain ID

We had several issues that would make the programming of IPv6 rules both
inconsistent and error prone:

- the chain ID that we would be asking the hardware to put in the
  packet's Broadcom tag would be off by one, it would return one of the
  two indexes, but not the one user-space specified

- when an user specified a particular location to insert a CFP rule at,
  we would not be returning the same index, which would be confusing if
  nothing else

- finally, like IPv4, it would be possible to overflow the last entry by
  re-programming it

Fix this by swapping the usage of rule_index[0] and rule_index[1] where
relevant in order to return a consistent and correct user-space
experience.

Fixes: ba0696c22e7c ("net: dsa: bcm_sf2: Add support for IPv6 CFP rules")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 9e04786e3139..6fd0f8a12cc2 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -565,19 +565,21 @@ static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port,
 	 * first half because the HW search is by incrementing addresses.
 	 */
 	if (fs->location == RX_CLS_LOC_ANY)
-		rule_index[0] = find_first_zero_bit(priv->cfp.used,
-						    bcm_sf2_cfp_rule_size(priv));
+		rule_index[1] = find_first_zero_bit(priv->cfp.used,
+						    priv->num_cfp_rules);
 	else
-		rule_index[0] = fs->location;
+		rule_index[1] = fs->location;
+	if (rule_index[1] > bcm_sf2_cfp_rule_size(priv))
+		return -ENOSPC;
 
 	/* Flag it as used (cleared on error path) such that we can immediately
 	 * obtain a second one to chain from.
 	 */
-	set_bit(rule_index[0], priv->cfp.used);
+	set_bit(rule_index[1], priv->cfp.used);
 
-	rule_index[1] = find_first_zero_bit(priv->cfp.used,
-					    bcm_sf2_cfp_rule_size(priv));
-	if (rule_index[1] > bcm_sf2_cfp_rule_size(priv)) {
+	rule_index[0] = find_first_zero_bit(priv->cfp.used,
+					    priv->num_cfp_rules);
+	if (rule_index[0] > bcm_sf2_cfp_rule_size(priv)) {
 		ret = -ENOSPC;
 		goto out_err;
 	}
@@ -715,14 +717,14 @@ static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port,
 	/* Flag the second half rule as being used now, return it as the
 	 * location, and flag it as unique while dumping rules
 	 */
-	set_bit(rule_index[1], priv->cfp.used);
+	set_bit(rule_index[0], priv->cfp.used);
 	set_bit(rule_index[1], priv->cfp.unique);
 	fs->location = rule_index[1];
 
 	return ret;
 
 out_err:
-	clear_bit(rule_index[0], priv->cfp.used);
+	clear_bit(rule_index[1], priv->cfp.used);
 	return ret;
 }
 

From 1942adf64214df370350aa46954ba27654456f68 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 May 2018 16:01:25 -0700
Subject: [PATCH 271/393] net: dsa: bcm_sf2: Fix IPv6 rule half deletion

It was possible to delete only one half of an IPv6, which would leave
the second half still programmed and possibly in use. Instead of
checking for the unused bitmap, we need to check the unique bitmap, and
refuse any deletion that does not match that criteria. We also need to
move that check from bcm_sf2_cfp_rule_del_one() into its caller:
bcm_sf2_cfp_rule_del() otherwise we would not be able to delete second
halves anymore that would not pass the first test.

Fixes: ba0696c22e7c ("net: dsa: bcm_sf2: Add support for IPv6 CFP rules")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 6fd0f8a12cc2..b89acaee12d4 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -790,10 +790,6 @@ static int bcm_sf2_cfp_rule_del_one(struct bcm_sf2_priv *priv, int port,
 	int ret;
 	u32 reg;
 
-	/* Refuse deletion of unused rules, and the default reserved rule */
-	if (!test_bit(loc, priv->cfp.used) || loc == 0)
-		return -EINVAL;
-
 	/* Indicate which rule we want to read */
 	bcm_sf2_cfp_rule_addr_set(priv, loc);
 
@@ -831,6 +827,13 @@ static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 	u32 next_loc = 0;
 	int ret;
 
+	/* Refuse deleting unused rules, and those that are not unique since
+	 * that could leave IPv6 rules with one of the chained rule in the
+	 * table.
+	 */
+	if (!test_bit(loc, priv->cfp.unique) || loc == 0)
+		return -EINVAL;
+
 	ret = bcm_sf2_cfp_rule_del_one(priv, port, loc, &next_loc);
 	if (ret)
 		return ret;

From e49ac9679eeb30abeb462cb459cc3ea8c81fbd1f Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Wed, 16 May 2018 11:18:01 +0200
Subject: [PATCH 272/393] net: 8390: ne: Fix accidentally removed RBTX4927
 support

The configuration settings for RBTX4927 were accidentally removed,
leading to a silently broken network interface.

Re-add the missing settings to fix this.

Fixes: 8eb97ff5a4ec941d ("net: 8390: remove m32r specific bits")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/ne.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/8390/ne.c b/drivers/net/ethernet/8390/ne.c
index ac99d089ac72..1c97e39b478e 100644
--- a/drivers/net/ethernet/8390/ne.c
+++ b/drivers/net/ethernet/8390/ne.c
@@ -164,7 +164,9 @@ bad_clone_list[] __initdata = {
 #define NESM_START_PG	0x40	/* First page of TX buffer */
 #define NESM_STOP_PG	0x80	/* Last page +1 of RX ring */
 
-#if defined(CONFIG_ATARI)	/* 8-bit mode on Atari, normal on Q40 */
+#if defined(CONFIG_MACH_TX49XX)
+#  define DCR_VAL 0x48		/* 8-bit mode */
+#elif defined(CONFIG_ATARI)	/* 8-bit mode on Atari, normal on Q40 */
 #  define DCR_VAL (MACH_IS_ATARI ? 0x48 : 0x49)
 #else
 #  define DCR_VAL 0x49

From 5a4931ae0193f8a4a97e8260fd0df1d705d83299 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 16 May 2018 12:54:29 +0200
Subject: [PATCH 273/393] net/sched: fix refcnt leak in the error path of
 tcf_vlan_init()

Similarly to what was done with commit a52956dfc503 ("net sched actions:
fix refcnt leak in skbmod"), fix the error path of tcf_vlan_init() to avoid
refcnt leaks when wrong value of TCA_VLAN_PUSH_VLAN_PROTOCOL is given.

Fixes: 5026c9b1bafc ("net sched: vlan action fix late binding")
CC: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_vlan.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 853604685965..1fb39e1f9d07 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -161,6 +161,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			case htons(ETH_P_8021AD):
 				break;
 			default:
+				if (exists)
+					tcf_idr_release(*a, bind);
 				return -EPROTONOSUPPORT;
 			}
 		} else {

From f9bcd60274a565751abef622f9018badd01a17c8 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Wed, 16 May 2018 14:44:38 +0300
Subject: [PATCH 274/393] qed: LL2 flush isles when connection is closed

Driver should free all pending isles once it gets a FLUSH cqe from FW.
Part of iSCSI out of order flow.

Fixes: 1d6cff4fca4366 ("qed: Add iSCSI out of order packet handling")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 26 +++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 38502815d681..b5918bdabf23 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -591,6 +591,27 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 	}
 }
 
+static bool
+qed_ll2_lb_rxq_handler_slowpath(struct qed_hwfn *p_hwfn,
+				struct core_rx_slow_path_cqe *p_cqe)
+{
+	struct ooo_opaque *iscsi_ooo;
+	u32 cid;
+
+	if (p_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH)
+		return false;
+
+	iscsi_ooo = (struct ooo_opaque *)&p_cqe->opaque_data;
+	if (iscsi_ooo->ooo_opcode != TCP_EVENT_DELETE_ISLES)
+		return false;
+
+	/* Need to make a flush */
+	cid = le32_to_cpu(iscsi_ooo->cid);
+	qed_ooo_release_connection_isles(p_hwfn, p_hwfn->p_ooo_info, cid);
+
+	return true;
+}
+
 static int qed_ll2_lb_rxq_handler(struct qed_hwfn *p_hwfn,
 				  struct qed_ll2_info *p_ll2_conn)
 {
@@ -617,6 +638,11 @@ static int qed_ll2_lb_rxq_handler(struct qed_hwfn *p_hwfn,
 		cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain);
 		cqe_type = cqe->rx_cqe_sp.type;
 
+		if (cqe_type == CORE_RX_CQE_TYPE_SLOW_PATH)
+			if (qed_ll2_lb_rxq_handler_slowpath(p_hwfn,
+							    &cqe->rx_cqe_sp))
+				continue;
+
 		if (cqe_type != CORE_RX_CQE_TYPE_REGULAR) {
 			DP_NOTICE(p_hwfn,
 				  "Got a non-regular LB LL2 completion [type 0x%02x]\n",

From ffd2c0d12752a69e480366031ec7a7d723dd2510 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Wed, 16 May 2018 14:44:39 +0300
Subject: [PATCH 275/393] qed: Fix possibility of list corruption during rmmod
 flows

The ll2 flows of flushing the txq/rxq need to be synchronized with the
regular fp processing. Caused list corruption during load/unload stress
tests.

Fixes: 0a7fb11c23c0f ("qed: Add Light L2 support")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index b5918bdabf23..851561fb8224 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -292,6 +292,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 	struct qed_ll2_tx_packet *p_pkt = NULL;
 	struct qed_ll2_info *p_ll2_conn;
 	struct qed_ll2_tx_queue *p_tx;
+	unsigned long flags = 0;
 	dma_addr_t tx_frag;
 
 	p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle);
@@ -300,6 +301,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 
 	p_tx = &p_ll2_conn->tx_queue;
 
+	spin_lock_irqsave(&p_tx->lock, flags);
 	while (!list_empty(&p_tx->active_descq)) {
 		p_pkt = list_first_entry(&p_tx->active_descq,
 					 struct qed_ll2_tx_packet, list_entry);
@@ -309,6 +311,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 		list_del(&p_pkt->list_entry);
 		b_last_packet = list_empty(&p_tx->active_descq);
 		list_add_tail(&p_pkt->list_entry, &p_tx->free_descq);
+		spin_unlock_irqrestore(&p_tx->lock, flags);
 		if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) {
 			struct qed_ooo_buffer *p_buffer;
 
@@ -328,7 +331,9 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 						      b_last_frag,
 						      b_last_packet);
 		}
+		spin_lock_irqsave(&p_tx->lock, flags);
 	}
+	spin_unlock_irqrestore(&p_tx->lock, flags);
 }
 
 static int qed_ll2_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie)
@@ -556,6 +561,7 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 	struct qed_ll2_info *p_ll2_conn = NULL;
 	struct qed_ll2_rx_packet *p_pkt = NULL;
 	struct qed_ll2_rx_queue *p_rx;
+	unsigned long flags = 0;
 
 	p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle);
 	if (!p_ll2_conn)
@@ -563,13 +569,14 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 
 	p_rx = &p_ll2_conn->rx_queue;
 
+	spin_lock_irqsave(&p_rx->lock, flags);
 	while (!list_empty(&p_rx->active_descq)) {
 		p_pkt = list_first_entry(&p_rx->active_descq,
 					 struct qed_ll2_rx_packet, list_entry);
 		if (!p_pkt)
 			break;
-
 		list_move_tail(&p_pkt->list_entry, &p_rx->free_descq);
+		spin_unlock_irqrestore(&p_rx->lock, flags);
 
 		if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) {
 			struct qed_ooo_buffer *p_buffer;
@@ -588,7 +595,9 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 						      cookie,
 						      rx_buf_addr, b_last);
 		}
+		spin_lock_irqsave(&p_rx->lock, flags);
 	}
+	spin_unlock_irqrestore(&p_rx->lock, flags);
 }
 
 static bool

From 490068deaef0c76e47bf89c457de899b7d3995c7 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Wed, 16 May 2018 14:44:40 +0300
Subject: [PATCH 276/393] qed: Fix LL2 race during connection terminate

Stress on qedi/qedr load unload lead to list_del corruption.
This is due to ll2 connection terminate freeing resources without
verifying that no more ll2 processing will occur.

This patch unregisters the ll2 status block before terminating
the connection to assure this race does not occur.

Fixes: 1d6cff4fca4366 ("qed: Add iSCSI out of order packet handling")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 24 +++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 851561fb8224..468c59d2e491 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -829,6 +829,9 @@ static int qed_ll2_lb_rxq_completion(struct qed_hwfn *p_hwfn, void *p_cookie)
 	struct qed_ll2_info *p_ll2_conn = (struct qed_ll2_info *)p_cookie;
 	int rc;
 
+	if (!QED_LL2_RX_REGISTERED(p_ll2_conn))
+		return 0;
+
 	rc = qed_ll2_lb_rxq_handler(p_hwfn, p_ll2_conn);
 	if (rc)
 		return rc;
@@ -849,6 +852,9 @@ static int qed_ll2_lb_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie)
 	u16 new_idx = 0, num_bds = 0;
 	int rc;
 
+	if (!QED_LL2_TX_REGISTERED(p_ll2_conn))
+		return 0;
+
 	new_idx = le16_to_cpu(*p_tx->p_fw_cons);
 	num_bds = ((s16)new_idx - (s16)p_tx->bds_idx);
 
@@ -1902,17 +1908,25 @@ int qed_ll2_terminate_connection(void *cxt, u8 connection_handle)
 
 	/* Stop Tx & Rx of connection, if needed */
 	if (QED_LL2_TX_REGISTERED(p_ll2_conn)) {
+		p_ll2_conn->tx_queue.b_cb_registred = false;
+		smp_wmb(); /* Make sure this is seen by ll2_lb_rxq_completion */
 		rc = qed_sp_ll2_tx_queue_stop(p_hwfn, p_ll2_conn);
 		if (rc)
 			goto out;
+
 		qed_ll2_txq_flush(p_hwfn, connection_handle);
+		qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index);
 	}
 
 	if (QED_LL2_RX_REGISTERED(p_ll2_conn)) {
+		p_ll2_conn->rx_queue.b_cb_registred = false;
+		smp_wmb(); /* Make sure this is seen by ll2_lb_rxq_completion */
 		rc = qed_sp_ll2_rx_queue_stop(p_hwfn, p_ll2_conn);
 		if (rc)
 			goto out;
+
 		qed_ll2_rxq_flush(p_hwfn, connection_handle);
+		qed_int_unregister_cb(p_hwfn, p_ll2_conn->rx_queue.rx_sb_index);
 	}
 
 	if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO)
@@ -1960,16 +1974,6 @@ void qed_ll2_release_connection(void *cxt, u8 connection_handle)
 	if (!p_ll2_conn)
 		return;
 
-	if (QED_LL2_RX_REGISTERED(p_ll2_conn)) {
-		p_ll2_conn->rx_queue.b_cb_registred = false;
-		qed_int_unregister_cb(p_hwfn, p_ll2_conn->rx_queue.rx_sb_index);
-	}
-
-	if (QED_LL2_TX_REGISTERED(p_ll2_conn)) {
-		p_ll2_conn->tx_queue.b_cb_registred = false;
-		qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index);
-	}
-
 	kfree(p_ll2_conn->tx_queue.descq_mem);
 	qed_chain_free(p_hwfn->cdev, &p_ll2_conn->tx_queue.txq_chain);
 

From 7063efd33bb15abc0160347f89eb5aba6b7d000e Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 16 May 2018 20:39:33 +0800
Subject: [PATCH 277/393] tuntap: fix use after free during release

After commit b196d88aba8a ("tun: fix use after free for ptr_ring") we
need clean up tx ring during release(). But unfortunately, it tries to
do the cleanup blindly after socket were destroyed which will lead
another use-after-free. Fix this by doing the cleanup before dropping
the last reference of the socket in __tun_detach().

Reported-by: Andrei Vagin <avagin@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Fixes: b196d88aba8a ("tun: fix use after free for ptr_ring")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9fbbb328b95b..d45ac37e1287 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -729,6 +729,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		}
 		if (tun)
 			xdp_rxq_info_unreg(&tfile->xdp_rxq);
+		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 		sock_put(&tfile->sk);
 	}
 }
@@ -3245,7 +3246,6 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 	struct tun_file *tfile = file->private_data;
 
 	tun_detach(tfile, true);
-	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 
 	return 0;
 }

From dbad41e7bb5f4b9949ff5ea1d76c20711f326308 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Thu, 17 May 2018 10:42:32 +0100
Subject: [PATCH 278/393] dmaengine: qcom: bam_dma: check if the runtime pm
 enabled

Disabling pm runtime at probe is not sufficient to get BAM working
on remotely controller instances. pm_runtime_get_sync() would return
-EACCES in such cases.
So check if runtime pm is enabled before returning error from bam functions.

Fixes: 5b4a68952a89 ("dmaengine: qcom: bam_dma: disable runtime pm on remote controlled")
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/qcom/bam_dma.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index d29275b97e84..4a828c18099a 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -524,6 +524,14 @@ static int bam_alloc_chan(struct dma_chan *chan)
 	return 0;
 }
 
+static int bam_pm_runtime_get_sync(struct device *dev)
+{
+	if (pm_runtime_enabled(dev))
+		return pm_runtime_get_sync(dev);
+
+	return 0;
+}
+
 /**
  * bam_free_chan - Frees dma resources associated with specific channel
  * @chan: specified channel
@@ -539,7 +547,7 @@ static void bam_free_chan(struct dma_chan *chan)
 	unsigned long flags;
 	int ret;
 
-	ret = pm_runtime_get_sync(bdev->dev);
+	ret = bam_pm_runtime_get_sync(bdev->dev);
 	if (ret < 0)
 		return;
 
@@ -720,7 +728,7 @@ static int bam_pause(struct dma_chan *chan)
 	unsigned long flag;
 	int ret;
 
-	ret = pm_runtime_get_sync(bdev->dev);
+	ret = bam_pm_runtime_get_sync(bdev->dev);
 	if (ret < 0)
 		return ret;
 
@@ -746,7 +754,7 @@ static int bam_resume(struct dma_chan *chan)
 	unsigned long flag;
 	int ret;
 
-	ret = pm_runtime_get_sync(bdev->dev);
+	ret = bam_pm_runtime_get_sync(bdev->dev);
 	if (ret < 0)
 		return ret;
 
@@ -852,7 +860,7 @@ static irqreturn_t bam_dma_irq(int irq, void *data)
 	if (srcs & P_IRQ)
 		tasklet_schedule(&bdev->task);
 
-	ret = pm_runtime_get_sync(bdev->dev);
+	ret = bam_pm_runtime_get_sync(bdev->dev);
 	if (ret < 0)
 		return ret;
 
@@ -969,7 +977,7 @@ static void bam_start_dma(struct bam_chan *bchan)
 	if (!vd)
 		return;
 
-	ret = pm_runtime_get_sync(bdev->dev);
+	ret = bam_pm_runtime_get_sync(bdev->dev);
 	if (ret < 0)
 		return;
 

From 31d11b83b96faaee4bb514d375a09489117c3e8d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 9 May 2018 16:01:46 +0100
Subject: [PATCH 279/393] Btrfs: fix duplicate extents after fsync of file with
 prealloc extents

In commit 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size
after fsync log replay"), on fsync,  we started to always log all prealloc
extents beyond an inode's i_size in order to avoid losing them after a
power failure. However under some cases this can lead to the log replay
code to create duplicate extent items, with different lengths, in the
extent tree. That happens because, as of that commit, we can now log
extent items based on extent maps that are not on the "modified" list
of extent maps of the inode's extent map tree. Logging extent items based
on extent maps is used during the fast fsync path to save time and for
this to work reliably it requires that the extent maps are not merged
with other adjacent extent maps - having the extent maps in the list
of modified extents gives such guarantee.

Consider the following example, captured during a long run of fsstress,
which illustrates this problem.

We have inode 271, in the filesystem tree (root 5), for which all of the
following operations and discussion apply to.

A buffered write starts at offset 312391 with a length of 933471 bytes
(end offset at 1245862). At this point we have, for this inode, the
following extent maps with the their field values:

em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613,
      block_len 0, orig_block_len 0
em B, start 40960, orig_start 40960, len 376832, block_start 1106399232,
      block_len 376832, orig_block_len 376832
em C, start 417792, orig_start 417792, len 782336, block_start
      18446744073709551613, block_len 0, orig_block_len 0
em D, start 1200128, orig_start 1200128, len 835584, block_start
      1106776064, block_len 835584, orig_block_len 835584
em E, start 2035712, orig_start 2035712, len 245760, block_start
      1107611648, block_len 245760, orig_block_len 245760

Extent map A corresponds to a hole and extent maps D and E correspond to
preallocated extents.

Extent map D ends where extent map E begins (1106776064 + 835584 =
1107611648), but these extent maps were not merged because they are in
the inode's list of modified extent maps.

An fsync against this inode is made, which triggers the fast path
(BTRFS_INODE_NEEDS_FULL_SYNC is not set). This fsync triggers writeback
of the data previously written using buffered IO, and when the respective
ordered extent finishes, btrfs_drop_extents() is called against the
(aligned) range 311296..1249279. This causes a split of extent map D at
btrfs_drop_extent_cache(), replacing extent map D with a new extent map
D', also added to the list of modified extents,  with the following
values:

em D', start 1249280, orig_start of 1200128,
       block_start 1106825216 (= 1106776064 + 1249280 - 1200128),
       orig_block_len 835584,
       block_len 786432 (835584 - (1249280 - 1200128))

Then, during the fast fsync, btrfs_log_changed_extents() is called and
extent maps D' and E are removed from the list of modified extents. The
flag EXTENT_FLAG_LOGGING is also set on them. After the extents are logged
clear_em_logging() is called on each of them, and that makes extent map E
to be merged with extent map D' (try_merge_map()), resulting in D' being
deleted and E adjusted to:

em E, start 1249280, orig_start 1200128, len 1032192,
      block_start 1106825216, block_len 1032192,
      orig_block_len 245760

A direct IO write at offset 1847296 and length of 360448 bytes (end offset
at 2207744) starts, and at that moment the following extent maps exist for
our inode:

em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613,
      block_len 0, orig_block_len 0
em B, start 40960, orig_start 40960, len 270336, block_start 1106399232,
      block_len 270336, orig_block_len 376832
em C, start 311296, orig_start 311296, len 937984, block_start 1112842240,
      block_len 937984, orig_block_len 937984
em E (prealloc), start 1249280, orig_start 1200128, len 1032192,
      block_start 1106825216, block_len 1032192, orig_block_len 245760

The dio write results in drop_extent_cache() being called twice. The first
time for a range that starts at offset 1847296 and ends at offset 2035711
(length of 188416), which results in a double split of extent map E,
replacing it with two new extent maps:

em F, start 1249280, orig_start 1200128, block_start 1106825216,
      block_len 598016, orig_block_len 598016
em G, start 2035712, orig_start 1200128, block_start 1107611648,
      block_len 245760, orig_block_len 1032192

It also creates a new extent map that represents a part of the requested
IO (through create_io_em()):

em H, start 1847296, len 188416, block_start 1107423232, block_len 188416

The second call to drop_extent_cache() has a range with a start offset of
2035712 and end offset of 2207743 (length of 172032). This leads to
replacing extent map G with a new extent map I with the following values:

em I, start 2207744, orig_start 1200128, block_start 1107783680,
      block_len 73728, orig_block_len 1032192

It also creates a new extent map that represents the second part of the
requested IO (through create_io_em()):

em J, start 2035712, len 172032, block_start 1107611648, block_len 172032

The dio write set the inode's i_size to 2207744 bytes.

After the dio write the inode has the following extent maps:

em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613,
      block_len 0, orig_block_len 0
em B, start 40960, orig_start 40960, len 270336, block_start 1106399232,
      block_len 270336, orig_block_len 376832
em C, start 311296, orig_start 311296, len 937984, block_start 1112842240,
      block_len 937984, orig_block_len 937984
em F, start 1249280, orig_start 1200128, len 598016,
      block_start 1106825216, block_len 598016, orig_block_len 598016
em H, start 1847296, orig_start 1200128, len 188416,
      block_start 1107423232, block_len 188416, orig_block_len 835584
em J, start 2035712, orig_start 2035712, len 172032,
      block_start 1107611648, block_len 172032, orig_block_len 245760
em I, start 2207744, orig_start 1200128, len 73728,
      block_start 1107783680, block_len 73728, orig_block_len 1032192

Now do some change to the file, like adding a xattr for example and then
fsync it again. This triggers a fast fsync path, and as of commit
471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size after fsync
log replay"), we use the extent map I to log a file extent item because
it's a prealloc extent and it starts at an offset matching the inode's
i_size. However when we log it, we create a file extent item with a value
for the disk byte location that is wrong, as can be seen from the
following output of "btrfs inspect-internal dump-tree":

 item 1 key (271 EXTENT_DATA 2207744) itemoff 3782 itemsize 53
     generation 22 type 2 (prealloc)
     prealloc data disk byte 1106776064 nr 1032192
     prealloc data offset 1007616 nr 73728

Here the disk byte value corresponds to calculation based on some fields
from the extent map I:

  1106776064 = block_start (1107783680) - 1007616 (extent_offset)
  extent_offset = 2207744 (start) - 1200128 (orig_start) = 1007616

The disk byte value of 1106776064 clashes with disk byte values of the
file extent items at offsets 1249280 and 1847296 in the fs tree:

        item 6 key (271 EXTENT_DATA 1249280) itemoff 3568 itemsize 53
                generation 20 type 2 (prealloc)
                prealloc data disk byte 1106776064 nr 835584
                prealloc data offset 49152 nr 598016
        item 7 key (271 EXTENT_DATA 1847296) itemoff 3515 itemsize 53
                generation 20 type 1 (regular)
                extent data disk byte 1106776064 nr 835584
                extent data offset 647168 nr 188416 ram 835584
                extent compression 0 (none)
        item 8 key (271 EXTENT_DATA 2035712) itemoff 3462 itemsize 53
                generation 20 type 1 (regular)
                extent data disk byte 1107611648 nr 245760
                extent data offset 0 nr 172032 ram 245760
                extent compression 0 (none)
        item 9 key (271 EXTENT_DATA 2207744) itemoff 3409 itemsize 53
                generation 20 type 2 (prealloc)
                prealloc data disk byte 1107611648 nr 245760
                prealloc data offset 172032 nr 73728

Instead of the disk byte value of 1106776064, the value of 1107611648
should have been logged. Also the data offset value should have been
172032 and not 1007616.
After a log replay we end up getting two extent items in the extent tree
with different lengths, one of 835584, which is correct and existed
before the log replay, and another one of 1032192 which is wrong and is
based on the logged file extent item:

 item 12 key (1106776064 EXTENT_ITEM 835584) itemoff 3406 itemsize 53
    refs 2 gen 15 flags DATA
    extent data backref root 5 objectid 271 offset 1200128 count 2
 item 13 key (1106776064 EXTENT_ITEM 1032192) itemoff 3353 itemsize 53
    refs 1 gen 22 flags DATA
    extent data backref root 5 objectid 271 offset 1200128 count 1

Obviously this leads to many problems and a filesystem check reports many
errors:

 (...)
 checking extents
 Extent back ref already exists for 1106776064 parent 0 root 5 owner 271 offset 1200128 num_refs 1
 extent item 1106776064 has multiple extent items
 ref mismatch on [1106776064 835584] extent item 2, found 3
 Incorrect local backref count on 1106776064 root 5 owner 271 offset 1200128 found 2 wanted 1 back 0x55b1d0ad7680
 Backref 1106776064 root 5 owner 271 offset 1200128 num_refs 0 not found in extent tree
 Incorrect local backref count on 1106776064 root 5 owner 271 offset 1200128 found 1 wanted 0 back 0x55b1d0ad4e70
 Backref bytes do not match extent backref, bytenr=1106776064, ref bytes=835584, backref bytes=1032192
 backpointer mismatch on [1106776064 835584]
 checking free space cache
 block group 1103101952 has wrong amount of free space
 failed to load free space cache for block group 1103101952
 checking fs roots
 (...)

So fix this by logging the prealloc extents beyond the inode's i_size
based on searches in the subvolume tree instead of the extent maps.

Fixes: 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay")
CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 137 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 112 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c1509547c762..8f23a94dab77 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4320,6 +4320,110 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * Log all prealloc extents beyond the inode's i_size to make sure we do not
+ * lose them after doing a fast fsync and replaying the log. We scan the
+ * subvolume's root instead of iterating the inode's extent map tree because
+ * otherwise we can log incorrect extent items based on extent map conversion.
+ * That can happen due to the fact that extent maps are merged when they
+ * are not in the extent map tree's list of modified extents.
+ */
+static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
+				      struct btrfs_inode *inode,
+				      struct btrfs_path *path)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_key key;
+	const u64 i_size = i_size_read(&inode->vfs_inode);
+	const u64 ino = btrfs_ino(inode);
+	struct btrfs_path *dst_path = NULL;
+	u64 last_extent = (u64)-1;
+	int ins_nr = 0;
+	int start_slot;
+	int ret;
+
+	if (!(inode->flags & BTRFS_INODE_PREALLOC))
+		return 0;
+
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = i_size;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (true) {
+		struct extent_buffer *leaf = path->nodes[0];
+		int slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			if (ins_nr > 0) {
+				ret = copy_items(trans, inode, dst_path, path,
+						 &last_extent, start_slot,
+						 ins_nr, 1, 0);
+				if (ret < 0)
+					goto out;
+				ins_nr = 0;
+			}
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				ret = 0;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid > ino)
+			break;
+		if (WARN_ON_ONCE(key.objectid < ino) ||
+		    key.type < BTRFS_EXTENT_DATA_KEY ||
+		    key.offset < i_size) {
+			path->slots[0]++;
+			continue;
+		}
+		if (last_extent == (u64)-1) {
+			last_extent = key.offset;
+			/*
+			 * Avoid logging extent items logged in past fsync calls
+			 * and leading to duplicate keys in the log tree.
+			 */
+			do {
+				ret = btrfs_truncate_inode_items(trans,
+							 root->log_root,
+							 &inode->vfs_inode,
+							 i_size,
+							 BTRFS_EXTENT_DATA_KEY);
+			} while (ret == -EAGAIN);
+			if (ret)
+				goto out;
+		}
+		if (ins_nr == 0)
+			start_slot = slot;
+		ins_nr++;
+		path->slots[0]++;
+		if (!dst_path) {
+			dst_path = btrfs_alloc_path();
+			if (!dst_path) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+	}
+	if (ins_nr > 0) {
+		ret = copy_items(trans, inode, dst_path, path, &last_extent,
+				 start_slot, ins_nr, 1, 0);
+		if (ret > 0)
+			ret = 0;
+	}
+out:
+	btrfs_release_path(path);
+	btrfs_free_path(dst_path);
+	return ret;
+}
+
 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct btrfs_inode *inode,
@@ -4362,6 +4466,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		if (em->generation <= test_gen)
 			continue;
 
+		/* We log prealloc extents beyond eof later. */
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+		    em->start >= i_size_read(&inode->vfs_inode))
+			continue;
+
 		if (em->start < logged_start)
 			logged_start = em->start;
 		if ((em->start + em->len - 1) > logged_end)
@@ -4374,31 +4483,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		num++;
 	}
 
-	/*
-	 * Add all prealloc extents beyond the inode's i_size to make sure we
-	 * don't lose them after doing a fast fsync and replaying the log.
-	 */
-	if (inode->flags & BTRFS_INODE_PREALLOC) {
-		struct rb_node *node;
-
-		for (node = rb_last(&tree->map); node; node = rb_prev(node)) {
-			em = rb_entry(node, struct extent_map, rb_node);
-			if (em->start < i_size_read(&inode->vfs_inode))
-				break;
-			if (!list_empty(&em->list))
-				continue;
-			/* Same as above loop. */
-			if (++num > 32768) {
-				list_del_init(&tree->modified_extents);
-				ret = -EFBIG;
-				goto process;
-			}
-			refcount_inc(&em->refs);
-			set_bit(EXTENT_FLAG_LOGGING, &em->flags);
-			list_add_tail(&em->list, &extents);
-		}
-	}
-
 	list_sort(NULL, &extents, extent_cmp);
 	btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
 	/*
@@ -4443,6 +4527,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	up_write(&inode->dio_sem);
 
 	btrfs_release_path(path);
+	if (!ret)
+		ret = btrfs_log_prealloc_extents(trans, inode, path);
+
 	return ret;
 }
 

From 1a63c198ddb810c790101d693c7071cca703b3c7 Mon Sep 17 00:00:00 2001
From: Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>
Date: Tue, 15 May 2018 16:51:26 +0900
Subject: [PATCH 280/393] btrfs: property: Set incompat flag if lzo/zstd
 compression is set

Incompat flag of LZO/ZSTD compression should be set at:

 1. mount time (-o compress/compress-force)
 2. when defrag is done
 3. when property is set

Currently 3. is missing and this commit adds this.

This could lead to a filesystem that uses ZSTD but is not marked as
such. If a kernel without a ZSTD support encounteres a ZSTD compressed
extent, it will handle that but this could be confusing to the user.

Typically the filesystem is mounted with the ZSTD option, but the
discrepancy can arise when a filesystem is never mounted with ZSTD and
then the property on some file is set (and some new extents are
written). A simple mount with -o compress=zstd will fix that up on an
unpatched kernel.

Same goes for LZO, but this has been around for a very long time
(2.6.37) so it's unlikely that a pre-LZO kernel would be used.

Fixes: 5c1aab1dd544 ("btrfs: Add zstd support")
CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Tomohiro Misono <misono.tomohiro@jp.fujitsu.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add user visible impact ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/props.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 53a8c95828e3..dc6140013ae8 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -380,6 +380,7 @@ static int prop_compression_apply(struct inode *inode,
 				  const char *value,
 				  size_t len)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int type;
 
 	if (len == 0) {
@@ -390,14 +391,17 @@ static int prop_compression_apply(struct inode *inode,
 		return 0;
 	}
 
-	if (!strncmp("lzo", value, 3))
+	if (!strncmp("lzo", value, 3)) {
 		type = BTRFS_COMPRESS_LZO;
-	else if (!strncmp("zlib", value, 4))
+		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+	} else if (!strncmp("zlib", value, 4)) {
 		type = BTRFS_COMPRESS_ZLIB;
-	else if (!strncmp("zstd", value, len))
+	} else if (!strncmp("zstd", value, len)) {
 		type = BTRFS_COMPRESS_ZSTD;
-	else
+		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+	} else {
 		return -EINVAL;
+	}
 
 	BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
 	BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;

From 02a3307aa9c20b4f6626255b028f07f6cfa16feb Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.liu@linux.alibaba.com>
Date: Wed, 16 May 2018 01:37:36 +0800
Subject: [PATCH 281/393] btrfs: fix reading stale metadata blocks after
 degraded raid1 mounts

If a btree block, aka. extent buffer, is not available in the extent
buffer cache, it'll be read out from the disk instead, i.e.

btrfs_search_slot()
  read_block_for_search()  # hold parent and its lock, go to read child
    btrfs_release_path()
    read_tree_block()  # read child

Unfortunately, the parent lock got released before reading child, so
commit 5bdd3536cbbe ("Btrfs: Fix block generation verification race") had
used 0 as parent transid to read the child block.  It forces
read_tree_block() not to check if parent transid is different with the
generation id of the child that it reads out from disk.

A simple PoC is included in btrfs/124,

0. A two-disk raid1 btrfs,

1. Right after mkfs.btrfs, block A is allocated to be device tree's root.

2. Mount this filesystem and put it in use, after a while, device tree's
   root got COW but block A hasn't been allocated/overwritten yet.

3. Umount it and reload the btrfs module to remove both disks from the
   global @fs_devices list.

4. mount -odegraded dev1 and write some data, so now block A is allocated
   to be a leaf in checksum tree.  Note that only dev1 has the latest
   metadata of this filesystem.

5. Umount it and mount it again normally (with both disks), since raid1
   can pick up one disk by the writer task's pid, if btrfs_search_slot()
   needs to read block A, dev2 which does NOT have the latest metadata
   might be read for block A, then we got a stale block A.

6. As parent transid is not checked, block A is marked as uptodate and
   put into the extent buffer cache, so the future search won't bother
   to read disk again, which means it'll make changes on this stale
   one and make it dirty and flush it onto disk.

To avoid the problem, parent transid needs to be passed to
read_tree_block().

In order to get a valid parent transid, we need to hold the parent's
lock until finishing reading child.

This patch needs to be slightly adapted for stable kernels, the
&first_key parameter added to read_tree_block() is from 4.16+
(581c1760415c4). The fix is to replace 0 by 'gen'.

Fixes: 5bdd3536cbbe ("Btrfs: Fix block generation verification race")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 63488f0b850f..8c68961925b1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2436,10 +2436,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	if (p->reada != READA_NONE)
 		reada_for_search(fs_info, p, level, slot, key->objectid);
 
-	btrfs_release_path(p);
-
 	ret = -EAGAIN;
-	tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
+	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
 			      &first_key);
 	if (!IS_ERR(tmp)) {
 		/*
@@ -2454,6 +2452,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	} else {
 		ret = PTR_ERR(tmp);
 	}
+
+	btrfs_release_path(p);
 	return ret;
 }
 

From 2b8773313494ede83a26fb372466e634564002ed Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 27 Apr 2018 12:21:51 +0300
Subject: [PATCH 282/393] btrfs: Split btrfs_del_delalloc_inode into 2
 functions

This is in preparation of fixing delalloc inodes leakage on transaction
abort. Also export the new function.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/inode.c | 13 ++++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2771cc56a622..0d422c9908b8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3182,6 +3182,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 			      u64 *orig_start, u64 *orig_block_len,
 			      u64 *ram_bytes);
 
+void __btrfs_del_delalloc_inode(struct btrfs_root *root,
+				struct btrfs_inode *inode);
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d241285a0d2a..8e604e7071f1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1742,12 +1742,12 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
 	spin_unlock(&root->delalloc_lock);
 }
 
-static void btrfs_del_delalloc_inode(struct btrfs_root *root,
-				     struct btrfs_inode *inode)
+
+void __btrfs_del_delalloc_inode(struct btrfs_root *root,
+				struct btrfs_inode *inode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 
-	spin_lock(&root->delalloc_lock);
 	if (!list_empty(&inode->delalloc_inodes)) {
 		list_del_init(&inode->delalloc_inodes);
 		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
@@ -1760,6 +1760,13 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
 			spin_unlock(&fs_info->delalloc_root_lock);
 		}
 	}
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+				     struct btrfs_inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	__btrfs_del_delalloc_inode(root, inode);
 	spin_unlock(&root->delalloc_lock);
 }
 

From fe816d0f1d4c31c4c31d42ca78a87660565fc800 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 27 Apr 2018 12:21:53 +0300
Subject: [PATCH 283/393] btrfs: Fix delalloc inodes invalidation during
 transaction abort

When a transaction is aborted btrfs_cleanup_transaction is called to
cleanup all the various in-flight bits and pieces which migth be
active. One of those is delalloc inodes - inodes which have dirty
pages which haven't been persisted yet. Currently the process of
freeing such delalloc inodes in exceptional circumstances such as
transaction abort boiled down to calling btrfs_invalidate_inodes whose
sole job is to invalidate the dentries for all inodes related to a
root. This is in fact wrong and insufficient since such delalloc inodes
will likely have pending pages or ordered-extents and will be linked to
the sb->s_inode_list. This means that unmounting a btrfs instance with
an aborted transaction could potentially lead inodes/their pages
visible to the system long after their superblock has been freed. This
in turn leads to a "use-after-free" situation once page shrink is
triggered. This situation could be simulated by running generic/019
which would cause such inodes to be left hanging, followed by
generic/176 which causes memory pressure and page eviction which lead
to touching the freed super block instance. This situation is
additionally detected by the unmount code of VFS with the following
message:

"VFS: Busy inodes after unmount of Self-destruct in 5 seconds.  Have a nice day..."

Additionally btrfs hits WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
in free_fs_root for the same reason.

This patch aims to rectify the sitaution by doing the following:

1. Change btrfs_destroy_delalloc_inodes so that it calls
invalidate_inode_pages2 for every inode on the delalloc list, this
ensures that all the pages of the inode are released. This function
boils down to calling btrfs_releasepage. During test I observed cases
where inodes on the delalloc list were having an i_count of 0, so this
necessitates using igrab to be sure we are working on a non-freed inode.

2. Since calling btrfs_releasepage might queue delayed iputs move the
call out to btrfs_cleanup_transaction in btrfs_error_commit_super before
calling run_delayed_iputs for the last time. This is necessary to ensure
that delayed iputs are run.

Note: this patch is tagged for 4.14 stable but the fix applies to older
versions too but needs to be backported manually due to conflicts.

CC: stable@vger.kernel.org # 4.14.x: 2b8773313494: btrfs: Split btrfs_del_delalloc_inode into 2 functions
CC: stable@vger.kernel.org # 4.14.x
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add comment to igrab ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60caa68c3618..c3504b4d281b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3818,6 +3818,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
 
 	btrfs_free_qgroup_config(fs_info);
+	ASSERT(list_empty(&fs_info->delalloc_roots));
 
 	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
 		btrfs_info(fs_info, "at unmount delalloc count %lld",
@@ -4125,15 +4126,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
 
 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
 {
+	/* cleanup FS via transaction */
+	btrfs_cleanup_transaction(fs_info);
+
 	mutex_lock(&fs_info->cleaner_mutex);
 	btrfs_run_delayed_iputs(fs_info);
 	mutex_unlock(&fs_info->cleaner_mutex);
 
 	down_write(&fs_info->cleanup_work_sem);
 	up_write(&fs_info->cleanup_work_sem);
-
-	/* cleanup FS via transaction */
-	btrfs_cleanup_transaction(fs_info);
 }
 
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
@@ -4258,19 +4259,23 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 	list_splice_init(&root->delalloc_inodes, &splice);
 
 	while (!list_empty(&splice)) {
+		struct inode *inode = NULL;
 		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
 					       delalloc_inodes);
-
-		list_del_init(&btrfs_inode->delalloc_inodes);
-		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-			  &btrfs_inode->runtime_flags);
+		__btrfs_del_delalloc_inode(root, btrfs_inode);
 		spin_unlock(&root->delalloc_lock);
 
-		btrfs_invalidate_inodes(btrfs_inode->root);
-
+		/*
+		 * Make sure we get a live inode and that it'll not disappear
+		 * meanwhile.
+		 */
+		inode = igrab(&btrfs_inode->vfs_inode);
+		if (inode) {
+			invalidate_inode_pages2(inode->i_mapping);
+			iput(inode);
+		}
 		spin_lock(&root->delalloc_lock);
 	}
-
 	spin_unlock(&root->delalloc_lock);
 }
 
@@ -4286,7 +4291,6 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
 	while (!list_empty(&splice)) {
 		root = list_first_entry(&splice, struct btrfs_root,
 					 delalloc_root);
-		list_del_init(&root->delalloc_root);
 		root = btrfs_grab_fs_root(root);
 		BUG_ON(!root);
 		spin_unlock(&fs_info->delalloc_root_lock);

From 02ee654d3a04563c67bfe658a05384548b9bb105 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 17 May 2018 15:16:51 +0800
Subject: [PATCH 284/393] btrfs: fix crash when trying to resume balance
 without the resume flag

We set the BTRFS_BALANCE_RESUME flag in the btrfs_recover_balance()
only, which isn't called during the remount. So when resuming from
the paused balance we hit the bug:

 kernel: kernel BUG at fs/btrfs/volumes.c:3890!
 ::
 kernel:  balance_kthread+0x51/0x60 [btrfs]
 kernel:  kthread+0x111/0x130
 ::
 kernel: RIP: btrfs_balance+0x12e1/0x1570 [btrfs] RSP: ffffba7d0090bde8

Reproducer:
  On a mounted filesystem:

  btrfs balance start --full-balance /btrfs
  btrfs balance pause /btrfs
  mount -o remount,ro /dev/sdb /btrfs
  mount -o remount,rw /dev/sdb /btrfs

To fix this set the BTRFS_BALANCE_RESUME flag in
btrfs_resume_balance_async().

CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 292266f6ab9c..be3fc701f389 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4052,6 +4052,15 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 		return 0;
 	}
 
+	/*
+	 * A ro->rw remount sequence should continue with the paused balance
+	 * regardless of who pauses it, system or the user as of now, so set
+	 * the resume flag.
+	 */
+	spin_lock(&fs_info->balance_lock);
+	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
+	spin_unlock(&fs_info->balance_lock);
+
 	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
 	return PTR_ERR_OR_ZERO(tsk);
 }

From 22aac3eb0c465dd9ea7f06ee1ed8ad933890f2a3 Mon Sep 17 00:00:00 2001
From: Pierre-Yves MORDRET <pierre-yves.mordret@st.com>
Date: Fri, 11 May 2018 10:22:39 +0200
Subject: [PATCH 285/393] MAINTAINERS: add entry for STM32 I2C driver

Add I2C/SMBUS Driver entry for STM32 family from ST Microelectronics.

Signed-off-by: Pierre-Yves MORDRET <pierre-yves.mordret@st.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 MAINTAINERS | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index df6e9bb2559a..6b176cc27ebe 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13266,6 +13266,12 @@ M:	Jan-Benedict Glaw <jbglaw@lug-owl.de>
 S:	Maintained
 F:	arch/alpha/kernel/srm_env.c
 
+ST STM32 I2C/SMBUS DRIVER
+M:	Pierre-Yves MORDRET <pierre-yves.mordret@st.com>
+L:	linux-i2c@vger.kernel.org
+S:	Maintained
+F:	drivers/i2c/busses/i2c-stm32*
+
 STABLE BRANCH
 M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 L:	stable@vger.kernel.org

From c1d2a31397ec51f0370f6bd17b19b39152c263cb Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 15 May 2018 01:59:47 +1000
Subject: [PATCH 286/393] powerpc/powernv: Fix NVRAM sleep in invalid context
 when crashing

Similarly to opal_event_shutdown, opal_nvram_write can be called in
the crash path with irqs disabled. Special case the delay to avoid
sleeping in invalid context.

Fixes: 3b8070335f75 ("powerpc/powernv: Fix OPAL NVRAM driver OPAL_BUSY loops")
Cc: stable@vger.kernel.org # v3.2
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-nvram.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c
index 1bceb95f422d..5584247f5029 100644
--- a/arch/powerpc/platforms/powernv/opal-nvram.c
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -44,6 +44,10 @@ static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index)
 	return count;
 }
 
+/*
+ * This can be called in the panic path with interrupts off, so use
+ * mdelay in that case.
+ */
 static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index)
 {
 	s64 rc = OPAL_BUSY;
@@ -58,10 +62,16 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index)
 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_write_nvram(__pa(buf), count, off);
 		if (rc == OPAL_BUSY_EVENT) {
-			msleep(OPAL_BUSY_DELAY_MS);
+			if (in_interrupt() || irqs_disabled())
+				mdelay(OPAL_BUSY_DELAY_MS);
+			else
+				msleep(OPAL_BUSY_DELAY_MS);
 			opal_poll_events(NULL);
 		} else if (rc == OPAL_BUSY) {
-			msleep(OPAL_BUSY_DELAY_MS);
+			if (in_interrupt() || irqs_disabled())
+				mdelay(OPAL_BUSY_DELAY_MS);
+			else
+				msleep(OPAL_BUSY_DELAY_MS);
 		}
 	}
 

From 15e6c22fd8e5a42c5ed6d487b7c9fe44c2517765 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 11 May 2018 15:21:01 +0200
Subject: [PATCH 287/393] KVM: SVM: Move spec control call after restore of GS

svm_vcpu_run() invokes x86_spec_ctrl_restore_host() after VMEXIT, but
before the host GS is restored. x86_spec_ctrl_restore_host() uses 'current'
to determine the host SSBD state of the thread. 'current' is GS based, but
host GS is not yet restored and the access causes a triple fault.

Move the call after the host GS restore.

Fixes: 885f82bfbc6f x86/process: Allow runtime control of Speculative Store Bypass
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 437c1b371129..fa891c8af842 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5651,6 +5651,18 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 		);
 
+	/* Eliminate branch target predictions from guest mode */
+	vmexit_fill_RSB();
+
+#ifdef CONFIG_X86_64
+	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
+	loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+	loadsegment(gs, svm->host.gs);
+#endif
+#endif
+
 	/*
 	 * We do not use IBRS in the kernel. If this vCPU has used the
 	 * SPEC_CTRL MSR it may have left it on; save the value and
@@ -5671,18 +5683,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	x86_spec_ctrl_restore_host(svm->spec_ctrl);
 
-	/* Eliminate branch target predictions from guest mode */
-	vmexit_fill_RSB();
-
-#ifdef CONFIG_X86_64
-	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
-	loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
-	loadsegment(gs, svm->host.gs);
-#endif
-#endif
-
 	reload_tss(vcpu);
 
 	local_irq_disable();

From e7c587da125291db39ddf1f49b18e5970adbac17 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 2 May 2018 18:15:14 +0200
Subject: [PATCH 288/393] x86/speculation: Use synthetic bits for
 IBRS/IBPB/STIBP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Intel and AMD have different CPUID bits hence for those use synthetic bits
which get set on the respective vendor's in init_speculation_control(). So
that debacles like what the commit message of

  c65732e4f721 ("x86/cpu: Restore CPUID_8000_0008_EBX reload")

talks about don't happen anymore.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Jörg Otte <jrg.otte@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Link: https://lkml.kernel.org/r/20180504161815.GG9257@pd.tnic
---
 arch/x86/include/asm/cpufeatures.h | 10 ++++++----
 arch/x86/kernel/cpu/common.c       | 14 ++++++++++----
 arch/x86/kvm/cpuid.c               | 10 +++++-----
 arch/x86/kvm/svm.c                 |  6 +++---
 arch/x86/kvm/vmx.c                 |  9 ++-------
 5 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 4e1c747acbf8..cb9bca8cecdc 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -198,7 +198,6 @@
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
 #define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
-
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
@@ -216,6 +215,9 @@
 #define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
 #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
 #define X86_FEATURE_AMD_SSBD		( 7*32+24)  /* "" AMD SSBD implementation */
+#define X86_FEATURE_IBRS		( 7*32+25) /* Indirect Branch Restricted Speculation */
+#define X86_FEATURE_IBPB		( 7*32+26) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_STIBP		( 7*32+27) /* Single Thread Indirect Branch Predictors */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
@@ -276,9 +278,9 @@
 #define X86_FEATURE_CLZERO		(13*32+ 0) /* CLZERO instruction */
 #define X86_FEATURE_IRPERF		(13*32+ 1) /* Instructions Retired Count */
 #define X86_FEATURE_XSAVEERPTR		(13*32+ 2) /* Always save/restore FP error pointers */
-#define X86_FEATURE_IBPB		(13*32+12) /* Indirect Branch Prediction Barrier */
-#define X86_FEATURE_IBRS		(13*32+14) /* Indirect Branch Restricted Speculation */
-#define X86_FEATURE_STIBP		(13*32+15) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_IBPB		(13*32+12) /* "" Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS		(13*32+14) /* "" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP		(13*32+15) /* "" Single Thread Indirect Branch Predictors */
 
 /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
 #define X86_FEATURE_DTHERM		(14*32+ 0) /* Digital Thermal Sensor */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9fbb388fadac..e6cd38b20375 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -757,17 +757,23 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
 	 * and they also have a different bit for STIBP support. Also,
 	 * a hypervisor might have set the individual AMD bits even on
 	 * Intel CPUs, for finer-grained selection of what's available.
-	 *
-	 * We use the AMD bits in 0x8000_0008 EBX as the generic hardware
-	 * features, which are visible in /proc/cpuinfo and used by the
-	 * kernel. So set those accordingly from the Intel bits.
 	 */
 	if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
 		set_cpu_cap(c, X86_FEATURE_IBRS);
 		set_cpu_cap(c, X86_FEATURE_IBPB);
 	}
+
 	if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
 		set_cpu_cap(c, X86_FEATURE_STIBP);
+
+	if (cpu_has(c, X86_FEATURE_AMD_IBRS))
+		set_cpu_cap(c, X86_FEATURE_IBRS);
+
+	if (cpu_has(c, X86_FEATURE_AMD_IBPB))
+		set_cpu_cap(c, X86_FEATURE_IBPB);
+
+	if (cpu_has(c, X86_FEATURE_AMD_STIBP))
+		set_cpu_cap(c, X86_FEATURE_STIBP);
 }
 
 void get_cpu_cap(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 865c9a769864..e5ba48599428 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -379,7 +379,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 0x80000008.ebx */
 	const u32 kvm_cpuid_8000_0008_ebx_x86_features =
-		F(IBPB) | F(IBRS);
+		F(AMD_IBPB) | F(AMD_IBRS);
 
 	/* cpuid 0xC0000001.edx */
 	const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -648,10 +648,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->eax = g_phys_as | (virt_as << 8);
 		entry->edx = 0;
 		/* IBRS and IBPB aren't necessarily present in hardware cpuid */
-		if (boot_cpu_has(X86_FEATURE_IBPB))
-			entry->ebx |= F(IBPB);
-		if (boot_cpu_has(X86_FEATURE_IBRS))
-			entry->ebx |= F(IBRS);
+		if (boot_cpu_has(X86_FEATURE_AMD_IBPB))
+			entry->ebx |= F(AMD_IBPB);
+		if (boot_cpu_has(X86_FEATURE_AMD_IBRS))
+			entry->ebx |= F(AMD_IBRS);
 		entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
 		cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
 		break;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index fa891c8af842..1a7a811118a8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4108,7 +4108,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
 			return 1;
 
 		msr_info->data = svm->spec_ctrl;
@@ -4203,7 +4203,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
 			return 1;
 
 		/* The STIBP bit doesn't fault even if it's not advertised */
@@ -4230,7 +4230,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		break;
 	case MSR_IA32_PRED_CMD:
 		if (!msr->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
 			return 1;
 
 		if (data & ~PRED_CMD_IBPB)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9b8d80bf3889..210e1b63d4b5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3523,9 +3523,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		return kvm_get_msr_common(vcpu, msr_info);
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SSBD))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 			return 1;
 
 		msr_info->data = to_vmx(vcpu)->spec_ctrl;
@@ -3643,9 +3641,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SSBD))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 			return 1;
 
 		/* The STIBP bit doesn't fault even if it's not advertised */
@@ -3675,7 +3671,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_PRED_CMD:
 		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 			return 1;
 

From 7eb8956a7fec3c1f0abc2a5517dada99ccc8a961 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 May 2018 19:13:18 +0200
Subject: [PATCH 289/393] x86/cpufeatures: Disentangle MSR_SPEC_CTRL
 enumeration from IBRS

The availability of the SPEC_CTRL MSR is enumerated by a CPUID bit on
Intel and implied by IBRS or STIBP support on AMD. That's just confusing
and in case an AMD CPU has IBRS not supported because the underlying
problem has been fixed but has another bit valid in the SPEC_CTRL MSR,
the thing falls apart.

Add a synthetic feature bit X86_FEATURE_MSR_SPEC_CTRL to denote the
availability on both Intel and AMD.

While at it replace the boot_cpu_has() checks with static_cpu_has() where
possible. This prevents late microcode loading from exposing SPEC_CTRL, but
late loading is already very limited as it does not reevaluate the
mitigation options and other bits and pieces. Having static_cpu_has() is
the simplest and least fragile solution.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/kernel/cpu/bugs.c         | 18 +++++++++++-------
 arch/x86/kernel/cpu/common.c       |  9 +++++++--
 arch/x86/kernel/cpu/intel.c        |  1 +
 4 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cb9bca8cecdc..7d34eb0d3715 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -206,6 +206,7 @@
 #define X86_FEATURE_RETPOLINE_AMD	( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_CDP_L2		( 7*32+15) /* Code and Data Prioritization L2 */
+#define X86_FEATURE_MSR_SPEC_CTRL	( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
 
 #define X86_FEATURE_MBA			( 7*32+18) /* Memory Bandwidth Allocation */
 #define X86_FEATURE_RSB_CTXSW		( 7*32+19) /* "" Fill RSB on context switches */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bab05913f864..316cb24092a3 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -64,7 +64,7 @@ void __init check_bugs(void)
 	 * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
 	 * init code as it is not enumerated and depends on the family.
 	 */
-	if (boot_cpu_has(X86_FEATURE_IBRS))
+	if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
 		rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
 
 	/* Select the proper spectre mitigation before patching alternatives */
@@ -145,7 +145,7 @@ u64 x86_spec_ctrl_get_default(void)
 {
 	u64 msrval = x86_spec_ctrl_base;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+	if (static_cpu_has(X86_FEATURE_SPEC_CTRL))
 		msrval |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
 	return msrval;
 }
@@ -155,10 +155,12 @@ void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
 {
 	u64 host = x86_spec_ctrl_base;
 
-	if (!boot_cpu_has(X86_FEATURE_IBRS))
+	/* Is MSR_SPEC_CTRL implemented ? */
+	if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
 		return;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+	/* Intel controls SSB in MSR_SPEC_CTRL */
+	if (static_cpu_has(X86_FEATURE_SPEC_CTRL))
 		host |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
 
 	if (host != guest_spec_ctrl)
@@ -170,10 +172,12 @@ void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
 {
 	u64 host = x86_spec_ctrl_base;
 
-	if (!boot_cpu_has(X86_FEATURE_IBRS))
+	/* Is MSR_SPEC_CTRL implemented ? */
+	if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
 		return;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+	/* Intel controls SSB in MSR_SPEC_CTRL */
+	if (static_cpu_has(X86_FEATURE_SPEC_CTRL))
 		host |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
 
 	if (host != guest_spec_ctrl)
@@ -631,7 +635,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
 
 void x86_spec_ctrl_setup_ap(void)
 {
-	if (boot_cpu_has(X86_FEATURE_IBRS))
+	if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
 		x86_spec_ctrl_set(x86_spec_ctrl_base & ~x86_spec_ctrl_mask);
 
 	if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e6cd38b20375..af54dbe2df9a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -761,19 +761,24 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
 		set_cpu_cap(c, X86_FEATURE_IBRS);
 		set_cpu_cap(c, X86_FEATURE_IBPB);
+		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
 	}
 
 	if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
 		set_cpu_cap(c, X86_FEATURE_STIBP);
 
-	if (cpu_has(c, X86_FEATURE_AMD_IBRS))
+	if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
 		set_cpu_cap(c, X86_FEATURE_IBRS);
+		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+	}
 
 	if (cpu_has(c, X86_FEATURE_AMD_IBPB))
 		set_cpu_cap(c, X86_FEATURE_IBPB);
 
-	if (cpu_has(c, X86_FEATURE_AMD_STIBP))
+	if (cpu_has(c, X86_FEATURE_AMD_STIBP)) {
 		set_cpu_cap(c, X86_FEATURE_STIBP);
+		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+	}
 }
 
 void get_cpu_cap(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0eab6c89c8d9..dd37244c587a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -188,6 +188,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		setup_clear_cpu_cap(X86_FEATURE_IBPB);
 		setup_clear_cpu_cap(X86_FEATURE_STIBP);
 		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+		setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
 		setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
 		setup_clear_cpu_cap(X86_FEATURE_SSBD);
 	}

From 52817587e706686fcdb27f14c1b000c92f266c96 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 May 2018 20:21:36 +0200
Subject: [PATCH 290/393] x86/cpufeatures: Disentangle SSBD enumeration

The SSBD enumeration is similarly to the other bits magically shared
between Intel and AMD though the mechanisms are different.

Make X86_FEATURE_SSBD synthetic and set it depending on the vendor specific
features or family dependent setup.

Change the Intel bit to X86_FEATURE_SPEC_CTRL_SSBD to denote that SSBD is
controlled via MSR_SPEC_CTRL and fix up the usage sites.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/cpufeatures.h |  7 +++----
 arch/x86/kernel/cpu/amd.c          |  7 +------
 arch/x86/kernel/cpu/bugs.c         | 10 +++++-----
 arch/x86/kernel/cpu/common.c       |  3 +++
 arch/x86/kernel/cpu/intel.c        |  1 +
 arch/x86/kernel/process.c          |  2 +-
 6 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 7d34eb0d3715..61c34c1a525c 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -207,15 +207,14 @@
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_CDP_L2		( 7*32+15) /* Code and Data Prioritization L2 */
 #define X86_FEATURE_MSR_SPEC_CTRL	( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
-
+#define X86_FEATURE_SSBD		( 7*32+17) /* Speculative Store Bypass Disable */
 #define X86_FEATURE_MBA			( 7*32+18) /* Memory Bandwidth Allocation */
 #define X86_FEATURE_RSB_CTXSW		( 7*32+19) /* "" Fill RSB on context switches */
 #define X86_FEATURE_SEV			( 7*32+20) /* AMD Secure Encrypted Virtualization */
-
 #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
 #define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
 #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
-#define X86_FEATURE_AMD_SSBD		( 7*32+24)  /* "" AMD SSBD implementation */
+#define X86_FEATURE_LS_CFG_SSBD		( 7*32+24)  /* "" AMD SSBD implementation via LS_CFG MSR */
 #define X86_FEATURE_IBRS		( 7*32+25) /* Indirect Branch Restricted Speculation */
 #define X86_FEATURE_IBPB		( 7*32+26) /* Indirect Branch Prediction Barrier */
 #define X86_FEATURE_STIBP		( 7*32+27) /* Single Thread Indirect Branch Predictors */
@@ -339,7 +338,7 @@
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
-#define X86_FEATURE_SSBD		(18*32+31) /* Speculative Store Bypass Disable */
+#define X86_FEATURE_SPEC_CTRL_SSBD	(18*32+31) /* "" Speculative Store Bypass Disable */
 
 /*
  * BUG word(s)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7bde990b0385..2d2d8985654b 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -570,8 +570,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		 * avoid RMW. If that faults, do not enable SSBD.
 		 */
 		if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+			setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
 			setup_force_cpu_cap(X86_FEATURE_SSBD);
-			setup_force_cpu_cap(X86_FEATURE_AMD_SSBD);
 			x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
 		}
 	}
@@ -919,11 +919,6 @@ static void init_amd(struct cpuinfo_x86 *c)
 	/* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
 	if (!cpu_has(c, X86_FEATURE_XENPV))
 		set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
-
-	if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) {
-		set_cpu_cap(c, X86_FEATURE_SSBD);
-		set_cpu_cap(c, X86_FEATURE_AMD_SSBD);
-	}
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 316cb24092a3..7ebd6373fc31 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -159,8 +159,8 @@ void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
 	if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
 		return;
 
-	/* Intel controls SSB in MSR_SPEC_CTRL */
-	if (static_cpu_has(X86_FEATURE_SPEC_CTRL))
+	/* SSBD controlled in MSR_SPEC_CTRL */
+	if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
 		host |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
 
 	if (host != guest_spec_ctrl)
@@ -176,8 +176,8 @@ void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
 	if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
 		return;
 
-	/* Intel controls SSB in MSR_SPEC_CTRL */
-	if (static_cpu_has(X86_FEATURE_SPEC_CTRL))
+	/* SSBD controlled in MSR_SPEC_CTRL */
+	if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
 		host |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
 
 	if (host != guest_spec_ctrl)
@@ -189,7 +189,7 @@ static void x86_amd_ssb_disable(void)
 {
 	u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
 
-	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
+	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
 		wrmsrl(MSR_AMD64_LS_CFG, msrval);
 }
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index af54dbe2df9a..68282514c025 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -767,6 +767,9 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
 		set_cpu_cap(c, X86_FEATURE_STIBP);
 
+	if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD))
+		set_cpu_cap(c, X86_FEATURE_SSBD);
+
 	if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
 		set_cpu_cap(c, X86_FEATURE_IBRS);
 		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index dd37244c587a..577e7f7ae273 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -191,6 +191,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
 		setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
 		setup_clear_cpu_cap(X86_FEATURE_SSBD);
+		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);
 	}
 
 	/*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b77a091bf3b8..d71ef7eaa7ef 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -283,7 +283,7 @@ static __always_inline void __speculative_store_bypass_update(unsigned long tifn
 {
 	u64 msr;
 
-	if (static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+	if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
 		msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
 		wrmsrl(MSR_AMD64_LS_CFG, msr);
 	} else {

From d1035d971829dcf80e8686ccde26f94b0a069472 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 May 2018 16:26:00 +0200
Subject: [PATCH 291/393] x86/cpufeatures: Add FEATURE_ZEN

Add a ZEN feature bit so family-dependent static_cpu_has() optimizations
can be built for ZEN.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 arch/x86/kernel/cpu/amd.c          | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 61c34c1a525c..8099be4fc3e1 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -218,6 +218,7 @@
 #define X86_FEATURE_IBRS		( 7*32+25) /* Indirect Branch Restricted Speculation */
 #define X86_FEATURE_IBPB		( 7*32+26) /* Indirect Branch Prediction Barrier */
 #define X86_FEATURE_STIBP		( 7*32+27) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_ZEN			( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2d2d8985654b..1b18be3f35a8 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -812,6 +812,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
 
 static void init_amd_zn(struct cpuinfo_x86 *c)
 {
+	set_cpu_cap(c, X86_FEATURE_ZEN);
 	/*
 	 * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
 	 * all up to and including B1.

From 1f50ddb4f4189243c05926b842dc1a0332195f31 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 9 May 2018 21:53:09 +0200
Subject: [PATCH 292/393] x86/speculation: Handle HT correctly on AMD

The AMD64_LS_CFG MSR is a per core MSR on Family 17H CPUs. That means when
hyperthreading is enabled the SSBD bit toggle needs to take both cores into
account. Otherwise the following situation can happen:

CPU0		CPU1

disable SSB
		disable SSB
		enable  SSB <- Enables it for the Core, i.e. for CPU0 as well

So after the SSB enable on CPU1 the task on CPU0 runs with SSB enabled
again.

On Intel the SSBD control is per core as well, but the synchronization
logic is implemented behind the per thread SPEC_CTRL MSR. It works like
this:

  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL

i.e. if one of the threads enables a mitigation then this affects both and
the mitigation is only disabled in the core when both threads disabled it.

Add the necessary synchronization logic for AMD family 17H. Unfortunately
that requires a spinlock to serialize the access to the MSR, but the locks
are only shared between siblings.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/spec-ctrl.h |   6 ++
 arch/x86/kernel/process.c        | 131 ++++++++++++++++++++++++++++---
 arch/x86/kernel/smpboot.c        |   5 ++
 3 files changed, 133 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index dc21209790bf..0cb49c4564b0 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -33,6 +33,12 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
 	return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
 }
 
+#ifdef CONFIG_SMP
+extern void speculative_store_bypass_ht_init(void);
+#else
+static inline void speculative_store_bypass_ht_init(void) { }
+#endif
+
 extern void speculative_store_bypass_update(void);
 
 #endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d71ef7eaa7ef..d6fe8648d3f6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -279,22 +279,135 @@ static inline void switch_to_bitmap(struct tss_struct *tss,
 	}
 }
 
+#ifdef CONFIG_SMP
+
+struct ssb_state {
+	struct ssb_state	*shared_state;
+	raw_spinlock_t		lock;
+	unsigned int		disable_state;
+	unsigned long		local_state;
+};
+
+#define LSTATE_SSB	0
+
+static DEFINE_PER_CPU(struct ssb_state, ssb_state);
+
+void speculative_store_bypass_ht_init(void)
+{
+	struct ssb_state *st = this_cpu_ptr(&ssb_state);
+	unsigned int this_cpu = smp_processor_id();
+	unsigned int cpu;
+
+	st->local_state = 0;
+
+	/*
+	 * Shared state setup happens once on the first bringup
+	 * of the CPU. It's not destroyed on CPU hotunplug.
+	 */
+	if (st->shared_state)
+		return;
+
+	raw_spin_lock_init(&st->lock);
+
+	/*
+	 * Go over HT siblings and check whether one of them has set up the
+	 * shared state pointer already.
+	 */
+	for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
+		if (cpu == this_cpu)
+			continue;
+
+		if (!per_cpu(ssb_state, cpu).shared_state)
+			continue;
+
+		/* Link it to the state of the sibling: */
+		st->shared_state = per_cpu(ssb_state, cpu).shared_state;
+		return;
+	}
+
+	/*
+	 * First HT sibling to come up on the core.  Link shared state of
+	 * the first HT sibling to itself. The siblings on the same core
+	 * which come up later will see the shared state pointer and link
+	 * themself to the state of this CPU.
+	 */
+	st->shared_state = st;
+}
+
+/*
+ * Logic is: First HT sibling enables SSBD for both siblings in the core
+ * and last sibling to disable it, disables it for the whole core. This how
+ * MSR_SPEC_CTRL works in "hardware":
+ *
+ *  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
+ */
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+	struct ssb_state *st = this_cpu_ptr(&ssb_state);
+	u64 msr = x86_amd_ls_cfg_base;
+
+	if (!static_cpu_has(X86_FEATURE_ZEN)) {
+		msr |= ssbd_tif_to_amd_ls_cfg(tifn);
+		wrmsrl(MSR_AMD64_LS_CFG, msr);
+		return;
+	}
+
+	if (tifn & _TIF_SSBD) {
+		/*
+		 * Since this can race with prctl(), block reentry on the
+		 * same CPU.
+		 */
+		if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
+			return;
+
+		msr |= x86_amd_ls_cfg_ssbd_mask;
+
+		raw_spin_lock(&st->shared_state->lock);
+		/* First sibling enables SSBD: */
+		if (!st->shared_state->disable_state)
+			wrmsrl(MSR_AMD64_LS_CFG, msr);
+		st->shared_state->disable_state++;
+		raw_spin_unlock(&st->shared_state->lock);
+	} else {
+		if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
+			return;
+
+		raw_spin_lock(&st->shared_state->lock);
+		st->shared_state->disable_state--;
+		if (!st->shared_state->disable_state)
+			wrmsrl(MSR_AMD64_LS_CFG, msr);
+		raw_spin_unlock(&st->shared_state->lock);
+	}
+}
+#else
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+	u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
+
+	wrmsrl(MSR_AMD64_LS_CFG, msr);
+}
+#endif
+
+static __always_inline void intel_set_ssb_state(unsigned long tifn)
+{
+	u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
+
+	wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+}
+
 static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
 {
-	u64 msr;
-
-	if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
-		msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
-		wrmsrl(MSR_AMD64_LS_CFG, msr);
-	} else {
-		msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
-		wrmsrl(MSR_IA32_SPEC_CTRL, msr);
-	}
+	if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+		amd_set_core_ssb_state(tifn);
+	else
+		intel_set_ssb_state(tifn);
 }
 
 void speculative_store_bypass_update(void)
 {
+	preempt_disable();
 	__speculative_store_bypass_update(current_thread_info()->flags);
+	preempt_enable();
 }
 
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0f1cbb042f49..9dd324ae4832 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -79,6 +79,7 @@
 #include <asm/qspinlock.h>
 #include <asm/intel-family.h>
 #include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
 
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
@@ -244,6 +245,8 @@ static void notrace start_secondary(void *unused)
 	 */
 	check_tsc_sync_target();
 
+	speculative_store_bypass_ht_init();
+
 	/*
 	 * Lock vector_lock, set CPU online and bring the vector
 	 * allocator online. Online must be set with vector_lock held
@@ -1292,6 +1295,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	set_mtrr_aps_delayed_init();
 
 	smp_quirk_init_udelay();
+
+	speculative_store_bypass_ht_init();
 }
 
 void arch_enable_nonboot_cpus_begin(void)

From ccbcd2674472a978b48c91c1fbfb66c0ff959f24 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 9 May 2018 23:01:01 +0200
Subject: [PATCH 293/393] x86/bugs, KVM: Extend speculation control for
 VIRT_SPEC_CTRL

AMD is proposing a VIRT_SPEC_CTRL MSR to handle the Speculative Store
Bypass Disable via MSR_AMD64_LS_CFG so that guests do not have to care
about the bit position of the SSBD bit and thus facilitate migration.
Also, the sibling coordination on Family 17H CPUs can only be done on
the host.

Extend x86_spec_ctrl_set_guest() and x86_spec_ctrl_restore_host() with an
extra argument for the VIRT_SPEC_CTRL MSR.

Hand in 0 from VMX and in SVM add a new virt_spec_ctrl member to the CPU
data structure which is going to be used in later patches for the actual
implementation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/spec-ctrl.h |  9 ++++++---
 arch/x86/kernel/cpu/bugs.c       | 20 ++++++++++++++++++--
 arch/x86/kvm/svm.c               | 11 +++++++++--
 arch/x86/kvm/vmx.c               |  4 ++--
 4 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index 0cb49c4564b0..6e2874049afd 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -10,10 +10,13 @@
  * the guest has, while on VMEXIT we restore the host view. This
  * would be easier if SPEC_CTRL were architecturally maskable or
  * shadowable for guests but this is not (currently) the case.
- * Takes the guest view of SPEC_CTRL MSR as a parameter.
+ * Takes the guest view of SPEC_CTRL MSR as a parameter and also
+ * the guest's version of VIRT_SPEC_CTRL, if emulated.
  */
-extern void x86_spec_ctrl_set_guest(u64);
-extern void x86_spec_ctrl_restore_host(u64);
+extern void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl,
+				    u64 guest_virt_spec_ctrl);
+extern void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl,
+				       u64 guest_virt_spec_ctrl);
 
 /* AMD specific Speculative Store Bypass MSR data */
 extern u64 x86_amd_ls_cfg_base;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 7ebd6373fc31..d3afd38f30d1 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -151,7 +151,15 @@ u64 x86_spec_ctrl_get_default(void)
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
 
-void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
+/**
+ * x86_spec_ctrl_set_guest - Set speculation control registers for the guest
+ * @guest_spec_ctrl:		The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl:	The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ *				(may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
 {
 	u64 host = x86_spec_ctrl_base;
 
@@ -168,7 +176,15 @@ void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_set_guest);
 
-void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
+/**
+ * x86_spec_ctrl_restore_host - Restore host speculation control registers
+ * @guest_spec_ctrl:		The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl:	The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ *				(may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
 {
 	u64 host = x86_spec_ctrl_base;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1a7a811118a8..c07dbcc6d449 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -213,6 +213,12 @@ struct vcpu_svm {
 	} host;
 
 	u64 spec_ctrl;
+	/*
+	 * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+	 * translated into the appropriate L2_CFG bits on the host to
+	 * perform speculative control.
+	 */
+	u64 virt_spec_ctrl;
 
 	u32 *msrpm;
 
@@ -2060,6 +2066,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	vcpu->arch.microcode_version = 0x01000065;
 	svm->spec_ctrl = 0;
+	svm->virt_spec_ctrl = 0;
 
 	if (!init_event) {
 		svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
@@ -5557,7 +5564,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	x86_spec_ctrl_set_guest(svm->spec_ctrl);
+	x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
 	asm volatile (
 		"push %%" _ASM_BP "; \n\t"
@@ -5681,7 +5688,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	x86_spec_ctrl_restore_host(svm->spec_ctrl);
+	x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
 
 	reload_tss(vcpu);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 210e1b63d4b5..5d733a03a6fa 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9717,7 +9717,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	x86_spec_ctrl_set_guest(vmx->spec_ctrl);
+	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
 
@@ -9865,7 +9865,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	x86_spec_ctrl_restore_host(vmx->spec_ctrl);
+	x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();

From 11fb0683493b2da112cd64c9dada221b52463bf7 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Thu, 17 May 2018 17:09:18 +0200
Subject: [PATCH 294/393] x86/speculation: Add virtualized speculative store
 bypass disable support

Some AMD processors only support a non-architectural means of enabling
speculative store bypass disable (SSBD).  To allow a simplified view of
this to a guest, an architectural definition has been created through a new
CPUID bit, 0x80000008_EBX[25], and a new MSR, 0xc001011f.  With this, a
hypervisor can virtualize the existence of this definition and provide an
architectural method for using SSBD to a guest.

Add the new CPUID feature, the new MSR and update the existing SSBD
support to use this MSR when present.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/msr-index.h   |  2 ++
 arch/x86/kernel/cpu/bugs.c         |  4 +++-
 arch/x86/kernel/process.c          | 13 ++++++++++++-
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 8099be4fc3e1..fb00a2fca990 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -282,6 +282,7 @@
 #define X86_FEATURE_AMD_IBPB		(13*32+12) /* "" Indirect Branch Prediction Barrier */
 #define X86_FEATURE_AMD_IBRS		(13*32+14) /* "" Indirect Branch Restricted Speculation */
 #define X86_FEATURE_AMD_STIBP		(13*32+15) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_VIRT_SSBD		(13*32+25) /* Virtualized Speculative Store Bypass Disable */
 
 /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
 #define X86_FEATURE_DTHERM		(14*32+ 0) /* Digital Thermal Sensor */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 0da3ca260b06..562414d5b834 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -347,6 +347,8 @@
 #define MSR_AMD64_SEV_ENABLED_BIT	0
 #define MSR_AMD64_SEV_ENABLED		BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
 
+#define MSR_AMD64_VIRT_SPEC_CTRL	0xc001011f
+
 /* Fam 17h MSRs */
 #define MSR_F17H_IRPERF			0xc00000e9
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d3afd38f30d1..82422a04b506 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -205,7 +205,9 @@ static void x86_amd_ssb_disable(void)
 {
 	u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
 
-	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+	if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+		wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+	else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
 		wrmsrl(MSR_AMD64_LS_CFG, msrval);
 }
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d6fe8648d3f6..91c3398286d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -388,6 +388,15 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
 }
 #endif
 
+static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
+{
+	/*
+	 * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
+	 * so ssbd_tif_to_spec_ctrl() just works.
+	 */
+	wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
+}
+
 static __always_inline void intel_set_ssb_state(unsigned long tifn)
 {
 	u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
@@ -397,7 +406,9 @@ static __always_inline void intel_set_ssb_state(unsigned long tifn)
 
 static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
 {
-	if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+	if (static_cpu_has(X86_FEATURE_VIRT_SSBD))
+		amd_set_ssb_virt_state(tifn);
+	else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
 		amd_set_core_ssb_state(tifn);
 	else
 		intel_set_ssb_state(tifn);

From 0270be3e34efb05a88bc4c422572ece038ef3608 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 May 2018 20:31:44 +0200
Subject: [PATCH 295/393] x86/speculation: Rework
 speculative_store_bypass_update()

The upcoming support for the virtual SPEC_CTRL MSR on AMD needs to reuse
speculative_store_bypass_update() to avoid code duplication. Add an
argument for supplying a thread info (TIF) value and create a wrapper
speculative_store_bypass_update_current() which is used at the existing
call site.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/spec-ctrl.h | 7 ++++++-
 arch/x86/kernel/cpu/bugs.c       | 2 +-
 arch/x86/kernel/process.c        | 4 ++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index 6e2874049afd..82b6c5a0d61e 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -42,6 +42,11 @@ extern void speculative_store_bypass_ht_init(void);
 static inline void speculative_store_bypass_ht_init(void) { }
 #endif
 
-extern void speculative_store_bypass_update(void);
+extern void speculative_store_bypass_update(unsigned long tif);
+
+static inline void speculative_store_bypass_update_current(void)
+{
+	speculative_store_bypass_update(current_thread_info()->flags);
+}
 
 #endif
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 82422a04b506..f2f0c1b3bf50 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -598,7 +598,7 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
 	 * mitigation until it is next scheduled.
 	 */
 	if (task == current && update)
-		speculative_store_bypass_update();
+		speculative_store_bypass_update_current();
 
 	return 0;
 }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 91c3398286d8..30ca2d1a9231 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -414,10 +414,10 @@ static __always_inline void __speculative_store_bypass_update(unsigned long tifn
 		intel_set_ssb_state(tifn);
 }
 
-void speculative_store_bypass_update(void)
+void speculative_store_bypass_update(unsigned long tif)
 {
 	preempt_disable();
-	__speculative_store_bypass_update(current_thread_info()->flags);
+	__speculative_store_bypass_update(tif);
 	preempt_enable();
 }
 

From cc69b34989210f067b2c51d5539b5f96ebcc3a01 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 12 May 2018 00:14:51 +0200
Subject: [PATCH 296/393] x86/bugs: Unify
 x86_spec_ctrl_{set_guest,restore_host}

Function bodies are very similar and are going to grow more almost
identical code. Add a bool arg to determine whether SPEC_CTRL is being set
for the guest or restored to the host.

No functional changes.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/spec-ctrl.h | 33 +++++++++++++++---
 arch/x86/kernel/cpu/bugs.c       | 58 ++++++++------------------------
 2 files changed, 43 insertions(+), 48 deletions(-)

diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index 82b6c5a0d61e..9cecbe5e57ee 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -13,10 +13,35 @@
  * Takes the guest view of SPEC_CTRL MSR as a parameter and also
  * the guest's version of VIRT_SPEC_CTRL, if emulated.
  */
-extern void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl,
-				    u64 guest_virt_spec_ctrl);
-extern void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl,
-				       u64 guest_virt_spec_ctrl);
+extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest);
+
+/**
+ * x86_spec_ctrl_set_guest - Set speculation control registers for the guest
+ * @guest_spec_ctrl:		The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl:	The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ *				(may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+static inline
+void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+{
+	x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true);
+}
+
+/**
+ * x86_spec_ctrl_restore_host - Restore host speculation control registers
+ * @guest_spec_ctrl:		The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl:	The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ *				(may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+static inline
+void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+{
+	x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false);
+}
 
 /* AMD specific Speculative Store Bypass MSR data */
 extern u64 x86_amd_ls_cfg_base;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index f2f0c1b3bf50..feb7d597c265 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -151,55 +151,25 @@ u64 x86_spec_ctrl_get_default(void)
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
 
-/**
- * x86_spec_ctrl_set_guest - Set speculation control registers for the guest
- * @guest_spec_ctrl:		The guest content of MSR_SPEC_CTRL
- * @guest_virt_spec_ctrl:	The guest controlled bits of MSR_VIRT_SPEC_CTRL
- *				(may get translated to MSR_AMD64_LS_CFG bits)
- *
- * Avoids writing to the MSR if the content/bits are the same
- */
-void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+void
+x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
 {
-	u64 host = x86_spec_ctrl_base;
+	struct thread_info *ti = current_thread_info();
+	u64 msr, host = x86_spec_ctrl_base;
 
 	/* Is MSR_SPEC_CTRL implemented ? */
-	if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
-		return;
+	if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
+		/* SSBD controlled in MSR_SPEC_CTRL */
+		if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+			host |= ssbd_tif_to_spec_ctrl(ti->flags);
 
-	/* SSBD controlled in MSR_SPEC_CTRL */
-	if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
-		host |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
-
-	if (host != guest_spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, guest_spec_ctrl);
+		if (host != guest_spec_ctrl) {
+			msr = setguest ? guest_spec_ctrl : host;
+			wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+		}
+	}
 }
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_set_guest);
-
-/**
- * x86_spec_ctrl_restore_host - Restore host speculation control registers
- * @guest_spec_ctrl:		The guest content of MSR_SPEC_CTRL
- * @guest_virt_spec_ctrl:	The guest controlled bits of MSR_VIRT_SPEC_CTRL
- *				(may get translated to MSR_AMD64_LS_CFG bits)
- *
- * Avoids writing to the MSR if the content/bits are the same
- */
-void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
-{
-	u64 host = x86_spec_ctrl_base;
-
-	/* Is MSR_SPEC_CTRL implemented ? */
-	if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
-		return;
-
-	/* SSBD controlled in MSR_SPEC_CTRL */
-	if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
-		host |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
-
-	if (host != guest_spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, host);
-}
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_restore_host);
+EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
 
 static void x86_amd_ssb_disable(void)
 {

From fa8ac4988249c38476f6ad678a4848a736373403 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 12 May 2018 20:49:16 +0200
Subject: [PATCH 297/393] x86/bugs: Expose x86_spec_ctrl_base directly

x86_spec_ctrl_base is the system wide default value for the SPEC_CTRL MSR.
x86_spec_ctrl_get_default() returns x86_spec_ctrl_base and was intended to
prevent modification to that variable. Though the variable is read only
after init and globaly visible already.

Remove the function and export the variable instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/nospec-branch.h | 16 +++++-----------
 arch/x86/include/asm/spec-ctrl.h     |  3 ---
 arch/x86/kernel/cpu/bugs.c           | 11 +----------
 3 files changed, 6 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index bc258e644e5e..8d9deec00de9 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -217,16 +217,7 @@ enum spectre_v2_mitigation {
 	SPECTRE_V2_IBRS,
 };
 
-/*
- * The Intel specification for the SPEC_CTRL MSR requires that we
- * preserve any already set reserved bits at boot time (e.g. for
- * future additions that this kernel is not currently aware of).
- * We then set any additional mitigation bits that we want
- * ourselves and always use this as the base for SPEC_CTRL.
- * We also use this when handling guest entry/exit as below.
- */
 extern void x86_spec_ctrl_set(u64);
-extern u64 x86_spec_ctrl_get_default(void);
 
 /* The Speculative Store Bypass disable variants */
 enum ssb_mitigation {
@@ -278,6 +269,9 @@ static inline void indirect_branch_prediction_barrier(void)
 	alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB);
 }
 
+/* The Intel SPEC CTRL MSR base value cache */
+extern u64 x86_spec_ctrl_base;
+
 /*
  * With retpoline, we must use IBRS to restrict branch prediction
  * before calling into firmware.
@@ -286,7 +280,7 @@ static inline void indirect_branch_prediction_barrier(void)
  */
 #define firmware_restrict_branch_speculation_start()			\
 do {									\
-	u64 val = x86_spec_ctrl_get_default() | SPEC_CTRL_IBRS;		\
+	u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS;			\
 									\
 	preempt_disable();						\
 	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
@@ -295,7 +289,7 @@ do {									\
 
 #define firmware_restrict_branch_speculation_end()			\
 do {									\
-	u64 val = x86_spec_ctrl_get_default();				\
+	u64 val = x86_spec_ctrl_base;					\
 									\
 	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
 			      X86_FEATURE_USE_IBRS_FW);			\
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index 9cecbe5e57ee..763d49710329 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -47,9 +47,6 @@ void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
 extern u64 x86_amd_ls_cfg_base;
 extern u64 x86_amd_ls_cfg_ssbd_mask;
 
-/* The Intel SPEC CTRL MSR base value cache */
-extern u64 x86_spec_ctrl_base;
-
 static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
 {
 	BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index feb7d597c265..00f51deba493 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -36,6 +36,7 @@ static void __init ssb_select_mitigation(void);
  * writes to SPEC_CTRL contain whatever reserved bits have been set.
  */
 u64 __ro_after_init x86_spec_ctrl_base;
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
 
 /*
  * The vendor and possibly platform specific bits which can be modified in
@@ -141,16 +142,6 @@ void x86_spec_ctrl_set(u64 val)
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_set);
 
-u64 x86_spec_ctrl_get_default(void)
-{
-	u64 msrval = x86_spec_ctrl_base;
-
-	if (static_cpu_has(X86_FEATURE_SPEC_CTRL))
-		msrval |= ssbd_tif_to_spec_ctrl(current_thread_info()->flags);
-	return msrval;
-}
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
-
 void
 x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
 {

From 4b59bdb569453a60b752b274ca61f009e37f4dae Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 12 May 2018 20:53:14 +0200
Subject: [PATCH 298/393] x86/bugs: Remove x86_spec_ctrl_set()

x86_spec_ctrl_set() is only used in bugs.c and the extra mask checks there
provide no real value as both call sites can just write x86_spec_ctrl_base
to MSR_SPEC_CTRL. x86_spec_ctrl_base is valid and does not need any extra
masking or checking.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/nospec-branch.h |  2 --
 arch/x86/kernel/cpu/bugs.c           | 13 ++-----------
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 8d9deec00de9..8b38df98548e 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -217,8 +217,6 @@ enum spectre_v2_mitigation {
 	SPECTRE_V2_IBRS,
 };
 
-extern void x86_spec_ctrl_set(u64);
-
 /* The Speculative Store Bypass disable variants */
 enum ssb_mitigation {
 	SPEC_STORE_BYPASS_NONE,
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 00f51deba493..e0b2e3b3301e 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -133,15 +133,6 @@ static const char *spectre_v2_strings[] = {
 static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
 	SPECTRE_V2_NONE;
 
-void x86_spec_ctrl_set(u64 val)
-{
-	if (val & x86_spec_ctrl_mask)
-		WARN_ONCE(1, "SPEC_CTRL MSR value 0x%16llx is unknown.\n", val);
-	else
-		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base | val);
-}
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_set);
-
 void
 x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
 {
@@ -503,7 +494,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
 		case X86_VENDOR_INTEL:
 			x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
 			x86_spec_ctrl_mask &= ~SPEC_CTRL_SSBD;
-			x86_spec_ctrl_set(SPEC_CTRL_SSBD);
+			wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
 			break;
 		case X86_VENDOR_AMD:
 			x86_amd_ssb_disable();
@@ -615,7 +606,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
 void x86_spec_ctrl_setup_ap(void)
 {
 	if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
-		x86_spec_ctrl_set(x86_spec_ctrl_base & ~x86_spec_ctrl_mask);
+		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
 
 	if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
 		x86_amd_ssb_disable();

From be6fcb5478e95bb1c91f489121238deb3abca46a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 12 May 2018 20:10:00 +0200
Subject: [PATCH 299/393] x86/bugs: Rework spec_ctrl base and mask logic

x86_spec_ctrL_mask is intended to mask out bits from a MSR_SPEC_CTRL value
which are not to be modified. However the implementation is not really used
and the bitmask was inverted to make a check easier, which was removed in
"x86/bugs: Remove x86_spec_ctrl_set()"

Aside of that it is missing the STIBP bit if it is supported by the
platform, so if the mask would be used in x86_virt_spec_ctrl() then it
would prevent a guest from setting STIBP.

Add the STIBP bit if supported and use the mask in x86_virt_spec_ctrl() to
sanitize the value which is supplied by the guest.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/bugs.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index e0b2e3b3301e..8e327bfec513 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -42,7 +42,7 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
  * The vendor and possibly platform specific bits which can be modified in
  * x86_spec_ctrl_base.
  */
-static u64 __ro_after_init x86_spec_ctrl_mask = ~SPEC_CTRL_IBRS;
+static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
 
 /*
  * AMD specific MSR info for Speculative Store Bypass control.
@@ -68,6 +68,10 @@ void __init check_bugs(void)
 	if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
 		rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
 
+	/* Allow STIBP in MSR_SPEC_CTRL if supported */
+	if (boot_cpu_has(X86_FEATURE_STIBP))
+		x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
+
 	/* Select the proper spectre mitigation before patching alternatives */
 	spectre_v2_select_mitigation();
 
@@ -136,18 +140,26 @@ static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
 void
 x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
 {
+	u64 msrval, guestval, hostval = x86_spec_ctrl_base;
 	struct thread_info *ti = current_thread_info();
-	u64 msr, host = x86_spec_ctrl_base;
 
 	/* Is MSR_SPEC_CTRL implemented ? */
 	if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
+		/*
+		 * Restrict guest_spec_ctrl to supported values. Clear the
+		 * modifiable bits in the host base value and or the
+		 * modifiable bits from the guest value.
+		 */
+		guestval = hostval & ~x86_spec_ctrl_mask;
+		guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
+
 		/* SSBD controlled in MSR_SPEC_CTRL */
 		if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
-			host |= ssbd_tif_to_spec_ctrl(ti->flags);
+			hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
 
-		if (host != guest_spec_ctrl) {
-			msr = setguest ? guest_spec_ctrl : host;
-			wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+		if (hostval != guestval) {
+			msrval = setguest ? guestval : hostval;
+			wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
 		}
 	}
 }
@@ -493,7 +505,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_INTEL:
 			x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
-			x86_spec_ctrl_mask &= ~SPEC_CTRL_SSBD;
+			x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
 			wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
 			break;
 		case X86_VENDOR_AMD:

From 47c61b3955cf712cadfc25635bf9bc174af030ea Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 May 2018 20:42:48 +0200
Subject: [PATCH 300/393] x86/speculation, KVM: Implement support for
 VIRT_SPEC_CTRL/LS_CFG

Add the necessary logic for supporting the emulated VIRT_SPEC_CTRL MSR to
x86_virt_spec_ctrl().  If either X86_FEATURE_LS_CFG_SSBD or
X86_FEATURE_VIRT_SPEC_CTRL is set then use the new guest_virt_spec_ctrl
argument to check whether the state must be modified on the host. The
update reuses speculative_store_bypass_update() so the ZEN-specific sibling
coordination can be reused.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/spec-ctrl.h |  6 ++++++
 arch/x86/kernel/cpu/bugs.c       | 30 ++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index 763d49710329..ae7c2c5cd7f0 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -53,6 +53,12 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
 	return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
 }
 
+static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
+{
+	BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+	return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
+}
+
 static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
 {
 	return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 8e327bfec513..7416fc206b4a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -162,6 +162,36 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
 			wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
 		}
 	}
+
+	/*
+	 * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+	 * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
+	 */
+	if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+	    !static_cpu_has(X86_FEATURE_VIRT_SSBD))
+		return;
+
+	/*
+	 * If the host has SSBD mitigation enabled, force it in the host's
+	 * virtual MSR value. If its not permanently enabled, evaluate
+	 * current's TIF_SSBD thread flag.
+	 */
+	if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE))
+		hostval = SPEC_CTRL_SSBD;
+	else
+		hostval = ssbd_tif_to_spec_ctrl(ti->flags);
+
+	/* Sanitize the guest value */
+	guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD;
+
+	if (hostval != guestval) {
+		unsigned long tif;
+
+		tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
+				 ssbd_spec_ctrl_to_tif(hostval);
+
+		speculative_store_bypass_update(tif);
+	}
 }
 EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
 

From bc226f07dcd3c9ef0b7f6236fe356ea4a9cb4769 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Thu, 10 May 2018 22:06:39 +0200
Subject: [PATCH 301/393] KVM: SVM: Implement VIRT_SPEC_CTRL support for SSBD

Expose the new virtualized architectural mechanism, VIRT_SSBD, for using
speculative store bypass disable (SSBD) under SVM.  This will allow guests
to use SSBD on hardware that uses non-architectural mechanisms for enabling
SSBD.

[ tglx: Folded the migration fixup from Paolo Bonzini ]

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kernel/cpu/common.c    |  3 ++-
 arch/x86/kvm/cpuid.c            | 11 +++++++++--
 arch/x86/kvm/svm.c              | 21 +++++++++++++++++++--
 arch/x86/kvm/vmx.c              | 18 +++++++++++++++---
 arch/x86/kvm/x86.c              | 13 ++++---------
 6 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c25775fad4ed..f4b2588865e9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -924,7 +924,7 @@ struct kvm_x86_ops {
 	int (*hardware_setup)(void);               /* __init */
 	void (*hardware_unsetup)(void);            /* __exit */
 	bool (*cpu_has_accelerated_tpr)(void);
-	bool (*cpu_has_high_real_mode_segbase)(void);
+	bool (*has_emulated_msr)(int index);
 	void (*cpuid_update)(struct kvm_vcpu *vcpu);
 
 	struct kvm *(*vm_alloc)(void);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68282514c025..b4247ed0c81e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -767,7 +767,8 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
 		set_cpu_cap(c, X86_FEATURE_STIBP);
 
-	if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD))
+	if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) ||
+	    cpu_has(c, X86_FEATURE_VIRT_SSBD))
 		set_cpu_cap(c, X86_FEATURE_SSBD);
 
 	if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e5ba48599428..ced851169730 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -379,7 +379,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 0x80000008.ebx */
 	const u32 kvm_cpuid_8000_0008_ebx_x86_features =
-		F(AMD_IBPB) | F(AMD_IBRS);
+		F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD);
 
 	/* cpuid 0xC0000001.edx */
 	const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -647,13 +647,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			g_phys_as = phys_as;
 		entry->eax = g_phys_as | (virt_as << 8);
 		entry->edx = 0;
-		/* IBRS and IBPB aren't necessarily present in hardware cpuid */
+		/*
+		 * IBRS, IBPB and VIRT_SSBD aren't necessarily present in
+		 * hardware cpuid
+		 */
 		if (boot_cpu_has(X86_FEATURE_AMD_IBPB))
 			entry->ebx |= F(AMD_IBPB);
 		if (boot_cpu_has(X86_FEATURE_AMD_IBRS))
 			entry->ebx |= F(AMD_IBRS);
+		if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+			entry->ebx |= F(VIRT_SSBD);
 		entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
 		cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+		if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+			entry->ebx |= F(VIRT_SSBD);
 		break;
 	}
 	case 0x80000019:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c07dbcc6d449..26110c202b19 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4120,6 +4120,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 		msr_info->data = svm->spec_ctrl;
 		break;
+	case MSR_AMD64_VIRT_SPEC_CTRL:
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+			return 1;
+
+		msr_info->data = svm->virt_spec_ctrl;
+		break;
 	case MSR_F15H_IC_CFG: {
 
 		int family, model;
@@ -4251,6 +4258,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 			break;
 		set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
 		break;
+	case MSR_AMD64_VIRT_SPEC_CTRL:
+		if (!msr->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+			return 1;
+
+		if (data & ~SPEC_CTRL_SSBD)
+			return 1;
+
+		svm->virt_spec_ctrl = data;
+		break;
 	case MSR_STAR:
 		svm->vmcb->save.star = data;
 		break;
@@ -5791,7 +5808,7 @@ static bool svm_cpu_has_accelerated_tpr(void)
 	return false;
 }
 
-static bool svm_has_high_real_mode_segbase(void)
+static bool svm_has_emulated_msr(int index)
 {
 	return true;
 }
@@ -7017,7 +7034,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.hardware_enable = svm_hardware_enable,
 	.hardware_disable = svm_hardware_disable,
 	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
-	.cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
+	.has_emulated_msr = svm_has_emulated_msr,
 
 	.vcpu_create = svm_create_vcpu,
 	.vcpu_free = svm_free_vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5d733a03a6fa..0c57fb4df2d1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9477,9 +9477,21 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
 
-static bool vmx_has_high_real_mode_segbase(void)
+static bool vmx_has_emulated_msr(int index)
 {
-	return enable_unrestricted_guest || emulate_invalid_guest_state;
+	switch (index) {
+	case MSR_IA32_SMBASE:
+		/*
+		 * We cannot do SMM unless we can run the guest in big
+		 * real mode.
+		 */
+		return enable_unrestricted_guest || emulate_invalid_guest_state;
+	case MSR_AMD64_VIRT_SPEC_CTRL:
+		/* This is AMD only.  */
+		return false;
+	default:
+		return true;
+	}
 }
 
 static bool vmx_mpx_supported(void)
@@ -12625,7 +12637,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.hardware_enable = hardware_enable,
 	.hardware_disable = hardware_disable,
 	.cpu_has_accelerated_tpr = report_flexpriority,
-	.cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
+	.has_emulated_msr = vmx_has_emulated_msr,
 
 	.vm_init = vmx_vm_init,
 	.vm_alloc = vmx_vm_alloc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 51ecd381793b..421a39e40d5e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1058,6 +1058,7 @@ static u32 emulated_msrs[] = {
 	MSR_SMI_COUNT,
 	MSR_PLATFORM_INFO,
 	MSR_MISC_FEATURES_ENABLES,
+	MSR_AMD64_VIRT_SPEC_CTRL,
 };
 
 static unsigned num_emulated_msrs;
@@ -2903,7 +2904,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		 * fringe case that is not enabled except via specific settings
 		 * of the module parameters.
 		 */
-		r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
+		r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
 		break;
 	case KVM_CAP_VAPIC:
 		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
@@ -4603,14 +4604,8 @@ static void kvm_init_msr_list(void)
 	num_msrs_to_save = j;
 
 	for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
-		switch (emulated_msrs[i]) {
-		case MSR_IA32_SMBASE:
-			if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
-				continue;
-			break;
-		default:
-			break;
-		}
+		if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+			continue;
 
 		if (j < i)
 			emulated_msrs[j] = emulated_msrs[i];

From 1c1a2ee1b53b006754073eefc65d2b2cedb5264b Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 17 May 2018 23:33:26 +0800
Subject: [PATCH 302/393] bcache: return 0 from bch_debug_init() if
 CONFIG_DEBUG_FS=n

Commit 539d39eb2708 ("bcache: fix wrong return value in bch_debug_init()")
returns the return value of debugfs_create_dir() to bcache_init(). When
CONFIG_DEBUG_FS=n, bch_debug_init() always returns 1 and makes
bcache_init() failedi.

This patch makes bch_debug_init() always returns 0 if CONFIG_DEBUG_FS=n,
so bcache can continue to work for the kernels which don't have debugfs
enanbled.

Changelog:
v4: Add Acked-by from Kent Overstreet.
v3: Use IS_ENABLED(CONFIG_DEBUG_FS) to replace #ifdef DEBUG_FS.
v2: Remove a warning information
v1: Initial version.

Fixes: Commit 539d39eb2708 ("bcache: fix wrong return value in bch_debug_init()")
Cc: stable@vger.kernel.org
Signed-off-by: Coly Li <colyli@suse.de>
Reported-by: Massimo B. <massimo.b@gmx.net>
Reported-by: Kai Krakow <kai@kaishome.de>
Tested-by: Kai Krakow <kai@kaishome.de>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/debug.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 4e63c6f6c04d..d030ce3025a6 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -250,7 +250,9 @@ void bch_debug_exit(void)
 
 int __init bch_debug_init(struct kobject *kobj)
 {
-	bcache_debug = debugfs_create_dir("bcache", NULL);
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
 
+	bcache_debug = debugfs_create_dir("bcache", NULL);
 	return IS_ERR_OR_NULL(bcache_debug);
 }

From 7f7ccc2ccc2e70c6054685f5e3522efa81556830 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Fri, 11 May 2018 08:11:44 +0200
Subject: [PATCH 303/393] proc: do not access cmdline nor environ from
 file-backed areas

proc_pid_cmdline_read() and environ_read() directly access the target
process' VM to retrieve the command line and environment. If this
process remaps these areas onto a file via mmap(), the requesting
process may experience various issues such as extra delays if the
underlying device is slow to respond.

Let's simply refuse to access file-backed areas in these functions.
For this we add a new FOLL_ANON gup flag that is passed to all calls
to access_remote_vm(). The code already takes care of such failures
(including unmapped areas). Accesses via /proc/pid/mem were not
changed though.

This was assigned CVE-2018-1120.

Note for stable backports: the patch may apply to kernels prior to 4.11
but silently miss one location; it must be checked that no call to
access_remote_vm() keeps zero as the last argument.

Reported-by: Qualys Security Advisory <qsa@qualys.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c     | 8 ++++----
 include/linux/mm.h | 1 +
 mm/gup.c           | 3 +++
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b2ede6abcdf..1a76d751cf3c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -261,7 +261,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
 	 * Inherently racy -- command line shares address space
 	 * with code and data.
 	 */
-	rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
+	rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON);
 	if (rv <= 0)
 		goto out_free_page;
 
@@ -279,7 +279,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
 			int nr_read;
 
 			_count = min3(count, len, PAGE_SIZE);
-			nr_read = access_remote_vm(mm, p, page, _count, 0);
+			nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
 			if (nr_read < 0)
 				rv = nr_read;
 			if (nr_read <= 0)
@@ -325,7 +325,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
 				bool final;
 
 				_count = min3(count, len, PAGE_SIZE);
-				nr_read = access_remote_vm(mm, p, page, _count, 0);
+				nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
 				if (nr_read < 0)
 					rv = nr_read;
 				if (nr_read <= 0)
@@ -946,7 +946,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
 		max_len = min_t(size_t, PAGE_SIZE, count);
 		this_len = min(max_len, this_len);
 
-		retval = access_remote_vm(mm, (env_start + src), page, this_len, 0);
+		retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
 
 		if (retval <= 0) {
 			ret = retval;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ac1f06a4be6..c080af584ddd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2493,6 +2493,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_MLOCK	0x1000	/* lock present pages */
 #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
 #define FOLL_COW	0x4000	/* internal GUP flag */
+#define FOLL_ANON	0x8000	/* don't do file mappings */
 
 static inline int vm_fault_to_errno(int vm_fault, int foll_flags)
 {
diff --git a/mm/gup.c b/mm/gup.c
index 76af4cfeaf68..541904a7c60f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -544,6 +544,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 	if (vm_flags & (VM_IO | VM_PFNMAP))
 		return -EFAULT;
 
+	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+		return -EFAULT;
+
 	if (write) {
 		if (!(vm_flags & VM_WRITE)) {
 			if (!(gup_flags & FOLL_FORCE))

From 633711e82878dc29083fc5d2605166755e25b57a Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 17 May 2018 17:54:24 +0300
Subject: [PATCH 304/393] kvm: rename KVM_HINTS_DEDICATED to KVM_HINTS_REALTIME

KVM_HINTS_DEDICATED seems to be somewhat confusing:

Guest doesn't really care whether it's the only task running on a host
CPU as long as it's not preempted.

And there are more reasons for Guest to be preempted than host CPU
sharing, for example, with memory overcommit it can get preempted on a
memory access, post copy migration can cause preemption, etc.

Let's call it KVM_HINTS_REALTIME which seems to better
match what guests expect.

Also, the flag most be set on all vCPUs - current guests assume this.
Note so in the documentation.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/cpuid.txt  | 6 +++---
 arch/x86/include/uapi/asm/kvm_para.h | 2 +-
 arch/x86/kernel/kvm.c                | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index d4f33eb805dd..ab022dcd0911 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -72,8 +72,8 @@ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
 
 flag                               || value || meaning
 ==================================================================================
-KVM_HINTS_DEDICATED                ||     0 || guest checks this feature bit to
-                                   ||       || determine if there is vCPU pinning
-                                   ||       || and there is no vCPU over-commitment,
+KVM_HINTS_REALTIME                 ||     0 || guest checks this feature bit to
+                                   ||       || determine that vCPUs are never
+                                   ||       || preempted for an unlimited time,
                                    ||       || allowing optimizations
 ----------------------------------------------------------------------------------
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 4c851ebb3ceb..0ede697c3961 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -29,7 +29,7 @@
 #define KVM_FEATURE_PV_TLB_FLUSH	9
 #define KVM_FEATURE_ASYNC_PF_VMEXIT	10
 
-#define KVM_HINTS_DEDICATED      0
+#define KVM_HINTS_REALTIME      0
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7867417cfaff..5b2300b818af 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -457,7 +457,7 @@ static void __init sev_map_percpu_data(void)
 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
 {
 	native_smp_prepare_cpus(max_cpus);
-	if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
 		static_branch_disable(&virt_spin_lock_key);
 }
 
@@ -553,7 +553,7 @@ static void __init kvm_guest_init(void)
 	}
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
-	    !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
+	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
 	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
 		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
 
@@ -649,7 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void)
 	int cpu;
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
-	    !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
+	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
 	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
 		for_each_possible_cpu(cpu) {
 			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
@@ -745,7 +745,7 @@ void __init kvm_spinlock_init(void)
 	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
 		return;
 
-	if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
 		return;
 
 	__pv_init_lock_hash();

From 8ab6ffba14a466c7298cb3fd5066d774d2977ad1 Mon Sep 17 00:00:00 2001
From: Matt Mullins <mmullins@fb.com>
Date: Wed, 16 May 2018 10:48:40 -0700
Subject: [PATCH 305/393] tls: don't use stack memory in a scatterlist

scatterlist code expects virt_to_page() to work, which fails with
CONFIG_VMAP_STACK=y.

Fixes: c46234ebb4d1e ("tls: RX path for ktls")
Signed-off-by: Matt Mullins <mmullins@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 3 +++
 net/tls/tls_sw.c  | 9 ++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index b400d0bb7448..f5fb16da3860 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -97,6 +97,9 @@ struct tls_sw_context {
 	u8 control;
 	bool decrypted;
 
+	char rx_aad_ciphertext[TLS_AAD_SPACE_SIZE];
+	char rx_aad_plaintext[TLS_AAD_SPACE_SIZE];
+
 	/* Sending context */
 	char aad_space[TLS_AAD_SPACE_SIZE];
 
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 71e79597f940..e1c93ce74e0f 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -680,7 +680,6 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
 	struct scatterlist *sgin = &sgin_arr[0];
 	struct strp_msg *rxm = strp_msg(skb);
 	int ret, nsg = ARRAY_SIZE(sgin_arr);
-	char aad_recv[TLS_AAD_SPACE_SIZE];
 	struct sk_buff *unused;
 
 	ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE,
@@ -698,13 +697,13 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
 	}
 
 	sg_init_table(sgin, nsg);
-	sg_set_buf(&sgin[0], aad_recv, sizeof(aad_recv));
+	sg_set_buf(&sgin[0], ctx->rx_aad_ciphertext, TLS_AAD_SPACE_SIZE);
 
 	nsg = skb_to_sgvec(skb, &sgin[1],
 			   rxm->offset + tls_ctx->rx.prepend_size,
 			   rxm->full_len - tls_ctx->rx.prepend_size);
 
-	tls_make_aad(aad_recv,
+	tls_make_aad(ctx->rx_aad_ciphertext,
 		     rxm->full_len - tls_ctx->rx.overhead_size,
 		     tls_ctx->rx.rec_seq,
 		     tls_ctx->rx.rec_seq_size,
@@ -803,12 +802,12 @@ int tls_sw_recvmsg(struct sock *sk,
 			if (to_copy <= len && page_count < MAX_SKB_FRAGS &&
 			    likely(!(flags & MSG_PEEK)))  {
 				struct scatterlist sgin[MAX_SKB_FRAGS + 1];
-				char unused[21];
 				int pages = 0;
 
 				zc = true;
 				sg_init_table(sgin, MAX_SKB_FRAGS + 1);
-				sg_set_buf(&sgin[0], unused, 13);
+				sg_set_buf(&sgin[0], ctx->rx_aad_plaintext,
+					   TLS_AAD_SPACE_SIZE);
 
 				err = zerocopy_from_iter(sk, &msg->msg_iter,
 							 to_copy, &pages,

From 5a847a6e1477be5bd3f94cc1b7708d7d4a7cd94c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 16 May 2018 13:36:40 -0700
Subject: [PATCH 306/393] net/ipv4: Initialize proto and ports in flow struct

Updating the FIB tracepoint for the recent change to allow rules using
the protocol and ports exposed a few places where the entries in the flow
struct are not initialized.

For __fib_validate_source add the call to fib4_rules_early_flow_dissect
since it is invoked for the input path. For netfilter, add the memset on
the flow struct to avoid future problems like this. In ip_route_input_slow
need to set the fields if the skb dissection does not happen.

Fixes: bfff4862653b ("net: fib_rules: support for match on ip_proto, sport and dport")
Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c           | 8 +++++++-
 net/ipv4/netfilter/ipt_rpfilter.c | 2 +-
 net/ipv4/route.c                  | 7 ++++++-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f05afaf3235c..4d622112bf95 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -326,10 +326,11 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 				 u8 tos, int oif, struct net_device *dev,
 				 int rpf, struct in_device *idev, u32 *itag)
 {
+	struct net *net = dev_net(dev);
+	struct flow_keys flkeys;
 	int ret, no_addr;
 	struct fib_result res;
 	struct flowi4 fl4;
-	struct net *net = dev_net(dev);
 	bool dev_match;
 
 	fl4.flowi4_oif = 0;
@@ -347,6 +348,11 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	no_addr = idev->ifa_list == NULL;
 
 	fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
+	if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
+		fl4.flowi4_proto = 0;
+		fl4.fl4_sport = 0;
+		fl4.fl4_dport = 0;
+	}
 
 	trace_fib_validate_source(dev, &fl4);
 
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index fd01f13c896a..12843c9ef142 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -89,10 +89,10 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
 			return true ^ invert;
 	}
 
+	memset(&flow, 0, sizeof(flow));
 	flow.flowi4_iif = LOOPBACK_IFINDEX;
 	flow.daddr = iph->saddr;
 	flow.saddr = rpfilter_get_saddr(iph->daddr);
-	flow.flowi4_oif = 0;
 	flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
 	flow.flowi4_tos = RT_TOS(iph->tos);
 	flow.flowi4_scope = RT_SCOPE_UNIVERSE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 29268efad247..2cfa1b518f8d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1961,8 +1961,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	fl4.saddr = saddr;
 	fl4.flowi4_uid = sock_net_uid(net, NULL);
 
-	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys))
+	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
 		flkeys = &_flkeys;
+	} else {
+		fl4.flowi4_proto = 0;
+		fl4.fl4_sport = 0;
+		fl4.fl4_dport = 0;
+	}
 
 	err = fib_lookup(net, &fl4, res, 0);
 	if (err != 0) {

From 4cf2ddf3e329d4ec1e3edda3465a202b5665be0e Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 16 May 2018 15:49:03 -0500
Subject: [PATCH 307/393] ibmvnic: Free coherent DMA memory if FW map failed

If the firmware map fails for whatever reason, remember to free
up the memory after.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 6e8d6a6f6aaf..9e08917cda1f 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -192,6 +192,7 @@ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter,
 	if (adapter->fw_done_rc) {
 		dev_err(dev, "Couldn't map long term buffer,rc = %d\n",
 			adapter->fw_done_rc);
+		dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
 		return -1;
 	}
 	return 0;

From 134bbe7f21f4455c8be64f945b8b67094f768de0 Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 16 May 2018 15:49:04 -0500
Subject: [PATCH 308/393] ibmvnic: Fix non-fatal firmware error reset

It is not necessary to disable interrupt lines here during a reset
to handle a non-fatal firmware error. Move that call within the code
block that handles the other cases that do require interrupts to be
disabled and re-enabled.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 9e08917cda1f..1b9c22f28fbf 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1822,9 +1822,8 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 			if (rc)
 				return rc;
 		}
+		ibmvnic_disable_irqs(adapter);
 	}
-
-	ibmvnic_disable_irqs(adapter);
 	adapter->state = VNIC_CLOSED;
 
 	if (reset_state == VNIC_CLOSED)

From 0718421389da3d4352f4538449e83316fbed4389 Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 16 May 2018 15:49:05 -0500
Subject: [PATCH 309/393] ibmvnic: Fix statistics buffers memory leak

Move initialization of statistics buffers from ibmvnic_init function
into ibmvnic_probe. In the current state, ibmvnic_init will be called
again during a device reset, resulting in the allocation of new
buffers without freeing the old ones.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 1b9c22f28fbf..4bb4646a5f92 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -4586,14 +4586,6 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter)
 		release_crq_queue(adapter);
 	}
 
-	rc = init_stats_buffers(adapter);
-	if (rc)
-		return rc;
-
-	rc = init_stats_token(adapter);
-	if (rc)
-		return rc;
-
 	return rc;
 }
 
@@ -4662,13 +4654,21 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 			goto ibmvnic_init_fail;
 	} while (rc == EAGAIN);
 
+	rc = init_stats_buffers(adapter);
+	if (rc)
+		goto ibmvnic_init_fail;
+
+	rc = init_stats_token(adapter);
+	if (rc)
+		goto ibmvnic_stats_fail;
+
 	netdev->mtu = adapter->req_mtu - ETH_HLEN;
 	netdev->min_mtu = adapter->min_mtu - ETH_HLEN;
 	netdev->max_mtu = adapter->max_mtu - ETH_HLEN;
 
 	rc = device_create_file(&dev->dev, &dev_attr_failover);
 	if (rc)
-		goto ibmvnic_init_fail;
+		goto ibmvnic_dev_file_err;
 
 	netif_carrier_off(netdev);
 	rc = register_netdev(netdev);
@@ -4687,6 +4687,12 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 ibmvnic_register_fail:
 	device_remove_file(&dev->dev, &dev_attr_failover);
 
+ibmvnic_dev_file_err:
+	release_stats_token(adapter);
+
+ibmvnic_stats_fail:
+	release_stats_buffers(adapter);
+
 ibmvnic_init_fail:
 	release_sub_crqs(adapter, 1);
 	release_crq_queue(adapter);

From fed71f7d98795ed0fa1d431910787f0f4a68324f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 17 May 2018 14:36:39 +0200
Subject: [PATCH 310/393] x86/apic/x2apic: Initialize cluster ID properly

Rick bisected a regression on large systems which use the x2apic cluster
mode for interrupt delivery to the commit wich reworked the cluster
management.

The problem is caused by a missing initialization of the clusterid field
in the shared cluster data structures. So all structures end up with
cluster ID 0 which only allows sharing between all CPUs which belong to
cluster 0. All other CPUs with a cluster ID > 0 cannot share the data
structure because they cannot find existing data with their cluster
ID. This causes malfunction with IPIs because IPIs are sent to the wrong
cluster and the caller waits for ever that the target CPU handles the IPI.

Add the missing initialization when a upcoming CPU is the first in a
cluster so that the later booting CPUs can find the data and share it for
proper operation.

Fixes: 023a611748fd ("x86/apic/x2apic: Simplify cluster management")
Reported-by: Rick Warner <rick@microway.com>
Bisected-by: Rick Warner <rick@microway.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Rick Warner <rick@microway.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1805171418210.1947@nanos.tec.linutronix.de
---
 arch/x86/kernel/apic/x2apic_cluster.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8b04234e010b..7685444a106b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -116,6 +116,7 @@ static void init_x2apic_ldr(void)
 			goto update;
 	}
 	cmsk = cluster_hotplug_mask;
+	cmsk->clusterid = cluster;
 	cluster_hotplug_mask = NULL;
 update:
 	this_cpu_write(cluster_masks, cmsk);

From 02f99df1875c11330cd0be69a40fa8ccd14749b2 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Wed, 16 May 2018 17:24:32 -0700
Subject: [PATCH 311/393] erspan: fix invalid erspan version.

ERSPAN only support version 1 and 2.  When packets send to an
erspan device which does not have proper version number set,
drop the packet.  In real case, we observe multicast packets
sent to the erspan pernet device, erspan0, which does not have
erspan version configured.

Reported-by: Greg Rose <gvrose8192@gmail.com>
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 4 +++-
 net/ipv6/ip6_gre.c | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9c169bb2444d..f200b304f76c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -722,10 +722,12 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 		erspan_build_header(skb, ntohl(tunnel->parms.o_key),
 				    tunnel->index,
 				    truncate, true);
-	else
+	else if (tunnel->erspan_ver == 2)
 		erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
 				       tunnel->dir, tunnel->hwid,
 				       truncate, true);
+	else
+		goto free_skb;
 
 	tunnel->parms.o_flags &= ~TUNNEL_KEY;
 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 69727bc168cb..a3e1dec96a74 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -979,11 +979,14 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 			erspan_build_header(skb, ntohl(t->parms.o_key),
 					    t->parms.index,
 					    truncate, false);
-		else
+		else if (t->parms.erspan_ver == 2)
 			erspan_build_header_v2(skb, ntohl(t->parms.o_key),
 					       t->parms.dir,
 					       t->parms.hwid,
 					       truncate, false);
+		else
+			goto tx_err;
+
 		fl6.daddr = t->parms.raddr;
 	}
 

From deea81228ba1c6e77722f534a3d63c50e1757877 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 17 May 2018 19:39:31 +0200
Subject: [PATCH 312/393] selftests/bpf: check return value of fopen in
 test_verifier.c

Commit 0a6748740368 ("selftests/bpf: Only run tests if !bpf_disabled")
forgot to check return value of fopen.

This caused some confusion, when running test_verifier (from
tools/testing/selftests/bpf/) on an older kernel (< v4.4) as it will
simply seqfault.

This fix avoids the segfault and prints an error, but allow program to
continue.  Given the sysctl was introduced in 1be7f75d1668 ("bpf:
enable non-root eBPF programs"), we know that the running kernel
cannot support unpriv, thus continue with unpriv_disabled = true.

Fixes: 0a6748740368 ("selftests/bpf: Only run tests if !bpf_disabled")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 3e7718b1a9ae..fd7de7eb329e 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -11713,6 +11713,11 @@ static void get_unpriv_disabled()
 	FILE *fd;
 
 	fd = fopen("/proc/sys/"UNPRIV_SYSCTL, "r");
+	if (!fd) {
+		perror("fopen /proc/sys/"UNPRIV_SYSCTL);
+		unpriv_disabled = true;
+		return;
+	}
 	if (fgets(buf, 2, fd) == buf && atoi(buf))
 		unpriv_disabled = true;
 	fclose(fd);

From 01b8d064d58b4c1f0eff47f8fe8a8508cb3b3840 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:10 +0200
Subject: [PATCH 313/393] net: ip6_gre: Request headroom in __gre6_xmit()

__gre6_xmit() pushes GRE headers before handing over to ip6_tnl_xmit()
for generic IP-in-IP processing. However it doesn't make sure that there
is enough headroom to push the header to. That can lead to the panic
cited below. (Reproducer below that).

Fix by requesting either needed_headroom if already primed, or just the
bare minimum needed for the header otherwise.

[  158.576725] kernel BUG at net/core/skbuff.c:104!
[  158.581510] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
[  158.587174] Modules linked in: act_mirred cls_matchall ip6_gre ip6_tunnel tunnel6 gre sch_ingress vrf veth x86_pkg_temp_thermal mlx_platform nfsd e1000e leds_mlxcpld
[  158.602268] CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 4.17.0-rc4-net_master-custom-139 #10
[  158.610938] Hardware name: Mellanox Technologies Ltd. "MSN2410-CB2F"/"SA000874", BIOS 4.6.5 03/08/2016
[  158.620426] RIP: 0010:skb_panic+0xc3/0x100
[  158.624586] RSP: 0018:ffff8801d3f27110 EFLAGS: 00010286
[  158.629882] RAX: 0000000000000082 RBX: ffff8801c02cc040 RCX: 0000000000000000
[  158.637127] RDX: 0000000000000082 RSI: dffffc0000000000 RDI: ffffed003a7e4e18
[  158.644366] RBP: ffff8801bfec8020 R08: ffffed003aabce19 R09: ffffed003aabce19
[  158.651574] R10: 000000000000000b R11: ffffed003aabce18 R12: ffff8801c364de66
[  158.658786] R13: 000000000000002c R14: 00000000000000c0 R15: ffff8801c364de68
[  158.666007] FS:  0000000000000000(0000) GS:ffff8801d5400000(0000) knlGS:0000000000000000
[  158.674212] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  158.680036] CR2: 00007f4b3702dcd0 CR3: 0000000003228002 CR4: 00000000001606e0
[  158.687228] Call Trace:
[  158.689752]  ? __gre6_xmit+0x246/0xd80 [ip6_gre]
[  158.694475]  ? __gre6_xmit+0x246/0xd80 [ip6_gre]
[  158.699141]  skb_push+0x78/0x90
[  158.702344]  __gre6_xmit+0x246/0xd80 [ip6_gre]
[  158.706872]  ip6gre_tunnel_xmit+0x3bc/0x610 [ip6_gre]
[  158.711992]  ? __gre6_xmit+0xd80/0xd80 [ip6_gre]
[  158.716668]  ? debug_check_no_locks_freed+0x210/0x210
[  158.721761]  ? print_irqtrace_events+0x120/0x120
[  158.726461]  ? sched_clock_cpu+0x18/0x210
[  158.730572]  ? sched_clock_cpu+0x18/0x210
[  158.734692]  ? cyc2ns_read_end+0x10/0x10
[  158.738705]  ? skb_network_protocol+0x76/0x200
[  158.743216]  ? netif_skb_features+0x1b2/0x550
[  158.747648]  dev_hard_start_xmit+0x137/0x770
[  158.752010]  sch_direct_xmit+0x2ef/0x5d0
[  158.755992]  ? pfifo_fast_dequeue+0x3fa/0x670
[  158.760460]  ? pfifo_fast_change_tx_queue_len+0x810/0x810
[  158.765975]  ? __lock_is_held+0xa0/0x160
[  158.770002]  __qdisc_run+0x39e/0xfc0
[  158.773673]  ? _raw_spin_unlock+0x29/0x40
[  158.777781]  ? pfifo_fast_enqueue+0x24b/0x3e0
[  158.782191]  ? sch_direct_xmit+0x5d0/0x5d0
[  158.786372]  ? pfifo_fast_dequeue+0x670/0x670
[  158.790818]  ? __dev_queue_xmit+0x172/0x1770
[  158.795195]  ? preempt_count_sub+0xf/0xd0
[  158.799313]  __dev_queue_xmit+0x410/0x1770
[  158.803512]  ? ___slab_alloc+0x605/0x930
[  158.807525]  ? ___slab_alloc+0x605/0x930
[  158.811540]  ? memcpy+0x34/0x50
[  158.814768]  ? netdev_pick_tx+0x1c0/0x1c0
[  158.818895]  ? __skb_clone+0x2fd/0x3d0
[  158.822712]  ? __copy_skb_header+0x270/0x270
[  158.827079]  ? rcu_read_lock_sched_held+0x93/0xa0
[  158.831903]  ? kmem_cache_alloc+0x344/0x4d0
[  158.836199]  ? skb_clone+0x123/0x230
[  158.839869]  ? skb_split+0x820/0x820
[  158.843521]  ? tcf_mirred+0x554/0x930 [act_mirred]
[  158.848407]  tcf_mirred+0x554/0x930 [act_mirred]
[  158.853104]  ? tcf_mirred_act_wants_ingress.part.2+0x10/0x10 [act_mirred]
[  158.860005]  ? __lock_acquire+0x706/0x26e0
[  158.864162]  ? mark_lock+0x13d/0xb40
[  158.867832]  tcf_action_exec+0xcf/0x2a0
[  158.871736]  tcf_classify+0xfa/0x340
[  158.875402]  __netif_receive_skb_core+0x8e1/0x1c60
[  158.880334]  ? nf_ingress+0x500/0x500
[  158.884059]  ? process_backlog+0x347/0x4b0
[  158.888241]  ? lock_acquire+0xd8/0x320
[  158.892050]  ? process_backlog+0x1b6/0x4b0
[  158.896228]  ? process_backlog+0xc2/0x4b0
[  158.900291]  process_backlog+0xc2/0x4b0
[  158.904210]  net_rx_action+0x5cc/0x980
[  158.908047]  ? napi_complete_done+0x2c0/0x2c0
[  158.912525]  ? rcu_read_unlock+0x80/0x80
[  158.916534]  ? __lock_is_held+0x34/0x160
[  158.920541]  __do_softirq+0x1d4/0x9d2
[  158.924308]  ? trace_event_raw_event_irq_handler_exit+0x140/0x140
[  158.930515]  run_ksoftirqd+0x1d/0x40
[  158.934152]  smpboot_thread_fn+0x32b/0x690
[  158.938299]  ? sort_range+0x20/0x20
[  158.941842]  ? preempt_count_sub+0xf/0xd0
[  158.945940]  ? schedule+0x5b/0x140
[  158.949412]  kthread+0x206/0x300
[  158.952689]  ? sort_range+0x20/0x20
[  158.956249]  ? kthread_stop+0x570/0x570
[  158.960164]  ret_from_fork+0x3a/0x50
[  158.963823] Code: 14 3e ff 8b 4b 78 55 4d 89 f9 41 56 41 55 48 c7 c7 a0 cf db 82 41 54 44 8b 44 24 2c 48 8b 54 24 30 48 8b 74 24 20 e8 16 94 13 ff <0f> 0b 48 c7 c7 60 8e 1f 85 48 83 c4 20 e8 55 ef a6 ff 89 74 24
[  158.983235] RIP: skb_panic+0xc3/0x100 RSP: ffff8801d3f27110
[  158.988935] ---[ end trace 5af56ee845aa6cc8 ]---
[  158.993641] Kernel panic - not syncing: Fatal exception in interrupt
[  159.000176] Kernel Offset: disabled
[  159.003767] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Reproducer:

	ip link add h1 type veth peer name swp1
	ip link add h3 type veth peer name swp3

	ip link set dev h1 up
	ip address add 192.0.2.1/28 dev h1

	ip link add dev vh3 type vrf table 20
	ip link set dev h3 master vh3
	ip link set dev vh3 up
	ip link set dev h3 up

	ip link set dev swp3 up
	ip address add dev swp3 2001:db8:2::1/64

	ip link set dev swp1 up
	tc qdisc add dev swp1 clsact

	ip link add name gt6 type ip6gretap \
		local 2001:db8:2::1 remote 2001:db8:2::2
	ip link set dev gt6 up

	sleep 1

	tc filter add dev swp1 ingress pref 1000 matchall skip_hw \
		action mirred egress mirror dev gt6
	ping -I h1 192.0.2.2

Fixes: c12b395a4664 ("gre: Support GRE over IPv6")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index a3e1dec96a74..020f74a743ee 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -698,6 +698,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 	else
 		fl6->daddr = tunnel->parms.raddr;
 
+	if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
+		return -ENOMEM;
+
 	/* Push GRE header. */
 	protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
 

From 5691484df961aff897d824bcc26cd1a2aa036b5b Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:15 +0200
Subject: [PATCH 314/393] net: ip6_gre: Fix headroom request in
 ip6erspan_tunnel_xmit()

dev->needed_headroom is not primed until ip6_tnl_xmit(), so it starts
out zero. Thus the call to skb_cow_head() fails to actually make sure
there's enough headroom to push the ERSPAN headers to. That can lead to
the panic cited below. (Reproducer below that).

Fix by requesting either needed_headroom if already primed, or just the
bare minimum needed for the header otherwise.

[  190.703567] kernel BUG at net/core/skbuff.c:104!
[  190.708384] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
[  190.714007] Modules linked in: act_mirred cls_matchall ip6_gre ip6_tunnel tunnel6 gre sch_ingress vrf veth x86_pkg_temp_thermal mlx_platform nfsd e1000e leds_mlxcpld
[  190.728975] CPU: 1 PID: 959 Comm: kworker/1:2 Not tainted 4.17.0-rc4-net_master-custom-139 #10
[  190.737647] Hardware name: Mellanox Technologies Ltd. "MSN2410-CB2F"/"SA000874", BIOS 4.6.5 03/08/2016
[  190.747006] Workqueue: ipv6_addrconf addrconf_dad_work
[  190.752222] RIP: 0010:skb_panic+0xc3/0x100
[  190.756358] RSP: 0018:ffff8801d54072f0 EFLAGS: 00010282
[  190.761629] RAX: 0000000000000085 RBX: ffff8801c1a8ecc0 RCX: 0000000000000000
[  190.768830] RDX: 0000000000000085 RSI: dffffc0000000000 RDI: ffffed003aa80e54
[  190.776025] RBP: ffff8801bd1ec5a0 R08: ffffed003aabce19 R09: ffffed003aabce19
[  190.783226] R10: 0000000000000001 R11: ffffed003aabce18 R12: ffff8801bf695dbe
[  190.790418] R13: 0000000000000084 R14: 00000000000006c0 R15: ffff8801bf695dc8
[  190.797621] FS:  0000000000000000(0000) GS:ffff8801d5400000(0000) knlGS:0000000000000000
[  190.805786] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  190.811582] CR2: 000055fa929aced0 CR3: 0000000003228004 CR4: 00000000001606e0
[  190.818790] Call Trace:
[  190.821264]  <IRQ>
[  190.823314]  ? ip6erspan_tunnel_xmit+0x5e4/0x1982 [ip6_gre]
[  190.828940]  ? ip6erspan_tunnel_xmit+0x5e4/0x1982 [ip6_gre]
[  190.834562]  skb_push+0x78/0x90
[  190.837749]  ip6erspan_tunnel_xmit+0x5e4/0x1982 [ip6_gre]
[  190.843219]  ? ip6gre_tunnel_ioctl+0xd90/0xd90 [ip6_gre]
[  190.848577]  ? debug_check_no_locks_freed+0x210/0x210
[  190.853679]  ? debug_check_no_locks_freed+0x210/0x210
[  190.858783]  ? print_irqtrace_events+0x120/0x120
[  190.863451]  ? sched_clock_cpu+0x18/0x210
[  190.867496]  ? cyc2ns_read_end+0x10/0x10
[  190.871474]  ? skb_network_protocol+0x76/0x200
[  190.875977]  dev_hard_start_xmit+0x137/0x770
[  190.880317]  ? do_raw_spin_trylock+0x6d/0xa0
[  190.884624]  sch_direct_xmit+0x2ef/0x5d0
[  190.888589]  ? pfifo_fast_dequeue+0x3fa/0x670
[  190.892994]  ? pfifo_fast_change_tx_queue_len+0x810/0x810
[  190.898455]  ? __lock_is_held+0xa0/0x160
[  190.902422]  __qdisc_run+0x39e/0xfc0
[  190.906041]  ? _raw_spin_unlock+0x29/0x40
[  190.910090]  ? pfifo_fast_enqueue+0x24b/0x3e0
[  190.914501]  ? sch_direct_xmit+0x5d0/0x5d0
[  190.918658]  ? pfifo_fast_dequeue+0x670/0x670
[  190.923047]  ? __dev_queue_xmit+0x172/0x1770
[  190.927365]  ? preempt_count_sub+0xf/0xd0
[  190.931421]  __dev_queue_xmit+0x410/0x1770
[  190.935553]  ? ___slab_alloc+0x605/0x930
[  190.939524]  ? print_irqtrace_events+0x120/0x120
[  190.944186]  ? memcpy+0x34/0x50
[  190.947364]  ? netdev_pick_tx+0x1c0/0x1c0
[  190.951428]  ? __skb_clone+0x2fd/0x3d0
[  190.955218]  ? __copy_skb_header+0x270/0x270
[  190.959537]  ? rcu_read_lock_sched_held+0x93/0xa0
[  190.964282]  ? kmem_cache_alloc+0x344/0x4d0
[  190.968520]  ? cyc2ns_read_end+0x10/0x10
[  190.972495]  ? skb_clone+0x123/0x230
[  190.976112]  ? skb_split+0x820/0x820
[  190.979747]  ? tcf_mirred+0x554/0x930 [act_mirred]
[  190.984582]  tcf_mirred+0x554/0x930 [act_mirred]
[  190.989252]  ? tcf_mirred_act_wants_ingress.part.2+0x10/0x10 [act_mirred]
[  190.996109]  ? __lock_acquire+0x706/0x26e0
[  191.000239]  ? sched_clock_cpu+0x18/0x210
[  191.004294]  tcf_action_exec+0xcf/0x2a0
[  191.008179]  tcf_classify+0xfa/0x340
[  191.011794]  __netif_receive_skb_core+0x8e1/0x1c60
[  191.016630]  ? debug_check_no_locks_freed+0x210/0x210
[  191.021732]  ? nf_ingress+0x500/0x500
[  191.025458]  ? process_backlog+0x347/0x4b0
[  191.029619]  ? print_irqtrace_events+0x120/0x120
[  191.034302]  ? lock_acquire+0xd8/0x320
[  191.038089]  ? process_backlog+0x1b6/0x4b0
[  191.042246]  ? process_backlog+0xc2/0x4b0
[  191.046303]  process_backlog+0xc2/0x4b0
[  191.050189]  net_rx_action+0x5cc/0x980
[  191.053991]  ? napi_complete_done+0x2c0/0x2c0
[  191.058386]  ? mark_lock+0x13d/0xb40
[  191.062001]  ? clockevents_program_event+0x6b/0x1d0
[  191.066922]  ? print_irqtrace_events+0x120/0x120
[  191.071593]  ? __lock_is_held+0xa0/0x160
[  191.075566]  __do_softirq+0x1d4/0x9d2
[  191.079282]  ? ip6_finish_output2+0x524/0x1460
[  191.083771]  do_softirq_own_stack+0x2a/0x40
[  191.087994]  </IRQ>
[  191.090130]  do_softirq.part.13+0x38/0x40
[  191.094178]  __local_bh_enable_ip+0x135/0x190
[  191.098591]  ip6_finish_output2+0x54d/0x1460
[  191.102916]  ? ip6_forward_finish+0x2f0/0x2f0
[  191.107314]  ? ip6_mtu+0x3c/0x2c0
[  191.110674]  ? ip6_finish_output+0x2f8/0x650
[  191.114992]  ? ip6_output+0x12a/0x500
[  191.118696]  ip6_output+0x12a/0x500
[  191.122223]  ? ip6_route_dev_notify+0x5b0/0x5b0
[  191.126807]  ? ip6_finish_output+0x650/0x650
[  191.131120]  ? ip6_fragment+0x1a60/0x1a60
[  191.135182]  ? icmp6_dst_alloc+0x26e/0x470
[  191.139317]  mld_sendpack+0x672/0x830
[  191.143021]  ? igmp6_mcf_seq_next+0x2f0/0x2f0
[  191.147429]  ? __local_bh_enable_ip+0x77/0x190
[  191.151913]  ipv6_mc_dad_complete+0x47/0x90
[  191.156144]  addrconf_dad_completed+0x561/0x720
[  191.160731]  ? addrconf_rs_timer+0x3a0/0x3a0
[  191.165036]  ? mark_held_locks+0xc9/0x140
[  191.169095]  ? __local_bh_enable_ip+0x77/0x190
[  191.173570]  ? addrconf_dad_work+0x50d/0xa20
[  191.177886]  ? addrconf_dad_work+0x529/0xa20
[  191.182194]  addrconf_dad_work+0x529/0xa20
[  191.186342]  ? addrconf_dad_completed+0x720/0x720
[  191.191088]  ? __lock_is_held+0xa0/0x160
[  191.195059]  ? process_one_work+0x45d/0xe20
[  191.199302]  ? process_one_work+0x51e/0xe20
[  191.203531]  ? rcu_read_lock_sched_held+0x93/0xa0
[  191.208279]  process_one_work+0x51e/0xe20
[  191.212340]  ? pwq_dec_nr_in_flight+0x200/0x200
[  191.216912]  ? get_lock_stats+0x4b/0xf0
[  191.220788]  ? preempt_count_sub+0xf/0xd0
[  191.224844]  ? worker_thread+0x219/0x860
[  191.228823]  ? do_raw_spin_trylock+0x6d/0xa0
[  191.233142]  worker_thread+0xeb/0x860
[  191.236848]  ? process_one_work+0xe20/0xe20
[  191.241095]  kthread+0x206/0x300
[  191.244352]  ? process_one_work+0xe20/0xe20
[  191.248587]  ? kthread_stop+0x570/0x570
[  191.252459]  ret_from_fork+0x3a/0x50
[  191.256082] Code: 14 3e ff 8b 4b 78 55 4d 89 f9 41 56 41 55 48 c7 c7 a0 cf db 82 41 54 44 8b 44 24 2c 48 8b 54 24 30 48 8b 74 24 20 e8 16 94 13 ff <0f> 0b 48 c7 c7 60 8e 1f 85 48 83 c4 20 e8 55 ef a6 ff 89 74 24
[  191.275327] RIP: skb_panic+0xc3/0x100 RSP: ffff8801d54072f0
[  191.281024] ---[ end trace 7ea51094e099e006 ]---
[  191.285724] Kernel panic - not syncing: Fatal exception in interrupt
[  191.292168] Kernel Offset: disabled
[  191.295697] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Reproducer:

	ip link add h1 type veth peer name swp1
	ip link add h3 type veth peer name swp3

	ip link set dev h1 up
	ip address add 192.0.2.1/28 dev h1

	ip link add dev vh3 type vrf table 20
	ip link set dev h3 master vh3
	ip link set dev vh3 up
	ip link set dev h3 up

	ip link set dev swp3 up
	ip address add dev swp3 2001:db8:2::1/64

	ip link set dev swp1 up
	tc qdisc add dev swp1 clsact

	ip link add name gt6 type ip6erspan \
		local 2001:db8:2::1 remote 2001:db8:2::2 oseq okey 123
	ip link set dev gt6 up

	sleep 1

	tc filter add dev swp1 ingress pref 1000 matchall skip_hw \
		action mirred egress mirror dev gt6
	ping -I h1 192.0.2.2

Fixes: e41c7c68ea77 ("ip6erspan: make sure enough headroom at xmit.")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 020f74a743ee..14c069b59a90 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -911,7 +911,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		truncate = true;
 	}
 
-	if (skb_cow_head(skb, dev->needed_headroom))
+	if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
 		goto tx_err;
 
 	t->parms.o_flags &= ~TUNNEL_KEY;

From a483373ead61e6079bc8ebe27e2dfdb2e3c1559f Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:27 +0200
Subject: [PATCH 315/393] net: ip6_gre: Split up ip6gre_tnl_link_config()

The function ip6gre_tnl_link_config() is used for setting up
configuration of both ip6gretap and ip6erspan tunnels. Split the
function into the common part and the route-lookup part. The latter then
takes the calculated header length as an argument. This split will allow
the patches down the line to sneak in a custom header length computation
for the ERSPAN tunnel.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 14c069b59a90..5d0a3b1ee0e9 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1025,12 +1025,11 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
-static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
+static void ip6gre_tnl_link_config_common(struct ip6_tnl *t)
 {
 	struct net_device *dev = t->dev;
 	struct __ip6_tnl_parm *p = &t->parms;
 	struct flowi6 *fl6 = &t->fl.u.ip6;
-	int t_hlen;
 
 	if (dev->type != ARPHRD_ETHER) {
 		memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
@@ -1057,12 +1056,13 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 		dev->flags |= IFF_POINTOPOINT;
 	else
 		dev->flags &= ~IFF_POINTOPOINT;
+}
 
-	t->tun_hlen = gre_calc_hlen(t->parms.o_flags);
-
-	t->hlen = t->encap_hlen + t->tun_hlen;
-
-	t_hlen = t->hlen + sizeof(struct ipv6hdr);
+static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu,
+					 int t_hlen)
+{
+	const struct __ip6_tnl_parm *p = &t->parms;
+	struct net_device *dev = t->dev;
 
 	if (p->flags & IP6_TNL_F_CAP_XMIT) {
 		int strict = (ipv6_addr_type(&p->raddr) &
@@ -1094,6 +1094,24 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 	}
 }
 
+static int ip6gre_calc_hlen(struct ip6_tnl *tunnel)
+{
+	int t_hlen;
+
+	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+
+	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+	tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	return t_hlen;
+}
+
+static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
+{
+	ip6gre_tnl_link_config_common(t);
+	ip6gre_tnl_link_config_route(t, set_mtu, ip6gre_calc_hlen(t));
+}
+
 static int ip6gre_tnl_change(struct ip6_tnl *t,
 	const struct __ip6_tnl_parm *p, int set_mtu)
 {
@@ -1387,11 +1405,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 		return ret;
 	}
 
-	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
-	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
-	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
-
-	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	t_hlen = ip6gre_calc_hlen(tunnel);
 	dev->mtu = ETH_DATA_LEN - t_hlen;
 	if (dev->type == ARPHRD_ETHER)
 		dev->mtu -= ETH_HLEN;

From a6465350ef495f5cbd76a3e505d25a01d648477e Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:33 +0200
Subject: [PATCH 316/393] net: ip6_gre: Split up ip6gre_tnl_change()

Split a reusable function ip6gre_tnl_copy_tnl_parm() from
ip6gre_tnl_change(). This will allow ERSPAN-specific code to
reuse the common parts while customizing the behavior for ERSPAN.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 5d0a3b1ee0e9..fac820b121ae 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1112,8 +1112,8 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 	ip6gre_tnl_link_config_route(t, set_mtu, ip6gre_calc_hlen(t));
 }
 
-static int ip6gre_tnl_change(struct ip6_tnl *t,
-	const struct __ip6_tnl_parm *p, int set_mtu)
+static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t,
+				     const struct __ip6_tnl_parm *p)
 {
 	t->parms.laddr = p->laddr;
 	t->parms.raddr = p->raddr;
@@ -1129,6 +1129,12 @@ static int ip6gre_tnl_change(struct ip6_tnl *t,
 	t->parms.o_flags = p->o_flags;
 	t->parms.fwmark = p->fwmark;
 	dst_cache_reset(&t->dst_cache);
+}
+
+static int ip6gre_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p,
+			     int set_mtu)
+{
+	ip6gre_tnl_copy_tnl_parm(t, p);
 	ip6gre_tnl_link_config(t, set_mtu);
 	return 0;
 }

From 7fa38a7c852ec99e3a7fc375eb2c21c50c2e46b8 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:39 +0200
Subject: [PATCH 317/393] net: ip6_gre: Split up ip6gre_newlink()

Extract from ip6gre_newlink() a reusable function
ip6gre_newlink_common(). The ip6gre_tnl_link_config() call needs to be
made customizable for ERSPAN, thus reorder it with calls to
ip6_tnl_change_mtu() and dev_hold(), and extract the whole tail to the
caller, ip6gre_newlink(). Thus enable an ERSPAN-specific _newlink()
function without a lot of duplicity.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index fac820b121ae..565e11162ac3 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1861,9 +1861,9 @@ static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
 	return ret;
 }
 
-static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
-			  struct nlattr *tb[], struct nlattr *data[],
-			  struct netlink_ext_ack *extack)
+static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
+				 struct nlattr *tb[], struct nlattr *data[],
+				 struct netlink_ext_ack *extack)
 {
 	struct ip6_tnl *nt;
 	struct net *net = dev_net(dev);
@@ -1900,18 +1900,30 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 	if (err)
 		goto out;
 
-	ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
-
 	if (tb[IFLA_MTU])
 		ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
 
 	dev_hold(dev);
-	ip6gre_tunnel_link(ign, nt);
 
 out:
 	return err;
 }
 
+static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
+			  struct nlattr *tb[], struct nlattr *data[],
+			  struct netlink_ext_ack *extack)
+{
+	int err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
+	struct ip6_tnl *nt = netdev_priv(dev);
+	struct net *net = dev_net(dev);
+
+	if (!err) {
+		ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
+		ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
+	}
+	return err;
+}
+
 static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
 			     struct nlattr *data[],
 			     struct netlink_ext_ack *extack)

From c8632fc30bb03aa0c3bd7bcce85355a10feb8149 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:45 +0200
Subject: [PATCH 318/393] net: ip6_gre: Split up ip6gre_changelink()

Extract from ip6gre_changelink() a reusable function
ip6gre_changelink_common(). This will allow introduction of
ERSPAN-specific _changelink() function with not a lot of code
duplication.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 565e11162ac3..41fd5735f931 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1924,37 +1924,52 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 	return err;
 }
 
-static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
-			     struct nlattr *data[],
-			     struct netlink_ext_ack *extack)
+static struct ip6_tnl *
+ip6gre_changelink_common(struct net_device *dev, struct nlattr *tb[],
+			 struct nlattr *data[], struct __ip6_tnl_parm *p_p,
+			 struct netlink_ext_ack *extack)
 {
 	struct ip6_tnl *t, *nt = netdev_priv(dev);
 	struct net *net = nt->net;
 	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
-	struct __ip6_tnl_parm p;
 	struct ip_tunnel_encap ipencap;
 
 	if (dev == ign->fb_tunnel_dev)
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 
 	if (ip6gre_netlink_encap_parms(data, &ipencap)) {
 		int err = ip6_tnl_encap_setup(nt, &ipencap);
 
 		if (err < 0)
-			return err;
+			return ERR_PTR(err);
 	}
 
-	ip6gre_netlink_parms(data, &p);
+	ip6gre_netlink_parms(data, p_p);
 
-	t = ip6gre_tunnel_locate(net, &p, 0);
+	t = ip6gre_tunnel_locate(net, p_p, 0);
 
 	if (t) {
 		if (t->dev != dev)
-			return -EEXIST;
+			return ERR_PTR(-EEXIST);
 	} else {
 		t = nt;
 	}
 
+	return t;
+}
+
+static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
+			     struct nlattr *data[],
+			     struct netlink_ext_ack *extack)
+{
+	struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id);
+	struct __ip6_tnl_parm p;
+	struct ip6_tnl *t;
+
+	t = ip6gre_changelink_common(dev, tb, data, &p, extack);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
 	ip6gre_tunnel_unlink(ign, t);
 	ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
 	ip6gre_tunnel_link(ign, t);

From 2d665034f239412927b1e71329f20f001c92da09 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:51 +0200
Subject: [PATCH 319/393] net: ip6_gre: Fix ip6erspan hlen calculation

Even though ip6erspan_tap_init() sets up hlen and tun_hlen according to
what ERSPAN needs, it goes ahead to call ip6gre_tnl_link_config() which
overwrites these settings with GRE-specific ones.

Similarly for changelink callbacks, which are handled by
ip6gre_changelink() calls ip6gre_tnl_change() calls
ip6gre_tnl_link_config() as well.

The difference ends up being 12 vs. 20 bytes, and this is generally not
a problem, because a 12-byte request likely ends up allocating more and
the extra 8 bytes are thus available. However correct it is not.

So replace the newlink and changelink callbacks with an ERSPAN-specific
ones, reusing the newly-introduced _common() functions.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 74 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 65 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 41fd5735f931..5162ecc45c20 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -81,6 +81,7 @@ static int ip6gre_tunnel_init(struct net_device *dev);
 static void ip6gre_tunnel_setup(struct net_device *dev);
 static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t);
 static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu);
+static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu);
 
 /* Tunnel hash table */
 
@@ -1754,6 +1755,19 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
 	.ndo_get_iflink = ip6_tnl_get_iflink,
 };
 
+static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel)
+{
+	int t_hlen;
+
+	tunnel->tun_hlen = 8;
+	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+		       erspan_hdr_len(tunnel->parms.erspan_ver);
+
+	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+	tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	return t_hlen;
+}
+
 static int ip6erspan_tap_init(struct net_device *dev)
 {
 	struct ip6_tnl *tunnel;
@@ -1777,12 +1791,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 		return ret;
 	}
 
-	tunnel->tun_hlen = 8;
-	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
-		       erspan_hdr_len(tunnel->parms.erspan_ver);
-	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
-
-	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	t_hlen = ip6erspan_calc_hlen(tunnel);
 	dev->mtu = ETH_DATA_LEN - t_hlen;
 	if (dev->type == ARPHRD_ETHER)
 		dev->mtu -= ETH_HLEN;
@@ -1790,7 +1799,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 		dev->mtu -= 8;
 
 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
-	ip6gre_tnl_link_config(tunnel, 1);
+	ip6erspan_tnl_link_config(tunnel, 1);
 
 	return 0;
 }
@@ -2121,6 +2130,53 @@ static void ip6erspan_tap_setup(struct net_device *dev)
 	netif_keep_dst(dev);
 }
 
+static int ip6erspan_newlink(struct net *src_net, struct net_device *dev,
+			     struct nlattr *tb[], struct nlattr *data[],
+			     struct netlink_ext_ack *extack)
+{
+	int err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
+	struct ip6_tnl *nt = netdev_priv(dev);
+	struct net *net = dev_net(dev);
+
+	if (!err) {
+		ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]);
+		ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
+	}
+	return err;
+}
+
+static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu)
+{
+	ip6gre_tnl_link_config_common(t);
+	ip6gre_tnl_link_config_route(t, set_mtu, ip6erspan_calc_hlen(t));
+}
+
+static int ip6erspan_tnl_change(struct ip6_tnl *t,
+				const struct __ip6_tnl_parm *p, int set_mtu)
+{
+	ip6gre_tnl_copy_tnl_parm(t, p);
+	ip6erspan_tnl_link_config(t, set_mtu);
+	return 0;
+}
+
+static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
+				struct nlattr *data[],
+				struct netlink_ext_ack *extack)
+{
+	struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id);
+	struct __ip6_tnl_parm p;
+	struct ip6_tnl *t;
+
+	t = ip6gre_changelink_common(dev, tb, data, &p, extack);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	ip6gre_tunnel_unlink(ign, t);
+	ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]);
+	ip6gre_tunnel_link(ign, t);
+	return 0;
+}
+
 static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
 	.kind		= "ip6gre",
 	.maxtype	= IFLA_GRE_MAX,
@@ -2157,8 +2213,8 @@ static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = {
 	.priv_size	= sizeof(struct ip6_tnl),
 	.setup		= ip6erspan_tap_setup,
 	.validate	= ip6erspan_tap_validate,
-	.newlink	= ip6gre_newlink,
-	.changelink	= ip6gre_changelink,
+	.newlink	= ip6erspan_newlink,
+	.changelink	= ip6erspan_changelink,
 	.get_size	= ip6gre_get_size,
 	.fill_info	= ip6gre_fill_info,
 	.get_link_net	= ip6_tnl_get_link_net,

From 113f99c3358564a0647d444c2ae34e8b1abfd5b9 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 17 May 2018 13:13:29 -0400
Subject: [PATCH 320/393] net: test tailroom before appending to linear skb

Device features may change during transmission. In particular with
corking, a device may toggle scatter-gather in between allocating
and writing to an skb.

Do not unconditionally assume that !NETIF_F_SG at write time implies
that the same held at alloc time and thus the skb has sufficient
tailroom.

This issue predates git history.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  | 3 ++-
 net/ipv6/ip6_output.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 83c73bab2c3d..d54abc097800 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1045,7 +1045,8 @@ static int __ip_append_data(struct sock *sk,
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->dst.dev->features&NETIF_F_SG)) {
+		if (!(rt->dst.dev->features&NETIF_F_SG) &&
+		    skb_tailroom(skb) >= copy) {
 			unsigned int off;
 
 			off = skb->len;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2e891d2c30ef..7b6d1689087b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1503,7 +1503,8 @@ static int __ip6_append_data(struct sock *sk,
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->dst.dev->features&NETIF_F_SG)) {
+		if (!(rt->dst.dev->features&NETIF_F_SG) &&
+		    skb_tailroom(skb) >= copy) {
 			unsigned int off;
 
 			off = skb->len;

From a593f70831b68740fb7db69e0556ca72dac8c7a8 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 17 May 2018 14:06:35 -0700
Subject: [PATCH 321/393] bpf: sockmap update rollback on error can incorrectly
 dec prog refcnt

If the user were to only attach one of the parse or verdict programs
then it is possible a subsequent sockmap update could incorrectly
decrement the refcnt on the program. This happens because in the
rollback logic, after an error, we have to decrement the program
reference count when its been incremented. However, we only increment
the program reference count if the user has both a verdict and a
parse program. The reason for this is because, at least at the
moment, both are required for any one to be meaningful. The problem
fixed here is in the rollback path we decrement the program refcnt
even if only one existing. But we never incremented the refcnt in
the first place creating an imbalance.

This patch fixes the error path to handle this case.

Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 098eca568c2b..f03aaa8daadd 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1717,10 +1717,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	if (tx_msg) {
 		tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
 		if (IS_ERR(tx_msg)) {
-			if (verdict)
-				bpf_prog_put(verdict);
-			if (parse)
+			if (parse && verdict) {
 				bpf_prog_put(parse);
+				bpf_prog_put(verdict);
+			}
 			return PTR_ERR(tx_msg);
 		}
 	}
@@ -1805,10 +1805,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 out_free:
 	smap_release_sock(psock, sock);
 out_progs:
-	if (verdict)
-		bpf_prog_put(verdict);
-	if (parse)
+	if (parse && verdict) {
 		bpf_prog_put(parse);
+		bpf_prog_put(verdict);
+	}
 	if (tx_msg)
 		bpf_prog_put(tx_msg);
 	write_unlock_bh(&sock->sk_callback_lock);

From 9617456054a6160f5e11e892b713fade78aea2e9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 17 May 2018 14:06:40 -0700
Subject: [PATCH 322/393] bpf: parse and verdict prog attach may race with bpf
 map update

In the sockmap design BPF programs (SK_SKB_STREAM_PARSER,
SK_SKB_STREAM_VERDICT and SK_MSG_VERDICT) are attached to the sockmap
map type and when a sock is added to the map the programs are used by
the socket. However, sockmap updates from both userspace and BPF
programs can happen concurrently with the attach and detach of these
programs.

To resolve this we use the bpf_prog_inc_not_zero and a READ_ONCE()
primitive to ensure the program pointer is not refeched and
possibly NULL'd before the refcnt increment. This happens inside
a RCU critical section so although the pointer reference in the map
object may be NULL (by a concurrent detach operation) the reference
from READ_ONCE will not be free'd until after grace period. This
ensures the object returned by READ_ONCE() is valid through the
RCU criticl section and safe to use as long as we "know" it may
be free'd shortly.

Daniel spotted a case in the sock update API where instead of using
the READ_ONCE() program reference we used the pointer from the
original map, stab->bpf_{verdict|parse|txmsg}. The problem with this
is the logic checks the object returned from the READ_ONCE() is not
NULL and then tries to reference the object again but using the
above map pointer, which may have already been NULL'd by a parallel
detach operation. If this happened bpf_porg_inc_not_zero could
dereference a NULL pointer.

Fix this by using variable returned by READ_ONCE() that is checked
for NULL.

Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index f03aaa8daadd..95a84b2f10ce 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1703,11 +1703,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		 * we increment the refcnt. If this is the case abort with an
 		 * error.
 		 */
-		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+		verdict = bpf_prog_inc_not_zero(verdict);
 		if (IS_ERR(verdict))
 			return PTR_ERR(verdict);
 
-		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+		parse = bpf_prog_inc_not_zero(parse);
 		if (IS_ERR(parse)) {
 			bpf_prog_put(verdict);
 			return PTR_ERR(parse);
@@ -1715,7 +1715,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	}
 
 	if (tx_msg) {
-		tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
+		tx_msg = bpf_prog_inc_not_zero(tx_msg);
 		if (IS_ERR(tx_msg)) {
 			if (parse && verdict) {
 				bpf_prog_put(parse);

From 050fad7c4534c13c8eb1d9c2ba66012e014773cb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 17 May 2018 01:44:11 +0200
Subject: [PATCH 323/393] bpf: fix truncated jump targets on heavy expansions

Recently during testing, I ran into the following panic:

  [  207.892422] Internal error: Accessing user space memory outside uaccess.h routines: 96000004 [#1] SMP
  [  207.901637] Modules linked in: binfmt_misc [...]
  [  207.966530] CPU: 45 PID: 2256 Comm: test_verifier Tainted: G        W         4.17.0-rc3+ #7
  [  207.974956] Hardware name: FOXCONN R2-1221R-A4/C2U4N_MB, BIOS G31FB18A 03/31/2017
  [  207.982428] pstate: 60400005 (nZCv daif +PAN -UAO)
  [  207.987214] pc : bpf_skb_load_helper_8_no_cache+0x34/0xc0
  [  207.992603] lr : 0xffff000000bdb754
  [  207.996080] sp : ffff000013703ca0
  [  207.999384] x29: ffff000013703ca0 x28: 0000000000000001
  [  208.004688] x27: 0000000000000001 x26: 0000000000000000
  [  208.009992] x25: ffff000013703ce0 x24: ffff800fb4afcb00
  [  208.015295] x23: ffff00007d2f5038 x22: ffff00007d2f5000
  [  208.020599] x21: fffffffffeff2a6f x20: 000000000000000a
  [  208.025903] x19: ffff000009578000 x18: 0000000000000a03
  [  208.031206] x17: 0000000000000000 x16: 0000000000000000
  [  208.036510] x15: 0000ffff9de83000 x14: 0000000000000000
  [  208.041813] x13: 0000000000000000 x12: 0000000000000000
  [  208.047116] x11: 0000000000000001 x10: ffff0000089e7f18
  [  208.052419] x9 : fffffffffeff2a6f x8 : 0000000000000000
  [  208.057723] x7 : 000000000000000a x6 : 00280c6160000000
  [  208.063026] x5 : 0000000000000018 x4 : 0000000000007db6
  [  208.068329] x3 : 000000000008647a x2 : 19868179b1484500
  [  208.073632] x1 : 0000000000000000 x0 : ffff000009578c08
  [  208.078938] Process test_verifier (pid: 2256, stack limit = 0x0000000049ca7974)
  [  208.086235] Call trace:
  [  208.088672]  bpf_skb_load_helper_8_no_cache+0x34/0xc0
  [  208.093713]  0xffff000000bdb754
  [  208.096845]  bpf_test_run+0x78/0xf8
  [  208.100324]  bpf_prog_test_run_skb+0x148/0x230
  [  208.104758]  sys_bpf+0x314/0x1198
  [  208.108064]  el0_svc_naked+0x30/0x34
  [  208.111632] Code: 91302260 f9400001 f9001fa1 d2800001 (29500680)
  [  208.117717] ---[ end trace 263cb8a59b5bf29f ]---

The program itself which caused this had a long jump over the whole
instruction sequence where all of the inner instructions required
heavy expansions into multiple BPF instructions. Additionally, I also
had BPF hardening enabled which requires once more rewrites of all
constant values in order to blind them. Each time we rewrite insns,
bpf_adj_branches() would need to potentially adjust branch targets
which cross the patchlet boundary to accommodate for the additional
delta. Eventually that lead to the case where the target offset could
not fit into insn->off's upper 0x7fff limit anymore where then offset
wraps around becoming negative (in s16 universe), or vice versa
depending on the jump direction.

Therefore it becomes necessary to detect and reject any such occasions
in a generic way for native eBPF and cBPF to eBPF migrations. For
the latter we can simply check bounds in the bpf_convert_filter()'s
BPF_EMIT_JMP helper macro and bail out once we surpass limits. The
bpf_patch_insn_single() for native eBPF (and cBPF to eBPF in case
of subsequent hardening) is a bit more complex in that we need to
detect such truncations before hitting the bpf_prog_realloc(). Thus
the latter is split into an extra pass to probe problematic offsets
on the original program in order to fail early. With that in place
and carefully tested I no longer hit the panic and the rewrites are
rejected properly. The above example panic I've seen on bpf-next,
though the issue itself is generic in that a guard against this issue
in bpf seems more appropriate in this case.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/core.c | 110 +++++++++++++++++++++++++++++++++-------------
 net/core/filter.c |  11 ++++-
 2 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec39efb3..6ef6746a7871 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -218,47 +218,84 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
 	return 0;
 }
 
-static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
+				u32 curr, const bool probe_pass)
 {
+	const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+	s64 imm = insn->imm;
+
+	if (curr < pos && curr + imm + 1 > pos)
+		imm += delta;
+	else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+		imm -= delta;
+	if (imm < imm_min || imm > imm_max)
+		return -ERANGE;
+	if (!probe_pass)
+		insn->imm = imm;
+	return 0;
+}
+
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
+				u32 curr, const bool probe_pass)
+{
+	const s32 off_min = S16_MIN, off_max = S16_MAX;
+	s32 off = insn->off;
+
+	if (curr < pos && curr + off + 1 > pos)
+		off += delta;
+	else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+		off -= delta;
+	if (off < off_min || off > off_max)
+		return -ERANGE;
+	if (!probe_pass)
+		insn->off = off;
+	return 0;
+}
+
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
+			    const bool probe_pass)
+{
+	u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
 	struct bpf_insn *insn = prog->insnsi;
-	u32 i, insn_cnt = prog->len;
-	bool pseudo_call;
-	u8 code;
-	int off;
+	int ret = 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		code = insn->code;
-		if (BPF_CLASS(code) != BPF_JMP)
-			continue;
-		if (BPF_OP(code) == BPF_EXIT)
-			continue;
-		if (BPF_OP(code) == BPF_CALL) {
-			if (insn->src_reg == BPF_PSEUDO_CALL)
-				pseudo_call = true;
-			else
-				continue;
-		} else {
-			pseudo_call = false;
+		u8 code;
+
+		/* In the probing pass we still operate on the original,
+		 * unpatched image in order to check overflows before we
+		 * do any other adjustments. Therefore skip the patchlet.
+		 */
+		if (probe_pass && i == pos) {
+			i += delta + 1;
+			insn++;
 		}
-		off = pseudo_call ? insn->imm : insn->off;
-
-		/* Adjust offset of jmps if we cross boundaries. */
-		if (i < pos && i + off + 1 > pos)
-			off += delta;
-		else if (i > pos + delta && i + off + 1 <= pos + delta)
-			off -= delta;
-
-		if (pseudo_call)
-			insn->imm = off;
-		else
-			insn->off = off;
+		code = insn->code;
+		if (BPF_CLASS(code) != BPF_JMP ||
+		    BPF_OP(code) == BPF_EXIT)
+			continue;
+		/* Adjust offset of jmps if we cross patch boundaries. */
+		if (BPF_OP(code) == BPF_CALL) {
+			if (insn->src_reg != BPF_PSEUDO_CALL)
+				continue;
+			ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
+						   probe_pass);
+		} else {
+			ret = bpf_adj_delta_to_off(insn, pos, delta, i,
+						   probe_pass);
+		}
+		if (ret)
+			break;
 	}
+
+	return ret;
 }
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len)
 {
 	u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+	const u32 cnt_max = S16_MAX;
 	struct bpf_prog *prog_adj;
 
 	/* Since our patchlet doesn't expand the image, we're done. */
@@ -269,6 +306,15 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 
 	insn_adj_cnt = prog->len + insn_delta;
 
+	/* Reject anything that would potentially let the insn->off
+	 * target overflow when we have excessive program expansions.
+	 * We need to probe here before we do any reallocation where
+	 * we afterwards may not fail anymore.
+	 */
+	if (insn_adj_cnt > cnt_max &&
+	    bpf_adj_branches(prog, off, insn_delta, true))
+		return NULL;
+
 	/* Several new instructions need to be inserted. Make room
 	 * for them. Likely, there's no need for a new allocation as
 	 * last page could have large enough tailroom.
@@ -294,7 +340,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 		sizeof(*patch) * insn_rest);
 	memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
 
-	bpf_adj_branches(prog_adj, off, insn_delta);
+	/* We are guaranteed to not fail at this point, otherwise
+	 * the ship has sailed to reverse to the original state. An
+	 * overflow cannot happen at this point.
+	 */
+	BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
 
 	return prog_adj;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index e77c30ca491d..201ff36b17a8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -481,11 +481,18 @@ static int bpf_convert_filter(struct sock_filter *prog, int len,
 
 #define BPF_EMIT_JMP							\
 	do {								\
+		const s32 off_min = S16_MIN, off_max = S16_MAX;		\
+		s32 off;						\
+									\
 		if (target >= len || target < 0)			\
 			goto err;					\
-		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\
+		off = addrs ? addrs[target] - addrs[i] - 1 : 0;		\
 		/* Adjust pc relative offset for 2nd or 3rd insn. */	\
-		insn->off -= insn - tmp_insns;				\
+		off -= insn - tmp_insns;				\
+		/* Reject anything not fitting into insn->off. */	\
+		if (off < off_min || off > off_max)			\
+			goto err;					\
+		insn->off = off;					\
 	} while (0)
 
 		case BPF_JMP | BPF_JA:

From f6a3463063f42d9fb2c78f386437a822e0ad1792 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Wed, 16 May 2018 21:53:47 +0200
Subject: [PATCH 324/393] sched/debug: Move the print_rt_rq() and print_dl_rq()
 declarations to kernel/sched/sched.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the following commit:

  6b55c9654fcc ("sched/debug: Move print_cfs_rq() declaration to kernel/sched/sched.h")

the print_cfs_rq() prototype was added to <kernel/sched/sched.h>,
right next to the prototypes for print_cfs_stats(), print_rt_stats()
and print_dl_stats().

Finish this previous commit and also move related prototypes for
print_rt_rq() and print_dl_rq().

Remove existing extern declarations now that they not needed anymore.

Silences the following GCC warning, triggered by W=1:

  kernel/sched/debug.c:573:6: warning: no previous prototype for ‘print_rt_rq’ [-Wmissing-prototypes]
  kernel/sched/debug.c:603:6: warning: no previous prototype for ‘print_dl_rq’ [-Wmissing-prototypes]

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180516195348.30426-1-malat@debian.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 2 --
 kernel/sched/rt.c       | 2 --
 kernel/sched/sched.h    | 5 +++--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e7b3008b85bb..d6196bc6cbb5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2731,8 +2731,6 @@ bool dl_cpu_busy(unsigned int cpu)
 #endif
 
 #ifdef CONFIG_SCHED_DEBUG
-extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
-
 void print_dl_stats(struct seq_file *m, int cpu)
 {
 	print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7aef6b4e885a..ef3c4e6f5345 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2701,8 +2701,6 @@ int sched_rr_handler(struct ctl_table *table, int write,
 }
 
 #ifdef CONFIG_SCHED_DEBUG
-extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
-
 void print_rt_stats(struct seq_file *m, int cpu)
 {
 	rt_rq_iter_t iter;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 15750c222ca2..1f0a4bc6a39d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2025,8 +2025,9 @@ extern bool sched_debug_enabled;
 extern void print_cfs_stats(struct seq_file *m, int cpu);
 extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
-extern void
-print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
 #ifdef CONFIG_NUMA_BALANCING
 extern void
 show_numa_stats(struct task_struct *p, struct seq_file *m);

From 3febfc8a219a036633b57a34c6678e21b6a0580d Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Wed, 16 May 2018 22:09:02 +0200
Subject: [PATCH 325/393] sched/deadline: Make the grub_reclaim() function
 static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since the grub_reclaim() function can be made static, make it so.

Silences the following GCC warning (W=1):

  kernel/sched/deadline.c:1120:5: warning: no previous prototype for ‘grub_reclaim’ [-Wmissing-prototypes]

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180516200902.959-1-malat@debian.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d6196bc6cbb5..1356afd1eeb6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1117,7 +1117,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
  * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
  * So, overflow is not an issue here.
  */
-u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
 {
 	u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
 	u64 u_act;

From cd33d8803bfa3f5e6e5d7a4bb03fc8cc32c40fb6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 15 May 2018 18:53:28 +0200
Subject: [PATCH 326/393] sched/fair: Fix documentation file path

The 'tip' prefix probably referred to the -tip tree and is not required,
remove it.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180515165328.24899-1-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 init/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index f013afc74b11..18b151f0ddc1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -738,7 +738,7 @@ config CFS_BANDWIDTH
 	  tasks running within the fair group scheduler.  Groups with no limit
 	  set are considered to be unconstrained and will run with no
 	  restriction.
-	  See tip/Documentation/scheduler/sched-bwc.txt for more information.
+	  See Documentation/scheduler/sched-bwc.txt for more information.
 
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"

From 240da953fcc6a9008c92fae5b1f727ee5ed167ab Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 16 May 2018 23:18:09 -0400
Subject: [PATCH 327/393] x86/bugs: Rename SSBD_NO to SSB_NO

The "336996 Speculative Execution Side Channel Mitigations" from
May defines this as SSB_NO, hence lets sync-up.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/msr-index.h | 2 +-
 arch/x86/kernel/cpu/common.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 562414d5b834..fda2114197b3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -70,7 +70,7 @@
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
-#define ARCH_CAP_SSBD_NO		(1 << 4)   /*
+#define ARCH_CAP_SSB_NO			(1 << 4)   /*
 						    * Not susceptible to Speculative Store Bypass
 						    * attack, so no Speculative Store Bypass
 						    * control required.
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b4247ed0c81e..78decc3e3067 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -974,7 +974,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
 
 	if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
-	   !(ia32_cap & ARCH_CAP_SSBD_NO))
+	   !(ia32_cap & ARCH_CAP_SSB_NO))
 		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
 
 	if (x86_match_cpu(cpu_no_speculation))

From 3faf5246f0cd2c1fe82a2c4ba5ad857fa6941909 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Fri, 18 May 2018 16:08:10 +0200
Subject: [PATCH 328/393] parisc: Move find_pa_parent_type() out of init
 section

The 0-DAY kernel test infrastructure reported that inet_put_port() may
reference the find_pa_parent_type() function, so it can't be moved into the
init section.

Fixes: b86db40e1ecc ("parisc: Move various functions and strings to init section")
Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/kernel/drivers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c
index ee5a78a151a6..e0e1c9775c32 100644
--- a/arch/parisc/kernel/drivers.c
+++ b/arch/parisc/kernel/drivers.c
@@ -268,7 +268,7 @@ static struct parisc_device *find_device_by_addr(unsigned long hpa)
  * Walks up the device tree looking for a device of the specified type.
  * If it finds it, it returns it.  If not, it returns NULL.
  */
-const struct parisc_device * __init
+const struct parisc_device *
 find_pa_parent_type(const struct parisc_device *padev, int type)
 {
 	const struct device *dev = &padev->dev;

From 01f56832cfb6fcc204e7203f46841b6185ebd574 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Fri, 18 May 2018 16:12:12 +0200
Subject: [PATCH 329/393] parisc: Move setup_profiling_timer() out of init
 section

No other architecture has setup_profiling_timer() in the init section,
thus on parisc we face this section mismatch warning:
 Reference from the function devm_device_add_group() to the function .init.text:setup_profiling_timer()

Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/kernel/smp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 4065b5e48c9d..5e26dbede5fc 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -423,8 +423,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 }
 
 #ifdef CONFIG_PROC_FS
-int __init
-setup_profiling_timer(unsigned int multiplier)
+int setup_profiling_timer(unsigned int multiplier)
 {
 	return -EINVAL;
 }

From 8a922814ccfed70d35a725f47c0bf12b50fd223c Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Fri, 18 May 2018 16:16:34 +0200
Subject: [PATCH 330/393] parisc: Move ccio_cujo20_fixup() into init section

ccio_cujo20_fixup() is called by dino_probe() only, which is in init
section already.

Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/parisc/ccio-dma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 126cf19e869b..297599fcbc32 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1195,7 +1195,7 @@ void * ccio_get_iommu(const struct parisc_device *dev)
  * to/from certain pages.  To avoid this happening, we mark these pages
  * as `used', and ensure that nothing will try to allocate from them.
  */
-void ccio_cujo20_fixup(struct parisc_device *cujo, u32 iovp)
+void __init ccio_cujo20_fixup(struct parisc_device *cujo, u32 iovp)
 {
 	unsigned int idx;
 	struct parisc_device *dev = parisc_parent(cujo);

From a45b599ad808c3c982fdcdc12b0b8611c2f92824 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Fri, 18 May 2018 16:23:18 +0200
Subject: [PATCH 331/393] scsi: sg: allocate with __GFP_ZERO in
 sg_build_indirect()

This shall help avoid copying uninitialized memory to the userspace when
calling ioctl(fd, SG_IO) with an empty command.

Reported-by: syzbot+7d26fc1eea198488deab@syzkaller.appspotmail.com
Cc: stable@vger.kernel.org
Signed-off-by: Alexander Potapenko <glider@google.com>
Acked-by: Douglas Gilbert <dgilbert@interlog.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/sg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index c198b96368dd..5c40d809830f 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1894,7 +1894,7 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
 		num = (rem_sz > scatter_elem_sz_prev) ?
 			scatter_elem_sz_prev : rem_sz;
 
-		schp->pages[k] = alloc_pages(gfp_mask, order);
+		schp->pages[k] = alloc_pages(gfp_mask | __GFP_ZERO, order);
 		if (!schp->pages[k])
 			goto out;
 

From 6358d49ac23995fdfe157cc8747ab0f274d3954b Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Thu, 17 May 2018 14:50:44 -0700
Subject: [PATCH 332/393] net: Fix a bug in removing queues from XPS map

While removing queues from the XPS map, the individual CPU ID
alone was used to index the CPUs map, this should be changed to also
factor in the traffic class mapping for the CPU-to-queue lookup.

Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes")
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index af0558b00c6c..2af787e8b130 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2124,7 +2124,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 		int i, j;
 
 		for (i = count, j = offset; i--; j++) {
-			if (!remove_xps_queue(dev_maps, cpu, j))
+			if (!remove_xps_queue(dev_maps, tci, j))
 				break;
 		}
 

From 5447d78623da2eded06d4cd9469d1a71eba43bc4 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 17 May 2018 16:55:39 -0700
Subject: [PATCH 333/393] net: dsa: Do not register devlink for unused ports

Even if commit 1d27732f411d ("net: dsa: setup and teardown ports") indicated
that registering a devlink instance for unused ports is not a problem, and this
is true, this can be confusing nonetheless, so let's not do it.

Fixes: 1d27732f411d ("net: dsa: setup and teardown ports")
Reported-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index adf50fbc4c13..47725250b4ca 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -258,11 +258,13 @@ static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst)
 static int dsa_port_setup(struct dsa_port *dp)
 {
 	struct dsa_switch *ds = dp->ds;
-	int err;
+	int err = 0;
 
 	memset(&dp->devlink_port, 0, sizeof(dp->devlink_port));
 
-	err = devlink_port_register(ds->devlink, &dp->devlink_port, dp->index);
+	if (dp->type != DSA_PORT_TYPE_UNUSED)
+		err = devlink_port_register(ds->devlink, &dp->devlink_port,
+					    dp->index);
 	if (err)
 		return err;
 
@@ -293,7 +295,8 @@ static int dsa_port_setup(struct dsa_port *dp)
 
 static void dsa_port_teardown(struct dsa_port *dp)
 {
-	devlink_port_unregister(&dp->devlink_port);
+	if (dp->type != DSA_PORT_TYPE_UNUSED)
+		devlink_port_unregister(&dp->devlink_port);
 
 	switch (dp->type) {
 	case DSA_PORT_TYPE_UNUSED:

From b16a960ddbf0d4fd6aaabee42d7ec4c4c3ec836d Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 18 May 2018 12:52:51 +0200
Subject: [PATCH 334/393] sh_eth: Change platform check to CONFIG_ARCH_RENESAS

Since commit 9b5ba0df4ea4f940 ("ARM: shmobile: Introduce ARCH_RENESAS")
is CONFIG_ARCH_RENESAS a more appropriate platform check than the legacy
CONFIG_ARCH_SHMOBILE, hence use the former.

Renesas SuperH SH-Mobile SoCs are still covered by the CONFIG_CPU_SH4
check.

This will allow to drop ARCH_SHMOBILE on ARM and ARM64 in the near
future.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index a5b792ce2ae7..1bf930d4a1e5 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -163,7 +163,7 @@ enum {
 };
 
 /* Driver's parameters */
-#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE)
+#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_RENESAS)
 #define SH_ETH_RX_ALIGN		32
 #else
 #define SH_ETH_RX_ALIGN		2

From 9709020c86f6bf8439ca3effc58cfca49a5de192 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 May 2018 04:47:55 -0700
Subject: [PATCH 335/393] sock_diag: fix use-after-free read in __sk_free

We must not call sock_diag_has_destroy_listeners(sk) on a socket
that has no reference on net structure.

BUG: KASAN: use-after-free in sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
BUG: KASAN: use-after-free in __sk_free+0x329/0x340 net/core/sock.c:1609
Read of size 8 at addr ffff88018a02e3a0 by task swapper/1/0

CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.17.0-rc5+ #54
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
 __sk_free+0x329/0x340 net/core/sock.c:1609
 sk_free+0x42/0x50 net/core/sock.c:1623
 sock_put include/net/sock.h:1664 [inline]
 reqsk_free include/net/request_sock.h:116 [inline]
 reqsk_put include/net/request_sock.h:124 [inline]
 inet_csk_reqsk_queue_drop_and_put net/ipv4/inet_connection_sock.c:672 [inline]
 reqsk_timer_handler+0xe27/0x10e0 net/ipv4/inet_connection_sock.c:739
 call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
 expire_timers kernel/time/timer.c:1363 [inline]
 __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
 invoke_softirq kernel/softirq.c:365 [inline]
 irq_exit+0x1d1/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:525 [inline]
 smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
 </IRQ>
RIP: 0010:native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:54
RSP: 0018:ffff8801d9ae7c38 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
RAX: dffffc0000000000 RBX: 1ffff1003b35cf8a RCX: 0000000000000000
RDX: 1ffffffff11a30d0 RSI: 0000000000000001 RDI: ffffffff88d18680
RBP: ffff8801d9ae7c38 R08: ffffed003b5e46c3 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
R13: ffff8801d9ae7cf0 R14: ffffffff897bef20 R15: 0000000000000000
 arch_safe_halt arch/x86/include/asm/paravirt.h:94 [inline]
 default_idle+0xc2/0x440 arch/x86/kernel/process.c:354
 arch_cpu_idle+0x10/0x20 arch/x86/kernel/process.c:345
 default_idle_call+0x6d/0x90 kernel/sched/idle.c:93
 cpuidle_idle_call kernel/sched/idle.c:153 [inline]
 do_idle+0x395/0x560 kernel/sched/idle.c:262
 cpu_startup_entry+0x104/0x120 kernel/sched/idle.c:368
 start_secondary+0x426/0x5b0 arch/x86/kernel/smpboot.c:269
 secondary_startup_64+0xa5/0xb0 arch/x86/kernel/head_64.S:242

Allocated by task 4557:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
 kmem_cache_alloc+0x12e/0x760 mm/slab.c:3554
 kmem_cache_zalloc include/linux/slab.h:691 [inline]
 net_alloc net/core/net_namespace.c:383 [inline]
 copy_net_ns+0x159/0x4c0 net/core/net_namespace.c:423
 create_new_namespaces+0x69d/0x8f0 kernel/nsproxy.c:107
 unshare_nsproxy_namespaces+0xc3/0x1f0 kernel/nsproxy.c:206
 ksys_unshare+0x708/0xf90 kernel/fork.c:2408
 __do_sys_unshare kernel/fork.c:2476 [inline]
 __se_sys_unshare kernel/fork.c:2474 [inline]
 __x64_sys_unshare+0x31/0x40 kernel/fork.c:2474
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 69:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
 net_free net/core/net_namespace.c:399 [inline]
 net_drop_ns.part.14+0x11a/0x130 net/core/net_namespace.c:406
 net_drop_ns net/core/net_namespace.c:405 [inline]
 cleanup_net+0x6a1/0xb20 net/core/net_namespace.c:541
 process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
 worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
 kthread+0x345/0x410 kernel/kthread.c:240
 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412

The buggy address belongs to the object at ffff88018a02c140
 which belongs to the cache net_namespace of size 8832
The buggy address is located 8800 bytes inside of
 8832-byte region [ffff88018a02c140, ffff88018a02e3c0)
The buggy address belongs to the page:
page:ffffea0006280b00 count:1 mapcount:0 mapping:ffff88018a02c140 index:0x0 compound_mapcount: 0
flags: 0x2fffc0000008100(slab|head)
raw: 02fffc0000008100 ffff88018a02c140 0000000000000000 0000000100000001
raw: ffffea00062a1320 ffffea0006268020 ffff8801d9bdde40 0000000000000000
page dumped because: kasan: bad access detected

Fixes: b922622ec6ef ("sock_diag: don't broadcast kernel sockets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Craig Gallek <kraig@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 6444525f610c..3b6d02854e57 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1606,7 +1606,7 @@ static void __sk_free(struct sock *sk)
 	if (likely(sk->sk_net_refcnt))
 		sock_inuse_add(sock_net(sk), -1);
 
-	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
+	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
 		sock_diag_broadcast_destroy(sk);
 	else
 		sk_destruct(sk);

From 44a63b137f7b6e4c7bd6c9cc21615941cb36509d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 18 May 2018 14:51:44 +0200
Subject: [PATCH 336/393] net: sched: red: avoid hashing NULL child

Hangbin reported an Oops triggered by the syzkaller qdisc rules:

 kasan: GPF could be caused by NULL-ptr deref or user memory access
 general protection fault: 0000 [#1] SMP KASAN PTI
 Modules linked in: sch_red
 CPU: 0 PID: 28699 Comm: syz-executor5 Not tainted 4.17.0-rc4.kcov #1
 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
 RIP: 0010:qdisc_hash_add+0x26/0xa0
 RSP: 0018:ffff8800589cf470 EFLAGS: 00010203
 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff824ad971
 RDX: 0000000000000007 RSI: ffffc9000ce9f000 RDI: 000000000000003c
 RBP: 0000000000000001 R08: ffffed000b139ea2 R09: ffff8800589cf4f0
 R10: ffff8800589cf50f R11: ffffed000b139ea2 R12: ffff880054019fc0
 R13: ffff880054019fb4 R14: ffff88005c0af600 R15: ffff880054019fb0
 FS:  00007fa6edcb1700(0000) GS:ffff88005ce00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000020000740 CR3: 000000000fc16000 CR4: 00000000000006f0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  red_change+0x2d2/0xed0 [sch_red]
  qdisc_create+0x57e/0xef0
  tc_modify_qdisc+0x47f/0x14e0
  rtnetlink_rcv_msg+0x6a8/0x920
  netlink_rcv_skb+0x2a2/0x3c0
  netlink_unicast+0x511/0x740
  netlink_sendmsg+0x825/0xc30
  sock_sendmsg+0xc5/0x100
  ___sys_sendmsg+0x778/0x8e0
  __sys_sendmsg+0xf5/0x1b0
  do_syscall_64+0xbd/0x3b0
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x450869
 RSP: 002b:00007fa6edcb0c48 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
 RAX: ffffffffffffffda RBX: 00007fa6edcb16b4 RCX: 0000000000450869
 RDX: 0000000000000000 RSI: 00000000200000c0 RDI: 0000000000000013
 RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
 R13: 0000000000008778 R14: 0000000000702838 R15: 00007fa6edcb1700
 Code: e9 0b fe ff ff 0f 1f 44 00 00 55 53 48 89 fb 89 f5 e8 3f 07 f3 fe 48 8d 7b 3c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 51
 RIP: qdisc_hash_add+0x26/0xa0 RSP: ffff8800589cf470

When a red qdisc is updated with a 0 limit, the child qdisc is left
unmodified, no additional scheduler is created in red_change(),
the 'child' local variable is rightfully NULL and must not add it
to the hash table.

This change addresses the above issue moving qdisc_hash_add() right
after the child qdisc creation. It additionally removes unneeded checks
for noop_qdisc.

Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Fixes: 49b499718fa1 ("net: sched: make default fifo qdiscs appear in the dump")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 5 +++--
 net/sched/sch_tbf.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 16644b3d2362..56c181c3feeb 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -222,10 +222,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 					 extack);
 		if (IS_ERR(child))
 			return PTR_ERR(child);
+
+		/* child is fifo, no need to check for noop_qdisc */
+		qdisc_hash_add(child, true);
 	}
 
-	if (child != &noop_qdisc)
-		qdisc_hash_add(child, true);
 	sch_tree_lock(sch);
 	q->flags = ctl->flags;
 	q->limit = ctl->limit;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 03225a8df973..6f74a426f159 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -383,6 +383,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 			err = PTR_ERR(child);
 			goto done;
 		}
+
+		/* child is fifo, no need to check for noop_qdisc */
+		qdisc_hash_add(child, true);
 	}
 
 	sch_tree_lock(sch);
@@ -391,8 +394,6 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 					  q->qdisc->qstats.backlog);
 		qdisc_destroy(q->qdisc);
 		q->qdisc = child;
-		if (child != &noop_qdisc)
-			qdisc_hash_add(child, true);
 	}
 	q->limit = qopt->limit;
 	if (tb[TCA_TBF_PBURST])

From d775f26b295a0a303f7a73d7da46e04296484fe7 Mon Sep 17 00:00:00 2001
From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Fri, 18 May 2018 19:13:37 +0530
Subject: [PATCH 337/393] cxgb4: fix offset in collecting TX rate limit info

Correct the indirect register offsets in collecting TX rate limit info
in UP CIM logs.

Also, T5 doesn't support these indirect register offsets, so remove
them from collection logic.

Fixes: be6e36d916b1 ("cxgb4: collect TX rate limit info in UP CIM logs")
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/chelsio/cxgb4/cudbg_entity.h | 28 ++++++-------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
index b57acb8dc35b..dc25066c59a1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
@@ -419,15 +419,15 @@ static const u32 t6_up_cim_reg_array[][IREG_NUM_ELEM + 1] = {
 	{0x7b50, 0x7b54, 0x280, 0x20, 0}, /* up_cim_280_to_2fc */
 	{0x7b50, 0x7b54, 0x300, 0x20, 0}, /* up_cim_300_to_37c */
 	{0x7b50, 0x7b54, 0x380, 0x14, 0}, /* up_cim_380_to_3cc */
-	{0x7b50, 0x7b54, 0x2900, 0x4, 0x4}, /* up_cim_2900_to_3d40 */
-	{0x7b50, 0x7b54, 0x2904, 0x4, 0x4}, /* up_cim_2904_to_3d44 */
-	{0x7b50, 0x7b54, 0x2908, 0x4, 0x4}, /* up_cim_2908_to_3d48 */
-	{0x7b50, 0x7b54, 0x2910, 0x4, 0x4}, /* up_cim_2910_to_3d4c */
-	{0x7b50, 0x7b54, 0x2914, 0x4, 0x4}, /* up_cim_2914_to_3d50 */
-	{0x7b50, 0x7b54, 0x2920, 0x10, 0x10}, /* up_cim_2920_to_2a10 */
-	{0x7b50, 0x7b54, 0x2924, 0x10, 0x10}, /* up_cim_2924_to_2a14 */
-	{0x7b50, 0x7b54, 0x2928, 0x10, 0x10}, /* up_cim_2928_to_2a18 */
-	{0x7b50, 0x7b54, 0x292c, 0x10, 0x10}, /* up_cim_292c_to_2a1c */
+	{0x7b50, 0x7b54, 0x4900, 0x4, 0x4}, /* up_cim_4900_to_4c60 */
+	{0x7b50, 0x7b54, 0x4904, 0x4, 0x4}, /* up_cim_4904_to_4c64 */
+	{0x7b50, 0x7b54, 0x4908, 0x4, 0x4}, /* up_cim_4908_to_4c68 */
+	{0x7b50, 0x7b54, 0x4910, 0x4, 0x4}, /* up_cim_4910_to_4c70 */
+	{0x7b50, 0x7b54, 0x4914, 0x4, 0x4}, /* up_cim_4914_to_4c74 */
+	{0x7b50, 0x7b54, 0x4920, 0x10, 0x10}, /* up_cim_4920_to_4a10 */
+	{0x7b50, 0x7b54, 0x4924, 0x10, 0x10}, /* up_cim_4924_to_4a14 */
+	{0x7b50, 0x7b54, 0x4928, 0x10, 0x10}, /* up_cim_4928_to_4a18 */
+	{0x7b50, 0x7b54, 0x492c, 0x10, 0x10}, /* up_cim_492c_to_4a1c */
 };
 
 static const u32 t5_up_cim_reg_array[][IREG_NUM_ELEM + 1] = {
@@ -444,16 +444,6 @@ static const u32 t5_up_cim_reg_array[][IREG_NUM_ELEM + 1] = {
 	{0x7b50, 0x7b54, 0x280, 0x20, 0}, /* up_cim_280_to_2fc */
 	{0x7b50, 0x7b54, 0x300, 0x20, 0}, /* up_cim_300_to_37c */
 	{0x7b50, 0x7b54, 0x380, 0x14, 0}, /* up_cim_380_to_3cc */
-	{0x7b50, 0x7b54, 0x2900, 0x4, 0x4}, /* up_cim_2900_to_3d40 */
-	{0x7b50, 0x7b54, 0x2904, 0x4, 0x4}, /* up_cim_2904_to_3d44 */
-	{0x7b50, 0x7b54, 0x2908, 0x4, 0x4}, /* up_cim_2908_to_3d48 */
-	{0x7b50, 0x7b54, 0x2910, 0x4, 0x4}, /* up_cim_2910_to_3d4c */
-	{0x7b50, 0x7b54, 0x2914, 0x4, 0x4}, /* up_cim_2914_to_3d50 */
-	{0x7b50, 0x7b54, 0x2918, 0x4, 0x4}, /* up_cim_2918_to_3d54 */
-	{0x7b50, 0x7b54, 0x291c, 0x4, 0x4}, /* up_cim_291c_to_3d58 */
-	{0x7b50, 0x7b54, 0x2924, 0x10, 0x10}, /* up_cim_2924_to_2914 */
-	{0x7b50, 0x7b54, 0x2928, 0x10, 0x10}, /* up_cim_2928_to_2a18 */
-	{0x7b50, 0x7b54, 0x292c, 0x10, 0x10}, /* up_cim_292c_to_2a1c */
 };
 
 static const u32 t6_hma_ireg_array[][IREG_NUM_ELEM] = {

From 4855c92dbb7b3b85c23e88ab7ca04f99b9677b41 Mon Sep 17 00:00:00 2001
From: Joe Jin <joe.jin@oracle.com>
Date: Thu, 17 May 2018 12:33:28 -0700
Subject: [PATCH 338/393] xen-swiotlb: fix the check condition for
 xen_swiotlb_free_coherent

When run raidconfig from Dom0 we found that the Xen DMA heap is reduced,
but Dom Heap is increased by the same size. Tracing raidconfig we found
that the related ioctl() in megaraid_sas will call dma_alloc_coherent()
to apply memory. If the memory allocated by Dom0 is not in the DMA area,
it will exchange memory with Xen to meet the requiment. Later drivers
call dma_free_coherent() to free the memory, on xen_swiotlb_free_coherent()
the check condition (dev_addr + size - 1 <= dma_mask) is always false,
it prevents calling xen_destroy_contiguous_region() to return the memory
to the Xen DMA heap.

This issue introduced by commit 6810df88dcfc2 "xen-swiotlb: When doing
coherent alloc/dealloc check before swizzling the MFNs.".

Signed-off-by: Joe Jin <joe.jin@oracle.com>
Tested-by: John Sobecki <john.sobecki@oracle.com>
Reviewed-by: Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: stable@vger.kernel.org
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/xen/swiotlb-xen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index e1c60899fdbc..a6f9ba85dc4b 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -351,7 +351,7 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 	 * physical address */
 	phys = xen_bus_to_phys(dev_addr);
 
-	if (((dev_addr + size - 1 > dma_mask)) ||
+	if (((dev_addr + size - 1 <= dma_mask)) ||
 	    range_straddles_page_boundary(phys, size))
 		xen_destroy_contiguous_region(phys, order);
 

From 54940fa60ad3728c592f62dadb558165495a6938 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhart@infradead.org>
Date: Sat, 12 May 2018 12:10:07 -0700
Subject: [PATCH 339/393] platform/x86: DELL_WMI use depends on instead of
 select for DELL_SMBIOS

If DELL_WMI "select"s DELL_SMBIOS, the DELL_SMBIOS dependencies are
ignored and it is still possible to end up with unmet direct
dependencies.

Change the select to a depends on.

Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index bc309c5327ff..566644bb496a 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -168,8 +168,8 @@ config DELL_WMI
 	depends on DMI
 	depends on INPUT
 	depends on ACPI_VIDEO || ACPI_VIDEO = n
+	depends on DELL_SMBIOS
 	select DELL_WMI_DESCRIPTOR
-	select DELL_SMBIOS
 	select INPUT_SPARSEKMAP
 	---help---
 	  Say Y here if you want to support WMI-based hotkeys on Dell laptops.

From 1e3054b98c5415d5cb5f8824fc33b548ae5644c3 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Fri, 18 May 2018 16:08:44 -0700
Subject: [PATCH 340/393] lib/test_bitmap.c: fix bitmap optimisation tests to
 report errors correctly

I had neglected to increment the error counter when the tests failed,
which made the tests noisy when they fail, but not actually return an
error code.

Link: http://lkml.kernel.org/r/20180509114328.9887-1-mpe@ellerman.id.au
Fixes: 3cc78125a081 ("lib/test_bitmap.c: add optimisation tests")
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: <stable@vger.kernel.org>	[4.13+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_bitmap.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index de16f7869fb1..6cd7d0740005 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -331,23 +331,32 @@ static void noinline __init test_mem_optimisations(void)
 	unsigned int start, nbits;
 
 	for (start = 0; start < 1024; start += 8) {
-		memset(bmap1, 0x5a, sizeof(bmap1));
-		memset(bmap2, 0x5a, sizeof(bmap2));
 		for (nbits = 0; nbits < 1024 - start; nbits += 8) {
+			memset(bmap1, 0x5a, sizeof(bmap1));
+			memset(bmap2, 0x5a, sizeof(bmap2));
+
 			bitmap_set(bmap1, start, nbits);
 			__bitmap_set(bmap2, start, nbits);
-			if (!bitmap_equal(bmap1, bmap2, 1024))
+			if (!bitmap_equal(bmap1, bmap2, 1024)) {
 				printk("set not equal %d %d\n", start, nbits);
-			if (!__bitmap_equal(bmap1, bmap2, 1024))
+				failed_tests++;
+			}
+			if (!__bitmap_equal(bmap1, bmap2, 1024)) {
 				printk("set not __equal %d %d\n", start, nbits);
+				failed_tests++;
+			}
 
 			bitmap_clear(bmap1, start, nbits);
 			__bitmap_clear(bmap2, start, nbits);
-			if (!bitmap_equal(bmap1, bmap2, 1024))
+			if (!bitmap_equal(bmap1, bmap2, 1024)) {
 				printk("clear not equal %d %d\n", start, nbits);
-			if (!__bitmap_equal(bmap1, bmap2, 1024))
+				failed_tests++;
+			}
+			if (!__bitmap_equal(bmap1, bmap2, 1024)) {
 				printk("clear not __equal %d %d\n", start,
 									nbits);
+				failed_tests++;
+			}
 		}
 	}
 }

From d97baf9470b0668904aa96865abe7db4000dc3ba Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux@gmail.com>
Date: Fri, 18 May 2018 16:08:47 -0700
Subject: [PATCH 341/393] include/linux/mm.h: add new inline function
 vmf_error()

Many places in drivers/ file systems, error was handled in a common way
like below:

	ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;

vmf_error() will replace this and return vm_fault_t type err.

A lot of drivers and filesystems currently have a rather complex mapping
of errno-to-VM_FAULT code.  We have been able to eliminate a lot of it
by just returning VM_FAULT codes directly from functions which are
called exclusively from the fault handling path.

Some functions can be called both from the fault handler and other
context which are expecting an errno, so they have to continue to return
an errno.  Some users still need to choose different behaviour for
different errnos, but vmf_error() captures the essential error
translation that's common to all users, and those that need to handle
additional errors can handle them first.

Link: http://lkml.kernel.org/r/20180510174826.GA14268@jordon-HP-15-Notebook-PC
Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c080af584ddd..c6fa9a255dbf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2466,6 +2466,13 @@ static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma,
 	return VM_FAULT_NOPAGE;
 }
 
+static inline vm_fault_t vmf_error(int err)
+{
+	if (err == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 struct page *follow_page_mask(struct vm_area_struct *vma,
 			      unsigned long address, unsigned int foll_flags,
 			      unsigned int *page_mask);

From 8d9fa88edd5e360b71765feeadb915d4066c9684 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Fri, 18 May 2018 16:08:51 -0700
Subject: [PATCH 342/393] radix tree test suite: fix mapshift build target

Commit c6ce3e2fe3da ("radix tree test suite: Add config option for map
shift") introduced a phony makefile target called 'mapshift' that ends
up generating the file generated/map-shift.h.  This phony target was
then added as a dependency of the top level 'targets' build target,
which is what is run when you go to tools/testing/radix-tree and just
type 'make'.

Unfortunately, this phony target doesn't actually work as a dependency,
so you end up getting:

  $ make
  make: *** No rule to make target 'generated/map-shift.h', needed by 'main.o'.  Stop.
  make: *** Waiting for unfinished jobs....

Fix this by making the file generated/map-shift.h our real makefile
target, and add this a dependency of the top level build target.

Link: http://lkml.kernel.org/r/20180503192430.7582-2-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/radix-tree/Makefile | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index fa7ee369b3c9..db66f8a0d4be 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -17,7 +17,7 @@ ifeq ($(BUILD), 32)
 	LDFLAGS += -m32
 endif
 
-targets: mapshift $(TARGETS)
+targets: generated/map-shift.h $(TARGETS)
 
 main:	$(OFILES)
 
@@ -42,9 +42,7 @@ radix-tree.c: ../../../lib/radix-tree.c
 idr.c: ../../../lib/idr.c
 	sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
 
-.PHONY: mapshift
-
-mapshift:
+generated/map-shift.h:
 	@if ! grep -qws $(SHIFT) generated/map-shift.h; then		\
 		echo "#define RADIX_TREE_MAP_SHIFT $(SHIFT)" >		\
 				generated/map-shift.h;			\

From dcbbf25adb31410c95ce844f80d372ed38b68b24 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Fri, 18 May 2018 16:08:54 -0700
Subject: [PATCH 343/393] radix tree test suite: fix compilation issue

Pulled from a patch from Matthew Wilcox entitled "xarray: Add definition
of struct xarray":

> From: Matthew Wilcox <mawilcox@microsoft.com>
> Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>

  https://patchwork.kernel.org/patch/10341249/

These defines fix this compilation error:

  In file included from ./linux/radix-tree.h:6:0,
                   from ./linux/../../../../include/linux/idr.h:15,
                   from ./linux/idr.h:1,
                   from idr.c:4:
  ./linux/../../../../include/linux/idr.h: In function `idr_init_base':
  ./linux/../../../../include/linux/radix-tree.h:129:2: warning: implicit declaration of function `spin_lock_init'; did you mean `spinlock_t'? [-Wimplicit-function-declaration]
    spin_lock_init(&(root)->xa_lock);    \
    ^
  ./linux/../../../../include/linux/idr.h:126:2: note: in expansion of macro `INIT_RADIX_TREE'
    INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);
    ^~~~~~~~~~~~~~~

by providing a spin_lock_init() wrapper for the v4.17-rc* version of the
radix tree test suite.

Link: http://lkml.kernel.org/r/20180503192430.7582-3-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/include/linux/spinlock.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index b21b586b9854..1738c0391da4 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -6,8 +6,9 @@
 #include <stdbool.h>
 
 #define spinlock_t		pthread_mutex_t
-#define DEFINE_SPINLOCK(x)	pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER;
+#define DEFINE_SPINLOCK(x)	pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER
 #define __SPIN_LOCK_UNLOCKED(x)	(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER
+#define spin_lock_init(x)      pthread_mutex_init(x, NULL)
 
 #define spin_lock_irqsave(x, f)		(void)f, pthread_mutex_lock(x)
 #define spin_unlock_irqrestore(x, f)	(void)f, pthread_mutex_unlock(x)

From 3e252fa7d4f711798e7a3f5ff2d7b62f0e2987ce Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Fri, 18 May 2018 16:08:58 -0700
Subject: [PATCH 344/393] radix tree test suite: add item_delete_rcu()

Currently the lifetime of "struct item" entries in the radix tree are
not controlled by RCU, but are instead deleted inline as they are
removed from the tree.

In the following patches we add a test which has threads iterating over
items pulled from the tree and verifying them in an
rcu_read_lock()/rcu_read_unlock() section.  This means that though an
item has been removed from the tree it could still be being worked on by
other threads until the RCU grace period expires.  So, we need to
actually free the "struct item" structures at the end of the grace
period, just as we do with "struct radix_tree_node" items.

Link: http://lkml.kernel.org/r/20180503192430.7582-4-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/radix-tree/test.c | 19 +++++++++++++++++++
 tools/testing/radix-tree/test.h |  2 ++
 2 files changed, 21 insertions(+)

diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
index 5978ab1f403d..def6015570b2 100644
--- a/tools/testing/radix-tree/test.c
+++ b/tools/testing/radix-tree/test.c
@@ -75,6 +75,25 @@ int item_delete(struct radix_tree_root *root, unsigned long index)
 	return 0;
 }
 
+static void item_free_rcu(struct rcu_head *head)
+{
+	struct item *item = container_of(head, struct item, rcu_head);
+
+	free(item);
+}
+
+int item_delete_rcu(struct radix_tree_root *root, unsigned long index)
+{
+	struct item *item = radix_tree_delete(root, index);
+
+	if (item) {
+		item_sanity(item, index);
+		call_rcu(&item->rcu_head, item_free_rcu);
+		return 1;
+	}
+	return 0;
+}
+
 void item_check_present(struct radix_tree_root *root, unsigned long index)
 {
 	struct item *item;
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index d9c031dbeb1a..8cf4a7a7f94c 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -5,6 +5,7 @@
 #include <linux/rcupdate.h>
 
 struct item {
+	struct rcu_head	rcu_head;
 	unsigned long index;
 	unsigned int order;
 };
@@ -15,6 +16,7 @@ int item_insert(struct radix_tree_root *root, unsigned long index);
 int item_insert_order(struct radix_tree_root *root, unsigned long index,
 			unsigned order);
 int item_delete(struct radix_tree_root *root, unsigned long index);
+int item_delete_rcu(struct radix_tree_root *root, unsigned long index);
 struct item *item_lookup(struct radix_tree_root *root, unsigned long index);
 
 void item_check_present(struct radix_tree_root *root, unsigned long index);

From fd8f58c40b703e47697c9f12bc16c31f14c161f1 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Fri, 18 May 2018 16:09:01 -0700
Subject: [PATCH 345/393] radix tree test suite: multi-order iteration race

Add a test which shows a race in the multi-order iteration code.  This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64).  With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.

The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree.  Remember that an order 2
entry looks like this:

  struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]

where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'

When we delete 'entry' from the tree, we call :

  radix_tree_delete()
    radix_tree_delete_item()
      __radix_tree_delete()
        replace_slot()

replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL.  This means that for a
brief period of time we end up with one or more of the siblings removed,
so:

  struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]

This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection.  This is a common case in
mm/filemap.c.

The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:

                                      V preceding slot
  struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
                                              ^ current slot

This lets you find the first sibling, and you skip them all in order.

But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:

                                             V preceding slot
  struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
                                                    ^ current slot

This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers.  This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.

In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.

In the radix tree test suite this will be caught by the address
sanitizer:

  ==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
  0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
  READ of size 8 at 0x60c0008ae400 thread T3
      #0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
      #1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
      #2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
      #3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
      #4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)

Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/radix-tree/multiorder.c | 63 +++++++++++++++++++++++++++
 tools/testing/radix-tree/test.h       |  1 +
 2 files changed, 64 insertions(+)

diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 59245b3d587c..7bf405638b0b 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -16,6 +16,7 @@
 #include <linux/radix-tree.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
+#include <pthread.h>
 
 #include "test.h"
 
@@ -624,6 +625,67 @@ static void multiorder_account(void)
 	item_kill_tree(&tree);
 }
 
+bool stop_iteration = false;
+
+static void *creator_func(void *ptr)
+{
+	/* 'order' is set up to ensure we have sibling entries */
+	unsigned int order = RADIX_TREE_MAP_SHIFT - 1;
+	struct radix_tree_root *tree = ptr;
+	int i;
+
+	for (i = 0; i < 10000; i++) {
+		item_insert_order(tree, 0, order);
+		item_delete_rcu(tree, 0);
+	}
+
+	stop_iteration = true;
+	return NULL;
+}
+
+static void *iterator_func(void *ptr)
+{
+	struct radix_tree_root *tree = ptr;
+	struct radix_tree_iter iter;
+	struct item *item;
+	void **slot;
+
+	while (!stop_iteration) {
+		rcu_read_lock();
+		radix_tree_for_each_slot(slot, tree, &iter, 0) {
+			item = radix_tree_deref_slot(slot);
+
+			if (!item)
+				continue;
+			if (radix_tree_deref_retry(item)) {
+				slot = radix_tree_iter_retry(&iter);
+				continue;
+			}
+
+			item_sanity(item, iter.index);
+		}
+		rcu_read_unlock();
+	}
+	return NULL;
+}
+
+static void multiorder_iteration_race(void)
+{
+	const int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
+	pthread_t worker_thread[num_threads];
+	RADIX_TREE(tree, GFP_KERNEL);
+	int i;
+
+	pthread_create(&worker_thread[0], NULL, &creator_func, &tree);
+	for (i = 1; i < num_threads; i++)
+		pthread_create(&worker_thread[i], NULL, &iterator_func, &tree);
+
+	for (i = 0; i < num_threads; i++)
+		pthread_join(worker_thread[i], NULL);
+
+	item_kill_tree(&tree);
+}
+
 void multiorder_checks(void)
 {
 	int i;
@@ -644,6 +706,7 @@ void multiorder_checks(void)
 	multiorder_join();
 	multiorder_split();
 	multiorder_account();
+	multiorder_iteration_race();
 
 	radix_tree_cpu_dead(0);
 }
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 8cf4a7a7f94c..31f1d9b6f506 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -13,6 +13,7 @@ struct item {
 struct item *item_create(unsigned long index, unsigned int order);
 int __item_insert(struct radix_tree_root *root, struct item *item);
 int item_insert(struct radix_tree_root *root, unsigned long index);
+void item_sanity(struct item *item, unsigned long index);
 int item_insert_order(struct radix_tree_root *root, unsigned long index,
 			unsigned order);
 int item_delete(struct radix_tree_root *root, unsigned long index);

From 9f418224e8114156d995b98fa4e0f4fd21f685fe Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Fri, 18 May 2018 16:09:06 -0700
Subject: [PATCH 346/393] radix tree: fix multi-order iteration race

Fix a race in the multi-order iteration code which causes the kernel to
hit a GP fault.  This was first seen with a production v4.15 based
kernel (4.15.6-300.fc27.x86_64) utilizing a DAX workload which used
order 9 PMD DAX entries.

The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree.  Remember for example that
an order 2 entry looks like this:

  struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]

where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'

When we delete 'entry' from the tree, we call :

  radix_tree_delete()
    radix_tree_delete_item()
      __radix_tree_delete()
        replace_slot()

replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL.  This means that for a
brief period of time we end up with one or more of the siblings removed,
so:

  struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]

This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection.  This is a common case in
mm/filemap.c.

The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:

                                      V preceding slot
  struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
                                              ^ current slot

This lets you find the first sibling, and you skip them all in order.

But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:

                                             V preceding slot
  struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
                                                    ^ current slot

This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers.  This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.

In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.

We fix this race by fixing the way that skip_siblings() detects sibling
nodes.  Instead of testing against the preceding slot we instead look
for siblings via is_sibling_entry() which compares against the position
of the struct radix_tree_node.slots[] array.  This ensures that sibling
entries are properly identified, even if they are no longer contiguous
with the 'entry' they point to.

Link: http://lkml.kernel.org/r/20180503192430.7582-6-ross.zwisler@linux.intel.com
Fixes: 148deab223b2 ("radix-tree: improve multiorder iterators")
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: CR, Sapthagirish <sapthagirish.cr@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/radix-tree.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index da9e10c827df..43e0cbedc3a0 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1612,11 +1612,9 @@ static void set_iter_tags(struct radix_tree_iter *iter,
 static void __rcu **skip_siblings(struct radix_tree_node **nodep,
 			void __rcu **slot, struct radix_tree_iter *iter)
 {
-	void *sib = node_to_entry(slot - 1);
-
 	while (iter->index < iter->next_index) {
 		*nodep = rcu_dereference_raw(*slot);
-		if (*nodep && *nodep != sib)
+		if (*nodep && !is_sibling_entry(iter->node, *nodep))
 			return slot;
 		slot++;
 		iter->index = __radix_tree_iter_add(iter, 1);
@@ -1631,7 +1629,7 @@ void __rcu **__radix_tree_next_slot(void __rcu **slot,
 				struct radix_tree_iter *iter, unsigned flags)
 {
 	unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
-	struct radix_tree_node *node = rcu_dereference_raw(*slot);
+	struct radix_tree_node *node;
 
 	slot = skip_siblings(&node, slot, iter);
 

From f3d8d3cfc1c00d1ebdb53ee96aa53aa822209003 Mon Sep 17 00:00:00 2001
From: "Shuah Khan (Samsung OSG)" <shuah@kernel.org>
Date: Fri, 18 May 2018 16:09:09 -0700
Subject: [PATCH 347/393] MAINTAINERS: add Q: entry to kselftest for patchwork
 project

A new patchwork project is created to track kselftest patches.  Update
the kselftest entry in the MAINTAINERS file adding 'Q:' entry:

  https://patchwork.kernel.org/project/linux-kselftest/list/

Link: http://lkml.kernel.org/r/20180515164427.12201-1-shuah@kernel.org
Signed-off-by: Shuah Khan (Samsung OSG) <shuah@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 92e47b5b0480..9d1bc95dfe3c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7698,6 +7698,7 @@ KERNEL SELFTEST FRAMEWORK
 M:	Shuah Khan <shuah@kernel.org>
 L:	linux-kselftest@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git
+Q:	https://patchwork.kernel.org/project/linux-kselftest/list/
 S:	Maintained
 F:	tools/testing/selftests/
 F:	Documentation/dev-tools/kselftest*

From ab1e8d8960b68f54af42b6484b5950bd13a4054b Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Fri, 18 May 2018 16:09:13 -0700
Subject: [PATCH 348/393] mm: don't allow deferred pages with NEED_PER_CPU_KM

It is unsafe to do virtual to physical translations before mm_init() is
called if struct page is needed in order to determine the memory section
number (see SECTION_IN_PAGE_FLAGS).  This is because only in mm_init()
we initialize struct pages for all the allocated memory when deferred
struct pages are used.

My recent fix in commit c9e97a1997 ("mm: initialize pages on demand
during boot") exposed this problem, because it greatly reduced number of
pages that are initialized before mm_init(), but the problem existed
even before my fix, as Fengguang Wu found.

Below is a more detailed explanation of the problem.

We initialize struct pages in four places:

1. Early in boot a small set of struct pages is initialized to fill the
   first section, and lower zones.

2. During mm_init() we initialize "struct pages" for all the memory that
   is allocated, i.e reserved in memblock.

3. Using on-demand logic when pages are allocated after mm_init call
   (when memblock is finished)

4. After smp_init() when the rest free deferred pages are initialized.

The problem occurs if we try to do va to phys translation of a memory
between steps 1 and 2.  Because we have not yet initialized struct pages
for all the reserved pages, it is inherently unsafe to do va to phys if
the translation itself requires access of "struct page" as in case of
this combination: CONFIG_SPARSE && !CONFIG_SPARSE_VMEMMAP

The following path exposes the problem:

  start_kernel()
   trap_init()
    setup_cpu_entry_areas()
     setup_cpu_entry_area(cpu)
      get_cpu_gdt_paddr(cpu)
       per_cpu_ptr_to_phys(addr)
        pcpu_addr_to_page(addr)
         virt_to_page(addr)
          pfn_to_page(__pa(addr) >> PAGE_SHIFT)

We disable this path by not allowing NEED_PER_CPU_KM with deferred
struct pages feature.

The problems are discussed in these threads:
  http://lkml.kernel.org/r/20180418135300.inazvpxjxowogyge@wfg-t540p.sh.intel.com
  http://lkml.kernel.org/r/20180419013128.iurzouiqxvcnpbvz@wfg-t540p.sh.intel.com
  http://lkml.kernel.org/r/20180426202619.2768-1-pasha.tatashin@oracle.com

Link: http://lkml.kernel.org/r/20180515175124.1770-1-pasha.tatashin@oracle.com
Fixes: 3a80a7fa7989 ("mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set")
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Steven Sistare <steven.sistare@oracle.com>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Dennis Zhou <dennisszhou@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index d5004d82a1d6..e14c01513bfd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -636,6 +636,7 @@ config DEFERRED_STRUCT_PAGE_INIT
 	default n
 	depends on NO_BOOTMEM
 	depends on !FLATMEM
+	depends on !NEED_PER_CPU_KM
 	help
 	  Ordinarily all struct pages are initialised during early boot in a
 	  single thread. On very large machines this can take a considerable

From 66072c29328717072fd84aaff3e070e3f008ba77 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 18 May 2018 16:09:16 -0700
Subject: [PATCH 349/393] hfsplus: stop workqueue when fill_super() failed

syzbot is reporting ODEBUG messages at hfsplus_fill_super() [1].  This
is because hfsplus_fill_super() forgot to call cancel_delayed_work_sync().

As far as I can see, it is hfsplus_mark_mdb_dirty() from
hfsplus_new_inode() in hfsplus_fill_super() that calls
queue_delayed_work().  Therefore, I assume that hfsplus_new_inode() does
not fail if queue_delayed_work() was called, and the out_put_hidden_dir
label is the appropriate location to call cancel_delayed_work_sync().

[1] https://syzkaller.appspot.com/bug?id=a66f45e96fdbeb76b796bf46eb25ea878c42a6c9

Link: http://lkml.kernel.org/r/964a8b27-cd69-357c-fe78-76b066056201@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+4f2e5f086147d543ab03@syzkaller.appspotmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Ernesto A. Fernandez <ernesto.mnd.fernandez@gmail.com>
Cc: Vyacheslav Dubeyko <slava@dubeyko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/super.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 513c357c734b..a6c0f54c48c3 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -588,6 +588,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 
 out_put_hidden_dir:
+	cancel_delayed_work_sync(&sbi->sync_work);
 	iput(sbi->hidden_dir);
 out_put_root:
 	dput(sb->s_root);

From 4f74d72aa7067e75af92fbab077e6d7d0210be66 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 18 May 2018 16:08:41 +0200
Subject: [PATCH 350/393] efi/libstub/arm64: Handle randomized TEXT_OFFSET

When CONFIG_RANDOMIZE_TEXT_OFFSET=y, TEXT_OFFSET is an arbitrary
multiple of PAGE_SIZE in the interval [0, 2MB).

The EFI stub does not account for the potential misalignment of
TEXT_OFFSET relative to EFI_KIMG_ALIGN, and produces a randomized
physical offset which is always a round multiple of EFI_KIMG_ALIGN.
This may result in statically allocated objects whose alignment exceeds
PAGE_SIZE to appear misaligned in memory. This has been observed to
result in spurious stack overflow reports and failure to make use of
the IRQ stacks, and theoretically could result in a number of other
issues.

We can OR in the low bits of TEXT_OFFSET to ensure that we have the
necessary offset (and hence preserve the misalignment of TEXT_OFFSET
relative to EFI_KIMG_ALIGN), so let's do that.

Reported-by: Kim Phillips <kim.phillips@arm.com>
Tested-by: Kim Phillips <kim.phillips@arm.com>
[ardb: clarify comment and commit log, drop unneeded parens]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Fixes: 6f26b3671184c36d ("arm64: kaslr: increase randomization granularity")
Link: http://lkml.kernel.org/r/20180518140841.9731-2-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/firmware/efi/libstub/arm64-stub.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c
index b9bd827caa22..1b4d465cc5d9 100644
--- a/drivers/firmware/efi/libstub/arm64-stub.c
+++ b/drivers/firmware/efi/libstub/arm64-stub.c
@@ -97,6 +97,16 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table_arg,
 		u32 offset = !IS_ENABLED(CONFIG_DEBUG_ALIGN_RODATA) ?
 			     (phys_seed >> 32) & mask : TEXT_OFFSET;
 
+		/*
+		 * With CONFIG_RANDOMIZE_TEXT_OFFSET=y, TEXT_OFFSET may not
+		 * be a multiple of EFI_KIMG_ALIGN, and we must ensure that
+		 * we preserve the misalignment of 'offset' relative to
+		 * EFI_KIMG_ALIGN so that statically allocated objects whose
+		 * alignment exceeds PAGE_SIZE appear correctly aligned in
+		 * memory.
+		 */
+		offset |= TEXT_OFFSET % EFI_KIMG_ALIGN;
+
 		/*
 		 * If KASLR is enabled, and we have some randomness available,
 		 * locate the kernel at a randomized offset in physical memory.

From 7dec80ccbe310fb7e225bf21c48c672bb780ce7b Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 18 May 2018 15:10:34 -0500
Subject: [PATCH 351/393] objtool: Detect RIP-relative switch table references,
 part 2

With the following commit:

  fd35c88b7417 ("objtool: Support GCC 8 switch tables")

I added a "can't find switch jump table" warning, to stop covering up
silent failures if add_switch_table() can't find anything.

That warning found yet another bug in the objtool switch table detection
logic.  For cases 1 and 2 (as described in the comments of
find_switch_table()), the find_symbol_containing() check doesn't adjust
the offset for RIP-relative switch jumps.

Incidentally, this bug was already fixed for case 3 with:

  6f5ec2993b1f ("objtool: Detect RIP-relative switch table references")

However, that commit missed the fix for cases 1 and 2.

The different cases are now starting to look more and more alike.  So
fix the bug by consolidating them into a single case, by checking the
original dynamic jump instruction in the case 3 loop.

This also simplifies the code and makes it more robust against future
switch table detection issues -- of which I'm sure there will be many...

Switch table detection has been the most fragile area of objtool, by
far.  I long for the day when we'll have a GCC plugin for annotating
switch tables.  Linus asked me to delay such a plugin due to the
flakiness of the plugin infrastructure in older versions of GCC, so this
rickety code is what we're stuck with for now.  At least the code is now
a little simpler than it was.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/f400541613d45689086329432f3095119ffbc328.1526674218.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/check.c | 37 ++++++++++++-------------------------
 1 file changed, 12 insertions(+), 25 deletions(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index f4bbce838433..3a31b238f885 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -905,40 +905,19 @@ static struct rela *find_switch_table(struct objtool_file *file,
 	struct instruction *orig_insn = insn;
 	unsigned long table_offset;
 
-	/* case 1 & 2 */
-	text_rela = find_rela_by_dest_range(insn->sec, insn->offset, insn->len);
-	if (text_rela && text_rela->sym == file->rodata->sym &&
-	    !find_symbol_containing(file->rodata, text_rela->addend)) {
-
-		table_offset = text_rela->addend;
-		if (text_rela->type == R_X86_64_PC32) {
-			/* case 2 */
-			table_offset += 4;
-			file->ignore_unreachables = true;
-		}
-
-		rodata_rela = find_rela_by_dest(file->rodata, table_offset);
-		if (!rodata_rela)
-			return NULL;
-
-		return rodata_rela;
-	}
-
-	/* case 3 */
 	/*
 	 * Backward search using the @first_jump_src links, these help avoid
 	 * much of the 'in between' code. Which avoids us getting confused by
 	 * it.
 	 */
-	for (insn = list_prev_entry(insn, list);
-
+	for (;
 	     &insn->list != &file->insn_list &&
 	     insn->sec == func->sec &&
 	     insn->offset >= func->offset;
 
 	     insn = insn->first_jump_src ?: list_prev_entry(insn, list)) {
 
-		if (insn->type == INSN_JUMP_DYNAMIC)
+		if (insn != orig_insn && insn->type == INSN_JUMP_DYNAMIC)
 			break;
 
 		/* allow small jumps within the range */
@@ -965,10 +944,18 @@ static struct rela *find_switch_table(struct objtool_file *file,
 		if (find_symbol_containing(file->rodata, table_offset))
 			continue;
 
-		/* mov [rodata addr], %reg */
 		rodata_rela = find_rela_by_dest(file->rodata, table_offset);
-		if (rodata_rela)
+		if (rodata_rela) {
+			/*
+			 * Use of RIP-relative switch jumps is quite rare, and
+			 * indicates a rare GCC quirk/bug which can leave dead
+			 * code behind.
+			 */
+			if (text_rela->type == R_X86_64_PC32)
+				file->ignore_unreachables = true;
+
 			return rodata_rela;
+		}
 	}
 
 	return NULL;

From acf46020012ccbca1172e9c7aeab399c950d9212 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Fri, 18 May 2018 00:35:10 +0100
Subject: [PATCH 352/393] x86/mm: Drop TS_COMPAT on 64-bit exec() syscall

The x86 mmap() code selects the mmap base for an allocation depending on
the bitness of the syscall. For 64bit sycalls it select mm->mmap_base and
for 32bit mm->mmap_compat_base.

exec() calls mmap() which in turn uses in_compat_syscall() to check whether
the mapping is for a 32bit or a 64bit task. The decision is made on the
following criteria:

  ia32    child->thread.status & TS_COMPAT
   x32    child->pt_regs.orig_ax & __X32_SYSCALL_BIT
  ia64    !ia32 && !x32

__set_personality_x32() was dropping TS_COMPAT flag, but
set_personality_64bit() has kept compat syscall flag making
in_compat_syscall() return true during the first exec() syscall.

Which in result has user-visible effects, mentioned by Alexey:
1) It breaks ASAN
$ gcc -fsanitize=address wrap.c -o wrap-asan
$ ./wrap32 ./wrap-asan true
==1217==Shadow memory range interleaves with an existing memory mapping. ASan cannot proceed correctly. ABORTING.
==1217==ASan shadow was supposed to be located in the [0x00007fff7000-0x10007fff7fff] range.
==1217==Process memory map follows:
        0x000000400000-0x000000401000   /home/izbyshev/test/gcc/asan-exec-from-32bit/wrap-asan
        0x000000600000-0x000000601000   /home/izbyshev/test/gcc/asan-exec-from-32bit/wrap-asan
        0x000000601000-0x000000602000   /home/izbyshev/test/gcc/asan-exec-from-32bit/wrap-asan
        0x0000f7dbd000-0x0000f7de2000   /lib64/ld-2.27.so
        0x0000f7fe2000-0x0000f7fe3000   /lib64/ld-2.27.so
        0x0000f7fe3000-0x0000f7fe4000   /lib64/ld-2.27.so
        0x0000f7fe4000-0x0000f7fe5000
        0x7fed9abff000-0x7fed9af54000
        0x7fed9af54000-0x7fed9af6b000   /lib64/libgcc_s.so.1
[snip]

2) It doesn't seem to be great for security if an attacker always knows
that ld.so is going to be mapped into the first 4GB in this case
(the same thing happens for PIEs as well).

The testcase:
$ cat wrap.c

int main(int argc, char *argv[]) {
  execvp(argv[1], &argv[1]);
  return 127;
}

$ gcc wrap.c -o wrap
$ LD_SHOW_AUXV=1 ./wrap ./wrap true |& grep AT_BASE
AT_BASE:         0x7f63b8309000
AT_BASE:         0x7faec143c000
AT_BASE:         0x7fbdb25fa000

$ gcc -m32 wrap.c -o wrap32
$ LD_SHOW_AUXV=1 ./wrap32 ./wrap true |& grep AT_BASE
AT_BASE:         0xf7eff000
AT_BASE:         0xf7cee000
AT_BASE:         0x7f8b9774e000

Fixes: 1b028f784e8c ("x86/mm: Introduce mmap_compat_base() for 32-bit mmap()")
Fixes: ada26481dfe6 ("x86/mm: Make in_compat_syscall() work during exec")
Reported-by: Alexey Izbyshev <izbyshev@ispras.ru>
Bisected-by: Alexander Monakov <amonakov@ispras.ru>
Investigated-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Alexander Monakov <amonakov@ispras.ru>
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: stable@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Link: https://lkml.kernel.org/r/20180517233510.24996-1-dima@arista.com
---
 arch/x86/kernel/process_64.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4b100fe0f508..12bb445fb98d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -542,6 +542,7 @@ void set_personality_64bit(void)
 	clear_thread_flag(TIF_X32);
 	/* Pretend that this comes from a 64bit execve */
 	task_pt_regs(current)->orig_ax = __NR_execve;
+	current_thread_info()->status &= ~TS_COMPAT;
 
 	/* Ensure the corresponding mm is not marked. */
 	if (current->mm)

From e07e3c33b9c0b5751ade624f44325c9bf2487ea6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Stelmach?= <l.stelmach@samsung.com>
Date: Tue, 3 Apr 2018 09:04:57 +0100
Subject: [PATCH 353/393] ARM: 8753/1: decompressor: add a missing parameter to
 the addruart macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In commit 639da5ee374b ("ARM: add an extra temp register to the low
level debugging addruart macro") an additional temporary register was
added to the addruart macro, but the decompressor code wasn't updated.

Fixes: 639da5ee374b ("ARM: add an extra temp register to the low level debugging addruart macro")
Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/boot/compressed/head.S | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 45c8823c3750..182bf6add0b9 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -29,19 +29,19 @@
 #if defined(CONFIG_DEBUG_ICEDCC)
 
 #if defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_V6K) || defined(CONFIG_CPU_V7)
-		.macro	loadsp, rb, tmp
+		.macro	loadsp, rb, tmp1, tmp2
 		.endm
 		.macro	writeb, ch, rb
 		mcr	p14, 0, \ch, c0, c5, 0
 		.endm
 #elif defined(CONFIG_CPU_XSCALE)
-		.macro	loadsp, rb, tmp
+		.macro	loadsp, rb, tmp1, tmp2
 		.endm
 		.macro	writeb, ch, rb
 		mcr	p14, 0, \ch, c8, c0, 0
 		.endm
 #else
-		.macro	loadsp, rb, tmp
+		.macro	loadsp, rb, tmp1, tmp2
 		.endm
 		.macro	writeb, ch, rb
 		mcr	p14, 0, \ch, c1, c0, 0
@@ -57,7 +57,7 @@
 		.endm
 
 #if defined(CONFIG_ARCH_SA1100)
-		.macro	loadsp, rb, tmp
+		.macro	loadsp, rb, tmp1, tmp2
 		mov	\rb, #0x80000000	@ physical base address
 #ifdef CONFIG_DEBUG_LL_SER3
 		add	\rb, \rb, #0x00050000	@ Ser3
@@ -66,8 +66,8 @@
 #endif
 		.endm
 #else
-		.macro	loadsp,	rb, tmp
-		addruart \rb, \tmp
+		.macro	loadsp,	rb, tmp1, tmp2
+		addruart \rb, \tmp1, \tmp2
 		.endm
 #endif
 #endif
@@ -1297,7 +1297,7 @@ phex:		adr	r3, phexbuf
 		b	1b
 
 @ puts corrupts {r0, r1, r2, r3}
-puts:		loadsp	r3, r1
+puts:		loadsp	r3, r2, r1
 1:		ldrb	r2, [r0], #1
 		teq	r2, #0
 		moveq	pc, lr
@@ -1314,8 +1314,8 @@ puts:		loadsp	r3, r1
 @ putc corrupts {r0, r1, r2, r3}
 putc:
 		mov	r2, r0
+		loadsp	r3, r1, r0
 		mov	r0, #0
-		loadsp	r3, r1
 		b	2b
 
 @ memdump corrupts {r0, r1, r2, r3, r10, r11, r12, lr}

From f2ae9de019e4e2807d812ec4fe1df7c34788a0a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Stelmach?= <l.stelmach@samsung.com>
Date: Wed, 4 Apr 2018 08:46:58 +0100
Subject: [PATCH 354/393] ARM: 8758/1: decompressor: restore r1 and r2 just
 before jumping to the kernel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hypervisor setup before __enter_kernel destroys the value
sotred in r1. The value needs to be restored just before the jump.

Fixes: 6b52f7bdb888 ("ARM: hyp-stub: Use r1 for the soft-restart address")
Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/boot/compressed/head.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 182bf6add0b9..517e0e18f0b8 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -561,8 +561,6 @@ not_relocated:	mov	r0, #0
 		bl	decompress_kernel
 		bl	cache_clean_flush
 		bl	cache_off
-		mov	r1, r7			@ restore architecture number
-		mov	r2, r8			@ restore atags pointer
 
 #ifdef CONFIG_ARM_VIRT_EXT
 		mrs	r0, spsr		@ Get saved CPU boot mode
@@ -1365,6 +1363,8 @@ __hyp_reentry_vectors:
 
 __enter_kernel:
 		mov	r0, #0			@ must be 0
+		mov	r1, r7			@ restore architecture number
+		mov	r2, r8			@ restore atags pointer
  ARM(		mov	pc, r4		)	@ call kernel
  M_CLASS(	add	r4, r4, #1	)	@ enter in Thumb mode for M class
  THUMB(		bx	r4		)	@ entry point is always ARM for A/R classes

From 2d7b3c64431245c95b05a441669c074da10db943 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Wed, 11 Apr 2018 18:24:01 +0100
Subject: [PATCH 355/393] ARM: kexec: fix kdump register saving on panic()

When a panic() occurs, the kexec code uses smp_send_stop() to stop
the other CPUs, but this results in the CPU register state not being
saved, and gdb is unable to inspect the state of other CPUs.

Commit 0ee59413c967 ("x86/panic: replace smp_send_stop() with kdump
friendly version in panic path") addressed the issue on x86, but
ignored other architectures.  Address the issue on ARM by splitting
out the crash stop implementation to crash_smp_send_stop() and
adding the necessary protection.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/kernel/machine_kexec.c | 34 +++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c
index 6b38d7a634c1..c15318431986 100644
--- a/arch/arm/kernel/machine_kexec.c
+++ b/arch/arm/kernel/machine_kexec.c
@@ -95,6 +95,27 @@ void machine_crash_nonpanic_core(void *unused)
 		cpu_relax();
 }
 
+void crash_smp_send_stop(void)
+{
+	static int cpus_stopped;
+	unsigned long msecs;
+
+	if (cpus_stopped)
+		return;
+
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+	smp_call_function(machine_crash_nonpanic_core, NULL, false);
+	msecs = 1000; /* Wait at most a second for the other cpus to stop */
+	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+		mdelay(1);
+		msecs--;
+	}
+	if (atomic_read(&waiting_for_crash_ipi) > 0)
+		pr_warn("Non-crashing CPUs did not react to IPI\n");
+
+	cpus_stopped = 1;
+}
+
 static void machine_kexec_mask_interrupts(void)
 {
 	unsigned int i;
@@ -120,19 +141,8 @@ static void machine_kexec_mask_interrupts(void)
 
 void machine_crash_shutdown(struct pt_regs *regs)
 {
-	unsigned long msecs;
-
 	local_irq_disable();
-
-	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
-	smp_call_function(machine_crash_nonpanic_core, NULL, false);
-	msecs = 1000; /* Wait at most a second for the other cpus to stop */
-	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
-		mdelay(1);
-		msecs--;
-	}
-	if (atomic_read(&waiting_for_crash_ipi) > 0)
-		pr_warn("Non-crashing CPUs did not react to IPI\n");
+	crash_smp_send_stop();
 
 	crash_save_cpu(regs, smp_processor_id());
 	machine_kexec_mask_interrupts();

From 1c37963b1a1600b9686c4a99857ddcb6028be884 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Wed, 11 Apr 2018 19:35:19 +0100
Subject: [PATCH 356/393] ARM: kexec: record parent context registers for
 non-crash CPUs

How we got to machine_crash_nonpanic_core() (iow, from an IPI, etc) is
not interesting for debugging a crash.  The more interesting context
is the parent context prior to the IPI being received.

Record the parent context register state rather than the register state
in machine_crash_nonpanic_core(), which is more relevant to the failing
condition.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/kernel/machine_kexec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c
index c15318431986..dd2eb5f76b9f 100644
--- a/arch/arm/kernel/machine_kexec.c
+++ b/arch/arm/kernel/machine_kexec.c
@@ -83,7 +83,7 @@ void machine_crash_nonpanic_core(void *unused)
 {
 	struct pt_regs regs;
 
-	crash_setup_regs(&regs, NULL);
+	crash_setup_regs(&regs, get_irq_regs());
 	printk(KERN_DEBUG "CPU %u will stop doing anything useful since another CPU has crashed\n",
 	       smp_processor_id());
 	crash_save_cpu(&regs, smp_processor_id());

From 6cea14f55474ec71f1098228e0ae5dd2a8f22c0a Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Apr 2018 13:21:54 +0100
Subject: [PATCH 357/393] ARM: replace unnecessary perl with sed and the shell
 $(( )) operator

You can build a kernel in a cross compiling environment that doesn't
have perl in the $PATH. Commit 429f7a062e3b broke that for 32 bit
ARM. Fix it.

As reported by Stephen Rothwell, it appears that the symbols can be
either part of the BSS section or absolute symbols depending on the
binutils version.  When they're an absolute symbol, the $(( ))
operator errors out and the build fails.  Fix this as well.

Fixes: 429f7a062e3b ("ARM: decompressor: fix BSS size calculation")
Reported-by: Rob Landley <rob@landley.net>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Rob Landley <rob@landley.net>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/boot/compressed/Makefile | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index 45a6b9b7af2a..6a4e7341ecd3 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -117,11 +117,9 @@ ccflags-y := -fpic -mno-single-pic-base -fno-builtin -I$(obj)
 asflags-y := -DZIMAGE
 
 # Supply kernel BSS size to the decompressor via a linker symbol.
-KBSS_SZ = $(shell $(CROSS_COMPILE)nm $(obj)/../../../../vmlinux | \
-		perl -e 'while (<>) { \
-			$$bss_start=hex($$1) if /^([[:xdigit:]]+) B __bss_start$$/; \
-			$$bss_end=hex($$1) if /^([[:xdigit:]]+) B __bss_stop$$/; \
-		}; printf "%d\n", $$bss_end - $$bss_start;')
+KBSS_SZ = $(shell echo $$(($$($(CROSS_COMPILE)nm $(obj)/../../../../vmlinux | \
+		sed -n -e 's/^\([^ ]*\) [AB] __bss_start$$/-0x\1/p' \
+		       -e 's/^\([^ ]*\) [AB] __bss_stop$$/+0x\1/p') )) )
 LDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ)
 # Supply ZRELADDR to the decompressor via a linker symbol.
 ifneq ($(CONFIG_AUTO_ZRELADDR),y)

From 69af7e23a6870df2ea6fa79ca16493d59b3eebeb Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Sun, 13 May 2018 05:03:54 +0100
Subject: [PATCH 358/393] ARM: 8769/1: kprobes: Fix to use get_kprobe_ctlblk
 after irq-disabed

Since get_kprobe_ctlblk() uses smp_processor_id() to access
per-cpu variable, it hits smp_processor_id sanity check as below.

[    7.006928] BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
[    7.007859] caller is debug_smp_processor_id+0x20/0x24
[    7.008438] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.16.0-rc1-00192-g4eb17253e4b5 #1
[    7.008890] Hardware name: Generic DT based system
[    7.009917] [<c0313f0c>] (unwind_backtrace) from [<c030e6d8>] (show_stack+0x20/0x24)
[    7.010473] [<c030e6d8>] (show_stack) from [<c0c64694>] (dump_stack+0x84/0x98)
[    7.010990] [<c0c64694>] (dump_stack) from [<c071ca5c>] (check_preemption_disabled+0x138/0x13c)
[    7.011592] [<c071ca5c>] (check_preemption_disabled) from [<c071ca80>] (debug_smp_processor_id+0x20/0x24)
[    7.012214] [<c071ca80>] (debug_smp_processor_id) from [<c03335e0>] (optimized_callback+0x2c/0xe4)
[    7.013077] [<c03335e0>] (optimized_callback) from [<bf0021b0>] (0xbf0021b0)

To fix this issue, call get_kprobe_ctlblk() right after
irq-disabled since that disables preemption.

Fixes: 0dc016dbd820 ("ARM: kprobes: enable OPTPROBES for ARM 32")
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/probes/kprobes/opt-arm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c
index bcdecc25461b..ddc5a82eb10d 100644
--- a/arch/arm/probes/kprobes/opt-arm.c
+++ b/arch/arm/probes/kprobes/opt-arm.c
@@ -165,13 +165,14 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
 {
 	unsigned long flags;
 	struct kprobe *p = &op->kp;
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	struct kprobe_ctlblk *kcb;
 
 	/* Save skipped registers */
 	regs->ARM_pc = (unsigned long)op->kp.addr;
 	regs->ARM_ORIG_r0 = ~0UL;
 
 	local_irq_save(flags);
+	kcb = get_kprobe_ctlblk();
 
 	if (kprobe_running()) {
 		kprobes_inc_nmissed_count(&op->kp);

From 70948c05fdde0aac32f9667856a88725c192fa40 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Sun, 13 May 2018 05:04:10 +0100
Subject: [PATCH 359/393] ARM: 8770/1: kprobes: Prohibit probing on
 optimized_callback

Prohibit probing on optimized_callback() because
it is called from kprobes itself. If we put a kprobes
on it, that will cause a recursive call loop.
Mark it NOKPROBE_SYMBOL.

Fixes: 0dc016dbd820 ("ARM: kprobes: enable OPTPROBES for ARM 32")
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/probes/kprobes/opt-arm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c
index ddc5a82eb10d..b2aa9b32bff2 100644
--- a/arch/arm/probes/kprobes/opt-arm.c
+++ b/arch/arm/probes/kprobes/opt-arm.c
@@ -192,6 +192,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
 
 	local_irq_restore(flags);
 }
+NOKPROBE_SYMBOL(optimized_callback)
 
 int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *orig)
 {

From eb0146daefdde65665b7f076fbff7b49dade95b9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Sun, 13 May 2018 05:04:16 +0100
Subject: [PATCH 360/393] ARM: 8771/1: kprobes: Prohibit kprobes on
 do_undefinstr

Prohibit kprobes on do_undefinstr because kprobes on
arm is implemented by undefined instruction. This means
if we probe do_undefinstr(), it can cause infinit
recursive exception.

Fixes: 24ba613c9d6c ("ARM kprobes: core code")
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/kernel/traps.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 5e3633c24e63..2fe87109ae46 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -19,6 +19,7 @@
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
 #include <linux/kdebug.h>
+#include <linux/kprobes.h>
 #include <linux/module.h>
 #include <linux/kexec.h>
 #include <linux/bug.h>
@@ -417,7 +418,8 @@ void unregister_undef_hook(struct undef_hook *hook)
 	raw_spin_unlock_irqrestore(&undef_lock, flags);
 }
 
-static int call_undef_hook(struct pt_regs *regs, unsigned int instr)
+static nokprobe_inline
+int call_undef_hook(struct pt_regs *regs, unsigned int instr)
 {
 	struct undef_hook *hook;
 	unsigned long flags;
@@ -490,6 +492,7 @@ asmlinkage void do_undefinstr(struct pt_regs *regs)
 
 	arm_notify_die("Oops - undefined instruction", regs, &info, 0, 6);
 }
+NOKPROBE_SYMBOL(do_undefinstr)
 
 /*
  * Handle FIQ similarly to NMI on x86 systems.

From 0d73c3f8e7f6ee2aab1bb350f60c180f5ae21a2c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Sun, 13 May 2018 05:04:29 +0100
Subject: [PATCH 361/393] ARM: 8772/1: kprobes: Prohibit kprobes on get_user
 functions

Since do_undefinstr() uses get_user to get the undefined
instruction, it can be called before kprobes processes
recursive check. This can cause an infinit recursive
exception.
Prohibit probing on get_user functions.

Fixes: 24ba613c9d6c ("ARM kprobes: core code")
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/include/asm/assembler.h | 10 ++++++++++
 arch/arm/lib/getuser.S           | 10 ++++++++++
 2 files changed, 20 insertions(+)

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index bc8d4bbd82e2..9342904cccca 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -536,4 +536,14 @@ THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 #endif
 	.endm
 
+#ifdef CONFIG_KPROBES
+#define _ASM_NOKPROBE(entry)				\
+	.pushsection "_kprobe_blacklist", "aw" ;	\
+	.balign 4 ;					\
+	.long entry;					\
+	.popsection
+#else
+#define _ASM_NOKPROBE(entry)
+#endif
+
 #endif /* __ASM_ASSEMBLER_H__ */
diff --git a/arch/arm/lib/getuser.S b/arch/arm/lib/getuser.S
index df73914e81c8..746e7801dcdf 100644
--- a/arch/arm/lib/getuser.S
+++ b/arch/arm/lib/getuser.S
@@ -38,6 +38,7 @@ ENTRY(__get_user_1)
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_1)
+_ASM_NOKPROBE(__get_user_1)
 
 ENTRY(__get_user_2)
 	check_uaccess r0, 2, r1, r2, __get_user_bad
@@ -58,6 +59,7 @@ rb	.req	r0
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_2)
+_ASM_NOKPROBE(__get_user_2)
 
 ENTRY(__get_user_4)
 	check_uaccess r0, 4, r1, r2, __get_user_bad
@@ -65,6 +67,7 @@ ENTRY(__get_user_4)
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_4)
+_ASM_NOKPROBE(__get_user_4)
 
 ENTRY(__get_user_8)
 	check_uaccess r0, 8, r1, r2, __get_user_bad8
@@ -78,6 +81,7 @@ ENTRY(__get_user_8)
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_8)
+_ASM_NOKPROBE(__get_user_8)
 
 #ifdef __ARMEB__
 ENTRY(__get_user_32t_8)
@@ -91,6 +95,7 @@ ENTRY(__get_user_32t_8)
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_32t_8)
+_ASM_NOKPROBE(__get_user_32t_8)
 
 ENTRY(__get_user_64t_1)
 	check_uaccess r0, 1, r1, r2, __get_user_bad8
@@ -98,6 +103,7 @@ ENTRY(__get_user_64t_1)
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_64t_1)
+_ASM_NOKPROBE(__get_user_64t_1)
 
 ENTRY(__get_user_64t_2)
 	check_uaccess r0, 2, r1, r2, __get_user_bad8
@@ -114,6 +120,7 @@ rb	.req	r0
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_64t_2)
+_ASM_NOKPROBE(__get_user_64t_2)
 
 ENTRY(__get_user_64t_4)
 	check_uaccess r0, 4, r1, r2, __get_user_bad8
@@ -121,6 +128,7 @@ ENTRY(__get_user_64t_4)
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_64t_4)
+_ASM_NOKPROBE(__get_user_64t_4)
 #endif
 
 __get_user_bad8:
@@ -131,6 +139,8 @@ __get_user_bad:
 	ret	lr
 ENDPROC(__get_user_bad)
 ENDPROC(__get_user_bad8)
+_ASM_NOKPROBE(__get_user_bad)
+_ASM_NOKPROBE(__get_user_bad8)
 
 .pushsection __ex_table, "a"
 	.long	1b, __get_user_bad

From 78ce241099bb363b19dbd0245442e66c8de8f567 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 17 May 2018 10:46:26 +0200
Subject: [PATCH 362/393] x86/MCE/AMD: Cache SMCA MISC block addresses

... into a global, two-dimensional array and service subsequent reads from
that cache to avoid rdmsr_on_cpu() calls during CPU hotplug (IPIs with IRQs
disabled).

In addition, this fixes a KASAN slab-out-of-bounds read due to wrong usage
of the bank->blocks pointer.

Fixes: 27bd59502702 ("x86/mce/AMD: Get address from already initialized block")
Reported-by: Johannes Hirte <johannes.hirte@datenkhaos.de>
Tested-by: Johannes Hirte <johannes.hirte@datenkhaos.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Yazen Ghannam <yazen.ghannam@amd.com>
Link: http://lkml.kernel.org/r/20180414004230.GA2033@probook
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 29 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f7666eef4a87..c8e038800591 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -94,6 +94,11 @@ static struct smca_bank_name smca_names[] = {
 	[SMCA_SMU]	= { "smu",		"System Management Unit" },
 };
 
+static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
+{
+	[0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
+};
+
 const char *smca_get_name(enum smca_bank_types t)
 {
 	if (t >= N_SMCA_BANK_TYPES)
@@ -443,20 +448,26 @@ static u32 smca_get_block_address(unsigned int cpu, unsigned int bank,
 	if (!block)
 		return MSR_AMD64_SMCA_MCx_MISC(bank);
 
+	/* Check our cache first: */
+	if (smca_bank_addrs[bank][block] != -1)
+		return smca_bank_addrs[bank][block];
+
 	/*
 	 * For SMCA enabled processors, BLKPTR field of the first MISC register
 	 * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
 	 */
 	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
-		return addr;
+		goto out;
 
 	if (!(low & MCI_CONFIG_MCAX))
-		return addr;
+		goto out;
 
 	if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
 	    (low & MASK_BLKPTR_LO))
-		return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+		addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
 
+out:
+	smca_bank_addrs[bank][block] = addr;
 	return addr;
 }
 
@@ -468,18 +479,6 @@ static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 hi
 	if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
 		return addr;
 
-	/* Get address from already initialized block. */
-	if (per_cpu(threshold_banks, cpu)) {
-		struct threshold_bank *bankp = per_cpu(threshold_banks, cpu)[bank];
-
-		if (bankp && bankp->blocks) {
-			struct threshold_block *blockp = &bankp->blocks[block];
-
-			if (blockp)
-				return blockp->address;
-		}
-	}
-
 	if (mce_flags.smca)
 		return smca_get_block_address(cpu, bank, block);
 

From 423913ad4ae5b3e8fb8983f70969fb522261ba26 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 19 May 2018 09:29:11 -0700
Subject: [PATCH 363/393] mmap: relax file size limit for regular files

Commit be83bbf80682 ("mmap: introduce sane default mmap limits") was
introduced to catch problems in various ad-hoc character device drivers
doing mmap and getting the size limits wrong.  In the process, it used
"known good" limits for the normal cases of mapping regular files and
block device drivers.

It turns out that the "s_maxbytes" limit was less "known good" than I
thought.  In particular, /proc doesn't set it, but exposes one regular
file to mmap: /proc/vmcore.  As a result, that file got limited to the
default MAX_INT s_maxbytes value.

This went unnoticed for a while, because apparently the only thing that
needs it is the s390 kernel zfcpdump, but there might be other tools
that use this too.

Vasily suggested just changing s_maxbytes for all of /proc, which isn't
wrong, but makes me nervous at this stage.  So instead, just make the
new mmap limit always be MAX_LFS_FILESIZE for regular files, which won't
affect anything else.  It wasn't the regular file case I was worried
about.

I'd really prefer for maxsize to have been per-inode, but that is not
how things are today.

Fixes: be83bbf80682 ("mmap: introduce sane default mmap limits")
Reported-by: Vasily Gorbik <gor@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 78e14facdb6e..fc41c0543d7f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1327,7 +1327,7 @@ static inline int mlock_future_check(struct mm_struct *mm,
 static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
 {
 	if (S_ISREG(inode->i_mode))
-		return inode->i_sb->s_maxbytes;
+		return MAX_LFS_FILESIZE;
 
 	if (S_ISBLK(inode->i_mode))
 		return MAX_LFS_FILESIZE;

From 92d44a42af81e850a038c38278ff4f434b2871df Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 13 Apr 2018 00:22:47 +0100
Subject: [PATCH 364/393] ARM: fix kill( ,SIGFPE) breakage

Commit 7771c6645700 ("signal/arm: Document conflicts with SI_USER and
SIGFPE") broke the siginfo structure for userspace triggered signals,
causing the strace testsuite to regress.  Fix this by eliminating
the FPE_FIXME definition (which is at the root of the breakage) and
use FPE_FLTINV instead for the case where the hardware appears to be
reporting nonsense.

Fixes: 7771c6645700 ("signal/arm: Document conflicts with SI_USER and SIGFPE")
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/include/uapi/asm/siginfo.h | 13 -------------
 arch/arm/vfp/vfpmodule.c            |  2 +-
 2 files changed, 1 insertion(+), 14 deletions(-)
 delete mode 100644 arch/arm/include/uapi/asm/siginfo.h

diff --git a/arch/arm/include/uapi/asm/siginfo.h b/arch/arm/include/uapi/asm/siginfo.h
deleted file mode 100644
index d0513880be21..000000000000
--- a/arch/arm/include/uapi/asm/siginfo.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __ASM_SIGINFO_H
-#define __ASM_SIGINFO_H
-
-#include <asm-generic/siginfo.h>
-
-/*
- * SIGFPE si_codes
- */
-#ifdef __KERNEL__
-#define FPE_FIXME	0	/* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
-#endif
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 4c375e11ae95..af4ee2cef2f9 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -257,7 +257,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_
 
 	if (exceptions == VFP_EXCEPTION_ERROR) {
 		vfp_panic("unhandled bounce", inst);
-		vfp_raise_sigfpe(FPE_FIXME, regs);
+		vfp_raise_sigfpe(FPE_FLTINV, regs);
 		return;
 	}
 

From af86ca4e3088fe5eacf2f7e58c01fa68ca067672 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 15 May 2018 09:27:05 -0700
Subject: [PATCH 365/393] bpf: Prevent memory disambiguation attack

Detect code patterns where malicious 'speculative store bypass' can be used
and sanitize such patterns.

 39: (bf) r3 = r10
 40: (07) r3 += -216
 41: (79) r8 = *(u64 *)(r7 +0)   // slow read
 42: (7a) *(u64 *)(r10 -72) = 0  // verifier inserts this instruction
 43: (7b) *(u64 *)(r8 +0) = r3   // this store becomes slow due to r8
 44: (79) r1 = *(u64 *)(r6 +0)   // cpu speculatively executes this load
 45: (71) r2 = *(u8 *)(r1 +0)    // speculatively arbitrary 'load byte'
                                 // is now sanitized

Above code after x86 JIT becomes:
 e5: mov    %rbp,%rdx
 e8: add    $0xffffffffffffff28,%rdx
 ef: mov    0x0(%r13),%r14
 f3: movq   $0x0,-0x48(%rbp)
 fb: mov    %rdx,0x0(%r14)
 ff: mov    0x0(%rbx),%rdi
103: movzbq 0x0(%rdi),%rsi

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 59 ++++++++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e61c395fddf..65cfc2f59db9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -146,6 +146,7 @@ struct bpf_insn_aux_data {
 		s32 call_imm;			/* saved imm field of call insn */
 	};
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
+	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb902bf..2ce967a63ede 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -978,7 +978,7 @@ static bool register_is_null(struct bpf_reg_state *reg)
  */
 static int check_stack_write(struct bpf_verifier_env *env,
 			     struct bpf_func_state *state, /* func where register points to */
-			     int off, int size, int value_regno)
+			     int off, int size, int value_regno, int insn_idx)
 {
 	struct bpf_func_state *cur; /* state of the current function */
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
@@ -1017,8 +1017,33 @@ static int check_stack_write(struct bpf_verifier_env *env,
 		state->stack[spi].spilled_ptr = cur->regs[value_regno];
 		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
-		for (i = 0; i < BPF_REG_SIZE; i++)
+		for (i = 0; i < BPF_REG_SIZE; i++) {
+			if (state->stack[spi].slot_type[i] == STACK_MISC &&
+			    !env->allow_ptr_leaks) {
+				int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+				int soff = (-spi - 1) * BPF_REG_SIZE;
+
+				/* detected reuse of integer stack slot with a pointer
+				 * which means either llvm is reusing stack slot or
+				 * an attacker is trying to exploit CVE-2018-3639
+				 * (speculative store bypass)
+				 * Have to sanitize that slot with preemptive
+				 * store of zero.
+				 */
+				if (*poff && *poff != soff) {
+					/* disallow programs where single insn stores
+					 * into two different stack slots, since verifier
+					 * cannot sanitize them
+					 */
+					verbose(env,
+						"insn %d cannot access two stack slots fp%d and fp%d",
+						insn_idx, *poff, soff);
+					return -EINVAL;
+				}
+				*poff = soff;
+			}
 			state->stack[spi].slot_type[i] = STACK_SPILL;
+		}
 	} else {
 		u8 type = STACK_MISC;
 
@@ -1694,7 +1719,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 		if (t == BPF_WRITE)
 			err = check_stack_write(env, state, off, size,
-						value_regno);
+						value_regno, insn_idx);
 		else
 			err = check_stack_read(env, state, off, size,
 					       value_regno);
@@ -5169,6 +5194,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		else
 			continue;
 
+		if (type == BPF_WRITE &&
+		    env->insn_aux_data[i + delta].sanitize_stack_off) {
+			struct bpf_insn patch[] = {
+				/* Sanitize suspicious stack slot with zero.
+				 * There are no memory dependencies for this store,
+				 * since it's only using frame pointer and immediate
+				 * constant of zero
+				 */
+				BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+					   env->insn_aux_data[i + delta].sanitize_stack_off,
+					   0),
+				/* the original STX instruction will immediately
+				 * overwrite the same stack slot with appropriate value
+				 */
+				*insn,
+			};
+
+			cnt = ARRAY_SIZE(patch);
+			new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
 			continue;
 

From b80d0b93b991e551a32157e0d9d38fc5bc9348a7 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 18 May 2018 19:22:28 -0700
Subject: [PATCH 366/393] net: ip6_gre: fix tunnel metadata device sharing.

Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'.  Thus, when doing:
  ip link add dev ip6gre11 type ip6gretap external
  ip link add dev ip6erspan12 type ip6erspan external
  RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.

The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'.  As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.

First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan.  Tested it
using the samples/bpf/test_tunnel_bpf.sh.

Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 101 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 79 insertions(+), 22 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 5162ecc45c20..458de353f5d9 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -71,6 +71,7 @@ struct ip6gre_net {
 	struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
 
 	struct ip6_tnl __rcu *collect_md_tun;
+	struct ip6_tnl __rcu *collect_md_tun_erspan;
 	struct net_device *fb_tunnel_dev;
 };
 
@@ -233,7 +234,12 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
 	if (cand)
 		return cand;
 
-	t = rcu_dereference(ign->collect_md_tun);
+	if (gre_proto == htons(ETH_P_ERSPAN) ||
+	    gre_proto == htons(ETH_P_ERSPAN2))
+		t = rcu_dereference(ign->collect_md_tun_erspan);
+	else
+		t = rcu_dereference(ign->collect_md_tun);
+
 	if (t && t->dev->flags & IFF_UP)
 		return t;
 
@@ -262,6 +268,31 @@ static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign,
 	return &ign->tunnels[prio][h];
 }
 
+static void ip6gre_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun, t);
+}
+
+static void ip6erspan_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun_erspan, t);
+}
+
+static void ip6gre_tunnel_unlink_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun, NULL);
+}
+
+static void ip6erspan_tunnel_unlink_md(struct ip6gre_net *ign,
+				       struct ip6_tnl *t)
+{
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun_erspan, NULL);
+}
+
 static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign,
 		const struct ip6_tnl *t)
 {
@@ -272,9 +303,6 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t)
 {
 	struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t);
 
-	if (t->parms.collect_md)
-		rcu_assign_pointer(ign->collect_md_tun, t);
-
 	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
 }
@@ -284,9 +312,6 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t)
 	struct ip6_tnl __rcu **tp;
 	struct ip6_tnl *iter;
 
-	if (t->parms.collect_md)
-		rcu_assign_pointer(ign->collect_md_tun, NULL);
-
 	for (tp = ip6gre_bucket(ign, t);
 	     (iter = rtnl_dereference(*tp)) != NULL;
 	     tp = &iter->next) {
@@ -375,11 +400,23 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
 	return NULL;
 }
 
+static void ip6erspan_tunnel_uninit(struct net_device *dev)
+{
+	struct ip6_tnl *t = netdev_priv(dev);
+	struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
+
+	ip6erspan_tunnel_unlink_md(ign, t);
+	ip6gre_tunnel_unlink(ign, t);
+	dst_cache_reset(&t->dst_cache);
+	dev_put(dev);
+}
+
 static void ip6gre_tunnel_uninit(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
 
+	ip6gre_tunnel_unlink_md(ign, t);
 	ip6gre_tunnel_unlink(ign, t);
 	dst_cache_reset(&t->dst_cache);
 	dev_put(dev);
@@ -1806,7 +1843,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 
 static const struct net_device_ops ip6erspan_netdev_ops = {
 	.ndo_init =		ip6erspan_tap_init,
-	.ndo_uninit =		ip6gre_tunnel_uninit,
+	.ndo_uninit =		ip6erspan_tunnel_uninit,
 	.ndo_start_xmit =	ip6erspan_tunnel_xmit,
 	.ndo_set_mac_address =	eth_mac_addr,
 	.ndo_validate_addr =	eth_validate_addr,
@@ -1875,8 +1912,6 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
 				 struct netlink_ext_ack *extack)
 {
 	struct ip6_tnl *nt;
-	struct net *net = dev_net(dev);
-	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
 	struct ip_tunnel_encap ipencap;
 	int err;
 
@@ -1889,16 +1924,6 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
 			return err;
 	}
 
-	ip6gre_netlink_parms(data, &nt->parms);
-
-	if (nt->parms.collect_md) {
-		if (rtnl_dereference(ign->collect_md_tun))
-			return -EEXIST;
-	} else {
-		if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
-			return -EEXIST;
-	}
-
 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
 		eth_hw_addr_random(dev);
 
@@ -1922,12 +1947,26 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack)
 {
-	int err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
 	struct ip6_tnl *nt = netdev_priv(dev);
 	struct net *net = dev_net(dev);
+	struct ip6gre_net *ign;
+	int err;
 
+	ip6gre_netlink_parms(data, &nt->parms);
+	ign = net_generic(net, ip6gre_net_id);
+
+	if (nt->parms.collect_md) {
+		if (rtnl_dereference(ign->collect_md_tun))
+			return -EEXIST;
+	} else {
+		if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+			return -EEXIST;
+	}
+
+	err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
 	if (!err) {
 		ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
+		ip6gre_tunnel_link_md(ign, nt);
 		ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
 	}
 	return err;
@@ -1979,8 +2018,10 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 
+	ip6gre_tunnel_unlink_md(ign, t);
 	ip6gre_tunnel_unlink(ign, t);
 	ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
+	ip6gre_tunnel_link_md(ign, t);
 	ip6gre_tunnel_link(ign, t);
 	return 0;
 }
@@ -2134,12 +2175,26 @@ static int ip6erspan_newlink(struct net *src_net, struct net_device *dev,
 			     struct nlattr *tb[], struct nlattr *data[],
 			     struct netlink_ext_ack *extack)
 {
-	int err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
 	struct ip6_tnl *nt = netdev_priv(dev);
 	struct net *net = dev_net(dev);
+	struct ip6gre_net *ign;
+	int err;
 
+	ip6gre_netlink_parms(data, &nt->parms);
+	ign = net_generic(net, ip6gre_net_id);
+
+	if (nt->parms.collect_md) {
+		if (rtnl_dereference(ign->collect_md_tun_erspan))
+			return -EEXIST;
+	} else {
+		if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+			return -EEXIST;
+	}
+
+	err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
 	if (!err) {
 		ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]);
+		ip6erspan_tunnel_link_md(ign, nt);
 		ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
 	}
 	return err;
@@ -2171,8 +2226,10 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 
+	ip6gre_tunnel_unlink_md(ign, t);
 	ip6gre_tunnel_unlink(ign, t);
 	ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]);
+	ip6erspan_tunnel_link_md(ign, t);
 	ip6gre_tunnel_link(ign, t);
 	return 0;
 }

From 771c577c23bac90597c685971d7297ea00f99d11 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 20 May 2018 15:31:38 -0700
Subject: [PATCH 367/393] Linux 4.17-rc6

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index ba3106b36597..ec6f45928fd4 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 17
 SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION = -rc6
 NAME = Merciless Moray
 
 # *DOCUMENTATION*

From 136d769e0b3475d71350aa3648a116a6ee7a8f6c Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Date: Sat, 19 May 2018 22:29:36 +0100
Subject: [PATCH 368/393] libata: blacklist Micron 500IT SSD with MU01 firmware

While whitelisting Micron M500DC drives, the tweaked blacklist entry
enabled queued TRIM from M500IT variants also. But these do not support
queued TRIM. And while using those SSDs with the latest kernel we have
seen errors and even the partition table getting corrupted.

Some part from the dmesg:
[    6.727384] ata1.00: ATA-9: Micron_M500IT_MTFDDAK060MBD, MU01, max UDMA/133
[    6.727390] ata1.00: 117231408 sectors, multi 16: LBA48 NCQ (depth 31/32), AA
[    6.741026] ata1.00: supports DRM functions and may not be fully accessible
[    6.759887] ata1.00: configured for UDMA/133
[    6.762256] scsi 0:0:0:0: Direct-Access     ATA      Micron_M500IT_MT MU01 PQ: 0 ANSI: 5

and then for the error:
[  120.860334] ata1.00: exception Emask 0x1 SAct 0x7ffc0007 SErr 0x0 action 0x6 frozen
[  120.860338] ata1.00: irq_stat 0x40000008
[  120.860342] ata1.00: failed command: SEND FPDMA QUEUED
[  120.860351] ata1.00: cmd 64/01:00:00:00:00/00:00:00:00:00/a0 tag 0 ncq dma 512 out
         res 40/00:00:00:00:00/00:00:00:00:00/00 Emask 0x5 (timeout)
[  120.860353] ata1.00: status: { DRDY }
[  120.860543] ata1: hard resetting link
[  121.166128] ata1: SATA link up 3.0 Gbps (SStatus 123 SControl 300)
[  121.166376] ata1.00: supports DRM functions and may not be fully accessible
[  121.186238] ata1.00: supports DRM functions and may not be fully accessible
[  121.204445] ata1.00: configured for UDMA/133
[  121.204454] ata1.00: device reported invalid CHS sector 0
[  121.204541] sd 0:0:0:0: [sda] tag#18 UNKNOWN(0x2003) Result: hostbyte=0x00 driverbyte=0x08
[  121.204546] sd 0:0:0:0: [sda] tag#18 Sense Key : 0x5 [current]
[  121.204550] sd 0:0:0:0: [sda] tag#18 ASC=0x21 ASCQ=0x4
[  121.204555] sd 0:0:0:0: [sda] tag#18 CDB: opcode=0x93 93 08 00 00 00 00 00 04 28 80 00 00 00 30 00 00
[  121.204559] print_req_error: I/O error, dev sda, sector 272512

After few reboots with these errors, and the SSD is corrupted.
After blacklisting it, the errors are not seen and the SSD does not get
corrupted any more.

Fixes: 243918be6393 ("libata: Do not blacklist Micron M500DC")
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: stable@vger.kernel.org
Signed-off-by: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 7ed2f009911a..346b163f6e89 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4561,6 +4561,8 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	{ "SanDisk SD7UB3Q*G1001",	NULL,	ATA_HORKAGE_NOLPM, },
 
 	/* devices that don't properly handle queued TRIM commands */
+	{ "Micron_M500IT_*",		"MU01",	ATA_HORKAGE_NO_NCQ_TRIM |
+						ATA_HORKAGE_ZERO_AFTER_TRIM, },
 	{ "Micron_M500_*",		NULL,	ATA_HORKAGE_NO_NCQ_TRIM |
 						ATA_HORKAGE_ZERO_AFTER_TRIM, },
 	{ "Crucial_CT*M500*",		NULL,	ATA_HORKAGE_NO_NCQ_TRIM |

From f7068114d45ec55996b9040e98111afa56e010fe Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 21 May 2018 12:21:14 -0600
Subject: [PATCH 369/393] sr: pass down correctly sized SCSI sense buffer

We're casting the CDROM layer request_sense to the SCSI sense
buffer, but the former is 64 bytes and the latter is 96 bytes.
As we generally allocate these on the stack, we end up blowing
up the stack.

Fix this by wrapping the scsi_execute() call with a properly
sized sense buffer, and copying back the bits for the CDROM
layer.

Cc: stable@vger.kernel.org
Reported-by: Piotr Gabriel Kosinski <pg.kosinski@gmail.com>
Reported-by: Daniel Shapira <daniel@twistlock.com>
Tested-by: Kees Cook <keescook@chromium.org>
Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/sr_ioctl.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c
index 2a21f2d48592..35fab1e18adc 100644
--- a/drivers/scsi/sr_ioctl.c
+++ b/drivers/scsi/sr_ioctl.c
@@ -188,9 +188,13 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 	struct scsi_device *SDev;
 	struct scsi_sense_hdr sshdr;
 	int result, err = 0, retries = 0;
+	unsigned char sense_buffer[SCSI_SENSE_BUFFERSIZE], *senseptr = NULL;
 
 	SDev = cd->device;
 
+	if (cgc->sense)
+		senseptr = sense_buffer;
+
       retry:
 	if (!scsi_block_when_processing_errors(SDev)) {
 		err = -ENODEV;
@@ -198,10 +202,12 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 	}
 
 	result = scsi_execute(SDev, cgc->cmd, cgc->data_direction,
-			      cgc->buffer, cgc->buflen,
-			      (unsigned char *)cgc->sense, &sshdr,
+			      cgc->buffer, cgc->buflen, senseptr, &sshdr,
 			      cgc->timeout, IOCTL_RETRIES, 0, 0, NULL);
 
+	if (cgc->sense)
+		memcpy(cgc->sense, sense_buffer, sizeof(*cgc->sense));
+
 	/* Minimal error checking.  Ignore cases we know about, and report the rest. */
 	if (driver_byte(result) != 0) {
 		switch (sshdr.sense_key) {

From 30da870ce4a4e007c901858a96e9e394a1daa74a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 6 May 2018 12:15:20 -0400
Subject: [PATCH 370/393] affs_lookup(): close a race with affs_remove_link()

we unlock the directory hash too early - if we are looking at secondary
link and primary (in another directory) gets removed just as we unlock,
we could have the old primary moved in place of the secondary, leaving
us to look into freed entry (and leaving our dentry with ->d_fsdata
pointing to a freed entry).

Cc: stable@vger.kernel.org # 2.4.4+
Acked-by: David Sterba <dsterba@suse.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/affs/namei.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d8aa0ae3d037..1ed0fa4c4d48 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -206,9 +206,10 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 
 	affs_lock_dir(dir);
 	bh = affs_find_entry(dir, dentry);
-	affs_unlock_dir(dir);
-	if (IS_ERR(bh))
+	if (IS_ERR(bh)) {
+		affs_unlock_dir(dir);
 		return ERR_CAST(bh);
+	}
 	if (bh) {
 		u32 ino = bh->b_blocknr;
 
@@ -222,10 +223,13 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 		}
 		affs_brelse(bh);
 		inode = affs_iget(sb, ino);
-		if (IS_ERR(inode))
+		if (IS_ERR(inode)) {
+			affs_unlock_dir(dir);
 			return ERR_CAST(inode);
+		}
 	}
 	d_add(dentry, inode);
+	affs_unlock_dir(dir);
 	return NULL;
 }
 

From 87fbd639c02ec96d67738e40b6521fb070ed7168 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 6 May 2018 12:20:40 -0400
Subject: [PATCH 371/393] affs_lookup: switch to d_splice_alias()

Making something exportable takes more than providing ->s_export_ops.
In particular, ->lookup() *MUST* use d_splice_alias() instead of
d_add().

Reading Documentation/filesystems/nfs/Exporting would've been a good idea;
as it is, exporting AFFS is badly (and exploitably) broken.

Partially-Fixes: ed4433d72394 "fs/affs: make affs exportable"
Acked-by: David Sterba <dsterba@suse.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/affs/namei.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 1ed0fa4c4d48..41c5749f4db7 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -201,6 +201,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 	struct super_block *sb = dir->i_sb;
 	struct buffer_head *bh;
 	struct inode *inode = NULL;
+	struct dentry *res;
 
 	pr_debug("%s(\"%pd\")\n", __func__, dentry);
 
@@ -223,14 +224,12 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 		}
 		affs_brelse(bh);
 		inode = affs_iget(sb, ino);
-		if (IS_ERR(inode)) {
-			affs_unlock_dir(dir);
-			return ERR_CAST(inode);
-		}
 	}
-	d_add(dentry, inode);
+	res = d_splice_alias(inode, dentry);
+	if (!IS_ERR_OR_NULL(res))
+		res->d_fsdata = dentry->d_fsdata;
 	affs_unlock_dir(dir);
-	return NULL;
+	return res;
 }
 
 int

From f4e4d434fe3f5eceea470bf821683677dabe39c4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 30 Apr 2018 19:02:02 -0400
Subject: [PATCH 372/393] befs_lookup(): use d_splice_alias()

RTFS(Documentation/filesystems/nfs/Exporting) if you try to make
something exportable.

Fixes: ac632f5b6301 "befs: add NFS export support"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/befs/linuxvfs.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index af2832aaeec5..4700b4534439 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -198,23 +198,16 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 
 	if (ret == BEFS_BT_NOT_FOUND) {
 		befs_debug(sb, "<--- %s %pd not found", __func__, dentry);
-		d_add(dentry, NULL);
-		return ERR_PTR(-ENOENT);
-
+		inode = NULL;
 	} else if (ret != BEFS_OK || offset == 0) {
 		befs_error(sb, "<--- %s Error", __func__);
-		return ERR_PTR(-ENODATA);
+		inode = ERR_PTR(-ENODATA);
+	} else {
+		inode = befs_iget(dir->i_sb, (ino_t) offset);
 	}
-
-	inode = befs_iget(dir->i_sb, (ino_t) offset);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	d_add(dentry, inode);
-
 	befs_debug(sb, "<--- %s", __func__);
 
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 static int

From 08a8f3086880325433d66b2dc9cdfb3f095adddf Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 13 May 2018 15:05:47 -0700
Subject: [PATCH 373/393] cramfs: Fix IS_ENABLED typo

There's an extra C here...

Fixes: 99c18ce580c6 ("cramfs: direct memory access support")
Acked-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cramfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 017b0ab19bc4..124b093d14e5 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -492,7 +492,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 {
 	struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
 
-	if (IS_ENABLED(CCONFIG_CRAMFS_MTD) && sb->s_mtd) {
+	if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) {
 		if (sbi && sbi->mtd_point_size)
 			mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size);
 		kill_mtd_super(sb);

From 82382acec0c97b91830fff7130d0acce4ac4f3f3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 3 Apr 2018 00:22:29 -0400
Subject: [PATCH 374/393] kernfs: deal with kernfs_fill_super() failures

make sure that info->node is initialized early, so that kernfs_kill_sb()
can list_del() it safely.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/kernfs/mount.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 26dd9a50f383..ff2716f9322e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -316,6 +316,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
 
 	info->root = root;
 	info->ns = ns;
+	INIT_LIST_HEAD(&info->node);
 
 	sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
 			 &init_user_ns, info);

From 7b745a4e4051e1bbce40e0b1c2cf636c70583aa4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 May 2018 00:03:34 -0400
Subject: [PATCH 375/393] unfuck sysfs_mount()

new_sb is left uninitialized in case of early failures in kernfs_mount_ns(),
and while IS_ERR(root) is true in all such cases, using IS_ERR(root) || !new_sb
is not a solution - IS_ERR(root) is true in some cases when new_sb is true.

Make sure new_sb is initialized (and matches the reality) in all cases and
fix the condition for dropping kobj reference - we want it done precisely
in those situations where the reference has not been transferred into a new
super_block instance.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysfs/mount.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index b428d317ae92..92682fcc41f6 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -25,7 +25,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 {
 	struct dentry *root;
 	void *ns;
-	bool new_sb;
+	bool new_sb = false;
 
 	if (!(flags & SB_KERNMOUNT)) {
 		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
@@ -35,9 +35,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
 	root = kernfs_mount_ns(fs_type, flags, sysfs_root,
 				SYSFS_MAGIC, &new_sb, ns);
-	if (IS_ERR(root) || !new_sb)
+	if (!new_sb)
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
-	else if (new_sb)
+	else if (!IS_ERR(root))
 		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
 
 	return root;

From 9c3e9025a3f7ed25c99a0add8af65431c8043800 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 May 2018 22:59:45 -0400
Subject: [PATCH 376/393] cachefiles: vfs_mkdir() might succeed leaving dentry
 negative unhashed

That can (and does, on some filesystems) happen - ->mkdir() (and thus
vfs_mkdir()) can legitimately leave its argument negative and just
unhash it, counting upon the lookup to pick the object we'd created
next time we try to look at that name.

Some vfs_mkdir() callers forget about that possibility...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cachefiles/namei.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 0daa1e3fe0df..ab0bbe93b398 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -572,6 +572,11 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
 			if (ret < 0)
 				goto create_error;
 
+			if (unlikely(d_unhashed(next))) {
+				dput(next);
+				inode_unlock(d_inode(dir));
+				goto lookup_again;
+			}
 			ASSERT(d_backing_inode(next));
 
 			_debug("mkdir -> %p{%p{ino=%lu}}",
@@ -764,6 +769,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 	/* search the current directory for the element name */
 	inode_lock(d_inode(dir));
 
+retry:
 	start = jiffies;
 	subdir = lookup_one_len(dirname, dir, strlen(dirname));
 	cachefiles_hist(cachefiles_lookup_histogram, start);
@@ -793,6 +799,10 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 		if (ret < 0)
 			goto mkdir_error;
 
+		if (unlikely(d_unhashed(subdir))) {
+			dput(subdir);
+			goto retry;
+		}
 		ASSERT(d_backing_inode(subdir));
 
 		_debug("mkdir -> %p{%p{ino=%lu}}",

From 3819bb0d79f50b05910db5bdc6d9ef512184e3b1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 11 May 2018 17:03:19 -0400
Subject: [PATCH 377/393] nfsd: vfs_mkdir() might succeed leaving dentry
 negative unhashed

That can (and does, on some filesystems) happen - ->mkdir() (and thus
vfs_mkdir()) can legitimately leave its argument negative and just
unhash it, counting upon the lookup to pick the object we'd created
next time we try to look at that name.

Some vfs_mkdir() callers forget about that possibility...

Acked-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/vfs.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2410b093a2e6..b0555d7d8200 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1201,6 +1201,28 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		break;
 	case S_IFDIR:
 		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+		if (!host_err && unlikely(d_unhashed(dchild))) {
+			struct dentry *d;
+			d = lookup_one_len(dchild->d_name.name,
+					   dchild->d_parent,
+					   dchild->d_name.len);
+			if (IS_ERR(d)) {
+				host_err = PTR_ERR(d);
+				break;
+			}
+			if (unlikely(d_is_negative(d))) {
+				dput(d);
+				err = nfserr_serverfault;
+				goto out;
+			}
+			dput(resfhp->fh_dentry);
+			resfhp->fh_dentry = dget(d);
+			err = fh_update(resfhp);
+			dput(dchild);
+			dchild = d;
+			if (err)
+				goto out;
+		}
 		break;
 	case S_IFCHR:
 	case S_IFBLK:

From 5aa1437d2d9a068c0334bd7c9dafa8ec4f97f13b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 May 2018 17:18:30 -0400
Subject: [PATCH 378/393] ext2: fix a block leak

open file, unlink it, then use ioctl(2) to make it immutable or
append only.  Now close it and watch the blocks *not* freed...

Immutable/append-only checks belong in ->setattr().
Note: the bug is old and backport to anything prior to 737f2e93b972
("ext2: convert to use the new truncate convention") will need
these checks lifted into ext2_setattr().

Cc: stable@kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/inode.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 1e01fabef130..71635909df3b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1264,21 +1264,11 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
 
 static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
 {
-	/*
-	 * XXX: it seems like a bug here that we don't allow
-	 * IS_APPEND inode to have blocks-past-i_size trimmed off.
-	 * review and fix this.
-	 *
-	 * Also would be nice to be able to handle IO errors and such,
-	 * but that's probably too much to ask.
-	 */
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 	    S_ISLNK(inode->i_mode)))
 		return;
 	if (ext2_inode_is_fast_symlink(inode))
 		return;
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-		return;
 
 	dax_sem_down_write(EXT2_I(inode));
 	__ext2_truncate_blocks(inode, offset);

From baf10564fbb66ea222cae66fbff11c444590ffd9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 20 May 2018 16:46:23 -0400
Subject: [PATCH 379/393] aio: fix io_destroy(2) vs. lookup_ioctx() race

kill_ioctx() used to have an explicit RCU delay between removing the
reference from ->ioctx_table and percpu_ref_kill() dropping the refcount.
At some point that delay had been removed, on the theory that
percpu_ref_kill() itself contained an RCU delay.  Unfortunately, that was
the wrong kind of RCU delay and it didn't care about rcu_read_lock() used
by lookup_ioctx().  As the result, we could get ctx freed right under
lookup_ioctx().  Tejun has fixed that in a6d7cff472e ("fs/aio: Add explicit
RCU grace period when freeing kioctx"); however, that fix is not enough.

Suppose io_destroy() from one thread races with e.g. io_setup() from another;
CPU1 removes the reference from current->mm->ioctx_table[...] just as CPU2
has picked it (under rcu_read_lock()).  Then CPU1 proceeds to drop the
refcount, getting it to 0 and triggering a call of free_ioctx_users(),
which proceeds to drop the secondary refcount and once that reaches zero
calls free_ioctx_reqs().  That does
        INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
        queue_rcu_work(system_wq, &ctx->free_rwork);
and schedules freeing the whole thing after RCU delay.

In the meanwhile CPU2 has gotten around to percpu_ref_get(), bumping the
refcount from 0 to 1 and returned the reference to io_setup().

Tejun's fix (that queue_rcu_work() in there) guarantees that ctx won't get
freed until after percpu_ref_get().  Sure, we'd increment the counter before
ctx can be freed.  Now we are out of rcu_read_lock() and there's nothing to
stop freeing of the whole thing.  Unfortunately, CPU2 assumes that since it
has grabbed the reference, ctx is *NOT* going away until it gets around to
dropping that reference.

The fix is obvious - use percpu_ref_tryget_live() and treat failure as miss.
It's not costlier than what we currently do in normal case, it's safe to
call since freeing *is* delayed and it closes the race window - either
lookup_ioctx() comes before percpu_ref_kill() (in which case ctx->users
won't reach 0 until the caller of lookup_ioctx() drops it) or lookup_ioctx()
fails, ctx->users is unaffected and caller of lookup_ioctx() doesn't see
the object in question at all.

Cc: stable@kernel.org
Fixes: a6d7cff472e "fs/aio: Add explicit RCU grace period when freeing kioctx"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 88d7927ffbc6..8061d9787e54 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1078,8 +1078,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 
 	ctx = rcu_dereference(table->table[id]);
 	if (ctx && ctx->user_id == ctx_id) {
-		percpu_ref_get(&ctx->users);
-		ret = ctx;
+		if (percpu_ref_tryget_live(&ctx->users))
+			ret = ctx;
 	}
 out:
 	rcu_read_unlock();

From eedffa28c9b00ca2dcb4d541b5a530f4c917052d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 21 May 2018 14:35:03 -0400
Subject: [PATCH 380/393] loop: clear wb_err in bd_inode when detaching backing
 file

When a loop block device encounters a writeback error, that error will
get propagated to the bd_inode's wb_err field. If we then detach the
backing file from it, attach another and fsync it, we'll get back the
writeback error that we had from the previous backing file.

This is a bit of a grey area as POSIX doesn't cover loop devices, but it
is somewhat counterintuitive.

If we detach a backing file from the loopdev while there are still
unreported errors, take it as a sign that we're no longer interested in
the previous file, and clear out the wb_err in the loop blockdev.

Reported-and-Tested-by: Theodore Y. Ts'o <tytso@mit.edu>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5d4e31655d96..55cf554bc914 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1068,6 +1068,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	if (bdev) {
 		bdput(bdev);
 		invalidate_bdev(bdev);
+		bdev->bd_inode->i_mapping->wb_err = 0;
 	}
 	set_capacity(lo->lo_disk, 0);
 	loop_sysfs_exit(lo);

From a048a07d7f4535baa4cbad6bc024f175317ab938 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 22 May 2018 09:00:00 +1000
Subject: [PATCH 381/393] powerpc/64s: Add support for a store forwarding
 barrier at kernel entry/exit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On some CPUs we can prevent a vulnerability related to store-to-load
forwarding by preventing store forwarding between privilege domains,
by inserting a barrier in kernel entry and exit paths.

This is known to be the case on at least Power7, Power8 and Power9
powerpc CPUs.

Barriers must be inserted generally before the first load after moving
to a higher privilege, and after the last store before moving to a
lower privilege, HV and PR privilege transitions must be protected.

Barriers are added as patch sections, with all kernel/hypervisor entry
points patched, and the exit points to lower privilge levels patched
similarly to the RFI flush patching.

Firmware advertisement is not implemented yet, so CPU flush types
are hard coded.

Thanks to Michal Suchánek for bug fixes and review.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com>
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michal Suchánek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/include/asm/exception-64s.h     |  29 ++++
 arch/powerpc/include/asm/feature-fixups.h    |  19 +++
 arch/powerpc/include/asm/security_features.h |  11 ++
 arch/powerpc/kernel/exceptions-64s.S         |  19 ++-
 arch/powerpc/kernel/security.c               | 149 +++++++++++++++++++
 arch/powerpc/kernel/vmlinux.lds.S            |  14 ++
 arch/powerpc/lib/feature-fixups.c            | 115 ++++++++++++++
 arch/powerpc/platforms/powernv/setup.c       |   1 +
 arch/powerpc/platforms/pseries/setup.c       |   1 +
 9 files changed, 356 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 471b2274fbeb..c40b4380951c 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -74,6 +74,27 @@
  */
 #define EX_R3		EX_DAR
 
+#define STF_ENTRY_BARRIER_SLOT						\
+	STF_ENTRY_BARRIER_FIXUP_SECTION;				\
+	nop;								\
+	nop;								\
+	nop
+
+#define STF_EXIT_BARRIER_SLOT						\
+	STF_EXIT_BARRIER_FIXUP_SECTION;					\
+	nop;								\
+	nop;								\
+	nop;								\
+	nop;								\
+	nop;								\
+	nop
+
+/*
+ * r10 must be free to use, r13 must be paca
+ */
+#define INTERRUPT_TO_KERNEL						\
+	STF_ENTRY_BARRIER_SLOT
+
 /*
  * Macros for annotating the expected destination of (h)rfid
  *
@@ -90,16 +111,19 @@
 	rfid
 
 #define RFI_TO_USER							\
+	STF_EXIT_BARRIER_SLOT;						\
 	RFI_FLUSH_SLOT;							\
 	rfid;								\
 	b	rfi_flush_fallback
 
 #define RFI_TO_USER_OR_KERNEL						\
+	STF_EXIT_BARRIER_SLOT;						\
 	RFI_FLUSH_SLOT;							\
 	rfid;								\
 	b	rfi_flush_fallback
 
 #define RFI_TO_GUEST							\
+	STF_EXIT_BARRIER_SLOT;						\
 	RFI_FLUSH_SLOT;							\
 	rfid;								\
 	b	rfi_flush_fallback
@@ -108,21 +132,25 @@
 	hrfid
 
 #define HRFI_TO_USER							\
+	STF_EXIT_BARRIER_SLOT;						\
 	RFI_FLUSH_SLOT;							\
 	hrfid;								\
 	b	hrfi_flush_fallback
 
 #define HRFI_TO_USER_OR_KERNEL						\
+	STF_EXIT_BARRIER_SLOT;						\
 	RFI_FLUSH_SLOT;							\
 	hrfid;								\
 	b	hrfi_flush_fallback
 
 #define HRFI_TO_GUEST							\
+	STF_EXIT_BARRIER_SLOT;						\
 	RFI_FLUSH_SLOT;							\
 	hrfid;								\
 	b	hrfi_flush_fallback
 
 #define HRFI_TO_UNKNOWN							\
+	STF_EXIT_BARRIER_SLOT;						\
 	RFI_FLUSH_SLOT;							\
 	hrfid;								\
 	b	hrfi_flush_fallback
@@ -254,6 +282,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define __EXCEPTION_PROLOG_1_PRE(area)					\
 	OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR);		\
 	OPT_SAVE_REG_TO_PACA(area+EX_CFAR, r10, CPU_FTR_CFAR);		\
+	INTERRUPT_TO_KERNEL;						\
 	SAVE_CTR(r10, area);						\
 	mfcr	r9;
 
diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index 1e82eb3caabd..a9b64df34e2a 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -187,6 +187,22 @@ label##3:					       	\
 	FTR_ENTRY_OFFSET label##1b-label##3b;		\
 	.popsection;
 
+#define STF_ENTRY_BARRIER_FIXUP_SECTION			\
+953:							\
+	.pushsection __stf_entry_barrier_fixup,"a";	\
+	.align 2;					\
+954:							\
+	FTR_ENTRY_OFFSET 953b-954b;			\
+	.popsection;
+
+#define STF_EXIT_BARRIER_FIXUP_SECTION			\
+955:							\
+	.pushsection __stf_exit_barrier_fixup,"a";	\
+	.align 2;					\
+956:							\
+	FTR_ENTRY_OFFSET 955b-956b;			\
+	.popsection;
+
 #define RFI_FLUSH_FIXUP_SECTION				\
 951:							\
 	.pushsection __rfi_flush_fixup,"a";		\
@@ -199,6 +215,9 @@ label##3:					       	\
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
+extern long stf_barrier_fallback;
+extern long __start___stf_entry_barrier_fixup, __stop___stf_entry_barrier_fixup;
+extern long __start___stf_exit_barrier_fixup, __stop___stf_exit_barrier_fixup;
 extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup;
 
 void apply_feature_fixups(void);
diff --git a/arch/powerpc/include/asm/security_features.h b/arch/powerpc/include/asm/security_features.h
index fa4d2e1cf772..44989b22383c 100644
--- a/arch/powerpc/include/asm/security_features.h
+++ b/arch/powerpc/include/asm/security_features.h
@@ -12,6 +12,17 @@
 extern unsigned long powerpc_security_features;
 extern bool rfi_flush;
 
+/* These are bit flags */
+enum stf_barrier_type {
+	STF_BARRIER_NONE	= 0x1,
+	STF_BARRIER_FALLBACK	= 0x2,
+	STF_BARRIER_EIEIO	= 0x4,
+	STF_BARRIER_SYNC_ORI	= 0x8,
+};
+
+void setup_stf_barrier(void);
+void do_stf_barrier_fixups(enum stf_barrier_type types);
+
 static inline void security_ftr_set(unsigned long feature)
 {
 	powerpc_security_features |= feature;
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index ae6a849db60b..f283958129f2 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -885,7 +885,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
 
-EXC_REAL_MASKABLE(decrementer, 0x900, 0x80, IRQS_DISABLED)
+EXC_REAL_OOL_MASKABLE(decrementer, 0x900, 0x80, IRQS_DISABLED)
 EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x80, 0x900, IRQS_DISABLED)
 TRAMP_KVM(PACA_EXGEN, 0x900)
 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt)
@@ -961,6 +961,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 	mtctr	r13;							\
 	GET_PACA(r13);							\
 	std	r10,PACA_EXGEN+EX_R10(r13);				\
+	INTERRUPT_TO_KERNEL;						\
 	KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \
 	HMT_MEDIUM;							\
 	mfctr	r9;
@@ -969,7 +970,8 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 #define SYSCALL_KVMTEST							\
 	HMT_MEDIUM;							\
 	mr	r9,r13;							\
-	GET_PACA(r13);
+	GET_PACA(r13);							\
+	INTERRUPT_TO_KERNEL;
 #endif
 	
 #define LOAD_SYSCALL_HANDLER(reg)					\
@@ -1507,6 +1509,19 @@ masked_##_H##interrupt:					\
 	b	.;					\
 	MASKED_DEC_HANDLER(_H)
 
+TRAMP_REAL_BEGIN(stf_barrier_fallback)
+	std	r9,PACA_EXRFI+EX_R9(r13)
+	std	r10,PACA_EXRFI+EX_R10(r13)
+	sync
+	ld	r9,PACA_EXRFI+EX_R9(r13)
+	ld	r10,PACA_EXRFI+EX_R10(r13)
+	ori	31,31,0
+	.rept 14
+	b	1f
+1:
+	.endr
+	blr
+
 TRAMP_REAL_BEGIN(rfi_flush_fallback)
 	SET_SCRATCH0(r13);
 	GET_PACA(r13);
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index bab5a27ea805..b98a722da915 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -8,6 +8,7 @@
 #include <linux/device.h>
 #include <linux/seq_buf.h>
 
+#include <asm/debugfs.h>
 #include <asm/security_features.h>
 
 
@@ -86,3 +87,151 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c
 
 	return s.len;
 }
+
+/*
+ * Store-forwarding barrier support.
+ */
+
+static enum stf_barrier_type stf_enabled_flush_types;
+static bool no_stf_barrier;
+bool stf_barrier;
+
+static int __init handle_no_stf_barrier(char *p)
+{
+	pr_info("stf-barrier: disabled on command line.");
+	no_stf_barrier = true;
+	return 0;
+}
+
+early_param("no_stf_barrier", handle_no_stf_barrier);
+
+/* This is the generic flag used by other architectures */
+static int __init handle_ssbd(char *p)
+{
+	if (!p || strncmp(p, "auto", 5) == 0 || strncmp(p, "on", 2) == 0 ) {
+		/* Until firmware tells us, we have the barrier with auto */
+		return 0;
+	} else if (strncmp(p, "off", 3) == 0) {
+		handle_no_stf_barrier(NULL);
+		return 0;
+	} else
+		return 1;
+
+	return 0;
+}
+early_param("spec_store_bypass_disable", handle_ssbd);
+
+/* This is the generic flag used by other architectures */
+static int __init handle_no_ssbd(char *p)
+{
+	handle_no_stf_barrier(NULL);
+	return 0;
+}
+early_param("nospec_store_bypass_disable", handle_no_ssbd);
+
+static void stf_barrier_enable(bool enable)
+{
+	if (enable)
+		do_stf_barrier_fixups(stf_enabled_flush_types);
+	else
+		do_stf_barrier_fixups(STF_BARRIER_NONE);
+
+	stf_barrier = enable;
+}
+
+void setup_stf_barrier(void)
+{
+	enum stf_barrier_type type;
+	bool enable, hv;
+
+	hv = cpu_has_feature(CPU_FTR_HVMODE);
+
+	/* Default to fallback in case fw-features are not available */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		type = STF_BARRIER_EIEIO;
+	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		type = STF_BARRIER_SYNC_ORI;
+	else if (cpu_has_feature(CPU_FTR_ARCH_206))
+		type = STF_BARRIER_FALLBACK;
+	else
+		type = STF_BARRIER_NONE;
+
+	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+		(security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR) ||
+		 (security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) && hv));
+
+	if (type == STF_BARRIER_FALLBACK) {
+		pr_info("stf-barrier: fallback barrier available\n");
+	} else if (type == STF_BARRIER_SYNC_ORI) {
+		pr_info("stf-barrier: hwsync barrier available\n");
+	} else if (type == STF_BARRIER_EIEIO) {
+		pr_info("stf-barrier: eieio barrier available\n");
+	}
+
+	stf_enabled_flush_types = type;
+
+	if (!no_stf_barrier)
+		stf_barrier_enable(enable);
+}
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	if (stf_barrier && stf_enabled_flush_types != STF_BARRIER_NONE) {
+		const char *type;
+		switch (stf_enabled_flush_types) {
+		case STF_BARRIER_EIEIO:
+			type = "eieio";
+			break;
+		case STF_BARRIER_SYNC_ORI:
+			type = "hwsync";
+			break;
+		case STF_BARRIER_FALLBACK:
+			type = "fallback";
+			break;
+		default:
+			type = "unknown";
+		}
+		return sprintf(buf, "Mitigation: Kernel entry/exit barrier (%s)\n", type);
+	}
+
+	if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) &&
+	    !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR))
+		return sprintf(buf, "Not affected\n");
+
+	return sprintf(buf, "Vulnerable\n");
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int stf_barrier_set(void *data, u64 val)
+{
+	bool enable;
+
+	if (val == 1)
+		enable = true;
+	else if (val == 0)
+		enable = false;
+	else
+		return -EINVAL;
+
+	/* Only do anything if we're changing state */
+	if (enable != stf_barrier)
+		stf_barrier_enable(enable);
+
+	return 0;
+}
+
+static int stf_barrier_get(void *data, u64 *val)
+{
+	*val = stf_barrier ? 1 : 0;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, "%llu\n");
+
+static __init int stf_barrier_debugfs_init(void)
+{
+	debugfs_create_file("stf_barrier", 0600, powerpc_debugfs_root, NULL, &fops_stf_barrier);
+	return 0;
+}
+device_initcall(stf_barrier_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index c8af90ff49f0..b8d82678f8b4 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -133,6 +133,20 @@ SECTIONS
 	RO_DATA(PAGE_SIZE)
 
 #ifdef CONFIG_PPC64
+	. = ALIGN(8);
+	__stf_entry_barrier_fixup : AT(ADDR(__stf_entry_barrier_fixup) - LOAD_OFFSET) {
+		__start___stf_entry_barrier_fixup = .;
+		*(__stf_entry_barrier_fixup)
+		__stop___stf_entry_barrier_fixup = .;
+	}
+
+	. = ALIGN(8);
+	__stf_exit_barrier_fixup : AT(ADDR(__stf_exit_barrier_fixup) - LOAD_OFFSET) {
+		__start___stf_exit_barrier_fixup = .;
+		*(__stf_exit_barrier_fixup)
+		__stop___stf_exit_barrier_fixup = .;
+	}
+
 	. = ALIGN(8);
 	__rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
 		__start___rfi_flush_fixup = .;
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 288fe4f0db4e..e1bcdc32a851 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -23,6 +23,7 @@
 #include <asm/page.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
+#include <asm/security_features.h>
 #include <asm/firmware.h>
 
 struct fixup_entry {
@@ -117,6 +118,120 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 }
 
 #ifdef CONFIG_PPC_BOOK3S_64
+void do_stf_entry_barrier_fixups(enum stf_barrier_type types)
+{
+	unsigned int instrs[3], *dest;
+	long *start, *end;
+	int i;
+
+	start = PTRRELOC(&__start___stf_entry_barrier_fixup),
+	end = PTRRELOC(&__stop___stf_entry_barrier_fixup);
+
+	instrs[0] = 0x60000000; /* nop */
+	instrs[1] = 0x60000000; /* nop */
+	instrs[2] = 0x60000000; /* nop */
+
+	i = 0;
+	if (types & STF_BARRIER_FALLBACK) {
+		instrs[i++] = 0x7d4802a6; /* mflr r10		*/
+		instrs[i++] = 0x60000000; /* branch patched below */
+		instrs[i++] = 0x7d4803a6; /* mtlr r10		*/
+	} else if (types & STF_BARRIER_EIEIO) {
+		instrs[i++] = 0x7e0006ac; /* eieio + bit 6 hint */
+	} else if (types & STF_BARRIER_SYNC_ORI) {
+		instrs[i++] = 0x7c0004ac; /* hwsync		*/
+		instrs[i++] = 0xe94d0000; /* ld r10,0(r13)	*/
+		instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */
+	}
+
+	for (i = 0; start < end; start++, i++) {
+		dest = (void *)start + *start;
+
+		pr_devel("patching dest %lx\n", (unsigned long)dest);
+
+		patch_instruction(dest, instrs[0]);
+
+		if (types & STF_BARRIER_FALLBACK)
+			patch_branch(dest + 1, (unsigned long)&stf_barrier_fallback,
+				     BRANCH_SET_LINK);
+		else
+			patch_instruction(dest + 1, instrs[1]);
+
+		patch_instruction(dest + 2, instrs[2]);
+	}
+
+	printk(KERN_DEBUG "stf-barrier: patched %d entry locations (%s barrier)\n", i,
+		(types == STF_BARRIER_NONE)                  ? "no" :
+		(types == STF_BARRIER_FALLBACK)              ? "fallback" :
+		(types == STF_BARRIER_EIEIO)                 ? "eieio" :
+		(types == (STF_BARRIER_SYNC_ORI))            ? "hwsync"
+		                                           : "unknown");
+}
+
+void do_stf_exit_barrier_fixups(enum stf_barrier_type types)
+{
+	unsigned int instrs[6], *dest;
+	long *start, *end;
+	int i;
+
+	start = PTRRELOC(&__start___stf_exit_barrier_fixup),
+	end = PTRRELOC(&__stop___stf_exit_barrier_fixup);
+
+	instrs[0] = 0x60000000; /* nop */
+	instrs[1] = 0x60000000; /* nop */
+	instrs[2] = 0x60000000; /* nop */
+	instrs[3] = 0x60000000; /* nop */
+	instrs[4] = 0x60000000; /* nop */
+	instrs[5] = 0x60000000; /* nop */
+
+	i = 0;
+	if (types & STF_BARRIER_FALLBACK || types & STF_BARRIER_SYNC_ORI) {
+		if (cpu_has_feature(CPU_FTR_HVMODE)) {
+			instrs[i++] = 0x7db14ba6; /* mtspr 0x131, r13 (HSPRG1) */
+			instrs[i++] = 0x7db04aa6; /* mfspr r13, 0x130 (HSPRG0) */
+		} else {
+			instrs[i++] = 0x7db243a6; /* mtsprg 2,r13	*/
+			instrs[i++] = 0x7db142a6; /* mfsprg r13,1    */
+	        }
+		instrs[i++] = 0x7c0004ac; /* hwsync		*/
+		instrs[i++] = 0xe9ad0000; /* ld r13,0(r13)	*/
+		instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */
+		if (cpu_has_feature(CPU_FTR_HVMODE)) {
+			instrs[i++] = 0x7db14aa6; /* mfspr r13, 0x131 (HSPRG1) */
+		} else {
+			instrs[i++] = 0x7db242a6; /* mfsprg r13,2 */
+		}
+	} else if (types & STF_BARRIER_EIEIO) {
+		instrs[i++] = 0x7e0006ac; /* eieio + bit 6 hint */
+	}
+
+	for (i = 0; start < end; start++, i++) {
+		dest = (void *)start + *start;
+
+		pr_devel("patching dest %lx\n", (unsigned long)dest);
+
+		patch_instruction(dest, instrs[0]);
+		patch_instruction(dest + 1, instrs[1]);
+		patch_instruction(dest + 2, instrs[2]);
+		patch_instruction(dest + 3, instrs[3]);
+		patch_instruction(dest + 4, instrs[4]);
+		patch_instruction(dest + 5, instrs[5]);
+	}
+	printk(KERN_DEBUG "stf-barrier: patched %d exit locations (%s barrier)\n", i,
+		(types == STF_BARRIER_NONE)                  ? "no" :
+		(types == STF_BARRIER_FALLBACK)              ? "fallback" :
+		(types == STF_BARRIER_EIEIO)                 ? "eieio" :
+		(types == (STF_BARRIER_SYNC_ORI))            ? "hwsync"
+		                                           : "unknown");
+}
+
+
+void do_stf_barrier_fixups(enum stf_barrier_type types)
+{
+	do_stf_entry_barrier_fixups(types);
+	do_stf_exit_barrier_fixups(types);
+}
+
 void do_rfi_flush_fixups(enum l1d_flush_type types)
 {
 	unsigned int instrs[3], *dest;
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index ef8c9ce53a61..a6648ec99ca7 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -131,6 +131,7 @@ static void __init pnv_setup_arch(void)
 	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
 
 	pnv_setup_rfi_flush();
+	setup_stf_barrier();
 
 	/* Initialize SMP */
 	pnv_smp_init();
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index b55ad4286dc7..fdb32e056ef4 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -710,6 +710,7 @@ static void __init pSeries_setup_arch(void)
 	fwnmi_init();
 
 	pseries_setup_rfi_flush();
+	setup_stf_barrier();
 
 	/* By default, only probe PCI (can be overridden by rtas_pci) */
 	pci_add_flags(PCI_PROBE_ONLY);

From c62ec4610c40bcc44f2d3d5ed1c312737279e2f3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 22 May 2018 13:02:17 +0200
Subject: [PATCH 382/393] PM / core: Fix direct_complete handling for devices
 with no callbacks

Commit 08810a4119aa (PM / core: Add NEVER_SKIP and SMART_PREPARE
driver flags) inadvertently prevented the power.direct_complete flag
from being set for devices without PM callbacks and with disabled
runtime PM which also prevents power.direct_complete from being set
for their parents.  That led to problems including a resume crash on
HP ZBook 14u.

Restore the previous behavior by causing power.direct_complete to be
set for those devices again, but do that in a more direct way to
avoid overlooking that case in the future.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=199693
Fixes: 08810a4119aa (PM / core: Add NEVER_SKIP and SMART_PREPARE driver flags)
Reported-by: Thomas Martitz <kugel@rockbox.org>
Tested-by: Thomas Martitz <kugel@rockbox.org>
Cc: 4.15+ <stable@vger.kernel.org> # 4.15+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Johan Hovold <johan@kernel.org>
---
 drivers/base/power/main.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 02a497e7c785..e5e067091572 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1923,10 +1923,8 @@ static int device_prepare(struct device *dev, pm_message_t state)
 
 	dev->power.wakeup_path = false;
 
-	if (dev->power.no_pm_callbacks) {
-		ret = 1;	/* Let device go direct_complete */
+	if (dev->power.no_pm_callbacks)
 		goto unlock;
-	}
 
 	if (dev->pm_domain)
 		callback = dev->pm_domain->ops.prepare;
@@ -1960,7 +1958,8 @@ static int device_prepare(struct device *dev, pm_message_t state)
 	 */
 	spin_lock_irq(&dev->power.lock);
 	dev->power.direct_complete = state.event == PM_EVENT_SUSPEND &&
-		pm_runtime_suspended(dev) && ret > 0 &&
+		((pm_runtime_suspended(dev) && ret > 0) ||
+		 dev->power.no_pm_callbacks) &&
 		!dev_pm_test_driver_flags(dev, DPM_FLAG_NEVER_SKIP);
 	spin_unlock_irq(&dev->power.lock);
 	return 0;

From 6db615431a21b6057f68ed87583a663ee69f7601 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 9 May 2018 16:04:51 +0200
Subject: [PATCH 383/393] alpha: use dma_direct_ops for jensen

The generic dma_direct implementation does the same thing as the alpha
pci-noop implementation, just with more bells and whistles.  And unlike
the current code it at least has a theoretical chance to actually compile.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Matt Turner <mattst88@gmail.com>
---
 arch/alpha/Kconfig                   |  1 +
 arch/alpha/include/asm/dma-mapping.h |  4 ++++
 arch/alpha/kernel/pci-noop.c         | 33 ----------------------------
 3 files changed, 5 insertions(+), 33 deletions(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index b2022885ced8..f19dc31288c8 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -211,6 +211,7 @@ config ALPHA_EIGER
 config ALPHA_JENSEN
 	bool "Jensen"
 	depends on BROKEN
+	select DMA_DIRECT_OPS
 	help
 	  DEC PC 150 AXP (aka Jensen): This is a very old Digital system - one
 	  of the first-generation Alpha systems. A number of these systems
diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index b78f61f20796..76ce923ecca1 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -6,7 +6,11 @@ extern const struct dma_map_ops *dma_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
+#ifdef CONFIG_ALPHA_JENSEN
+	return &dma_direct_ops;
+#else
 	return dma_ops;
+#endif
 }
 
 #endif	/* _ALPHA_DMA_MAPPING_H */
diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c
index b6ebb65127a8..c7c5879869d3 100644
--- a/arch/alpha/kernel/pci-noop.c
+++ b/arch/alpha/kernel/pci-noop.c
@@ -102,36 +102,3 @@ SYSCALL_DEFINE5(pciconfig_write, unsigned long, bus, unsigned long, dfn,
 	else
 		return -ENODEV;
 }
-
-static void *alpha_noop_alloc_coherent(struct device *dev, size_t size,
-				       dma_addr_t *dma_handle, gfp_t gfp,
-				       unsigned long attrs)
-{
-	void *ret;
-
-	if (!dev || *dev->dma_mask >= 0xffffffffUL)
-		gfp &= ~GFP_DMA;
-	ret = (void *)__get_free_pages(gfp, get_order(size));
-	if (ret) {
-		memset(ret, 0, size);
-		*dma_handle = virt_to_phys(ret);
-	}
-	return ret;
-}
-
-static int alpha_noop_supported(struct device *dev, u64 mask)
-{
-	return mask < 0x00ffffffUL ? 0 : 1;
-}
-
-const struct dma_map_ops alpha_noop_ops = {
-	.alloc			= alpha_noop_alloc_coherent,
-	.free			= dma_noop_free_coherent,
-	.map_page		= dma_noop_map_page,
-	.map_sg			= dma_noop_map_sg,
-	.mapping_error		= dma_noop_mapping_error,
-	.dma_supported		= alpha_noop_supported,
-};
-
-const struct dma_map_ops *dma_ops = &alpha_noop_ops;
-EXPORT_SYMBOL(dma_ops);

From f5e82fa26063e6fad10624ff600457d878fa6e41 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 9 May 2018 16:04:52 +0200
Subject: [PATCH 384/393] alpha: simplify get_arch_dma_ops

Remove the dma_ops indirection.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Matt Turner <mattst88@gmail.com>
---
 arch/alpha/include/asm/dma-mapping.h | 4 ++--
 arch/alpha/kernel/pci_iommu.c        | 4 +---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index 76ce923ecca1..8beeafd4f68e 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -2,14 +2,14 @@
 #ifndef _ALPHA_DMA_MAPPING_H
 #define _ALPHA_DMA_MAPPING_H
 
-extern const struct dma_map_ops *dma_ops;
+extern const struct dma_map_ops alpha_pci_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 #ifdef CONFIG_ALPHA_JENSEN
 	return &dma_direct_ops;
 #else
-	return dma_ops;
+	return &alpha_pci_ops;
 #endif
 }
 
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 83b34b9188ea..6923b0d9c1e1 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -950,6 +950,4 @@ const struct dma_map_ops alpha_pci_ops = {
 	.mapping_error		= alpha_pci_mapping_error,
 	.dma_supported		= alpha_pci_supported,
 };
-
-const struct dma_map_ops *dma_ops = &alpha_pci_ops;
-EXPORT_SYMBOL(dma_ops);
+EXPORT_SYMBOL(alpha_pci_ops);

From 92d7223a74235054f2aa7227d207d9c57f84dca0 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Mon, 16 Apr 2018 18:16:56 -0400
Subject: [PATCH 385/393] alpha: io: reorder barriers to guarantee writeX() and
 iowriteX() ordering #2

memory-barriers.txt has been updated with the following requirement.

"When using writel(), a prior wmb() is not needed to guarantee that the
cache coherent memory writes have completed before writing to the MMIO
region."

Current writeX() and iowriteX() implementations on alpha are not
satisfying this requirement as the barrier is after the register write.

Move mb() in writeX() and iowriteX() functions to guarantee that HW
observes memory changes before performing register operations.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Matt Turner <mattst88@gmail.com>
---
 arch/alpha/kernel/io.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/alpha/kernel/io.c b/arch/alpha/kernel/io.c
index 3e3d49c254c5..c025a3e5e357 100644
--- a/arch/alpha/kernel/io.c
+++ b/arch/alpha/kernel/io.c
@@ -37,20 +37,20 @@ unsigned int ioread32(void __iomem *addr)
 
 void iowrite8(u8 b, void __iomem *addr)
 {
-	IO_CONCAT(__IO_PREFIX,iowrite8)(b, addr);
 	mb();
+	IO_CONCAT(__IO_PREFIX,iowrite8)(b, addr);
 }
 
 void iowrite16(u16 b, void __iomem *addr)
 {
-	IO_CONCAT(__IO_PREFIX,iowrite16)(b, addr);
 	mb();
+	IO_CONCAT(__IO_PREFIX,iowrite16)(b, addr);
 }
 
 void iowrite32(u32 b, void __iomem *addr)
 {
-	IO_CONCAT(__IO_PREFIX,iowrite32)(b, addr);
 	mb();
+	IO_CONCAT(__IO_PREFIX,iowrite32)(b, addr);
 }
 
 EXPORT_SYMBOL(ioread8);
@@ -176,26 +176,26 @@ u64 readq(const volatile void __iomem *addr)
 
 void writeb(u8 b, volatile void __iomem *addr)
 {
-	__raw_writeb(b, addr);
 	mb();
+	__raw_writeb(b, addr);
 }
 
 void writew(u16 b, volatile void __iomem *addr)
 {
-	__raw_writew(b, addr);
 	mb();
+	__raw_writew(b, addr);
 }
 
 void writel(u32 b, volatile void __iomem *addr)
 {
-	__raw_writel(b, addr);
 	mb();
+	__raw_writel(b, addr);
 }
 
 void writeq(u64 b, volatile void __iomem *addr)
 {
-	__raw_writeq(b, addr);
 	mb();
+	__raw_writeq(b, addr);
 }
 
 EXPORT_SYMBOL(readb);

From 11799564fc7eedff50801950090773928f867996 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris@chromium.org>
Date: Tue, 22 May 2018 17:23:10 -0700
Subject: [PATCH 386/393] mfd: cros_ec: Retry commands when EC is known to be
 busy

Commit 001dde9400d5 ("mfd: cros ec: spi: Fix "in progress" error
signaling") pointed out some bad code, but its analysis and conclusion
was not 100% correct.

It *is* correct that we should not propagate result==EC_RES_IN_PROGRESS
for transport errors, because this has a special meaning -- that we
should follow up with EC_CMD_GET_COMMS_STATUS until the EC is no longer
busy. This is definitely the wrong thing for many commands, because
among other problems, EC_CMD_GET_COMMS_STATUS doesn't actually retrieve
any RX data from the EC, so commands that expected some data back will
instead start processing junk.

For such commands, the right answer is to either propagate the error
(and return that error to the caller) or resend the original command
(*not* EC_CMD_GET_COMMS_STATUS).

Unfortunately, commit 001dde9400d5 forgets a crucial point: that for
some long-running operations, the EC physically cannot respond to
commands any more. For example, with EC_CMD_FLASH_ERASE, the EC may be
re-flashing its own code regions, so it can't respond to SPI interrupts.
Instead, the EC prepares us ahead of time for being busy for a "long"
time, and fills its hardware buffer with EC_SPI_PAST_END. Thus, we
expect to see several "transport" errors (or, messages filled with
EC_SPI_PAST_END). So we should really translate that to a retryable
error (-EAGAIN) and continue sending EC_CMD_GET_COMMS_STATUS until we
get a ready status.

IOW, it is actually important to treat some of these "junk" values as
retryable errors.

Together with commit 001dde9400d5, this resolves bugs like the
following:

1. EC_CMD_FLASH_ERASE now works again (with commit 001dde9400d5, we
   would abort the first time we saw EC_SPI_PAST_END)
2. Before commit 001dde9400d5, transport errors (e.g.,
   EC_SPI_RX_BAD_DATA) seen in other commands (e.g.,
   EC_CMD_RTC_GET_VALUE) used to yield junk data in the RX buffer; they
   will now yield -EAGAIN return values, and tools like 'hwclock' will
   simply fail instead of retrieving and re-programming undefined time
   values

Fixes: 001dde9400d5 ("mfd: cros ec: spi: Fix "in progress" error signaling")
Signed-off-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec_spi.c               | 24 ++++++++++++++++++++----
 drivers/platform/chrome/cros_ec_proto.c |  2 ++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c
index 1b52b8557034..2060d1483043 100644
--- a/drivers/mfd/cros_ec_spi.c
+++ b/drivers/mfd/cros_ec_spi.c
@@ -419,10 +419,25 @@ static int cros_ec_pkt_xfer_spi(struct cros_ec_device *ec_dev,
 		/* Verify that EC can process command */
 		for (i = 0; i < len; i++) {
 			rx_byte = rx_buf[i];
+			/*
+			 * Seeing the PAST_END, RX_BAD_DATA, or NOT_READY
+			 * markers are all signs that the EC didn't fully
+			 * receive our command. e.g., if the EC is flashing
+			 * itself, it can't respond to any commands and instead
+			 * clocks out EC_SPI_PAST_END from its SPI hardware
+			 * buffer. Similar occurrences can happen if the AP is
+			 * too slow to clock out data after asserting CS -- the
+			 * EC will abort and fill its buffer with
+			 * EC_SPI_RX_BAD_DATA.
+			 *
+			 * In all cases, these errors should be safe to retry.
+			 * Report -EAGAIN and let the caller decide what to do
+			 * about that.
+			 */
 			if (rx_byte == EC_SPI_PAST_END  ||
 			    rx_byte == EC_SPI_RX_BAD_DATA ||
 			    rx_byte == EC_SPI_NOT_READY) {
-				ret = -EREMOTEIO;
+				ret = -EAGAIN;
 				break;
 			}
 		}
@@ -431,7 +446,7 @@ static int cros_ec_pkt_xfer_spi(struct cros_ec_device *ec_dev,
 	if (!ret)
 		ret = cros_ec_spi_receive_packet(ec_dev,
 				ec_msg->insize + sizeof(*response));
-	else
+	else if (ret != -EAGAIN)
 		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
 
 	final_ret = terminate_request(ec_dev);
@@ -537,10 +552,11 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
 		/* Verify that EC can process command */
 		for (i = 0; i < len; i++) {
 			rx_byte = rx_buf[i];
+			/* See comments in cros_ec_pkt_xfer_spi() */
 			if (rx_byte == EC_SPI_PAST_END  ||
 			    rx_byte == EC_SPI_RX_BAD_DATA ||
 			    rx_byte == EC_SPI_NOT_READY) {
-				ret = -EREMOTEIO;
+				ret = -EAGAIN;
 				break;
 			}
 		}
@@ -549,7 +565,7 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
 	if (!ret)
 		ret = cros_ec_spi_receive_response(ec_dev,
 				ec_msg->insize + EC_MSG_TX_PROTO_BYTES);
-	else
+	else if (ret != -EAGAIN)
 		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
 
 	final_ret = terminate_request(ec_dev);
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index e7bbdf947bbc..8350ca2311c7 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -91,6 +91,8 @@ static int send_command(struct cros_ec_device *ec_dev,
 			usleep_range(10000, 11000);
 
 			ret = (*xfer_fxn)(ec_dev, status_msg);
+			if (ret == -EAGAIN)
+				continue;
 			if (ret < 0)
 				break;
 

From 30bf066cd9989fef34aeeef9080368867fe42be7 Mon Sep 17 00:00:00 2001
From: "Kalderon, Michal" <Michal.Kalderon@cavium.com>
Date: Tue, 15 May 2018 15:13:33 +0300
Subject: [PATCH 387/393] RDMA/qedr: Fix doorbell bar mapping for dpi > 1

Each user_context receives a separate dpi value and thus a different
address on the doorbell bar. The qedr_mmap function needs to validate
the address and map the doorbell bar accordingly.
The current implementation always checked against dpi=0 doorbell range
leading to a wrong mapping for doorbell bar. (It entered an else case
that mapped the address differently). qedr_mmap should only be used
for doorbells, so the else was actually wrong in the first place.
This only has an affect on arm architecture and not an issue on a
x86 based architecture.
This lead to doorbells not occurring on arm based systems and left
applications that use more than one dpi (or several applications
run simultaneously ) to hang.

Fixes: ac1b36e55a51 ("qedr: Add support for user context verbs")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/qedr/verbs.c | 62 +++++++++++++++---------------
 1 file changed, 30 insertions(+), 32 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 7d3763b2e01c..3f9afc02d166 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -401,49 +401,47 @@ int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 {
 	struct qedr_ucontext *ucontext = get_qedr_ucontext(context);
 	struct qedr_dev *dev = get_qedr_dev(context->device);
-	unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT;
-	u64 unmapped_db = dev->db_phys_addr;
+	unsigned long phys_addr = vma->vm_pgoff << PAGE_SHIFT;
 	unsigned long len = (vma->vm_end - vma->vm_start);
-	int rc = 0;
-	bool found;
+	unsigned long dpi_start;
+
+	dpi_start = dev->db_phys_addr + (ucontext->dpi * ucontext->dpi_size);
 
 	DP_DEBUG(dev, QEDR_MSG_INIT,
-		 "qedr_mmap called vm_page=0x%lx vm_pgoff=0x%lx unmapped_db=0x%llx db_size=%x, len=%lx\n",
-		 vm_page, vma->vm_pgoff, unmapped_db, dev->db_size, len);
-	if (vma->vm_start & (PAGE_SIZE - 1)) {
-		DP_ERR(dev, "Vma_start not page aligned = %ld\n",
-		       vma->vm_start);
+		 "mmap invoked with vm_start=0x%pK, vm_end=0x%pK,vm_pgoff=0x%pK; dpi_start=0x%pK dpi_size=0x%x\n",
+		 (void *)vma->vm_start, (void *)vma->vm_end,
+		 (void *)vma->vm_pgoff, (void *)dpi_start, ucontext->dpi_size);
+
+	if ((vma->vm_start & (PAGE_SIZE - 1)) || (len & (PAGE_SIZE - 1))) {
+		DP_ERR(dev,
+		       "failed mmap, adrresses must be page aligned: start=0x%pK, end=0x%pK\n",
+		       (void *)vma->vm_start, (void *)vma->vm_end);
 		return -EINVAL;
 	}
 
-	found = qedr_search_mmap(ucontext, vm_page, len);
-	if (!found) {
-		DP_ERR(dev, "Vma_pgoff not found in mapped array = %ld\n",
+	if (!qedr_search_mmap(ucontext, phys_addr, len)) {
+		DP_ERR(dev, "failed mmap, vm_pgoff=0x%lx is not authorized\n",
 		       vma->vm_pgoff);
 		return -EINVAL;
 	}
 
-	DP_DEBUG(dev, QEDR_MSG_INIT, "Mapping doorbell bar\n");
-
-	if ((vm_page >= unmapped_db) && (vm_page <= (unmapped_db +
-						     dev->db_size))) {
-		DP_DEBUG(dev, QEDR_MSG_INIT, "Mapping doorbell bar\n");
-		if (vma->vm_flags & VM_READ) {
-			DP_ERR(dev, "Trying to map doorbell bar for read\n");
-			return -EPERM;
-		}
-
-		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-
-		rc = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-					PAGE_SIZE, vma->vm_page_prot);
-	} else {
-		DP_DEBUG(dev, QEDR_MSG_INIT, "Mapping chains\n");
-		rc = remap_pfn_range(vma, vma->vm_start,
-				     vma->vm_pgoff, len, vma->vm_page_prot);
+	if (phys_addr < dpi_start ||
+	    ((phys_addr + len) > (dpi_start + ucontext->dpi_size))) {
+		DP_ERR(dev,
+		       "failed mmap, pages are outside of dpi; page address=0x%pK, dpi_start=0x%pK, dpi_size=0x%x\n",
+		       (void *)phys_addr, (void *)dpi_start,
+		       ucontext->dpi_size);
+		return -EINVAL;
 	}
-	DP_DEBUG(dev, QEDR_MSG_INIT, "qedr_mmap return code: %d\n", rc);
-	return rc;
+
+	if (vma->vm_flags & VM_READ) {
+		DP_ERR(dev, "failed mmap, cannot map doorbell bar for read\n");
+		return -EINVAL;
+	}
+
+	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+	return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, len,
+				  vma->vm_page_prot);
 }
 
 struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev,

From f4602cbb0a2478dda8238a4f382867da425daa8e Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 22 May 2018 15:56:51 -0600
Subject: [PATCH 388/393] IB/uverbs: Fix uverbs_attr_get_obj

The err pointer comes from uverbs_attr_get, not from the uobject member,
which does not store an ERR_PTR.

Fixes: be934cca9e98 ("IB/uverbs: Add device memory registration ioctl support")
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/rdma/uverbs_ioctl.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h
index 4a4201d997a7..095383a4bd1a 100644
--- a/include/rdma/uverbs_ioctl.h
+++ b/include/rdma/uverbs_ioctl.h
@@ -411,13 +411,13 @@ static inline int uverbs_attr_get_enum_id(const struct uverbs_attr_bundle *attrs
 static inline void *uverbs_attr_get_obj(const struct uverbs_attr_bundle *attrs_bundle,
 					u16 idx)
 {
-	struct ib_uobject *uobj =
-		uverbs_attr_get(attrs_bundle, idx)->obj_attr.uobject;
+	const struct uverbs_attr *attr;
 
-	if (IS_ERR(uobj))
-		return uobj;
+	attr = uverbs_attr_get(attrs_bundle, idx);
+	if (IS_ERR(attr))
+		return ERR_CAST(attr);
 
-	return uobj->object;
+	return attr->obj_attr.uobject->object;
 }
 
 static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle,

From 05d6a4ddb654ef6f2fbbcf9dcb3b263184baa8e4 Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Tue, 22 May 2018 20:47:14 +0800
Subject: [PATCH 389/393] RDMA/hns: Bugfix for cq record db for kernel

When use cq record db for kernel, it needs to set the hr_cq->db_en
to 1 and configure the dma address of record cq db of qp context.

Fixes: 86188a8810ed ("RDMA/hns: Support cq record doorbell for kernel space")
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_cq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 14734d0d0b76..3a485f50fede 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -377,6 +377,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
 
 			hr_cq->set_ci_db = hr_cq->db.db_record;
 			*hr_cq->set_ci_db = 0;
+			hr_cq->db_en = 1;
 		}
 
 		/* Init mmt table and write buff address to mtt table */

From 55ba49cbcef37053d973f9a45bc58818c333fe13 Mon Sep 17 00:00:00 2001
From: oulijun <oulijun@huawei.com>
Date: Tue, 22 May 2018 20:47:15 +0800
Subject: [PATCH 390/393] RDMA/hns: Move the location for initializing tmp_len

When posted work request, it need to compute the length of
all sges of every wr and fill it into the msg_len field of
send wqe. Thus, While posting multiple wr,
tmp_len should be reinitialized to zero.

Fixes: 8b9b8d143b46 ("RDMA/hns: Fix the endian problem for hns")
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 7ecf0c911808..1f0965bb64ee 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -142,8 +142,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	unsigned long flags;
 	unsigned int ind;
 	void *wqe = NULL;
-	u32 tmp_len = 0;
 	bool loopback;
+	u32 tmp_len;
 	int ret = 0;
 	u8 *smac;
 	int nreq;
@@ -189,6 +189,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 
 		owner_bit =
 		       ~(((qp->sq.head + nreq) >> ilog2(qp->sq.wqe_cnt)) & 0x1);
+		tmp_len = 0;
 
 		/* Corresponding to the QP type, wqe process separately */
 		if (ibqp->qp_type == IB_QPT_GSI) {

From d50147381aa0c9725d63a677c138c47f55d6d3bc Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 22 May 2018 09:47:58 -0700
Subject: [PATCH 391/393] Btrfs: fix error handling in btrfs_truncate()

Jun Wu at Facebook reported that an internal service was seeing a return
value of 1 from ftruncate() on Btrfs in some cases. This is coming from
the NEED_TRUNCATE_BLOCK return value from btrfs_truncate_inode_items().

btrfs_truncate() uses two variables for error handling, ret and err.
When btrfs_truncate_inode_items() returns non-zero, we set err to the
return value. However, NEED_TRUNCATE_BLOCK is not an error. Make sure we
only set err if ret is an error (i.e., negative).

To reproduce the issue: mount a filesystem with -o compress-force=zstd
and the following program will encounter return value of 1 from
ftruncate:

int main(void) {
        char buf[256] = { 0 };
        int ret;
        int fd;

        fd = open("test", O_CREAT | O_WRONLY | O_TRUNC, 0666);
        if (fd == -1) {
                perror("open");
                return EXIT_FAILURE;
        }

        if (write(fd, buf, sizeof(buf)) != sizeof(buf)) {
                perror("write");
                close(fd);
                return EXIT_FAILURE;
        }

        if (fsync(fd) == -1) {
                perror("fsync");
                close(fd);
                return EXIT_FAILURE;
        }

        ret = ftruncate(fd, 128);
        if (ret) {
                printf("ftruncate() returned %d\n", ret);
                close(fd);
                return EXIT_FAILURE;
        }

        close(fd);
        return EXIT_SUCCESS;
}

Fixes: ddfae63cc8e0 ("btrfs: move btrfs_truncate_block out of trans handle")
CC: stable@vger.kernel.org # 4.15+
Reported-by: Jun Wu <quark@fb.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8e604e7071f1..d82afca0c05f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9124,7 +9124,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 						 BTRFS_EXTENT_DATA_KEY);
 		trans->block_rsv = &fs_info->trans_block_rsv;
 		if (ret != -ENOSPC && ret != -EAGAIN) {
-			err = ret;
+			if (ret < 0)
+				err = ret;
 			break;
 		}
 

From 4544e403eb25552aed7f0ee181a7a506b8800403 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 24 May 2018 11:12:16 +0300
Subject: [PATCH 392/393] ahci: Add PCI ID for Cannon Lake PCH-LP AHCI

This one should be using the default LPM policy for mobile chipsets so
add the PCI ID to the driver list of supported revices.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 drivers/ata/ahci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 6389c88b3500..738fb22978dd 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -334,6 +334,7 @@ static const struct pci_device_id ahci_pci_tbl[] = {
 	{ PCI_VDEVICE(INTEL, 0x9c07), board_ahci_mobile }, /* Lynx LP RAID */
 	{ PCI_VDEVICE(INTEL, 0x9c0e), board_ahci_mobile }, /* Lynx LP RAID */
 	{ PCI_VDEVICE(INTEL, 0x9c0f), board_ahci_mobile }, /* Lynx LP RAID */
+	{ PCI_VDEVICE(INTEL, 0x9dd3), board_ahci_mobile }, /* Cannon Lake PCH-LP AHCI */
 	{ PCI_VDEVICE(INTEL, 0x1f22), board_ahci }, /* Avoton AHCI */
 	{ PCI_VDEVICE(INTEL, 0x1f23), board_ahci }, /* Avoton AHCI */
 	{ PCI_VDEVICE(INTEL, 0x1f24), board_ahci }, /* Avoton RAID */

From d883c6cf3b39f1f42506e82ad2779fb88004acf3 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Wed, 23 May 2018 10:18:21 +0900
Subject: [PATCH 393/393] Revert "mm/cma: manage the memory of the CMA area by
 using the ZONE_MOVABLE"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts the following commits that change CMA design in MM.

 3d2054ad8c2d ("ARM: CMA: avoid double mapping to the CMA area if CONFIG_HIGHMEM=y")

 1d47a3ec09b5 ("mm/cma: remove ALLOC_CMA")

 bad8c6c0b114 ("mm/cma: manage the memory of the CMA area by using the ZONE_MOVABLE")

Ville reported a following error on i386.

  Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
  microcode: microcode updated early to revision 0x4, date = 2013-06-28
  Initializing CPU#0
  Initializing HighMem for node 0 (000377fe:00118000)
  Initializing Movable for node 0 (00000001:00118000)
  BUG: Bad page state in process swapper  pfn:377fe
  page:f53effc0 count:0 mapcount:-127 mapping:00000000 index:0x0
  flags: 0x80000000()
  raw: 80000000 00000000 00000000 ffffff80 00000000 00000100 00000200 00000001
  page dumped because: nonzero mapcount
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0-rc5-elk+ #145
  Hardware name: Dell Inc. Latitude E5410/03VXMC, BIOS A15 07/11/2013
  Call Trace:
   dump_stack+0x60/0x96
   bad_page+0x9a/0x100
   free_pages_check_bad+0x3f/0x60
   free_pcppages_bulk+0x29d/0x5b0
   free_unref_page_commit+0x84/0xb0
   free_unref_page+0x3e/0x70
   __free_pages+0x1d/0x20
   free_highmem_page+0x19/0x40
   add_highpages_with_active_regions+0xab/0xeb
   set_highmem_pages_init+0x66/0x73
   mem_init+0x1b/0x1d7
   start_kernel+0x17a/0x363
   i386_start_kernel+0x95/0x99
   startup_32_smp+0x164/0x168

The reason for this error is that the span of MOVABLE_ZONE is extended
to whole node span for future CMA initialization, and, normal memory is
wrongly freed here.  I submitted the fix and it seems to work, but,
another problem happened.

It's so late time to fix the later problem so I decide to reverting the
series.

Reported-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Acked-by: Laura Abbott <labbott@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mm/dma-mapping.c      | 16 +------
 include/linux/memory_hotplug.h |  3 ++
 include/linux/mm.h             |  1 -
 mm/cma.c                       | 83 +++++-----------------------------
 mm/compaction.c                |  4 +-
 mm/internal.h                  |  4 +-
 mm/page_alloc.c                | 83 ++++++++++++----------------------
 7 files changed, 49 insertions(+), 145 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 8c398fedbbb6..ada8eb206a90 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -466,12 +466,6 @@ void __init dma_contiguous_early_fixup(phys_addr_t base, unsigned long size)
 void __init dma_contiguous_remap(void)
 {
 	int i;
-
-	if (!dma_mmu_remap_num)
-		return;
-
-	/* call flush_cache_all() since CMA area would be large enough */
-	flush_cache_all();
 	for (i = 0; i < dma_mmu_remap_num; i++) {
 		phys_addr_t start = dma_mmu_remap[i].base;
 		phys_addr_t end = start + dma_mmu_remap[i].size;
@@ -504,15 +498,7 @@ void __init dma_contiguous_remap(void)
 		flush_tlb_kernel_range(__phys_to_virt(start),
 				       __phys_to_virt(end));
 
-		/*
-		 * All the memory in CMA region will be on ZONE_MOVABLE.
-		 * If that zone is considered as highmem, the memory in CMA
-		 * region is also considered as highmem even if it's
-		 * physical address belong to lowmem. In this case,
-		 * re-mapping isn't required.
-		 */
-		if (!is_highmem_idx(ZONE_MOVABLE))
-			iotable_init(&map, 1);
+		iotable_init(&map, 1);
 	}
 }
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e0e49b5b1ee1..2b0265265c28 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -216,6 +216,9 @@ void put_online_mems(void);
 void mem_hotplug_begin(void);
 void mem_hotplug_done(void);
 
+extern void set_zone_contiguous(struct zone *zone);
+extern void clear_zone_contiguous(struct zone *zone);
+
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 #define pfn_to_online_page(pfn)			\
 ({						\
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c6fa9a255dbf..02a616e2f17d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2109,7 +2109,6 @@ extern void setup_per_cpu_pageset(void);
 
 extern void zone_pcp_update(struct zone *zone);
 extern void zone_pcp_reset(struct zone *zone);
-extern void setup_zone_pageset(struct zone *zone);
 
 /* page_alloc.c */
 extern int min_free_kbytes;
diff --git a/mm/cma.c b/mm/cma.c
index aa40e6c7b042..5809bbe360d7 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -39,7 +39,6 @@
 #include <trace/events/cma.h>
 
 #include "cma.h"
-#include "internal.h"
 
 struct cma cma_areas[MAX_CMA_AREAS];
 unsigned cma_area_count;
@@ -110,25 +109,23 @@ static int __init cma_activate_area(struct cma *cma)
 	if (!cma->bitmap)
 		return -ENOMEM;
 
+	WARN_ON_ONCE(!pfn_valid(pfn));
+	zone = page_zone(pfn_to_page(pfn));
+
 	do {
 		unsigned j;
 
 		base_pfn = pfn;
-		if (!pfn_valid(base_pfn))
-			goto err;
-
-		zone = page_zone(pfn_to_page(base_pfn));
 		for (j = pageblock_nr_pages; j; --j, pfn++) {
-			if (!pfn_valid(pfn))
-				goto err;
-
+			WARN_ON_ONCE(!pfn_valid(pfn));
 			/*
-			 * In init_cma_reserved_pageblock(), present_pages
-			 * is adjusted with assumption that all pages in
-			 * the pageblock come from a single zone.
+			 * alloc_contig_range requires the pfn range
+			 * specified to be in the same zone. Make this
+			 * simple by forcing the entire CMA resv range
+			 * to be in the same zone.
 			 */
 			if (page_zone(pfn_to_page(pfn)) != zone)
-				goto err;
+				goto not_in_zone;
 		}
 		init_cma_reserved_pageblock(pfn_to_page(base_pfn));
 	} while (--i);
@@ -142,7 +139,7 @@ static int __init cma_activate_area(struct cma *cma)
 
 	return 0;
 
-err:
+not_in_zone:
 	pr_err("CMA area %s could not be activated\n", cma->name);
 	kfree(cma->bitmap);
 	cma->count = 0;
@@ -152,41 +149,6 @@ static int __init cma_activate_area(struct cma *cma)
 static int __init cma_init_reserved_areas(void)
 {
 	int i;
-	struct zone *zone;
-	pg_data_t *pgdat;
-
-	if (!cma_area_count)
-		return 0;
-
-	for_each_online_pgdat(pgdat) {
-		unsigned long start_pfn = UINT_MAX, end_pfn = 0;
-
-		zone = &pgdat->node_zones[ZONE_MOVABLE];
-
-		/*
-		 * In this case, we cannot adjust the zone range
-		 * since it is now maximum node span and we don't
-		 * know original zone range.
-		 */
-		if (populated_zone(zone))
-			continue;
-
-		for (i = 0; i < cma_area_count; i++) {
-			if (pfn_to_nid(cma_areas[i].base_pfn) !=
-				pgdat->node_id)
-				continue;
-
-			start_pfn = min(start_pfn, cma_areas[i].base_pfn);
-			end_pfn = max(end_pfn, cma_areas[i].base_pfn +
-						cma_areas[i].count);
-		}
-
-		if (!end_pfn)
-			continue;
-
-		zone->zone_start_pfn = start_pfn;
-		zone->spanned_pages = end_pfn - start_pfn;
-	}
 
 	for (i = 0; i < cma_area_count; i++) {
 		int ret = cma_activate_area(&cma_areas[i]);
@@ -195,32 +157,9 @@ static int __init cma_init_reserved_areas(void)
 			return ret;
 	}
 
-	/*
-	 * Reserved pages for ZONE_MOVABLE are now activated and
-	 * this would change ZONE_MOVABLE's managed page counter and
-	 * the other zones' present counter. We need to re-calculate
-	 * various zone information that depends on this initialization.
-	 */
-	build_all_zonelists(NULL);
-	for_each_populated_zone(zone) {
-		if (zone_idx(zone) == ZONE_MOVABLE) {
-			zone_pcp_reset(zone);
-			setup_zone_pageset(zone);
-		} else
-			zone_pcp_update(zone);
-
-		set_zone_contiguous(zone);
-	}
-
-	/*
-	 * We need to re-init per zone wmark by calling
-	 * init_per_zone_wmark_min() but doesn't call here because it is
-	 * registered on core_initcall and it will be called later than us.
-	 */
-
 	return 0;
 }
-pure_initcall(cma_init_reserved_areas);
+core_initcall(cma_init_reserved_areas);
 
 /**
  * cma_init_reserved_mem() - create custom contiguous area from reserved memory
diff --git a/mm/compaction.c b/mm/compaction.c
index 028b7210a669..29bd1df18b98 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1450,12 +1450,14 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
 	 * if compaction succeeds.
 	 * For costly orders, we require low watermark instead of min for
 	 * compaction to proceed to increase its chances.
+	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+	 * suitable migration targets
 	 */
 	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
 				low_wmark_pages(zone) : min_wmark_pages(zone);
 	watermark += compact_gap(order);
 	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
-						0, wmark_target))
+						ALLOC_CMA, wmark_target))
 		return COMPACT_SKIPPED;
 
 	return COMPACT_CONTINUE;
diff --git a/mm/internal.h b/mm/internal.h
index 62d8c34e63d5..502d14189794 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -168,9 +168,6 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern int user_min_free_kbytes;
 
-extern void set_zone_contiguous(struct zone *zone);
-extern void clear_zone_contiguous(struct zone *zone);
-
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
 /*
@@ -498,6 +495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_HARDER		0x10 /* try to alloc harder */
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
+#define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
 
 enum ttu_flags;
 struct tlbflush_unmap_batch;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 905db9d7962f..511a7124d7f9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1743,38 +1743,16 @@ void __init page_alloc_init_late(void)
 }
 
 #ifdef CONFIG_CMA
-static void __init adjust_present_page_count(struct page *page, long count)
-{
-	struct zone *zone = page_zone(page);
-
-	/* We don't need to hold a lock since it is boot-up process */
-	zone->present_pages += count;
-}
-
 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
 void __init init_cma_reserved_pageblock(struct page *page)
 {
 	unsigned i = pageblock_nr_pages;
-	unsigned long pfn = page_to_pfn(page);
 	struct page *p = page;
-	int nid = page_to_nid(page);
-
-	/*
-	 * ZONE_MOVABLE will steal present pages from other zones by
-	 * changing page links so page_zone() is changed. Before that,
-	 * we need to adjust previous zone's page count first.
-	 */
-	adjust_present_page_count(page, -pageblock_nr_pages);
 
 	do {
 		__ClearPageReserved(p);
 		set_page_count(p, 0);
-
-		/* Steal pages from other zones */
-		set_page_links(p, ZONE_MOVABLE, nid, pfn);
-	} while (++p, ++pfn, --i);
-
-	adjust_present_page_count(page, pageblock_nr_pages);
+	} while (++p, --i);
 
 	set_pageblock_migratetype(page, MIGRATE_CMA);
 
@@ -2889,7 +2867,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
 		 * exists.
 		 */
 		watermark = min_wmark_pages(zone) + (1UL << order);
-		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
 			return 0;
 
 		__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -3165,6 +3143,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 	}
 
 
+#ifdef CONFIG_CMA
+	/* If allocation can't use CMA areas don't use free CMA pages */
+	if (!(alloc_flags & ALLOC_CMA))
+		free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
 	/*
 	 * Check watermarks for an order-0 allocation request. If these
 	 * are not met, then a high-order request also cannot go ahead
@@ -3191,8 +3175,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 		}
 
 #ifdef CONFIG_CMA
-		if (!list_empty(&area->free_list[MIGRATE_CMA]))
+		if ((alloc_flags & ALLOC_CMA) &&
+		    !list_empty(&area->free_list[MIGRATE_CMA])) {
 			return true;
+		}
 #endif
 		if (alloc_harder &&
 			!list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
@@ -3212,6 +3198,13 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
 		unsigned long mark, int classzone_idx, unsigned int alloc_flags)
 {
 	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+	long cma_pages = 0;
+
+#ifdef CONFIG_CMA
+	/* If allocation can't use CMA areas don't use free CMA pages */
+	if (!(alloc_flags & ALLOC_CMA))
+		cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
 
 	/*
 	 * Fast check for order-0 only. If this fails then the reserves
@@ -3220,7 +3213,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
 	 * the caller is !atomic then it'll uselessly search the free
 	 * list. That corner case is then slower but it is harmless.
 	 */
-	if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx])
+	if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
 		return true;
 
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
@@ -3856,6 +3849,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	} else if (unlikely(rt_task(current)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
+#ifdef CONFIG_CMA
+	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+#endif
 	return alloc_flags;
 }
 
@@ -4322,6 +4319,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 	if (should_fail_alloc_page(gfp_mask, order))
 		return false;
 
+	if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+		*alloc_flags |= ALLOC_CMA;
+
 	return true;
 }
 
@@ -6204,7 +6204,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 {
 	enum zone_type j;
 	int nid = pgdat->node_id;
-	unsigned long node_end_pfn = 0;
 
 	pgdat_resize_init(pgdat);
 #ifdef CONFIG_NUMA_BALANCING
@@ -6232,13 +6231,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, freesize, memmap_pages;
 		unsigned long zone_start_pfn = zone->zone_start_pfn;
-		unsigned long movable_size = 0;
 
 		size = zone->spanned_pages;
 		realsize = freesize = zone->present_pages;
-		if (zone_end_pfn(zone) > node_end_pfn)
-			node_end_pfn = zone_end_pfn(zone);
-
 
 		/*
 		 * Adjust freesize so that it accounts for how much memory
@@ -6287,30 +6282,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		zone_seqlock_init(zone);
 		zone_pcp_init(zone);
 
-		/*
-		 * The size of the CMA area is unknown now so we need to
-		 * prepare the memory for the usemap at maximum.
-		 */
-		if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE &&
-			pgdat->node_spanned_pages) {
-			movable_size = node_end_pfn - pgdat->node_start_pfn;
-		}
-
-		if (!size && !movable_size)
+		if (!size)
 			continue;
 
 		set_pageblock_order();
-		if (movable_size) {
-			zone->zone_start_pfn = pgdat->node_start_pfn;
-			zone->spanned_pages = movable_size;
-			setup_usemap(pgdat, zone,
-				pgdat->node_start_pfn, movable_size);
-			init_currently_empty_zone(zone,
-				pgdat->node_start_pfn, movable_size);
-		} else {
-			setup_usemap(pgdat, zone, zone_start_pfn, size);
-			init_currently_empty_zone(zone, zone_start_pfn, size);
-		}
+		setup_usemap(pgdat, zone, zone_start_pfn, size);
+		init_currently_empty_zone(zone, zone_start_pfn, size);
 		memmap_init(size, nid, j, zone_start_pfn);
 	}
 }
@@ -7951,7 +7928,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
 }
 #endif
 
-#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA
+#ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * The zone indicated has a new number of managed_pages; batch sizes and percpu
  * page high values need to be recalulated.