author     Leonardo Arena <rnalrd@alpinelinux.org>  2019-12-31 12:57:18 +0000
committer  Leonardo Arena <rnalrd@alpinelinux.org>  2019-12-31 12:58:06 +0000
commit     b56efe8db5679b569767cee09b45ce5cd04b942d (patch)
tree       763e2d062569bcacc4e7c798ff591b468fe4002f
parent     d21e62f341aa3be70aa5dc967aeebdf6a54e04d2 (diff)
download   alpine_aports-b56efe8db5679b569767cee09b45ce5cd04b942d.tar.bz2
           alpine_aports-b56efe8db5679b569767cee09b45ce5cd04b942d.tar.xz
           alpine_aports-b56efe8db5679b569767cee09b45ce5cd04b942d.zip
main/xen: security fixes
- CVE-2019-18425 XSA-298
- CVE-2019-18421 XSA-299
- CVE-2019-18423 XSA-301
- CVE-2019-18424 XSA-302
- CVE-2019-18422 XSA-303
- CVE-2018-12207 XSA-304
- CVE-2019-11135 XSA-305

fixes #10968
-rw-r--r--  main/xen/APKBUILD | 63
-rw-r--r--  main/xen/xsa298-4.10.patch | 87
-rw-r--r--  main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch | 94
-rw-r--r--  main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch | 99
-rw-r--r--  main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch | 610
-rw-r--r--  main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch | 141
-rw-r--r--  main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch | 79
-rw-r--r--  main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch | 101
-rw-r--r--  main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch | 374
-rw-r--r--  main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch | 227
-rw-r--r--  main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch | 106
-rw-r--r--  main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch | 169
-rw-r--r--  main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch | 413
-rw-r--r--  main/xen/xsa301-4.11-1.patch | 80
-rw-r--r--  main/xen/xsa301-4.11-2.patch | 92
-rw-r--r--  main/xen/xsa301-4.11-3.patch | 49
-rw-r--r--  main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch | 37
-rw-r--r--  main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch | 498
-rw-r--r--  main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch | 74
-rw-r--r--  main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch | 97
-rw-r--r--  main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | 226
-rw-r--r--  main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | 114
-rw-r--r--  main/xen/xsa304-4.10-1.patch | 71
-rw-r--r--  main/xen/xsa304-4.10-2.patch | 268
-rw-r--r--  main/xen/xsa304-4.10-3.patch | 84
-rw-r--r--  main/xen/xsa305-4.10-1.patch | 288
-rw-r--r--  main/xen/xsa305-4.10-2.patch | 192
27 files changed, 4732 insertions, 1 deletions
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD
index dc6b2dc89b..1582f86324 100644
--- a/main/xen/APKBUILD
+++ b/main/xen/APKBUILD
@@ -3,7 +3,7 @@
 # Maintainer: William Pitcock <nenolod@dereferenced.org>
 pkgname=xen
 pkgver=4.10.4
-pkgrel=0
+pkgrel=1
 pkgdesc="Xen hypervisor"
 url="http://www.xen.org/"
 arch="x86_64 armhf aarch64"
@@ -154,6 +154,14 @@ options="!strip"
 # - XSA-294
 # - XSA-295
 # - XSA-296
+# 4.10.4-r1:
+# - CVE-2019-18425 XSA-298
+# - CVE-2019-18421 XSA-299
+# - CVE-2019-18423 XSA-301
+# - CVE-2019-18424 XSA-302
+# - CVE-2019-18422 XSA-303
+# - CVE-2018-12207 XSA-304
+# - CVE-2019-11135 XSA-305
 
 case "$CARCH" in
 x86*)
@@ -218,6 +226,33 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv
 
 	hotplug-Linux-iscsi-block-handle-lun-1.patch
 
+	xsa298-4.10.patch
+	xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch
+	xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch
+	xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch
+	xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch
+	xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch
+	xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch
+	xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch
+	xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch
+	xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch
+	xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch
+	xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
+	xsa301-4.11-1.patch
+	xsa301-4.11-2.patch
+	xsa301-4.11-3.patch
+	xsa302-0001-IOMMU-add-missing-HVM-check.patch
+	xsa302-0002-passthrough-quarantine-PCI-devices.patch
+	xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch
+	xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch
+	xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
+	xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
+	xsa304-4.10-1.patch
+	xsa304-4.10-2.patch
+	xsa304-4.10-3.patch
+	xsa305-4.10-1.patch
+	xsa305-4.10-2.patch
+
 	xenstored.initd
 	xenstored.confd
 	xenconsoled.initd
@@ -471,6 +506,32 @@ e76816c6ad0e91dc5f81947f266da3429b20e6d976c3e8c41202c6179532eec878a3f0913921ef3a
 69dfa60628ca838678862383528654ecbdf4269cbb5c9cfb6b84d976202a8dea85d711aa65a52fa1b477fb0b30604ca70cf1337192d6fb9388a08bbe7fe56077 xenstore_client_transaction_fix.patch
 2094ea964fa610b2bf72fd2c7ede7e954899a75c0f5b08030cf1d74460fb759ade84866176e32f8fe29c921dfdc6dafd2b31e23ab9b0a3874d3dceeabdd1913b xenqemu-xattr-size-max.patch
 8c9cfc6afca325df1d8026e21ed03fa8cd2c7e1a21a56cc1968301c5ab634bfe849951899e75d328951d7a41273d1e49a2448edbadec0029ed410c43c0549812 hotplug-Linux-iscsi-block-handle-lun-1.patch
+c81ce3b1f14731061af530861f628e1fd392211f98c4aba9db8354e7aff604902908733ec716d46f679e65e068717dc87694797480f490046701c4e2aecc3a51 xsa298-4.10.patch
+eaeba22b8582a5f7cac727d0c068236a6af375b8b9f9e57d69d97569a6e1b7da15c38b611bc2504a84e044a6cafabc1fed27a134547c629210ebc66750fbce9f xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch
+a027beb481e58b575967212381fd98e992eb28c1e6cd9a207c7c3f22e9aa6f65ca94b73cd02f460fdb2c931c527300bc2bd6dee9f039d1ace3532069ab9fb42d xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch
+6a48835ad80ba6d8c97d09e74303d8c430e1f8a1245bdd4ea9b9301d4d35a5bbb388ef694d8ca9bbf872521123c40ac8f8142e59c2b13efd932948083d98b09f xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch
+a9774b3bece635bb86501f67718cdeeeadfb32c465ef11a41a0f9869b42f879a82c73753c198b5285bb29e8df6531f6467619c4b29b583e0a761f45c2419b521 xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch
+d25dd31942d676c4b4f9db593b1a520ef8e3feaf50dd79313860eb5afd5e41503caca937d5bd0fbc57a02f9d85d52fea3646e0bb1580ff4971c6d194f872b9d1 xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch
+695a3ea0a0c2965e88cf907719aa2ace509d1f4944793eabbe3ace44d94f4f6b8e685695cf668c129d205b6b1ef30f37c13acb736bdf7de3b44c1b60d05c22be xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch
+8bd1fb05bed70aacdebf31755e673c74700d6f5ee1a15a35d950e90d5c34f16b3d0531b56ae74f17203cf87579d2b157c049efea040a2a03c7d0e8adce8498b9 xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch
+45bf263b11abd75e2fa2ee9e757c13de0a99365861d900b82cad0302446762a0ae76b9efbd870887d6353dcf95d565987debf43f80be4c9a0950c88964a3ee6a xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch
+35faf5434ebf4c6166d7f8fd10f9010e3dc8a714d5b9e168f641d420e070222c172060a7a72b8c81b93aa762b1d5286098713b485f86c1f1a679c5c588dd642f xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch
+8512e19397e30b4cca367b1fb936ef615ed5d4656206d16b24d0f44539a6ec5af07d0021a6276b48592a68b0fb7c5d3a3f035c9b3a1b7bfaa82f70204096a745 xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch
+81813683d7d83610296c7dfb2f75be7ccf1e332d9abc8fcf741906ddbcaa5b38511a1047c233e34e21437737be2fc343b027f4f73133c4ab823ff879842a5002 xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
+916dc53eddb225a5c118630553baaa784c14a2a2ddc1f031ea83dbbb0241f0b103e664d3429414236951f6de689ff234be3fb900b83d1e1a4a4227385b32d496 xsa301-4.11-1.patch
+555d6586543f4b8661d1104851073277290ccce17d05279531c685966186948f933706d834ac1dd748678340df0aaa0423d18ea88be9a35bec67685aeb6258ac xsa301-4.11-2.patch
+5cf43aeb65988b7395a2d87bef0587cc172456ebebc2b9b67c231802ebcfb6bc1bdccccaaa09be463f1a79158234cb4003c0cd478990b99f020d812b90acc011 xsa301-4.11-3.patch
+6e918e7e6488d89807df5ff5c73926eb6c2990893c25850c5a55d2944619c6e135855ec57a5f54379c809e1ec854a4b56d1acd1c2bc0b50a06d183b470167d0f xsa302-0001-IOMMU-add-missing-HVM-check.patch
+cda95d99b8a51175b1ca98318ae4488a7b82f43c1e7a4e9903d8f5f9277c08acb759d05f146b8363363f9f1ed45663190fb935726c43fe667301134b88b21692 xsa302-0002-passthrough-quarantine-PCI-devices.patch
+b65de69f7c0097177652fc6fe7c0c12ab44c6bb0a8823b19ee315a574b04f9151a572d518d684fec467b995c9c9756bd5b2d88f7546199c0b807155c5dca43b5 xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch
+440869c1d7212820ba0c7d4b35681483897d1dcc4aa2f833af1370ac5bd8995b3d2712c598e6309488b90f37e36ca36db232e5de06242afa017d1c991f5d6af6 xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch
+7d56d0576fcd90ce4296e59cd2eae35929ecae6a7fa40e88c2f66f54234083b09be92630f299e5bb24d23b32949e58d49bafa1bed1e73719b73a4c640b86206f xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
+49b540f2585f43685c9f37ea1b6f166a98e71d85e6e0fbf2807b5788b3e71cb35dd71f71b7ad5a6d230ba60706cd11ef4bcecec7c2f250f28fd95dbd50fffc2b xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
+8502fd41000664f74382e2691f0a7ceef5121227532a55ffef3046745fe05461b266c93191f505ce3566b2e932b2f0880510dff714948384215fc48093b8d983 xsa304-4.10-1.patch
+c0149a445a9f6ef4aa0d928ff321afa7ea6f52d96213042f444a9b96912729fa27c5b81c247c56f45922061f2e45649c8ab462d73765de8ca49022b9994ccf05 xsa304-4.10-2.patch
+f7c34c984885f73f51fd3ca0274b7a6b3ca938547b910bb1becc73d7df668b0f9f69d6f402cc3a183a2acff1a9978c2d5775bd2acced4300212568e8ca22d47a xsa304-4.10-3.patch
+eeca8ad1ec1b13b7d1849b94537d24e8f91eff6fb7b2e406a08accb9ec72ddb48360c90b2a250ffbc628970f00de557fcddacbcf09062a59a36a8b6ffcbf1909 xsa305-4.10-1.patch
+6fc52805ef24510aa5092d1bda61d1299b74c8b37fdca0c17e9df62ec16bb9c7343f09b8dd1f4801c4c5db3b3f6f7208c0c35034ef8aa86b08df308e82597892 xsa305-4.10-2.patch
 52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd
 093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd
 3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd
diff --git a/main/xen/xsa298-4.10.patch b/main/xen/xsa298-4.10.patch
new file mode 100644
index 0000000000..f0b2c1efc1
--- /dev/null
+++ b/main/xen/xsa298-4.10.patch
@@ -0,0 +1,87 @@
1From: Jan Beulich <jbeulich@suse.com>
2Subject: x86/PV: check GDT/LDT limits during emulation
3
4Accesses beyond the LDT limit originating from emulation would trigger
5the ASSERT() in pv_map_ldt_shadow_page(). On production builds such
6accesses would cause an attempt to promote the touched page (offset from
7the present LDT base address) to a segment descriptor one. If this
8happens to succeed, guest user mode would be able to elevate its
9privileges to that of the guest kernel. This is particularly easy when
10there's no LDT at all, in which case the LDT base stored internally to
11Xen is simply zero.
12
13Also adjust the ASSERT() that was triggering: It was off by one to
14begin with, and for production builds we also better use
15ASSERT_UNREACHABLE() instead with suitable recovery code afterwards.
16
17This is XSA-298.
18
19Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
20Signed-off-by: Jan Beulich <jbeulich@suse.com>
21Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
22
23--- a/xen/arch/x86/pv/emul-gate-op.c
24+++ b/xen/arch/x86/pv/emul-gate-op.c
25@@ -60,7 +60,13 @@ static int read_gate_descriptor(unsigned
26 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
27 + (gate_sel >> 3);
28 if ( (gate_sel < 4) ||
29- ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
30+ /*
31+ * We're interested in call gates only, which occupy a single
32+ * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit.
33+ */
34+ ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >=
35+ (gate_sel & 4 ? v->arch.pv_vcpu.ldt_ents
36+ : v->arch.pv_vcpu.gdt_ents)) ||
37 __get_user(desc, pdesc) )
38 return 0;
39
40@@ -79,7 +85,7 @@ static int read_gate_descriptor(unsigned
41 if ( !is_pv_32bit_vcpu(v) )
42 {
43 if ( (*ar & 0x1f00) != 0x0c00 ||
44- (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
45+ /* Limit check done above already. */
46 __get_user(desc, pdesc + 1) ||
47 (desc.b & 0x1f00) )
48 return 0;
49--- a/xen/arch/x86/pv/emulate.c
50+++ b/xen/arch/x86/pv/emulate.c
51@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int
52 {
53 struct desc_struct desc;
54
55- if ( sel < 4)
56+ if ( sel < 4 ||
57+ /*
58+ * Don't apply the GDT limit here, as the selector may be a Xen
59+ * provided one. __get_user() will fail (without taking further
60+ * action) for ones falling in the gap between guest populated
61+ * and Xen ones.
62+ */
63+ ((sel & 4) && (sel >> 3) >= v->arch.pv_vcpu.ldt_ents) )
64 desc.b = desc.a = 0;
65 else if ( __get_user(desc,
66 (const struct desc_struct *)(!(sel & 4)
67--- a/xen/arch/x86/pv/mm.c
68+++ b/xen/arch/x86/pv/mm.c
69@@ -98,12 +98,16 @@ bool pv_map_ldt_shadow_page(unsigned int
70 BUG_ON(unlikely(in_irq()));
71
72 /*
73- * Hardware limit checking should guarantee this property. NB. This is
74+ * Prior limit checking should guarantee this property. NB. This is
75 * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the
76 * current vcpu, and vcpu_reset() will block until this vcpu has been
77 * descheduled before continuing.
78 */
79- ASSERT((offset >> 3) <= curr->arch.pv_vcpu.ldt_ents);
80+ if ( unlikely((offset >> 3) >= curr->arch.pv_vcpu.ldt_ents) )
81+ {
82+ ASSERT_UNREACHABLE();
83+ return false;
84+ }
85
86 if ( is_pv_32bit_domain(currd) )
87 linear = (uint32_t)linear;
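To make the shape of the XSA-298 fix easier to follow, here is a small standalone C sketch of the selector-limit check the hunks above introduce. It is not Xen code: the entry counts are made-up stand-ins for v->arch.pv_vcpu.ldt_ents/gdt_ents, and the "+ !is_32bit" term reflects that a 64-bit call gate occupies two consecutive descriptor entries.

#include <stdbool.h>
#include <stdio.h>

/* Sketch only: models the limit check added above. ldt_ents/gdt_ents are
 * hypothetical per-vCPU descriptor table sizes, not Xen's real fields. */
static bool gate_sel_within_limit(unsigned int sel, bool is_32bit,
                                  unsigned int ldt_ents, unsigned int gdt_ents)
{
    unsigned int limit = (sel & 4) ? ldt_ents : gdt_ents;

    if ( sel < 4 )          /* null or otherwise unusable selector */
        return false;
    /* A 64-bit gate spans entries (sel >> 3) and (sel >> 3) + 1. */
    return (sel >> 3) + !is_32bit < limit;
}

int main(void)
{
    /* Hypothetical guest: empty LDT, 16-entry GDT. */
    printf("LDT sel 0x0c -> %d\n", gate_sel_within_limit(0x0c, false, 0, 16)); /* 0: empty LDT */
    printf("GDT sel 0x20 -> %d\n", gate_sel_within_limit(0x20, false, 0, 16)); /* 1: index 4+1 < 16 */
    printf("GDT sel 0x78 -> %d\n", gate_sel_within_limit(0x78, false, 0, 16)); /* 0: index 15+1 == 16 */
    return 0;
}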
diff --git a/main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch b/main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch
new file mode 100644
index 0000000000..2db7b3f980
--- /dev/null
+++ b/main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch
@@ -0,0 +1,94 @@
1From bc266a68aa014af2cc3ed0a1f55723fdeac2e545 Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 01/11] x86/mm: L1TF checks don't leave a partial entry
5
6On detection of a potential L1TF issue, most validation code returns
7-ERESTART to allow the switch to shadow mode to happen and cause the
8original operation to be restarted.
9
10However, in the validation code, the return value -ERESTART has been
11repurposed to indicate 1) the function has partially completed
12something which needs to be undone, and 2) calling put_page_type()
13should cleanly undo it. This causes problems in several places.
14
15For L1 tables, on receiving an -ERESTART return from alloc_l1_table(),
16alloc_page_type() will set PGT_partial on the page. If for some
17reason the original operation never restarts, then on domain
18destruction, relinquish_memory() will call free_page_type() on the
19page.
20
21Unfortunately, alloc_ and free_l1_table() aren't set up to deal with
22PGT_partial. When returning a failure, alloc_l1_table() always
23de-validates whatever it's validated so far, and free_l1_table()
24always devalidates the whole page. This means that if
25relinquish_memory() calls free_page_type() on an L1 that didn't
26complete due to an L1TF, it will call put_page_from_l1e() on "page
27entries" that have never been validated.
28
29For L2+ tables, setting rc to ERESTART causes the rest of the
30alloc_lN_table() function to *think* that the entry in question will
31have PGT_partial set. This will cause it to set partial_pte = 1. If
32relinquish_memory() then calls free_page_type() on one of those pages,
33then free_lN_table() will call put_page_from_lNe() on the entry when
34it shouldn't.
35
36Rather than indicating -ERESTART, indicate -EINTR. This is the code
37to indicate that nothing has changed from when you started the call
38(which is effectively how alloc_l1_table() handles errors).
39
40mod_lN_entry() shouldn't have any of these types of problems, so leave
41potential changes there for a clean-up patch later.
42
43This is part of XSA-299.
44
45Reported-by: George Dunlap <george.dunlap@citrix.com>
46Signed-off-by: George Dunlap <george.dunlap@citrix.com>
47Reviewed-by: Jan Beulich <jbeulich@suse.com>
48---
49 xen/arch/x86/mm.c | 8 ++++----
50 1 file changed, 4 insertions(+), 4 deletions(-)
51
52diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
53index ce2c082caf..0cbca48a02 100644
54--- a/xen/arch/x86/mm.c
55+++ b/xen/arch/x86/mm.c
56@@ -1152,7 +1152,7 @@ get_page_from_l2e(
57 int rc;
58
59 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
60- return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
61+ return pv_l1tf_check_l2e(d, l2e) ? -EINTR : 1;
62
63 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
64 {
65@@ -1188,7 +1188,7 @@ get_page_from_l3e(
66 int rc;
67
68 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
69- return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
70+ return pv_l1tf_check_l3e(d, l3e) ? -EINTR : 1;
71
72 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
73 {
74@@ -1221,7 +1221,7 @@ get_page_from_l4e(
75 int rc;
76
77 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
78- return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
79+ return pv_l1tf_check_l4e(d, l4e) ? -EINTR : 1;
80
81 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
82 {
83@@ -1435,7 +1435,7 @@ static int alloc_l1_table(struct page_info *page)
84 {
85 if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
86 {
87- ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
88+ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -EINTR : 0;
89 if ( ret )
90 goto out;
91 }
92--
932.23.0
94
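The distinction the patch above relies on is easy to lose: -EINTR promises the callee changed nothing, while -ERESTART promises there is partial state that put_page_type() can cleanly unwind. The sketch below (made-up helper names, not Xen code) shows how a caller is expected to treat the two codes differently.

#include <errno.h>
#include <stdio.h>

#ifndef ERESTART
#define ERESTART 85   /* not exposed by every libc; value matches Linux */
#endif

/* Hypothetical validator: 0 on success, -EINTR if it backed out without
 * leaving any state behind, -ERESTART if it left partial state that the
 * caller must arrange to finish tearing down or re-validating. */
static int validate_entry(int i)
{
    if ( i == 2 )
        return -EINTR;
    if ( i == 4 )
        return -ERESTART;
    return 0;
}

int main(void)
{
    for ( int i = 0; i < 6; i++ )
    {
        switch ( validate_entry(i) )
        {
        case 0:
            printf("entry %d: validated\n", i);
            break;
        case -EINTR:
            printf("entry %d: nothing changed, just restart the operation\n", i);
            break;
        case -ERESTART:
            printf("entry %d: partial state left, record it so cleanup can resume\n", i);
            break;
        }
    }
    return 0;
}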
diff --git a/main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch b/main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch
new file mode 100644
index 0000000000..10345be2d4
--- /dev/null
+++ b/main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch
@@ -0,0 +1,99 @@
1From fd7bfe9aaee41c589c16c541ec538285dcde1fb2 Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 02/11] x86/mm: Don't re-set PGT_pinned on a partially
5 de-validated page
6
7When unpinning pagetables, if an operation is interrupted,
8relinquish_memory() re-sets PGT_pinned so that the un-pin will be
9picked up again when the hypercall restarts.
10
11This is appropriate when put_page_and_type_preemptible() returns
12-EINTR, which indicates that the page is back in its initial state
13(i.e., completely validated). However, for -ERESTART, this leads to a
14state where a page has both PGT_pinned and PGT_partial set.
15
16This happens to work at the moment, although it's not really a
17"canonical" state; but in subsequent patches, where we need to make a
18distinction in handling between PGT_validated and PGT_partial pages,
19this causes issues.
20
21Move to a "canonical" state by:
22- Only re-setting PGT_pinned on -EINTR
23- Re-dropping the refcount held by PGT_pinned on -ERESTART
24
25In the latter case, the PGT_partial bit will be cleared further down
26with the rest of the other PGT_partial pages.
27
28While here, clean up some trailing whitespace.
29
30This is part of XSA-299.
31
32Reported-by: George Dunlap <george.dunlap@citrix.com>
33Signed-off-by: George Dunlap <george.dunlap@citrix.com>
34Reviewed-by: Jan Beulich <jbeulich@suse.com>
35---
36 xen/arch/x86/domain.c | 31 ++++++++++++++++++++++++++++---
37 1 file changed, 28 insertions(+), 3 deletions(-)
38
39diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
40index 91c2b1c21a..897124f05f 100644
41--- a/xen/arch/x86/domain.c
42+++ b/xen/arch/x86/domain.c
43@@ -112,7 +112,7 @@ static void play_dead(void)
44 * this case, heap corruption or #PF can occur (when heap debugging is
45 * enabled). For example, even printk() can involve tasklet scheduling,
46 * which touches per-cpu vars.
47- *
48+ *
49 * Consider very carefully when adding code to *dead_idle. Most hypervisor
50 * subsystems are unsafe to call.
51 */
52@@ -1837,9 +1837,34 @@ static int relinquish_memory(
53 break;
54 case -ERESTART:
55 case -EINTR:
56+ /*
57+ * -EINTR means PGT_validated has been re-set; re-set
58+ * PGT_pinned again so that it gets picked up next time
59+ * around.
60+ *
61+ * -ERESTART, OTOH, means PGT_partial is set instead. Put
62+ * it back on the list, but don't set PGT_pinned; the
63+ * section below will finish off de-validation. But we do
64+ * need to drop the general ref associated with
65+ * PGT_pinned, since put_page_and_type_preemptible()
66+ * didn't do it.
67+ *
68+ * NB we can do an ASSERT for PGT_validated, since we
69+ * "own" the type ref; but theoretically, the PGT_partial
70+ * could be cleared by someone else.
71+ */
72+ if ( ret == -EINTR )
73+ {
74+ ASSERT(page->u.inuse.type_info & PGT_validated);
75+ set_bit(_PGT_pinned, &page->u.inuse.type_info);
76+ }
77+ else
78+ put_page(page);
79+
80 ret = -ERESTART;
81+
82+ /* Put the page back on the list and drop the ref we grabbed above */
83 page_list_add(page, list);
84- set_bit(_PGT_pinned, &page->u.inuse.type_info);
85 put_page(page);
86 goto out;
87 default:
88@@ -2061,7 +2086,7 @@ void vcpu_kick(struct vcpu *v)
89 * pending flag. These values may fluctuate (after all, we hold no
90 * locks) but the key insight is that each change will cause
91 * evtchn_upcall_pending to be polled.
92- *
93+ *
94 * NB2. We save the running flag across the unblock to avoid a needless
95 * IPI for domains that we IPI'd to unblock.
96 */
97--
982.23.0
99
diff --git a/main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch b/main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch
new file mode 100644
index 0000000000..1e79d7666e
--- /dev/null
+++ b/main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch
@@ -0,0 +1,610 @@
1From 6bad09c708d906922fb59d7e2c06d5de9a633ca3 Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 03/11] x86/mm: Separate out partial_pte tristate into
5 individual flags
6
7At the moment, partial_pte is a tri-state that contains two distinct bits
8of information:
9
101. If zero, the pte at index [nr_validated_ptes] is un-validated. If
11 non-zero, the pte was last seen with PGT_partial set.
12
132. If positive, the pte at index [nr_validated_ptes] does not hold a
14 general reference count. If negative, it does.
15
16To make future patches more clear, separate out this functionality
17into two distinct, named bits: PTF_partial_set (for #1) and
18PTF_partial_general_ref (for #2).
19
20Additionally, a number of functions which need this information also
21take other flags to control behavior (such as `preemptible` and
22`defer`). These are hard to read in the caller (since you only see
23'true' or 'false'), and ugly when many are added together. In
24preparation for adding yet another flag in a future patch, collapse
25all of these into a single `flag` variable.
26
27NB that this does mean checking for what was previously the '-1'
28condition a bit more ugly in the put_page_from_lNe functions (since
29you have to check for both partial_set and general ref); but this
30clause will go away in a future patch.
31
32Also note that the original comment had an off-by-one error:
33partial_flags (like partial_pte before it) concerns
34plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1].
35
36No functional change intended.
37
38This is part of XSA-299.
39
40Reported-by: George Dunlap <george.dunlap@citrix.com>
41Signed-off-by: George Dunlap <george.dunlap@citrix.com>
42Reviewed-by: Jan Beulich <jbeulich@suse.com>
43---
44 xen/arch/x86/mm.c | 165 ++++++++++++++++++++++++---------------
45 xen/include/asm-x86/mm.h | 41 +++++++---
46 2 files changed, 128 insertions(+), 78 deletions(-)
47
48diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
49index 0cbca48a02..84ee48ec3f 100644
50--- a/xen/arch/x86/mm.c
51+++ b/xen/arch/x86/mm.c
52@@ -651,20 +651,34 @@ static int alloc_segdesc_page(struct page_info *page)
53 static int __get_page_type(struct page_info *page, unsigned long type,
54 int preemptible);
55
56+/*
57+ * The following flags are used to specify behavior of various get and
58+ * put commands. The first two are also stored in page->partial_flags
59+ * to indicate the state of the page pointed to by
60+ * page->pte[page->nr_validated_entries]. See the comment in mm.h for
61+ * more information.
62+ */
63+#define PTF_partial_set (1 << 0)
64+#define PTF_partial_general_ref (1 << 1)
65+#define PTF_preemptible (1 << 2)
66+#define PTF_defer (1 << 3)
67+
68 static int get_page_and_type_from_mfn(
69 mfn_t mfn, unsigned long type, struct domain *d,
70- int partial, int preemptible)
71+ unsigned int flags)
72 {
73 struct page_info *page = mfn_to_page(mfn);
74 int rc;
75+ bool preemptible = flags & PTF_preemptible,
76+ partial_ref = flags & PTF_partial_general_ref;
77
78- if ( likely(partial >= 0) &&
79+ if ( likely(!partial_ref) &&
80 unlikely(!get_page_from_mfn(mfn, d)) )
81 return -EINVAL;
82
83 rc = __get_page_type(page, type, preemptible);
84
85- if ( unlikely(rc) && partial >= 0 &&
86+ if ( unlikely(rc) && !partial_ref &&
87 (!preemptible || page != current->arch.old_guest_table) )
88 put_page(page);
89
90@@ -1146,7 +1160,7 @@ get_page_from_l1e(
91 define_get_linear_pagetable(l2);
92 static int
93 get_page_from_l2e(
94- l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial)
95+ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags)
96 {
97 unsigned long mfn = l2e_get_pfn(l2e);
98 int rc;
99@@ -1163,8 +1177,9 @@ get_page_from_l2e(
100
101 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
102 {
103- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d,
104- partial, false);
105+ ASSERT(!(flags & PTF_preemptible));
106+
107+ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags);
108 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
109 rc = 0;
110 return rc;
111@@ -1183,7 +1198,7 @@ get_page_from_l2e(
112 define_get_linear_pagetable(l3);
113 static int
114 get_page_from_l3e(
115- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
116+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags)
117 {
118 int rc;
119
120@@ -1198,7 +1213,7 @@ get_page_from_l3e(
121 }
122
123 rc = get_page_and_type_from_mfn(
124- l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
125+ l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible);
126 if ( unlikely(rc == -EINVAL) &&
127 !is_pv_32bit_domain(d) &&
128 get_l3_linear_pagetable(l3e, pfn, d) )
129@@ -1216,7 +1231,7 @@ get_page_from_l3e(
130 define_get_linear_pagetable(l4);
131 static int
132 get_page_from_l4e(
133- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
134+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags)
135 {
136 int rc;
137
138@@ -1231,7 +1246,7 @@ get_page_from_l4e(
139 }
140
141 rc = get_page_and_type_from_mfn(
142- l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
143+ l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible);
144 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
145 rc = 0;
146
147@@ -1306,7 +1321,7 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
148 * Note also that this automatically deals correctly with linear p.t.'s.
149 */
150 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
151- int partial, bool defer)
152+ unsigned int flags)
153 {
154 int rc = 0;
155
156@@ -1326,12 +1341,13 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
157 struct page_info *pg = l2e_get_page(l2e);
158 struct page_info *ptpg = mfn_to_page(_mfn(pfn));
159
160- if ( unlikely(partial > 0) )
161+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
162+ PTF_partial_set )
163 {
164- ASSERT(!defer);
165+ ASSERT(!(flags & PTF_defer));
166 rc = _put_page_type(pg, true, ptpg);
167 }
168- else if ( defer )
169+ else if ( flags & PTF_defer )
170 {
171 current->arch.old_guest_ptpg = ptpg;
172 current->arch.old_guest_table = pg;
173@@ -1348,7 +1364,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
174 }
175
176 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
177- int partial, bool defer)
178+ unsigned int flags)
179 {
180 struct page_info *pg;
181 int rc;
182@@ -1371,13 +1387,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
183
184 pg = l3e_get_page(l3e);
185
186- if ( unlikely(partial > 0) )
187+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
188+ PTF_partial_set )
189 {
190- ASSERT(!defer);
191+ ASSERT(!(flags & PTF_defer));
192 return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
193 }
194
195- if ( defer )
196+ if ( flags & PTF_defer )
197 {
198 current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
199 current->arch.old_guest_table = pg;
200@@ -1392,7 +1409,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
201 }
202
203 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
204- int partial, bool defer)
205+ unsigned int flags)
206 {
207 int rc = 1;
208
209@@ -1401,13 +1418,14 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
210 {
211 struct page_info *pg = l4e_get_page(l4e);
212
213- if ( unlikely(partial > 0) )
214+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
215+ PTF_partial_set )
216 {
217- ASSERT(!defer);
218+ ASSERT(!(flags & PTF_defer));
219 return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
220 }
221
222- if ( defer )
223+ if ( flags & PTF_defer )
224 {
225 current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
226 current->arch.old_guest_table = pg;
227@@ -1514,12 +1532,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
228 unsigned long pfn = mfn_x(page_to_mfn(page));
229 l2_pgentry_t *pl2e;
230 unsigned int i;
231- int rc = 0, partial = page->partial_pte;
232+ int rc = 0;
233+ unsigned int partial_flags = page->partial_flags;
234
235 pl2e = map_domain_page(_mfn(pfn));
236
237 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
238- i++, partial = 0 )
239+ i++, partial_flags = 0 )
240 {
241 if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
242 {
243@@ -1529,18 +1548,19 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
244 }
245
246 if ( !is_guest_l2_slot(d, type, i) ||
247- (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 )
248+ (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
249 continue;
250
251 if ( rc == -ERESTART )
252 {
253 page->nr_validated_ptes = i;
254- page->partial_pte = partial ?: 1;
255+ /* Set 'set', retain 'general ref' */
256+ page->partial_flags = partial_flags | PTF_partial_set;
257 }
258 else if ( rc == -EINTR && i )
259 {
260 page->nr_validated_ptes = i;
261- page->partial_pte = 0;
262+ page->partial_flags = 0;
263 rc = -ERESTART;
264 }
265 else if ( rc < 0 && rc != -EINTR )
266@@ -1549,7 +1569,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
267 if ( i )
268 {
269 page->nr_validated_ptes = i;
270- page->partial_pte = 0;
271+ page->partial_flags = 0;
272 current->arch.old_guest_ptpg = NULL;
273 current->arch.old_guest_table = page;
274 }
275@@ -1573,7 +1593,8 @@ static int alloc_l3_table(struct page_info *page)
276 unsigned long pfn = mfn_x(page_to_mfn(page));
277 l3_pgentry_t *pl3e;
278 unsigned int i;
279- int rc = 0, partial = page->partial_pte;
280+ int rc = 0;
281+ unsigned int partial_flags = page->partial_flags;
282
283 pl3e = map_domain_page(_mfn(pfn));
284
285@@ -1588,7 +1609,7 @@ static int alloc_l3_table(struct page_info *page)
286 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
287
288 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
289- i++, partial = 0 )
290+ i++, partial_flags = 0 )
291 {
292 if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
293 {
294@@ -1605,20 +1626,22 @@ static int alloc_l3_table(struct page_info *page)
295 else
296 rc = get_page_and_type_from_mfn(
297 l3e_get_mfn(pl3e[i]),
298- PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
299+ PGT_l2_page_table | PGT_pae_xen_l2, d,
300+ partial_flags | PTF_preemptible);
301 }
302- else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
303+ else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 )
304 continue;
305
306 if ( rc == -ERESTART )
307 {
308 page->nr_validated_ptes = i;
309- page->partial_pte = partial ?: 1;
310+ /* Set 'set', leave 'general ref' set if this entry was set */
311+ page->partial_flags = partial_flags | PTF_partial_set;
312 }
313 else if ( rc == -EINTR && i )
314 {
315 page->nr_validated_ptes = i;
316- page->partial_pte = 0;
317+ page->partial_flags = 0;
318 rc = -ERESTART;
319 }
320 if ( rc < 0 )
321@@ -1635,7 +1658,7 @@ static int alloc_l3_table(struct page_info *page)
322 if ( i )
323 {
324 page->nr_validated_ptes = i;
325- page->partial_pte = 0;
326+ page->partial_flags = 0;
327 current->arch.old_guest_ptpg = NULL;
328 current->arch.old_guest_table = page;
329 }
330@@ -1767,19 +1790,21 @@ static int alloc_l4_table(struct page_info *page)
331 unsigned long pfn = mfn_x(page_to_mfn(page));
332 l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
333 unsigned int i;
334- int rc = 0, partial = page->partial_pte;
335+ int rc = 0;
336+ unsigned int partial_flags = page->partial_flags;
337
338 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
339- i++, partial = 0 )
340+ i++, partial_flags = 0 )
341 {
342 if ( !is_guest_l4_slot(d, i) ||
343- (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
344+ (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 )
345 continue;
346
347 if ( rc == -ERESTART )
348 {
349 page->nr_validated_ptes = i;
350- page->partial_pte = partial ?: 1;
351+ /* Set 'set', leave 'general ref' set if this entry was set */
352+ page->partial_flags = partial_flags | PTF_partial_set;
353 }
354 else if ( rc < 0 )
355 {
356@@ -1789,7 +1814,7 @@ static int alloc_l4_table(struct page_info *page)
357 if ( i )
358 {
359 page->nr_validated_ptes = i;
360- page->partial_pte = 0;
361+ page->partial_flags = 0;
362 if ( rc == -EINTR )
363 rc = -ERESTART;
364 else
365@@ -1842,19 +1867,20 @@ static int free_l2_table(struct page_info *page)
366 struct domain *d = page_get_owner(page);
367 unsigned long pfn = mfn_x(page_to_mfn(page));
368 l2_pgentry_t *pl2e;
369- int rc = 0, partial = page->partial_pte;
370- unsigned int i = page->nr_validated_ptes - !partial;
371+ int rc = 0;
372+ unsigned int partial_flags = page->partial_flags,
373+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
374
375 pl2e = map_domain_page(_mfn(pfn));
376
377 for ( ; ; )
378 {
379 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
380- rc = put_page_from_l2e(pl2e[i], pfn, partial, false);
381+ rc = put_page_from_l2e(pl2e[i], pfn, partial_flags);
382 if ( rc < 0 )
383 break;
384
385- partial = 0;
386+ partial_flags = 0;
387
388 if ( !i-- )
389 break;
390@@ -1876,12 +1902,14 @@ static int free_l2_table(struct page_info *page)
391 else if ( rc == -ERESTART )
392 {
393 page->nr_validated_ptes = i;
394- page->partial_pte = partial ?: -1;
395+ page->partial_flags = (partial_flags & PTF_partial_set) ?
396+ partial_flags :
397+ (PTF_partial_set | PTF_partial_general_ref);
398 }
399 else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
400 {
401 page->nr_validated_ptes = i + 1;
402- page->partial_pte = 0;
403+ page->partial_flags = 0;
404 rc = -ERESTART;
405 }
406
407@@ -1893,18 +1921,19 @@ static int free_l3_table(struct page_info *page)
408 struct domain *d = page_get_owner(page);
409 unsigned long pfn = mfn_x(page_to_mfn(page));
410 l3_pgentry_t *pl3e;
411- int rc = 0, partial = page->partial_pte;
412- unsigned int i = page->nr_validated_ptes - !partial;
413+ int rc = 0;
414+ unsigned int partial_flags = page->partial_flags,
415+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
416
417 pl3e = map_domain_page(_mfn(pfn));
418
419 for ( ; ; )
420 {
421- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
422+ rc = put_page_from_l3e(pl3e[i], pfn, partial_flags);
423 if ( rc < 0 )
424 break;
425
426- partial = 0;
427+ partial_flags = 0;
428 if ( rc == 0 )
429 pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
430
431@@ -1923,12 +1952,14 @@ static int free_l3_table(struct page_info *page)
432 if ( rc == -ERESTART )
433 {
434 page->nr_validated_ptes = i;
435- page->partial_pte = partial ?: -1;
436+ page->partial_flags = (partial_flags & PTF_partial_set) ?
437+ partial_flags :
438+ (PTF_partial_set | PTF_partial_general_ref);
439 }
440 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
441 {
442 page->nr_validated_ptes = i + 1;
443- page->partial_pte = 0;
444+ page->partial_flags = 0;
445 rc = -ERESTART;
446 }
447 return rc > 0 ? 0 : rc;
448@@ -1939,26 +1970,29 @@ static int free_l4_table(struct page_info *page)
449 struct domain *d = page_get_owner(page);
450 unsigned long pfn = mfn_x(page_to_mfn(page));
451 l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
452- int rc = 0, partial = page->partial_pte;
453- unsigned int i = page->nr_validated_ptes - !partial;
454+ int rc = 0;
455+ unsigned partial_flags = page->partial_flags,
456+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
457
458 do {
459 if ( is_guest_l4_slot(d, i) )
460- rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
461+ rc = put_page_from_l4e(pl4e[i], pfn, partial_flags);
462 if ( rc < 0 )
463 break;
464- partial = 0;
465+ partial_flags = 0;
466 } while ( i-- );
467
468 if ( rc == -ERESTART )
469 {
470 page->nr_validated_ptes = i;
471- page->partial_pte = partial ?: -1;
472+ page->partial_flags = (partial_flags & PTF_partial_set) ?
473+ partial_flags :
474+ (PTF_partial_set | PTF_partial_general_ref);
475 }
476 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
477 {
478 page->nr_validated_ptes = i + 1;
479- page->partial_pte = 0;
480+ page->partial_flags = 0;
481 rc = -ERESTART;
482 }
483
484@@ -2180,7 +2214,7 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
485 return -EBUSY;
486 }
487
488- put_page_from_l2e(ol2e, pfn, 0, true);
489+ put_page_from_l2e(ol2e, pfn, PTF_defer);
490
491 return rc;
492 }
493@@ -2248,7 +2282,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
494 if ( !create_pae_xen_mappings(d, pl3e) )
495 BUG();
496
497- put_page_from_l3e(ol3e, pfn, 0, 1);
498+ put_page_from_l3e(ol3e, pfn, PTF_defer);
499 return rc;
500 }
501
502@@ -2311,7 +2345,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
503 return -EFAULT;
504 }
505
506- put_page_from_l4e(ol4e, pfn, 0, 1);
507+ put_page_from_l4e(ol4e, pfn, PTF_defer);
508 return rc;
509 }
510
511@@ -2577,7 +2611,7 @@ int free_page_type(struct page_info *page, unsigned long type,
512 if ( !(type & PGT_partial) )
513 {
514 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
515- page->partial_pte = 0;
516+ page->partial_flags = 0;
517 }
518
519 switch ( type & PGT_type_mask )
520@@ -2862,7 +2896,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
521 if ( !(x & PGT_partial) )
522 {
523 page->nr_validated_ptes = 0;
524- page->partial_pte = 0;
525+ page->partial_flags = 0;
526 }
527 page->linear_pt_count = 0;
528 rc = alloc_page_type(page, type, preemptible);
529@@ -3037,7 +3071,8 @@ int new_guest_cr3(mfn_t mfn)
530
531 rc = paging_mode_refcounts(d)
532 ? (get_page_from_mfn(mfn, d) ? 0 : -EINVAL)
533- : get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
534+ : get_page_and_type_from_mfn(mfn, PGT_root_page_table, d,
535+ PTF_preemptible);
536 switch ( rc )
537 {
538 case 0:
539@@ -3420,7 +3455,7 @@ long do_mmuext_op(
540 if ( op.arg1.mfn != 0 )
541 {
542 rc = get_page_and_type_from_mfn(
543- _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
544+ _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible);
545
546 if ( unlikely(rc) )
547 {
548diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
549index 1030b8b5e6..a531fe3115 100644
550--- a/xen/include/asm-x86/mm.h
551+++ b/xen/include/asm-x86/mm.h
552@@ -157,19 +157,34 @@ struct page_info
553 * setting the flag must not drop that reference, whereas the instance
554 * clearing it will have to.
555 *
556- * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
557- * been partially validated. This implies that the general reference
558- * to the page (acquired from get_page_from_lNe()) would be dropped
559- * (again due to the apparent failure) and hence must be re-acquired
560- * when resuming the validation, but must not be dropped when picking
561- * up the page for invalidation.
562+ * If partial_flags & PTF_partial_set is set, then the page at
563+ * at @nr_validated_ptes had PGT_partial set as a result of an
564+ * operation on the current page. (That page may or may not
565+ * still have PGT_partial set.)
566 *
567- * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
568- * been partially invalidated. This is basically the opposite case of
569- * above, i.e. the general reference to the page was not dropped in
570- * put_page_from_lNe() (due to the apparent failure), and hence it
571- * must be dropped when the put operation is resumed (and completes),
572- * but it must not be acquired if picking up the page for validation.
573+ * If PTF_partial_general_ref is set, then the PTE at
574+ * @nr_validated_ptef holds a general reference count for the
575+ * page.
576+ *
577+ * This happens:
578+ * - During de-validation, if de-validation of the page was
579+ * interrupted
580+ * - During validation, if an invalid entry is encountered and
581+ * validation is preemptible
582+ * - During validation, if PTF_partial_general_ref was set on
583+ * this entry to begin with (perhaps because we're picking
584+ * up from a partial de-validation).
585+ *
586+ * When resuming validation, if PTF_partial_general_ref is clear,
587+ * then a general reference must be re-acquired; if it is set, no
588+ * reference should be acquired.
589+ *
590+ * When resuming de-validation, if PTF_partial_general_ref is
591+ * clear, no reference should be dropped; if it is set, a
592+ * reference should be dropped.
593+ *
594+ * NB that PTF_partial_set and PTF_partial_general_ref are
595+ * defined in mm.c, the only place where they are used.
596 *
597 * The 3rd field, @linear_pt_count, indicates
598 * - by a positive value, how many same-level page table entries a page
599@@ -180,7 +195,7 @@ struct page_info
600 struct {
601 u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
602 u16 :16 - PAGETABLE_ORDER - 1 - 2;
603- s16 partial_pte:2;
604+ u16 partial_flags:2;
605 s16 linear_pt_count;
606 };
607
608--
6092.23.0
610
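For readers mapping the old interface onto the new one, here is a small standalone sketch (not part of the patch) of how the previous partial_pte tristate corresponds to the two flag bits the patch introduces.

#include <stdio.h>

#define PTF_partial_set         (1 << 0)
#define PTF_partial_general_ref (1 << 1)

/* Sketch: translate the old tristate into the new flag pair.
 *   0  -> entry not validated, no general ref held
 *   >0 -> entry partially validated, no general ref held
 *   <0 -> entry partially de-validated, general ref still held */
static unsigned int flags_from_partial_pte(int partial_pte)
{
    if ( partial_pte == 0 )
        return 0;
    if ( partial_pte > 0 )
        return PTF_partial_set;
    return PTF_partial_set | PTF_partial_general_ref;
}

int main(void)
{
    for ( int pte = -1; pte <= 1; pte++ )
        printf("partial_pte = %2d  ->  partial_flags = 0x%x\n",
               pte, flags_from_partial_pte(pte));
    return 0;
}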
diff --git a/main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch b/main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch
new file mode 100644
index 0000000000..9c5b9669e9
--- /dev/null
+++ b/main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch
@@ -0,0 +1,141 @@
1From 255ad8804c79dc874322a7060ae0615305bcb8e8 Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 04/11] x86/mm: Use flags for _put_page_type rather than a
5 boolean
6
7This is in mainly in preparation for _put_page_type taking the
8partial_flags value in the future. It also makes it easier to read in
9the caller (since you see a flag name rather than `true` or `false`).
10
11No functional change intended.
12
13This is part of XSA-299.
14
15Reported-by: George Dunlap <george.dunlap@citrix.com>
16Signed-off-by: George Dunlap <george.dunlap@citrix.com>
17Reviewed-by: Jan Beulich <jbeulich@suse.com>
18---
19 xen/arch/x86/mm.c | 25 +++++++++++++------------
20 1 file changed, 13 insertions(+), 12 deletions(-)
21
22diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
23index 84ee48ec3f..e3264f8879 100644
24--- a/xen/arch/x86/mm.c
25+++ b/xen/arch/x86/mm.c
26@@ -1253,7 +1253,7 @@ get_page_from_l4e(
27 return rc;
28 }
29
30-static int _put_page_type(struct page_info *page, bool preemptible,
31+static int _put_page_type(struct page_info *page, unsigned int flags,
32 struct page_info *ptpg);
33
34 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
35@@ -1345,7 +1345,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
36 PTF_partial_set )
37 {
38 ASSERT(!(flags & PTF_defer));
39- rc = _put_page_type(pg, true, ptpg);
40+ rc = _put_page_type(pg, PTF_preemptible, ptpg);
41 }
42 else if ( flags & PTF_defer )
43 {
44@@ -1354,7 +1354,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
45 }
46 else
47 {
48- rc = _put_page_type(pg, true, ptpg);
49+ rc = _put_page_type(pg, PTF_preemptible, ptpg);
50 if ( likely(!rc) )
51 put_page(pg);
52 }
53@@ -1391,7 +1391,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
54 PTF_partial_set )
55 {
56 ASSERT(!(flags & PTF_defer));
57- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
58+ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
59 }
60
61 if ( flags & PTF_defer )
62@@ -1401,7 +1401,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
63 return 0;
64 }
65
66- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
67+ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
68 if ( likely(!rc) )
69 put_page(pg);
70
71@@ -1422,7 +1422,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
72 PTF_partial_set )
73 {
74 ASSERT(!(flags & PTF_defer));
75- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
76+ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
77 }
78
79 if ( flags & PTF_defer )
80@@ -1432,7 +1432,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
81 return 0;
82 }
83
84- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
85+ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
86 if ( likely(!rc) )
87 put_page(pg);
88 }
89@@ -2680,11 +2680,12 @@ static int _put_final_page_type(struct page_info *page, unsigned long type,
90 }
91
92
93-static int _put_page_type(struct page_info *page, bool preemptible,
94+static int _put_page_type(struct page_info *page, unsigned int flags,
95 struct page_info *ptpg)
96 {
97 unsigned long nx, x, y = page->u.inuse.type_info;
98 int rc = 0;
99+ bool preemptible = flags & PTF_preemptible;
100
101 for ( ; ; )
102 {
103@@ -2884,7 +2885,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
104
105 if ( unlikely(iommu_ret) )
106 {
107- _put_page_type(page, false, NULL);
108+ _put_page_type(page, 0, NULL);
109 rc = iommu_ret;
110 goto out;
111 }
112@@ -2911,7 +2912,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
113
114 void put_page_type(struct page_info *page)
115 {
116- int rc = _put_page_type(page, false, NULL);
117+ int rc = _put_page_type(page, 0, NULL);
118 ASSERT(rc == 0);
119 (void)rc;
120 }
121@@ -2927,7 +2928,7 @@ int get_page_type(struct page_info *page, unsigned long type)
122
123 int put_page_type_preemptible(struct page_info *page)
124 {
125- return _put_page_type(page, true, NULL);
126+ return _put_page_type(page, PTF_preemptible, NULL);
127 }
128
129 int get_page_type_preemptible(struct page_info *page, unsigned long type)
130@@ -2943,7 +2944,7 @@ int put_old_guest_table(struct vcpu *v)
131 if ( !v->arch.old_guest_table )
132 return 0;
133
134- switch ( rc = _put_page_type(v->arch.old_guest_table, true,
135+ switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
136 v->arch.old_guest_ptpg) )
137 {
138 case -EINTR:
139--
1402.23.0
141
diff --git a/main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch b/main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch
new file mode 100644
index 0000000000..8724f4d6ac
--- /dev/null
+++ b/main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch
@@ -0,0 +1,79 @@
1From 36ce2b6e246d41ebaeb994dbf2b4e0e4555893bf Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 05/11] x86/mm: Rework get_page_and_type_from_mfn conditional
5
6Make it easier to read by declaring the conditions in which we will
7retain the ref, rather than the conditions under which we release it.
8
9The only way (page == current->arch.old_guest_table) can be true is if
10preemptible is true; so remove this from the query itself, and add an
11ASSERT() to that effect on the opposite path.
12
13No functional change intended.
14
15NB that alloc_lN_table() mishandles the "linear pt failure" situation
16described in the comment; this will be addressed in a future patch.
17
18This is part of XSA-299.
19
20Reported-by: George Dunlap <george.dunlap@citrix.com>
21Signed-off-by: George Dunlap <george.dunlap@citrix.com>
22Reviewed-by: Jan Beulich <jbeulich@suse.com>
23---
24 xen/arch/x86/mm.c | 39 +++++++++++++++++++++++++++++++++++++--
25 1 file changed, 37 insertions(+), 2 deletions(-)
26
27diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
28index e3264f8879..ce7f5b84f3 100644
29--- a/xen/arch/x86/mm.c
30+++ b/xen/arch/x86/mm.c
31@@ -678,8 +678,43 @@ static int get_page_and_type_from_mfn(
32
33 rc = __get_page_type(page, type, preemptible);
34
35- if ( unlikely(rc) && !partial_ref &&
36- (!preemptible || page != current->arch.old_guest_table) )
37+ /*
38+ * Retain the refcount if:
39+ * - page is fully validated (rc == 0)
40+ * - page is not validated (rc < 0) but:
41+ * - We came in with a reference (partial_ref)
42+ * - page is partially validated but there's been an error
43+ * (page == current->arch.old_guest_table)
44+ *
45+ * The partial_ref-on-error clause is worth an explanation. There
46+ * are two scenarios where partial_ref might be true coming in:
47+ * - mfn has been partially demoted as type `type`; i.e. has
48+ * PGT_partial set
49+ * - mfn has been partially demoted as L(type+1) (i.e., a linear
50+ * page; e.g. we're being called from get_page_from_l2e with
51+ * type == PGT_l1_table, but the mfn is PGT_l2_table)
52+ *
53+ * If there's an error, in the first case, _get_page_type will
54+ * either return -ERESTART, in which case we want to retain the
55+ * ref (as the caller will consider it retained), or -EINVAL, in
56+ * which case old_guest_table will be set; in both cases, we need
57+ * to retain the ref.
58+ *
59+ * In the second case, if there's an error, _get_page_type() can
60+ * *only* return -EINVAL, and *never* set old_guest_table. In
61+ * that case we also want to retain the reference, to allow the
62+ * page to continue to be torn down (i.e., PGT_partial cleared)
63+ * safely.
64+ *
65+ * Also note that we shouldn't be able to leave with the reference
66+ * count retained unless we succeeded, or the operation was
67+ * preemptible.
68+ */
69+ if ( likely(!rc) || partial_ref )
70+ /* nothing */;
71+ else if ( page == current->arch.old_guest_table )
72+ ASSERT(preemptible);
73+ else
74 put_page(page);
75
76 return rc;
77--
782.23.0
79
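The comment added above enumerates when the reference must be retained; the sketch below restates that decision as a small standalone function (simplified stand-in parameters, not Xen's real state) so the three retain cases are easy to see at a glance.

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Sketch of the "retain the refcount?" logic described above.
 * rc                 : result of the (hypothetical) type-validation call
 * partial_ref        : caller already held a ref (PTF_partial_general_ref)
 * is_old_guest_table : page was queued for deferred teardown on error
 * preemptible        : the operation was allowed to be preempted */
static bool retain_ref(int rc, bool partial_ref,
                       bool is_old_guest_table, bool preemptible)
{
    if ( rc == 0 || partial_ref )
        return true;                 /* success, or we came in with a ref */
    if ( is_old_guest_table )
    {
        assert(preemptible);         /* only preemptible ops can end up here */
        return true;                 /* deferred cleanup will drop the ref */
    }
    return false;                    /* plain failure: drop the ref now */
}

int main(void)
{
    printf("%d %d %d\n",
           retain_ref(0, false, false, true),         /* fully validated: keep */
           retain_ref(-EINVAL, true, false, false),   /* came in with a ref: keep */
           retain_ref(-EINVAL, false, false, false)); /* plain failure: drop */
    return 0;
}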
diff --git a/main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch b/main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch
new file mode 100644
index 0000000000..379c5002c6
--- /dev/null
+++ b/main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch
@@ -0,0 +1,101 @@
1From 180f638fb5047c478ca32b15dd2ba9ba0ce43623 Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 06/11] x86/mm: Have alloc_l[23]_table clear partial_flags when
5 preempting
6
7In order to allow recursive pagetable promotions and demotions to be
8interrupted, Xen must keep track of the state of the sub-pages
9promoted or demoted. This is stored in two elements in the page
10struct: nr_entries_validated and partial_flags.
11
12The rule is that entries [0, nr_entries_validated) should always be
13validated and hold a general reference count. If partial_flags is
14zero, then [nr_entries_validated] is not validated and no reference
15count is held. If PTF_partial_set is set, then [nr_entries_validated]
16is partially validated.
17
18At the moment, a distinction is made between promotion and demotion
19with regard to whether the entry itself "holds" a general reference
20count: when entry promotion is interrupted (i.e., returns -ERESTART),
21the entry is not considered to hold a reference; when entry demotion
22is interrupted, the entry is still considered to hold a general
23reference.
24
25PTF_partial_general_ref is used to distinguish between these cases.
26If clear, it's a partial promotion => no general reference count held
27by the entry; if set, it's partial demotion, so a general reference
28count held. Because promotions and demotions can be interleaved, this
29value is passed to get_page_and_type_from_mfn and put_page_from_l*e,
30to be able to properly handle reference counts.
31
32Unfortunately, when alloc_l[23]_table check hypercall_preempt_check()
33and return -ERESTART, they set nr_entries_validated, but don't clear
34partial_flags.
35
36If we were picking up from a previously-interrupted promotion, that
37means that PTF_partial_set would be set even though
38[nr_entries_validated] was not partially validated. This means that
39if the page in this state were de-validated, put_page_type() would
40erroneously be called on that entry.
41
42Perhaps worse, if we were racing with a de-validation, then we might
43leave both PTF_partial_set and PTF_partial_general_ref; and when
44de-validation picked up again, both the type and the general ref would
45be erroneously dropped from [nr_entries_validated].
46
47In a sense, the real issue here is code duplication. Rather than
48duplicate the interruption code, set rc to -EINTR and fall through to
49the code which already handles that case correctly.
50
51Given the logic at this point, it should be impossible for
52partial_flags to be non-zero; add an ASSERT() to catch any changes.
53
54This is part of XSA-299.
55
56Reported-by: George Dunlap <george.dunlap@citrix.com>
57Signed-off-by: George Dunlap <george.dunlap@citrix.com>
58Reviewed-by: Jan Beulich <jbeulich@suse.com>
59---
60 xen/arch/x86/mm.c | 18 ++++--------------
61 1 file changed, 4 insertions(+), 14 deletions(-)
62
63diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
64index ce7f5b84f3..9b9b67cd74 100644
65--- a/xen/arch/x86/mm.c
66+++ b/xen/arch/x86/mm.c
67@@ -1576,13 +1576,8 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
68 i++, partial_flags = 0 )
69 {
70 if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
71- {
72- page->nr_validated_ptes = i;
73- rc = -ERESTART;
74- break;
75- }
76-
77- if ( !is_guest_l2_slot(d, type, i) ||
78+ rc = -EINTR;
79+ else if ( !is_guest_l2_slot(d, type, i) ||
80 (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
81 continue;
82
83@@ -1647,13 +1642,8 @@ static int alloc_l3_table(struct page_info *page)
84 i++, partial_flags = 0 )
85 {
86 if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
87- {
88- page->nr_validated_ptes = i;
89- rc = -ERESTART;
90- break;
91- }
92-
93- if ( is_pv_32bit_domain(d) && (i == 3) )
94+ rc = -EINTR;
95+ else if ( is_pv_32bit_domain(d) && (i == 3) )
96 {
97 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
98 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
99--
1002.23.0
101
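The fix above boils down to one pattern: when the preemption check fires, set rc and fall through to the shared exit path instead of duplicating the bookkeeping. Below is a minimal, self-contained C sketch of that pattern, not code from the patch; validate_entry(), preempt_pending() and struct table_progress are invented stand-ins for the Xen helpers and page-struct fields.

#include <stdio.h>
#include <stdbool.h>

#define NR_ENTRIES 8
#define EINTR_RC  (-4)                 /* stand-in for -EINTR */

struct table_progress {
    int nr_validated;                  /* entries [0, nr_validated) are done */
};

/* Hypothetical helpers, invented for the sketch. */
static bool preempt_pending(int i) { return i == 5; }    /* pretend we must yield here */
static int validate_entry(int i) { (void)i; return 0; }  /* always succeeds in this model */

static int validate_table(struct table_progress *p)
{
    int rc = 0;

    for (int i = p->nr_validated; i < NR_ENTRIES; i++) {
        if (i > p->nr_validated && preempt_pending(i))
            rc = EINTR_RC;             /* fall through to the shared exit path */
        else if ((rc = validate_entry(i)) == 0)
            continue;

        /* One place that records how far we got, whatever went wrong. */
        p->nr_validated = i;
        return rc;
    }

    p->nr_validated = NR_ENTRIES;
    return 0;
}

int main(void)
{
    struct table_progress p = { 0 };
    int rc = validate_table(&p);

    printf("rc=%d, resume at entry %d\n", rc, p.nr_validated);  /* rc=-4, resume at entry 5 */
    return 0;
}

With a shared exit path, the "how far did we get" bookkeeping lives in exactly one place, so a resumed call simply restarts the loop at nr_validated.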
diff --git a/main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch b/main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch
new file mode 100644
index 0000000000..253c0fbb7c
--- /dev/null
+++ b/main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch
@@ -0,0 +1,374 @@
1From 29f56f0e7c11a299da497c866b4c76ebbc862045 Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 07/11] x86/mm: Always retain a general ref on partial
5
6In order to allow recursive pagetable promotions and demotions to be
7interrupted, Xen must keep track of the state of the sub-pages
8promoted or demoted. This is stored in two elements in the page struct:
9nr_entries_validated and partial_flags.
10
11The rule is that entries [0, nr_entries_validated) should always be
12validated and hold a general reference count. If partial_flags is
13zero, then [nr_entries_validated] is not validated and no reference
14count is held. If PTF_partial_set is set, then [nr_entries_validated]
15is partially validated.
16
17At the moment, a distinction is made between promotion and demotion
18with regard to whether the entry itself "holds" a general reference
19count: when entry promotion is interrupted (i.e., returns -ERESTART),
20the entry is not considered to hold a reference; when entry demotion
21is interrupted, the entry is still considered to hold a general
22reference.
23
24PTF_partial_general_ref is used to distinguish between these cases.
25If clear, it's a partial promotion => no general reference count held
26by the entry; if set, it's partial demotion, so a general reference
27count held. Because promotions and demotions can be interleaved, this
28value is passed to get_page_and_type_from_mfn and put_page_from_l*e,
29to be able to properly handle reference counts.
30
31Unfortunately, because a refcount is not held, it is possible to
32engineer a situation where PTF_partial_set is set but the page in
33question has been assigned to another domain. A sketch is provided in
34the appendix.
35
36Fix this by having the parent page table entry hold a general
37reference count whenever PTF_partial_set is set. (For clarity of
38change, keep two separate flags. These will be collapsed in a
39subsequent changeset.)
40
41This has two basic implications. On the put_page_from_lNe() side,
42this means that the (partial_set && !partial_ref) case can never happen,
43and no longer needs to be special-cased.
44
45Secondly, because both flags are set together, there's no need to carry over
46existing bits from partial_pte.
47
48(NB there is still another issue with calling _put_page_type() on a
49page which had PGT_partial set; that will be handled in a subsequent
50patch.)
51
52On the get_page_and_type_from_mfn() side, we need to distinguish
53between callers which hold a reference on partial (i.e.,
54alloc_lN_table()), and those which do not (new_cr3, PIN_LN_TABLE, and
55so on): pass a flag if the type should be retained on interruption.
56
57NB that since l1 promotion can't be preempted, get_page_from_l2e
58can't return -ERESTART.
59
60This is part of XSA-299.
61
62Reported-by: George Dunlap <george.dunlap@citrix.com>
63Signed-off-by: George Dunlap <george.dunlap@citrix.com>
64Reviewed-by: Jan Beulich <jbeulich@suse.com>
65-----
66* Appendix: Engineering PTF_partial_set while a page belongs to a
67 foreign domain
68
69Suppose A is a page which can be promoted to an l3, and B is a page
70which can be promoted to an l2, and A[x] points to B. B has
71PGC_allocated set but no other general references.
72
73V1: PIN_L3 A.
74 A is validated, B is validated.
75 A.type_count = 1 | PGT_validated | PGT_pinned
76 B.type_count = 1 | PGT_validated
77 B.count = 2 | PGC_allocated (A[x] holds a general ref)
78
79V1: UNPIN A.
80 A begins de-validation.
81 Arrange to be interrupted when i < x
82 V1->old_guest_table = A
83 V1->old_guest_table_ref_held = false
84 A.type_count = 1 | PGT_partial
85 A.nr_validated_entries = i < x
86 B.type_count = 0
87 B.count = 1 | PGC_allocated
88
89V2: MOD_L4_ENTRY to point some l4e to A.
90 Picks up re-validation of A.
91 Arrange to be interrupted halfway through B's validation
92 B.type_count = 1 | PGT_partial
93 B.count = 2 | PGC_allocated (PGT_partial holds a general ref)
94 A.type_count = 1 | PGT_partial
95 A.nr_validated_entries = x
96 A.partial_pte = PTF_partial_set
97
98V3: MOD_L3_ENTRY to point some other l3e (not in A) to B.
99 Validates B.
100 B.type_count = 1 | PGT_validated
101 B.count = 2 | PGC_allocated ("other l3e" holds a general ref)
102
103V3: MOD_L3_ENTRY to clear l3e pointing to B.
104 Devalidates B.
105 B.type_count = 0
106 B.count = 1 | PGC_allocated
107
108V3: decrease_reservation(B)
109 Clears PGC_allocated
110 B.count = 0 => B is freed
111
112B gets assigned to a different domain
113
114V1: Restarts UNPIN of A
115 put_old_guest_table(A)
116 ...
117 free_l3_table(A)
118
119Now since A.partial_flags has PTF_partial_set, free_l3_table() will
120call put_page_from_l3e() on A[x], which points to B, while B is owned
121by another domain.
122
123If A[x] held a general refcount for B on partial validation, as it does
124for partial de-validation, then B would still have a reference count of
1251 after PGC_allocated was freed; so B wouldn't be freed until after
126put_page_from_l3e() had happened on A[x].
127---
128 xen/arch/x86/mm.c | 84 +++++++++++++++++++++++-----------------
129 xen/include/asm-x86/mm.h | 15 ++++---
130 2 files changed, 58 insertions(+), 41 deletions(-)
131
132diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
133index 9b9b67cd74..2f185a3cd3 100644
134--- a/xen/arch/x86/mm.c
135+++ b/xen/arch/x86/mm.c
136@@ -658,10 +658,11 @@ static int __get_page_type(struct page_info *page, unsigned long type,
137 * page->pte[page->nr_validated_entries]. See the comment in mm.h for
138 * more information.
139 */
140-#define PTF_partial_set (1 << 0)
141-#define PTF_partial_general_ref (1 << 1)
142-#define PTF_preemptible (1 << 2)
143-#define PTF_defer (1 << 3)
144+#define PTF_partial_set (1 << 0)
145+#define PTF_partial_general_ref (1 << 1)
146+#define PTF_preemptible (1 << 2)
147+#define PTF_defer (1 << 3)
148+#define PTF_retain_ref_on_restart (1 << 4)
149
150 static int get_page_and_type_from_mfn(
151 mfn_t mfn, unsigned long type, struct domain *d,
152@@ -670,7 +671,11 @@ static int get_page_and_type_from_mfn(
153 struct page_info *page = mfn_to_page(mfn);
154 int rc;
155 bool preemptible = flags & PTF_preemptible,
156- partial_ref = flags & PTF_partial_general_ref;
157+ partial_ref = flags & PTF_partial_general_ref,
158+ partial_set = flags & PTF_partial_set,
159+ retain_ref = flags & PTF_retain_ref_on_restart;
160+
161+ ASSERT(partial_ref == partial_set);
162
163 if ( likely(!partial_ref) &&
164 unlikely(!get_page_from_mfn(mfn, d)) )
165@@ -683,13 +688,15 @@ static int get_page_and_type_from_mfn(
166 * - page is fully validated (rc == 0)
167 * - page is not validated (rc < 0) but:
168 * - We came in with a reference (partial_ref)
169+ * - page is partially validated (rc == -ERESTART), and the
170+ * caller has asked the ref to be retained in that case
171 * - page is partially validated but there's been an error
172 * (page == current->arch.old_guest_table)
173 *
174 * The partial_ref-on-error clause is worth an explanation. There
175 * are two scenarios where partial_ref might be true coming in:
176- * - mfn has been partially demoted as type `type`; i.e. has
177- * PGT_partial set
178+ * - mfn has been partially promoted / demoted as type `type`;
179+ * i.e. has PGT_partial set
180 * - mfn has been partially demoted as L(type+1) (i.e., a linear
181 * page; e.g. we're being called from get_page_from_l2e with
182 * type == PGT_l1_table, but the mfn is PGT_l2_table)
183@@ -712,7 +719,8 @@ static int get_page_and_type_from_mfn(
184 */
185 if ( likely(!rc) || partial_ref )
186 /* nothing */;
187- else if ( page == current->arch.old_guest_table )
188+ else if ( page == current->arch.old_guest_table ||
189+ (retain_ref && rc == -ERESTART) )
190 ASSERT(preemptible);
191 else
192 put_page(page);
193@@ -1379,8 +1387,8 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
194 if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
195 PTF_partial_set )
196 {
197- ASSERT(!(flags & PTF_defer));
198- rc = _put_page_type(pg, PTF_preemptible, ptpg);
199+ /* partial_set should always imply partial_ref */
200+ BUG();
201 }
202 else if ( flags & PTF_defer )
203 {
204@@ -1425,8 +1433,8 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
205 if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
206 PTF_partial_set )
207 {
208- ASSERT(!(flags & PTF_defer));
209- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
210+ /* partial_set should always imply partial_ref */
211+ BUG();
212 }
213
214 if ( flags & PTF_defer )
215@@ -1456,8 +1464,8 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
216 if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
217 PTF_partial_set )
218 {
219- ASSERT(!(flags & PTF_defer));
220- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
221+ /* partial_set should always imply partial_ref */
222+ BUG();
223 }
224
225 if ( flags & PTF_defer )
226@@ -1581,13 +1589,22 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
227 (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
228 continue;
229
230- if ( rc == -ERESTART )
231- {
232- page->nr_validated_ptes = i;
233- /* Set 'set', retain 'general ref' */
234- page->partial_flags = partial_flags | PTF_partial_set;
235- }
236- else if ( rc == -EINTR && i )
237+ /*
238+ * It shouldn't be possible for get_page_from_l2e to return
239+ * -ERESTART, since we never call this with PTF_preemptible.
240+ * (alloc_l1_table may return -EINTR on an L1TF-vulnerable
241+ * entry.)
242+ *
243+ * NB that while on a "clean" promotion, we can never get
244+ * PGT_partial. It is possible to arrange for an l2e to
245+ * contain a partially-devalidated l2; but in that case, both
246+ * of the following functions will fail anyway (the first
247+ * because the page in question is not an l1; the second
248+ * because the page is not fully validated).
249+ */
250+ ASSERT(rc != -ERESTART);
251+
252+ if ( rc == -EINTR && i )
253 {
254 page->nr_validated_ptes = i;
255 page->partial_flags = 0;
256@@ -1596,6 +1613,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
257 else if ( rc < 0 && rc != -EINTR )
258 {
259 gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
260+ ASSERT(current->arch.old_guest_table == NULL);
261 if ( i )
262 {
263 page->nr_validated_ptes = i;
264@@ -1652,16 +1670,17 @@ static int alloc_l3_table(struct page_info *page)
265 rc = get_page_and_type_from_mfn(
266 l3e_get_mfn(pl3e[i]),
267 PGT_l2_page_table | PGT_pae_xen_l2, d,
268- partial_flags | PTF_preemptible);
269+ partial_flags | PTF_preemptible | PTF_retain_ref_on_restart);
270 }
271- else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 )
272+ else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d,
273+ partial_flags | PTF_retain_ref_on_restart)) > 0 )
274 continue;
275
276 if ( rc == -ERESTART )
277 {
278 page->nr_validated_ptes = i;
279 /* Set 'set', leave 'general ref' set if this entry was set */
280- page->partial_flags = partial_flags | PTF_partial_set;
281+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
282 }
283 else if ( rc == -EINTR && i )
284 {
285@@ -1822,14 +1841,15 @@ static int alloc_l4_table(struct page_info *page)
286 i++, partial_flags = 0 )
287 {
288 if ( !is_guest_l4_slot(d, i) ||
289- (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 )
290+ (rc = get_page_from_l4e(pl4e[i], pfn, d,
291+ partial_flags | PTF_retain_ref_on_restart)) > 0 )
292 continue;
293
294 if ( rc == -ERESTART )
295 {
296 page->nr_validated_ptes = i;
297 /* Set 'set', leave 'general ref' set if this entry was set */
298- page->partial_flags = partial_flags | PTF_partial_set;
299+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
300 }
301 else if ( rc < 0 )
302 {
303@@ -1927,9 +1947,7 @@ static int free_l2_table(struct page_info *page)
304 else if ( rc == -ERESTART )
305 {
306 page->nr_validated_ptes = i;
307- page->partial_flags = (partial_flags & PTF_partial_set) ?
308- partial_flags :
309- (PTF_partial_set | PTF_partial_general_ref);
310+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
311 }
312 else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
313 {
314@@ -1977,9 +1995,7 @@ static int free_l3_table(struct page_info *page)
315 if ( rc == -ERESTART )
316 {
317 page->nr_validated_ptes = i;
318- page->partial_flags = (partial_flags & PTF_partial_set) ?
319- partial_flags :
320- (PTF_partial_set | PTF_partial_general_ref);
321+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
322 }
323 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
324 {
325@@ -2010,9 +2026,7 @@ static int free_l4_table(struct page_info *page)
326 if ( rc == -ERESTART )
327 {
328 page->nr_validated_ptes = i;
329- page->partial_flags = (partial_flags & PTF_partial_set) ?
330- partial_flags :
331- (PTF_partial_set | PTF_partial_general_ref);
332+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
333 }
334 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
335 {
336diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
337index a531fe3115..74b0246c02 100644
338--- a/xen/include/asm-x86/mm.h
339+++ b/xen/include/asm-x86/mm.h
340@@ -167,22 +167,25 @@ struct page_info
341 * page.
342 *
343 * This happens:
344- * - During de-validation, if de-validation of the page was
345+ * - During validation or de-validation, if the operation was
346 * interrupted
347 * - During validation, if an invalid entry is encountered and
348 * validation is preemptible
349 * - During validation, if PTF_partial_general_ref was set on
350- * this entry to begin with (perhaps because we're picking
351- * up from a partial de-validation).
352+ * this entry to begin with (perhaps because it picked up a
353+ * previous operation)
354 *
355- * When resuming validation, if PTF_partial_general_ref is clear,
356- * then a general reference must be re-acquired; if it is set, no
357- * reference should be acquired.
358+ * When resuming validation, if PTF_partial_general_ref is
359+ * clear, then a general reference must be re-acquired; if it
360+ * is set, no reference should be acquired.
361 *
362 * When resuming de-validation, if PTF_partial_general_ref is
363 * clear, no reference should be dropped; if it is set, a
364 * reference should be dropped.
365 *
366+ * NB at the moment, PTF_partial_set should be set if and only if
367+ * PTF_partial_general_ref is set.
368+ *
369 * NB that PTF_partial_set and PTF_partial_general_ref are
370 * defined in mm.c, the only place where they are used.
371 *
372--
3732.23.0
374
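The behavioural core of the change above is the ref-retention rule: an entry flagged as partially validated always keeps a general reference, and callers that can record that flag ask for the reference to be retained across an interruption. The sketch below models only that rule; it is not the real get_page_and_type_from_mfn(), and the names, flag values and return codes are invented.

#include <stdio.h>
#include <stdbool.h>

#define RC_OK        0
#define RC_RESTART (-1)                    /* stand-in for -ERESTART */

/* Invented flags, mirroring the shape (not the values) of the PTF_* flags. */
#define F_PARTIAL_SET        (1u << 0)     /* caller already holds the general ref */
#define F_RETAIN_ON_RESTART  (1u << 1)

struct page {
    int count;                             /* general reference count */
};

/* Pretend the type validation always gets preempted part-way. */
static int do_type_validation(struct page *pg) { (void)pg; return RC_RESTART; }

static int get_page_and_type(struct page *pg, unsigned int flags)
{
    bool partial = flags & F_PARTIAL_SET;
    bool retain  = flags & F_RETAIN_ON_RESTART;
    int rc;

    if (!partial)
        pg->count++;                       /* acquire the general ref */

    rc = do_type_validation(pg);

    /*
     * Keep the ref if validation succeeded, if we came in already holding
     * one, or if the caller asked for it to be retained on interruption.
     */
    if (rc == RC_OK || partial || (retain && rc == RC_RESTART))
        return rc;

    pg->count--;                           /* otherwise drop it again */
    return rc;
}

int main(void)
{
    struct page a = { .count = 1 }, b = { .count = 1 };

    get_page_and_type(&a, F_RETAIN_ON_RESTART); /* alloc_lN_table-style caller */
    get_page_and_type(&b, 0);                   /* pin/new_cr3-style caller */
    printf("a.count=%d b.count=%d\n", a.count, b.count);  /* a.count=2 b.count=1 */
    return 0;
}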
diff --git a/main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch b/main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch
new file mode 100644
index 0000000000..d7602d644b
--- /dev/null
+++ b/main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch
@@ -0,0 +1,227 @@
1From 140c8876835a134daf507d6c60bdcdf9126f166f Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 08/11] x86/mm: Collapse PTF_partial_set and
5 PTF_partial_general_ref into one
6
7...now that they are equivalent. No functional change intended.
8
9Reported-by: George Dunlap <george.dunlap@citrix.com>
10Signed-off-by: George Dunlap <george.dunlap@citrix.com>
11Reviewed-by: Jan Beulich <jbeulich@suse.com>
12---
13 xen/arch/x86/mm.c | 50 +++++++++++-----------------------------
14 xen/include/asm-x86/mm.h | 29 +++++++++++------------
15 2 files changed, 26 insertions(+), 53 deletions(-)
16
17diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
18index 2f185a3cd3..693791331a 100644
19--- a/xen/arch/x86/mm.c
20+++ b/xen/arch/x86/mm.c
21@@ -653,13 +653,12 @@ static int __get_page_type(struct page_info *page, unsigned long type,
22
23 /*
24 * The following flags are used to specify behavior of various get and
25- * put commands. The first two are also stored in page->partial_flags
26- * to indicate the state of the page pointed to by
27+ * put commands. The first is also stored in page->partial_flags to
28+ * indicate the state of the page pointed to by
29 * page->pte[page->nr_validated_entries]. See the comment in mm.h for
30 * more information.
31 */
32 #define PTF_partial_set (1 << 0)
33-#define PTF_partial_general_ref (1 << 1)
34 #define PTF_preemptible (1 << 2)
35 #define PTF_defer (1 << 3)
36 #define PTF_retain_ref_on_restart (1 << 4)
37@@ -671,13 +670,10 @@ static int get_page_and_type_from_mfn(
38 struct page_info *page = mfn_to_page(mfn);
39 int rc;
40 bool preemptible = flags & PTF_preemptible,
41- partial_ref = flags & PTF_partial_general_ref,
42 partial_set = flags & PTF_partial_set,
43 retain_ref = flags & PTF_retain_ref_on_restart;
44
45- ASSERT(partial_ref == partial_set);
46-
47- if ( likely(!partial_ref) &&
48+ if ( likely(!partial_set) &&
49 unlikely(!get_page_from_mfn(mfn, d)) )
50 return -EINVAL;
51
52@@ -687,14 +683,14 @@ static int get_page_and_type_from_mfn(
53 * Retain the refcount if:
54 * - page is fully validated (rc == 0)
55 * - page is not validated (rc < 0) but:
56- * - We came in with a reference (partial_ref)
57+ * - We came in with a reference (partial_set)
58 * - page is partially validated (rc == -ERESTART), and the
59 * caller has asked the ref to be retained in that case
60 * - page is partially validated but there's been an error
61 * (page == current->arch.old_guest_table)
62 *
63- * The partial_ref-on-error clause is worth an explanation. There
64- * are two scenarios where partial_ref might be true coming in:
65+ * The partial_set-on-error clause is worth an explanation. There
66+ * are two scenarios where partial_set might be true coming in:
67 * - mfn has been partially promoted / demoted as type `type`;
68 * i.e. has PGT_partial set
69 * - mfn has been partially demoted as L(type+1) (i.e., a linear
70@@ -717,7 +713,7 @@ static int get_page_and_type_from_mfn(
71 * count retained unless we succeeded, or the operation was
72 * preemptible.
73 */
74- if ( likely(!rc) || partial_ref )
75+ if ( likely(!rc) || partial_set )
76 /* nothing */;
77 else if ( page == current->arch.old_guest_table ||
78 (retain_ref && rc == -ERESTART) )
79@@ -1384,13 +1380,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
80 struct page_info *pg = l2e_get_page(l2e);
81 struct page_info *ptpg = mfn_to_page(_mfn(pfn));
82
83- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
84- PTF_partial_set )
85- {
86- /* partial_set should always imply partial_ref */
87- BUG();
88- }
89- else if ( flags & PTF_defer )
90+ if ( flags & PTF_defer )
91 {
92 current->arch.old_guest_ptpg = ptpg;
93 current->arch.old_guest_table = pg;
94@@ -1430,13 +1420,6 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
95
96 pg = l3e_get_page(l3e);
97
98- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
99- PTF_partial_set )
100- {
101- /* partial_set should always imply partial_ref */
102- BUG();
103- }
104-
105 if ( flags & PTF_defer )
106 {
107 current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
108@@ -1461,13 +1444,6 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
109 {
110 struct page_info *pg = l4e_get_page(l4e);
111
112- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
113- PTF_partial_set )
114- {
115- /* partial_set should always imply partial_ref */
116- BUG();
117- }
118-
119 if ( flags & PTF_defer )
120 {
121 current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
122@@ -1680,7 +1656,7 @@ static int alloc_l3_table(struct page_info *page)
123 {
124 page->nr_validated_ptes = i;
125 /* Set 'set', leave 'general ref' set if this entry was set */
126- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
127+ page->partial_flags = PTF_partial_set;
128 }
129 else if ( rc == -EINTR && i )
130 {
131@@ -1849,7 +1825,7 @@ static int alloc_l4_table(struct page_info *page)
132 {
133 page->nr_validated_ptes = i;
134 /* Set 'set', leave 'general ref' set if this entry was set */
135- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
136+ page->partial_flags = PTF_partial_set;
137 }
138 else if ( rc < 0 )
139 {
140@@ -1947,7 +1923,7 @@ static int free_l2_table(struct page_info *page)
141 else if ( rc == -ERESTART )
142 {
143 page->nr_validated_ptes = i;
144- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
145+ page->partial_flags = PTF_partial_set;
146 }
147 else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
148 {
149@@ -1995,7 +1971,7 @@ static int free_l3_table(struct page_info *page)
150 if ( rc == -ERESTART )
151 {
152 page->nr_validated_ptes = i;
153- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
154+ page->partial_flags = PTF_partial_set;
155 }
156 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
157 {
158@@ -2026,7 +2002,7 @@ static int free_l4_table(struct page_info *page)
159 if ( rc == -ERESTART )
160 {
161 page->nr_validated_ptes = i;
162- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
163+ page->partial_flags = PTF_partial_set;
164 }
165 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
166 {
167diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
168index 74b0246c02..704345335c 100644
169--- a/xen/include/asm-x86/mm.h
170+++ b/xen/include/asm-x86/mm.h
171@@ -162,7 +162,7 @@ struct page_info
172 * operation on the current page. (That page may or may not
173 * still have PGT_partial set.)
174 *
175- * If PTF_partial_general_ref is set, then the PTE at
176+ * Additionally, if PTF_partial_set is set, then the PTE at
177 * @nr_validated_ptef holds a general reference count for the
178 * page.
179 *
180@@ -171,23 +171,20 @@ struct page_info
181 * interrupted
182 * - During validation, if an invalid entry is encountered and
183 * validation is preemptible
184- * - During validation, if PTF_partial_general_ref was set on
185- * this entry to begin with (perhaps because it picked up a
186+ * - During validation, if PTF_partial_set was set on this
187+ * entry to begin with (perhaps because it picked up a
188 * previous operation)
189 *
190- * When resuming validation, if PTF_partial_general_ref is
191- * clear, then a general reference must be re-acquired; if it
192- * is set, no reference should be acquired.
193+ * When resuming validation, if PTF_partial_set is clear, then
194+ * a general reference must be re-acquired; if it is set, no
195+ * reference should be acquired.
196 *
197- * When resuming de-validation, if PTF_partial_general_ref is
198- * clear, no reference should be dropped; if it is set, a
199- * reference should be dropped.
200+ * When resuming de-validation, if PTF_partial_set is clear,
201+ * no reference should be dropped; if it is set, a reference
202+ * should be dropped.
203 *
204- * NB at the moment, PTF_partial_set should be set if and only if
205- * PTF_partial_general_ref is set.
206- *
207- * NB that PTF_partial_set and PTF_partial_general_ref are
208- * defined in mm.c, the only place where they are used.
209+ * NB that PTF_partial_set is defined in mm.c, the only place
210+ * where it is used.
211 *
212 * The 3rd field, @linear_pt_count, indicates
213 * - by a positive value, how many same-level page table entries a page
214@@ -197,8 +194,8 @@ struct page_info
215 */
216 struct {
217 u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
218- u16 :16 - PAGETABLE_ORDER - 1 - 2;
219- u16 partial_flags:2;
220+ u16 :16 - PAGETABLE_ORDER - 1 - 1;
221+ u16 partial_flags:1;
222 s16 linear_pt_count;
223 };
224
225--
2262.23.0
227
diff --git a/main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch b/main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch
new file mode 100644
index 0000000000..a3519c2103
--- /dev/null
+++ b/main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch
@@ -0,0 +1,106 @@
1From 203bc967574c7c5a06ed6bb452a9761f46dce724 Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 09/11] x86/mm: Properly handle linear pagetable promotion
5 failures
6
7In order to allow recursive pagetable promotions and demotions to be
8interrupted, Xen must keep track of the state of the sub-pages
9promoted or demoted. This is stored in two elements in the page
10struct: nr_entries_validated and partial_flags.
11
12The rule is that entries [0, nr_entries_validated) should always be
13validated and hold a general reference count. If partial_flags is
14zero, then [nr_entries_validated] is not validated and no reference
15count is held. If PTF_partial_set is set, then [nr_entries_validated]
16is partially validated, and a general reference count is held.
17
18Unfortunately, in cases where an entry began with PTF_partial_set set,
19and get_page_from_lNe() returns -EINVAL, the PTF_partial_set bit is
20erroneously dropped. (This scenario can be engineered mainly by the
21use of interleaving of promoting and demoting a page which has "linear
22pagetable" entries; see the appendix for a sketch.) This means that
23we will "leak" a general reference count on the page in question,
24preventing the page from being freed.
25
26Fix this by setting page->partial_flags to the partial_flags local
27variable.
28
29This is part of XSA-299.
30
31Reported-by: George Dunlap <george.dunlap@citrix.com>
32Signed-off-by: George Dunlap <george.dunlap@citrix.com>
33Reviewed-by: Jan Beulich <jbeulich@suse.com>
34-----
35Appendix
36
37Suppose A and B can both be promoted to L2 pages, and A[x] points to B.
38
39V1: PIN_L2 B.
40 B.type_count = 1 | PGT_validated
41 B.count = 2 | PGC_allocated
42
43V1: MOD_L3_ENTRY pointing something to A.
44 In the process of validating A[x], grab an extra type / ref on B:
45 B.type_count = 2 | PGT_validated
46 B.count = 3 | PGC_allocated
47 A.type_count = 1 | PGT_validated
48 A.count = 2 | PGC_allocated
49
50V1: UNPIN B.
51 B.type_count = 1 | PGT_validated
52 B.count = 2 | PGC_allocated
53
54V1: MOD_L3_ENTRY removing the reference to A.
55 De-validate A, down to A[x], which points to B.
56 Drop the final type on B. Arrange to be interrupted.
57 B.type_count = 1 | PGT_partial
58 B.count = 2 | PGC_allocated
59 A.type_count = 1 | PGT_partial
60 A.nr_validated_entries = x
61 A.partial_pte = -1
62
63V2: MOD_L3_ENTRY adds a reference to A.
64
65At this point, get_page_from_l2e(A[x]) tries
66get_page_and_type_from_mfn(), which fails because it's the wrong type;
67and get_l2_linear_pagetable() also fails, because B isn't validated as
68an l2 anymore.
69---
70 xen/arch/x86/mm.c | 6 +++---
71 1 file changed, 3 insertions(+), 3 deletions(-)
72
73diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
74index 693791331a..300f147e98 100644
75--- a/xen/arch/x86/mm.c
76+++ b/xen/arch/x86/mm.c
77@@ -1593,7 +1593,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
78 if ( i )
79 {
80 page->nr_validated_ptes = i;
81- page->partial_flags = 0;
82+ page->partial_flags = partial_flags;
83 current->arch.old_guest_ptpg = NULL;
84 current->arch.old_guest_table = page;
85 }
86@@ -1678,7 +1678,7 @@ static int alloc_l3_table(struct page_info *page)
87 if ( i )
88 {
89 page->nr_validated_ptes = i;
90- page->partial_flags = 0;
91+ page->partial_flags = partial_flags;
92 current->arch.old_guest_ptpg = NULL;
93 current->arch.old_guest_table = page;
94 }
95@@ -1835,7 +1835,7 @@ static int alloc_l4_table(struct page_info *page)
96 if ( i )
97 {
98 page->nr_validated_ptes = i;
99- page->partial_flags = 0;
100+ page->partial_flags = partial_flags;
101 if ( rc == -EINTR )
102 rc = -ERESTART;
103 else
104--
1052.23.0
106
diff --git a/main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch b/main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch
new file mode 100644
index 0000000000..f8e7915bb9
--- /dev/null
+++ b/main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch
@@ -0,0 +1,169 @@
1From 45242b9057b4feccb837362f39e0eb97dc0093c8 Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:49 +0100
4Subject: [PATCH 10/11] x86/mm: Fix nested de-validation on error
5
6If an invalid entry is discovered when validating a page-table tree,
7the entire tree which has so far been validated must be de-validated.
8Since this may take a long time, alloc_l[2-4]_table() set current
9vcpu's old_guest_table immediately; put_old_guest_table() will make
10sure that put_page_type() will be called to finish off the
11de-validation before any other MMU operations can happen on the vcpu.
12
13The invariant for partial pages should be:
14
15* Entries [0, nr_validated_ptes) should be completely validated;
16 put_page_type() will de-validate these.
17
18* If [nr_validated_ptes] is partially validated, partial_flags should
19 have PTF_partial_set set. put_page_type() will be called on this page to
20 finish off devalidation, and the appropriate refcount adjustments
21 will be done.
22
23alloc_l[2-3]_table() indicates partial validation to its callers by
24setting current->old_guest_table.
25
26Unfortunately, this is mishandled.
27
28Take the case where validating lNe[x] returns an error.
29
30First, alloc_l3_table() doesn't check old_guest_table at all; as a
31result, partial_flags is not set when it should be. nr_validated_ptes
32is set to x; and since PTF_partial_set is clear, de-validation resumes at
33nr_validated_ptes-1. This means that the l2 page at pl3e[x] will not
34have put_page_type() called on it when de-validating the rest of the
35l3: it will be stuck in the PGT_partial state until the domain is
36destroyed, or until it is re-used as an l2. (Any other page type will
37fail.)
38
39Worse, alloc_l4_table(), rather than setting PTF_partial_set as it
40should, sets nr_validated_ptes to x+1. When de-validating, since
41partial is 0, this will correctly resume calling put_page_type at [x];
42but if put_page_type() is never called and instead
43get_page_type() is called, validation will pick up at [x+1],
44neglecting to validate [x]. If the rest of the validation succeeds,
45the l4 will be validated even though [x] is invalid.
46
47Fix this in both cases by setting PTF_partial_set if old_guest_table
48is set.
49
50While here, add some safety catches:
51- old_guest_table must point to the page contained in
52 [nr_validated_ptes].
53- alloc_l1_table shouldn't set old_guest_table
54
55If we experience one of these situations in production builds, it's
56safer to avoid calling put_page_type for the pages in question. If
57they have PGT_partial set, they will be cleaned up on domain
58destruction; if not, we have no idea whether a type count is safe to
59drop. Retaining an extra type ref that should have been dropped may
60trigger a BUG() on the free_domain_page() path, but dropping a type
61count that shouldn't be dropped may cause a privilege escalation.
62
63This is part of XSA-299.
64
65Reported-by: George Dunlap <george.dunlap@citrix.com>
66Signed-off-by: George Dunlap <george.dunlap@citrix.com>
67Reviewed-by: Jan Beulich <jbeulich@suse.com>
68---
69 xen/arch/x86/mm.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++-
70 1 file changed, 54 insertions(+), 1 deletion(-)
71
72diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
73index 300f147e98..2ea32463a8 100644
74--- a/xen/arch/x86/mm.c
75+++ b/xen/arch/x86/mm.c
76@@ -1592,6 +1592,20 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
77 ASSERT(current->arch.old_guest_table == NULL);
78 if ( i )
79 {
80+ /*
81+ * alloc_l1_table() doesn't set old_guest_table; it does
82+ * its own tear-down immediately on failure. If it
83+ * did we'd need to check it and set partial_flags as we
84+ * do in alloc_l[34]_table().
85+ *
86+ * Note on the use of ASSERT: if it's non-null and
87+ * hasn't been cleaned up yet, it should have
88+ * PGT_partial set; and so the type will be cleaned up
89+ * on domain destruction. Unfortunately, we would
90+ * leak the general ref held by old_guest_table; but
91+ * leaking a page is less bad than a host crash.
92+ */
93+ ASSERT(current->arch.old_guest_table == NULL);
94 page->nr_validated_ptes = i;
95 page->partial_flags = partial_flags;
96 current->arch.old_guest_ptpg = NULL;
97@@ -1619,6 +1633,7 @@ static int alloc_l3_table(struct page_info *page)
98 unsigned int i;
99 int rc = 0;
100 unsigned int partial_flags = page->partial_flags;
101+ l3_pgentry_t l3e = l3e_empty();
102
103 pl3e = map_domain_page(_mfn(pfn));
104
105@@ -1665,7 +1680,11 @@ static int alloc_l3_table(struct page_info *page)
106 rc = -ERESTART;
107 }
108 if ( rc < 0 )
109+ {
110+ /* XSA-299 Backport: Copy l3e for checking */
111+ l3e = pl3e[i];
112 break;
113+ }
114
115 pl3e[i] = adjust_guest_l3e(pl3e[i], d);
116 }
117@@ -1679,6 +1698,24 @@ static int alloc_l3_table(struct page_info *page)
118 {
119 page->nr_validated_ptes = i;
120 page->partial_flags = partial_flags;
121+ if ( current->arch.old_guest_table )
122+ {
123+ /*
124+ * We've experienced a validation failure. If
125+ * old_guest_table is set, "transfer" the general
126+ * reference count to pl3e[nr_validated_ptes] by
127+ * setting PTF_partial_set.
128+ *
129+ * As a precaution, check that old_guest_table is the
130+ * page pointed to by pl3e[nr_validated_ptes]. If
131+ * not, it's safer to leak a type ref on production
132+ * builds.
133+ */
134+ if ( current->arch.old_guest_table == l3e_get_page(l3e) )
135+ page->partial_flags = PTF_partial_set;
136+ else
137+ ASSERT_UNREACHABLE();
138+ }
139 current->arch.old_guest_ptpg = NULL;
140 current->arch.old_guest_table = page;
141 }
142@@ -1841,7 +1878,23 @@ static int alloc_l4_table(struct page_info *page)
143 else
144 {
145 if ( current->arch.old_guest_table )
146- page->nr_validated_ptes++;
147+ {
148+ /*
149+ * We've experienced a validation failure. If
150+ * old_guest_table is set, "transfer" the general
151+ * reference count to pl3e[nr_validated_ptes] by
152+ * setting PTF_partial_set.
153+ *
154+ * As a precaution, check that old_guest_table is the
155+ * page pointed to by pl4e[nr_validated_ptes]. If
156+ * not, it's safer to leak a type ref on production
157+ * builds.
158+ */
159+ if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) )
160+ page->partial_flags = PTF_partial_set;
161+ else
162+ ASSERT_UNREACHABLE();
163+ }
164 current->arch.old_guest_ptpg = NULL;
165 current->arch.old_guest_table = page;
166 }
167--
1682.23.0
169
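The defensive part of the fix above is the "transfer or leak" decision on a validation failure. The following sketch models it with invented names; it is not code from the patch. If the pending partially validated page matches the page the failing entry points to, ownership of the reference is handed over by setting the partial flag; otherwise, like the patch, the sketch prefers leaking a type reference over dropping one that may not be held.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct page {
    int id;
};

struct table_state {
    int nr_validated;                 /* index of the entry we stopped at */
    bool partial_set;                 /* does that entry keep its general ref? */
};

/* Decide what to do after a validation failure at the current entry. */
static void record_failure(struct table_state *st,
                           const struct page *entry_page,       /* page the failing entry points to */
                           const struct page *old_guest_table)  /* pending partially validated page */
{
    if (old_guest_table == NULL)
        return;                       /* nothing pending, nothing to transfer */

    if (old_guest_table == entry_page)
        st->partial_set = true;       /* hand the general ref to the entry */
    else
        fprintf(stderr, "mismatch: leaving the type ref leaked\n");
}

int main(void)
{
    struct page sub = { .id = 1 };
    struct table_state st = { .nr_validated = 3, .partial_set = false };

    record_failure(&st, &sub, &sub);
    printf("partial_set=%d\n", st.partial_set);   /* partial_set=1 */
    return 0;
}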
diff --git a/main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch b/main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
new file mode 100644
index 0000000000..5d722cd2ab
--- /dev/null
+++ b/main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
@@ -0,0 +1,413 @@
1From 4905f7fbaa60f75df063305c9532fb63b77deab9 Mon Sep 17 00:00:00 2001
2From: George Dunlap <george.dunlap@citrix.com>
3Date: Thu, 10 Oct 2019 17:57:50 +0100
4Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to
5 begin with
6
7Validation and de-validation of pagetable trees may take arbitrarily
8large amounts of time, and so must be preemptible. This is indicated
9by setting the PGT_partial bit in the type_info, and setting
10nr_validated_entries and partial_flags appropriately. Specifically,
11if the entry at [nr_validated_entries] is partially validated,
12partial_flags should have the PTF_partial_set bit set, and the entry
13should hold a general reference count. During de-validation,
14put_page_type() is called on partially validated entries.
15
16Unfortunately, there are a number of issues with the current algorithm.
17
18First, doing a "normal" put_page_type() is not safe when no type ref
19is held: there is nothing to stop another vcpu from coming along and
20picking up validation again: at which point the put_page_type may drop
21the only page ref on an in-use page. Some examples are listed in the
22appendix.
23
24The core issue is that put_page_type() is being called both to clean
25up PGT_partial, and to drop a type count; and has no way of knowing
26which is which; and so if in between, PGT_partial is cleared,
27put_page_type() will drop the type ref erroneously.
28
29What is needed is to distinguish between two states:
30- Dropping a type ref which is held
31- Cleaning up a page which has been partially de/validated
32
33Fix this by telling put_page_type() which of the two activities you
34intend.
35
36When cleaning up a partial de/validation, take no action unless you
37find a page partially validated.
38
39If put_page_type() is called without PTF_partial_set, and finds the
40page in a PGT_partial state anyway, then there's certainly been a
41misaccounting somewhere, and carrying on would almost certainly cause
42a security issue, so crash the host instead.
43
44In put_page_from_lNe, pass partial_flags on to _put_page_type().
45
46old_guest_table may be set either with a fully validated page (when
47using the "deferred put" pattern), or with a partially validated page
48(when a normal "de-validation" is interrupted, or when a validation
49fails part-way through due to invalid entries). Add a flag,
50old_guest_table_partial, to indicate which of these it is, and use
51that to pass the appropriate flag to _put_page_type().
52
53While here, delete stray trailing whitespace.
54
55This is part of XSA-299.
56
57Reported-by: George Dunlap <george.dunlap@citrix.com>
58Signed-off-by: George Dunlap <george.dunlap@citrix.com>
59Reviewed-by: Jan Beulich <jbeulich@suse.com>
60-----
61Appendix:
62
63Suppose page A, when interpreted as an l3 pagetable, contains all
64valid entries; and suppose A[x] points to page B, which when
65interpreted as an l2 pagetable, contains all valid entries.
66
67P1: PIN_L3_TABLE
68 A -> PGT_l3_table | 1 | valid
69 B -> PGT_l2_table | 1 | valid
70
71P1: UNPIN_TABLE
72 > Arrange to interrupt after B has been de-validated
73 B:
74 type_info -> PGT_l2_table | 0
75 A:
76 type_info -> PGT_l3_table | 1 | partial
77 nr_validated_entries -> (less than x)
78
79P2: mod_l4_entry to point to A
80 > Arrange for this to be interrupted while B is being validated
81 B:
82 type_info -> PGT_l2_table | 1 | partial
83 (nr_validated_entries &c set as appropriate)
84 A:
85 type_info -> PGT_l3_table | 1 | partial
86 nr_validated_entries -> x
87 partial_pte = 1
88
89P3: mod_l3_entry some other unrelated l3 to point to B:
90 B:
91 type_info -> PGT_l2_table | 1
92
93P1: Restart UNPIN_TABLE
94
95At this point, since A.nr_validated_entries == x and A.partial_pte !=
960, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping
97its type count to 0 while it's still being pointed to by some other l3.
98
99A similar issue arises with old_guest_table. Consider the following
100scenario:
101
102Suppose A is a page which, when interpreted as an l2, has valid entries
103until entry x, which is invalid.
104
105V1: PIN_L2_TABLE(A)
106 <Validate until we try to validate [x], get -EINVAL>
107 A -> PGT_l2_table | 1 | PGT_partial
108 V1 -> old_guest_table = A
109 <delayed>
110
111V2: PIN_L2_TABLE(A)
112 <Pick up where V1 left off, try to re-validate [x], get -EINVAL>
113 A -> PGT_l2_table | 1 | PGT_partial
114 V2 -> old_guest_table = A
115 <restart>
116 put_old_guest_table()
117 _put_page_type(A)
118 A -> PGT_l2_table | 0
119
120V1: <restart>
121 put_old_guest_table()
122 _put_page_type(A) # UNDERFLOW
123
124Indeed, it is possible to engineer things so that old_guest_table for
125every vcpu a guest has points to the same page.
126---
127 xen/arch/x86/domain.c | 6 +++
128 xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++-----
129 xen/include/asm-x86/domain.h | 4 +-
130 3 files changed, 95 insertions(+), 14 deletions(-)
131
132diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
133index 897124f05f..6074fa5947 100644
134--- a/xen/arch/x86/domain.c
135+++ b/xen/arch/x86/domain.c
136@@ -1075,9 +1075,15 @@ int arch_set_info_guest(
137 rc = -ERESTART;
138 /* Fallthrough */
139 case -ERESTART:
140+ /*
141+ * NB that we're putting the kernel-mode table
142+ * here, which we've already successfully
143+ * validated above; hence partial = false;
144+ */
145 v->arch.old_guest_ptpg = NULL;
146 v->arch.old_guest_table =
147 pagetable_get_page(v->arch.guest_table);
148+ v->arch.old_guest_table_partial = false;
149 v->arch.guest_table = pagetable_null();
150 break;
151 default:
152diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
153index 2ea32463a8..9ae71d864a 100644
154--- a/xen/arch/x86/mm.c
155+++ b/xen/arch/x86/mm.c
156@@ -1384,10 +1384,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
157 {
158 current->arch.old_guest_ptpg = ptpg;
159 current->arch.old_guest_table = pg;
160+ current->arch.old_guest_table_partial = false;
161 }
162 else
163 {
164- rc = _put_page_type(pg, PTF_preemptible, ptpg);
165+ rc = _put_page_type(pg, flags | PTF_preemptible, ptpg);
166 if ( likely(!rc) )
167 put_page(pg);
168 }
169@@ -1410,6 +1411,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
170 unsigned long mfn = l3e_get_pfn(l3e);
171 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
172
173+ ASSERT(!(flags & PTF_partial_set));
174 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
175 do {
176 put_data_page(mfn_to_page(_mfn(mfn)), writeable);
177@@ -1422,12 +1424,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
178
179 if ( flags & PTF_defer )
180 {
181+ ASSERT(!(flags & PTF_partial_set));
182 current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
183 current->arch.old_guest_table = pg;
184+ current->arch.old_guest_table_partial = false;
185 return 0;
186 }
187
188- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
189+ rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn)));
190 if ( likely(!rc) )
191 put_page(pg);
192
193@@ -1446,12 +1450,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
194
195 if ( flags & PTF_defer )
196 {
197+ ASSERT(!(flags & PTF_partial_set));
198 current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
199 current->arch.old_guest_table = pg;
200+ current->arch.old_guest_table_partial = false;
201 return 0;
202 }
203
204- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
205+ rc = _put_page_type(pg, flags | PTF_preemptible,
206+ mfn_to_page(_mfn(pfn)));
207 if ( likely(!rc) )
208 put_page(pg);
209 }
210@@ -1556,6 +1563,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
211
212 pl2e = map_domain_page(_mfn(pfn));
213
214+ /*
215+ * NB that alloc_l2_table will never set partial_pte on an l2; but
216+ * free_l2_table might if a linear_pagetable entry is interrupted
217+ * partway through de-validation. In that circumstance,
218+ * get_page_from_l2e() will always return -EINVAL; and we must
219+ * retain the type ref by doing the normal partial_flags tracking.
220+ */
221+
222 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
223 i++, partial_flags = 0 )
224 {
225@@ -1610,6 +1625,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
226 page->partial_flags = partial_flags;
227 current->arch.old_guest_ptpg = NULL;
228 current->arch.old_guest_table = page;
229+ current->arch.old_guest_table_partial = true;
230 }
231 }
232 if ( rc < 0 )
233@@ -1712,12 +1728,16 @@ static int alloc_l3_table(struct page_info *page)
234 * builds.
235 */
236 if ( current->arch.old_guest_table == l3e_get_page(l3e) )
237+ {
238+ ASSERT(current->arch.old_guest_table_partial);
239 page->partial_flags = PTF_partial_set;
240+ }
241 else
242 ASSERT_UNREACHABLE();
243 }
244 current->arch.old_guest_ptpg = NULL;
245 current->arch.old_guest_table = page;
246+ current->arch.old_guest_table_partial = true;
247 }
248 while ( i-- > 0 )
249 pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
250@@ -1891,12 +1911,16 @@ static int alloc_l4_table(struct page_info *page)
251 * builds.
252 */
253 if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) )
254+ {
255+ ASSERT(current->arch.old_guest_table_partial);
256 page->partial_flags = PTF_partial_set;
257+ }
258 else
259 ASSERT_UNREACHABLE();
260 }
261 current->arch.old_guest_ptpg = NULL;
262 current->arch.old_guest_table = page;
263+ current->arch.old_guest_table_partial = true;
264 }
265 }
266 }
267@@ -2760,6 +2784,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
268 x = y;
269 nx = x - 1;
270
271+ /*
272+ * Is this expected to do a full reference drop, or only
273+ * cleanup partial validation / devalidation?
274+ *
275+ * If the former, the caller must hold a "full" type ref;
276+ * which means the page must be validated. If the page is
277+ * *not* fully validated, continuing would almost certainly
278+ * open up a security hole. An exception to this is during
279+ * domain destruction, where PGT_validated can be dropped
280+ * without dropping a type ref.
281+ *
282+ * If the latter, do nothing unless type PGT_partial is set.
283+ * If it is set, the type count must be 1.
284+ */
285+ if ( !(flags & PTF_partial_set) )
286+ BUG_ON((x & PGT_partial) ||
287+ !((x & PGT_validated) || page_get_owner(page)->is_dying));
288+ else if ( !(x & PGT_partial) )
289+ return 0;
290+ else
291+ BUG_ON((x & PGT_count_mask) != 1);
292+
293 ASSERT((x & PGT_count_mask) != 0);
294
295 if ( unlikely((nx & PGT_count_mask) == 0) )
296@@ -3012,17 +3058,34 @@ int put_old_guest_table(struct vcpu *v)
297 if ( !v->arch.old_guest_table )
298 return 0;
299
300- switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
301- v->arch.old_guest_ptpg) )
302+ rc = _put_page_type(v->arch.old_guest_table,
303+ PTF_preemptible |
304+ ( v->arch.old_guest_table_partial ?
305+ PTF_partial_set : 0 ),
306+ v->arch.old_guest_ptpg);
307+
308+ if ( rc == -ERESTART || rc == -EINTR )
309 {
310- case -EINTR:
311- case -ERESTART:
312+ v->arch.old_guest_table_partial = (rc == -ERESTART);
313 return -ERESTART;
314- case 0:
315- put_page(v->arch.old_guest_table);
316 }
317
318+ /*
319+ * It shouldn't be possible for _put_page_type() to return
320+ * anything else at the moment; but if it does happen in
321+ * production, leaking the type ref is probably the best thing to
322+ * do. Either way, drop the general ref held by old_guest_table.
323+ */
324+ ASSERT(rc == 0);
325+
326+ put_page(v->arch.old_guest_table);
327 v->arch.old_guest_table = NULL;
328+ v->arch.old_guest_ptpg = NULL;
329+ /*
330+ * Safest default if someone sets old_guest_table without
331+ * explicitly setting old_guest_table_partial.
332+ */
333+ v->arch.old_guest_table_partial = true;
334
335 return rc;
336 }
337@@ -3175,11 +3238,11 @@ int new_guest_cr3(mfn_t mfn)
338 switch ( rc = put_page_and_type_preemptible(page) )
339 {
340 case -EINTR:
341- rc = -ERESTART;
342- /* fallthrough */
343 case -ERESTART:
344 curr->arch.old_guest_ptpg = NULL;
345 curr->arch.old_guest_table = page;
346+ curr->arch.old_guest_table_partial = (rc == -ERESTART);
347+ rc = -ERESTART;
348 break;
349 default:
350 BUG_ON(rc);
351@@ -3448,6 +3511,7 @@ long do_mmuext_op(
352 {
353 curr->arch.old_guest_ptpg = NULL;
354 curr->arch.old_guest_table = page;
355+ curr->arch.old_guest_table_partial = false;
356 }
357 }
358 }
359@@ -3482,6 +3546,11 @@ long do_mmuext_op(
360 case -ERESTART:
361 curr->arch.old_guest_ptpg = NULL;
362 curr->arch.old_guest_table = page;
363+ /*
364+ * EINTR means we still hold the type ref; ERESTART
365+ * means PGT_partial holds the type ref
366+ */
367+ curr->arch.old_guest_table_partial = (rc == -ERESTART);
368 rc = 0;
369 break;
370 default:
371@@ -3550,11 +3619,15 @@ long do_mmuext_op(
372 switch ( rc = put_page_and_type_preemptible(page) )
373 {
374 case -EINTR:
375- rc = -ERESTART;
376- /* fallthrough */
377 case -ERESTART:
378 curr->arch.old_guest_ptpg = NULL;
379 curr->arch.old_guest_table = page;
380+ /*
381+ * EINTR means we still hold the type ref;
382+ * ERESTART means PGT_partial holds the ref
383+ */
384+ curr->arch.old_guest_table_partial = (rc == -ERESTART);
385+ rc = -ERESTART;
386 break;
387 default:
388 BUG_ON(rc);
389diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
390index aec65630d9..5afaf6b9de 100644
391--- a/xen/include/asm-x86/domain.h
392+++ b/xen/include/asm-x86/domain.h
393@@ -311,7 +311,7 @@ struct arch_domain
394
395 struct paging_domain paging;
396 struct p2m_domain *p2m;
397- /* To enforce lock ordering in the pod code wrt the
398+ /* To enforce lock ordering in the pod code wrt the
399 * page_alloc lock */
400 int page_alloc_unlock_level;
401
402@@ -550,6 +550,8 @@ struct arch_vcpu
403 struct page_info *old_guest_table; /* partially destructed pagetable */
404 struct page_info *old_guest_ptpg; /* containing page table of the */
405 /* former, if any */
406+ bool old_guest_table_partial; /* Are we dropping a type ref, or just
407+ * finishing up a partial de-validation? */
408 /* guest_table holds a ref to the page, and also a type-count unless
409 * shadow refcounts are in use */
410 pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */
411--
4122.23.0
413
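The key distinction introduced above is between dropping a type reference the caller really holds and merely finishing off a partial (de)validation. A simplified model of that check follows; it is not the real _put_page_type() (for instance, the domain-is-dying exception is omitted), and the structure, flag and the abort() stand-in for BUG_ON() are invented for the sketch.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct page {
    int  type_count;
    bool partial;       /* stand-in for PGT_partial */
    bool validated;     /* stand-in for PGT_validated */
};

/* cleanup_partial == false: caller claims to hold a real type ref.
 * cleanup_partial == true : caller is only finishing a partial (de)validation. */
static int put_type(struct page *pg, bool cleanup_partial)
{
    if (!cleanup_partial) {
        /* A held ref implies a fully validated, non-partial page. */
        if (pg->partial || !pg->validated)
            abort();                      /* misaccounting: safer to crash */
    } else {
        if (!pg->partial)
            return 0;                     /* someone else finished it; nothing to do */
        if (pg->type_count != 1)
            abort();                      /* partial pages hold exactly one type ref */
    }

    pg->type_count--;
    if (cleanup_partial)
        pg->partial = false;
    return 0;
}

int main(void)
{
    struct page full    = { .type_count = 2, .validated = true };
    struct page partial = { .type_count = 1, .partial = true };

    put_type(&full, false);      /* normal ref drop */
    put_type(&partial, true);    /* finish off a partial de-validation */
    printf("full=%d partial=%d\n", full.type_count, partial.type_count);  /* full=1 partial=0 */
    return 0;
}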
diff --git a/main/xen/xsa301-4.11-1.patch b/main/xen/xsa301-4.11-1.patch
new file mode 100644
index 0000000000..4d528fe13b
--- /dev/null
+++ b/main/xen/xsa301-4.11-1.patch
@@ -0,0 +1,80 @@
1From 21dfe8f707febd62869d4ebbaa155736870bebec Mon Sep 17 00:00:00 2001
2From: Julien Grall <julien.grall@arm.com>
3Date: Wed, 2 Oct 2019 12:06:50 +0100
4Subject: [PATCH 1/3] xen/arm: p2m: Avoid aliasing guest physical frame
5
6The P2M helpers implementation is quite lax and will end up ignoring
7the unused top bits of a guest physical frame.
8
9This effectively means that p2m_set_entry() will create a mapping for a
10different frame (it is always equal to gfn & (mask unused bits)). Yet
11p2m->max_mapped_gfn will be updated using the original frame.
12
13At the moment, p2m_get_entry() and p2m_resolve_translation_fault()
14assume that p2m_get_root_pointer() will always return a non-NULL pointer
15when the GFN is smaller than p2m->max_mapped_gfn.
16
17Unfortunately, because of the aliasing described above, it would be
18possible to set p2m->max_mapped_gfn high enough so it covers a frame that
19would lead p2m_get_root_pointer() to return NULL.
20
21As we don't sanity check the guest physical frame provided by a guest, a
22malicious guest could craft a series of hypercalls that will hit the
23BUG_ON() and therefore DoS Xen.
24
25To prevent aliasing, the function p2m_get_root_pointer() is now reworked
26to return NULL if any of the unused top bits are not zero. The caller
27can then decide on the appropriate action to take. Since the two paths
28(i.e. P2M_ROOT_PAGES == 1 and P2M_ROOT_PAGES != 1) are now very
29similar, take the opportunity to consolidate them, making the code a
30bit simpler.
31
32With this change, p2m_set_entry() will not try to insert a mapping as
33the root pointer is invalid.
34
35Note that root_table is now switched to unsigned long, as unsigned int is
36not enough to hold part of a GFN.
37
38This is part of XSA-301.
39
40Reported-by: Julien Grall <Julien.Grall@arm.com>
41Signed-off-by: Julien Grall <julien.grall@arm.com>
42Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
43---
44 xen/arch/arm/p2m.c | 17 +++++------------
45 1 file changed, 5 insertions(+), 12 deletions(-)
46
47diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
48index d43c3aa896..3967ee7306 100644
49--- a/xen/arch/arm/p2m.c
50+++ b/xen/arch/arm/p2m.c
51@@ -177,21 +177,14 @@ void p2m_tlb_flush_sync(struct p2m_domain *p2m)
52 static lpae_t *p2m_get_root_pointer(struct p2m_domain *p2m,
53 gfn_t gfn)
54 {
55- unsigned int root_table;
56-
57- if ( P2M_ROOT_PAGES == 1 )
58- return __map_domain_page(p2m->root);
59+ unsigned long root_table;
60
61 /*
62- * Concatenated root-level tables. The table number will be the
63- * offset at the previous level. It is not possible to
64- * concatenate a level-0 root.
65+ * While the root table index is the offset from the previous level,
66+ * we can't use (P2M_ROOT_LEVEL - 1) because the root level might be
67+ * 0. Yet we still want to check if all the unused bits are zeroed.
68 */
69- ASSERT(P2M_ROOT_LEVEL > 0);
70-
71- root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL - 1]);
72- root_table &= LPAE_ENTRY_MASK;
73-
74+ root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL] + LPAE_SHIFT);
75 if ( root_table >= P2M_ROOT_PAGES )
76 return NULL;
77
78--
792.11.0
80
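The aliasing described above comes from masking the root-table index instead of bounds-checking it. The sketch below is illustrative only, not code from the patch: all constants and helper names are invented, and it simply contrasts the masked lookup with the checked one that now returns a failure the caller can handle.

#include <stdio.h>
#include <stdint.h>

#define ENTRY_SHIFT      9                      /* 512 entries per level, LPAE-style */
#define ROOT_LEVEL_ORDER (2 * ENTRY_SHIFT)      /* GFN bits resolved below the root */
#define ROOT_PAGES       2                      /* concatenated root pages */
#define ENTRY_MASK       ((1UL << ENTRY_SHIFT) - 1)

/* Old behaviour: mask the index, silently aliasing large GFNs. */
static unsigned long root_index_masked(uint64_t gfn)
{
    return (unsigned long)((gfn >> ROOT_LEVEL_ORDER) & ENTRY_MASK);
}

/* Fixed behaviour: reject a GFN whose unused top bits are non-zero. */
static long root_index_checked(uint64_t gfn)
{
    uint64_t idx = gfn >> ROOT_LEVEL_ORDER;

    return idx >= ROOT_PAGES ? -1 : (long)idx;
}

int main(void)
{
    uint64_t huge = 1ULL << 40;                 /* far above what the root can map */

    printf("masked:  %lu\n", root_index_masked(huge));   /* aliases back into range */
    printf("checked: %ld\n", root_index_checked(huge));  /* -1: caller decides what to do */
    return 0;
}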
diff --git a/main/xen/xsa301-4.11-2.patch b/main/xen/xsa301-4.11-2.patch
new file mode 100644
index 0000000000..33b6150370
--- /dev/null
+++ b/main/xen/xsa301-4.11-2.patch
@@ -0,0 +1,92 @@
1From 4426d993b7ee0966fb39531dc5a269ce8493ca97 Mon Sep 17 00:00:00 2001
2From: Julien Grall <julien.grall@arm.com>
3Date: Wed, 2 Oct 2019 12:35:59 +0100
4Subject: [PATCH 2/3] xen/arm: p2m: Avoid off-by-one check on
5 p2m->max_mapped_gfn
6
7The code base is using the field p2m->max_mapped_gfn inconsistently.
8Some of the users expect that p2m->max_mapped_gfn contains the highest
9mapped GFN, while others expect highest + 1.
10
11p2m->max_mapped_gfn is set as highest + 1; because of that, the sanity
12check on the GFN in p2m_resolve_translation_fault() and
13p2m_get_entry() can be bypassed when GFN == p2m->max_mapped_gfn.
14
15p2m_get_root_pointer(p2m->max_mapped_gfn) may return NULL if it is
16outside of the supported address range and therefore the BUG_ON() could be
17hit.
18
19The current value hold in p2m->max_mapped_gfn is inconsistent with the
20expectation of the common code (see domain_get_maximum_gpfn()) and also
21the documentation of the field.
22
23Rather than changing the check in p2m_resolve_translation_fault() and
24p2m_get_entry(), p2m->max_mapped_gfn now contains the highest
25mapped GFN, and the callers assuming "highest + 1" are now adjusted.
26
27Take the opportunity to use 1UL rather than 1 as page_order could
28theoretically be big enough to overflow a 32-bit integer.
29
30Lastly, update the documentation of the field max_mapped_gfn to reflect how it
31is computed.
32
33This is part of XSA-301.
34
35Reported-by: Julien Grall <Julien.Grall@arm.com>
36Signed-off-by: Julien Grall <julien.grall@arm.com>
37Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
38---
39 xen/arch/arm/p2m.c | 6 +++---
40 xen/include/asm-arm/p2m.h | 5 +----
41 2 files changed, 4 insertions(+), 7 deletions(-)
42
43diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
44index 3967ee7306..c7e049901d 100644
45--- a/xen/arch/arm/p2m.c
46+++ b/xen/arch/arm/p2m.c
47@@ -931,7 +931,7 @@ static int __p2m_set_entry(struct p2m_domain *p2m,
48 p2m_write_pte(entry, pte, p2m->clean_pte);
49
50 p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn,
51- gfn_add(sgfn, 1 << page_order));
52+ gfn_add(sgfn, (1UL << page_order) - 1));
53 p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn);
54 }
55
56@@ -1291,7 +1291,7 @@ int relinquish_p2m_mapping(struct domain *d)
57 p2m_write_lock(p2m);
58
59 start = p2m->lowest_mapped_gfn;
60- end = p2m->max_mapped_gfn;
61+ end = gfn_add(p2m->max_mapped_gfn, 1);
62
63 for ( ; gfn_x(start) < gfn_x(end);
64 start = gfn_next_boundary(start, order) )
65@@ -1356,7 +1356,7 @@ int p2m_cache_flush(struct domain *d, gfn_t start, unsigned long nr)
66 p2m_read_lock(p2m);
67
68 start = gfn_max(start, p2m->lowest_mapped_gfn);
69- end = gfn_min(end, p2m->max_mapped_gfn);
70+ end = gfn_min(end, gfn_add(p2m->max_mapped_gfn, 1));
71
72 for ( ; gfn_x(start) < gfn_x(end); start = next_gfn )
73 {
74diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
75index 8823707c17..7f1f7e9109 100644
76--- a/xen/include/asm-arm/p2m.h
77+++ b/xen/include/asm-arm/p2m.h
78@@ -38,10 +38,7 @@ struct p2m_domain {
79 /* Current Translation Table Base Register for the p2m */
80 uint64_t vttbr;
81
82- /*
83- * Highest guest frame that's ever been mapped in the p2m
84- * Only takes into account ram and foreign mapping
85- */
86+ /* Highest guest frame that's ever been mapped in the p2m */
87 gfn_t max_mapped_gfn;
88
89 /*
90--
912.11.0
92
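As a side illustration (not part of the patch file above), the inclusive-bound
convention the patch adopts can be sketched in plain C, using bare integers
instead of Xen's gfn_t wrappers; all names below are illustrative only:

    #include <assert.h>
    #include <stdint.h>

    /* Illustrative only: track the highest mapped frame inclusively, as the
     * patch above now does for p2m->max_mapped_gfn. */
    static uint64_t max_mapped_gfn;

    static void record_mapping(uint64_t sgfn, unsigned int page_order)
    {
        /* Last frame covered by the mapping, not one past it. */
        uint64_t last = sgfn + (UINT64_C(1) << page_order) - 1;

        if ( last > max_mapped_gfn )
            max_mapped_gfn = last;
    }

    int main(void)
    {
        record_mapping(0x1000, 9);           /* 2MB superpage at gfn 0x1000 */
        assert(max_mapped_gfn == 0x11ff);    /* inclusive upper bound */

        /* Callers iterating over the mapped range need an exclusive end,
         * hence the "+ 1" added in relinquish_p2m_mapping() and
         * p2m_cache_flush() above. */
        assert(max_mapped_gfn + 1 == 0x1200);

        return 0;
    }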
diff --git a/main/xen/xsa301-4.11-3.patch b/main/xen/xsa301-4.11-3.patch
new file mode 100644
index 0000000000..55a701a5c7
--- /dev/null
+++ b/main/xen/xsa301-4.11-3.patch
@@ -0,0 +1,49 @@
1From 61c73af08b4ede1fc8cfd2cf72661e6c7cfdbeaa Mon Sep 17 00:00:00 2001
2From: Julien Grall <julien.grall@arm.com>
3Date: Wed, 2 Oct 2019 10:55:07 +0100
4Subject: [PATCH 3/3] xen/arm: p2m: Don't check the return of
5 p2m_get_root_pointer() with BUG_ON()
6
7It turns out that the BUG_ON() was actually reachable with well-crafted
8hypercalls. The BUG_ON() is here to catch logic errors, so
9crashing Xen is a bit over the top.
10
11While all the holes should now be fixed, it would be better to downgrade
12the BUG_ON() to something less fatal to prevent any more DoS.
13
14The BUG_ON() in p2m_get_entry() is now replaced by ASSERT_UNREACHABLE()
15to catch mistakes in debug builds and return INVALID_MFN in production
16builds. The interface also requires page_order to be set to give an idea of
17the size of the "hole". So 'level' is now set so that we report a hole the
18size of an entry of the root page-table. This stays in line with what happens
19when the GFN is higher than p2m->max_mapped_gfn.
20
21This is part of XSA-301.
22
23Reported-by: Julien Grall <Julien.Grall@arm.com>
24Signed-off-by: Julien Grall <julien.grall@arm.com>
25---
26 xen/arch/arm/p2m.c | 7 ++++++-
27 1 file changed, 6 insertions(+), 1 deletion(-)
28
29diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
30index c7e049901d..af3515df42 100644
31--- a/xen/arch/arm/p2m.c
32+++ b/xen/arch/arm/p2m.c
33@@ -318,7 +318,12 @@ mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn,
34 * the table should always be non-NULL because the gfn is below
35 * p2m->max_mapped_gfn and the root table pages are always present.
36 */
37- BUG_ON(table == NULL);
38+ if ( !table )
39+ {
40+ ASSERT_UNREACHABLE();
41+ level = P2M_ROOT_LEVEL;
42+ goto out;
43+ }
44
45 for ( level = P2M_ROOT_LEVEL; level < 3; level++ )
46 {
47--
482.11.0
49
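As a side illustration (not part of the patch file above), the "assert in
debug builds, degrade gracefully in release builds" pattern that replaces the
BUG_ON() can be sketched in plain C; lookup() and its table are hypothetical
stand-ins, and the macro below only approximates Xen's ASSERT_UNREACHABLE():

    #include <assert.h>
    #include <stdio.h>

    /* Rough stand-in for Xen's ASSERT_UNREACHABLE(): loud in debug builds,
     * a no-op when NDEBUG is defined. */
    #define ASSERT_UNREACHABLE() assert(!"unreachable")

    /* Hypothetical lookup helper following the same pattern as the patched
     * p2m_get_entry(): catch the logic error during development, but fail
     * gracefully instead of crashing the host in production. */
    static int lookup(const int *table, int *out)
    {
        if ( !table )
        {
            ASSERT_UNREACHABLE();
            *out = -1;          /* analogous to returning INVALID_MFN */
            return -1;
        }

        *out = *table;
        return 0;
    }

    int main(void)
    {
        int v;

        /* With NDEBUG defined this prints -1; without it the assertion fires. */
        printf("lookup returned %d\n", lookup(NULL, &v));
        return 0;
    }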
diff --git a/main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch b/main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch
new file mode 100644
index 0000000000..0b93de18ac
--- /dev/null
+++ b/main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch
@@ -0,0 +1,37 @@
1From 2bcbf2843250888b720bfea188ac9842c847f388 Mon Sep 17 00:00:00 2001
2From: Jan Beulich <jbeulich@suse.com>
3Date: Wed, 2 Oct 2019 13:36:59 +0200
4Subject: [PATCH 1/2] IOMMU: add missing HVM check
5MIME-Version: 1.0
6Content-Type: text/plain; charset=UTF-8
7Content-Transfer-Encoding: 8bit
8
9Fix an unguarded d->arch.hvm access in assign_device().
10
11Signed-off-by: Jan Beulich <jbeulich@suse.com>
12Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
13Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
14
15(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6)
16Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
17---
18 xen/drivers/passthrough/pci.c | 3 ++-
19 1 file changed, 2 insertions(+), 1 deletion(-)
20
21diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
22index e021c7a317..e1668a1968 100644
23--- a/xen/drivers/passthrough/pci.c
24+++ b/xen/drivers/passthrough/pci.c
25@@ -1386,7 +1386,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
26 /* Prevent device assign if mem paging or mem sharing have been
27 * enabled for this domain */
28 if ( unlikely(!need_iommu(d) &&
29- (d->arch.hvm_domain.mem_sharing_enabled ||
30+ ((is_hvm_domain(d) &&
31+ d->arch.hvm_domain.mem_sharing_enabled) ||
32 vm_event_check_ring(d->vm_event_paging) ||
33 p2m_get_hostp2m(d)->global_logdirty)) )
34 return -EXDEV;
35--
362.11.0
37
diff --git a/main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch b/main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch
new file mode 100644
index 0000000000..94eba850a4
--- /dev/null
+++ b/main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch
@@ -0,0 +1,498 @@
1From 02dd07e53b904570e0320d17d77022ddbc4e8225 Mon Sep 17 00:00:00 2001
2From: Paul Durrant <paul.durrant@citrix.com>
3Date: Mon, 14 Oct 2019 17:52:59 +0100
4Subject: [PATCH 2/2] passthrough: quarantine PCI devices
5
6When a PCI device is assigned to an untrusted domain, it is possible for
7that domain to program the device to DMA to an arbitrary address. The
8IOMMU is used to protect the host from malicious DMA by making sure that
9the device addresses can only target memory assigned to the guest. However,
10when the guest domain is torn down the device is assigned back to dom0,
11thus allowing any in-flight DMA to potentially target critical host data.
12
13This patch introduces a 'quarantine' for PCI devices using dom_io. When
14the toolstack makes a device assignable (by binding it to pciback), it
15will now also assign it to DOMID_IO and the device will only be assigned
16back to dom0 when the device is made unassignable again. Whilst the device is
17assignable, it will only ever transfer between dom_io and guest domains.
18dom_io is actually only used as a sentinel domain for quarantining purposes;
19it is not configured with any IOMMU mappings. Assignment to dom_io simply
20means that the device's initiator (requestor) identifier is not present in
21the IOMMU's device table and thus any DMA transactions issued will be
22terminated with a fault condition.
23
24In addition, a fix to assignment handling is made for VT-d. Failure
25during the assignment step should not lead to a device still being
26associated with its prior owner. Hand the device to DomIO temporarily,
27until the assignment step has completed successfully. Remove the PI
28hooks from the source domain earlier as well.
29
30Failure of the recovery reassign_device_ownership() must not go unnoticed:
31there may, e.g., still be leftover RMRR mappings in the domain to which
32assignment has failed, and hence we can't allow that domain to continue
33executing.
34
35NOTE: This patch also includes one printk() cleanup; the
36 "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(),
37 since similar printk()-s elsewhere also don't log such a tag.
38
39This is XSA-302.
40
41Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
42Signed-off-by: Jan Beulich <jbeulich@suse.com>
43Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
44---
45 tools/libxl/libxl_pci.c | 25 +++++++++++-
46 xen/arch/x86/mm.c | 2 +
47 xen/common/domctl.c | 14 ++++++-
48 xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++-
49 xen/drivers/passthrough/iommu.c | 9 +++++
50 xen/drivers/passthrough/pci.c | 59 ++++++++++++++++++++++-------
51 xen/drivers/passthrough/vtd/iommu.c | 40 ++++++++++++++++---
52 xen/include/xen/pci.h | 3 ++
53 8 files changed, 138 insertions(+), 24 deletions(-)
54
55diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
56index 88a55ce8bd..1b5c44f3e7 100644
57--- a/tools/libxl/libxl_pci.c
58+++ b/tools/libxl/libxl_pci.c
59@@ -749,6 +749,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
60 libxl_device_pci *pcidev,
61 int rebind)
62 {
63+ libxl_ctx *ctx = libxl__gc_owner(gc);
64 unsigned dom, bus, dev, func;
65 char *spath, *driver_path = NULL;
66 int rc;
67@@ -774,7 +775,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
68 }
69 if ( rc ) {
70 LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func);
71- return 0;
72+ goto quarantine;
73 }
74
75 /* Check to see if there's already a driver that we need to unbind from */
76@@ -805,6 +806,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
77 return ERROR_FAIL;
78 }
79
80+quarantine:
81+ /*
82+ * DOMID_IO is just a sentinel domain, without any actual mappings,
83+ * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being
84+ * unnecessarily denied.
85+ */
86+ rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev),
87+ XEN_DOMCTL_DEV_RDM_RELAXED);
88+ if ( rc < 0 ) {
89+ LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func);
90+ return ERROR_FAIL;
91+ }
92+
93 return 0;
94 }
95
96@@ -812,9 +826,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc,
97 libxl_device_pci *pcidev,
98 int rebind)
99 {
100+ libxl_ctx *ctx = libxl__gc_owner(gc);
101 int rc;
102 char *driver_path;
103
104+ /* De-quarantine */
105+ rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev));
106+ if ( rc < 0 ) {
107+ LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus,
108+ pcidev->dev, pcidev->func);
109+ return ERROR_FAIL;
110+ }
111+
112 /* Unbind from pciback */
113 if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) {
114 return ERROR_FAIL;
115diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
116index ce2c082caf..0e42497cf7 100644
117--- a/xen/arch/x86/mm.c
118+++ b/xen/arch/x86/mm.c
119@@ -295,9 +295,11 @@ void __init arch_init_memory(void)
120 * Initialise our DOMID_IO domain.
121 * This domain owns I/O pages that are within the range of the page_info
122 * array. Mappings occur at the priv of the caller.
123+ * Quarantined PCI devices will be associated with this domain.
124 */
125 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0, NULL);
126 BUG_ON(IS_ERR(dom_io));
127+ INIT_LIST_HEAD(&dom_io->arch.pdev_list);
128
129 /*
130 * Initialise our COW domain.
131diff --git a/xen/common/domctl.c b/xen/common/domctl.c
132index 3c6fa4ec67..a70f4b46f8 100644
133--- a/xen/common/domctl.c
134+++ b/xen/common/domctl.c
135@@ -392,6 +392,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
136
137 switch ( op->cmd )
138 {
139+ case XEN_DOMCTL_assign_device:
140+ case XEN_DOMCTL_deassign_device:
141+ if ( op->domain == DOMID_IO )
142+ {
143+ d = dom_io;
144+ break;
145+ }
146+ else if ( op->domain == DOMID_INVALID )
147+ return -ESRCH;
148+ /* fall through */
149 case XEN_DOMCTL_test_assign_device:
150 if ( op->domain == DOMID_INVALID )
151 {
152@@ -413,7 +423,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
153
154 if ( !domctl_lock_acquire() )
155 {
156- if ( d )
157+ if ( d && d != dom_io )
158 rcu_unlock_domain(d);
159 return hypercall_create_continuation(
160 __HYPERVISOR_domctl, "h", u_domctl);
161@@ -1163,7 +1173,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
162 domctl_lock_release();
163
164 domctl_out_unlock_domonly:
165- if ( d )
166+ if ( d && d != dom_io )
167 rcu_unlock_domain(d);
168
169 if ( copyback && __copy_to_guest(u_domctl, op, 1) )
170diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
171index 12d2695b89..ec8baae717 100644
172--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
173+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
174@@ -118,6 +118,10 @@ static void amd_iommu_setup_domain_device(
175 u8 bus = pdev->bus;
176 const struct domain_iommu *hd = dom_iommu(domain);
177
178+ /* dom_io is used as a sentinel for quarantined devices */
179+ if ( domain == dom_io )
180+ return;
181+
182 BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
183 !iommu->dev_table.buffer );
184
185@@ -305,6 +309,10 @@ void amd_iommu_disable_domain_device(struct domain *domain,
186 int req_id;
187 u8 bus = pdev->bus;
188
189+ /* dom_io is used as a sentinel for quarantined devices */
190+ if ( domain == dom_io )
191+ return;
192+
193 BUG_ON ( iommu->dev_table.buffer == NULL );
194 req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
195 dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
196@@ -391,7 +399,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn,
197 ivrs_mappings[req_id].read_permission);
198 }
199
200- return reassign_device(hardware_domain, d, devfn, pdev);
201+ return reassign_device(pdev->domain, d, devfn, pdev);
202 }
203
204 static void deallocate_next_page_table(struct page_info *pg, int level)
205diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
206index b5f8044439..ad2ce8f39b 100644
207--- a/xen/drivers/passthrough/iommu.c
208+++ b/xen/drivers/passthrough/iommu.c
209@@ -219,6 +219,9 @@ void iommu_teardown(struct domain *d)
210 {
211 const struct domain_iommu *hd = dom_iommu(d);
212
213+ if ( d == dom_io )
214+ return;
215+
216 d->need_iommu = 0;
217 hd->platform_ops->teardown(d);
218 tasklet_schedule(&iommu_pt_cleanup_tasklet);
219@@ -229,6 +232,9 @@ int iommu_construct(struct domain *d)
220 if ( need_iommu(d) > 0 )
221 return 0;
222
223+ if ( d == dom_io )
224+ return 0;
225+
226 if ( !iommu_use_hap_pt(d) )
227 {
228 int rc;
229@@ -404,6 +410,9 @@ int __init iommu_setup(void)
230 printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
231 if ( iommu_enabled )
232 {
233+ if ( iommu_domain_init(dom_io) )
234+ panic("Could not set up quarantine\n");
235+
236 printk(" - Dom0 mode: %s\n",
237 iommu_passthrough ? "Passthrough" :
238 iommu_dom0_strict ? "Strict" : "Relaxed");
239diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
240index e1668a1968..6b2e9d2896 100644
241--- a/xen/drivers/passthrough/pci.c
242+++ b/xen/drivers/passthrough/pci.c
243@@ -1359,19 +1359,29 @@ static int iommu_remove_device(struct pci_dev *pdev)
244 return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev));
245 }
246
247-/*
248- * If the device isn't owned by the hardware domain, it means it already
249- * has been assigned to other domain, or it doesn't exist.
250- */
251 static int device_assigned(u16 seg, u8 bus, u8 devfn)
252 {
253 struct pci_dev *pdev;
254+ int rc = 0;
255
256 pcidevs_lock();
257- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
258+
259+ pdev = pci_get_pdev(seg, bus, devfn);
260+
261+ if ( !pdev )
262+ rc = -ENODEV;
263+ /*
264+ * If the device exists and it is not owned by either the hardware
265+ * domain or dom_io then it must be assigned to a guest, or be
266+ * hidden (owned by dom_xen).
267+ */
268+ else if ( pdev->domain != hardware_domain &&
269+ pdev->domain != dom_io )
270+ rc = -EBUSY;
271+
272 pcidevs_unlock();
273
274- return pdev ? 0 : -EBUSY;
275+ return rc;
276 }
277
278 static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
279@@ -1385,7 +1395,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
280
281 /* Prevent device assign if mem paging or mem sharing have been
282 * enabled for this domain */
283- if ( unlikely(!need_iommu(d) &&
284+ if ( d != dom_io &&
285+ unlikely(!need_iommu(d) &&
286 ((is_hvm_domain(d) &&
287 d->arch.hvm_domain.mem_sharing_enabled) ||
288 vm_event_check_ring(d->vm_event_paging) ||
289@@ -1402,12 +1413,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
290 return rc;
291 }
292
293- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
294+ pdev = pci_get_pdev(seg, bus, devfn);
295+
296+ rc = -ENODEV;
297 if ( !pdev )
298- {
299- rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV;
300 goto done;
301- }
302+
303+ rc = 0;
304+ if ( d == pdev->domain )
305+ goto done;
306+
307+ rc = -EBUSY;
308+ if ( pdev->domain != hardware_domain &&
309+ pdev->domain != dom_io )
310+ goto done;
311
312 if ( pdev->msix )
313 msixtbl_init(d);
314@@ -1430,6 +1449,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
315 }
316
317 done:
318+ /* The device is assigned to dom_io so mark it as quarantined */
319+ if ( !rc && d == dom_io )
320+ pdev->quarantine = true;
321+
322 if ( !has_arch_pdevs(d) && need_iommu(d) )
323 iommu_teardown(d);
324 pcidevs_unlock();
325@@ -1442,6 +1465,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
326 {
327 const struct domain_iommu *hd = dom_iommu(d);
328 struct pci_dev *pdev = NULL;
329+ struct domain *target;
330 int ret = 0;
331
332 if ( !iommu_enabled || !hd->platform_ops )
333@@ -1452,12 +1476,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
334 if ( !pdev )
335 return -ENODEV;
336
337+ /* De-assignment from dom_io should de-quarantine the device */
338+ target = (pdev->quarantine && pdev->domain != dom_io) ?
339+ dom_io : hardware_domain;
340+
341 while ( pdev->phantom_stride )
342 {
343 devfn += pdev->phantom_stride;
344 if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
345 break;
346- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
347+ ret = hd->platform_ops->reassign_device(d, target, devfn,
348 pci_to_dev(pdev));
349 if ( !ret )
350 continue;
351@@ -1468,7 +1496,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
352 }
353
354 devfn = pdev->devfn;
355- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
356+ ret = hd->platform_ops->reassign_device(d, target, devfn,
357 pci_to_dev(pdev));
358 if ( ret )
359 {
360@@ -1478,6 +1506,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
361 return ret;
362 }
363
364+ if ( pdev->domain == hardware_domain )
365+ pdev->quarantine = false;
366+
367 pdev->fault.count = 0;
368
369 if ( !has_arch_pdevs(d) && need_iommu(d) )
370@@ -1656,7 +1687,7 @@ int iommu_do_pci_domctl(
371 ret = hypercall_create_continuation(__HYPERVISOR_domctl,
372 "h", u_domctl);
373 else if ( ret )
374- printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
375+ printk(XENLOG_G_ERR
376 "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
377 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
378 d->domain_id, ret);
379diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
380index 481efef2b0..1d16127d8f 100644
381--- a/xen/drivers/passthrough/vtd/iommu.c
382+++ b/xen/drivers/passthrough/vtd/iommu.c
383@@ -1332,6 +1332,10 @@ int domain_context_mapping_one(
384 int agaw, rc, ret;
385 bool_t flush_dev_iotlb;
386
387+ /* dom_io is used as a sentinel for quarantined devices */
388+ if ( domain == dom_io )
389+ return 0;
390+
391 ASSERT(pcidevs_locked());
392 spin_lock(&iommu->lock);
393 maddr = bus_to_context_maddr(iommu, bus);
394@@ -1567,6 +1571,10 @@ int domain_context_unmap_one(
395 int iommu_domid, rc, ret;
396 bool_t flush_dev_iotlb;
397
398+ /* dom_io is used as a sentinel for quarantined devices */
399+ if ( domain == dom_io )
400+ return 0;
401+
402 ASSERT(pcidevs_locked());
403 spin_lock(&iommu->lock);
404
405@@ -1699,6 +1707,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
406 goto out;
407 }
408
409+ /* dom_io is used as a sentinel for quarantined devices */
410+ if ( domain == dom_io )
411+ goto out;
412+
413 /*
414 * if no other devices under the same iommu owned by this domain,
415 * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp
416@@ -2383,6 +2395,15 @@ static int reassign_device_ownership(
417 if ( ret )
418 return ret;
419
420+ if ( devfn == pdev->devfn )
421+ {
422+ list_move(&pdev->domain_list, &dom_io->arch.pdev_list);
423+ pdev->domain = dom_io;
424+ }
425+
426+ if ( !has_arch_pdevs(source) )
427+ vmx_pi_hooks_deassign(source);
428+
429 if ( !has_arch_pdevs(target) )
430 vmx_pi_hooks_assign(target);
431
432@@ -2401,15 +2422,13 @@ static int reassign_device_ownership(
433 pdev->domain = target;
434 }
435
436- if ( !has_arch_pdevs(source) )
437- vmx_pi_hooks_deassign(source);
438-
439 return ret;
440 }
441
442 static int intel_iommu_assign_device(
443 struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
444 {
445+ struct domain *s = pdev->domain;
446 struct acpi_rmrr_unit *rmrr;
447 int ret = 0, i;
448 u16 bdf, seg;
449@@ -2452,8 +2471,8 @@ static int intel_iommu_assign_device(
450 }
451 }
452
453- ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
454- if ( ret )
455+ ret = reassign_device_ownership(s, d, devfn, pdev);
456+ if ( ret || d == dom_io )
457 return ret;
458
459 /* Setup rmrr identity mapping */
460@@ -2466,11 +2485,20 @@ static int intel_iommu_assign_device(
461 ret = rmrr_identity_mapping(d, 1, rmrr, flag);
462 if ( ret )
463 {
464- reassign_device_ownership(d, hardware_domain, devfn, pdev);
465+ int rc;
466+
467+ rc = reassign_device_ownership(d, s, devfn, pdev);
468 printk(XENLOG_G_ERR VTDPREFIX
469 " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
470 rmrr->base_address, rmrr->end_address,
471 d->domain_id, ret);
472+ if ( rc )
473+ {
474+ printk(XENLOG_ERR VTDPREFIX
475+ " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
476+ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
477+ domain_crash(d);
478+ }
479 break;
480 }
481 }
482diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
483index 43f21251a5..3241e51e3c 100644
484--- a/xen/include/xen/pci.h
485+++ b/xen/include/xen/pci.h
486@@ -68,6 +68,9 @@ struct pci_dev {
487
488 nodeid_t node; /* NUMA node */
489
490+ /* Device to be quarantined, don't automatically re-assign to dom0 */
491+ bool quarantine;
492+
493 enum pdev_type {
494 DEV_TYPE_PCI_UNKNOWN,
495 DEV_TYPE_PCIe_ENDPOINT,
496--
4972.11.0
498
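As a side illustration (not part of the patch file above), the owner selection
that deassign_device() now performs can be restated as a tiny stand-alone C
program; struct domain, struct pci_dev and deassign_target() below are
simplified, hypothetical stand-ins for the real Xen structures:

    #include <stdbool.h>
    #include <stdio.h>

    struct domain { const char *name; };
    static struct domain dom0   = { "dom0"    };
    static struct domain dom_io = { "dom_io"  };
    static struct domain guest  = { "a guest" };

    struct pci_dev {
        struct domain *owner;
        bool quarantine;
    };

    /* Mirrors the target selection in the patched deassign_device(): a
     * quarantined device not already owned by dom_io goes to dom_io first;
     * only a deassign from dom_io hands it back to dom0. */
    static struct domain *deassign_target(const struct pci_dev *pdev)
    {
        return (pdev->quarantine && pdev->owner != &dom_io) ? &dom_io : &dom0;
    }

    int main(void)
    {
        struct pci_dev dev = { .owner = &guest, .quarantine = true };

        dev.owner = deassign_target(&dev);       /* guest -> dom_io */
        printf("first deassign  -> %s\n", dev.owner->name);

        dev.owner = deassign_target(&dev);       /* dom_io -> dom0 */
        printf("second deassign -> %s\n", dev.owner->name);

        return 0;
    }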
diff --git a/main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch b/main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch
new file mode 100644
index 0000000000..afb1096c1d
--- /dev/null
+++ b/main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch
@@ -0,0 +1,74 @@
1From c8cb33fa64c9ccbfa2a494a9dad2e0a763c09176 Mon Sep 17 00:00:00 2001
2From: Julien Grall <julien.grall@arm.com>
3Date: Tue, 1 Oct 2019 13:07:53 +0100
4Subject: [PATCH 1/4] xen/arm32: entry: Split __DEFINE_ENTRY_TRAP in two
5
6The preprocessing macro __DEFINE_ENTRY_TRAP is used to generate trap
7entry functions. While the macro is fairly small today, follow-up patches
8will increase its size significantly.
9
10In general, assembly macros are more readable as they allow you to name
11parameters and avoid '\'. So the actual implementation of the trap is
12now switched to an assembly macro.
13
14This is part of XSA-303.
15
16Reported-by: Julien Grall <Julien.Grall@arm.com>
17Signed-off-by: Julien Grall <julien.grall@arm.com>
18Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
19Reviewed-by: Andre Przywara <andre.przywara@arm.com>
20---
21 xen/arch/arm/arm32/entry.S | 34 +++++++++++++++++++---------------
22 1 file changed, 19 insertions(+), 15 deletions(-)
23
24diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
25index 0b4cd19abd..4a762e04f1 100644
26--- a/xen/arch/arm/arm32/entry.S
27+++ b/xen/arch/arm/arm32/entry.S
28@@ -126,24 +126,28 @@ abort_guest_exit_end:
29 skip_check:
30 mov pc, lr
31
32-/*
33- * Macro to define trap entry. The iflags corresponds to the list of
34- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
35- */
36+ /*
37+ * Macro to define trap entry. The iflags corresponds to the list of
38+ * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
39+ */
40+ .macro vector trap, iflags
41+ SAVE_ALL
42+ cpsie \iflags
43+ adr lr, return_from_trap
44+ mov r0, sp
45+ /*
46+ * Save the stack pointer in r11. It will be restored after the
47+ * trap has been handled (see return_from_trap).
48+ */
49+ mov r11, sp
50+ bic sp, #7 /* Align the stack pointer (noop on guest trap) */
51+ b do_trap_\trap
52+ .endm
53+
54 #define __DEFINE_TRAP_ENTRY(trap, iflags) \
55 ALIGN; \
56 trap_##trap: \
57- SAVE_ALL; \
58- cpsie iflags; \
59- adr lr, return_from_trap; \
60- mov r0, sp; \
61- /* \
62- * Save the stack pointer in r11. It will be restored after the \
63- * trap has been handled (see return_from_trap). \
64- */ \
65- mov r11, sp; \
66- bic sp, #7; /* Align the stack pointer (noop on guest trap) */ \
67- b do_trap_##trap
68+ vector trap, iflags
69
70 /* Trap handler which unmask IRQ/Abort, keep FIQ masked */
71 #define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai)
72--
732.11.0
74
diff --git a/main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch b/main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch
new file mode 100644
index 0000000000..35f9c0475e
--- /dev/null
+++ b/main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch
@@ -0,0 +1,97 @@
1From be7379207c83fa74f8a6c22a8ea213f02714776f Mon Sep 17 00:00:00 2001
2From: Julien Grall <julien.grall@arm.com>
3Date: Tue, 1 Oct 2019 13:15:48 +0100
4Subject: [PATCH 2/4] xen/arm32: entry: Fold the macro SAVE_ALL in the macro
5 vector
6
7Follow-up rework will require the macro vector to distinguish between
8a trap from a guest and a trap taken while in the hypervisor.
9
10The macro SAVE_ALL already has code to distinguish between the two and
11it is only called by the vector macro. So fold the former into the
12latter. This will help to avoid duplicating the check.
13
14This is part of XSA-303.
15
16Reported-by: Julien Grall <Julien.Grall@arm.com>
17Signed-off-by: Julien Grall <julien.grall@arm.com>
18Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
19Reviewed-by: Andre Przywara <andre.przywara@arm.com>
20---
21 xen/arch/arm/arm32/entry.S | 46 +++++++++++++++++++++++-----------------------
22 1 file changed, 23 insertions(+), 23 deletions(-)
23
24diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
25index 4a762e04f1..150cbc0b4b 100644
26--- a/xen/arch/arm/arm32/entry.S
27+++ b/xen/arch/arm/arm32/entry.S
28@@ -13,27 +13,6 @@
29 #define RESTORE_BANKED(mode) \
30 RESTORE_ONE_BANKED(SP_##mode) ; RESTORE_ONE_BANKED(LR_##mode) ; RESTORE_ONE_BANKED(SPSR_##mode)
31
32-#define SAVE_ALL \
33- sub sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ \
34- push {r0-r12}; /* Save R0-R12 */ \
35- \
36- mrs r11, ELR_hyp; /* ELR_hyp is return address. */\
37- str r11, [sp, #UREGS_pc]; \
38- \
39- str lr, [sp, #UREGS_lr]; \
40- \
41- add r11, sp, #UREGS_kernel_sizeof+4; \
42- str r11, [sp, #UREGS_sp]; \
43- \
44- mrc CP32(r11, HSR); /* Save exception syndrome */ \
45- str r11, [sp, #UREGS_hsr]; \
46- \
47- mrs r11, SPSR_hyp; \
48- str r11, [sp, #UREGS_cpsr]; \
49- and r11, #PSR_MODE_MASK; \
50- cmp r11, #PSR_MODE_HYP; \
51- blne save_guest_regs
52-
53 save_guest_regs:
54 #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR
55 /*
56@@ -52,7 +31,7 @@ save_guest_regs:
57 ldr r11, =0xffffffff /* Clobber SP which is only valid for hypervisor frames. */
58 str r11, [sp, #UREGS_sp]
59 SAVE_ONE_BANKED(SP_usr)
60- /* LR_usr is the same physical register as lr and is saved in SAVE_ALL */
61+ /* LR_usr is the same physical register as lr and is saved by the caller */
62 SAVE_BANKED(svc)
63 SAVE_BANKED(abt)
64 SAVE_BANKED(und)
65@@ -131,7 +110,28 @@ skip_check:
66 * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
67 */
68 .macro vector trap, iflags
69- SAVE_ALL
70+ /* Save registers in the stack */
71+ sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */
72+ push {r0-r12} /* Save R0-R12 */
73+ mrs r11, ELR_hyp /* ELR_hyp is return address */
74+ str r11, [sp, #UREGS_pc]
75+
76+ str lr, [sp, #UREGS_lr]
77+
78+ add r11, sp, #(UREGS_kernel_sizeof + 4)
79+
80+ str r11, [sp, #UREGS_sp]
81+
82+ mrc CP32(r11, HSR) /* Save exception syndrome */
83+ str r11, [sp, #UREGS_hsr]
84+
85+ mrs r11, SPSR_hyp
86+ str r11, [sp, #UREGS_cpsr]
87+ and r11, #PSR_MODE_MASK
88+ cmp r11, #PSR_MODE_HYP
89+ blne save_guest_regs
90+
91+ /* We are ready to handle the trap, setup the registers and jump. */
92 cpsie \iflags
93 adr lr, return_from_trap
94 mov r0, sp
95--
962.11.0
97
diff --git a/main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
new file mode 100644
index 0000000000..5168452148
--- /dev/null
+++ b/main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
@@ -0,0 +1,226 @@
1From 098fe877967870ffda2dfd9629a5fd272f6aacdc Mon Sep 17 00:00:00 2001
2From: Julien Grall <julien.grall@arm.com>
3Date: Fri, 11 Oct 2019 17:49:28 +0100
4Subject: [PATCH 3/4] xen/arm32: Don't blindly unmask interrupts on trap
5 without a change of level
6
7Exception vectors will unmask interrupts regardless of their state in
8the interrupted context.
9
10One of the consequences is that IRQs will be unmasked when receiving an
11undefined instruction exception (used by WARN*) from the hypervisor.
12This could result in unexpected behavior such as a deadlock (if a lock is
13shared with interrupts).
14
15In a nutshell, interrupts should only be unmasked when it is safe to do so.
16Xen only unmasks IRQ and Abort interrupts, so the logic can stay simple.
17
18As exception vectors may be shared between guest and hypervisor, we now
19need to have a different policy for the interrupts.
20
21On exception from hypervisor, each vector will select the list of
22interrupts to inherit from the interrupted context. Any interrupts not
23listed will be kept masked.
24
25On exception from the guest, the Abort and IRQ will be unmasked
26depending on the exact vector.
27
28The interrupts will be kept masked when the vector cannot be used by
29either guest or hypervisor.
30
31Note that each vector is no longer preceded by ALIGN. This is fine
32because the alignment is already bigger than what we need.
33
34This is part of XSA-303.
35
36Reported-by: Julien Grall <Julien.Grall@arm.com>
37Signed-off-by: Julien Grall <julien.grall@arm.com>
38Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
39Reviewed-by: Andre Przywara <andre.przywara@arm.com>
40---
41 xen/arch/arm/arm32/entry.S | 138 +++++++++++++++++++++++++++++++++++----------
42 1 file changed, 109 insertions(+), 29 deletions(-)
43
44diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
45index 150cbc0b4b..ec90cca093 100644
46--- a/xen/arch/arm/arm32/entry.S
47+++ b/xen/arch/arm/arm32/entry.S
48@@ -4,6 +4,17 @@
49 #include <asm/alternative.h>
50 #include <public/xen.h>
51
52+/*
53+ * Short-hands to define the interrupts (A, I, F)
54+ *
55+ * _ means the interrupt state will not change
56+ * X means the state of interrupt X will change
57+ *
58+ * To be used with msr cpsr_* only
59+ */
60+#define IFLAGS_AIF PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK
61+#define IFLAGS_A_F PSR_ABT_MASK | PSR_FIQ_MASK
62+
63 #define SAVE_ONE_BANKED(reg) mrs r11, reg; str r11, [sp, #UREGS_##reg]
64 #define RESTORE_ONE_BANKED(reg) ldr r11, [sp, #UREGS_##reg]; msr reg, r11
65
66@@ -106,10 +117,18 @@ skip_check:
67 mov pc, lr
68
69 /*
70- * Macro to define trap entry. The iflags corresponds to the list of
71- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
72+ * Macro to define a trap entry.
73+ *
74+ * @guest_iflags: Optional list of interrupts to unmask when
75+ * entering from guest context. As this is used with cpsie,
76+ * the letter (a, i, f) should be used.
77+ *
78+ * @hyp_iflags: Optional list of interrupts to inherit when
79+ * entering from hypervisor context. Any interrupts not
80+ * listed will be kept unchanged. As this is used with cpsr_*,
81+ * IFLAGS_* short-hands should be used.
82 */
83- .macro vector trap, iflags
84+ .macro vector trap, guest_iflags=n, hyp_iflags=0
85 /* Save registers in the stack */
86 sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */
87 push {r0-r12} /* Save R0-R12 */
88@@ -127,12 +146,39 @@ skip_check:
89
90 mrs r11, SPSR_hyp
91 str r11, [sp, #UREGS_cpsr]
92- and r11, #PSR_MODE_MASK
93- cmp r11, #PSR_MODE_HYP
94- blne save_guest_regs
95
96+ /*
97+ * We need to distinguish whether we came from guest or
98+ * hypervisor context.
99+ */
100+ and r0, r11, #PSR_MODE_MASK
101+ cmp r0, #PSR_MODE_HYP
102+
103+ bne 1f
104+ /*
105+ * Trap from the hypervisor
106+ *
107+ * Inherit the state of the interrupts from the hypervisor
108+ * context. For that we need to use SPSR (stored in r11) and
109+ * modify CPSR accordingly.
110+ *
111+ * CPSR = (CPSR & ~hyp_iflags) | (SPSR & hyp_iflags)
112+ */
113+ mrs r10, cpsr
114+ bic r10, r10, #\hyp_iflags
115+ and r11, r11, #\hyp_iflags
116+ orr r10, r10, r11
117+ msr cpsr_cx, r10
118+ b 2f
119+
120+1:
121+ /* Trap from the guest */
122+ bl save_guest_regs
123+ .if \guest_iflags != n
124+ cpsie \guest_iflags
125+ .endif
126+2:
127 /* We are ready to handle the trap, setup the registers and jump. */
128- cpsie \iflags
129 adr lr, return_from_trap
130 mov r0, sp
131 /*
132@@ -144,20 +190,6 @@ skip_check:
133 b do_trap_\trap
134 .endm
135
136-#define __DEFINE_TRAP_ENTRY(trap, iflags) \
137- ALIGN; \
138-trap_##trap: \
139- vector trap, iflags
140-
141-/* Trap handler which unmask IRQ/Abort, keep FIQ masked */
142-#define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai)
143-
144-/* Trap handler which unmask Abort, keep IRQ/FIQ masked */
145-#define DEFINE_TRAP_ENTRY_NOIRQ(trap) __DEFINE_TRAP_ENTRY(trap, a)
146-
147-/* Trap handler which unmask IRQ, keep Abort/FIQ masked */
148-#define DEFINE_TRAP_ENTRY_NOABORT(trap) __DEFINE_TRAP_ENTRY(trap, i)
149-
150 .align 5
151 GLOBAL(hyp_traps_vector)
152 b trap_reset /* 0x00 - Reset */
153@@ -228,14 +260,62 @@ decode_vectors:
154
155 #endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */
156
157-DEFINE_TRAP_ENTRY(reset)
158-DEFINE_TRAP_ENTRY(undefined_instruction)
159-DEFINE_TRAP_ENTRY(hypervisor_call)
160-DEFINE_TRAP_ENTRY(prefetch_abort)
161-DEFINE_TRAP_ENTRY(guest_sync)
162-DEFINE_TRAP_ENTRY_NOIRQ(irq)
163-DEFINE_TRAP_ENTRY_NOIRQ(fiq)
164-DEFINE_TRAP_ENTRY_NOABORT(data_abort)
165+/* Vector not used by the Hypervisor. */
166+trap_reset:
167+ vector reset
168+
169+/*
170+ * Vector only used by the Hypervisor.
171+ *
172+ * While the exception can be executed with all the interrupts (e.g.
173+ * IRQ) unmasked, the interrupted context may have purposefully masked
174+ * some of them. So we want to inherit the state from the interrupted
175+ * context.
176+ */
177+trap_undefined_instruction:
178+ vector undefined_instruction, hyp_iflags=IFLAGS_AIF
179+
180+/* We should never reach this trap */
181+trap_hypervisor_call:
182+ vector hypervisor_call
183+
184+/*
185+ * Vector only used by the hypervisor.
186+ *
187+ * While the exception can be executed with all the interrupts (e.g.
188+ * IRQ) unmasked, the interrupted context may have purposefully masked
189+ * some of them. So we want to inherit the state from the interrupted
190+ * context.
191+ */
192+trap_prefetch_abort:
193+ vector prefetch_abort, hyp_iflags=IFLAGS_AIF
194+
195+/*
196+ * Vector only used by the hypervisor.
197+ *
198+ * Data Abort should be rare and most likely fatal. It is best to not
199+ * unmask any interrupts to limit the amount of code that can run before
200+ * the Data Abort is treated.
201+ */
202+trap_data_abort:
203+ vector data_abort
204+
205+/* Vector only used by the guest. We can unmask Abort/IRQ. */
206+trap_guest_sync:
207+ vector guest_sync, guest_iflags=ai
208+
209+
210+/* Vector used by the hypervisor and the guest. */
211+trap_irq:
212+ vector irq, guest_iflags=a, hyp_iflags=IFLAGS_A_F
213+
214+/*
215+ * Vector used by the hypervisor and the guest.
216+ *
217+ * FIQ are not meant to happen, so we don't unmask any interrupts.
218+ */
219+trap_fiq:
220+ vector fiq
221
222 return_from_trap:
223 /*
224--
2252.11.0
226
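As a side illustration (not part of the patch file above), the inheritance
rule the new vector macro implements, CPSR = (CPSR & ~hyp_iflags) | (SPSR &
hyp_iflags), can be checked with a small host-side C sketch; the mask values
mirror the usual A/I/F bit positions but are illustrative here:

    #include <assert.h>
    #include <stdint.h>

    #define PSR_ABT_MASK (1u << 8)   /* A: asynchronous abort */
    #define PSR_IRQ_MASK (1u << 7)   /* I: IRQ */
    #define PSR_FIQ_MASK (1u << 6)   /* F: FIQ */

    /* Inherit only the bits selected by 'inherit' from spsr into cpsr. */
    static uint32_t merge_iflags(uint32_t cpsr, uint32_t spsr, uint32_t inherit)
    {
        return (cpsr & ~inherit) | (spsr & inherit);
    }

    int main(void)
    {
        /* On exception entry the hardware masks A, I and F... */
        uint32_t cpsr = PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK;
        /* ...while the interrupted context had A and I unmasked, F masked. */
        uint32_t spsr = PSR_FIQ_MASK;

        /* trap_irq inherits A and F only, so IRQ stays masked. */
        uint32_t merged = merge_iflags(cpsr, spsr,
                                       PSR_ABT_MASK | PSR_FIQ_MASK);

        assert(merged == (PSR_IRQ_MASK | PSR_FIQ_MASK));
        return 0;
    }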
diff --git a/main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
new file mode 100644
index 0000000000..106cbf98f1
--- /dev/null
+++ b/main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
@@ -0,0 +1,114 @@
1From c6d290ce157a044dec417fdda8db71e41a37d744 Mon Sep 17 00:00:00 2001
2From: Julien Grall <julien.grall@arm.com>
3Date: Mon, 7 Oct 2019 18:10:56 +0100
4Subject: [PATCH 4/4] xen/arm64: Don't blindly unmask interrupts on trap
5 without a change of level
6
7Some of the traps without a change of the level (i.e. hypervisor ->
8hypervisor) will unmask interrupts regardless of their state in the
9interrupted context.
10
11One of the consequences is that IRQs will be unmasked when receiving a
12synchronous exception (used by WARN*()). This could result in unexpected
13behavior such as a deadlock (if a lock is shared with interrupts).
14
15In a nutshell, interrupts should only be unmasked when it is safe to
16do so. Xen only unmasks IRQ and Abort interrupts, so the logic can stay
17simple:
18 - hyp_error: All the interrupts are now kept masked. SError should
19 be pretty rare and if ever happen then we most likely want to
20 avoid any other interrupts to be generated. The potential main
21 "caller" is during virtual SError synchronization on the exit
22 path from the guest (see check_pending_vserror).
23
24 - hyp_sync: The interrupts state is inherited from the interrupted
25 context.
26
27 - hyp_irq: All the interrupts but IRQ state are inherited from the
28 interrupted context. IRQ is kept masked.
29
30This is part of XSA-303.
31
32Reported-by: Julien Grall <Julien.Grall@arm.com>
33Signed-off-by: Julien Grall <julien.grall@arm.com>
34Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
35Reviewed-by: Andre Przywara <andre.przywara@arm.com>
36---
37 xen/arch/arm/arm64/entry.S | 47 ++++++++++++++++++++++++++++++++++++++++++----
38 1 file changed, 43 insertions(+), 4 deletions(-)
39
40diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
41index 2d9a2713a1..3e41ba65b6 100644
42--- a/xen/arch/arm/arm64/entry.S
43+++ b/xen/arch/arm/arm64/entry.S
44@@ -188,24 +188,63 @@ hyp_error_invalid:
45 entry hyp=1
46 invalid BAD_ERROR
47
48+/*
49+ * SError received while running in the hypervisor mode.
50+ *
51+ * Technically, we could unmask the IRQ if it were unmasked in the
52+ * interrupted context. However, this requires checking the PSTATE. For
53+ * simplicity, as SError should be rare and potentially fatal,
54+ * all interrupts are kept masked.
55+ */
56 hyp_error:
57 entry hyp=1
58- msr daifclr, #2
59 mov x0, sp
60 bl do_trap_hyp_serror
61 exit hyp=1
62
63-/* Traps taken in Current EL with SP_ELx */
64+/*
65+ * Synchronous exception received while running in the hypervisor mode.
66+ *
67+ * While the exception could be executed with all the interrupts (e.g.
68+ * IRQ) unmasked, the interrupted context may have purposefully masked
69+ * some of them. So we want to inherit the state from the interrupted
70+ * context.
71+ */
72 hyp_sync:
73 entry hyp=1
74- msr daifclr, #6
75+
76+ /* Inherit interrupts */
77+ mrs x0, SPSR_el2
78+ and x0, x0, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK)
79+ msr daif, x0
80+
81 mov x0, sp
82 bl do_trap_hyp_sync
83 exit hyp=1
84
85+/*
86+ * IRQ received while running in the hypervisor mode.
87+ *
88+ * While the exception could be executed with all the interrupts but IRQ
89+ * unmasked, the interrupted context may have purposefully masked some
90+ * of them. So we want to inherit the state from the interrupt context
91+ * and keep IRQ masked.
92+ *
93+ * XXX: We may want to consider an ordering between interrupts (e.g. if
94+ * SError are masked, then IRQ should be masked too). However, this
95+ * would require some rework in some paths (e.g. panic, livepatch) to
96+ * ensure the ordering is enforced everywhere.
97+ */
98 hyp_irq:
99 entry hyp=1
100- msr daifclr, #4
101+
102+ /* Inherit D, A, F interrupts and keep I masked */
103+ mrs x0, SPSR_el2
104+ mov x1, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_FIQ_MASK)
105+ and x0, x0, x1
106+ orr x0, x0, #PSR_IRQ_MASK
107+ msr daif, x0
108+
109 mov x0, sp
110 bl do_trap_irq
111 exit hyp=1
112--
1132.11.0
114
diff --git a/main/xen/xsa304-4.10-1.patch b/main/xen/xsa304-4.10-1.patch
new file mode 100644
index 0000000000..4c144ac506
--- /dev/null
+++ b/main/xen/xsa304-4.10-1.patch
@@ -0,0 +1,71 @@
1From: Andrew Cooper <andrew.cooper3@citrix.com>
2Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs
3
4Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and
5an EPT superpage gets shattered. The root cause is still under investigation,
6but the end result is unusable in combination with CVE-2018-12207 protections.
7
8This is part of XSA-304 / CVE-2018-12207
9
10Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
11Reviewed-by: Jan Beulich <jbeulich@suse.com>
12
13diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
14index fb7edfaef9..d698b1d50a 100644
15--- a/xen/drivers/passthrough/vtd/extern.h
16+++ b/xen/drivers/passthrough/vtd/extern.h
17@@ -96,6 +96,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu);
18 int __must_check me_wifi_quirk(struct domain *domain,
19 u8 bus, u8 devfn, int map);
20 void pci_vtd_quirk(const struct pci_dev *);
21+void quirk_iommu_caps(struct iommu *iommu);
22+
23 bool_t platform_supports_intremap(void);
24 bool_t platform_supports_x2apic(void);
25
26diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
27index 2798a49907..17cf87ccf1 100644
28--- a/xen/drivers/passthrough/vtd/iommu.c
29+++ b/xen/drivers/passthrough/vtd/iommu.c
30@@ -1205,6 +1205,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
31 if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
32 return -ENODEV;
33
34+ quirk_iommu_caps(iommu);
35+
36 if ( cap_fault_reg_offset(iommu->cap) +
37 cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
38 ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
39diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
40index d6db862678..b02688e316 100644
41--- a/xen/drivers/passthrough/vtd/quirks.c
42+++ b/xen/drivers/passthrough/vtd/quirks.c
43@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev)
44 break;
45 }
46 }
47+
48+void __init quirk_iommu_caps(struct iommu *iommu)
49+{
50+ /*
51+ * IOMMU Quirks:
52+ *
53+ * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't
54+ * implement superpages internally.
55+ *
56+ * There are issues changing the walk length under in-flight DMA, which
57+ * has manifested as incompatibility between EPT/IOMMU sharing and the
58+ * workaround for CVE-2018-12207 / XSA-304. Hide the superpages
59+ * capabilities in the IOMMU, which will prevent Xen from sharing the EPT
60+ * and IOMMU pagetables.
61+ *
62+ * Detection of SandyBridge unfortunately has to be done by processor
63+ * model because the client parts don't expose their IOMMUs as PCI devices
64+ * we could match with a Device ID.
65+ */
66+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
67+ boot_cpu_data.x86 == 6 &&
68+ (boot_cpu_data.x86_model == 0x2a ||
69+ boot_cpu_data.x86_model == 0x2d) )
70+ iommu->cap &= ~(0xful << 34);
71+}
diff --git a/main/xen/xsa304-4.10-2.patch b/main/xen/xsa304-4.10-2.patch
new file mode 100644
index 0000000000..38f739ad90
--- /dev/null
+++ b/main/xen/xsa304-4.10-2.patch
@@ -0,0 +1,268 @@
1From: Andrew Cooper <andrew.cooper3@citrix.com>
2Subject: x86/vtx: Disable executable EPT superpages to work around
3 CVE-2018-12207
4
5CVE-2018-12207 covers a set of errata on various Intel processors, whereby a
6machine check exception can be generated in a corner case when an executable
7mapping changes size or cacheability without TLB invalidation. HVM guest
8kernels can trigger this to DoS the host.
9
10To mitigate, in affected hardware, all EPT superpages are marked NX. When an
11instruction fetch violation is observed against the superpage, the superpage
12is shattered to 4k and has execute permissions restored. This prevents the
13guest kernel from being able to create the necessary preconditions in the iTLB
14to exploit the vulnerability.
15
16This does come with a workload-dependent performance overhead, caused by
17increased TLB pressure. Performance can be restored, if guest kernels are
18trusted not to mount an attack, by specifying ept=exec-sp on the command line.
19
20This is part of XSA-304 / CVE-2018-12207
21
22Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
23Acked-by: George Dunlap <george.dunlap@citrix.com>
24Reviewed-by: Jan Beulich <jbeulich@suse.com>
25
26diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
27index c0700dfbfe..698ab63340 100644
28--- a/xen/arch/x86/hvm/hvm.c
29+++ b/xen/arch/x86/hvm/hvm.c
30@@ -1695,6 +1695,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
31 struct p2m_domain *p2m, *hostp2m;
32 int rc, fall_through = 0, paged = 0;
33 int sharing_enomem = 0;
34+ unsigned int page_order = 0;
35 vm_event_request_t *req_ptr = NULL;
36 bool_t ap2m_active, sync = 0;
37
38@@ -1763,7 +1764,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
39 hostp2m = p2m_get_hostp2m(currd);
40 mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma,
41 P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0),
42- NULL);
43+ &page_order);
44
45 if ( ap2m_active )
46 {
47@@ -1775,7 +1776,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
48 goto out;
49 }
50
51- mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL);
52+ mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, &page_order);
53 }
54 else
55 p2m = hostp2m;
56@@ -1817,6 +1818,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
57 break;
58 }
59
60+ /*
61+ * Workaround for XSA-304 / CVE-2018-12207. If we take an execution
62+ * fault against a non-executable superpage, shatter it to regain
63+ * execute permissions.
64+ */
65+ if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation )
66+ {
67+ int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K,
68+ p2mt, p2ma);
69+
70+ if ( res )
71+ printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n",
72+ gfn, res);
73+
74+ rc = !res;
75+ goto out_put_gfn;
76+ }
77+
78 if ( violation )
79 {
80 /* Should #VE be emulated for this fault? */
81diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
82index 205f2307c2..27050c0877 100644
83--- a/xen/arch/x86/hvm/vmx/vmcs.c
84+++ b/xen/arch/x86/hvm/vmx/vmcs.c
85@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window);
86
87 static bool_t __read_mostly opt_pml_enabled = 1;
88 static s8 __read_mostly opt_ept_ad = -1;
89+int8_t __read_mostly opt_ept_exec_sp = -1;
90
91 /*
92 * The 'ept' parameter controls functionalities that depend on, or impact the
93@@ -94,6 +95,8 @@ static int __init parse_ept_param(const char *s)
94 opt_pml_enabled = val;
95 else if ( !cmdline_strcmp(s, "ad") )
96 opt_ept_ad = val;
97+ else if ( !cmdline_strcmp(s, "exec-sp") )
98+ opt_ept_exec_sp = val;
99 else
100 rc = -EINVAL;
101
102diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
103index fa1e0309c7..9285c2b2fa 100644
104--- a/xen/arch/x86/hvm/vmx/vmx.c
105+++ b/xen/arch/x86/hvm/vmx/vmx.c
106@@ -2490,6 +2490,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs)
107 static void __init lbr_tsx_fixup_check(void);
108 static void __init bdw_erratum_bdf14_fixup_check(void);
109
110+/*
111+ * Calculate whether the CPU is vulnerable to Instruction Fetch page
112+ * size-change MCEs.
113+ */
114+static bool __init has_if_pschange_mc(void)
115+{
116+ uint64_t caps = 0;
117+
118+ /*
119+ * If we are virtualised, there is nothing we can do. Our EPT tables are
120+ * shadowed by our hypervisor, and not walked by hardware.
121+ */
122+ if ( cpu_has_hypervisor )
123+ return false;
124+
125+ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
126+ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
127+
128+ if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO )
129+ return false;
130+
131+ /*
132+ * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at
133+ * this time.
134+ */
135+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
136+ boot_cpu_data.x86 != 6 )
137+ return false;
138+
139+ switch ( boot_cpu_data.x86_model )
140+ {
141+ /*
142+ * Core processors since at least Nehalem are vulnerable.
143+ */
144+ case 0x1f: /* Auburndale / Havendale */
145+ case 0x1e: /* Nehalem */
146+ case 0x1a: /* Nehalem EP */
147+ case 0x2e: /* Nehalem EX */
148+ case 0x25: /* Westmere */
149+ case 0x2c: /* Westmere EP */
150+ case 0x2f: /* Westmere EX */
151+ case 0x2a: /* SandyBridge */
152+ case 0x2d: /* SandyBridge EP/EX */
153+ case 0x3a: /* IvyBridge */
154+ case 0x3e: /* IvyBridge EP/EX */
155+ case 0x3c: /* Haswell */
156+ case 0x3f: /* Haswell EX/EP */
157+ case 0x45: /* Haswell D */
158+ case 0x46: /* Haswell H */
159+ case 0x3d: /* Broadwell */
160+ case 0x47: /* Broadwell H */
161+ case 0x4f: /* Broadwell EP/EX */
162+ case 0x56: /* Broadwell D */
163+ case 0x4e: /* Skylake M */
164+ case 0x5e: /* Skylake D */
165+ case 0x55: /* Skylake-X / Cascade Lake */
166+ case 0x8e: /* Kaby / Coffee / Whiskey Lake M */
167+ case 0x9e: /* Kaby / Coffee / Whiskey Lake D */
168+ return true;
169+
170+ /*
171+ * Atom processors are not vulnerable.
172+ */
173+ case 0x1c: /* Pineview */
174+ case 0x26: /* Lincroft */
175+ case 0x27: /* Penwell */
176+ case 0x35: /* Cloverview */
177+ case 0x36: /* Cedarview */
178+ case 0x37: /* Baytrail / Valleyview (Silvermont) */
179+ case 0x4d: /* Avaton / Rangely (Silvermont) */
180+ case 0x4c: /* Cherrytrail / Brasswell */
181+ case 0x4a: /* Merrifield */
182+ case 0x5a: /* Moorefield */
183+ case 0x5c: /* Goldmont */
184+ case 0x5d: /* SoFIA 3G Granite/ES2.1 */
185+ case 0x65: /* SoFIA LTE AOSP */
186+ case 0x5f: /* Denverton */
187+ case 0x6e: /* Cougar Mountain */
188+ case 0x75: /* Lightning Mountain */
189+ case 0x7a: /* Gemini Lake */
190+ case 0x86: /* Jacobsville */
191+
192+ /*
193+ * Knights processors are not vulnerable.
194+ */
195+ case 0x57: /* Knights Landing */
196+ case 0x85: /* Knights Mill */
197+ return false;
198+
199+ default:
200+ printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n",
201+ boot_cpu_data.x86_model);
202+ return true;
203+ }
204+}
205+
206 const struct hvm_function_table * __init start_vmx(void)
207 {
208 set_in_cr4(X86_CR4_VMXE);
209@@ -2510,6 +2606,17 @@ const struct hvm_function_table * __init start_vmx(void)
210 */
211 if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) )
212 {
213+ bool cpu_has_bug_pschange_mc = has_if_pschange_mc();
214+
215+ if ( opt_ept_exec_sp == -1 )
216+ {
217+ /* Default to non-executable superpages on vulnerable hardware. */
218+ opt_ept_exec_sp = !cpu_has_bug_pschange_mc;
219+
220+ if ( cpu_has_bug_pschange_mc )
221+ printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n");
222+ }
223+
224 vmx_function_table.hap_supported = 1;
225 vmx_function_table.altp2m_supported = 1;
226
227diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
228index b4996ce658..424d42c93d 100644
229--- a/xen/arch/x86/mm/p2m-ept.c
230+++ b/xen/arch/x86/mm/p2m-ept.c
231@@ -215,6 +215,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
232 break;
233 }
234
235+ /*
236+ * Don't create executable superpages if we need to shatter them to
237+ * protect against CVE-2018-12207.
238+ */
239+ if ( !opt_ept_exec_sp && is_epte_superpage(entry) )
240+ entry->x = 0;
241 }
242
243 #define GUEST_TABLE_MAP_FAILED 0
244diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
245index 7341cb191e..aad25335eb 100644
246--- a/xen/include/asm-x86/hvm/vmx/vmx.h
247+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
248@@ -28,6 +28,8 @@
249 #include <asm/hvm/trace.h>
250 #include <asm/hvm/vmx/vmcs.h>
251
252+extern int8_t opt_ept_exec_sp;
253+
254 typedef union {
255 struct {
256 u64 r : 1, /* bit 0 - Read permission */
257diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
258index e61aac2f51..47e7c412f2 100644
259--- a/xen/include/asm-x86/msr-index.h
260+++ b/xen/include/asm-x86/msr-index.h
261@@ -54,6 +54,7 @@
262 #define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3)
263 #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
264 #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
265+#define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
266
267 #define MSR_FLUSH_CMD 0x0000010b
268 #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
diff --git a/main/xen/xsa304-4.10-3.patch b/main/xen/xsa304-4.10-3.patch
new file mode 100644
index 0000000000..907b0895a8
--- /dev/null
+++ b/main/xen/xsa304-4.10-3.patch
@@ -0,0 +1,84 @@
1From: Andrew Cooper <andrew.cooper3@citrix.com>
2Subject: x86/vtx: Allow runtime modification of the exec-sp setting
3
4See patch for details.
5
6Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
7Reviewed-by: Jan Beulich <jbeulich@suse.com>
8Reviewed-by: George Dunlap <george.dunlap@citrix.com>
9
10diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
11index 27050c0877..3c29b7c46f 100644
12--- a/xen/arch/x86/hvm/vmx/vmcs.c
13+++ b/xen/arch/x86/hvm/vmx/vmcs.c
14@@ -107,6 +107,41 @@ static int __init parse_ept_param(const char *s)
15 }
16 custom_param("ept", parse_ept_param);
17
18+static int parse_ept_param_runtime(const char *s)
19+{
20+ int val;
21+
22+ if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported ||
23+ !(hvm_funcs.hap_capabilities &
24+ (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) )
25+ {
26+ printk("VMX: EPT not available, or not in use - ignoring\n");
27+ return 0;
28+ }
29+
30+ if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 )
31+ return -EINVAL;
32+
33+ if ( val != opt_ept_exec_sp )
34+ {
35+ struct domain *d;
36+
37+ opt_ept_exec_sp = val;
38+
39+ rcu_read_lock(&domlist_read_lock);
40+ for_each_domain ( d )
41+ if ( paging_mode_hap(d) )
42+ p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw);
43+ rcu_read_unlock(&domlist_read_lock);
44+ }
45+
46+ printk("VMX: EPT executable superpages %sabled\n",
47+ val ? "en" : "dis");
48+
49+ return 0;
50+}
51+custom_runtime_only_param("ept", parse_ept_param_runtime);
52+
53 /* Dynamic (run-time adjusted) execution control flags. */
54 u32 vmx_pin_based_exec_control __read_mostly;
55 u32 vmx_cpu_based_exec_control __read_mostly;
56diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
57index 7a52ba993e..416e77b03c 100644
58--- a/xen/arch/x86/mm/p2m.c
59+++ b/xen/arch/x86/mm/p2m.c
60@@ -263,17 +263,22 @@ int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
61 return 0;
62 }
63
64+/*
65+ * May be called with ot = nt = p2m_ram_rw for its side effect of
66+ * recalculating all PTEs in the p2m.
67+ */
68 void p2m_change_entry_type_global(struct domain *d,
69 p2m_type_t ot, p2m_type_t nt)
70 {
71 struct p2m_domain *p2m = p2m_get_hostp2m(d);
72
73- ASSERT(ot != nt);
74 ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt));
75
76 p2m_lock(p2m);
77 p2m->change_entry_type_global(p2m, ot, nt);
78- p2m->global_logdirty = (nt == p2m_ram_logdirty);
79+ /* Don't allow 'recalculate' operations to change the logdirty state. */
80+ if ( ot != nt )
81+ p2m->global_logdirty = (nt == p2m_ram_logdirty);
82 p2m_unlock(p2m);
83 }
84
diff --git a/main/xen/xsa305-4.10-1.patch b/main/xen/xsa305-4.10-1.patch
new file mode 100644
index 0000000000..e3163723a6
--- /dev/null
+++ b/main/xen/xsa305-4.10-1.patch
@@ -0,0 +1,288 @@
1From: Andrew Cooper <andrew.cooper3@citrix.com>
2Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available
3
4To protect against the TSX Async Abort speculative vulnerability, Intel have
5released new microcode for affected parts which introduces the MSR_TSX_CTRL
6control, which allows TSX to be turned off. This will be architectural on
7future parts.
8
9Introduce tsx= to provide a global on/off for TSX, including its enumeration
10via CPUID. Provide stub virtualisation of this MSR, as it is not exposed to
11guests at the moment.
12
13VMs may have booted before microcode is loaded, or before hosts have rebooted,
14and they still want to migrate freely. A VM which booted seeing TSX can
15migrate safely to hosts with TSX disabled - TSX will start unconditionally
16aborting, but still behave in a manner compatible with the ABI.
17
18The guest-visible behaviour is equivalent to late loading the microcode and
19setting the RTM_DISABLE bit in the course of live patching.
20
21This is part of XSA-305 / CVE-2019-11135
22
23Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
24Reviewed-by: Jan Beulich <jbeulich@suse.com>
25
26diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
27index 0cbfb5096c..1b169c7b72 100644
28--- a/docs/misc/xen-command-line.markdown
29+++ b/docs/misc/xen-command-line.markdown
30@@ -1920,6 +1920,20 @@ pages) must also be specified via the tbuf\_size parameter.
31 ### tsc
32 > `= unstable | skewed | stable:socket`
33
34+### tsx
35+ = <bool>
36+
37+ Applicability: x86
38+ Default: true
39+
40+Controls for the use of Transactional Synchronization eXtensions.
41+
42+On Intel parts released in Q3 2019 (with updated microcode), and future parts,
43+a control has been introduced which allows TSX to be turned off.
44+
45+On systems with the ability to turn TSX off, this boolean offers system wide
46+control of whether TSX is enabled or disabled.
47+
48 ### ucode
49 > `= [<integer> | scan]`
50
51diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
52index d86fb97fa3..4e4f39d933 100644
53--- a/xen/arch/x86/Makefile
54+++ b/xen/arch/x86/Makefile
55@@ -65,6 +65,7 @@ obj-y += sysctl.o
56 obj-y += time.o
57 obj-y += trace.o
58 obj-y += traps.o
59+obj-y += tsx.o
60 obj-y += usercopy.o
61 obj-y += x86_emulate.o
62 obj-$(CONFIG_TBOOT) += tboot.o
63diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
64index 98b63f3a01..e943d70bca 100644
65--- a/xen/arch/x86/cpuid.c
66+++ b/xen/arch/x86/cpuid.c
67@@ -600,6 +600,20 @@ void recalculate_cpuid_policy(struct domain *d)
68 if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
69 __set_bit(X86_FEATURE_ITSC, max_fs);
70
71+ /*
72+ * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
73+ * TSX and hide the feature bits. Migrating-in VMs may have been booted
74+ * pre-mitigation when the TSX features were visible.
75+ *
76+ * This situation is compatible (albeit with a perf hit to any TSX code in
77+ * the guest), so allow the feature bits to remain set.
78+ */
79+ if ( cpu_has_tsx_ctrl )
80+ {
81+ __set_bit(X86_FEATURE_HLE, max_fs);
82+ __set_bit(X86_FEATURE_RTM, max_fs);
83+ }
84+
85 /* Clamp the toolstacks choices to reality. */
86 for ( i = 0; i < ARRAY_SIZE(fs); i++ )
87 fs[i] &= max_fs[i];
88diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
89index 6853d4c120..6ceea913fb 100644
90--- a/xen/arch/x86/msr.c
91+++ b/xen/arch/x86/msr.c
92@@ -134,6 +134,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
93 case MSR_FLUSH_CMD:
94 /* Write-only */
95 case MSR_TSX_FORCE_ABORT:
96+ case MSR_TSX_CTRL:
97 /* Not offered to guests. */
98 goto gp_fault;
99
100@@ -192,6 +193,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
101 case MSR_ARCH_CAPABILITIES:
102 /* Read-only */
103 case MSR_TSX_FORCE_ABORT:
104+ case MSR_TSX_CTRL:
105 /* Not offered to guests. */
106 goto gp_fault;
107
108diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
109index 7903204761..949d4abbdf 100644
110--- a/xen/arch/x86/setup.c
111+++ b/xen/arch/x86/setup.c
112@@ -1540,6 +1540,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
113
114 early_microcode_init();
115
116+ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */
117+
118 identify_cpu(&boot_cpu_data);
119
120 set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);
121diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
122index b0496eb66e..cdf53afc1e 100644
123--- a/xen/arch/x86/smpboot.c
124+++ b/xen/arch/x86/smpboot.c
125@@ -370,6 +370,8 @@ void start_secondary(void *unused)
126 if ( boot_cpu_has(X86_FEATURE_IBRSB) )
127 wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
128
129+ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */
130+
131 if ( xen_guest )
132 hypervisor_ap_setup();
133
134diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
135new file mode 100644
136index 0000000000..a8ec2ccc69
137--- /dev/null
138+++ b/xen/arch/x86/tsx.c
139@@ -0,0 +1,74 @@
140+#include <xen/init.h>
141+#include <asm/msr.h>
142+
143+/*
144+ * Valid values:
145+ * 1 => Explicit tsx=1
146+ * 0 => Explicit tsx=0
147+ * -1 => Default, implicit tsx=1
148+ *
149+ * This is arranged such that the bottom bit encodes whether TSX is actually
150+ * disabled, while identifying various explicit (>=0) and implicit (<0)
151+ * conditions.
152+ */
153+int8_t __read_mostly opt_tsx = -1;
154+int8_t __read_mostly cpu_has_tsx_ctrl = -1;
155+
156+static int __init parse_tsx(const char *s)
157+{
158+ int rc = 0, val = parse_bool(s, NULL);
159+
160+ if ( val >= 0 )
161+ opt_tsx = val;
162+ else
163+ rc = -EINVAL;
164+
165+ return rc;
166+}
167+custom_param("tsx", parse_tsx);
168+
169+void tsx_init(void)
170+{
171+ /*
172+ * This function is first called between microcode being loaded, and CPUID
173+ * being scanned generally. Calculate from raw data whether MSR_TSX_CTRL
174+ * is available.
175+ */
176+ if ( unlikely(cpu_has_tsx_ctrl < 0) )
177+ {
178+ uint64_t caps = 0;
179+
180+ if ( boot_cpu_data.cpuid_level >= 7 &&
181+ (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) )
182+ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
183+
184+ cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL);
185+ }
186+
187+ if ( cpu_has_tsx_ctrl )
188+ {
189+ uint64_t val;
190+
191+ rdmsrl(MSR_TSX_CTRL, val);
192+
193+ val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
194+ /* Check bottom bit only. Higher bits are various sentinels. */
195+ if ( !(opt_tsx & 1) )
196+ val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;
197+
198+ wrmsrl(MSR_TSX_CTRL, val);
199+ }
200+ else if ( opt_tsx >= 0 )
201+ printk_once(XENLOG_WARNING
202+ "MSR_TSX_CTRL not available - Ignoring tsx= setting\n");
203+}
204+
205+/*
206+ * Local variables:
207+ * mode: C
208+ * c-file-style: "BSD"
209+ * c-basic-offset: 4
210+ * tab-width: 4
211+ * indent-tabs-mode: nil
212+ * End:
213+ */
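The encoding documented at the top of this file means only the bottom bit of opt_tsx matters for the hardware decision: the odd values 1 and the default -1 (and the later sentinel -3 added by xsa305-4.10-2.patch) leave TSX enabled, while 0 disables it. A standalone sketch of that test, using a hypothetical helper name for illustration:

    /* Illustrative helper (not in the patch): mirrors the bottom-bit test
     * performed in tsx_init() before writing MSR_TSX_CTRL. */
    static bool tsx_enabled_by_policy(int8_t opt)
    {
        return opt & 1;   /* 1, -1, -3 => enabled; 0 => disabled */
    }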
214diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
215index 47e7c412f2..c96c4f85c9 100644
216--- a/xen/include/asm-x86/msr-index.h
217+++ b/xen/include/asm-x86/msr-index.h
218@@ -55,6 +55,7 @@
219 #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
220 #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
221 #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
222+#define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
223
224 #define MSR_FLUSH_CMD 0x0000010b
225 #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
226@@ -62,6 +63,10 @@
227 #define MSR_TSX_FORCE_ABORT 0x0000010f
228 #define TSX_FORCE_ABORT_RTM (_AC(1, ULL) << 0)
229
230+#define MSR_TSX_CTRL 0x00000122
231+#define TSX_CTRL_RTM_DISABLE (_AC(1, ULL) << 0)
232+#define TSX_CTRL_CPUID_CLEAR (_AC(1, ULL) << 1)
233+
234 /* Intel MSRs. Some also available on other CPUs */
235 #define MSR_IA32_PERFCTR0 0x000000c1
236 #define MSR_IA32_A_PERFCTR0 0x000004c1
237diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
238index a0f8bf47e5..e707380f43 100644
239--- a/xen/include/asm-x86/processor.h
240+++ b/xen/include/asm-x86/processor.h
241@@ -268,6 +268,16 @@ static always_inline unsigned int cpuid_count_ebx(
242 return ebx;
243 }
244
245+static always_inline unsigned int cpuid_count_edx(
246+ unsigned int leaf, unsigned int subleaf)
247+{
248+ unsigned int edx, tmp;
249+
250+ cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx);
251+
252+ return edx;
253+}
254+
255 static always_inline void cpuid_count_leaf(uint32_t leaf, uint32_t subleaf,
256 struct cpuid_leaf *data)
257 {
258@@ -622,6 +632,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model,
259 return fam;
260 }
261
262+extern int8_t opt_tsx, cpu_has_tsx_ctrl;
263+void tsx_init(void);
264+
265 #endif /* !__ASSEMBLY__ */
266
267 #endif /* __ASM_X86_PROCESSOR_H */
268diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
269index 750f809968..be223a6950 100644
270--- a/xen/include/xen/lib.h
271+++ b/xen/include/xen/lib.h
272@@ -116,6 +116,16 @@ extern int printk_ratelimit(void);
273 #define gprintk(lvl, fmt, args...) \
274 printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args)
275
276+#define printk_once(fmt, args...) \
277+({ \
278+ static bool __read_mostly once_; \
279+ if ( unlikely(!once_) ) \
280+ { \
281+ once_ = true; \
282+ printk(fmt, ## args); \
283+ } \
284+})
285+
286 #ifdef NDEBUG
287
288 static inline void
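The printk_once() helper added to xen/include/xen/lib.h is what tsx.c uses to warn exactly once when tsx= is given on hardware lacking MSR_TSX_CTRL. A minimal usage sketch (the message text here is made up for illustration):

    printk_once(XENLOG_WARNING "TSX example: emitted at most once per boot\n");

Because the guard variable is declared static inside the statement-expression, each expansion site gets its own once-only flag.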
diff --git a/main/xen/xsa305-4.10-2.patch b/main/xen/xsa305-4.10-2.patch
new file mode 100644
index 0000000000..3a061c26e7
--- /dev/null
+++ b/main/xen/xsa305-4.10-2.patch
@@ -0,0 +1,192 @@
1From: Andrew Cooper <andrew.cooper3@citrix.com>
2Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel
3
4See patch documentation and comments.
5
6This is part of XSA-305 / CVE-2019-11135
7
8Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
9Reviewed-by: Jan Beulich <jbeulich@suse.com>
10
11diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
12index 1b169c7b72..7a03f4ec70 100644
13--- a/docs/misc/xen-command-line.markdown
14+++ b/docs/misc/xen-command-line.markdown
15@@ -1813,7 +1813,7 @@ extreme care.**
16 An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
17 mitigations, including pieces of infrastructure used to virtualise certain
18 mitigation features for guests. This also includes settings which `xpti`,
19-`smt`, `pv-l1tf` control, unless the respective option(s) have been
20+`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been
21 specified earlier on the command line.
22
23 Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
24@@ -1924,7 +1924,7 @@ pages) must also be specified via the tbuf\_size parameter.
25 = <bool>
26
27 Applicability: x86
28- Default: true
29+ Default: false on parts vulnerable to TAA, true otherwise
30
31 Controls for the use of Transactional Synchronization eXtensions.
32
33@@ -1934,6 +1934,19 @@ a control has been introduced which allows TSX to be turned off.
34 On systems with the ability to turn TSX off, this boolean offers system wide
35 control of whether TSX is enabled or disabled.
36
37+On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following
38+logic applies:
39+
40+ * An explicit `tsx=` choice is honoured, even if it is `true` and would
41+ result in a vulnerable system.
42+
43+ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be
44+ mitigated by disabling TSX, as this is the lowest overhead option.
45+
46+ * If the use of TSX is important, the more expensive TAA mitigations can be
47+ opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain
48+ active by default.
49+
50 ### ucode
51 > `= [<integer> | scan]`
52
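To illustrate the three cases listed above (example command lines only, not taken from the patch), on TAA-vulnerable hardware with the TSX_CTRL microcode loaded:

    (no tsx= option)              # default: TSX disabled to mitigate TAA
    tsx=1                         # explicit choice honoured; system stays vulnerable to TAA
    smt=0 spec-ctrl=md-clear      # traditional MDS mitigations in use; TSX stays enabled by default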
53diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
54index e25dadfa89..0f30362111 100644
55--- a/xen/arch/x86/spec_ctrl.c
56+++ b/xen/arch/x86/spec_ctrl.c
57@@ -136,6 +136,9 @@ static int __init parse_spec_ctrl(const char *s)
58 if ( opt_pv_l1tf_domu < 0 )
59 opt_pv_l1tf_domu = 0;
60
61+ if ( opt_tsx == -1 )
62+ opt_tsx = -3;
63+
64 disable_common:
65 opt_rsb_pv = false;
66 opt_rsb_hvm = false;
67@@ -346,7 +349,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
68 printk("Speculative mitigation facilities:\n");
69
70 /* Hardware features which pertain to speculative mitigations. */
71- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
72+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
73 (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
74 (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
75 (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
76@@ -358,7 +361,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
77 (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
78 (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "",
79 (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "",
80- (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "");
81+ (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "",
82+ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "",
83+ (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "");
84
85 /* Compiled-in support which pertains to mitigations. */
86 if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
87@@ -372,7 +377,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
88 "\n");
89
90 /* Settings for Xen's protection, irrespective of guests. */
91- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n",
92+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n",
93 thunk == THUNK_NONE ? "N/A" :
94 thunk == THUNK_RETPOLINE ? "RETPOLINE" :
95 thunk == THUNK_LFENCE ? "LFENCE" :
96@@ -381,6 +386,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
97 (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-",
98 !boot_cpu_has(X86_FEATURE_SSBD) ? "" :
99 (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
100+ !(caps & ARCH_CAPS_TSX_CTRL) ? "" :
101+ (opt_tsx & 1) ? " TSX+" : " TSX-",
102 opt_ibpb ? " IBPB" : "",
103 opt_l1d_flush ? " L1D_FLUSH" : "",
104 opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "");
105@@ -891,6 +898,7 @@ void __init init_speculation_mitigations(void)
106 {
107 enum ind_thunk thunk = THUNK_DEFAULT;
108 bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled;
109+ bool cpu_has_bug_taa;
110 uint64_t caps = 0;
111
112 if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
113@@ -1120,6 +1128,53 @@ void __init init_speculation_mitigations(void)
114 "enabled. Mitigations will not be fully effective. Please\n"
115 "choose an explicit smt=<bool> setting. See XSA-297.\n");
116
117+ /*
118+ * Vulnerability to TAA is a little complicated to quantify.
119+ *
120+ * In the pipeline, it is just another way to get speculative access to
121+ * stale load port, store buffer or fill buffer data, and therefore can be
122+ * considered a superset of MDS (on TSX-capable parts). On parts which
123+ * predate MDS_NO, the existing VERW flushing will mitigate this
124+ * sidechannel as well.
125+ *
126+ * On parts which contain MDS_NO, the lack of VERW flushing means that an
127+ * attacker can still use TSX to target microarchitectural buffers to leak
128+ * secrets. Therefore, we consider TAA to be the set of TSX-capable parts
129+ * which have MDS_NO but lack TAA_NO.
130+ *
131+ * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the
132+ * cmdline. MSR_TSX_CTRL will only appear on TSX-capable parts, so
133+ * we check both to spot TSX in a microcode/cmdline independent way.
134+ */
135+ cpu_has_bug_taa =
136+ (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
137+ (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;
138+
139+ /*
140+ * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs
141+ * the MDS mitigation of disabling HT and using VERW flushing.
142+ *
143+ * On CPUs which advertise MDS_NO, VERW has no flushing side effect until
144+ * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being
145+ * advertised, and there isn't a MD_CLEAR_2 flag to use...
146+ *
147+ * If we're on affected hardware, able to do something about it (which
148+ * implies that VERW now works), no explicit TSX choice and traditional
149+ * MDS mitigations (no-SMT, VERW) not obviously in use (someone might
150+ * plausibly value TSX higher than Hyperthreading...), disable TSX to
151+ * mitigate TAA.
152+ */
153+ if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) &&
154+ ((hw_smt_enabled && opt_smt) ||
155+ !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) )
156+ {
157+ setup_clear_cpu_cap(X86_FEATURE_HLE);
158+ setup_clear_cpu_cap(X86_FEATURE_RTM);
159+
160+ opt_tsx = 0;
161+ tsx_init();
162+ }
163+
164 print_details(thunk, caps);
165
166 /*
167diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
168index a8ec2ccc69..2d202a0d4e 100644
169--- a/xen/arch/x86/tsx.c
170+++ b/xen/arch/x86/tsx.c
171@@ -5,7 +5,8 @@
172 * Valid values:
173 * 1 => Explicit tsx=1
174 * 0 => Explicit tsx=0
175- * -1 => Default, implicit tsx=1
176+ * -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA
177+ * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0)
178 *
179 * This is arranged such that the bottom bit encodes whether TSX is actually
180 * disabled, while identifying various explicit (>=0) and implicit (<0)
181diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
182index c96c4f85c9..5ef80735b2 100644
183--- a/xen/include/asm-x86/msr-index.h
184+++ b/xen/include/asm-x86/msr-index.h
185@@ -56,6 +56,7 @@
186 #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
187 #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
188 #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
189+#define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8)
190
191 #define MSR_FLUSH_CMD 0x0000010b
192 #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)