diff options
author | Leonardo Arena <rnalrd@alpinelinux.org> | 2019-12-31 12:57:18 +0000 |
---|---|---|
committer | Leonardo Arena <rnalrd@alpinelinux.org> | 2019-12-31 12:58:06 +0000 |
commit | b56efe8db5679b569767cee09b45ce5cd04b942d (patch) | |
tree | 763e2d062569bcacc4e7c798ff591b468fe4002f | |
parent | d21e62f341aa3be70aa5dc967aeebdf6a54e04d2 (diff) | |
download | alpine_aports-b56efe8db5679b569767cee09b45ce5cd04b942d.tar.bz2 alpine_aports-b56efe8db5679b569767cee09b45ce5cd04b942d.tar.xz alpine_aports-b56efe8db5679b569767cee09b45ce5cd04b942d.zip |
main/xen: security fixes
- CVE-2019-18425 XSA-298
- CVE-2019-18421 XSA-299
- CVE-2019-18423 XSA-301
- CVE-2019-18424 XSA-302
- CVE-2019-18422 XSA-303
- CVE-2018-12207 XSA-304
- CVE-2019-11135 XSA-305
fixes #10968
27 files changed, 4732 insertions, 1 deletions
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD index dc6b2dc89b..1582f86324 100644 --- a/main/xen/APKBUILD +++ b/main/xen/APKBUILD | |||
@@ -3,7 +3,7 @@ | |||
3 | # Maintainer: William Pitcock <nenolod@dereferenced.org> | 3 | # Maintainer: William Pitcock <nenolod@dereferenced.org> |
4 | pkgname=xen | 4 | pkgname=xen |
5 | pkgver=4.10.4 | 5 | pkgver=4.10.4 |
6 | pkgrel=0 | 6 | pkgrel=1 |
7 | pkgdesc="Xen hypervisor" | 7 | pkgdesc="Xen hypervisor" |
8 | url="http://www.xen.org/" | 8 | url="http://www.xen.org/" |
9 | arch="x86_64 armhf aarch64" | 9 | arch="x86_64 armhf aarch64" |
@@ -154,6 +154,14 @@ options="!strip" | |||
154 | # - XSA-294 | 154 | # - XSA-294 |
155 | # - XSA-295 | 155 | # - XSA-295 |
156 | # - XSA-296 | 156 | # - XSA-296 |
157 | # 4.10.4-r1: | ||
158 | # - CVE-2019-18425 XSA-298 | ||
159 | # - CVE-2019-18421 XSA-299 | ||
160 | # - CVE-2019-18423 XSA-301 | ||
161 | # - CVE-2019-18424 XSA-302 | ||
162 | # - CVE-2019-18422 XSA-303 | ||
163 | # - CVE-2018-12207 XSA-304 | ||
164 | # - CVE-2019-11135 XSA-305 | ||
157 | 165 | ||
158 | case "$CARCH" in | 166 | case "$CARCH" in |
159 | x86*) | 167 | x86*) |
@@ -218,6 +226,33 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv | |||
218 | 226 | ||
219 | hotplug-Linux-iscsi-block-handle-lun-1.patch | 227 | hotplug-Linux-iscsi-block-handle-lun-1.patch |
220 | 228 | ||
229 | xsa298-4.10.patch | ||
230 | xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch | ||
231 | xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch | ||
232 | xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch | ||
233 | xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch | ||
234 | xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch | ||
235 | xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch | ||
236 | xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch | ||
237 | xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch | ||
238 | xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch | ||
239 | xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch | ||
240 | xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch | ||
241 | xsa301-4.11-1.patch | ||
242 | xsa301-4.11-2.patch | ||
243 | xsa301-4.11-3.patch | ||
244 | xsa302-0001-IOMMU-add-missing-HVM-check.patch | ||
245 | xsa302-0002-passthrough-quarantine-PCI-devices.patch | ||
246 | xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch | ||
247 | xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch | ||
248 | xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | ||
249 | xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | ||
250 | xsa304-4.10-1.patch | ||
251 | xsa304-4.10-2.patch | ||
252 | xsa304-4.10-3.patch | ||
253 | xsa305-4.10-1.patch | ||
254 | xsa305-4.10-2.patch | ||
255 | |||
221 | xenstored.initd | 256 | xenstored.initd |
222 | xenstored.confd | 257 | xenstored.confd |
223 | xenconsoled.initd | 258 | xenconsoled.initd |
@@ -471,6 +506,32 @@ e76816c6ad0e91dc5f81947f266da3429b20e6d976c3e8c41202c6179532eec878a3f0913921ef3a | |||
471 | 69dfa60628ca838678862383528654ecbdf4269cbb5c9cfb6b84d976202a8dea85d711aa65a52fa1b477fb0b30604ca70cf1337192d6fb9388a08bbe7fe56077 xenstore_client_transaction_fix.patch | 506 | 69dfa60628ca838678862383528654ecbdf4269cbb5c9cfb6b84d976202a8dea85d711aa65a52fa1b477fb0b30604ca70cf1337192d6fb9388a08bbe7fe56077 xenstore_client_transaction_fix.patch |
472 | 2094ea964fa610b2bf72fd2c7ede7e954899a75c0f5b08030cf1d74460fb759ade84866176e32f8fe29c921dfdc6dafd2b31e23ab9b0a3874d3dceeabdd1913b xenqemu-xattr-size-max.patch | 507 | 2094ea964fa610b2bf72fd2c7ede7e954899a75c0f5b08030cf1d74460fb759ade84866176e32f8fe29c921dfdc6dafd2b31e23ab9b0a3874d3dceeabdd1913b xenqemu-xattr-size-max.patch |
473 | 8c9cfc6afca325df1d8026e21ed03fa8cd2c7e1a21a56cc1968301c5ab634bfe849951899e75d328951d7a41273d1e49a2448edbadec0029ed410c43c0549812 hotplug-Linux-iscsi-block-handle-lun-1.patch | 508 | 8c9cfc6afca325df1d8026e21ed03fa8cd2c7e1a21a56cc1968301c5ab634bfe849951899e75d328951d7a41273d1e49a2448edbadec0029ed410c43c0549812 hotplug-Linux-iscsi-block-handle-lun-1.patch |
509 | c81ce3b1f14731061af530861f628e1fd392211f98c4aba9db8354e7aff604902908733ec716d46f679e65e068717dc87694797480f490046701c4e2aecc3a51 xsa298-4.10.patch | ||
510 | eaeba22b8582a5f7cac727d0c068236a6af375b8b9f9e57d69d97569a6e1b7da15c38b611bc2504a84e044a6cafabc1fed27a134547c629210ebc66750fbce9f xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch | ||
511 | a027beb481e58b575967212381fd98e992eb28c1e6cd9a207c7c3f22e9aa6f65ca94b73cd02f460fdb2c931c527300bc2bd6dee9f039d1ace3532069ab9fb42d xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch | ||
512 | 6a48835ad80ba6d8c97d09e74303d8c430e1f8a1245bdd4ea9b9301d4d35a5bbb388ef694d8ca9bbf872521123c40ac8f8142e59c2b13efd932948083d98b09f xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch | ||
513 | a9774b3bece635bb86501f67718cdeeeadfb32c465ef11a41a0f9869b42f879a82c73753c198b5285bb29e8df6531f6467619c4b29b583e0a761f45c2419b521 xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch | ||
514 | d25dd31942d676c4b4f9db593b1a520ef8e3feaf50dd79313860eb5afd5e41503caca937d5bd0fbc57a02f9d85d52fea3646e0bb1580ff4971c6d194f872b9d1 xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch | ||
515 | 695a3ea0a0c2965e88cf907719aa2ace509d1f4944793eabbe3ace44d94f4f6b8e685695cf668c129d205b6b1ef30f37c13acb736bdf7de3b44c1b60d05c22be xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch | ||
516 | 8bd1fb05bed70aacdebf31755e673c74700d6f5ee1a15a35d950e90d5c34f16b3d0531b56ae74f17203cf87579d2b157c049efea040a2a03c7d0e8adce8498b9 xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch | ||
517 | 45bf263b11abd75e2fa2ee9e757c13de0a99365861d900b82cad0302446762a0ae76b9efbd870887d6353dcf95d565987debf43f80be4c9a0950c88964a3ee6a xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch | ||
518 | 35faf5434ebf4c6166d7f8fd10f9010e3dc8a714d5b9e168f641d420e070222c172060a7a72b8c81b93aa762b1d5286098713b485f86c1f1a679c5c588dd642f xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch | ||
519 | 8512e19397e30b4cca367b1fb936ef615ed5d4656206d16b24d0f44539a6ec5af07d0021a6276b48592a68b0fb7c5d3a3f035c9b3a1b7bfaa82f70204096a745 xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch | ||
520 | 81813683d7d83610296c7dfb2f75be7ccf1e332d9abc8fcf741906ddbcaa5b38511a1047c233e34e21437737be2fc343b027f4f73133c4ab823ff879842a5002 xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch | ||
521 | 916dc53eddb225a5c118630553baaa784c14a2a2ddc1f031ea83dbbb0241f0b103e664d3429414236951f6de689ff234be3fb900b83d1e1a4a4227385b32d496 xsa301-4.11-1.patch | ||
522 | 555d6586543f4b8661d1104851073277290ccce17d05279531c685966186948f933706d834ac1dd748678340df0aaa0423d18ea88be9a35bec67685aeb6258ac xsa301-4.11-2.patch | ||
523 | 5cf43aeb65988b7395a2d87bef0587cc172456ebebc2b9b67c231802ebcfb6bc1bdccccaaa09be463f1a79158234cb4003c0cd478990b99f020d812b90acc011 xsa301-4.11-3.patch | ||
524 | 6e918e7e6488d89807df5ff5c73926eb6c2990893c25850c5a55d2944619c6e135855ec57a5f54379c809e1ec854a4b56d1acd1c2bc0b50a06d183b470167d0f xsa302-0001-IOMMU-add-missing-HVM-check.patch | ||
525 | cda95d99b8a51175b1ca98318ae4488a7b82f43c1e7a4e9903d8f5f9277c08acb759d05f146b8363363f9f1ed45663190fb935726c43fe667301134b88b21692 xsa302-0002-passthrough-quarantine-PCI-devices.patch | ||
526 | b65de69f7c0097177652fc6fe7c0c12ab44c6bb0a8823b19ee315a574b04f9151a572d518d684fec467b995c9c9756bd5b2d88f7546199c0b807155c5dca43b5 xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch | ||
527 | 440869c1d7212820ba0c7d4b35681483897d1dcc4aa2f833af1370ac5bd8995b3d2712c598e6309488b90f37e36ca36db232e5de06242afa017d1c991f5d6af6 xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch | ||
528 | 7d56d0576fcd90ce4296e59cd2eae35929ecae6a7fa40e88c2f66f54234083b09be92630f299e5bb24d23b32949e58d49bafa1bed1e73719b73a4c640b86206f xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | ||
529 | 49b540f2585f43685c9f37ea1b6f166a98e71d85e6e0fbf2807b5788b3e71cb35dd71f71b7ad5a6d230ba60706cd11ef4bcecec7c2f250f28fd95dbd50fffc2b xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | ||
530 | 8502fd41000664f74382e2691f0a7ceef5121227532a55ffef3046745fe05461b266c93191f505ce3566b2e932b2f0880510dff714948384215fc48093b8d983 xsa304-4.10-1.patch | ||
531 | c0149a445a9f6ef4aa0d928ff321afa7ea6f52d96213042f444a9b96912729fa27c5b81c247c56f45922061f2e45649c8ab462d73765de8ca49022b9994ccf05 xsa304-4.10-2.patch | ||
532 | f7c34c984885f73f51fd3ca0274b7a6b3ca938547b910bb1becc73d7df668b0f9f69d6f402cc3a183a2acff1a9978c2d5775bd2acced4300212568e8ca22d47a xsa304-4.10-3.patch | ||
533 | eeca8ad1ec1b13b7d1849b94537d24e8f91eff6fb7b2e406a08accb9ec72ddb48360c90b2a250ffbc628970f00de557fcddacbcf09062a59a36a8b6ffcbf1909 xsa305-4.10-1.patch | ||
534 | 6fc52805ef24510aa5092d1bda61d1299b74c8b37fdca0c17e9df62ec16bb9c7343f09b8dd1f4801c4c5db3b3f6f7208c0c35034ef8aa86b08df308e82597892 xsa305-4.10-2.patch | ||
474 | 52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd | 535 | 52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd |
475 | 093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd | 536 | 093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd |
476 | 3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd | 537 | 3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd |
diff --git a/main/xen/xsa298-4.10.patch b/main/xen/xsa298-4.10.patch new file mode 100644 index 0000000000..f0b2c1efc1 --- /dev/null +++ b/main/xen/xsa298-4.10.patch | |||
@@ -0,0 +1,87 @@ | |||
1 | From: Jan Beulich <jbeulich@suse.com> | ||
2 | Subject: x86/PV: check GDT/LDT limits during emulation | ||
3 | |||
4 | Accesses beyond the LDT limit originating from emulation would trigger | ||
5 | the ASSERT() in pv_map_ldt_shadow_page(). On production builds such | ||
6 | accesses would cause an attempt to promote the touched page (offset from | ||
7 | the present LDT base address) to a segment descriptor one. If this | ||
8 | happens to succeed, guest user mode would be able to elevate its | ||
9 | privileges to that of the guest kernel. This is particularly easy when | ||
10 | there's no LDT at all, in which case the LDT base stored internally to | ||
11 | Xen is simply zero. | ||
12 | |||
13 | Also adjust the ASSERT() that was triggering: It was off by one to | ||
14 | begin with, and for production builds we also better use | ||
15 | ASSERT_UNREACHABLE() instead with suitable recovery code afterwards. | ||
16 | |||
17 | This is XSA-298. | ||
18 | |||
19 | Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> | ||
20 | Signed-off-by: Jan Beulich <jbeulich@suse.com> | ||
21 | Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> | ||
22 | |||
23 | --- a/xen/arch/x86/pv/emul-gate-op.c | ||
24 | +++ b/xen/arch/x86/pv/emul-gate-op.c | ||
25 | @@ -60,7 +60,13 @@ static int read_gate_descriptor(unsigned | ||
26 | (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v)) | ||
27 | + (gate_sel >> 3); | ||
28 | if ( (gate_sel < 4) || | ||
29 | - ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) || | ||
30 | + /* | ||
31 | + * We're interested in call gates only, which occupy a single | ||
32 | + * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit. | ||
33 | + */ | ||
34 | + ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >= | ||
35 | + (gate_sel & 4 ? v->arch.pv_vcpu.ldt_ents | ||
36 | + : v->arch.pv_vcpu.gdt_ents)) || | ||
37 | __get_user(desc, pdesc) ) | ||
38 | return 0; | ||
39 | |||
40 | @@ -79,7 +85,7 @@ static int read_gate_descriptor(unsigned | ||
41 | if ( !is_pv_32bit_vcpu(v) ) | ||
42 | { | ||
43 | if ( (*ar & 0x1f00) != 0x0c00 || | ||
44 | - (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) || | ||
45 | + /* Limit check done above already. */ | ||
46 | __get_user(desc, pdesc + 1) || | ||
47 | (desc.b & 0x1f00) ) | ||
48 | return 0; | ||
49 | --- a/xen/arch/x86/pv/emulate.c | ||
50 | +++ b/xen/arch/x86/pv/emulate.c | ||
51 | @@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int | ||
52 | { | ||
53 | struct desc_struct desc; | ||
54 | |||
55 | - if ( sel < 4) | ||
56 | + if ( sel < 4 || | ||
57 | + /* | ||
58 | + * Don't apply the GDT limit here, as the selector may be a Xen | ||
59 | + * provided one. __get_user() will fail (without taking further | ||
60 | + * action) for ones falling in the gap between guest populated | ||
61 | + * and Xen ones. | ||
62 | + */ | ||
63 | + ((sel & 4) && (sel >> 3) >= v->arch.pv_vcpu.ldt_ents) ) | ||
64 | desc.b = desc.a = 0; | ||
65 | else if ( __get_user(desc, | ||
66 | (const struct desc_struct *)(!(sel & 4) | ||
67 | --- a/xen/arch/x86/pv/mm.c | ||
68 | +++ b/xen/arch/x86/pv/mm.c | ||
69 | @@ -98,12 +98,16 @@ bool pv_map_ldt_shadow_page(unsigned int | ||
70 | BUG_ON(unlikely(in_irq())); | ||
71 | |||
72 | /* | ||
73 | - * Hardware limit checking should guarantee this property. NB. This is | ||
74 | + * Prior limit checking should guarantee this property. NB. This is | ||
75 | * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the | ||
76 | * current vcpu, and vcpu_reset() will block until this vcpu has been | ||
77 | * descheduled before continuing. | ||
78 | */ | ||
79 | - ASSERT((offset >> 3) <= curr->arch.pv_vcpu.ldt_ents); | ||
80 | + if ( unlikely((offset >> 3) >= curr->arch.pv_vcpu.ldt_ents) ) | ||
81 | + { | ||
82 | + ASSERT_UNREACHABLE(); | ||
83 | + return false; | ||
84 | + } | ||
85 | |||
86 | if ( is_pv_32bit_domain(currd) ) | ||
87 | linear = (uint32_t)linear; | ||
diff --git a/main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch b/main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch new file mode 100644 index 0000000000..2db7b3f980 --- /dev/null +++ b/main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch | |||
@@ -0,0 +1,94 @@ | |||
1 | From bc266a68aa014af2cc3ed0a1f55723fdeac2e545 Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 01/11] x86/mm: L1TF checks don't leave a partial entry | ||
5 | |||
6 | On detection of a potential L1TF issue, most validation code returns | ||
7 | -ERESTART to allow the switch to shadow mode to happen and cause the | ||
8 | original operation to be restarted. | ||
9 | |||
10 | However, in the validation code, the return value -ERESTART has been | ||
11 | repurposed to indicate 1) the function has partially completed | ||
12 | something which needs to be undone, and 2) calling put_page_type() | ||
13 | should cleanly undo it. This causes problems in several places. | ||
14 | |||
15 | For L1 tables, on receiving an -ERESTART return from alloc_l1_table(), | ||
16 | alloc_page_type() will set PGT_partial on the page. If for some | ||
17 | reason the original operation never restarts, then on domain | ||
18 | destruction, relinquish_memory() will call free_page_type() on the | ||
19 | page. | ||
20 | |||
21 | Unfortunately, alloc_ and free_l1_table() aren't set up to deal with | ||
22 | PGT_partial. When returning a failure, alloc_l1_table() always | ||
23 | de-validates whatever it's validated so far, and free_l1_table() | ||
24 | always devalidates the whole page. This means that if | ||
25 | relinquish_memory() calls free_page_type() on an L1 that didn't | ||
26 | complete due to an L1TF, it will call put_page_from_l1e() on "page | ||
27 | entries" that have never been validated. | ||
28 | |||
29 | For L2+ tables, setting rc to ERESTART causes the rest of the | ||
30 | alloc_lN_table() function to *think* that the entry in question will | ||
31 | have PGT_partial set. This will cause it to set partial_pte = 1. If | ||
32 | relinqush_memory() then calls free_page_type() on one of those pages, | ||
33 | then free_lN_table() will call put_page_from_lNe() on the entry when | ||
34 | it shouldn't. | ||
35 | |||
36 | Rather than indicating -ERESTART, indicate -EINTR. This is the code | ||
37 | to indicate that nothing has changed from when you started the call | ||
38 | (which is effectively how alloc_l1_table() handles errors). | ||
39 | |||
40 | mod_lN_entry() shouldn't have any of these types of problems, so leave | ||
41 | potential changes there for a clean-up patch later. | ||
42 | |||
43 | This is part of XSA-299. | ||
44 | |||
45 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
46 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
47 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
48 | --- | ||
49 | xen/arch/x86/mm.c | 8 ++++---- | ||
50 | 1 file changed, 4 insertions(+), 4 deletions(-) | ||
51 | |||
52 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
53 | index ce2c082caf..0cbca48a02 100644 | ||
54 | --- a/xen/arch/x86/mm.c | ||
55 | +++ b/xen/arch/x86/mm.c | ||
56 | @@ -1152,7 +1152,7 @@ get_page_from_l2e( | ||
57 | int rc; | ||
58 | |||
59 | if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) | ||
60 | - return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1; | ||
61 | + return pv_l1tf_check_l2e(d, l2e) ? -EINTR : 1; | ||
62 | |||
63 | if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) | ||
64 | { | ||
65 | @@ -1188,7 +1188,7 @@ get_page_from_l3e( | ||
66 | int rc; | ||
67 | |||
68 | if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) | ||
69 | - return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1; | ||
70 | + return pv_l1tf_check_l3e(d, l3e) ? -EINTR : 1; | ||
71 | |||
72 | if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) | ||
73 | { | ||
74 | @@ -1221,7 +1221,7 @@ get_page_from_l4e( | ||
75 | int rc; | ||
76 | |||
77 | if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) | ||
78 | - return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1; | ||
79 | + return pv_l1tf_check_l4e(d, l4e) ? -EINTR : 1; | ||
80 | |||
81 | if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) | ||
82 | { | ||
83 | @@ -1435,7 +1435,7 @@ static int alloc_l1_table(struct page_info *page) | ||
84 | { | ||
85 | if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) | ||
86 | { | ||
87 | - ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0; | ||
88 | + ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -EINTR : 0; | ||
89 | if ( ret ) | ||
90 | goto out; | ||
91 | } | ||
92 | -- | ||
93 | 2.23.0 | ||
94 | |||
diff --git a/main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch b/main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch new file mode 100644 index 0000000000..10345be2d4 --- /dev/null +++ b/main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch | |||
@@ -0,0 +1,99 @@ | |||
1 | From fd7bfe9aaee41c589c16c541ec538285dcde1fb2 Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 02/11] x86/mm: Don't re-set PGT_pinned on a partially | ||
5 | de-validated page | ||
6 | |||
7 | When unpinning pagetables, if an operation is interrupted, | ||
8 | relinquish_memory() re-sets PGT_pinned so that the un-pin will | ||
9 | pickedup again when the hypercall restarts. | ||
10 | |||
11 | This is appropriate when put_page_and_type_preemptible() returns | ||
12 | -EINTR, which indicates that the page is back in its initial state | ||
13 | (i.e., completely validated). However, for -ERESTART, this leads to a | ||
14 | state where a page has both PGT_pinned and PGT_partial set. | ||
15 | |||
16 | This happens to work at the moment, although it's not really a | ||
17 | "canonical" state; but in subsequent patches, where we need to make a | ||
18 | distinction in handling between PGT_validated and PGT_partial pages, | ||
19 | this causes issues. | ||
20 | |||
21 | Move to a "canonical" state by: | ||
22 | - Only re-setting PGT_pinned on -EINTR | ||
23 | - Re-dropping the refcount held by PGT_pinned on -ERESTART | ||
24 | |||
25 | In the latter case, the PGT_partial bit will be cleared further down | ||
26 | with the rest of the other PGT_partial pages. | ||
27 | |||
28 | While here, clean up some trainling whitespace. | ||
29 | |||
30 | This is part of XSA-299. | ||
31 | |||
32 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
33 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
34 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
35 | --- | ||
36 | xen/arch/x86/domain.c | 31 ++++++++++++++++++++++++++++--- | ||
37 | 1 file changed, 28 insertions(+), 3 deletions(-) | ||
38 | |||
39 | diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c | ||
40 | index 91c2b1c21a..897124f05f 100644 | ||
41 | --- a/xen/arch/x86/domain.c | ||
42 | +++ b/xen/arch/x86/domain.c | ||
43 | @@ -112,7 +112,7 @@ static void play_dead(void) | ||
44 | * this case, heap corruption or #PF can occur (when heap debugging is | ||
45 | * enabled). For example, even printk() can involve tasklet scheduling, | ||
46 | * which touches per-cpu vars. | ||
47 | - * | ||
48 | + * | ||
49 | * Consider very carefully when adding code to *dead_idle. Most hypervisor | ||
50 | * subsystems are unsafe to call. | ||
51 | */ | ||
52 | @@ -1837,9 +1837,34 @@ static int relinquish_memory( | ||
53 | break; | ||
54 | case -ERESTART: | ||
55 | case -EINTR: | ||
56 | + /* | ||
57 | + * -EINTR means PGT_validated has been re-set; re-set | ||
58 | + * PGT_pinned again so that it gets picked up next time | ||
59 | + * around. | ||
60 | + * | ||
61 | + * -ERESTART, OTOH, means PGT_partial is set instead. Put | ||
62 | + * it back on the list, but don't set PGT_pinned; the | ||
63 | + * section below will finish off de-validation. But we do | ||
64 | + * need to drop the general ref associated with | ||
65 | + * PGT_pinned, since put_page_and_type_preemptible() | ||
66 | + * didn't do it. | ||
67 | + * | ||
68 | + * NB we can do an ASSERT for PGT_validated, since we | ||
69 | + * "own" the type ref; but theoretically, the PGT_partial | ||
70 | + * could be cleared by someone else. | ||
71 | + */ | ||
72 | + if ( ret == -EINTR ) | ||
73 | + { | ||
74 | + ASSERT(page->u.inuse.type_info & PGT_validated); | ||
75 | + set_bit(_PGT_pinned, &page->u.inuse.type_info); | ||
76 | + } | ||
77 | + else | ||
78 | + put_page(page); | ||
79 | + | ||
80 | ret = -ERESTART; | ||
81 | + | ||
82 | + /* Put the page back on the list and drop the ref we grabbed above */ | ||
83 | page_list_add(page, list); | ||
84 | - set_bit(_PGT_pinned, &page->u.inuse.type_info); | ||
85 | put_page(page); | ||
86 | goto out; | ||
87 | default: | ||
88 | @@ -2061,7 +2086,7 @@ void vcpu_kick(struct vcpu *v) | ||
89 | * pending flag. These values may fluctuate (after all, we hold no | ||
90 | * locks) but the key insight is that each change will cause | ||
91 | * evtchn_upcall_pending to be polled. | ||
92 | - * | ||
93 | + * | ||
94 | * NB2. We save the running flag across the unblock to avoid a needless | ||
95 | * IPI for domains that we IPI'd to unblock. | ||
96 | */ | ||
97 | -- | ||
98 | 2.23.0 | ||
99 | |||
diff --git a/main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch b/main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch new file mode 100644 index 0000000000..1e79d7666e --- /dev/null +++ b/main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch | |||
@@ -0,0 +1,610 @@ | |||
1 | From 6bad09c708d906922fb59d7e2c06d5de9a633ca3 Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 03/11] x86/mm: Separate out partial_pte tristate into | ||
5 | individual flags | ||
6 | |||
7 | At the moment, partial_pte is a tri-state that contains two distinct bits | ||
8 | of information: | ||
9 | |||
10 | 1. If zero, the pte at index [nr_validated_ptes] is un-validated. If | ||
11 | non-zero, the pte was last seen with PGT_partial set. | ||
12 | |||
13 | 2. If positive, the pte at index [nr_validated_ptes] does not hold a | ||
14 | general reference count. If negative, it does. | ||
15 | |||
16 | To make future patches more clear, separate out this functionality | ||
17 | into two distinct, named bits: PTF_partial_set (for #1) and | ||
18 | PTF_partial_general_ref (for #2). | ||
19 | |||
20 | Additionally, a number of functions which need this information also | ||
21 | take other flags to control behavior (such as `preemptible` and | ||
22 | `defer`). These are hard to read in the caller (since you only see | ||
23 | 'true' or 'false'), and ugly when many are added together. In | ||
24 | preparation for adding yet another flag in a future patch, collapse | ||
25 | all of these into a single `flag` variable. | ||
26 | |||
27 | NB that this does mean checking for what was previously the '-1' | ||
28 | condition a bit more ugly in the put_page_from_lNe functions (since | ||
29 | you have to check for both partial_set and general ref); but this | ||
30 | clause will go away in a future patch. | ||
31 | |||
32 | Also note that the original comment had an off-by-one error: | ||
33 | partial_flags (like partial_pte before it) concerns | ||
34 | plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1]. | ||
35 | |||
36 | No functional change intended. | ||
37 | |||
38 | This is part of XSA-299. | ||
39 | |||
40 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
41 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
42 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
43 | --- | ||
44 | xen/arch/x86/mm.c | 165 ++++++++++++++++++++++++--------------- | ||
45 | xen/include/asm-x86/mm.h | 41 +++++++--- | ||
46 | 2 files changed, 128 insertions(+), 78 deletions(-) | ||
47 | |||
48 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
49 | index 0cbca48a02..84ee48ec3f 100644 | ||
50 | --- a/xen/arch/x86/mm.c | ||
51 | +++ b/xen/arch/x86/mm.c | ||
52 | @@ -651,20 +651,34 @@ static int alloc_segdesc_page(struct page_info *page) | ||
53 | static int __get_page_type(struct page_info *page, unsigned long type, | ||
54 | int preemptible); | ||
55 | |||
56 | +/* | ||
57 | + * The following flags are used to specify behavior of various get and | ||
58 | + * put commands. The first two are also stored in page->partial_flags | ||
59 | + * to indicate the state of the page pointed to by | ||
60 | + * page->pte[page->nr_validated_entries]. See the comment in mm.h for | ||
61 | + * more information. | ||
62 | + */ | ||
63 | +#define PTF_partial_set (1 << 0) | ||
64 | +#define PTF_partial_general_ref (1 << 1) | ||
65 | +#define PTF_preemptible (1 << 2) | ||
66 | +#define PTF_defer (1 << 3) | ||
67 | + | ||
68 | static int get_page_and_type_from_mfn( | ||
69 | mfn_t mfn, unsigned long type, struct domain *d, | ||
70 | - int partial, int preemptible) | ||
71 | + unsigned int flags) | ||
72 | { | ||
73 | struct page_info *page = mfn_to_page(mfn); | ||
74 | int rc; | ||
75 | + bool preemptible = flags & PTF_preemptible, | ||
76 | + partial_ref = flags & PTF_partial_general_ref; | ||
77 | |||
78 | - if ( likely(partial >= 0) && | ||
79 | + if ( likely(!partial_ref) && | ||
80 | unlikely(!get_page_from_mfn(mfn, d)) ) | ||
81 | return -EINVAL; | ||
82 | |||
83 | rc = __get_page_type(page, type, preemptible); | ||
84 | |||
85 | - if ( unlikely(rc) && partial >= 0 && | ||
86 | + if ( unlikely(rc) && !partial_ref && | ||
87 | (!preemptible || page != current->arch.old_guest_table) ) | ||
88 | put_page(page); | ||
89 | |||
90 | @@ -1146,7 +1160,7 @@ get_page_from_l1e( | ||
91 | define_get_linear_pagetable(l2); | ||
92 | static int | ||
93 | get_page_from_l2e( | ||
94 | - l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial) | ||
95 | + l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags) | ||
96 | { | ||
97 | unsigned long mfn = l2e_get_pfn(l2e); | ||
98 | int rc; | ||
99 | @@ -1163,8 +1177,9 @@ get_page_from_l2e( | ||
100 | |||
101 | if ( !(l2e_get_flags(l2e) & _PAGE_PSE) ) | ||
102 | { | ||
103 | - rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, | ||
104 | - partial, false); | ||
105 | + ASSERT(!(flags & PTF_preemptible)); | ||
106 | + | ||
107 | + rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags); | ||
108 | if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) | ||
109 | rc = 0; | ||
110 | return rc; | ||
111 | @@ -1183,7 +1198,7 @@ get_page_from_l2e( | ||
112 | define_get_linear_pagetable(l3); | ||
113 | static int | ||
114 | get_page_from_l3e( | ||
115 | - l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial) | ||
116 | + l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags) | ||
117 | { | ||
118 | int rc; | ||
119 | |||
120 | @@ -1198,7 +1213,7 @@ get_page_from_l3e( | ||
121 | } | ||
122 | |||
123 | rc = get_page_and_type_from_mfn( | ||
124 | - l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1); | ||
125 | + l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible); | ||
126 | if ( unlikely(rc == -EINVAL) && | ||
127 | !is_pv_32bit_domain(d) && | ||
128 | get_l3_linear_pagetable(l3e, pfn, d) ) | ||
129 | @@ -1216,7 +1231,7 @@ get_page_from_l3e( | ||
130 | define_get_linear_pagetable(l4); | ||
131 | static int | ||
132 | get_page_from_l4e( | ||
133 | - l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial) | ||
134 | + l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags) | ||
135 | { | ||
136 | int rc; | ||
137 | |||
138 | @@ -1231,7 +1246,7 @@ get_page_from_l4e( | ||
139 | } | ||
140 | |||
141 | rc = get_page_and_type_from_mfn( | ||
142 | - l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1); | ||
143 | + l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible); | ||
144 | if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) ) | ||
145 | rc = 0; | ||
146 | |||
147 | @@ -1306,7 +1321,7 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) | ||
148 | * Note also that this automatically deals correctly with linear p.t.'s. | ||
149 | */ | ||
150 | static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, | ||
151 | - int partial, bool defer) | ||
152 | + unsigned int flags) | ||
153 | { | ||
154 | int rc = 0; | ||
155 | |||
156 | @@ -1326,12 +1341,13 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, | ||
157 | struct page_info *pg = l2e_get_page(l2e); | ||
158 | struct page_info *ptpg = mfn_to_page(_mfn(pfn)); | ||
159 | |||
160 | - if ( unlikely(partial > 0) ) | ||
161 | + if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == | ||
162 | + PTF_partial_set ) | ||
163 | { | ||
164 | - ASSERT(!defer); | ||
165 | + ASSERT(!(flags & PTF_defer)); | ||
166 | rc = _put_page_type(pg, true, ptpg); | ||
167 | } | ||
168 | - else if ( defer ) | ||
169 | + else if ( flags & PTF_defer ) | ||
170 | { | ||
171 | current->arch.old_guest_ptpg = ptpg; | ||
172 | current->arch.old_guest_table = pg; | ||
173 | @@ -1348,7 +1364,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, | ||
174 | } | ||
175 | |||
176 | static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, | ||
177 | - int partial, bool defer) | ||
178 | + unsigned int flags) | ||
179 | { | ||
180 | struct page_info *pg; | ||
181 | int rc; | ||
182 | @@ -1371,13 +1387,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, | ||
183 | |||
184 | pg = l3e_get_page(l3e); | ||
185 | |||
186 | - if ( unlikely(partial > 0) ) | ||
187 | + if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == | ||
188 | + PTF_partial_set ) | ||
189 | { | ||
190 | - ASSERT(!defer); | ||
191 | + ASSERT(!(flags & PTF_defer)); | ||
192 | return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); | ||
193 | } | ||
194 | |||
195 | - if ( defer ) | ||
196 | + if ( flags & PTF_defer ) | ||
197 | { | ||
198 | current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); | ||
199 | current->arch.old_guest_table = pg; | ||
200 | @@ -1392,7 +1409,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, | ||
201 | } | ||
202 | |||
203 | static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, | ||
204 | - int partial, bool defer) | ||
205 | + unsigned int flags) | ||
206 | { | ||
207 | int rc = 1; | ||
208 | |||
209 | @@ -1401,13 +1418,14 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, | ||
210 | { | ||
211 | struct page_info *pg = l4e_get_page(l4e); | ||
212 | |||
213 | - if ( unlikely(partial > 0) ) | ||
214 | + if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == | ||
215 | + PTF_partial_set ) | ||
216 | { | ||
217 | - ASSERT(!defer); | ||
218 | + ASSERT(!(flags & PTF_defer)); | ||
219 | return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); | ||
220 | } | ||
221 | |||
222 | - if ( defer ) | ||
223 | + if ( flags & PTF_defer ) | ||
224 | { | ||
225 | current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); | ||
226 | current->arch.old_guest_table = pg; | ||
227 | @@ -1514,12 +1532,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
228 | unsigned long pfn = mfn_x(page_to_mfn(page)); | ||
229 | l2_pgentry_t *pl2e; | ||
230 | unsigned int i; | ||
231 | - int rc = 0, partial = page->partial_pte; | ||
232 | + int rc = 0; | ||
233 | + unsigned int partial_flags = page->partial_flags; | ||
234 | |||
235 | pl2e = map_domain_page(_mfn(pfn)); | ||
236 | |||
237 | for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; | ||
238 | - i++, partial = 0 ) | ||
239 | + i++, partial_flags = 0 ) | ||
240 | { | ||
241 | if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) | ||
242 | { | ||
243 | @@ -1529,18 +1548,19 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
244 | } | ||
245 | |||
246 | if ( !is_guest_l2_slot(d, type, i) || | ||
247 | - (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 ) | ||
248 | + (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 ) | ||
249 | continue; | ||
250 | |||
251 | if ( rc == -ERESTART ) | ||
252 | { | ||
253 | page->nr_validated_ptes = i; | ||
254 | - page->partial_pte = partial ?: 1; | ||
255 | + /* Set 'set', retain 'general ref' */ | ||
256 | + page->partial_flags = partial_flags | PTF_partial_set; | ||
257 | } | ||
258 | else if ( rc == -EINTR && i ) | ||
259 | { | ||
260 | page->nr_validated_ptes = i; | ||
261 | - page->partial_pte = 0; | ||
262 | + page->partial_flags = 0; | ||
263 | rc = -ERESTART; | ||
264 | } | ||
265 | else if ( rc < 0 && rc != -EINTR ) | ||
266 | @@ -1549,7 +1569,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
267 | if ( i ) | ||
268 | { | ||
269 | page->nr_validated_ptes = i; | ||
270 | - page->partial_pte = 0; | ||
271 | + page->partial_flags = 0; | ||
272 | current->arch.old_guest_ptpg = NULL; | ||
273 | current->arch.old_guest_table = page; | ||
274 | } | ||
275 | @@ -1573,7 +1593,8 @@ static int alloc_l3_table(struct page_info *page) | ||
276 | unsigned long pfn = mfn_x(page_to_mfn(page)); | ||
277 | l3_pgentry_t *pl3e; | ||
278 | unsigned int i; | ||
279 | - int rc = 0, partial = page->partial_pte; | ||
280 | + int rc = 0; | ||
281 | + unsigned int partial_flags = page->partial_flags; | ||
282 | |||
283 | pl3e = map_domain_page(_mfn(pfn)); | ||
284 | |||
285 | @@ -1588,7 +1609,7 @@ static int alloc_l3_table(struct page_info *page) | ||
286 | memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); | ||
287 | |||
288 | for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; | ||
289 | - i++, partial = 0 ) | ||
290 | + i++, partial_flags = 0 ) | ||
291 | { | ||
292 | if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) | ||
293 | { | ||
294 | @@ -1605,20 +1626,22 @@ static int alloc_l3_table(struct page_info *page) | ||
295 | else | ||
296 | rc = get_page_and_type_from_mfn( | ||
297 | l3e_get_mfn(pl3e[i]), | ||
298 | - PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1); | ||
299 | + PGT_l2_page_table | PGT_pae_xen_l2, d, | ||
300 | + partial_flags | PTF_preemptible); | ||
301 | } | ||
302 | - else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 ) | ||
303 | + else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 ) | ||
304 | continue; | ||
305 | |||
306 | if ( rc == -ERESTART ) | ||
307 | { | ||
308 | page->nr_validated_ptes = i; | ||
309 | - page->partial_pte = partial ?: 1; | ||
310 | + /* Set 'set', leave 'general ref' set if this entry was set */ | ||
311 | + page->partial_flags = partial_flags | PTF_partial_set; | ||
312 | } | ||
313 | else if ( rc == -EINTR && i ) | ||
314 | { | ||
315 | page->nr_validated_ptes = i; | ||
316 | - page->partial_pte = 0; | ||
317 | + page->partial_flags = 0; | ||
318 | rc = -ERESTART; | ||
319 | } | ||
320 | if ( rc < 0 ) | ||
321 | @@ -1635,7 +1658,7 @@ static int alloc_l3_table(struct page_info *page) | ||
322 | if ( i ) | ||
323 | { | ||
324 | page->nr_validated_ptes = i; | ||
325 | - page->partial_pte = 0; | ||
326 | + page->partial_flags = 0; | ||
327 | current->arch.old_guest_ptpg = NULL; | ||
328 | current->arch.old_guest_table = page; | ||
329 | } | ||
330 | @@ -1767,19 +1790,21 @@ static int alloc_l4_table(struct page_info *page) | ||
331 | unsigned long pfn = mfn_x(page_to_mfn(page)); | ||
332 | l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); | ||
333 | unsigned int i; | ||
334 | - int rc = 0, partial = page->partial_pte; | ||
335 | + int rc = 0; | ||
336 | + unsigned int partial_flags = page->partial_flags; | ||
337 | |||
338 | for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; | ||
339 | - i++, partial = 0 ) | ||
340 | + i++, partial_flags = 0 ) | ||
341 | { | ||
342 | if ( !is_guest_l4_slot(d, i) || | ||
343 | - (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 ) | ||
344 | + (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 ) | ||
345 | continue; | ||
346 | |||
347 | if ( rc == -ERESTART ) | ||
348 | { | ||
349 | page->nr_validated_ptes = i; | ||
350 | - page->partial_pte = partial ?: 1; | ||
351 | + /* Set 'set', leave 'general ref' set if this entry was set */ | ||
352 | + page->partial_flags = partial_flags | PTF_partial_set; | ||
353 | } | ||
354 | else if ( rc < 0 ) | ||
355 | { | ||
356 | @@ -1789,7 +1814,7 @@ static int alloc_l4_table(struct page_info *page) | ||
357 | if ( i ) | ||
358 | { | ||
359 | page->nr_validated_ptes = i; | ||
360 | - page->partial_pte = 0; | ||
361 | + page->partial_flags = 0; | ||
362 | if ( rc == -EINTR ) | ||
363 | rc = -ERESTART; | ||
364 | else | ||
365 | @@ -1842,19 +1867,20 @@ static int free_l2_table(struct page_info *page) | ||
366 | struct domain *d = page_get_owner(page); | ||
367 | unsigned long pfn = mfn_x(page_to_mfn(page)); | ||
368 | l2_pgentry_t *pl2e; | ||
369 | - int rc = 0, partial = page->partial_pte; | ||
370 | - unsigned int i = page->nr_validated_ptes - !partial; | ||
371 | + int rc = 0; | ||
372 | + unsigned int partial_flags = page->partial_flags, | ||
373 | + i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); | ||
374 | |||
375 | pl2e = map_domain_page(_mfn(pfn)); | ||
376 | |||
377 | for ( ; ; ) | ||
378 | { | ||
379 | if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) | ||
380 | - rc = put_page_from_l2e(pl2e[i], pfn, partial, false); | ||
381 | + rc = put_page_from_l2e(pl2e[i], pfn, partial_flags); | ||
382 | if ( rc < 0 ) | ||
383 | break; | ||
384 | |||
385 | - partial = 0; | ||
386 | + partial_flags = 0; | ||
387 | |||
388 | if ( !i-- ) | ||
389 | break; | ||
390 | @@ -1876,12 +1902,14 @@ static int free_l2_table(struct page_info *page) | ||
391 | else if ( rc == -ERESTART ) | ||
392 | { | ||
393 | page->nr_validated_ptes = i; | ||
394 | - page->partial_pte = partial ?: -1; | ||
395 | + page->partial_flags = (partial_flags & PTF_partial_set) ? | ||
396 | + partial_flags : | ||
397 | + (PTF_partial_set | PTF_partial_general_ref); | ||
398 | } | ||
399 | else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) | ||
400 | { | ||
401 | page->nr_validated_ptes = i + 1; | ||
402 | - page->partial_pte = 0; | ||
403 | + page->partial_flags = 0; | ||
404 | rc = -ERESTART; | ||
405 | } | ||
406 | |||
407 | @@ -1893,18 +1921,19 @@ static int free_l3_table(struct page_info *page) | ||
408 | struct domain *d = page_get_owner(page); | ||
409 | unsigned long pfn = mfn_x(page_to_mfn(page)); | ||
410 | l3_pgentry_t *pl3e; | ||
411 | - int rc = 0, partial = page->partial_pte; | ||
412 | - unsigned int i = page->nr_validated_ptes - !partial; | ||
413 | + int rc = 0; | ||
414 | + unsigned int partial_flags = page->partial_flags, | ||
415 | + i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); | ||
416 | |||
417 | pl3e = map_domain_page(_mfn(pfn)); | ||
418 | |||
419 | for ( ; ; ) | ||
420 | { | ||
421 | - rc = put_page_from_l3e(pl3e[i], pfn, partial, 0); | ||
422 | + rc = put_page_from_l3e(pl3e[i], pfn, partial_flags); | ||
423 | if ( rc < 0 ) | ||
424 | break; | ||
425 | |||
426 | - partial = 0; | ||
427 | + partial_flags = 0; | ||
428 | if ( rc == 0 ) | ||
429 | pl3e[i] = unadjust_guest_l3e(pl3e[i], d); | ||
430 | |||
431 | @@ -1923,12 +1952,14 @@ static int free_l3_table(struct page_info *page) | ||
432 | if ( rc == -ERESTART ) | ||
433 | { | ||
434 | page->nr_validated_ptes = i; | ||
435 | - page->partial_pte = partial ?: -1; | ||
436 | + page->partial_flags = (partial_flags & PTF_partial_set) ? | ||
437 | + partial_flags : | ||
438 | + (PTF_partial_set | PTF_partial_general_ref); | ||
439 | } | ||
440 | else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) | ||
441 | { | ||
442 | page->nr_validated_ptes = i + 1; | ||
443 | - page->partial_pte = 0; | ||
444 | + page->partial_flags = 0; | ||
445 | rc = -ERESTART; | ||
446 | } | ||
447 | return rc > 0 ? 0 : rc; | ||
448 | @@ -1939,26 +1970,29 @@ static int free_l4_table(struct page_info *page) | ||
449 | struct domain *d = page_get_owner(page); | ||
450 | unsigned long pfn = mfn_x(page_to_mfn(page)); | ||
451 | l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); | ||
452 | - int rc = 0, partial = page->partial_pte; | ||
453 | - unsigned int i = page->nr_validated_ptes - !partial; | ||
454 | + int rc = 0; | ||
455 | + unsigned partial_flags = page->partial_flags, | ||
456 | + i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); | ||
457 | |||
458 | do { | ||
459 | if ( is_guest_l4_slot(d, i) ) | ||
460 | - rc = put_page_from_l4e(pl4e[i], pfn, partial, 0); | ||
461 | + rc = put_page_from_l4e(pl4e[i], pfn, partial_flags); | ||
462 | if ( rc < 0 ) | ||
463 | break; | ||
464 | - partial = 0; | ||
465 | + partial_flags = 0; | ||
466 | } while ( i-- ); | ||
467 | |||
468 | if ( rc == -ERESTART ) | ||
469 | { | ||
470 | page->nr_validated_ptes = i; | ||
471 | - page->partial_pte = partial ?: -1; | ||
472 | + page->partial_flags = (partial_flags & PTF_partial_set) ? | ||
473 | + partial_flags : | ||
474 | + (PTF_partial_set | PTF_partial_general_ref); | ||
475 | } | ||
476 | else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) | ||
477 | { | ||
478 | page->nr_validated_ptes = i + 1; | ||
479 | - page->partial_pte = 0; | ||
480 | + page->partial_flags = 0; | ||
481 | rc = -ERESTART; | ||
482 | } | ||
483 | |||
484 | @@ -2180,7 +2214,7 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, | ||
485 | return -EBUSY; | ||
486 | } | ||
487 | |||
488 | - put_page_from_l2e(ol2e, pfn, 0, true); | ||
489 | + put_page_from_l2e(ol2e, pfn, PTF_defer); | ||
490 | |||
491 | return rc; | ||
492 | } | ||
493 | @@ -2248,7 +2282,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, | ||
494 | if ( !create_pae_xen_mappings(d, pl3e) ) | ||
495 | BUG(); | ||
496 | |||
497 | - put_page_from_l3e(ol3e, pfn, 0, 1); | ||
498 | + put_page_from_l3e(ol3e, pfn, PTF_defer); | ||
499 | return rc; | ||
500 | } | ||
501 | |||
502 | @@ -2311,7 +2345,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, | ||
503 | return -EFAULT; | ||
504 | } | ||
505 | |||
506 | - put_page_from_l4e(ol4e, pfn, 0, 1); | ||
507 | + put_page_from_l4e(ol4e, pfn, PTF_defer); | ||
508 | return rc; | ||
509 | } | ||
510 | |||
511 | @@ -2577,7 +2611,7 @@ int free_page_type(struct page_info *page, unsigned long type, | ||
512 | if ( !(type & PGT_partial) ) | ||
513 | { | ||
514 | page->nr_validated_ptes = 1U << PAGETABLE_ORDER; | ||
515 | - page->partial_pte = 0; | ||
516 | + page->partial_flags = 0; | ||
517 | } | ||
518 | |||
519 | switch ( type & PGT_type_mask ) | ||
520 | @@ -2862,7 +2896,7 @@ static int __get_page_type(struct page_info *page, unsigned long type, | ||
521 | if ( !(x & PGT_partial) ) | ||
522 | { | ||
523 | page->nr_validated_ptes = 0; | ||
524 | - page->partial_pte = 0; | ||
525 | + page->partial_flags = 0; | ||
526 | } | ||
527 | page->linear_pt_count = 0; | ||
528 | rc = alloc_page_type(page, type, preemptible); | ||
529 | @@ -3037,7 +3071,8 @@ int new_guest_cr3(mfn_t mfn) | ||
530 | |||
531 | rc = paging_mode_refcounts(d) | ||
532 | ? (get_page_from_mfn(mfn, d) ? 0 : -EINVAL) | ||
533 | - : get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1); | ||
534 | + : get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, | ||
535 | + PTF_preemptible); | ||
536 | switch ( rc ) | ||
537 | { | ||
538 | case 0: | ||
539 | @@ -3420,7 +3455,7 @@ long do_mmuext_op( | ||
540 | if ( op.arg1.mfn != 0 ) | ||
541 | { | ||
542 | rc = get_page_and_type_from_mfn( | ||
543 | - _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1); | ||
544 | + _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible); | ||
545 | |||
546 | if ( unlikely(rc) ) | ||
547 | { | ||
548 | diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h | ||
549 | index 1030b8b5e6..a531fe3115 100644 | ||
550 | --- a/xen/include/asm-x86/mm.h | ||
551 | +++ b/xen/include/asm-x86/mm.h | ||
552 | @@ -157,19 +157,34 @@ struct page_info | ||
553 | * setting the flag must not drop that reference, whereas the instance | ||
554 | * clearing it will have to. | ||
555 | * | ||
556 | - * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has | ||
557 | - * been partially validated. This implies that the general reference | ||
558 | - * to the page (acquired from get_page_from_lNe()) would be dropped | ||
559 | - * (again due to the apparent failure) and hence must be re-acquired | ||
560 | - * when resuming the validation, but must not be dropped when picking | ||
561 | - * up the page for invalidation. | ||
562 | + * If partial_flags & PTF_partial_set is set, then the page at | ||
563 | + * at @nr_validated_ptes had PGT_partial set as a result of an | ||
564 | + * operation on the current page. (That page may or may not | ||
565 | + * still have PGT_partial set.) | ||
566 | * | ||
567 | - * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has | ||
568 | - * been partially invalidated. This is basically the opposite case of | ||
569 | - * above, i.e. the general reference to the page was not dropped in | ||
570 | - * put_page_from_lNe() (due to the apparent failure), and hence it | ||
571 | - * must be dropped when the put operation is resumed (and completes), | ||
572 | - * but it must not be acquired if picking up the page for validation. | ||
573 | + * If PTF_partial_general_ref is set, then the PTE at | ||
574 | + * @nr_validated_ptef holds a general reference count for the | ||
575 | + * page. | ||
576 | + * | ||
577 | + * This happens: | ||
578 | + * - During de-validation, if de-validation of the page was | ||
579 | + * interrupted | ||
580 | + * - During validation, if an invalid entry is encountered and | ||
581 | + * validation is preemptible | ||
582 | + * - During validation, if PTF_partial_general_ref was set on | ||
583 | + * this entry to begin with (perhaps because we're picking | ||
584 | + * up from a partial de-validation). | ||
585 | + * | ||
586 | + * When resuming validation, if PTF_partial_general_ref is clear, | ||
587 | + * then a general reference must be re-acquired; if it is set, no | ||
588 | + * reference should be acquired. | ||
589 | + * | ||
590 | + * When resuming de-validation, if PTF_partial_general_ref is | ||
591 | + * clear, no reference should be dropped; if it is set, a | ||
592 | + * reference should be dropped. | ||
593 | + * | ||
594 | + * NB that PTF_partial_set and PTF_partial_general_ref are | ||
595 | + * defined in mm.c, the only place where they are used. | ||
596 | * | ||
597 | * The 3rd field, @linear_pt_count, indicates | ||
598 | * - by a positive value, how many same-level page table entries a page | ||
599 | @@ -180,7 +195,7 @@ struct page_info | ||
600 | struct { | ||
601 | u16 nr_validated_ptes:PAGETABLE_ORDER + 1; | ||
602 | u16 :16 - PAGETABLE_ORDER - 1 - 2; | ||
603 | - s16 partial_pte:2; | ||
604 | + u16 partial_flags:2; | ||
605 | s16 linear_pt_count; | ||
606 | }; | ||
607 | |||
608 | -- | ||
609 | 2.23.0 | ||
610 | |||
diff --git a/main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch b/main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch new file mode 100644 index 0000000000..9c5b9669e9 --- /dev/null +++ b/main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch | |||
@@ -0,0 +1,141 @@ | |||
1 | From 255ad8804c79dc874322a7060ae0615305bcb8e8 Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 04/11] x86/mm: Use flags for _put_page_type rather than a | ||
5 | boolean | ||
6 | |||
7 | This is in mainly in preparation for _put_page_type taking the | ||
8 | partial_flags value in the future. It also makes it easier to read in | ||
9 | the caller (since you see a flag name rather than `true` or `false`). | ||
10 | |||
11 | No functional change intended. | ||
12 | |||
13 | This is part of XSA-299. | ||
14 | |||
15 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
16 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
17 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
18 | --- | ||
19 | xen/arch/x86/mm.c | 25 +++++++++++++------------ | ||
20 | 1 file changed, 13 insertions(+), 12 deletions(-) | ||
21 | |||
22 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
23 | index 84ee48ec3f..e3264f8879 100644 | ||
24 | --- a/xen/arch/x86/mm.c | ||
25 | +++ b/xen/arch/x86/mm.c | ||
26 | @@ -1253,7 +1253,7 @@ get_page_from_l4e( | ||
27 | return rc; | ||
28 | } | ||
29 | |||
30 | -static int _put_page_type(struct page_info *page, bool preemptible, | ||
31 | +static int _put_page_type(struct page_info *page, unsigned int flags, | ||
32 | struct page_info *ptpg); | ||
33 | |||
34 | void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) | ||
35 | @@ -1345,7 +1345,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, | ||
36 | PTF_partial_set ) | ||
37 | { | ||
38 | ASSERT(!(flags & PTF_defer)); | ||
39 | - rc = _put_page_type(pg, true, ptpg); | ||
40 | + rc = _put_page_type(pg, PTF_preemptible, ptpg); | ||
41 | } | ||
42 | else if ( flags & PTF_defer ) | ||
43 | { | ||
44 | @@ -1354,7 +1354,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, | ||
45 | } | ||
46 | else | ||
47 | { | ||
48 | - rc = _put_page_type(pg, true, ptpg); | ||
49 | + rc = _put_page_type(pg, PTF_preemptible, ptpg); | ||
50 | if ( likely(!rc) ) | ||
51 | put_page(pg); | ||
52 | } | ||
53 | @@ -1391,7 +1391,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, | ||
54 | PTF_partial_set ) | ||
55 | { | ||
56 | ASSERT(!(flags & PTF_defer)); | ||
57 | - return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); | ||
58 | + return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); | ||
59 | } | ||
60 | |||
61 | if ( flags & PTF_defer ) | ||
62 | @@ -1401,7 +1401,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | - rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); | ||
67 | + rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); | ||
68 | if ( likely(!rc) ) | ||
69 | put_page(pg); | ||
70 | |||
71 | @@ -1422,7 +1422,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, | ||
72 | PTF_partial_set ) | ||
73 | { | ||
74 | ASSERT(!(flags & PTF_defer)); | ||
75 | - return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); | ||
76 | + return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); | ||
77 | } | ||
78 | |||
79 | if ( flags & PTF_defer ) | ||
80 | @@ -1432,7 +1432,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, | ||
81 | return 0; | ||
82 | } | ||
83 | |||
84 | - rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); | ||
85 | + rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); | ||
86 | if ( likely(!rc) ) | ||
87 | put_page(pg); | ||
88 | } | ||
89 | @@ -2680,11 +2680,12 @@ static int _put_final_page_type(struct page_info *page, unsigned long type, | ||
90 | } | ||
91 | |||
92 | |||
93 | -static int _put_page_type(struct page_info *page, bool preemptible, | ||
94 | +static int _put_page_type(struct page_info *page, unsigned int flags, | ||
95 | struct page_info *ptpg) | ||
96 | { | ||
97 | unsigned long nx, x, y = page->u.inuse.type_info; | ||
98 | int rc = 0; | ||
99 | + bool preemptible = flags & PTF_preemptible; | ||
100 | |||
101 | for ( ; ; ) | ||
102 | { | ||
103 | @@ -2884,7 +2885,7 @@ static int __get_page_type(struct page_info *page, unsigned long type, | ||
104 | |||
105 | if ( unlikely(iommu_ret) ) | ||
106 | { | ||
107 | - _put_page_type(page, false, NULL); | ||
108 | + _put_page_type(page, 0, NULL); | ||
109 | rc = iommu_ret; | ||
110 | goto out; | ||
111 | } | ||
112 | @@ -2911,7 +2912,7 @@ static int __get_page_type(struct page_info *page, unsigned long type, | ||
113 | |||
114 | void put_page_type(struct page_info *page) | ||
115 | { | ||
116 | - int rc = _put_page_type(page, false, NULL); | ||
117 | + int rc = _put_page_type(page, 0, NULL); | ||
118 | ASSERT(rc == 0); | ||
119 | (void)rc; | ||
120 | } | ||
121 | @@ -2927,7 +2928,7 @@ int get_page_type(struct page_info *page, unsigned long type) | ||
122 | |||
123 | int put_page_type_preemptible(struct page_info *page) | ||
124 | { | ||
125 | - return _put_page_type(page, true, NULL); | ||
126 | + return _put_page_type(page, PTF_preemptible, NULL); | ||
127 | } | ||
128 | |||
129 | int get_page_type_preemptible(struct page_info *page, unsigned long type) | ||
130 | @@ -2943,7 +2944,7 @@ int put_old_guest_table(struct vcpu *v) | ||
131 | if ( !v->arch.old_guest_table ) | ||
132 | return 0; | ||
133 | |||
134 | - switch ( rc = _put_page_type(v->arch.old_guest_table, true, | ||
135 | + switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible, | ||
136 | v->arch.old_guest_ptpg) ) | ||
137 | { | ||
138 | case -EINTR: | ||
139 | -- | ||
140 | 2.23.0 | ||
141 | |||
diff --git a/main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch b/main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch new file mode 100644 index 0000000000..8724f4d6ac --- /dev/null +++ b/main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch | |||
@@ -0,0 +1,79 @@ | |||
1 | From 36ce2b6e246d41ebaeb994dbf2b4e0e4555893bf Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 05/11] x86/mm: Rework get_page_and_type_from_mfn conditional | ||
5 | |||
6 | Make it easier to read by declaring the conditions in which we will | ||
7 | retain the ref, rather than the conditions under which we release it. | ||
8 | |||
9 | The only way (page == current->arch.old_guest_table) can be true is if | ||
10 | preemptible is true; so remove this from the query itself, and add an | ||
11 | ASSERT() to that effect on the opposite path. | ||
12 | |||
13 | No functional change intended. | ||
14 | |||
15 | NB that alloc_lN_table() mishandle the "linear pt failure" situation | ||
16 | described in the comment; this will be addressed in a future patch. | ||
17 | |||
18 | This is part of XSA-299. | ||
19 | |||
20 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
21 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
22 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
23 | --- | ||
24 | xen/arch/x86/mm.c | 39 +++++++++++++++++++++++++++++++++++++-- | ||
25 | 1 file changed, 37 insertions(+), 2 deletions(-) | ||
26 | |||
27 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
28 | index e3264f8879..ce7f5b84f3 100644 | ||
29 | --- a/xen/arch/x86/mm.c | ||
30 | +++ b/xen/arch/x86/mm.c | ||
31 | @@ -678,8 +678,43 @@ static int get_page_and_type_from_mfn( | ||
32 | |||
33 | rc = __get_page_type(page, type, preemptible); | ||
34 | |||
35 | - if ( unlikely(rc) && !partial_ref && | ||
36 | - (!preemptible || page != current->arch.old_guest_table) ) | ||
37 | + /* | ||
38 | + * Retain the refcount if: | ||
39 | + * - page is fully validated (rc == 0) | ||
40 | + * - page is not validated (rc < 0) but: | ||
41 | + * - We came in with a reference (partial_ref) | ||
42 | + * - page is partially validated but there's been an error | ||
43 | + * (page == current->arch.old_guest_table) | ||
44 | + * | ||
45 | + * The partial_ref-on-error clause is worth an explanation. There | ||
46 | + * are two scenarios where partial_ref might be true coming in: | ||
47 | + * - mfn has been partially demoted as type `type`; i.e. has | ||
48 | + * PGT_partial set | ||
49 | + * - mfn has been partially demoted as L(type+1) (i.e., a linear | ||
50 | + * page; e.g. we're being called from get_page_from_l2e with | ||
51 | + * type == PGT_l1_table, but the mfn is PGT_l2_table) | ||
52 | + * | ||
53 | + * If there's an error, in the first case, _get_page_type will | ||
54 | + * either return -ERESTART, in which case we want to retain the | ||
55 | + * ref (as the caller will consider it retained), or -EINVAL, in | ||
56 | + * which case old_guest_table will be set; in both cases, we need | ||
57 | + * to retain the ref. | ||
58 | + * | ||
59 | + * In the second case, if there's an error, _get_page_type() can | ||
60 | + * *only* return -EINVAL, and *never* set old_guest_table. In | ||
61 | + * that case we also want to retain the reference, to allow the | ||
62 | + * page to continue to be torn down (i.e., PGT_partial cleared) | ||
63 | + * safely. | ||
64 | + * | ||
65 | + * Also note that we shouldn't be able to leave with the reference | ||
66 | + * count retained unless we succeeded, or the operation was | ||
67 | + * preemptible. | ||
68 | + */ | ||
69 | + if ( likely(!rc) || partial_ref ) | ||
70 | + /* nothing */; | ||
71 | + else if ( page == current->arch.old_guest_table ) | ||
72 | + ASSERT(preemptible); | ||
73 | + else | ||
74 | put_page(page); | ||
75 | |||
76 | return rc; | ||
77 | -- | ||
78 | 2.23.0 | ||
79 | |||
diff --git a/main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch b/main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch new file mode 100644 index 0000000000..379c5002c6 --- /dev/null +++ b/main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch | |||
@@ -0,0 +1,101 @@ | |||
1 | From 180f638fb5047c478ca32b15dd2ba9ba0ce43623 Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 06/11] x86/mm: Have alloc_l[23]_table clear partial_flags when | ||
5 | preempting | ||
6 | |||
7 | In order to allow recursive pagetable promotions and demotions to be | ||
8 | interrupted, Xen must keep track of the state of the sub-pages | ||
9 | promoted or demoted. This is stored in two elements in the page | ||
10 | struct: nr_entries_validated and partial_flags. | ||
11 | |||
12 | The rule is that entries [0, nr_entries_validated) should always be | ||
13 | validated and hold a general reference count. If partial_flags is | ||
14 | zero, then [nr_entries_validated] is not validated and no reference | ||
15 | count is held. If PTF_partial_set is set, then [nr_entries_validated] | ||
16 | is partially validated. | ||
17 | |||
18 | At the moment, a distinction is made between promotion and demotion | ||
19 | with regard to whether the entry itself "holds" a general reference | ||
20 | count: when entry promotion is interrupted (i.e., returns -ERESTART), | ||
21 | the entry is not considered to hold a reference; when entry demotion | ||
22 | is interrupted, the entry is still considered to hold a general | ||
23 | reference. | ||
24 | |||
25 | PTF_partial_general_ref is used to distinguish between these cases. | ||
26 | If clear, it's a partial promotion => no general reference count held | ||
27 | by the entry; if set, it's partial demotion, so a general reference | ||
28 | count held. Because promotions and demotions can be interleaved, this | ||
29 | value is passed to get_page_and_type_from_mfn and put_page_from_l*e, | ||
30 | to be able to properly handle reference counts. | ||
31 | |||
32 | Unfortunately, when alloc_l[23]_table check hypercall_preempt_check() | ||
33 | and return -ERESTART, they set nr_entries_validated, but don't clear | ||
34 | partial_flags. | ||
35 | |||
36 | If we were picking up from a previously-interrupted promotion, that | ||
37 | means that PTF_partial_set would be set even though | ||
38 | [nr_entries_validated] was not partially validated. This means that | ||
39 | if the page in this state were de-validated, put_page_type() would | ||
40 | erroneously be called on that entry. | ||
41 | |||
42 | Perhaps worse, if we were racing with a de-validation, then we might | ||
43 | leave both PTF_partial_set and PTF_partial_general_ref; and when | ||
44 | de-validation picked up again, both the type and the general ref would | ||
45 | be erroneously dropped from [nr_entries_validated]. | ||
46 | |||
47 | In a sense, the real issue here is code duplication. Rather than | ||
48 | duplicate the interruption code, set rc to -EINTR and fall through to | ||
49 | the code which already handles that case correctly. | ||
50 | |||
51 | Given the logic at this point, it should be impossible for | ||
52 | partial_flags to be non-zero; add an ASSERT() to catch any changes. | ||
53 | |||
54 | This is part of XSA-299. | ||
55 | |||
56 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
57 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
58 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
59 | --- | ||
60 | xen/arch/x86/mm.c | 18 ++++-------------- | ||
61 | 1 file changed, 4 insertions(+), 14 deletions(-) | ||
62 | |||
63 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
64 | index ce7f5b84f3..9b9b67cd74 100644 | ||
65 | --- a/xen/arch/x86/mm.c | ||
66 | +++ b/xen/arch/x86/mm.c | ||
67 | @@ -1576,13 +1576,8 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
68 | i++, partial_flags = 0 ) | ||
69 | { | ||
70 | if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) | ||
71 | - { | ||
72 | - page->nr_validated_ptes = i; | ||
73 | - rc = -ERESTART; | ||
74 | - break; | ||
75 | - } | ||
76 | - | ||
77 | - if ( !is_guest_l2_slot(d, type, i) || | ||
78 | + rc = -EINTR; | ||
79 | + else if ( !is_guest_l2_slot(d, type, i) || | ||
80 | (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 ) | ||
81 | continue; | ||
82 | |||
83 | @@ -1647,13 +1642,8 @@ static int alloc_l3_table(struct page_info *page) | ||
84 | i++, partial_flags = 0 ) | ||
85 | { | ||
86 | if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) | ||
87 | - { | ||
88 | - page->nr_validated_ptes = i; | ||
89 | - rc = -ERESTART; | ||
90 | - break; | ||
91 | - } | ||
92 | - | ||
93 | - if ( is_pv_32bit_domain(d) && (i == 3) ) | ||
94 | + rc = -EINTR; | ||
95 | + else if ( is_pv_32bit_domain(d) && (i == 3) ) | ||
96 | { | ||
97 | if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) || | ||
98 | (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ) | ||
99 | -- | ||
100 | 2.23.0 | ||
101 | |||
diff --git a/main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch b/main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch new file mode 100644 index 0000000000..253c0fbb7c --- /dev/null +++ b/main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch | |||
@@ -0,0 +1,374 @@ | |||
1 | From 29f56f0e7c11a299da497c866b4c76ebbc862045 Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 07/11] x86/mm: Always retain a general ref on partial | ||
5 | |||
6 | In order to allow recursive pagetable promotions and demotions to be | ||
7 | interrupted, Xen must keep track of the state of the sub-pages | ||
8 | promoted or demoted. This is stored in two elements in the page struct: | ||
9 | nr_entries_validated and partial_flags. | ||
10 | |||
11 | The rule is that entries [0, nr_entries_validated) should always be | ||
12 | validated and hold a general reference count. If partial_flags is | ||
13 | zero, then [nr_entries_validated] is not validated and no reference | ||
14 | count is held. If PTF_partial_set is set, then [nr_entries_validated] | ||
15 | is partially validated. | ||
16 | |||
17 | At the moment, a distinction is made between promotion and demotion | ||
18 | with regard to whether the entry itself "holds" a general reference | ||
19 | count: when entry promotion is interrupted (i.e., returns -ERESTART), | ||
20 | the entry is not considered to hold a reference; when entry demotion | ||
21 | is interrupted, the entry is still considered to hold a general | ||
22 | reference. | ||
23 | |||
24 | PTF_partial_general_ref is used to distinguish between these cases. | ||
25 | If clear, it's a partial promotion => no general reference count held | ||
26 | by the entry; if set, it's partial demotion, so a general reference | ||
27 | count held. Because promotions and demotions can be interleaved, this | ||
28 | value is passed to get_page_and_type_from_mfn and put_page_from_l*e, | ||
29 | to be able to properly handle reference counts. | ||
30 | |||
31 | Unfortunately, because a refcount is not held, it is possible to | ||
32 | engineer a situation where PFT_partial_set is set but the page in | ||
33 | question has been assigned to another domain. A sketch is provided in | ||
34 | the appendix. | ||
35 | |||
36 | Fix this by having the parent page table entry hold a general | ||
37 | reference count whenever PFT_partial_set is set. (For clarity of | ||
38 | change, keep two separate flags. These will be collapsed in a | ||
39 | subsequent changeset.) | ||
40 | |||
41 | This has two basic implications. On the put_page_from_lNe() side, | ||
42 | this mean that the (partial_set && !partial_ref) case can never happen, | ||
43 | and no longer needs to be special-cased. | ||
44 | |||
45 | Secondly, because both flags are set together, there's no need to carry over | ||
46 | existing bits from partial_pte. | ||
47 | |||
48 | (NB there is still another issue with calling _put_page_type() on a | ||
49 | page which had PGT_partial set; that will be handled in a subsequent | ||
50 | patch.) | ||
51 | |||
52 | On the get_page_and_type_from_mfn() side, we need to distinguish | ||
53 | between callers which hold a reference on partial (i.e., | ||
54 | alloc_lN_table()), and those which do not (new_cr3, PIN_LN_TABLE, and | ||
55 | so on): pass a flag if the type should be retained on interruption. | ||
56 | |||
57 | NB that since l1 promotion can't be preempted, that get_page_from_l2e | ||
58 | can't return -ERESTART. | ||
59 | |||
60 | This is part of XSA-299. | ||
61 | |||
62 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
63 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
64 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
65 | ----- | ||
66 | * Appendix: Engineering PTF_partial_set while a page belongs to a | ||
67 | foreign domain | ||
68 | |||
69 | Suppose A is a page which can be promoted to an l3, and B is a page | ||
70 | which can be promoted to an l2, and A[x] points to B. B has | ||
71 | PGC_allocated set but no other general references. | ||
72 | |||
73 | V1: PIN_L3 A. | ||
74 | A is validated, B is validated. | ||
75 | A.type_count = 1 | PGT_validated | PGT_pinned | ||
76 | B.type_count = 1 | PGT_validated | ||
77 | B.count = 2 | PGC_allocated (A[x] holds a general ref) | ||
78 | |||
79 | V1: UNPIN A. | ||
80 | A begins de-validation. | ||
81 | Arrange to be interrupted when i < x | ||
82 | V1->old_guest_table = A | ||
83 | V1->old_guest_table_ref_held = false | ||
84 | A.type_count = 1 | PGT_partial | ||
85 | A.nr_validated_entries = i < x | ||
86 | B.type_count = 0 | ||
87 | B.count = 1 | PGC_allocated | ||
88 | |||
89 | V2: MOD_L4_ENTRY to point some l4e to A. | ||
90 | Picks up re-validation of A. | ||
91 | Arrange to be interrupted halfway through B's validation | ||
92 | B.type_count = 1 | PGT_partial | ||
93 | B.count = 2 | PGC_allocated (PGT_partial holds a general ref) | ||
94 | A.type_count = 1 | PGT_partial | ||
95 | A.nr_validated_entries = x | ||
96 | A.partial_pte = PTF_partial_set | ||
97 | |||
98 | V3: MOD_L3_ENTRY to point some other l3e (not in A) to B. | ||
99 | Validates B. | ||
100 | B.type_count = 1 | PGT_validated | ||
101 | B.count = 2 | PGC_allocated ("other l3e" holds a general ref) | ||
102 | |||
103 | V3: MOD_L3_ENTRY to clear l3e pointing to B. | ||
104 | Devalidates B. | ||
105 | B.type_count = 0 | ||
106 | B.count = 1 | PGC_allocated | ||
107 | |||
108 | V3: decrease_reservation(B) | ||
109 | Clears PGC_allocated | ||
110 | B.count = 0 => B is freed | ||
111 | |||
112 | B gets assigned to a different domain | ||
113 | |||
114 | V1: Restarts UNPIN of A | ||
115 | put_old_guest_table(A) | ||
116 | ... | ||
117 | free_l3_table(A) | ||
118 | |||
119 | Now since A.partial_flags has PTF_partial_set, free_l3_table() will | ||
120 | call put_page_from_l3e() on A[x], which points to B, while B is owned | ||
121 | by another domain. | ||
122 | |||
123 | If A[x] held a general refcount for B on partial validation, as it does | ||
124 | for partial de-validation, then B would still have a reference count of | ||
125 | 1 after PGC_allocated was freed; so B wouldn't be freed until after | ||
126 | put_page_from_l3e() had happend on A[x]. | ||
127 | --- | ||
128 | xen/arch/x86/mm.c | 84 +++++++++++++++++++++++----------------- | ||
129 | xen/include/asm-x86/mm.h | 15 ++++--- | ||
130 | 2 files changed, 58 insertions(+), 41 deletions(-) | ||
131 | |||
132 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
133 | index 9b9b67cd74..2f185a3cd3 100644 | ||
134 | --- a/xen/arch/x86/mm.c | ||
135 | +++ b/xen/arch/x86/mm.c | ||
136 | @@ -658,10 +658,11 @@ static int __get_page_type(struct page_info *page, unsigned long type, | ||
137 | * page->pte[page->nr_validated_entries]. See the comment in mm.h for | ||
138 | * more information. | ||
139 | */ | ||
140 | -#define PTF_partial_set (1 << 0) | ||
141 | -#define PTF_partial_general_ref (1 << 1) | ||
142 | -#define PTF_preemptible (1 << 2) | ||
143 | -#define PTF_defer (1 << 3) | ||
144 | +#define PTF_partial_set (1 << 0) | ||
145 | +#define PTF_partial_general_ref (1 << 1) | ||
146 | +#define PTF_preemptible (1 << 2) | ||
147 | +#define PTF_defer (1 << 3) | ||
148 | +#define PTF_retain_ref_on_restart (1 << 4) | ||
149 | |||
150 | static int get_page_and_type_from_mfn( | ||
151 | mfn_t mfn, unsigned long type, struct domain *d, | ||
152 | @@ -670,7 +671,11 @@ static int get_page_and_type_from_mfn( | ||
153 | struct page_info *page = mfn_to_page(mfn); | ||
154 | int rc; | ||
155 | bool preemptible = flags & PTF_preemptible, | ||
156 | - partial_ref = flags & PTF_partial_general_ref; | ||
157 | + partial_ref = flags & PTF_partial_general_ref, | ||
158 | + partial_set = flags & PTF_partial_set, | ||
159 | + retain_ref = flags & PTF_retain_ref_on_restart; | ||
160 | + | ||
161 | + ASSERT(partial_ref == partial_set); | ||
162 | |||
163 | if ( likely(!partial_ref) && | ||
164 | unlikely(!get_page_from_mfn(mfn, d)) ) | ||
165 | @@ -683,13 +688,15 @@ static int get_page_and_type_from_mfn( | ||
166 | * - page is fully validated (rc == 0) | ||
167 | * - page is not validated (rc < 0) but: | ||
168 | * - We came in with a reference (partial_ref) | ||
169 | + * - page is partially validated (rc == -ERESTART), and the | ||
170 | + * caller has asked the ref to be retained in that case | ||
171 | * - page is partially validated but there's been an error | ||
172 | * (page == current->arch.old_guest_table) | ||
173 | * | ||
174 | * The partial_ref-on-error clause is worth an explanation. There | ||
175 | * are two scenarios where partial_ref might be true coming in: | ||
176 | - * - mfn has been partially demoted as type `type`; i.e. has | ||
177 | - * PGT_partial set | ||
178 | + * - mfn has been partially promoted / demoted as type `type`; | ||
179 | + * i.e. has PGT_partial set | ||
180 | * - mfn has been partially demoted as L(type+1) (i.e., a linear | ||
181 | * page; e.g. we're being called from get_page_from_l2e with | ||
182 | * type == PGT_l1_table, but the mfn is PGT_l2_table) | ||
183 | @@ -712,7 +719,8 @@ static int get_page_and_type_from_mfn( | ||
184 | */ | ||
185 | if ( likely(!rc) || partial_ref ) | ||
186 | /* nothing */; | ||
187 | - else if ( page == current->arch.old_guest_table ) | ||
188 | + else if ( page == current->arch.old_guest_table || | ||
189 | + (retain_ref && rc == -ERESTART) ) | ||
190 | ASSERT(preemptible); | ||
191 | else | ||
192 | put_page(page); | ||
193 | @@ -1379,8 +1387,8 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, | ||
194 | if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == | ||
195 | PTF_partial_set ) | ||
196 | { | ||
197 | - ASSERT(!(flags & PTF_defer)); | ||
198 | - rc = _put_page_type(pg, PTF_preemptible, ptpg); | ||
199 | + /* partial_set should always imply partial_ref */ | ||
200 | + BUG(); | ||
201 | } | ||
202 | else if ( flags & PTF_defer ) | ||
203 | { | ||
204 | @@ -1425,8 +1433,8 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, | ||
205 | if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == | ||
206 | PTF_partial_set ) | ||
207 | { | ||
208 | - ASSERT(!(flags & PTF_defer)); | ||
209 | - return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); | ||
210 | + /* partial_set should always imply partial_ref */ | ||
211 | + BUG(); | ||
212 | } | ||
213 | |||
214 | if ( flags & PTF_defer ) | ||
215 | @@ -1456,8 +1464,8 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, | ||
216 | if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == | ||
217 | PTF_partial_set ) | ||
218 | { | ||
219 | - ASSERT(!(flags & PTF_defer)); | ||
220 | - return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); | ||
221 | + /* partial_set should always imply partial_ref */ | ||
222 | + BUG(); | ||
223 | } | ||
224 | |||
225 | if ( flags & PTF_defer ) | ||
226 | @@ -1581,13 +1589,22 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
227 | (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 ) | ||
228 | continue; | ||
229 | |||
230 | - if ( rc == -ERESTART ) | ||
231 | - { | ||
232 | - page->nr_validated_ptes = i; | ||
233 | - /* Set 'set', retain 'general ref' */ | ||
234 | - page->partial_flags = partial_flags | PTF_partial_set; | ||
235 | - } | ||
236 | - else if ( rc == -EINTR && i ) | ||
237 | + /* | ||
238 | + * It shouldn't be possible for get_page_from_l2e to return | ||
239 | + * -ERESTART, since we never call this with PTF_preemptible. | ||
240 | + * (alloc_l1_table may return -EINTR on an L1TF-vulnerable | ||
241 | + * entry.) | ||
242 | + * | ||
243 | + * NB that while on a "clean" promotion, we can never get | ||
244 | + * PGT_partial. It is possible to arrange for an l2e to | ||
245 | + * contain a partially-devalidated l2; but in that case, both | ||
246 | + * of the following functions will fail anyway (the first | ||
247 | + * because the page in question is not an l1; the second | ||
248 | + * because the page is not fully validated). | ||
249 | + */ | ||
250 | + ASSERT(rc != -ERESTART); | ||
251 | + | ||
252 | + if ( rc == -EINTR && i ) | ||
253 | { | ||
254 | page->nr_validated_ptes = i; | ||
255 | page->partial_flags = 0; | ||
256 | @@ -1596,6 +1613,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
257 | else if ( rc < 0 && rc != -EINTR ) | ||
258 | { | ||
259 | gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i); | ||
260 | + ASSERT(current->arch.old_guest_table == NULL); | ||
261 | if ( i ) | ||
262 | { | ||
263 | page->nr_validated_ptes = i; | ||
264 | @@ -1652,16 +1670,17 @@ static int alloc_l3_table(struct page_info *page) | ||
265 | rc = get_page_and_type_from_mfn( | ||
266 | l3e_get_mfn(pl3e[i]), | ||
267 | PGT_l2_page_table | PGT_pae_xen_l2, d, | ||
268 | - partial_flags | PTF_preemptible); | ||
269 | + partial_flags | PTF_preemptible | PTF_retain_ref_on_restart); | ||
270 | } | ||
271 | - else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 ) | ||
272 | + else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, | ||
273 | + partial_flags | PTF_retain_ref_on_restart)) > 0 ) | ||
274 | continue; | ||
275 | |||
276 | if ( rc == -ERESTART ) | ||
277 | { | ||
278 | page->nr_validated_ptes = i; | ||
279 | /* Set 'set', leave 'general ref' set if this entry was set */ | ||
280 | - page->partial_flags = partial_flags | PTF_partial_set; | ||
281 | + page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
282 | } | ||
283 | else if ( rc == -EINTR && i ) | ||
284 | { | ||
285 | @@ -1822,14 +1841,15 @@ static int alloc_l4_table(struct page_info *page) | ||
286 | i++, partial_flags = 0 ) | ||
287 | { | ||
288 | if ( !is_guest_l4_slot(d, i) || | ||
289 | - (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 ) | ||
290 | + (rc = get_page_from_l4e(pl4e[i], pfn, d, | ||
291 | + partial_flags | PTF_retain_ref_on_restart)) > 0 ) | ||
292 | continue; | ||
293 | |||
294 | if ( rc == -ERESTART ) | ||
295 | { | ||
296 | page->nr_validated_ptes = i; | ||
297 | /* Set 'set', leave 'general ref' set if this entry was set */ | ||
298 | - page->partial_flags = partial_flags | PTF_partial_set; | ||
299 | + page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
300 | } | ||
301 | else if ( rc < 0 ) | ||
302 | { | ||
303 | @@ -1927,9 +1947,7 @@ static int free_l2_table(struct page_info *page) | ||
304 | else if ( rc == -ERESTART ) | ||
305 | { | ||
306 | page->nr_validated_ptes = i; | ||
307 | - page->partial_flags = (partial_flags & PTF_partial_set) ? | ||
308 | - partial_flags : | ||
309 | - (PTF_partial_set | PTF_partial_general_ref); | ||
310 | + page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
311 | } | ||
312 | else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) | ||
313 | { | ||
314 | @@ -1977,9 +1995,7 @@ static int free_l3_table(struct page_info *page) | ||
315 | if ( rc == -ERESTART ) | ||
316 | { | ||
317 | page->nr_validated_ptes = i; | ||
318 | - page->partial_flags = (partial_flags & PTF_partial_set) ? | ||
319 | - partial_flags : | ||
320 | - (PTF_partial_set | PTF_partial_general_ref); | ||
321 | + page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
322 | } | ||
323 | else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) | ||
324 | { | ||
325 | @@ -2010,9 +2026,7 @@ static int free_l4_table(struct page_info *page) | ||
326 | if ( rc == -ERESTART ) | ||
327 | { | ||
328 | page->nr_validated_ptes = i; | ||
329 | - page->partial_flags = (partial_flags & PTF_partial_set) ? | ||
330 | - partial_flags : | ||
331 | - (PTF_partial_set | PTF_partial_general_ref); | ||
332 | + page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
333 | } | ||
334 | else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) | ||
335 | { | ||
336 | diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h | ||
337 | index a531fe3115..74b0246c02 100644 | ||
338 | --- a/xen/include/asm-x86/mm.h | ||
339 | +++ b/xen/include/asm-x86/mm.h | ||
340 | @@ -167,22 +167,25 @@ struct page_info | ||
341 | * page. | ||
342 | * | ||
343 | * This happens: | ||
344 | - * - During de-validation, if de-validation of the page was | ||
345 | + * - During validation or de-validation, if the operation was | ||
346 | * interrupted | ||
347 | * - During validation, if an invalid entry is encountered and | ||
348 | * validation is preemptible | ||
349 | * - During validation, if PTF_partial_general_ref was set on | ||
350 | - * this entry to begin with (perhaps because we're picking | ||
351 | - * up from a partial de-validation). | ||
352 | + * this entry to begin with (perhaps because it picked up a | ||
353 | + * previous operation) | ||
354 | * | ||
355 | - * When resuming validation, if PTF_partial_general_ref is clear, | ||
356 | - * then a general reference must be re-acquired; if it is set, no | ||
357 | - * reference should be acquired. | ||
358 | + * When resuming validation, if PTF_partial_general_ref is | ||
359 | + * clear, then a general reference must be re-acquired; if it | ||
360 | + * is set, no reference should be acquired. | ||
361 | * | ||
362 | * When resuming de-validation, if PTF_partial_general_ref is | ||
363 | * clear, no reference should be dropped; if it is set, a | ||
364 | * reference should be dropped. | ||
365 | * | ||
366 | + * NB at the moment, PTF_partial_set should be set if and only if | ||
367 | + * PTF_partial_general_ref is set. | ||
368 | + * | ||
369 | * NB that PTF_partial_set and PTF_partial_general_ref are | ||
370 | * defined in mm.c, the only place where they are used. | ||
371 | * | ||
372 | -- | ||
373 | 2.23.0 | ||
374 | |||
diff --git a/main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch b/main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch new file mode 100644 index 0000000000..d7602d644b --- /dev/null +++ b/main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch | |||
@@ -0,0 +1,227 @@ | |||
1 | From 140c8876835a134daf507d6c60bdcdf9126f166f Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 08/11] x86/mm: Collapse PTF_partial_set and | ||
5 | PTF_partial_general_ref into one | ||
6 | |||
7 | ...now that they are equivalent. No functional change intended. | ||
8 | |||
9 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
10 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
11 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
12 | --- | ||
13 | xen/arch/x86/mm.c | 50 +++++++++++----------------------------- | ||
14 | xen/include/asm-x86/mm.h | 29 +++++++++++------------ | ||
15 | 2 files changed, 26 insertions(+), 53 deletions(-) | ||
16 | |||
17 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
18 | index 2f185a3cd3..693791331a 100644 | ||
19 | --- a/xen/arch/x86/mm.c | ||
20 | +++ b/xen/arch/x86/mm.c | ||
21 | @@ -653,13 +653,12 @@ static int __get_page_type(struct page_info *page, unsigned long type, | ||
22 | |||
23 | /* | ||
24 | * The following flags are used to specify behavior of various get and | ||
25 | - * put commands. The first two are also stored in page->partial_flags | ||
26 | - * to indicate the state of the page pointed to by | ||
27 | + * put commands. The first is also stored in page->partial_flags to | ||
28 | + * indicate the state of the page pointed to by | ||
29 | * page->pte[page->nr_validated_entries]. See the comment in mm.h for | ||
30 | * more information. | ||
31 | */ | ||
32 | #define PTF_partial_set (1 << 0) | ||
33 | -#define PTF_partial_general_ref (1 << 1) | ||
34 | #define PTF_preemptible (1 << 2) | ||
35 | #define PTF_defer (1 << 3) | ||
36 | #define PTF_retain_ref_on_restart (1 << 4) | ||
37 | @@ -671,13 +670,10 @@ static int get_page_and_type_from_mfn( | ||
38 | struct page_info *page = mfn_to_page(mfn); | ||
39 | int rc; | ||
40 | bool preemptible = flags & PTF_preemptible, | ||
41 | - partial_ref = flags & PTF_partial_general_ref, | ||
42 | partial_set = flags & PTF_partial_set, | ||
43 | retain_ref = flags & PTF_retain_ref_on_restart; | ||
44 | |||
45 | - ASSERT(partial_ref == partial_set); | ||
46 | - | ||
47 | - if ( likely(!partial_ref) && | ||
48 | + if ( likely(!partial_set) && | ||
49 | unlikely(!get_page_from_mfn(mfn, d)) ) | ||
50 | return -EINVAL; | ||
51 | |||
52 | @@ -687,14 +683,14 @@ static int get_page_and_type_from_mfn( | ||
53 | * Retain the refcount if: | ||
54 | * - page is fully validated (rc == 0) | ||
55 | * - page is not validated (rc < 0) but: | ||
56 | - * - We came in with a reference (partial_ref) | ||
57 | + * - We came in with a reference (partial_set) | ||
58 | * - page is partially validated (rc == -ERESTART), and the | ||
59 | * caller has asked the ref to be retained in that case | ||
60 | * - page is partially validated but there's been an error | ||
61 | * (page == current->arch.old_guest_table) | ||
62 | * | ||
63 | - * The partial_ref-on-error clause is worth an explanation. There | ||
64 | - * are two scenarios where partial_ref might be true coming in: | ||
65 | + * The partial_set-on-error clause is worth an explanation. There | ||
66 | + * are two scenarios where partial_set might be true coming in: | ||
67 | * - mfn has been partially promoted / demoted as type `type`; | ||
68 | * i.e. has PGT_partial set | ||
69 | * - mfn has been partially demoted as L(type+1) (i.e., a linear | ||
70 | @@ -717,7 +713,7 @@ static int get_page_and_type_from_mfn( | ||
71 | * count retained unless we succeeded, or the operation was | ||
72 | * preemptible. | ||
73 | */ | ||
74 | - if ( likely(!rc) || partial_ref ) | ||
75 | + if ( likely(!rc) || partial_set ) | ||
76 | /* nothing */; | ||
77 | else if ( page == current->arch.old_guest_table || | ||
78 | (retain_ref && rc == -ERESTART) ) | ||
79 | @@ -1384,13 +1380,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, | ||
80 | struct page_info *pg = l2e_get_page(l2e); | ||
81 | struct page_info *ptpg = mfn_to_page(_mfn(pfn)); | ||
82 | |||
83 | - if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == | ||
84 | - PTF_partial_set ) | ||
85 | - { | ||
86 | - /* partial_set should always imply partial_ref */ | ||
87 | - BUG(); | ||
88 | - } | ||
89 | - else if ( flags & PTF_defer ) | ||
90 | + if ( flags & PTF_defer ) | ||
91 | { | ||
92 | current->arch.old_guest_ptpg = ptpg; | ||
93 | current->arch.old_guest_table = pg; | ||
94 | @@ -1430,13 +1420,6 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, | ||
95 | |||
96 | pg = l3e_get_page(l3e); | ||
97 | |||
98 | - if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == | ||
99 | - PTF_partial_set ) | ||
100 | - { | ||
101 | - /* partial_set should always imply partial_ref */ | ||
102 | - BUG(); | ||
103 | - } | ||
104 | - | ||
105 | if ( flags & PTF_defer ) | ||
106 | { | ||
107 | current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); | ||
108 | @@ -1461,13 +1444,6 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, | ||
109 | { | ||
110 | struct page_info *pg = l4e_get_page(l4e); | ||
111 | |||
112 | - if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == | ||
113 | - PTF_partial_set ) | ||
114 | - { | ||
115 | - /* partial_set should always imply partial_ref */ | ||
116 | - BUG(); | ||
117 | - } | ||
118 | - | ||
119 | if ( flags & PTF_defer ) | ||
120 | { | ||
121 | current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); | ||
122 | @@ -1680,7 +1656,7 @@ static int alloc_l3_table(struct page_info *page) | ||
123 | { | ||
124 | page->nr_validated_ptes = i; | ||
125 | /* Set 'set', leave 'general ref' set if this entry was set */ | ||
126 | - page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
127 | + page->partial_flags = PTF_partial_set; | ||
128 | } | ||
129 | else if ( rc == -EINTR && i ) | ||
130 | { | ||
131 | @@ -1849,7 +1825,7 @@ static int alloc_l4_table(struct page_info *page) | ||
132 | { | ||
133 | page->nr_validated_ptes = i; | ||
134 | /* Set 'set', leave 'general ref' set if this entry was set */ | ||
135 | - page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
136 | + page->partial_flags = PTF_partial_set; | ||
137 | } | ||
138 | else if ( rc < 0 ) | ||
139 | { | ||
140 | @@ -1947,7 +1923,7 @@ static int free_l2_table(struct page_info *page) | ||
141 | else if ( rc == -ERESTART ) | ||
142 | { | ||
143 | page->nr_validated_ptes = i; | ||
144 | - page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
145 | + page->partial_flags = PTF_partial_set; | ||
146 | } | ||
147 | else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) | ||
148 | { | ||
149 | @@ -1995,7 +1971,7 @@ static int free_l3_table(struct page_info *page) | ||
150 | if ( rc == -ERESTART ) | ||
151 | { | ||
152 | page->nr_validated_ptes = i; | ||
153 | - page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
154 | + page->partial_flags = PTF_partial_set; | ||
155 | } | ||
156 | else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) | ||
157 | { | ||
158 | @@ -2026,7 +2002,7 @@ static int free_l4_table(struct page_info *page) | ||
159 | if ( rc == -ERESTART ) | ||
160 | { | ||
161 | page->nr_validated_ptes = i; | ||
162 | - page->partial_flags = PTF_partial_set | PTF_partial_general_ref; | ||
163 | + page->partial_flags = PTF_partial_set; | ||
164 | } | ||
165 | else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) | ||
166 | { | ||
167 | diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h | ||
168 | index 74b0246c02..704345335c 100644 | ||
169 | --- a/xen/include/asm-x86/mm.h | ||
170 | +++ b/xen/include/asm-x86/mm.h | ||
171 | @@ -162,7 +162,7 @@ struct page_info | ||
172 | * operation on the current page. (That page may or may not | ||
173 | * still have PGT_partial set.) | ||
174 | * | ||
175 | - * If PTF_partial_general_ref is set, then the PTE at | ||
176 | + * Additionally, if PTF_partial_set is set, then the PTE at | ||
177 | * @nr_validated_ptef holds a general reference count for the | ||
178 | * page. | ||
179 | * | ||
180 | @@ -171,23 +171,20 @@ struct page_info | ||
181 | * interrupted | ||
182 | * - During validation, if an invalid entry is encountered and | ||
183 | * validation is preemptible | ||
184 | - * - During validation, if PTF_partial_general_ref was set on | ||
185 | - * this entry to begin with (perhaps because it picked up a | ||
186 | + * - During validation, if PTF_partial_set was set on this | ||
187 | + * entry to begin with (perhaps because it picked up a | ||
188 | * previous operation) | ||
189 | * | ||
190 | - * When resuming validation, if PTF_partial_general_ref is | ||
191 | - * clear, then a general reference must be re-acquired; if it | ||
192 | - * is set, no reference should be acquired. | ||
193 | + * When resuming validation, if PTF_partial_set is clear, then | ||
194 | + * a general reference must be re-acquired; if it is set, no | ||
195 | + * reference should be acquired. | ||
196 | * | ||
197 | - * When resuming de-validation, if PTF_partial_general_ref is | ||
198 | - * clear, no reference should be dropped; if it is set, a | ||
199 | - * reference should be dropped. | ||
200 | + * When resuming de-validation, if PTF_partial_set is clear, | ||
201 | + * no reference should be dropped; if it is set, a reference | ||
202 | + * should be dropped. | ||
203 | * | ||
204 | - * NB at the moment, PTF_partial_set should be set if and only if | ||
205 | - * PTF_partial_general_ref is set. | ||
206 | - * | ||
207 | - * NB that PTF_partial_set and PTF_partial_general_ref are | ||
208 | - * defined in mm.c, the only place where they are used. | ||
209 | + * NB that PTF_partial_set is defined in mm.c, the only place | ||
210 | + * where it is used. | ||
211 | * | ||
212 | * The 3rd field, @linear_pt_count, indicates | ||
213 | * - by a positive value, how many same-level page table entries a page | ||
214 | @@ -197,8 +194,8 @@ struct page_info | ||
215 | */ | ||
216 | struct { | ||
217 | u16 nr_validated_ptes:PAGETABLE_ORDER + 1; | ||
218 | - u16 :16 - PAGETABLE_ORDER - 1 - 2; | ||
219 | - u16 partial_flags:2; | ||
220 | + u16 :16 - PAGETABLE_ORDER - 1 - 1; | ||
221 | + u16 partial_flags:1; | ||
222 | s16 linear_pt_count; | ||
223 | }; | ||
224 | |||
225 | -- | ||
226 | 2.23.0 | ||
227 | |||
diff --git a/main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch b/main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch new file mode 100644 index 0000000000..a3519c2103 --- /dev/null +++ b/main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch | |||
@@ -0,0 +1,106 @@ | |||
1 | From 203bc967574c7c5a06ed6bb452a9761f46dce724 Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 09/11] x86/mm: Properly handle linear pagetable promotion | ||
5 | failures | ||
6 | |||
7 | In order to allow recursive pagetable promotions and demotions to be | ||
8 | interrupted, Xen must keep track of the state of the sub-pages | ||
9 | promoted or demoted. This is stored in two elements in the page | ||
10 | struct: nr_entries_validated and partial_flags. | ||
11 | |||
12 | The rule is that entries [0, nr_entries_validated) should always be | ||
13 | validated and hold a general reference count. If partial_flags is | ||
14 | zero, then [nr_entries_validated] is not validated and no reference | ||
15 | count is held. If PTF_partial_set is set, then [nr_entries_validated] | ||
16 | is partially validated, and a general reference count is held. | ||
17 | |||
18 | Unfortunately, in cases where an entry began with PTF_partial_set set, | ||
19 | and get_page_from_lNe() returns -EINVAL, the PTF_partial_set bit is | ||
20 | erroneously dropped. (This scenario can be engineered mainly by the | ||
21 | use of interleaving of promoting and demoting a page which has "linear | ||
22 | pagetable" entries; see the appendix for a sketch.) This means that | ||
23 | we will "leak" a general reference count on the page in question, | ||
24 | preventing the page from being freed. | ||
25 | |||
26 | Fix this by setting page->partial_flags to the partial_flags local | ||
27 | variable. | ||
28 | |||
29 | This is part of XSA-299. | ||
30 | |||
31 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
32 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
33 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
34 | ----- | ||
35 | Appendix | ||
36 | |||
37 | Suppose A and B can both be promoted to L2 pages, and A[x] points to B. | ||
38 | |||
39 | V1: PIN_L2 B. | ||
40 | B.type_count = 1 | PGT_validated | ||
41 | B.count = 2 | PGC_allocated | ||
42 | |||
43 | V1: MOD_L3_ENTRY pointing something to A. | ||
44 | In the process of validating A[x], grab an extra type / ref on B: | ||
45 | B.type_count = 2 | PGT_validated | ||
46 | B.count = 3 | PGC_allocated | ||
47 | A.type_count = 1 | PGT_validated | ||
48 | A.count = 2 | PGC_allocated | ||
49 | |||
50 | V1: UNPIN B. | ||
51 | B.type_count = 1 | PGT_validate | ||
52 | B.count = 2 | PGC_allocated | ||
53 | |||
54 | V1: MOD_L3_ENTRY removing the reference to A. | ||
55 | De-validate A, down to A[x], which points to B. | ||
56 | Drop the final type on B. Arrange to be interrupted. | ||
57 | B.type_count = 1 | PGT_partial | ||
58 | B.count = 2 | PGC_allocated | ||
59 | A.type_count = 1 | PGT_partial | ||
60 | A.nr_validated_entries = x | ||
61 | A.partial_pte = -1 | ||
62 | |||
63 | V2: MOD_L3_ENTRY adds a reference to A. | ||
64 | |||
65 | At this point, get_page_from_l2e(A[x]) tries | ||
66 | get_page_and_type_from_mfn(), which fails because it's the wrong type; | ||
67 | and get_l2_linear_pagetable() also fails, because B isn't validated as | ||
68 | an l2 anymore. | ||
69 | --- | ||
70 | xen/arch/x86/mm.c | 6 +++--- | ||
71 | 1 file changed, 3 insertions(+), 3 deletions(-) | ||
72 | |||
73 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
74 | index 693791331a..300f147e98 100644 | ||
75 | --- a/xen/arch/x86/mm.c | ||
76 | +++ b/xen/arch/x86/mm.c | ||
77 | @@ -1593,7 +1593,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
78 | if ( i ) | ||
79 | { | ||
80 | page->nr_validated_ptes = i; | ||
81 | - page->partial_flags = 0; | ||
82 | + page->partial_flags = partial_flags; | ||
83 | current->arch.old_guest_ptpg = NULL; | ||
84 | current->arch.old_guest_table = page; | ||
85 | } | ||
86 | @@ -1678,7 +1678,7 @@ static int alloc_l3_table(struct page_info *page) | ||
87 | if ( i ) | ||
88 | { | ||
89 | page->nr_validated_ptes = i; | ||
90 | - page->partial_flags = 0; | ||
91 | + page->partial_flags = partial_flags; | ||
92 | current->arch.old_guest_ptpg = NULL; | ||
93 | current->arch.old_guest_table = page; | ||
94 | } | ||
95 | @@ -1835,7 +1835,7 @@ static int alloc_l4_table(struct page_info *page) | ||
96 | if ( i ) | ||
97 | { | ||
98 | page->nr_validated_ptes = i; | ||
99 | - page->partial_flags = 0; | ||
100 | + page->partial_flags = partial_flags; | ||
101 | if ( rc == -EINTR ) | ||
102 | rc = -ERESTART; | ||
103 | else | ||
104 | -- | ||
105 | 2.23.0 | ||
106 | |||
diff --git a/main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch b/main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch new file mode 100644 index 0000000000..f8e7915bb9 --- /dev/null +++ b/main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch | |||
@@ -0,0 +1,169 @@ | |||
1 | From 45242b9057b4feccb837362f39e0eb97dc0093c8 Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:49 +0100 | ||
4 | Subject: [PATCH 10/11] x86/mm: Fix nested de-validation on error | ||
5 | |||
6 | If an invalid entry is discovered when validating a page-table tree, | ||
7 | the entire tree which has so far been validated must be de-validated. | ||
8 | Since this may take a long time, alloc_l[2-4]_table() set current | ||
9 | vcpu's old_guest_table immediately; put_old_guest_table() will make | ||
10 | sure that put_page_type() will be called to finish off the | ||
11 | de-validation before any other MMU operations can happen on the vcpu. | ||
12 | |||
13 | The invariant for partial pages should be: | ||
14 | |||
15 | * Entries [0, nr_validated_ptes) should be completely validated; | ||
16 | put_page_type() will de-validate these. | ||
17 | |||
18 | * If [nr_validated_ptes] is partially validated, partial_flags should | ||
19 | set PTF_partiaL_set. put_page_type() will be called on this page to | ||
20 | finish off devalidation, and the appropriate refcount adjustments | ||
21 | will be done. | ||
22 | |||
23 | alloc_l[2-3]_table() indicates partial validation to its callers by | ||
24 | setting current->old_guest_table. | ||
25 | |||
26 | Unfortunately, this is mishandled. | ||
27 | |||
28 | Take the case where validating lNe[x] returns an error. | ||
29 | |||
30 | First, alloc_l3_table() doesn't check old_guest_table at all; as a | ||
31 | result, partial_flags is not set when it should be. nr_validated_ptes | ||
32 | is set to x; and since PFT_partial_set clear, de-validation resumes at | ||
33 | nr_validated_ptes-1. This means that the l2 page at pl3e[x] will not | ||
34 | have put_page_type() called on it when de-validating the rest of the | ||
35 | l3: it will be stuck in the PGT_partial state until the domain is | ||
36 | destroyed, or until it is re-used as an l2. (Any other page type will | ||
37 | fail.) | ||
38 | |||
39 | Worse, alloc_l4_table(), rather than setting PTF_partial_set as it | ||
40 | should, sets nr_validated_ptes to x+1. When de-validating, since | ||
41 | partial is 0, this will correctly resume calling put_page_type at [x]; | ||
42 | but, if the put_page_type() is never called, but instead | ||
43 | get_page_type() is called, validation will pick up at [x+1], | ||
44 | neglecting to validate [x]. If the rest of the validation succeeds, | ||
45 | the l4 will be validated even though [x] is invalid. | ||
46 | |||
47 | Fix this in both cases by setting PTF_partial_set if old_guest_table | ||
48 | is set. | ||
49 | |||
50 | While here, add some safety catches: | ||
51 | - old_guest_table must point to the page contained in | ||
52 | [nr_validated_ptes]. | ||
53 | - alloc_l1_page shouldn't set old_guest_table | ||
54 | |||
55 | If we experience one of these situations in production builds, it's | ||
56 | safer to avoid calling put_page_type for the pages in question. If | ||
57 | they have PGT_partial set, they will be cleaned up on domain | ||
58 | destruction; if not, we have no idea whether a type count is safe to | ||
59 | drop. Retaining an extra type ref that should have been dropped may | ||
60 | trigger a BUG() on the free_domain_page() path, but dropping a type | ||
61 | count that shouldn't be dropped may cause a privilege escalation. | ||
62 | |||
63 | This is part of XSA-299. | ||
64 | |||
65 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
66 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
67 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
68 | --- | ||
69 | xen/arch/x86/mm.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++- | ||
70 | 1 file changed, 54 insertions(+), 1 deletion(-) | ||
71 | |||
72 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
73 | index 300f147e98..2ea32463a8 100644 | ||
74 | --- a/xen/arch/x86/mm.c | ||
75 | +++ b/xen/arch/x86/mm.c | ||
76 | @@ -1592,6 +1592,20 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
77 | ASSERT(current->arch.old_guest_table == NULL); | ||
78 | if ( i ) | ||
79 | { | ||
80 | + /* | ||
81 | + * alloc_l1_table() doesn't set old_guest_table; it does | ||
82 | + * its own tear-down immediately on failure. If it | ||
83 | + * did we'd need to check it and set partial_flags as we | ||
84 | + * do in alloc_l[34]_table(). | ||
85 | + * | ||
86 | + * Note on the use of ASSERT: if it's non-null and | ||
87 | + * hasn't been cleaned up yet, it should have | ||
88 | + * PGT_partial set; and so the type will be cleaned up | ||
89 | + * on domain destruction. Unfortunately, we would | ||
90 | + * leak the general ref held by old_guest_table; but | ||
91 | + * leaking a page is less bad than a host crash. | ||
92 | + */ | ||
93 | + ASSERT(current->arch.old_guest_table == NULL); | ||
94 | page->nr_validated_ptes = i; | ||
95 | page->partial_flags = partial_flags; | ||
96 | current->arch.old_guest_ptpg = NULL; | ||
97 | @@ -1619,6 +1633,7 @@ static int alloc_l3_table(struct page_info *page) | ||
98 | unsigned int i; | ||
99 | int rc = 0; | ||
100 | unsigned int partial_flags = page->partial_flags; | ||
101 | + l3_pgentry_t l3e = l3e_empty(); | ||
102 | |||
103 | pl3e = map_domain_page(_mfn(pfn)); | ||
104 | |||
105 | @@ -1665,7 +1680,11 @@ static int alloc_l3_table(struct page_info *page) | ||
106 | rc = -ERESTART; | ||
107 | } | ||
108 | if ( rc < 0 ) | ||
109 | + { | ||
110 | + /* XSA-299 Backport: Copy l3e for checking */ | ||
111 | + l3e = pl3e[i]; | ||
112 | break; | ||
113 | + } | ||
114 | |||
115 | pl3e[i] = adjust_guest_l3e(pl3e[i], d); | ||
116 | } | ||
117 | @@ -1679,6 +1698,24 @@ static int alloc_l3_table(struct page_info *page) | ||
118 | { | ||
119 | page->nr_validated_ptes = i; | ||
120 | page->partial_flags = partial_flags; | ||
121 | + if ( current->arch.old_guest_table ) | ||
122 | + { | ||
123 | + /* | ||
124 | + * We've experienced a validation failure. If | ||
125 | + * old_guest_table is set, "transfer" the general | ||
126 | + * reference count to pl3e[nr_validated_ptes] by | ||
127 | + * setting PTF_partial_set. | ||
128 | + * | ||
129 | + * As a precaution, check that old_guest_table is the | ||
130 | + * page pointed to by pl3e[nr_validated_ptes]. If | ||
131 | + * not, it's safer to leak a type ref on production | ||
132 | + * builds. | ||
133 | + */ | ||
134 | + if ( current->arch.old_guest_table == l3e_get_page(l3e) ) | ||
135 | + page->partial_flags = PTF_partial_set; | ||
136 | + else | ||
137 | + ASSERT_UNREACHABLE(); | ||
138 | + } | ||
139 | current->arch.old_guest_ptpg = NULL; | ||
140 | current->arch.old_guest_table = page; | ||
141 | } | ||
142 | @@ -1841,7 +1878,23 @@ static int alloc_l4_table(struct page_info *page) | ||
143 | else | ||
144 | { | ||
145 | if ( current->arch.old_guest_table ) | ||
146 | - page->nr_validated_ptes++; | ||
147 | + { | ||
148 | + /* | ||
149 | + * We've experienced a validation failure. If | ||
150 | + * old_guest_table is set, "transfer" the general | ||
151 | + * reference count to pl3e[nr_validated_ptes] by | ||
152 | + * setting PTF_partial_set. | ||
153 | + * | ||
154 | + * As a precaution, check that old_guest_table is the | ||
155 | + * page pointed to by pl4e[nr_validated_ptes]. If | ||
156 | + * not, it's safer to leak a type ref on production | ||
157 | + * builds. | ||
158 | + */ | ||
159 | + if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) ) | ||
160 | + page->partial_flags = PTF_partial_set; | ||
161 | + else | ||
162 | + ASSERT_UNREACHABLE(); | ||
163 | + } | ||
164 | current->arch.old_guest_ptpg = NULL; | ||
165 | current->arch.old_guest_table = page; | ||
166 | } | ||
167 | -- | ||
168 | 2.23.0 | ||
169 | |||
diff --git a/main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch b/main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch new file mode 100644 index 0000000000..5d722cd2ab --- /dev/null +++ b/main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch | |||
@@ -0,0 +1,413 @@ | |||
1 | From 4905f7fbaa60f75df063305c9532fb63b77deab9 Mon Sep 17 00:00:00 2001 | ||
2 | From: George Dunlap <george.dunlap@citrix.com> | ||
3 | Date: Thu, 10 Oct 2019 17:57:50 +0100 | ||
4 | Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to | ||
5 | begin with | ||
6 | |||
7 | Validation and de-validation of pagetable trees may take arbitrarily | ||
8 | large amounts of time, and so must be preemptible. This is indicated | ||
9 | by setting the PGT_partial bit in the type_info, and setting | ||
10 | nr_validated_entries and partial_flags appropriately. Specifically, | ||
11 | if the entry at [nr_validated_entries] is partially validated, | ||
12 | partial_flags should have the PGT_partial_set bit set, and the entry | ||
13 | should hold a general reference count. During de-validation, | ||
14 | put_page_type() is called on partially validated entries. | ||
15 | |||
16 | Unfortunately, there are a number of issues with the current algorithm. | ||
17 | |||
18 | First, doing a "normal" put_page_type() is not safe when no type ref | ||
19 | is held: there is nothing to stop another vcpu from coming along and | ||
20 | picking up validation again: at which point the put_page_type may drop | ||
21 | the only page ref on an in-use page. Some examples are listed in the | ||
22 | appendix. | ||
23 | |||
24 | The core issue is that put_page_type() is being called both to clean | ||
25 | up PGT_partial, and to drop a type count; and has no way of knowing | ||
26 | which is which; and so if in between, PGT_partial is cleared, | ||
27 | put_page_type() will drop the type ref erroneously. | ||
28 | |||
29 | What is needed is to distinguish between two states: | ||
30 | - Dropping a type ref which is held | ||
31 | - Cleaning up a page which has been partially de/validated | ||
32 | |||
33 | Fix this by telling put_page_type() which of the two activities you | ||
34 | intend. | ||
35 | |||
36 | When cleaning up a partial de/validation, take no action unless you | ||
37 | find a page partially validated. | ||
38 | |||
39 | If put_page_type() is called without PTF_partial_set, and finds the | ||
40 | page in a PGT_partial state anyway, then there's certainly been a | ||
41 | misaccounting somewhere, and carrying on would almost certainly cause | ||
42 | a security issue, so crash the host instead. | ||
43 | |||
44 | In put_page_from_lNe, pass partial_flags on to _put_page_type(). | ||
45 | |||
46 | old_guest_table may be set either with a fully validated page (when | ||
47 | using the "deferred put" pattern), or with a partially validated page | ||
48 | (when a normal "de-validation" is interrupted, or when a validation | ||
49 | fails part-way through due to invalid entries). Add a flag, | ||
50 | old_guest_table_partial, to indicate which of these it is, and use | ||
51 | that to pass the appropriate flag to _put_page_type(). | ||
52 | |||
53 | While here, delete stray trailing whitespace. | ||
54 | |||
55 | This is part of XSA-299. | ||
56 | |||
57 | Reported-by: George Dunlap <george.dunlap@citrix.com> | ||
58 | Signed-off-by: George Dunlap <george.dunlap@citrix.com> | ||
59 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
60 | ----- | ||
61 | Appendix: | ||
62 | |||
63 | Suppose page A, when interpreted as an l3 pagetable, contains all | ||
64 | valid entries; and suppose A[x] points to page B, which when | ||
65 | interpreted as an l2 pagetable, contains all valid entries. | ||
66 | |||
67 | P1: PIN_L3_TABLE | ||
68 | A -> PGT_l3_table | 1 | valid | ||
69 | B -> PGT_l2_table | 1 | valid | ||
70 | |||
71 | P1: UNPIN_TABLE | ||
72 | > Arrange to interrupt after B has been de-validated | ||
73 | B: | ||
74 | type_info -> PGT_l2_table | 0 | ||
75 | A: | ||
76 | type_info -> PGT_l3_table | 1 | partial | ||
77 | nr_validated_enties -> (less than x) | ||
78 | |||
79 | P2: mod_l4_entry to point to A | ||
80 | > Arrange for this to be interrupted while B is being validated | ||
81 | B: | ||
82 | type_info -> PGT_l2_table | 1 | partial | ||
83 | (nr_validated_entires &c set as appropriate) | ||
84 | A: | ||
85 | type_info -> PGT_l3_table | 1 | partial | ||
86 | nr_validated_entries -> x | ||
87 | partial_pte = 1 | ||
88 | |||
89 | P3: mod_l3_entry some other unrelated l3 to point to B: | ||
90 | B: | ||
91 | type_info -> PGT_l2_table | 1 | ||
92 | |||
93 | P1: Restart UNPIN_TABLE | ||
94 | |||
95 | At this point, since A.nr_validate_entries == x and A.partial_pte != | ||
96 | 0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping | ||
97 | its type count to 0 while it's still being pointed to by some other l3 | ||
98 | |||
99 | A similar issue arises with old_guest_table. Consider the following | ||
100 | scenario: | ||
101 | |||
102 | Suppose A is a page which, when interpreted as an l2, has valid entries | ||
103 | until entry x, which is invalid. | ||
104 | |||
105 | V1: PIN_L2_TABLE(A) | ||
106 | <Validate until we try to validate [x], get -EINVAL> | ||
107 | A -> PGT_l2_table | 1 | PGT_partial | ||
108 | V1 -> old_guest_table = A | ||
109 | <delayed> | ||
110 | |||
111 | V2: PIN_L2_TABLE(A) | ||
112 | <Pick up where V1 left off, try to re-validate [x], get -EINVAL> | ||
113 | A -> PGT_l2_table | 1 | PGT_partial | ||
114 | V2 -> old_guest_table = A | ||
115 | <restart> | ||
116 | put_old_guest_table() | ||
117 | _put_page_type(A) | ||
118 | A -> PGT_l2_table | 0 | ||
119 | |||
120 | V1: <restart> | ||
121 | put_old_guest_table() | ||
122 | _put_page_type(A) # UNDERFLOW | ||
123 | |||
124 | Indeed, it is possible to engineer for old_guest_table for every vcpu | ||
125 | a guest has to point to the same page. | ||
126 | --- | ||
127 | xen/arch/x86/domain.c | 6 +++ | ||
128 | xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++----- | ||
129 | xen/include/asm-x86/domain.h | 4 +- | ||
130 | 3 files changed, 95 insertions(+), 14 deletions(-) | ||
131 | |||
132 | diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c | ||
133 | index 897124f05f..6074fa5947 100644 | ||
134 | --- a/xen/arch/x86/domain.c | ||
135 | +++ b/xen/arch/x86/domain.c | ||
136 | @@ -1075,9 +1075,15 @@ int arch_set_info_guest( | ||
137 | rc = -ERESTART; | ||
138 | /* Fallthrough */ | ||
139 | case -ERESTART: | ||
140 | + /* | ||
141 | + * NB that we're putting the kernel-mode table | ||
142 | + * here, which we've already successfully | ||
143 | + * validated above; hence partial = false; | ||
144 | + */ | ||
145 | v->arch.old_guest_ptpg = NULL; | ||
146 | v->arch.old_guest_table = | ||
147 | pagetable_get_page(v->arch.guest_table); | ||
148 | + v->arch.old_guest_table_partial = false; | ||
149 | v->arch.guest_table = pagetable_null(); | ||
150 | break; | ||
151 | default: | ||
152 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
153 | index 2ea32463a8..9ae71d864a 100644 | ||
154 | --- a/xen/arch/x86/mm.c | ||
155 | +++ b/xen/arch/x86/mm.c | ||
156 | @@ -1384,10 +1384,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, | ||
157 | { | ||
158 | current->arch.old_guest_ptpg = ptpg; | ||
159 | current->arch.old_guest_table = pg; | ||
160 | + current->arch.old_guest_table_partial = false; | ||
161 | } | ||
162 | else | ||
163 | { | ||
164 | - rc = _put_page_type(pg, PTF_preemptible, ptpg); | ||
165 | + rc = _put_page_type(pg, flags | PTF_preemptible, ptpg); | ||
166 | if ( likely(!rc) ) | ||
167 | put_page(pg); | ||
168 | } | ||
169 | @@ -1410,6 +1411,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, | ||
170 | unsigned long mfn = l3e_get_pfn(l3e); | ||
171 | int writeable = l3e_get_flags(l3e) & _PAGE_RW; | ||
172 | |||
173 | + ASSERT(!(flags & PTF_partial_set)); | ||
174 | ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))); | ||
175 | do { | ||
176 | put_data_page(mfn_to_page(_mfn(mfn)), writeable); | ||
177 | @@ -1422,12 +1424,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, | ||
178 | |||
179 | if ( flags & PTF_defer ) | ||
180 | { | ||
181 | + ASSERT(!(flags & PTF_partial_set)); | ||
182 | current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); | ||
183 | current->arch.old_guest_table = pg; | ||
184 | + current->arch.old_guest_table_partial = false; | ||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | - rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); | ||
189 | + rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn))); | ||
190 | if ( likely(!rc) ) | ||
191 | put_page(pg); | ||
192 | |||
193 | @@ -1446,12 +1450,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, | ||
194 | |||
195 | if ( flags & PTF_defer ) | ||
196 | { | ||
197 | + ASSERT(!(flags & PTF_partial_set)); | ||
198 | current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); | ||
199 | current->arch.old_guest_table = pg; | ||
200 | + current->arch.old_guest_table_partial = false; | ||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | - rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); | ||
205 | + rc = _put_page_type(pg, flags | PTF_preemptible, | ||
206 | + mfn_to_page(_mfn(pfn))); | ||
207 | if ( likely(!rc) ) | ||
208 | put_page(pg); | ||
209 | } | ||
210 | @@ -1556,6 +1563,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
211 | |||
212 | pl2e = map_domain_page(_mfn(pfn)); | ||
213 | |||
214 | + /* | ||
215 | + * NB that alloc_l2_table will never set partial_pte on an l2; but | ||
216 | + * free_l2_table might if a linear_pagetable entry is interrupted | ||
217 | + * partway through de-validation. In that circumstance, | ||
218 | + * get_page_from_l2e() will always return -EINVAL; and we must | ||
219 | + * retain the type ref by doing the normal partial_flags tracking. | ||
220 | + */ | ||
221 | + | ||
222 | for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; | ||
223 | i++, partial_flags = 0 ) | ||
224 | { | ||
225 | @@ -1610,6 +1625,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) | ||
226 | page->partial_flags = partial_flags; | ||
227 | current->arch.old_guest_ptpg = NULL; | ||
228 | current->arch.old_guest_table = page; | ||
229 | + current->arch.old_guest_table_partial = true; | ||
230 | } | ||
231 | } | ||
232 | if ( rc < 0 ) | ||
233 | @@ -1712,12 +1728,16 @@ static int alloc_l3_table(struct page_info *page) | ||
234 | * builds. | ||
235 | */ | ||
236 | if ( current->arch.old_guest_table == l3e_get_page(l3e) ) | ||
237 | + { | ||
238 | + ASSERT(current->arch.old_guest_table_partial); | ||
239 | page->partial_flags = PTF_partial_set; | ||
240 | + } | ||
241 | else | ||
242 | ASSERT_UNREACHABLE(); | ||
243 | } | ||
244 | current->arch.old_guest_ptpg = NULL; | ||
245 | current->arch.old_guest_table = page; | ||
246 | + current->arch.old_guest_table_partial = true; | ||
247 | } | ||
248 | while ( i-- > 0 ) | ||
249 | pl3e[i] = unadjust_guest_l3e(pl3e[i], d); | ||
250 | @@ -1891,12 +1911,16 @@ static int alloc_l4_table(struct page_info *page) | ||
251 | * builds. | ||
252 | */ | ||
253 | if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) ) | ||
254 | + { | ||
255 | + ASSERT(current->arch.old_guest_table_partial); | ||
256 | page->partial_flags = PTF_partial_set; | ||
257 | + } | ||
258 | else | ||
259 | ASSERT_UNREACHABLE(); | ||
260 | } | ||
261 | current->arch.old_guest_ptpg = NULL; | ||
262 | current->arch.old_guest_table = page; | ||
263 | + current->arch.old_guest_table_partial = true; | ||
264 | } | ||
265 | } | ||
266 | } | ||
267 | @@ -2760,6 +2784,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags, | ||
268 | x = y; | ||
269 | nx = x - 1; | ||
270 | |||
271 | + /* | ||
272 | + * Is this expected to do a full reference drop, or only | ||
273 | + * cleanup partial validation / devalidation? | ||
274 | + * | ||
275 | + * If the former, the caller must hold a "full" type ref; | ||
276 | + * which means the page must be validated. If the page is | ||
277 | + * *not* fully validated, continuing would almost certainly | ||
278 | + * open up a security hole. An exception to this is during | ||
279 | + * domain destruction, where PGT_validated can be dropped | ||
280 | + * without dropping a type ref. | ||
281 | + * | ||
282 | + * If the latter, do nothing unless type PGT_partial is set. | ||
283 | + * If it is set, the type count must be 1. | ||
284 | + */ | ||
285 | + if ( !(flags & PTF_partial_set) ) | ||
286 | + BUG_ON((x & PGT_partial) || | ||
287 | + !((x & PGT_validated) || page_get_owner(page)->is_dying)); | ||
288 | + else if ( !(x & PGT_partial) ) | ||
289 | + return 0; | ||
290 | + else | ||
291 | + BUG_ON((x & PGT_count_mask) != 1); | ||
292 | + | ||
293 | ASSERT((x & PGT_count_mask) != 0); | ||
294 | |||
295 | if ( unlikely((nx & PGT_count_mask) == 0) ) | ||
296 | @@ -3012,17 +3058,34 @@ int put_old_guest_table(struct vcpu *v) | ||
297 | if ( !v->arch.old_guest_table ) | ||
298 | return 0; | ||
299 | |||
300 | - switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible, | ||
301 | - v->arch.old_guest_ptpg) ) | ||
302 | + rc = _put_page_type(v->arch.old_guest_table, | ||
303 | + PTF_preemptible | | ||
304 | + ( v->arch.old_guest_table_partial ? | ||
305 | + PTF_partial_set : 0 ), | ||
306 | + v->arch.old_guest_ptpg); | ||
307 | + | ||
308 | + if ( rc == -ERESTART || rc == -EINTR ) | ||
309 | { | ||
310 | - case -EINTR: | ||
311 | - case -ERESTART: | ||
312 | + v->arch.old_guest_table_partial = (rc == -ERESTART); | ||
313 | return -ERESTART; | ||
314 | - case 0: | ||
315 | - put_page(v->arch.old_guest_table); | ||
316 | } | ||
317 | |||
318 | + /* | ||
319 | + * It shouldn't be possible for _put_page_type() to return | ||
320 | + * anything else at the moment; but if it does happen in | ||
321 | + * production, leaking the type ref is probably the best thing to | ||
322 | + * do. Either way, drop the general ref held by old_guest_table. | ||
323 | + */ | ||
324 | + ASSERT(rc == 0); | ||
325 | + | ||
326 | + put_page(v->arch.old_guest_table); | ||
327 | v->arch.old_guest_table = NULL; | ||
328 | + v->arch.old_guest_ptpg = NULL; | ||
329 | + /* | ||
330 | + * Safest default if someone sets old_guest_table without | ||
331 | + * explicitly setting old_guest_table_partial. | ||
332 | + */ | ||
333 | + v->arch.old_guest_table_partial = true; | ||
334 | |||
335 | return rc; | ||
336 | } | ||
337 | @@ -3175,11 +3238,11 @@ int new_guest_cr3(mfn_t mfn) | ||
338 | switch ( rc = put_page_and_type_preemptible(page) ) | ||
339 | { | ||
340 | case -EINTR: | ||
341 | - rc = -ERESTART; | ||
342 | - /* fallthrough */ | ||
343 | case -ERESTART: | ||
344 | curr->arch.old_guest_ptpg = NULL; | ||
345 | curr->arch.old_guest_table = page; | ||
346 | + curr->arch.old_guest_table_partial = (rc == -ERESTART); | ||
347 | + rc = -ERESTART; | ||
348 | break; | ||
349 | default: | ||
350 | BUG_ON(rc); | ||
351 | @@ -3448,6 +3511,7 @@ long do_mmuext_op( | ||
352 | { | ||
353 | curr->arch.old_guest_ptpg = NULL; | ||
354 | curr->arch.old_guest_table = page; | ||
355 | + curr->arch.old_guest_table_partial = false; | ||
356 | } | ||
357 | } | ||
358 | } | ||
359 | @@ -3482,6 +3546,11 @@ long do_mmuext_op( | ||
360 | case -ERESTART: | ||
361 | curr->arch.old_guest_ptpg = NULL; | ||
362 | curr->arch.old_guest_table = page; | ||
363 | + /* | ||
364 | + * EINTR means we still hold the type ref; ERESTART | ||
365 | + * means PGT_partial holds the type ref | ||
366 | + */ | ||
367 | + curr->arch.old_guest_table_partial = (rc == -ERESTART); | ||
368 | rc = 0; | ||
369 | break; | ||
370 | default: | ||
371 | @@ -3550,11 +3619,15 @@ long do_mmuext_op( | ||
372 | switch ( rc = put_page_and_type_preemptible(page) ) | ||
373 | { | ||
374 | case -EINTR: | ||
375 | - rc = -ERESTART; | ||
376 | - /* fallthrough */ | ||
377 | case -ERESTART: | ||
378 | curr->arch.old_guest_ptpg = NULL; | ||
379 | curr->arch.old_guest_table = page; | ||
380 | + /* | ||
381 | + * EINTR means we still hold the type ref; | ||
382 | + * ERESTART means PGT_partial holds the ref | ||
383 | + */ | ||
384 | + curr->arch.old_guest_table_partial = (rc == -ERESTART); | ||
385 | + rc = -ERESTART; | ||
386 | break; | ||
387 | default: | ||
388 | BUG_ON(rc); | ||
389 | diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h | ||
390 | index aec65630d9..5afaf6b9de 100644 | ||
391 | --- a/xen/include/asm-x86/domain.h | ||
392 | +++ b/xen/include/asm-x86/domain.h | ||
393 | @@ -311,7 +311,7 @@ struct arch_domain | ||
394 | |||
395 | struct paging_domain paging; | ||
396 | struct p2m_domain *p2m; | ||
397 | - /* To enforce lock ordering in the pod code wrt the | ||
398 | + /* To enforce lock ordering in the pod code wrt the | ||
399 | * page_alloc lock */ | ||
400 | int page_alloc_unlock_level; | ||
401 | |||
402 | @@ -550,6 +550,8 @@ struct arch_vcpu | ||
403 | struct page_info *old_guest_table; /* partially destructed pagetable */ | ||
404 | struct page_info *old_guest_ptpg; /* containing page table of the */ | ||
405 | /* former, if any */ | ||
406 | + bool old_guest_table_partial; /* Are we dropping a type ref, or just | ||
407 | + * finishing up a partial de-validation? */ | ||
408 | /* guest_table holds a ref to the page, and also a type-count unless | ||
409 | * shadow refcounts are in use */ | ||
410 | pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ | ||
411 | -- | ||
412 | 2.23.0 | ||
413 | |||
diff --git a/main/xen/xsa301-4.11-1.patch b/main/xen/xsa301-4.11-1.patch new file mode 100644 index 0000000000..4d528fe13b --- /dev/null +++ b/main/xen/xsa301-4.11-1.patch | |||
@@ -0,0 +1,80 @@ | |||
1 | From 21dfe8f707febd62869d4ebbaa155736870bebec Mon Sep 17 00:00:00 2001 | ||
2 | From: Julien Grall <julien.grall@arm.com> | ||
3 | Date: Wed, 2 Oct 2019 12:06:50 +0100 | ||
4 | Subject: [PATCH 1/3] xen/arm: p2m: Avoid aliasing guest physical frame | ||
5 | |||
6 | The P2M helpers implementation is quite lax and will end up to ignore | ||
7 | the unused top bits of a guest physical frame. | ||
8 | |||
9 | This effectively means that p2m_set_entry() will create a mapping for a | ||
10 | different frame (it is always equal to gfn & (mask unused bits)). Yet | ||
11 | p2m->max_mapped_gfn will be updated using the original frame. | ||
12 | |||
13 | At the moment, p2m_get_entry() and p2m_resolve_translation_fault() | ||
14 | assume that p2m_get_root_pointer() will always return a non-NULL pointer | ||
15 | when the GFN is smaller than p2m->max_mapped_gfn. | ||
16 | |||
17 | Unfortunately, because of the aliasing described above, it would be | ||
18 | possible to set p2m->max_mapped_gfn high enough so it covers frame that | ||
19 | would lead p2m_get_root_pointer() to return NULL. | ||
20 | |||
21 | As we don't sanity check the guest physical frame provided by a guest, a | ||
22 | malicious guest could craft a series of hypercalls that will hit the | ||
23 | BUG_ON() and therefore DoS Xen. | ||
24 | |||
25 | To prevent aliasing, the function p2m_get_root_pointer() is now reworked | ||
26 | to return NULL If any of the unused top bits are not zero. The caller | ||
27 | can then decide what's the appropriate action to do. Since the two paths | ||
28 | (i.e. P2M_ROOT_PAGES == 1 and P2M_ROOT_PAGES != 1) are now very | ||
29 | similarly, take the opportunity to consolidate them making the code a | ||
30 | bit simpler. | ||
31 | |||
32 | With this change, p2m_get_entry() will not try to insert a mapping as | ||
33 | the root pointer is invalid. | ||
34 | |||
35 | Note that root_table is now switch to unsigned long as unsigned int is | ||
36 | not enough to hold part of a GFN. | ||
37 | |||
38 | This is part of XSA-301. | ||
39 | |||
40 | Reported-by: Julien Grall <Julien.Grall@arm.com> | ||
41 | Signed-off-by: Julien Grall <julien.grall@arm.com> | ||
42 | Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> | ||
43 | --- | ||
44 | xen/arch/arm/p2m.c | 17 +++++------------ | ||
45 | 1 file changed, 5 insertions(+), 12 deletions(-) | ||
46 | |||
47 | diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c | ||
48 | index d43c3aa896..3967ee7306 100644 | ||
49 | --- a/xen/arch/arm/p2m.c | ||
50 | +++ b/xen/arch/arm/p2m.c | ||
51 | @@ -177,21 +177,14 @@ void p2m_tlb_flush_sync(struct p2m_domain *p2m) | ||
52 | static lpae_t *p2m_get_root_pointer(struct p2m_domain *p2m, | ||
53 | gfn_t gfn) | ||
54 | { | ||
55 | - unsigned int root_table; | ||
56 | - | ||
57 | - if ( P2M_ROOT_PAGES == 1 ) | ||
58 | - return __map_domain_page(p2m->root); | ||
59 | + unsigned long root_table; | ||
60 | |||
61 | /* | ||
62 | - * Concatenated root-level tables. The table number will be the | ||
63 | - * offset at the previous level. It is not possible to | ||
64 | - * concatenate a level-0 root. | ||
65 | + * While the root table index is the offset from the previous level, | ||
66 | + * we can't use (P2M_ROOT_LEVEL - 1) because the root level might be | ||
67 | + * 0. Yet we still want to check if all the unused bits are zeroed. | ||
68 | */ | ||
69 | - ASSERT(P2M_ROOT_LEVEL > 0); | ||
70 | - | ||
71 | - root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL - 1]); | ||
72 | - root_table &= LPAE_ENTRY_MASK; | ||
73 | - | ||
74 | + root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL] + LPAE_SHIFT); | ||
75 | if ( root_table >= P2M_ROOT_PAGES ) | ||
76 | return NULL; | ||
77 | |||
78 | -- | ||
79 | 2.11.0 | ||
80 | |||
diff --git a/main/xen/xsa301-4.11-2.patch b/main/xen/xsa301-4.11-2.patch new file mode 100644 index 0000000000..33b6150370 --- /dev/null +++ b/main/xen/xsa301-4.11-2.patch | |||
@@ -0,0 +1,92 @@ | |||
1 | From 4426d993b7ee0966fb39531dc5a269ce8493ca97 Mon Sep 17 00:00:00 2001 | ||
2 | From: Julien Grall <julien.grall@arm.com> | ||
3 | Date: Wed, 2 Oct 2019 12:35:59 +0100 | ||
4 | Subject: [PATCH 2/3] xen/arm: p2m: Avoid off-by-one check on | ||
5 | p2m->max_mapped_gfn | ||
6 | |||
7 | The code base is using inconsistently the field p2m->max_mapped_gfn. | ||
8 | Some of the useres expect that p2m->max_guest_gfn contain the highest | ||
9 | mapped GFN while others expect highest + 1. | ||
10 | |||
11 | p2m->max_guest_gfn is set as highest + 1, because of that the sanity | ||
12 | check on the GFN in p2m_resolved_translation_fault() and | ||
13 | p2m_get_entry() can be bypassed when GFN == p2m->max_guest_gfn. | ||
14 | |||
15 | p2m_get_root_pointer(p2m->max_guest_gfn) may return NULL if it is | ||
16 | outside of address range supported and therefore the BUG_ON() could be | ||
17 | hit. | ||
18 | |||
19 | The current value hold in p2m->max_mapped_gfn is inconsistent with the | ||
20 | expectation of the common code (see domain_get_maximum_gpfn()) and also | ||
21 | the documentation of the field. | ||
22 | |||
23 | Rather than changing the check in p2m_translation_fault() and | ||
24 | p2m_get_entry(), p2m->max_mapped_gfn is now containing the highest | ||
25 | mapped GFN and the callers assuming "highest + 1" are now adjusted. | ||
26 | |||
27 | Take the opportunity to use 1UL rather than 1 as page_order could | ||
28 | theoritically big enough to overflow a 32-bit integer. | ||
29 | |||
30 | Lastly, the documentation of the field max_guest_gfn to reflect how it | ||
31 | is computed. | ||
32 | |||
33 | This is part of XSA-301. | ||
34 | |||
35 | Reported-by: Julien Grall <Julien.Grall@arm.com> | ||
36 | Signed-off-by: Julien Grall <julien.grall@arm.com> | ||
37 | Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> | ||
38 | --- | ||
39 | xen/arch/arm/p2m.c | 6 +++--- | ||
40 | xen/include/asm-arm/p2m.h | 5 +---- | ||
41 | 2 files changed, 4 insertions(+), 7 deletions(-) | ||
42 | |||
43 | diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c | ||
44 | index 3967ee7306..c7e049901d 100644 | ||
45 | --- a/xen/arch/arm/p2m.c | ||
46 | +++ b/xen/arch/arm/p2m.c | ||
47 | @@ -931,7 +931,7 @@ static int __p2m_set_entry(struct p2m_domain *p2m, | ||
48 | p2m_write_pte(entry, pte, p2m->clean_pte); | ||
49 | |||
50 | p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn, | ||
51 | - gfn_add(sgfn, 1 << page_order)); | ||
52 | + gfn_add(sgfn, (1UL << page_order) - 1)); | ||
53 | p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn); | ||
54 | } | ||
55 | |||
56 | @@ -1291,7 +1291,7 @@ int relinquish_p2m_mapping(struct domain *d) | ||
57 | p2m_write_lock(p2m); | ||
58 | |||
59 | start = p2m->lowest_mapped_gfn; | ||
60 | - end = p2m->max_mapped_gfn; | ||
61 | + end = gfn_add(p2m->max_mapped_gfn, 1); | ||
62 | |||
63 | for ( ; gfn_x(start) < gfn_x(end); | ||
64 | start = gfn_next_boundary(start, order) ) | ||
65 | @@ -1356,7 +1356,7 @@ int p2m_cache_flush(struct domain *d, gfn_t start, unsigned long nr) | ||
66 | p2m_read_lock(p2m); | ||
67 | |||
68 | start = gfn_max(start, p2m->lowest_mapped_gfn); | ||
69 | - end = gfn_min(end, p2m->max_mapped_gfn); | ||
70 | + end = gfn_min(end, gfn_add(p2m->max_mapped_gfn, 1)); | ||
71 | |||
72 | for ( ; gfn_x(start) < gfn_x(end); start = next_gfn ) | ||
73 | { | ||
74 | diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h | ||
75 | index 8823707c17..7f1f7e9109 100644 | ||
76 | --- a/xen/include/asm-arm/p2m.h | ||
77 | +++ b/xen/include/asm-arm/p2m.h | ||
78 | @@ -38,10 +38,7 @@ struct p2m_domain { | ||
79 | /* Current Translation Table Base Register for the p2m */ | ||
80 | uint64_t vttbr; | ||
81 | |||
82 | - /* | ||
83 | - * Highest guest frame that's ever been mapped in the p2m | ||
84 | - * Only takes into account ram and foreign mapping | ||
85 | - */ | ||
86 | + /* Highest guest frame that's ever been mapped in the p2m */ | ||
87 | gfn_t max_mapped_gfn; | ||
88 | |||
89 | /* | ||
90 | -- | ||
91 | 2.11.0 | ||
92 | |||
diff --git a/main/xen/xsa301-4.11-3.patch b/main/xen/xsa301-4.11-3.patch new file mode 100644 index 0000000000..55a701a5c7 --- /dev/null +++ b/main/xen/xsa301-4.11-3.patch | |||
@@ -0,0 +1,49 @@ | |||
1 | From 61c73af08b4ede1fc8cfd2cf72661e6c7cfdbeaa Mon Sep 17 00:00:00 2001 | ||
2 | From: Julien Grall <julien.grall@arm.com> | ||
3 | Date: Wed, 2 Oct 2019 10:55:07 +0100 | ||
4 | Subject: [PATCH 3/3] xen/arm: p2m: Don't check the return of | ||
5 | p2m_get_root_pointer() with BUG_ON() | ||
6 | |||
7 | It turns out that the BUG_ON() was actually reachable with well-crafted | ||
8 | hypercalls. The BUG_ON() is here to prevent catch logical error, so | ||
9 | crashing Xen is a bit over the top. | ||
10 | |||
11 | While all the holes should now be fixed, it would be better to downgrade | ||
12 | the BUG_ON() to something less fatal to prevent any more DoS. | ||
13 | |||
14 | The BUG_ON() in p2m_get_entry() is now replaced by ASSERT_UNREACHABLE() | ||
15 | to catch mistake in debug build and return INVALID_MFN for production | ||
16 | build. The interface also requires to set page_order to give an idea of | ||
17 | the size of "hole". So 'level' is now set so we report a hole of size of | ||
18 | the an entry of the root page-table. This stays inline with what happen | ||
19 | when the GFN is higher than p2m->max_mapped_gfn. | ||
20 | |||
21 | This is part of XSA-301. | ||
22 | |||
23 | Reported-by: Julien Grall <Julien.Grall@arm.com> | ||
24 | Signed-off-by: Julien Grall <julien.grall@arm.com> | ||
25 | --- | ||
26 | xen/arch/arm/p2m.c | 7 ++++++- | ||
27 | 1 file changed, 6 insertions(+), 1 deletion(-) | ||
28 | |||
29 | diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c | ||
30 | index c7e049901d..af3515df42 100644 | ||
31 | --- a/xen/arch/arm/p2m.c | ||
32 | +++ b/xen/arch/arm/p2m.c | ||
33 | @@ -318,7 +318,12 @@ mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn, | ||
34 | * the table should always be non-NULL because the gfn is below | ||
35 | * p2m->max_mapped_gfn and the root table pages are always present. | ||
36 | */ | ||
37 | - BUG_ON(table == NULL); | ||
38 | + if ( !table ) | ||
39 | + { | ||
40 | + ASSERT_UNREACHABLE(); | ||
41 | + level = P2M_ROOT_LEVEL; | ||
42 | + goto out; | ||
43 | + } | ||
44 | |||
45 | for ( level = P2M_ROOT_LEVEL; level < 3; level++ ) | ||
46 | { | ||
47 | -- | ||
48 | 2.11.0 | ||
49 | |||
diff --git a/main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch b/main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch new file mode 100644 index 0000000000..0b93de18ac --- /dev/null +++ b/main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch | |||
@@ -0,0 +1,37 @@ | |||
1 | From 2bcbf2843250888b720bfea188ac9842c847f388 Mon Sep 17 00:00:00 2001 | ||
2 | From: Jan Beulich <jbeulich@suse.com> | ||
3 | Date: Wed, 2 Oct 2019 13:36:59 +0200 | ||
4 | Subject: [PATCH 1/2] IOMMU: add missing HVM check | ||
5 | MIME-Version: 1.0 | ||
6 | Content-Type: text/plain; charset=UTF-8 | ||
7 | Content-Transfer-Encoding: 8bit | ||
8 | |||
9 | Fix an unguarded d->arch.hvm access in assign_device(). | ||
10 | |||
11 | Signed-off-by: Jan Beulich <jbeulich@suse.com> | ||
12 | Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> | ||
13 | Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> | ||
14 | |||
15 | (cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6) | ||
16 | Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> | ||
17 | --- | ||
18 | xen/drivers/passthrough/pci.c | 3 ++- | ||
19 | 1 file changed, 2 insertions(+), 1 deletion(-) | ||
20 | |||
21 | diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c | ||
22 | index e021c7a317..e1668a1968 100644 | ||
23 | --- a/xen/drivers/passthrough/pci.c | ||
24 | +++ b/xen/drivers/passthrough/pci.c | ||
25 | @@ -1386,7 +1386,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) | ||
26 | /* Prevent device assign if mem paging or mem sharing have been | ||
27 | * enabled for this domain */ | ||
28 | if ( unlikely(!need_iommu(d) && | ||
29 | - (d->arch.hvm_domain.mem_sharing_enabled || | ||
30 | + ((is_hvm_domain(d) && | ||
31 | + d->arch.hvm_domain.mem_sharing_enabled) || | ||
32 | vm_event_check_ring(d->vm_event_paging) || | ||
33 | p2m_get_hostp2m(d)->global_logdirty)) ) | ||
34 | return -EXDEV; | ||
35 | -- | ||
36 | 2.11.0 | ||
37 | |||
diff --git a/main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch b/main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch new file mode 100644 index 0000000000..94eba850a4 --- /dev/null +++ b/main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch | |||
@@ -0,0 +1,498 @@ | |||
1 | From 02dd07e53b904570e0320d17d77022ddbc4e8225 Mon Sep 17 00:00:00 2001 | ||
2 | From: Paul Durrant <paul.durrant@citrix.com> | ||
3 | Date: Mon, 14 Oct 2019 17:52:59 +0100 | ||
4 | Subject: [PATCH 2/2] passthrough: quarantine PCI devices | ||
5 | |||
6 | When a PCI device is assigned to an untrusted domain, it is possible for | ||
7 | that domain to program the device to DMA to an arbitrary address. The | ||
8 | IOMMU is used to protect the host from malicious DMA by making sure that | ||
9 | the device addresses can only target memory assigned to the guest. However, | ||
10 | when the guest domain is torn down the device is assigned back to dom0, | ||
11 | thus allowing any in-flight DMA to potentially target critical host data. | ||
12 | |||
13 | This patch introduces a 'quarantine' for PCI devices using dom_io. When | ||
14 | the toolstack makes a device assignable (by binding it to pciback), it | ||
15 | will now also assign it to DOMID_IO and the device will only be assigned | ||
16 | back to dom0 when the device is made unassignable again. Whilst device is | ||
17 | assignable it will only ever transfer between dom_io and guest domains. | ||
18 | dom_io is actually only used as a sentinel domain for quarantining purposes; | ||
19 | it is not configured with any IOMMU mappings. Assignment to dom_io simply | ||
20 | means that the device's initiator (requestor) identifier is not present in | ||
21 | the IOMMU's device table and thus any DMA transactions issued will be | ||
22 | terminated with a fault condition. | ||
23 | |||
24 | In addition, a fix to assignment handling is made for VT-d. Failure | ||
25 | during the assignment step should not lead to a device still being | ||
26 | associated with its prior owner. Hand the device to DomIO temporarily, | ||
27 | until the assignment step has completed successfully. Remove the PI | ||
28 | hooks from the source domain then earlier as well. | ||
29 | |||
30 | Failure of the recovery reassign_device_ownership() may not go silent: | ||
31 | There e.g. may still be left over RMRR mappings in the domain assignment | ||
32 | to which has failed, and hence we can't allow that domain to continue | ||
33 | executing. | ||
34 | |||
35 | NOTE: This patch also includes one printk() cleanup; the | ||
36 | "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(), | ||
37 | since similar printk()-s elsewhere also don't log such a tag. | ||
38 | |||
39 | This is XSA-302. | ||
40 | |||
41 | Signed-off-by: Paul Durrant <paul.durrant@citrix.com> | ||
42 | Signed-off-by: Jan Beulich <jbeulich@suse.com> | ||
43 | Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> | ||
44 | --- | ||
45 | tools/libxl/libxl_pci.c | 25 +++++++++++- | ||
46 | xen/arch/x86/mm.c | 2 + | ||
47 | xen/common/domctl.c | 14 ++++++- | ||
48 | xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++- | ||
49 | xen/drivers/passthrough/iommu.c | 9 +++++ | ||
50 | xen/drivers/passthrough/pci.c | 59 ++++++++++++++++++++++------- | ||
51 | xen/drivers/passthrough/vtd/iommu.c | 40 ++++++++++++++++--- | ||
52 | xen/include/xen/pci.h | 3 ++ | ||
53 | 8 files changed, 138 insertions(+), 24 deletions(-) | ||
54 | |||
55 | diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c | ||
56 | index 88a55ce8bd..1b5c44f3e7 100644 | ||
57 | --- a/tools/libxl/libxl_pci.c | ||
58 | +++ b/tools/libxl/libxl_pci.c | ||
59 | @@ -749,6 +749,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, | ||
60 | libxl_device_pci *pcidev, | ||
61 | int rebind) | ||
62 | { | ||
63 | + libxl_ctx *ctx = libxl__gc_owner(gc); | ||
64 | unsigned dom, bus, dev, func; | ||
65 | char *spath, *driver_path = NULL; | ||
66 | int rc; | ||
67 | @@ -774,7 +775,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, | ||
68 | } | ||
69 | if ( rc ) { | ||
70 | LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func); | ||
71 | - return 0; | ||
72 | + goto quarantine; | ||
73 | } | ||
74 | |||
75 | /* Check to see if there's already a driver that we need to unbind from */ | ||
76 | @@ -805,6 +806,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, | ||
77 | return ERROR_FAIL; | ||
78 | } | ||
79 | |||
80 | +quarantine: | ||
81 | + /* | ||
82 | + * DOMID_IO is just a sentinel domain, without any actual mappings, | ||
83 | + * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being | ||
84 | + * unnecessarily denied. | ||
85 | + */ | ||
86 | + rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev), | ||
87 | + XEN_DOMCTL_DEV_RDM_RELAXED); | ||
88 | + if ( rc < 0 ) { | ||
89 | + LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func); | ||
90 | + return ERROR_FAIL; | ||
91 | + } | ||
92 | + | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | @@ -812,9 +826,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc, | ||
97 | libxl_device_pci *pcidev, | ||
98 | int rebind) | ||
99 | { | ||
100 | + libxl_ctx *ctx = libxl__gc_owner(gc); | ||
101 | int rc; | ||
102 | char *driver_path; | ||
103 | |||
104 | + /* De-quarantine */ | ||
105 | + rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev)); | ||
106 | + if ( rc < 0 ) { | ||
107 | + LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus, | ||
108 | + pcidev->dev, pcidev->func); | ||
109 | + return ERROR_FAIL; | ||
110 | + } | ||
111 | + | ||
112 | /* Unbind from pciback */ | ||
113 | if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) { | ||
114 | return ERROR_FAIL; | ||
115 | diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c | ||
116 | index ce2c082caf..0e42497cf7 100644 | ||
117 | --- a/xen/arch/x86/mm.c | ||
118 | +++ b/xen/arch/x86/mm.c | ||
119 | @@ -295,9 +295,11 @@ void __init arch_init_memory(void) | ||
120 | * Initialise our DOMID_IO domain. | ||
121 | * This domain owns I/O pages that are within the range of the page_info | ||
122 | * array. Mappings occur at the priv of the caller. | ||
123 | + * Quarantined PCI devices will be associated with this domain. | ||
124 | */ | ||
125 | dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0, NULL); | ||
126 | BUG_ON(IS_ERR(dom_io)); | ||
127 | + INIT_LIST_HEAD(&dom_io->arch.pdev_list); | ||
128 | |||
129 | /* | ||
130 | * Initialise our COW domain. | ||
131 | diff --git a/xen/common/domctl.c b/xen/common/domctl.c | ||
132 | index 3c6fa4ec67..a70f4b46f8 100644 | ||
133 | --- a/xen/common/domctl.c | ||
134 | +++ b/xen/common/domctl.c | ||
135 | @@ -392,6 +392,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) | ||
136 | |||
137 | switch ( op->cmd ) | ||
138 | { | ||
139 | + case XEN_DOMCTL_assign_device: | ||
140 | + case XEN_DOMCTL_deassign_device: | ||
141 | + if ( op->domain == DOMID_IO ) | ||
142 | + { | ||
143 | + d = dom_io; | ||
144 | + break; | ||
145 | + } | ||
146 | + else if ( op->domain == DOMID_INVALID ) | ||
147 | + return -ESRCH; | ||
148 | + /* fall through */ | ||
149 | case XEN_DOMCTL_test_assign_device: | ||
150 | if ( op->domain == DOMID_INVALID ) | ||
151 | { | ||
152 | @@ -413,7 +423,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) | ||
153 | |||
154 | if ( !domctl_lock_acquire() ) | ||
155 | { | ||
156 | - if ( d ) | ||
157 | + if ( d && d != dom_io ) | ||
158 | rcu_unlock_domain(d); | ||
159 | return hypercall_create_continuation( | ||
160 | __HYPERVISOR_domctl, "h", u_domctl); | ||
161 | @@ -1163,7 +1173,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) | ||
162 | domctl_lock_release(); | ||
163 | |||
164 | domctl_out_unlock_domonly: | ||
165 | - if ( d ) | ||
166 | + if ( d && d != dom_io ) | ||
167 | rcu_unlock_domain(d); | ||
168 | |||
169 | if ( copyback && __copy_to_guest(u_domctl, op, 1) ) | ||
170 | diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c | ||
171 | index 12d2695b89..ec8baae717 100644 | ||
172 | --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c | ||
173 | +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c | ||
174 | @@ -118,6 +118,10 @@ static void amd_iommu_setup_domain_device( | ||
175 | u8 bus = pdev->bus; | ||
176 | const struct domain_iommu *hd = dom_iommu(domain); | ||
177 | |||
178 | + /* dom_io is used as a sentinel for quarantined devices */ | ||
179 | + if ( domain == dom_io ) | ||
180 | + return; | ||
181 | + | ||
182 | BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode || | ||
183 | !iommu->dev_table.buffer ); | ||
184 | |||
185 | @@ -305,6 +309,10 @@ void amd_iommu_disable_domain_device(struct domain *domain, | ||
186 | int req_id; | ||
187 | u8 bus = pdev->bus; | ||
188 | |||
189 | + /* dom_io is used as a sentinel for quarantined devices */ | ||
190 | + if ( domain == dom_io ) | ||
191 | + return; | ||
192 | + | ||
193 | BUG_ON ( iommu->dev_table.buffer == NULL ); | ||
194 | req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn)); | ||
195 | dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); | ||
196 | @@ -391,7 +399,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn, | ||
197 | ivrs_mappings[req_id].read_permission); | ||
198 | } | ||
199 | |||
200 | - return reassign_device(hardware_domain, d, devfn, pdev); | ||
201 | + return reassign_device(pdev->domain, d, devfn, pdev); | ||
202 | } | ||
203 | |||
204 | static void deallocate_next_page_table(struct page_info *pg, int level) | ||
205 | diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c | ||
206 | index b5f8044439..ad2ce8f39b 100644 | ||
207 | --- a/xen/drivers/passthrough/iommu.c | ||
208 | +++ b/xen/drivers/passthrough/iommu.c | ||
209 | @@ -219,6 +219,9 @@ void iommu_teardown(struct domain *d) | ||
210 | { | ||
211 | const struct domain_iommu *hd = dom_iommu(d); | ||
212 | |||
213 | + if ( d == dom_io ) | ||
214 | + return; | ||
215 | + | ||
216 | d->need_iommu = 0; | ||
217 | hd->platform_ops->teardown(d); | ||
218 | tasklet_schedule(&iommu_pt_cleanup_tasklet); | ||
219 | @@ -229,6 +232,9 @@ int iommu_construct(struct domain *d) | ||
220 | if ( need_iommu(d) > 0 ) | ||
221 | return 0; | ||
222 | |||
223 | + if ( d == dom_io ) | ||
224 | + return 0; | ||
225 | + | ||
226 | if ( !iommu_use_hap_pt(d) ) | ||
227 | { | ||
228 | int rc; | ||
229 | @@ -404,6 +410,9 @@ int __init iommu_setup(void) | ||
230 | printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis"); | ||
231 | if ( iommu_enabled ) | ||
232 | { | ||
233 | + if ( iommu_domain_init(dom_io) ) | ||
234 | + panic("Could not set up quarantine\n"); | ||
235 | + | ||
236 | printk(" - Dom0 mode: %s\n", | ||
237 | iommu_passthrough ? "Passthrough" : | ||
238 | iommu_dom0_strict ? "Strict" : "Relaxed"); | ||
239 | diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c | ||
240 | index e1668a1968..6b2e9d2896 100644 | ||
241 | --- a/xen/drivers/passthrough/pci.c | ||
242 | +++ b/xen/drivers/passthrough/pci.c | ||
243 | @@ -1359,19 +1359,29 @@ static int iommu_remove_device(struct pci_dev *pdev) | ||
244 | return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev)); | ||
245 | } | ||
246 | |||
247 | -/* | ||
248 | - * If the device isn't owned by the hardware domain, it means it already | ||
249 | - * has been assigned to other domain, or it doesn't exist. | ||
250 | - */ | ||
251 | static int device_assigned(u16 seg, u8 bus, u8 devfn) | ||
252 | { | ||
253 | struct pci_dev *pdev; | ||
254 | + int rc = 0; | ||
255 | |||
256 | pcidevs_lock(); | ||
257 | - pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); | ||
258 | + | ||
259 | + pdev = pci_get_pdev(seg, bus, devfn); | ||
260 | + | ||
261 | + if ( !pdev ) | ||
262 | + rc = -ENODEV; | ||
263 | + /* | ||
264 | + * If the device exists and it is not owned by either the hardware | ||
265 | + * domain or dom_io then it must be assigned to a guest, or be | ||
266 | + * hidden (owned by dom_xen). | ||
267 | + */ | ||
268 | + else if ( pdev->domain != hardware_domain && | ||
269 | + pdev->domain != dom_io ) | ||
270 | + rc = -EBUSY; | ||
271 | + | ||
272 | pcidevs_unlock(); | ||
273 | |||
274 | - return pdev ? 0 : -EBUSY; | ||
275 | + return rc; | ||
276 | } | ||
277 | |||
278 | static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) | ||
279 | @@ -1385,7 +1395,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) | ||
280 | |||
281 | /* Prevent device assign if mem paging or mem sharing have been | ||
282 | * enabled for this domain */ | ||
283 | - if ( unlikely(!need_iommu(d) && | ||
284 | + if ( d != dom_io && | ||
285 | + unlikely(!need_iommu(d) && | ||
286 | ((is_hvm_domain(d) && | ||
287 | d->arch.hvm_domain.mem_sharing_enabled) || | ||
288 | vm_event_check_ring(d->vm_event_paging) || | ||
289 | @@ -1402,12 +1413,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) | ||
290 | return rc; | ||
291 | } | ||
292 | |||
293 | - pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); | ||
294 | + pdev = pci_get_pdev(seg, bus, devfn); | ||
295 | + | ||
296 | + rc = -ENODEV; | ||
297 | if ( !pdev ) | ||
298 | - { | ||
299 | - rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV; | ||
300 | goto done; | ||
301 | - } | ||
302 | + | ||
303 | + rc = 0; | ||
304 | + if ( d == pdev->domain ) | ||
305 | + goto done; | ||
306 | + | ||
307 | + rc = -EBUSY; | ||
308 | + if ( pdev->domain != hardware_domain && | ||
309 | + pdev->domain != dom_io ) | ||
310 | + goto done; | ||
311 | |||
312 | if ( pdev->msix ) | ||
313 | msixtbl_init(d); | ||
314 | @@ -1430,6 +1449,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) | ||
315 | } | ||
316 | |||
317 | done: | ||
318 | + /* The device is assigned to dom_io so mark it as quarantined */ | ||
319 | + if ( !rc && d == dom_io ) | ||
320 | + pdev->quarantine = true; | ||
321 | + | ||
322 | if ( !has_arch_pdevs(d) && need_iommu(d) ) | ||
323 | iommu_teardown(d); | ||
324 | pcidevs_unlock(); | ||
325 | @@ -1442,6 +1465,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) | ||
326 | { | ||
327 | const struct domain_iommu *hd = dom_iommu(d); | ||
328 | struct pci_dev *pdev = NULL; | ||
329 | + struct domain *target; | ||
330 | int ret = 0; | ||
331 | |||
332 | if ( !iommu_enabled || !hd->platform_ops ) | ||
333 | @@ -1452,12 +1476,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) | ||
334 | if ( !pdev ) | ||
335 | return -ENODEV; | ||
336 | |||
337 | + /* De-assignment from dom_io should de-quarantine the device */ | ||
338 | + target = (pdev->quarantine && pdev->domain != dom_io) ? | ||
339 | + dom_io : hardware_domain; | ||
340 | + | ||
341 | while ( pdev->phantom_stride ) | ||
342 | { | ||
343 | devfn += pdev->phantom_stride; | ||
344 | if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) | ||
345 | break; | ||
346 | - ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, | ||
347 | + ret = hd->platform_ops->reassign_device(d, target, devfn, | ||
348 | pci_to_dev(pdev)); | ||
349 | if ( !ret ) | ||
350 | continue; | ||
351 | @@ -1468,7 +1496,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) | ||
352 | } | ||
353 | |||
354 | devfn = pdev->devfn; | ||
355 | - ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, | ||
356 | + ret = hd->platform_ops->reassign_device(d, target, devfn, | ||
357 | pci_to_dev(pdev)); | ||
358 | if ( ret ) | ||
359 | { | ||
360 | @@ -1478,6 +1506,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) | ||
361 | return ret; | ||
362 | } | ||
363 | |||
364 | + if ( pdev->domain == hardware_domain ) | ||
365 | + pdev->quarantine = false; | ||
366 | + | ||
367 | pdev->fault.count = 0; | ||
368 | |||
369 | if ( !has_arch_pdevs(d) && need_iommu(d) ) | ||
370 | @@ -1656,7 +1687,7 @@ int iommu_do_pci_domctl( | ||
371 | ret = hypercall_create_continuation(__HYPERVISOR_domctl, | ||
372 | "h", u_domctl); | ||
373 | else if ( ret ) | ||
374 | - printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " | ||
375 | + printk(XENLOG_G_ERR | ||
376 | "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", | ||
377 | seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), | ||
378 | d->domain_id, ret); | ||
379 | diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c | ||
380 | index 481efef2b0..1d16127d8f 100644 | ||
381 | --- a/xen/drivers/passthrough/vtd/iommu.c | ||
382 | +++ b/xen/drivers/passthrough/vtd/iommu.c | ||
383 | @@ -1332,6 +1332,10 @@ int domain_context_mapping_one( | ||
384 | int agaw, rc, ret; | ||
385 | bool_t flush_dev_iotlb; | ||
386 | |||
387 | + /* dom_io is used as a sentinel for quarantined devices */ | ||
388 | + if ( domain == dom_io ) | ||
389 | + return 0; | ||
390 | + | ||
391 | ASSERT(pcidevs_locked()); | ||
392 | spin_lock(&iommu->lock); | ||
393 | maddr = bus_to_context_maddr(iommu, bus); | ||
394 | @@ -1567,6 +1571,10 @@ int domain_context_unmap_one( | ||
395 | int iommu_domid, rc, ret; | ||
396 | bool_t flush_dev_iotlb; | ||
397 | |||
398 | + /* dom_io is used as a sentinel for quarantined devices */ | ||
399 | + if ( domain == dom_io ) | ||
400 | + return 0; | ||
401 | + | ||
402 | ASSERT(pcidevs_locked()); | ||
403 | spin_lock(&iommu->lock); | ||
404 | |||
405 | @@ -1699,6 +1707,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, | ||
406 | goto out; | ||
407 | } | ||
408 | |||
409 | + /* dom_io is used as a sentinel for quarantined devices */ | ||
410 | + if ( domain == dom_io ) | ||
411 | + goto out; | ||
412 | + | ||
413 | /* | ||
414 | * if no other devices under the same iommu owned by this domain, | ||
415 | * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp | ||
416 | @@ -2383,6 +2395,15 @@ static int reassign_device_ownership( | ||
417 | if ( ret ) | ||
418 | return ret; | ||
419 | |||
420 | + if ( devfn == pdev->devfn ) | ||
421 | + { | ||
422 | + list_move(&pdev->domain_list, &dom_io->arch.pdev_list); | ||
423 | + pdev->domain = dom_io; | ||
424 | + } | ||
425 | + | ||
426 | + if ( !has_arch_pdevs(source) ) | ||
427 | + vmx_pi_hooks_deassign(source); | ||
428 | + | ||
429 | if ( !has_arch_pdevs(target) ) | ||
430 | vmx_pi_hooks_assign(target); | ||
431 | |||
432 | @@ -2401,15 +2422,13 @@ static int reassign_device_ownership( | ||
433 | pdev->domain = target; | ||
434 | } | ||
435 | |||
436 | - if ( !has_arch_pdevs(source) ) | ||
437 | - vmx_pi_hooks_deassign(source); | ||
438 | - | ||
439 | return ret; | ||
440 | } | ||
441 | |||
442 | static int intel_iommu_assign_device( | ||
443 | struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag) | ||
444 | { | ||
445 | + struct domain *s = pdev->domain; | ||
446 | struct acpi_rmrr_unit *rmrr; | ||
447 | int ret = 0, i; | ||
448 | u16 bdf, seg; | ||
449 | @@ -2452,8 +2471,8 @@ static int intel_iommu_assign_device( | ||
450 | } | ||
451 | } | ||
452 | |||
453 | - ret = reassign_device_ownership(hardware_domain, d, devfn, pdev); | ||
454 | - if ( ret ) | ||
455 | + ret = reassign_device_ownership(s, d, devfn, pdev); | ||
456 | + if ( ret || d == dom_io ) | ||
457 | return ret; | ||
458 | |||
459 | /* Setup rmrr identity mapping */ | ||
460 | @@ -2466,11 +2485,20 @@ static int intel_iommu_assign_device( | ||
461 | ret = rmrr_identity_mapping(d, 1, rmrr, flag); | ||
462 | if ( ret ) | ||
463 | { | ||
464 | - reassign_device_ownership(d, hardware_domain, devfn, pdev); | ||
465 | + int rc; | ||
466 | + | ||
467 | + rc = reassign_device_ownership(d, s, devfn, pdev); | ||
468 | printk(XENLOG_G_ERR VTDPREFIX | ||
469 | " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n", | ||
470 | rmrr->base_address, rmrr->end_address, | ||
471 | d->domain_id, ret); | ||
472 | + if ( rc ) | ||
473 | + { | ||
474 | + printk(XENLOG_ERR VTDPREFIX | ||
475 | + " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n", | ||
476 | + seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc); | ||
477 | + domain_crash(d); | ||
478 | + } | ||
479 | break; | ||
480 | } | ||
481 | } | ||
482 | diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h | ||
483 | index 43f21251a5..3241e51e3c 100644 | ||
484 | --- a/xen/include/xen/pci.h | ||
485 | +++ b/xen/include/xen/pci.h | ||
486 | @@ -68,6 +68,9 @@ struct pci_dev { | ||
487 | |||
488 | nodeid_t node; /* NUMA node */ | ||
489 | |||
490 | + /* Device to be quarantined, don't automatically re-assign to dom0 */ | ||
491 | + bool quarantine; | ||
492 | + | ||
493 | enum pdev_type { | ||
494 | DEV_TYPE_PCI_UNKNOWN, | ||
495 | DEV_TYPE_PCIe_ENDPOINT, | ||
496 | -- | ||
497 | 2.11.0 | ||
498 | |||
diff --git a/main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch b/main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch new file mode 100644 index 0000000000..afb1096c1d --- /dev/null +++ b/main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch | |||
@@ -0,0 +1,74 @@ | |||
1 | From c8cb33fa64c9ccbfa2a494a9dad2e0a763c09176 Mon Sep 17 00:00:00 2001 | ||
2 | From: Julien Grall <julien.grall@arm.com> | ||
3 | Date: Tue, 1 Oct 2019 13:07:53 +0100 | ||
4 | Subject: [PATCH 1/4] xen/arm32: entry: Split __DEFINE_ENTRY_TRAP in two | ||
5 | |||
6 | The preprocessing macro __DEFINE_ENTRY_TRAP is used to generate trap | ||
7 | entry function. While the macro is fairly small today, follow-up patches | ||
8 | will increase the size signicantly. | ||
9 | |||
10 | In general, assembly macros are more readable as they allow you to name | ||
11 | parameters and avoid '\'. So the actual implementation of the trap is | ||
12 | now switched to an assembly macro. | ||
13 | |||
14 | This is part of XSA-303. | ||
15 | |||
16 | Reported-by: Julien Grall <Julien.Grall@arm.com> | ||
17 | Signed-off-by: Julien Grall <julien.grall@arm.com> | ||
18 | Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> | ||
19 | Reviewed-by: Andre Przywara <andre.przywara@arm.com> | ||
20 | --- | ||
21 | xen/arch/arm/arm32/entry.S | 34 +++++++++++++++++++--------------- | ||
22 | 1 file changed, 19 insertions(+), 15 deletions(-) | ||
23 | |||
24 | diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S | ||
25 | index 0b4cd19abd..4a762e04f1 100644 | ||
26 | --- a/xen/arch/arm/arm32/entry.S | ||
27 | +++ b/xen/arch/arm/arm32/entry.S | ||
28 | @@ -126,24 +126,28 @@ abort_guest_exit_end: | ||
29 | skip_check: | ||
30 | mov pc, lr | ||
31 | |||
32 | -/* | ||
33 | - * Macro to define trap entry. The iflags corresponds to the list of | ||
34 | - * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. | ||
35 | - */ | ||
36 | + /* | ||
37 | + * Macro to define trap entry. The iflags corresponds to the list of | ||
38 | + * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. | ||
39 | + */ | ||
40 | + .macro vector trap, iflags | ||
41 | + SAVE_ALL | ||
42 | + cpsie \iflags | ||
43 | + adr lr, return_from_trap | ||
44 | + mov r0, sp | ||
45 | + /* | ||
46 | + * Save the stack pointer in r11. It will be restored after the | ||
47 | + * trap has been handled (see return_from_trap). | ||
48 | + */ | ||
49 | + mov r11, sp | ||
50 | + bic sp, #7 /* Align the stack pointer (noop on guest trap) */ | ||
51 | + b do_trap_\trap | ||
52 | + .endm | ||
53 | + | ||
54 | #define __DEFINE_TRAP_ENTRY(trap, iflags) \ | ||
55 | ALIGN; \ | ||
56 | trap_##trap: \ | ||
57 | - SAVE_ALL; \ | ||
58 | - cpsie iflags; \ | ||
59 | - adr lr, return_from_trap; \ | ||
60 | - mov r0, sp; \ | ||
61 | - /* \ | ||
62 | - * Save the stack pointer in r11. It will be restored after the \ | ||
63 | - * trap has been handled (see return_from_trap). \ | ||
64 | - */ \ | ||
65 | - mov r11, sp; \ | ||
66 | - bic sp, #7; /* Align the stack pointer (noop on guest trap) */ \ | ||
67 | - b do_trap_##trap | ||
68 | + vector trap, iflags | ||
69 | |||
70 | /* Trap handler which unmask IRQ/Abort, keep FIQ masked */ | ||
71 | #define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai) | ||
72 | -- | ||
73 | 2.11.0 | ||
74 | |||
diff --git a/main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch b/main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch new file mode 100644 index 0000000000..35f9c0475e --- /dev/null +++ b/main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch | |||
@@ -0,0 +1,97 @@ | |||
1 | From be7379207c83fa74f8a6c22a8ea213f02714776f Mon Sep 17 00:00:00 2001 | ||
2 | From: Julien Grall <julien.grall@arm.com> | ||
3 | Date: Tue, 1 Oct 2019 13:15:48 +0100 | ||
4 | Subject: [PATCH 2/4] xen/arm32: entry: Fold the macro SAVE_ALL in the macro | ||
5 | vector | ||
6 | |||
7 | Follow-up rework will require the macro vector to distinguish between | ||
8 | a trap from a guest vs while in the hypervisor. | ||
9 | |||
10 | The macro SAVE_ALL already has code to distinguish between the two and | ||
11 | it is only called by the vector macro. So fold the former into the | ||
12 | latter. This will help to avoid duplicating the check. | ||
13 | |||
14 | This is part of XSA-303. | ||
15 | |||
16 | Reported-by: Julien Grall <Julien.Grall@arm.com> | ||
17 | Signed-off-by: Julien Grall <julien.grall@arm.com> | ||
18 | Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> | ||
19 | Reviewed-by: Andre Przywara <andre.przywara@arm.com> | ||
20 | --- | ||
21 | xen/arch/arm/arm32/entry.S | 46 +++++++++++++++++++++++----------------------- | ||
22 | 1 file changed, 23 insertions(+), 23 deletions(-) | ||
23 | |||
24 | diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S | ||
25 | index 4a762e04f1..150cbc0b4b 100644 | ||
26 | --- a/xen/arch/arm/arm32/entry.S | ||
27 | +++ b/xen/arch/arm/arm32/entry.S | ||
28 | @@ -13,27 +13,6 @@ | ||
29 | #define RESTORE_BANKED(mode) \ | ||
30 | RESTORE_ONE_BANKED(SP_##mode) ; RESTORE_ONE_BANKED(LR_##mode) ; RESTORE_ONE_BANKED(SPSR_##mode) | ||
31 | |||
32 | -#define SAVE_ALL \ | ||
33 | - sub sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ \ | ||
34 | - push {r0-r12}; /* Save R0-R12 */ \ | ||
35 | - \ | ||
36 | - mrs r11, ELR_hyp; /* ELR_hyp is return address. */\ | ||
37 | - str r11, [sp, #UREGS_pc]; \ | ||
38 | - \ | ||
39 | - str lr, [sp, #UREGS_lr]; \ | ||
40 | - \ | ||
41 | - add r11, sp, #UREGS_kernel_sizeof+4; \ | ||
42 | - str r11, [sp, #UREGS_sp]; \ | ||
43 | - \ | ||
44 | - mrc CP32(r11, HSR); /* Save exception syndrome */ \ | ||
45 | - str r11, [sp, #UREGS_hsr]; \ | ||
46 | - \ | ||
47 | - mrs r11, SPSR_hyp; \ | ||
48 | - str r11, [sp, #UREGS_cpsr]; \ | ||
49 | - and r11, #PSR_MODE_MASK; \ | ||
50 | - cmp r11, #PSR_MODE_HYP; \ | ||
51 | - blne save_guest_regs | ||
52 | - | ||
53 | save_guest_regs: | ||
54 | #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR | ||
55 | /* | ||
56 | @@ -52,7 +31,7 @@ save_guest_regs: | ||
57 | ldr r11, =0xffffffff /* Clobber SP which is only valid for hypervisor frames. */ | ||
58 | str r11, [sp, #UREGS_sp] | ||
59 | SAVE_ONE_BANKED(SP_usr) | ||
60 | - /* LR_usr is the same physical register as lr and is saved in SAVE_ALL */ | ||
61 | + /* LR_usr is the same physical register as lr and is saved by the caller */ | ||
62 | SAVE_BANKED(svc) | ||
63 | SAVE_BANKED(abt) | ||
64 | SAVE_BANKED(und) | ||
65 | @@ -131,7 +110,28 @@ skip_check: | ||
66 | * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. | ||
67 | */ | ||
68 | .macro vector trap, iflags | ||
69 | - SAVE_ALL | ||
70 | + /* Save registers in the stack */ | ||
71 | + sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */ | ||
72 | + push {r0-r12} /* Save R0-R12 */ | ||
73 | + mrs r11, ELR_hyp /* ELR_hyp is return address */ | ||
74 | + str r11, [sp, #UREGS_pc] | ||
75 | + | ||
76 | + str lr, [sp, #UREGS_lr] | ||
77 | + | ||
78 | + add r11, sp, #(UREGS_kernel_sizeof + 4) | ||
79 | + | ||
80 | + str r11, [sp, #UREGS_sp] | ||
81 | + | ||
82 | + mrc CP32(r11, HSR) /* Save exception syndrome */ | ||
83 | + str r11, [sp, #UREGS_hsr] | ||
84 | + | ||
85 | + mrs r11, SPSR_hyp | ||
86 | + str r11, [sp, #UREGS_cpsr] | ||
87 | + and r11, #PSR_MODE_MASK | ||
88 | + cmp r11, #PSR_MODE_HYP | ||
89 | + blne save_guest_regs | ||
90 | + | ||
91 | + /* We are ready to handle the trap, setup the registers and jump. */ | ||
92 | cpsie \iflags | ||
93 | adr lr, return_from_trap | ||
94 | mov r0, sp | ||
95 | -- | ||
96 | 2.11.0 | ||
97 | |||
diff --git a/main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch new file mode 100644 index 0000000000..5168452148 --- /dev/null +++ b/main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | |||
@@ -0,0 +1,226 @@ | |||
1 | From 098fe877967870ffda2dfd9629a5fd272f6aacdc Mon Sep 17 00:00:00 2001 | ||
2 | From: Julien Grall <julien.grall@arm.com> | ||
3 | Date: Fri, 11 Oct 2019 17:49:28 +0100 | ||
4 | Subject: [PATCH 3/4] xen/arm32: Don't blindly unmask interrupts on trap | ||
5 | without a change of level | ||
6 | |||
7 | Exception vectors will unmask interrupts regardless the state of them in | ||
8 | the interrupted context. | ||
9 | |||
10 | One of the consequences is IRQ will be unmasked when receiving an | ||
11 | undefined instruction exception (used by WARN*) from the hypervisor. | ||
12 | This could result to unexpected behavior such as deadlock (if a lock was | ||
13 | shared with interrupts). | ||
14 | |||
15 | In a nutshell, interrupts should only be unmasked when it is safe to do. | ||
16 | Xen only unmask IRQ and Abort interrupts, so the logic can stay simple. | ||
17 | |||
18 | As vectors exceptions may be shared between guest and hypervisor, we now | ||
19 | need to have a different policy for the interrupts. | ||
20 | |||
21 | On exception from hypervisor, each vector will select the list of | ||
22 | interrupts to inherit from the interrupted context. Any interrupts not | ||
23 | listed will be kept masked. | ||
24 | |||
25 | On exception from the guest, the Abort and IRQ will be unmasked | ||
26 | depending on the exact vector. | ||
27 | |||
28 | The interrupts will be kept unmasked when the vector cannot used by | ||
29 | either guest or hypervisor. | ||
30 | |||
31 | Note that each vector is not anymore preceded by ALIGN. This is fine | ||
32 | because the alignment is already bigger than what we need. | ||
33 | |||
34 | This is part of XSA-303. | ||
35 | |||
36 | Reported-by: Julien Grall <Julien.Grall@arm.com> | ||
37 | Signed-off-by: Julien Grall <julien.grall@arm.com> | ||
38 | Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> | ||
39 | Reviewed-by: Andre Przywara <andre.przywara@arm.com> | ||
40 | --- | ||
41 | xen/arch/arm/arm32/entry.S | 138 +++++++++++++++++++++++++++++++++++---------- | ||
42 | 1 file changed, 109 insertions(+), 29 deletions(-) | ||
43 | |||
44 | diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S | ||
45 | index 150cbc0b4b..ec90cca093 100644 | ||
46 | --- a/xen/arch/arm/arm32/entry.S | ||
47 | +++ b/xen/arch/arm/arm32/entry.S | ||
48 | @@ -4,6 +4,17 @@ | ||
49 | #include <asm/alternative.h> | ||
50 | #include <public/xen.h> | ||
51 | |||
52 | +/* | ||
53 | + * Short-hands to defined the interrupts (A, I, F) | ||
54 | + * | ||
55 | + * _ means the interrupt state will not change | ||
56 | + * X means the state of interrupt X will change | ||
57 | + * | ||
58 | + * To be used with msr cpsr_* only | ||
59 | + */ | ||
60 | +#define IFLAGS_AIF PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK | ||
61 | +#define IFLAGS_A_F PSR_ABT_MASK | PSR_FIQ_MASK | ||
62 | + | ||
63 | #define SAVE_ONE_BANKED(reg) mrs r11, reg; str r11, [sp, #UREGS_##reg] | ||
64 | #define RESTORE_ONE_BANKED(reg) ldr r11, [sp, #UREGS_##reg]; msr reg, r11 | ||
65 | |||
66 | @@ -106,10 +117,18 @@ skip_check: | ||
67 | mov pc, lr | ||
68 | |||
69 | /* | ||
70 | - * Macro to define trap entry. The iflags corresponds to the list of | ||
71 | - * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. | ||
72 | + * Macro to define a trap entry. | ||
73 | + * | ||
74 | + * @guest_iflags: Optional list of interrupts to unmask when | ||
75 | + * entering from guest context. As this is used with cpsie, | ||
76 | + * the letter (a, i, f) should be used. | ||
77 | + * | ||
78 | + * @hyp_iflags: Optional list of interrupts to inherit when | ||
79 | + * entering from hypervisor context. Any interrupts not | ||
80 | + * listed will be kept unchanged. As this is used with cpsr_*, | ||
81 | + * IFLAGS_* short-hands should be used. | ||
82 | */ | ||
83 | - .macro vector trap, iflags | ||
84 | + .macro vector trap, guest_iflags=n, hyp_iflags=0 | ||
85 | /* Save registers in the stack */ | ||
86 | sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */ | ||
87 | push {r0-r12} /* Save R0-R12 */ | ||
88 | @@ -127,12 +146,39 @@ skip_check: | ||
89 | |||
90 | mrs r11, SPSR_hyp | ||
91 | str r11, [sp, #UREGS_cpsr] | ||
92 | - and r11, #PSR_MODE_MASK | ||
93 | - cmp r11, #PSR_MODE_HYP | ||
94 | - blne save_guest_regs | ||
95 | |||
96 | + /* | ||
97 | + * We need to distinguish whether we came from guest or | ||
98 | + * hypervisor context. | ||
99 | + */ | ||
100 | + and r0, r11, #PSR_MODE_MASK | ||
101 | + cmp r0, #PSR_MODE_HYP | ||
102 | + | ||
103 | + bne 1f | ||
104 | + /* | ||
105 | + * Trap from the hypervisor | ||
106 | + * | ||
107 | + * Inherit the state of the interrupts from the hypervisor | ||
108 | + * context. For that we need to use SPSR (stored in r11) and | ||
109 | + * modify CPSR accordingly. | ||
110 | + * | ||
111 | + * CPSR = (CPSR & ~hyp_iflags) | (SPSR & hyp_iflags) | ||
112 | + */ | ||
113 | + mrs r10, cpsr | ||
114 | + bic r10, r10, #\hyp_iflags | ||
115 | + and r11, r11, #\hyp_iflags | ||
116 | + orr r10, r10, r11 | ||
117 | + msr cpsr_cx, r10 | ||
118 | + b 2f | ||
119 | + | ||
120 | +1: | ||
121 | + /* Trap from the guest */ | ||
122 | + bl save_guest_regs | ||
123 | + .if \guest_iflags != n | ||
124 | + cpsie \guest_iflags | ||
125 | + .endif | ||
126 | +2: | ||
127 | /* We are ready to handle the trap, setup the registers and jump. */ | ||
128 | - cpsie \iflags | ||
129 | adr lr, return_from_trap | ||
130 | mov r0, sp | ||
131 | /* | ||
132 | @@ -144,20 +190,6 @@ skip_check: | ||
133 | b do_trap_\trap | ||
134 | .endm | ||
135 | |||
136 | -#define __DEFINE_TRAP_ENTRY(trap, iflags) \ | ||
137 | - ALIGN; \ | ||
138 | -trap_##trap: \ | ||
139 | - vector trap, iflags | ||
140 | - | ||
141 | -/* Trap handler which unmask IRQ/Abort, keep FIQ masked */ | ||
142 | -#define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai) | ||
143 | - | ||
144 | -/* Trap handler which unmask Abort, keep IRQ/FIQ masked */ | ||
145 | -#define DEFINE_TRAP_ENTRY_NOIRQ(trap) __DEFINE_TRAP_ENTRY(trap, a) | ||
146 | - | ||
147 | -/* Trap handler which unmask IRQ, keep Abort/FIQ masked */ | ||
148 | -#define DEFINE_TRAP_ENTRY_NOABORT(trap) __DEFINE_TRAP_ENTRY(trap, i) | ||
149 | - | ||
150 | .align 5 | ||
151 | GLOBAL(hyp_traps_vector) | ||
152 | b trap_reset /* 0x00 - Reset */ | ||
153 | @@ -228,14 +260,62 @@ decode_vectors: | ||
154 | |||
155 | #endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ | ||
156 | |||
157 | -DEFINE_TRAP_ENTRY(reset) | ||
158 | -DEFINE_TRAP_ENTRY(undefined_instruction) | ||
159 | -DEFINE_TRAP_ENTRY(hypervisor_call) | ||
160 | -DEFINE_TRAP_ENTRY(prefetch_abort) | ||
161 | -DEFINE_TRAP_ENTRY(guest_sync) | ||
162 | -DEFINE_TRAP_ENTRY_NOIRQ(irq) | ||
163 | -DEFINE_TRAP_ENTRY_NOIRQ(fiq) | ||
164 | -DEFINE_TRAP_ENTRY_NOABORT(data_abort) | ||
165 | +/* Vector not used by the Hypervisor. */ | ||
166 | +trap_reset: | ||
167 | + vector reset | ||
168 | + | ||
169 | +/* | ||
170 | + * Vector only used by the Hypervisor. | ||
171 | + * | ||
172 | + * While the exception can be executed with all the interrupts (e.g. | ||
173 | + * IRQ) unmasked, the interrupted context may have purposefully masked | ||
174 | + * some of them. So we want to inherit the state from the interrupted | ||
175 | + * context. | ||
176 | + */ | ||
177 | +trap_undefined_instruction: | ||
178 | + vector undefined_instruction, hyp_iflags=IFLAGS_AIF | ||
179 | + | ||
180 | +/* We should never reach this trap */ | ||
181 | +trap_hypervisor_call: | ||
182 | + vector hypervisor_call | ||
183 | + | ||
184 | +/* | ||
185 | + * Vector only used by the hypervisor. | ||
186 | + * | ||
187 | + * While the exception can be executed with all the interrupts (e.g. | ||
188 | + * IRQ) unmasked, the interrupted context may have purposefully masked | ||
189 | + * some of them. So we want to inherit the state from the interrupted | ||
190 | + * context. | ||
191 | + */ | ||
192 | +trap_prefetch_abort: | ||
193 | + vector prefetch_abort, hyp_iflags=IFLAGS_AIF | ||
194 | + | ||
195 | +/* | ||
196 | + * Vector only used by the hypervisor. | ||
197 | + * | ||
198 | + * Data Abort should be rare and most likely fatal. It is best to not | ||
199 | + * unmask any interrupts to limit the amount of code that can run before | ||
200 | + * the Data Abort is treated. | ||
201 | + */ | ||
202 | +trap_data_abort: | ||
203 | + vector data_abort | ||
204 | + | ||
205 | +/* Vector only used by the guest. We can unmask Abort/IRQ. */ | ||
206 | +trap_guest_sync: | ||
207 | + vector guest_sync, guest_iflags=ai | ||
208 | + | ||
209 | + | ||
210 | +/* Vector used by the hypervisor and the guest. */ | ||
211 | +trap_irq: | ||
212 | + vector irq, guest_iflags=a, hyp_iflags=IFLAGS_A_F | ||
213 | + | ||
214 | +/* | ||
215 | + * Vector used by the hypervisor and the guest. | ||
216 | + * | ||
217 | + * FIQ are not meant to happen, so we don't unmask any interrupts. | ||
218 | + */ | ||
219 | +trap_fiq: | ||
220 | + vector fiq | ||
221 | |||
222 | return_from_trap: | ||
223 | /* | ||
224 | -- | ||
225 | 2.11.0 | ||
226 | |||
diff --git a/main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch new file mode 100644 index 0000000000..106cbf98f1 --- /dev/null +++ b/main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | |||
@@ -0,0 +1,114 @@ | |||
1 | From c6d290ce157a044dec417fdda8db71e41a37d744 Mon Sep 17 00:00:00 2001 | ||
2 | From: Julien Grall <julien.grall@arm.com> | ||
3 | Date: Mon, 7 Oct 2019 18:10:56 +0100 | ||
4 | Subject: [PATCH 4/4] xen/arm64: Don't blindly unmask interrupts on trap | ||
5 | without a change of level | ||
6 | |||
7 | Some of the traps without a change of the level (i.e. hypervisor -> | ||
8 | hypervisor) will unmask interrupts regardless the state of them in the | ||
9 | interrupted context. | ||
10 | |||
11 | One of the consequences is IRQ will be unmasked when receiving a | ||
12 | synchronous exception (used by WARN*()). This could result to unexpected | ||
13 | behavior such as deadlock (if a lock was shared with interrupts). | ||
14 | |||
15 | In a nutshell, interrupts should only be unmasked when it is safe to | ||
16 | do. Xen only unmask IRQ and Abort interrupts, so the logic can stay | ||
17 | simple: | ||
18 | - hyp_error: All the interrupts are now kept masked. SError should | ||
19 | be pretty rare and if ever happen then we most likely want to | ||
20 | avoid any other interrupts to be generated. The potential main | ||
21 | "caller" is during virtual SError synchronization on the exit | ||
22 | path from the guest (see check_pending_vserror). | ||
23 | |||
24 | - hyp_sync: The interrupts state is inherited from the interrupted | ||
25 | context. | ||
26 | |||
27 | - hyp_irq: All the interrupts but IRQ state are inherited from the | ||
28 | interrupted context. IRQ is kept masked. | ||
29 | |||
30 | This is part of XSA-303. | ||
31 | |||
32 | Reported-by: Julien Grall <Julien.Grall@arm.com> | ||
33 | Signed-off-by: Julien Grall <julien.grall@arm.com> | ||
34 | Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> | ||
35 | Reviewed-by: Andre Przywara <andre.przywara@arm.com> | ||
36 | --- | ||
37 | xen/arch/arm/arm64/entry.S | 47 ++++++++++++++++++++++++++++++++++++++++++---- | ||
38 | 1 file changed, 43 insertions(+), 4 deletions(-) | ||
39 | |||
40 | diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S | ||
41 | index 2d9a2713a1..3e41ba65b6 100644 | ||
42 | --- a/xen/arch/arm/arm64/entry.S | ||
43 | +++ b/xen/arch/arm/arm64/entry.S | ||
44 | @@ -188,24 +188,63 @@ hyp_error_invalid: | ||
45 | entry hyp=1 | ||
46 | invalid BAD_ERROR | ||
47 | |||
48 | +/* | ||
49 | + * SError received while running in the hypervisor mode. | ||
50 | + * | ||
51 | + * Technically, we could unmask the IRQ if it were unmasked in the | ||
52 | + * interrupted context. However, this require to check the PSTATE. For | ||
53 | + * simplicity, as SError should be rare and potentially fatal, | ||
54 | + * all interrupts are kept masked. | ||
55 | + */ | ||
56 | hyp_error: | ||
57 | entry hyp=1 | ||
58 | - msr daifclr, #2 | ||
59 | mov x0, sp | ||
60 | bl do_trap_hyp_serror | ||
61 | exit hyp=1 | ||
62 | |||
63 | -/* Traps taken in Current EL with SP_ELx */ | ||
64 | +/* | ||
65 | + * Synchronous exception received while running in the hypervisor mode. | ||
66 | + * | ||
67 | + * While the exception could be executed with all the interrupts (e.g. | ||
68 | + * IRQ) unmasked, the interrupted context may have purposefully masked | ||
69 | + * some of them. So we want to inherit the state from the interrupted | ||
70 | + * context. | ||
71 | + */ | ||
72 | hyp_sync: | ||
73 | entry hyp=1 | ||
74 | - msr daifclr, #6 | ||
75 | + | ||
76 | + /* Inherit interrupts */ | ||
77 | + mrs x0, SPSR_el2 | ||
78 | + and x0, x0, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK) | ||
79 | + msr daif, x0 | ||
80 | + | ||
81 | mov x0, sp | ||
82 | bl do_trap_hyp_sync | ||
83 | exit hyp=1 | ||
84 | |||
85 | +/* | ||
86 | + * IRQ received while running in the hypervisor mode. | ||
87 | + * | ||
88 | + * While the exception could be executed with all the interrupts but IRQ | ||
89 | + * unmasked, the interrupted context may have purposefully masked some | ||
90 | + * of them. So we want to inherit the state from the interrupt context | ||
91 | + * and keep IRQ masked. | ||
92 | + * | ||
93 | + * XXX: We may want to consider an ordering between interrupts (e.g. if | ||
94 | + * SError are masked, then IRQ should be masked too). However, this | ||
95 | + * would require some rework in some paths (e.g. panic, livepatch) to | ||
96 | + * ensure the ordering is enforced everywhere. | ||
97 | + */ | ||
98 | hyp_irq: | ||
99 | entry hyp=1 | ||
100 | - msr daifclr, #4 | ||
101 | + | ||
102 | + /* Inherit D, A, F interrupts and keep I masked */ | ||
103 | + mrs x0, SPSR_el2 | ||
104 | + mov x1, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_FIQ_MASK) | ||
105 | + and x0, x0, x1 | ||
106 | + orr x0, x0, #PSR_IRQ_MASK | ||
107 | + msr daif, x0 | ||
108 | + | ||
109 | mov x0, sp | ||
110 | bl do_trap_irq | ||
111 | exit hyp=1 | ||
112 | -- | ||
113 | 2.11.0 | ||
114 | |||
diff --git a/main/xen/xsa304-4.10-1.patch b/main/xen/xsa304-4.10-1.patch new file mode 100644 index 0000000000..4c144ac506 --- /dev/null +++ b/main/xen/xsa304-4.10-1.patch | |||
@@ -0,0 +1,71 @@ | |||
1 | From: Andrew Cooper <andrew.cooper3@citrix.com> | ||
2 | Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs | ||
3 | |||
4 | Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and | ||
5 | an EPT superpage gets shattered. The root cause is still under investigation, | ||
6 | but the end result is unusable in combination with CVE-2018-12207 protections. | ||
7 | |||
8 | This is part of XSA-304 / CVE-2018-12207 | ||
9 | |||
10 | Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> | ||
11 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
12 | |||
13 | diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h | ||
14 | index fb7edfaef9..d698b1d50a 100644 | ||
15 | --- a/xen/drivers/passthrough/vtd/extern.h | ||
16 | +++ b/xen/drivers/passthrough/vtd/extern.h | ||
17 | @@ -96,6 +96,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu); | ||
18 | int __must_check me_wifi_quirk(struct domain *domain, | ||
19 | u8 bus, u8 devfn, int map); | ||
20 | void pci_vtd_quirk(const struct pci_dev *); | ||
21 | +void quirk_iommu_caps(struct iommu *iommu); | ||
22 | + | ||
23 | bool_t platform_supports_intremap(void); | ||
24 | bool_t platform_supports_x2apic(void); | ||
25 | |||
26 | diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c | ||
27 | index 2798a49907..17cf87ccf1 100644 | ||
28 | --- a/xen/drivers/passthrough/vtd/iommu.c | ||
29 | +++ b/xen/drivers/passthrough/vtd/iommu.c | ||
30 | @@ -1205,6 +1205,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) | ||
31 | if ( !(iommu->cap + 1) || !(iommu->ecap + 1) ) | ||
32 | return -ENODEV; | ||
33 | |||
34 | + quirk_iommu_caps(iommu); | ||
35 | + | ||
36 | if ( cap_fault_reg_offset(iommu->cap) + | ||
37 | cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE || | ||
38 | ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE ) | ||
39 | diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c | ||
40 | index d6db862678..b02688e316 100644 | ||
41 | --- a/xen/drivers/passthrough/vtd/quirks.c | ||
42 | +++ b/xen/drivers/passthrough/vtd/quirks.c | ||
43 | @@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev) | ||
44 | break; | ||
45 | } | ||
46 | } | ||
47 | + | ||
48 | +void __init quirk_iommu_caps(struct iommu *iommu) | ||
49 | +{ | ||
50 | + /* | ||
51 | + * IOMMU Quirks: | ||
52 | + * | ||
53 | + * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't | ||
54 | + * implement superpages internally. | ||
55 | + * | ||
56 | + * There are issues changing the walk length under in-flight DMA, which | ||
57 | + * has manifested as incompatibility between EPT/IOMMU sharing and the | ||
58 | + * workaround for CVE-2018-12207 / XSA-304. Hide the superpages | ||
59 | + * capabilities in the IOMMU, which will prevent Xen from sharing the EPT | ||
60 | + * and IOMMU pagetables. | ||
61 | + * | ||
62 | + * Detection of SandyBridge unfortunately has to be done by processor | ||
63 | + * model because the client parts don't expose their IOMMUs as PCI devices | ||
64 | + * we could match with a Device ID. | ||
65 | + */ | ||
66 | + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && | ||
67 | + boot_cpu_data.x86 == 6 && | ||
68 | + (boot_cpu_data.x86_model == 0x2a || | ||
69 | + boot_cpu_data.x86_model == 0x2d) ) | ||
70 | + iommu->cap &= ~(0xful << 34); | ||
71 | +} | ||
diff --git a/main/xen/xsa304-4.10-2.patch b/main/xen/xsa304-4.10-2.patch new file mode 100644 index 0000000000..38f739ad90 --- /dev/null +++ b/main/xen/xsa304-4.10-2.patch | |||
@@ -0,0 +1,268 @@ | |||
1 | From: Andrew Cooper <andrew.cooper3@citrix.com> | ||
2 | Subject: x86/vtx: Disable executable EPT superpages to work around | ||
3 | CVE-2018-12207 | ||
4 | |||
5 | CVE-2018-12207 covers a set of errata on various Intel processors, whereby a | ||
6 | machine check exception can be generated in a corner case when an executable | ||
7 | mapping changes size or cacheability without TLB invalidation. HVM guest | ||
8 | kernels can trigger this to DoS the host. | ||
9 | |||
10 | To mitigate, in affected hardware, all EPT superpages are marked NX. When an | ||
11 | instruction fetch violation is observed against the superpage, the superpage | ||
12 | is shattered to 4k and has execute permissions restored. This prevents the | ||
13 | guest kernel from being able to create the necessary preconditions in the iTLB | ||
14 | to exploit the vulnerability. | ||
15 | |||
16 | This does come with a workload-dependent performance overhead, caused by | ||
17 | increased TLB pressure. Performance can be restored, if guest kernels are | ||
18 | trusted not to mount an attack, by specifying ept=exec-sp on the command line. | ||
19 | |||
20 | This is part of XSA-304 / CVE-2018-12207 | ||
21 | |||
22 | Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> | ||
23 | Acked-by: George Dunlap <george.dunlap@citrix.com> | ||
24 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
25 | |||
26 | diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c | ||
27 | index c0700dfbfe..698ab63340 100644 | ||
28 | --- a/xen/arch/x86/hvm/hvm.c | ||
29 | +++ b/xen/arch/x86/hvm/hvm.c | ||
30 | @@ -1695,6 +1695,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, | ||
31 | struct p2m_domain *p2m, *hostp2m; | ||
32 | int rc, fall_through = 0, paged = 0; | ||
33 | int sharing_enomem = 0; | ||
34 | + unsigned int page_order = 0; | ||
35 | vm_event_request_t *req_ptr = NULL; | ||
36 | bool_t ap2m_active, sync = 0; | ||
37 | |||
38 | @@ -1763,7 +1764,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, | ||
39 | hostp2m = p2m_get_hostp2m(currd); | ||
40 | mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma, | ||
41 | P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0), | ||
42 | - NULL); | ||
43 | + &page_order); | ||
44 | |||
45 | if ( ap2m_active ) | ||
46 | { | ||
47 | @@ -1775,7 +1776,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, | ||
48 | goto out; | ||
49 | } | ||
50 | |||
51 | - mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL); | ||
52 | + mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, &page_order); | ||
53 | } | ||
54 | else | ||
55 | p2m = hostp2m; | ||
56 | @@ -1817,6 +1818,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, | ||
57 | break; | ||
58 | } | ||
59 | |||
60 | + /* | ||
61 | + * Workaround for XSA-304 / CVE-2018-12207. If we take an execution | ||
62 | + * fault against a non-executable superpage, shatter it to regain | ||
63 | + * execute permissions. | ||
64 | + */ | ||
65 | + if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation ) | ||
66 | + { | ||
67 | + int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K, | ||
68 | + p2mt, p2ma); | ||
69 | + | ||
70 | + if ( res ) | ||
71 | + printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n", | ||
72 | + gfn, res); | ||
73 | + | ||
74 | + rc = !res; | ||
75 | + goto out_put_gfn; | ||
76 | + } | ||
77 | + | ||
78 | if ( violation ) | ||
79 | { | ||
80 | /* Should #VE be emulated for this fault? */ | ||
81 | diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c | ||
82 | index 205f2307c2..27050c0877 100644 | ||
83 | --- a/xen/arch/x86/hvm/vmx/vmcs.c | ||
84 | +++ b/xen/arch/x86/hvm/vmx/vmcs.c | ||
85 | @@ -67,6 +67,7 @@ integer_param("ple_window", ple_window); | ||
86 | |||
87 | static bool_t __read_mostly opt_pml_enabled = 1; | ||
88 | static s8 __read_mostly opt_ept_ad = -1; | ||
89 | +int8_t __read_mostly opt_ept_exec_sp = -1; | ||
90 | |||
91 | /* | ||
92 | * The 'ept' parameter controls functionalities that depend on, or impact the | ||
93 | @@ -94,6 +95,8 @@ static int __init parse_ept_param(const char *s) | ||
94 | opt_pml_enabled = val; | ||
95 | else if ( !cmdline_strcmp(s, "ad") ) | ||
96 | opt_ept_ad = val; | ||
97 | + else if ( !cmdline_strcmp(s, "exec-sp") ) | ||
98 | + opt_ept_exec_sp = val; | ||
99 | else | ||
100 | rc = -EINVAL; | ||
101 | |||
102 | diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c | ||
103 | index fa1e0309c7..9285c2b2fa 100644 | ||
104 | --- a/xen/arch/x86/hvm/vmx/vmx.c | ||
105 | +++ b/xen/arch/x86/hvm/vmx/vmx.c | ||
106 | @@ -2490,6 +2490,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs) | ||
107 | static void __init lbr_tsx_fixup_check(void); | ||
108 | static void __init bdw_erratum_bdf14_fixup_check(void); | ||
109 | |||
110 | +/* | ||
111 | + * Calculate whether the CPU is vulnerable to Instruction Fetch page | ||
112 | + * size-change MCEs. | ||
113 | + */ | ||
114 | +static bool __init has_if_pschange_mc(void) | ||
115 | +{ | ||
116 | + uint64_t caps = 0; | ||
117 | + | ||
118 | + /* | ||
119 | + * If we are virtualised, there is nothing we can do. Our EPT tables are | ||
120 | + * shadowed by our hypervisor, and not walked by hardware. | ||
121 | + */ | ||
122 | + if ( cpu_has_hypervisor ) | ||
123 | + return false; | ||
124 | + | ||
125 | + if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) | ||
126 | + rdmsrl(MSR_ARCH_CAPABILITIES, caps); | ||
127 | + | ||
128 | + if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO ) | ||
129 | + return false; | ||
130 | + | ||
131 | + /* | ||
132 | + * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at | ||
133 | + * this time. | ||
134 | + */ | ||
135 | + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || | ||
136 | + boot_cpu_data.x86 != 6 ) | ||
137 | + return false; | ||
138 | + | ||
139 | + switch ( boot_cpu_data.x86_model ) | ||
140 | + { | ||
141 | + /* | ||
142 | + * Core processors since at least Nehalem are vulnerable. | ||
143 | + */ | ||
144 | + case 0x1f: /* Auburndale / Havendale */ | ||
145 | + case 0x1e: /* Nehalem */ | ||
146 | + case 0x1a: /* Nehalem EP */ | ||
147 | + case 0x2e: /* Nehalem EX */ | ||
148 | + case 0x25: /* Westmere */ | ||
149 | + case 0x2c: /* Westmere EP */ | ||
150 | + case 0x2f: /* Westmere EX */ | ||
151 | + case 0x2a: /* SandyBridge */ | ||
152 | + case 0x2d: /* SandyBridge EP/EX */ | ||
153 | + case 0x3a: /* IvyBridge */ | ||
154 | + case 0x3e: /* IvyBridge EP/EX */ | ||
155 | + case 0x3c: /* Haswell */ | ||
156 | + case 0x3f: /* Haswell EX/EP */ | ||
157 | + case 0x45: /* Haswell D */ | ||
158 | + case 0x46: /* Haswell H */ | ||
159 | + case 0x3d: /* Broadwell */ | ||
160 | + case 0x47: /* Broadwell H */ | ||
161 | + case 0x4f: /* Broadwell EP/EX */ | ||
162 | + case 0x56: /* Broadwell D */ | ||
163 | + case 0x4e: /* Skylake M */ | ||
164 | + case 0x5e: /* Skylake D */ | ||
165 | + case 0x55: /* Skylake-X / Cascade Lake */ | ||
166 | + case 0x8e: /* Kaby / Coffee / Whiskey Lake M */ | ||
167 | + case 0x9e: /* Kaby / Coffee / Whiskey Lake D */ | ||
168 | + return true; | ||
169 | + | ||
170 | + /* | ||
171 | + * Atom processors are not vulnerable. | ||
172 | + */ | ||
173 | + case 0x1c: /* Pineview */ | ||
174 | + case 0x26: /* Lincroft */ | ||
175 | + case 0x27: /* Penwell */ | ||
176 | + case 0x35: /* Cloverview */ | ||
177 | + case 0x36: /* Cedarview */ | ||
178 | + case 0x37: /* Baytrail / Valleyview (Silvermont) */ | ||
179 | + case 0x4d: /* Avaton / Rangely (Silvermont) */ | ||
180 | + case 0x4c: /* Cherrytrail / Brasswell */ | ||
181 | + case 0x4a: /* Merrifield */ | ||
182 | + case 0x5a: /* Moorefield */ | ||
183 | + case 0x5c: /* Goldmont */ | ||
184 | + case 0x5d: /* SoFIA 3G Granite/ES2.1 */ | ||
185 | + case 0x65: /* SoFIA LTE AOSP */ | ||
186 | + case 0x5f: /* Denverton */ | ||
187 | + case 0x6e: /* Cougar Mountain */ | ||
188 | + case 0x75: /* Lightning Mountain */ | ||
189 | + case 0x7a: /* Gemini Lake */ | ||
190 | + case 0x86: /* Jacobsville */ | ||
191 | + | ||
192 | + /* | ||
193 | + * Knights processors are not vulnerable. | ||
194 | + */ | ||
195 | + case 0x57: /* Knights Landing */ | ||
196 | + case 0x85: /* Knights Mill */ | ||
197 | + return false; | ||
198 | + | ||
199 | + default: | ||
200 | + printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n", | ||
201 | + boot_cpu_data.x86_model); | ||
202 | + return true; | ||
203 | + } | ||
204 | +} | ||
205 | + | ||
206 | const struct hvm_function_table * __init start_vmx(void) | ||
207 | { | ||
208 | set_in_cr4(X86_CR4_VMXE); | ||
209 | @@ -2510,6 +2606,17 @@ const struct hvm_function_table * __init start_vmx(void) | ||
210 | */ | ||
211 | if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) ) | ||
212 | { | ||
213 | + bool cpu_has_bug_pschange_mc = has_if_pschange_mc(); | ||
214 | + | ||
215 | + if ( opt_ept_exec_sp == -1 ) | ||
216 | + { | ||
217 | + /* Default to non-executable superpages on vulnerable hardware. */ | ||
218 | + opt_ept_exec_sp = !cpu_has_bug_pschange_mc; | ||
219 | + | ||
220 | + if ( cpu_has_bug_pschange_mc ) | ||
221 | + printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n"); | ||
222 | + } | ||
223 | + | ||
224 | vmx_function_table.hap_supported = 1; | ||
225 | vmx_function_table.altp2m_supported = 1; | ||
226 | |||
227 | diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c | ||
228 | index b4996ce658..424d42c93d 100644 | ||
229 | --- a/xen/arch/x86/mm/p2m-ept.c | ||
230 | +++ b/xen/arch/x86/mm/p2m-ept.c | ||
231 | @@ -215,6 +215,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry, | ||
232 | break; | ||
233 | } | ||
234 | |||
235 | + /* | ||
236 | + * Don't create executable superpages if we need to shatter them to | ||
237 | + * protect against CVE-2018-12207. | ||
238 | + */ | ||
239 | + if ( !opt_ept_exec_sp && is_epte_superpage(entry) ) | ||
240 | + entry->x = 0; | ||
241 | } | ||
242 | |||
243 | #define GUEST_TABLE_MAP_FAILED 0 | ||
244 | diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h | ||
245 | index 7341cb191e..aad25335eb 100644 | ||
246 | --- a/xen/include/asm-x86/hvm/vmx/vmx.h | ||
247 | +++ b/xen/include/asm-x86/hvm/vmx/vmx.h | ||
248 | @@ -28,6 +28,8 @@ | ||
249 | #include <asm/hvm/trace.h> | ||
250 | #include <asm/hvm/vmx/vmcs.h> | ||
251 | |||
252 | +extern int8_t opt_ept_exec_sp; | ||
253 | + | ||
254 | typedef union { | ||
255 | struct { | ||
256 | u64 r : 1, /* bit 0 - Read permission */ | ||
257 | diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h | ||
258 | index e61aac2f51..47e7c412f2 100644 | ||
259 | --- a/xen/include/asm-x86/msr-index.h | ||
260 | +++ b/xen/include/asm-x86/msr-index.h | ||
261 | @@ -54,6 +54,7 @@ | ||
262 | #define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3) | ||
263 | #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) | ||
264 | #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) | ||
265 | +#define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) | ||
266 | |||
267 | #define MSR_FLUSH_CMD 0x0000010b | ||
268 | #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) | ||
diff --git a/main/xen/xsa304-4.10-3.patch b/main/xen/xsa304-4.10-3.patch new file mode 100644 index 0000000000..907b0895a8 --- /dev/null +++ b/main/xen/xsa304-4.10-3.patch | |||
@@ -0,0 +1,84 @@ | |||
1 | From: Andrew Cooper <andrew.cooper3@citrix.com> | ||
2 | Subject: x86/vtx: Allow runtime modification of the exec-sp setting | ||
3 | |||
4 | See patch for details. | ||
5 | |||
6 | Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> | ||
7 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
8 | Reviewed-by: George Dunlap <george.dunlap@citrix.com> | ||
9 | |||
10 | diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c | ||
11 | index 27050c0877..3c29b7c46f 100644 | ||
12 | --- a/xen/arch/x86/hvm/vmx/vmcs.c | ||
13 | +++ b/xen/arch/x86/hvm/vmx/vmcs.c | ||
14 | @@ -107,6 +107,41 @@ static int __init parse_ept_param(const char *s) | ||
15 | } | ||
16 | custom_param("ept", parse_ept_param); | ||
17 | |||
18 | +static int parse_ept_param_runtime(const char *s) | ||
19 | +{ | ||
20 | + int val; | ||
21 | + | ||
22 | + if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported || | ||
23 | + !(hvm_funcs.hap_capabilities & | ||
24 | + (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) ) | ||
25 | + { | ||
26 | + printk("VMX: EPT not available, or not in use - ignoring\n"); | ||
27 | + return 0; | ||
28 | + } | ||
29 | + | ||
30 | + if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 ) | ||
31 | + return -EINVAL; | ||
32 | + | ||
33 | + if ( val != opt_ept_exec_sp ) | ||
34 | + { | ||
35 | + struct domain *d; | ||
36 | + | ||
37 | + opt_ept_exec_sp = val; | ||
38 | + | ||
39 | + rcu_read_lock(&domlist_read_lock); | ||
40 | + for_each_domain ( d ) | ||
41 | + if ( paging_mode_hap(d) ) | ||
42 | + p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw); | ||
43 | + rcu_read_unlock(&domlist_read_lock); | ||
44 | + } | ||
45 | + | ||
46 | + printk("VMX: EPT executable superpages %sabled\n", | ||
47 | + val ? "en" : "dis"); | ||
48 | + | ||
49 | + return 0; | ||
50 | +} | ||
51 | +custom_runtime_only_param("ept", parse_ept_param_runtime); | ||
52 | + | ||
53 | /* Dynamic (run-time adjusted) execution control flags. */ | ||
54 | u32 vmx_pin_based_exec_control __read_mostly; | ||
55 | u32 vmx_cpu_based_exec_control __read_mostly; | ||
56 | diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c | ||
57 | index 7a52ba993e..416e77b03c 100644 | ||
58 | --- a/xen/arch/x86/mm/p2m.c | ||
59 | +++ b/xen/arch/x86/mm/p2m.c | ||
60 | @@ -263,17 +263,22 @@ int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start, | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | +/* | ||
65 | + * May be called with ot = nt = p2m_ram_rw for its side effect of | ||
66 | + * recalculating all PTEs in the p2m. | ||
67 | + */ | ||
68 | void p2m_change_entry_type_global(struct domain *d, | ||
69 | p2m_type_t ot, p2m_type_t nt) | ||
70 | { | ||
71 | struct p2m_domain *p2m = p2m_get_hostp2m(d); | ||
72 | |||
73 | - ASSERT(ot != nt); | ||
74 | ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt)); | ||
75 | |||
76 | p2m_lock(p2m); | ||
77 | p2m->change_entry_type_global(p2m, ot, nt); | ||
78 | - p2m->global_logdirty = (nt == p2m_ram_logdirty); | ||
79 | + /* Don't allow 'recalculate' operations to change the logdirty state. */ | ||
80 | + if ( ot != nt ) | ||
81 | + p2m->global_logdirty = (nt == p2m_ram_logdirty); | ||
82 | p2m_unlock(p2m); | ||
83 | } | ||
84 | |||
diff --git a/main/xen/xsa305-4.10-1.patch b/main/xen/xsa305-4.10-1.patch new file mode 100644 index 0000000000..e3163723a6 --- /dev/null +++ b/main/xen/xsa305-4.10-1.patch | |||
@@ -0,0 +1,288 @@ | |||
1 | From: Andrew Cooper <andrew.cooper3@citrix.com> | ||
2 | Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available | ||
3 | |||
4 | To protect against the TSX Async Abort speculative vulnerability, Intel have | ||
5 | released new microcode for affected parts which introduce the MSR_TSX_CTRL | ||
6 | control, which allows TSX to be turned off. This will be architectural on | ||
7 | future parts. | ||
8 | |||
9 | Introduce tsx= to provide a global on/off for TSX, including its enumeration | ||
10 | via CPUID. Provide stub virtualisation of this MSR, as it is not exposed to | ||
11 | guests at the moment. | ||
12 | |||
13 | VMs may have booted before microcode is loaded, or before hosts have rebooted, | ||
14 | and they still want to migrate freely. A VM which booted seeing TSX can | ||
15 | migrate safely to hosts with TSX disabled - TSX will start unconditionally | ||
16 | aborting, but still behave in a manner compatible with the ABI. | ||
17 | |||
18 | The guest-visible behaviour is equivalent to late loading the microcode and | ||
19 | setting the RTM_DISABLE bit in the course of live patching. | ||
20 | |||
21 | This is part of XSA-305 / CVE-2019-11135 | ||
22 | |||
23 | Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> | ||
24 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
25 | |||
26 | diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown | ||
27 | index 0cbfb5096c..1b169c7b72 100644 | ||
28 | --- a/docs/misc/xen-command-line.markdown | ||
29 | +++ b/docs/misc/xen-command-line.markdown | ||
30 | @@ -1920,6 +1920,20 @@ pages) must also be specified via the tbuf\_size parameter. | ||
31 | ### tsc | ||
32 | > `= unstable | skewed | stable:socket` | ||
33 | |||
34 | +### tsx | ||
35 | + = <bool> | ||
36 | + | ||
37 | + Applicability: x86 | ||
38 | + Default: true | ||
39 | + | ||
40 | +Controls for the use of Transactional Synchronization eXtensions. | ||
41 | + | ||
42 | +On Intel parts released in Q3 2019 (with updated microcode), and future parts, | ||
43 | +a control has been introduced which allows TSX to be turned off. | ||
44 | + | ||
45 | +On systems with the ability to turn TSX off, this boolean offers system wide | ||
46 | +control of whether TSX is enabled or disabled. | ||
47 | + | ||
48 | ### ucode | ||
49 | > `= [<integer> | scan]` | ||
50 | |||
51 | diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile | ||
52 | index d86fb97fa3..4e4f39d933 100644 | ||
53 | --- a/xen/arch/x86/Makefile | ||
54 | +++ b/xen/arch/x86/Makefile | ||
55 | @@ -65,6 +65,7 @@ obj-y += sysctl.o | ||
56 | obj-y += time.o | ||
57 | obj-y += trace.o | ||
58 | obj-y += traps.o | ||
59 | +obj-y += tsx.o | ||
60 | obj-y += usercopy.o | ||
61 | obj-y += x86_emulate.o | ||
62 | obj-$(CONFIG_TBOOT) += tboot.o | ||
63 | diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c | ||
64 | index 98b63f3a01..e943d70bca 100644 | ||
65 | --- a/xen/arch/x86/cpuid.c | ||
66 | +++ b/xen/arch/x86/cpuid.c | ||
67 | @@ -600,6 +600,20 @@ void recalculate_cpuid_policy(struct domain *d) | ||
68 | if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) ) | ||
69 | __set_bit(X86_FEATURE_ITSC, max_fs); | ||
70 | |||
71 | + /* | ||
72 | + * On hardware with MSR_TSX_CTRL, the admin may have elected to disable | ||
73 | + * TSX and hide the feature bits. Migrating-in VMs may have been booted | ||
74 | + * pre-mitigation when the TSX features were visible. | ||
75 | + * | ||
76 | + * This situation is compatible (albeit with a perf hit to any TSX code in | ||
77 | + * the guest), so allow the feature bits to remain set. | ||
78 | + */ | ||
79 | + if ( cpu_has_tsx_ctrl ) | ||
80 | + { | ||
81 | + __set_bit(X86_FEATURE_HLE, max_fs); | ||
82 | + __set_bit(X86_FEATURE_RTM, max_fs); | ||
83 | + } | ||
84 | + | ||
85 | /* Clamp the toolstacks choices to reality. */ | ||
86 | for ( i = 0; i < ARRAY_SIZE(fs); i++ ) | ||
87 | fs[i] &= max_fs[i]; | ||
88 | diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c | ||
89 | index 6853d4c120..6ceea913fb 100644 | ||
90 | --- a/xen/arch/x86/msr.c | ||
91 | +++ b/xen/arch/x86/msr.c | ||
92 | @@ -134,6 +134,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) | ||
93 | case MSR_FLUSH_CMD: | ||
94 | /* Write-only */ | ||
95 | case MSR_TSX_FORCE_ABORT: | ||
96 | + case MSR_TSX_CTRL: | ||
97 | /* Not offered to guests. */ | ||
98 | goto gp_fault; | ||
99 | |||
100 | @@ -192,6 +193,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) | ||
101 | case MSR_ARCH_CAPABILITIES: | ||
102 | /* Read-only */ | ||
103 | case MSR_TSX_FORCE_ABORT: | ||
104 | + case MSR_TSX_CTRL: | ||
105 | /* Not offered to guests. */ | ||
106 | goto gp_fault; | ||
107 | |||
108 | diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c | ||
109 | index 7903204761..949d4abbdf 100644 | ||
110 | --- a/xen/arch/x86/setup.c | ||
111 | +++ b/xen/arch/x86/setup.c | ||
112 | @@ -1540,6 +1540,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) | ||
113 | |||
114 | early_microcode_init(); | ||
115 | |||
116 | + tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ | ||
117 | + | ||
118 | identify_cpu(&boot_cpu_data); | ||
119 | |||
120 | set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT); | ||
121 | diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c | ||
122 | index b0496eb66e..cdf53afc1e 100644 | ||
123 | --- a/xen/arch/x86/smpboot.c | ||
124 | +++ b/xen/arch/x86/smpboot.c | ||
125 | @@ -370,6 +370,8 @@ void start_secondary(void *unused) | ||
126 | if ( boot_cpu_has(X86_FEATURE_IBRSB) ) | ||
127 | wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl); | ||
128 | |||
129 | + tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ | ||
130 | + | ||
131 | if ( xen_guest ) | ||
132 | hypervisor_ap_setup(); | ||
133 | |||
134 | diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c | ||
135 | new file mode 100644 | ||
136 | index 0000000000..a8ec2ccc69 | ||
137 | --- /dev/null | ||
138 | +++ b/xen/arch/x86/tsx.c | ||
139 | @@ -0,0 +1,74 @@ | ||
140 | +#include <xen/init.h> | ||
141 | +#include <asm/msr.h> | ||
142 | + | ||
143 | +/* | ||
144 | + * Valid values: | ||
145 | + * 1 => Explicit tsx=1 | ||
146 | + * 0 => Explicit tsx=0 | ||
147 | + * -1 => Default, implicit tsx=1 | ||
148 | + * | ||
149 | + * This is arranged such that the bottom bit encodes whether TSX is actually | ||
150 | + * disabled, while identifying various explicit (>=0) and implicit (<0) | ||
151 | + * conditions. | ||
152 | + */ | ||
153 | +int8_t __read_mostly opt_tsx = -1; | ||
154 | +int8_t __read_mostly cpu_has_tsx_ctrl = -1; | ||
155 | + | ||
156 | +static int __init parse_tsx(const char *s) | ||
157 | +{ | ||
158 | + int rc = 0, val = parse_bool(s, NULL); | ||
159 | + | ||
160 | + if ( val >= 0 ) | ||
161 | + opt_tsx = val; | ||
162 | + else | ||
163 | + rc = -EINVAL; | ||
164 | + | ||
165 | + return rc; | ||
166 | +} | ||
167 | +custom_param("tsx", parse_tsx); | ||
168 | + | ||
169 | +void tsx_init(void) | ||
170 | +{ | ||
171 | + /* | ||
172 | + * This function is first called between microcode being loaded, and CPUID | ||
173 | + * being scanned generally. Calculate from raw data whether MSR_TSX_CTRL | ||
174 | + * is available. | ||
175 | + */ | ||
176 | + if ( unlikely(cpu_has_tsx_ctrl < 0) ) | ||
177 | + { | ||
178 | + uint64_t caps = 0; | ||
179 | + | ||
180 | + if ( boot_cpu_data.cpuid_level >= 7 && | ||
181 | + (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) ) | ||
182 | + rdmsrl(MSR_ARCH_CAPABILITIES, caps); | ||
183 | + | ||
184 | + cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL); | ||
185 | + } | ||
186 | + | ||
187 | + if ( cpu_has_tsx_ctrl ) | ||
188 | + { | ||
189 | + uint64_t val; | ||
190 | + | ||
191 | + rdmsrl(MSR_TSX_CTRL, val); | ||
192 | + | ||
193 | + val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR); | ||
194 | + /* Check bottom bit only. Higher bits are various sentinels. */ | ||
195 | + if ( !(opt_tsx & 1) ) | ||
196 | + val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR; | ||
197 | + | ||
198 | + wrmsrl(MSR_TSX_CTRL, val); | ||
199 | + } | ||
200 | + else if ( opt_tsx >= 0 ) | ||
201 | + printk_once(XENLOG_WARNING | ||
202 | + "MSR_TSX_CTRL not available - Ignoring tsx= setting\n"); | ||
203 | +} | ||
204 | + | ||
205 | +/* | ||
206 | + * Local variables: | ||
207 | + * mode: C | ||
208 | + * c-file-style: "BSD" | ||
209 | + * c-basic-offset: 4 | ||
210 | + * tab-width: 4 | ||
211 | + * indent-tabs-mode: nil | ||
212 | + * End: | ||
213 | + */ | ||
214 | diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h | ||
215 | index 47e7c412f2..c96c4f85c9 100644 | ||
216 | --- a/xen/include/asm-x86/msr-index.h | ||
217 | +++ b/xen/include/asm-x86/msr-index.h | ||
218 | @@ -55,6 +55,7 @@ | ||
219 | #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) | ||
220 | #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) | ||
221 | #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) | ||
222 | +#define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) | ||
223 | |||
224 | #define MSR_FLUSH_CMD 0x0000010b | ||
225 | #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) | ||
226 | @@ -62,6 +63,10 @@ | ||
227 | #define MSR_TSX_FORCE_ABORT 0x0000010f | ||
228 | #define TSX_FORCE_ABORT_RTM (_AC(1, ULL) << 0) | ||
229 | |||
230 | +#define MSR_TSX_CTRL 0x00000122 | ||
231 | +#define TSX_CTRL_RTM_DISABLE (_AC(1, ULL) << 0) | ||
232 | +#define TSX_CTRL_CPUID_CLEAR (_AC(1, ULL) << 1) | ||
233 | + | ||
234 | /* Intel MSRs. Some also available on other CPUs */ | ||
235 | #define MSR_IA32_PERFCTR0 0x000000c1 | ||
236 | #define MSR_IA32_A_PERFCTR0 0x000004c1 | ||
237 | diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h | ||
238 | index a0f8bf47e5..e707380f43 100644 | ||
239 | --- a/xen/include/asm-x86/processor.h | ||
240 | +++ b/xen/include/asm-x86/processor.h | ||
241 | @@ -268,6 +268,16 @@ static always_inline unsigned int cpuid_count_ebx( | ||
242 | return ebx; | ||
243 | } | ||
244 | |||
245 | +static always_inline unsigned int cpuid_count_edx( | ||
246 | + unsigned int leaf, unsigned int subleaf) | ||
247 | +{ | ||
248 | + unsigned int edx, tmp; | ||
249 | + | ||
250 | + cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx); | ||
251 | + | ||
252 | + return edx; | ||
253 | +} | ||
254 | + | ||
255 | static always_inline void cpuid_count_leaf(uint32_t leaf, uint32_t subleaf, | ||
256 | struct cpuid_leaf *data) | ||
257 | { | ||
258 | @@ -622,6 +632,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model, | ||
259 | return fam; | ||
260 | } | ||
261 | |||
262 | +extern int8_t opt_tsx, cpu_has_tsx_ctrl; | ||
263 | +void tsx_init(void); | ||
264 | + | ||
265 | #endif /* !__ASSEMBLY__ */ | ||
266 | |||
267 | #endif /* __ASM_X86_PROCESSOR_H */ | ||
268 | diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h | ||
269 | index 750f809968..be223a6950 100644 | ||
270 | --- a/xen/include/xen/lib.h | ||
271 | +++ b/xen/include/xen/lib.h | ||
272 | @@ -116,6 +116,16 @@ extern int printk_ratelimit(void); | ||
273 | #define gprintk(lvl, fmt, args...) \ | ||
274 | printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args) | ||
275 | |||
276 | +#define printk_once(fmt, args...) \ | ||
277 | +({ \ | ||
278 | + static bool __read_mostly once_; \ | ||
279 | + if ( unlikely(!once_) ) \ | ||
280 | + { \ | ||
281 | + once_ = true; \ | ||
282 | + printk(fmt, ## args); \ | ||
283 | + } \ | ||
284 | +}) | ||
285 | + | ||
286 | #ifdef NDEBUG | ||
287 | |||
288 | static inline void | ||
diff --git a/main/xen/xsa305-4.10-2.patch b/main/xen/xsa305-4.10-2.patch new file mode 100644 index 0000000000..3a061c26e7 --- /dev/null +++ b/main/xen/xsa305-4.10-2.patch | |||
@@ -0,0 +1,192 @@ | |||
1 | From: Andrew Cooper <andrew.cooper3@citrix.com> | ||
2 | Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel | ||
3 | |||
4 | See patch documentation and comments. | ||
5 | |||
6 | This is part of XSA-305 / CVE-2019-11135 | ||
7 | |||
8 | Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> | ||
9 | Reviewed-by: Jan Beulich <jbeulich@suse.com> | ||
10 | |||
11 | diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown | ||
12 | index 1b169c7b72..7a03f4ec70 100644 | ||
13 | --- a/docs/misc/xen-command-line.markdown | ||
14 | +++ b/docs/misc/xen-command-line.markdown | ||
15 | @@ -1813,7 +1813,7 @@ extreme care.** | ||
16 | An overall boolean value, `spec-ctrl=no`, can be specified to turn off all | ||
17 | mitigations, including pieces of infrastructure used to virtualise certain | ||
18 | mitigation features for guests. This also includes settings which `xpti`, | ||
19 | -`smt`, `pv-l1tf` control, unless the respective option(s) have been | ||
20 | +`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been | ||
21 | specified earlier on the command line. | ||
22 | |||
23 | Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to | ||
24 | @@ -1924,7 +1924,7 @@ pages) must also be specified via the tbuf\_size parameter. | ||
25 | = <bool> | ||
26 | |||
27 | Applicability: x86 | ||
28 | - Default: true | ||
29 | + Default: false on parts vulnerable to TAA, true otherwise | ||
30 | |||
31 | Controls for the use of Transactional Synchronization eXtensions. | ||
32 | |||
33 | @@ -1934,6 +1934,19 @@ a control has been introduced which allows TSX to be turned off. | ||
34 | On systems with the ability to turn TSX off, this boolean offers system wide | ||
35 | control of whether TSX is enabled or disabled. | ||
36 | |||
37 | +On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following | ||
38 | +logic applies: | ||
39 | + | ||
40 | + * An explicit `tsx=` choice is honoured, even if it is `true` and would | ||
41 | + result in a vulnerable system. | ||
42 | + | ||
43 | + * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be | ||
44 | + mitigated by disabling TSX, as this is the lowest overhead option. | ||
45 | + | ||
46 | + * If the use of TSX is important, the more expensive TAA mitigations can be | ||
47 | + opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain | ||
48 | + active by default. | ||
49 | + | ||
50 | ### ucode | ||
51 | > `= [<integer> | scan]` | ||
52 | |||
53 | diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c | ||
54 | index e25dadfa89..0f30362111 100644 | ||
55 | --- a/xen/arch/x86/spec_ctrl.c | ||
56 | +++ b/xen/arch/x86/spec_ctrl.c | ||
57 | @@ -136,6 +136,9 @@ static int __init parse_spec_ctrl(const char *s) | ||
58 | if ( opt_pv_l1tf_domu < 0 ) | ||
59 | opt_pv_l1tf_domu = 0; | ||
60 | |||
61 | + if ( opt_tsx == -1 ) | ||
62 | + opt_tsx = -3; | ||
63 | + | ||
64 | disable_common: | ||
65 | opt_rsb_pv = false; | ||
66 | opt_rsb_hvm = false; | ||
67 | @@ -346,7 +349,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) | ||
68 | printk("Speculative mitigation facilities:\n"); | ||
69 | |||
70 | /* Hardware features which pertain to speculative mitigations. */ | ||
71 | - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", | ||
72 | + printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", | ||
73 | (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", | ||
74 | (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", | ||
75 | (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "", | ||
76 | @@ -358,7 +361,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) | ||
77 | (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", | ||
78 | (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "", | ||
79 | (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "", | ||
80 | - (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : ""); | ||
81 | + (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "", | ||
82 | + (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", | ||
83 | + (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : ""); | ||
84 | |||
85 | /* Compiled-in support which pertains to mitigations. */ | ||
86 | if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) | ||
87 | @@ -372,7 +377,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) | ||
88 | "\n"); | ||
89 | |||
90 | /* Settings for Xen's protection, irrespective of guests. */ | ||
91 | - printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n", | ||
92 | + printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n", | ||
93 | thunk == THUNK_NONE ? "N/A" : | ||
94 | thunk == THUNK_RETPOLINE ? "RETPOLINE" : | ||
95 | thunk == THUNK_LFENCE ? "LFENCE" : | ||
96 | @@ -381,6 +386,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) | ||
97 | (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", | ||
98 | !boot_cpu_has(X86_FEATURE_SSBD) ? "" : | ||
99 | (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", | ||
100 | + !(caps & ARCH_CAPS_TSX_CTRL) ? "" : | ||
101 | + (opt_tsx & 1) ? " TSX+" : " TSX-", | ||
102 | opt_ibpb ? " IBPB" : "", | ||
103 | opt_l1d_flush ? " L1D_FLUSH" : "", | ||
104 | opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : ""); | ||
105 | @@ -891,6 +898,7 @@ void __init init_speculation_mitigations(void) | ||
106 | { | ||
107 | enum ind_thunk thunk = THUNK_DEFAULT; | ||
108 | bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled; | ||
109 | + bool cpu_has_bug_taa; | ||
110 | uint64_t caps = 0; | ||
111 | |||
112 | if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) | ||
113 | @@ -1120,6 +1128,53 @@ void __init init_speculation_mitigations(void) | ||
114 | "enabled. Mitigations will not be fully effective. Please\n" | ||
115 | "choose an explicit smt=<bool> setting. See XSA-297.\n"); | ||
116 | |||
117 | + /* | ||
118 | + * Vulnerability to TAA is a little complicated to quantify. | ||
119 | + * | ||
120 | + * In the pipeline, it is just another way to get speculative access to | ||
121 | + * stale load port, store buffer or fill buffer data, and therefore can be | ||
122 | + * considered a superset of MDS (on TSX-capable parts). On parts which | ||
123 | + * predate MDS_NO, the existing VERW flushing will mitigate this | ||
124 | + * sidechannel as well. | ||
125 | + * | ||
126 | + * On parts which contain MDS_NO, the lack of VERW flushing means that an | ||
127 | + * attacker can still use TSX to target microarchitectural buffers to leak | ||
128 | + * secrets. Therefore, we consider TAA to be the set of TSX-capable parts | ||
129 | + * which have MDS_NO but lack TAA_NO. | ||
130 | + * | ||
131 | + * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the | ||
132 | + * cmdline. MSR_TSX_CTRL will only appear on TSX-capable parts, so | ||
133 | + * we check both to spot TSX in a microcode/cmdline independent way. | ||
134 | + */ | ||
135 | + cpu_has_bug_taa = | ||
136 | + (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) && | ||
137 | + (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO; | ||
138 | + | ||
139 | + /* | ||
140 | + * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs | ||
141 | + * the MDS mitigation of disabling HT and using VERW flushing. | ||
142 | + * | ||
143 | + * On CPUs which advertise MDS_NO, VERW has no flushing side effect until | ||
144 | + * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being | ||
145 | + * advertised, and there isn't a MD_CLEAR_2 flag to use... | ||
146 | + * | ||
147 | + * If we're on affected hardware, able to do something about it (which | ||
148 | + * implies that VERW now works), no explicit TSX choice and traditional | ||
149 | + * MDS mitigations (no-SMT, VERW) not obviously in use (someone might | ||
150 | + * plausibly value TSX higher than Hyperthreading...), disable TSX to | ||
151 | + * mitigate TAA. | ||
152 | + */ | ||
153 | + if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) && | ||
154 | + ((hw_smt_enabled && opt_smt) || | ||
155 | + !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) ) | ||
156 | + { | ||
157 | + setup_clear_cpu_cap(X86_FEATURE_HLE); | ||
158 | + setup_clear_cpu_cap(X86_FEATURE_RTM); | ||
159 | + | ||
160 | + opt_tsx = 0; | ||
161 | + tsx_init(); | ||
162 | + } | ||
163 | + | ||
164 | print_details(thunk, caps); | ||
165 | |||
166 | /* | ||
167 | diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c | ||
168 | index a8ec2ccc69..2d202a0d4e 100644 | ||
169 | --- a/xen/arch/x86/tsx.c | ||
170 | +++ b/xen/arch/x86/tsx.c | ||
171 | @@ -5,7 +5,8 @@ | ||
172 | * Valid values: | ||
173 | * 1 => Explicit tsx=1 | ||
174 | * 0 => Explicit tsx=0 | ||
175 | - * -1 => Default, implicit tsx=1 | ||
176 | + * -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA | ||
177 | + * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0) | ||
178 | * | ||
179 | * This is arranged such that the bottom bit encodes whether TSX is actually | ||
180 | * disabled, while identifying various explicit (>=0) and implicit (<0) | ||
181 | diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h | ||
182 | index c96c4f85c9..5ef80735b2 100644 | ||
183 | --- a/xen/include/asm-x86/msr-index.h | ||
184 | +++ b/xen/include/asm-x86/msr-index.h | ||
185 | @@ -56,6 +56,7 @@ | ||
186 | #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) | ||
187 | #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) | ||
188 | #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) | ||
189 | +#define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8) | ||
190 | |||
191 | #define MSR_FLUSH_CMD 0x0000010b | ||
192 | #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) | ||