diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json
index 250e1be..a78cbfb 100644
--- a/Godeps/Godeps.json
+++ b/Godeps/Godeps.json
@@ -61,6 +61,10 @@
 			"Comment": "v0.5.0-303-g5ccd7f5",
 			"Rev": "5ccd7f57e77e6ca59d1587c6deaad88ef5d32f7c"
 		},
+		{
+			"ImportPath": "github.com/hooklift/xhyve",
+			"Rev": "9622b3100eaeebde930c87c55a2a08e0ade2c3af"
+		},
 		{
 			"ImportPath": "github.com/pmezard/go-difflib/difflib",
 			"Rev": "d8ed2627bdf02c080bf22230dbb337003b7aba2d"
diff --git a/README.md b/README.md
index d161bf9..c7577c8 100644
--- a/README.md
+++ b/README.md
@@ -30,19 +30,6 @@ If you were doubt problem either, please post to this repository [issues](https:
 > make install
 ```
 
-### xhyve-bindings
-https://github.com/zchee/xhyve-bindings
-
-Separated xhyve Go bindings into [xhyve-bindings](https://github.com/zchee/xhyve-bindings).
-Or, see experimental embedded xhyve branch [embed-xhyve](https://github.com/zchee/docker-machine-driver-xhyve/tree/embed-xhyve)
-
-```bash
-> go get -d github.com/zchee/xhyve-bindings
-> cd $GOPATH/src/github.com/zchee/xhyve-bindings
-> make
-> make install
-```
-
 ## Install
 Like the docker-machine's `Makefile`, install the `docker-machine-driver-xhyve` binary will be in `/usr/local/bin`.
diff --git a/bin/main.go b/bin/main.go
index 26867cd..7fd8a6e 100644
--- a/bin/main.go
+++ b/bin/main.go
@@ -1,10 +1,20 @@
 package main
 
 import (
+	"os"
+
 	"github.com/docker/machine/libmachine/drivers/plugin"
+	goxhyve "github.com/hooklift/xhyve"
 	"github.com/zchee/docker-machine-driver-xhyve"
 )
 
 func main() {
-	plugin.RegisterDriver(new(xhyve.Driver))
+	if len(os.Args) >= 2 && os.Args[1] == "xhyve" {
+		args := os.Args[1:] // Technically we only need os.Args[2:], but a bug in hooklift/xhyve requires one leading argument
+		if err := goxhyve.Run(args, make(chan string)); err != nil {
+			panic(err)
+		}
+	} else {
+		plugin.RegisterDriver(xhyve.NewDriver("", ""))
+	}
 }
diff --git a/mk/driver.mk b/mk/driver.mk
index e1f8069..c8dbdfa 100644
--- a/mk/driver.mk
+++ b/mk/driver.mk
@@ -54,12 +54,15 @@ test-upgrade:
 test-url:
 	${DOCKER_MACHINE_CMD} --storage-path ${DOCKER_MACHINE_STORAGEPATH} url ${DOCKER_MACHINE_VM_NAME}
 
-driver-run: clean build install driver-kill
-	rm -rf ${DOCKER_MACHINE_STORAGEPATH}/machines/${DOCKER_MACHINE_VM_NAME} && ${DOCKER_MACHINE_CMD} --storage-path ${DOCKER_MACHINE_STORAGEPATH} create --driver xhyve --xhyve-disk-size ${DOCKER_MACHINE_VM_DISKSIZE} ${DOCKER_MACHINE_VM_NAME}
+driver-run: clean build install driver-remove driver-kill
+	${DOCKER_MACHINE_CMD} --storage-path ${DOCKER_MACHINE_STORAGEPATH} create --driver xhyve --xhyve-disk-size ${DOCKER_MACHINE_VM_DISKSIZE} ${DOCKER_MACHINE_VM_NAME}
 
 driver-kill:
 	@PID=$$(pgrep goxhyve) || PID=none; \
 	echo "${CBLUE}==>${CRESET}${CGREEN}Kill goxhyve test process. PID:$$PID ${CRESET} ..."; \
 	sudo kill $$PID 2>/dev/null || true
 
+driver-remove:
+	${DOCKER_MACHINE_CMD} --storage-path ${DOCKER_MACHINE_STORAGEPATH} rm -f ${DOCKER_MACHINE_VM_NAME} || true
+
 .PHONY: driver-kill
diff --git a/vendor/github.com/hooklift/xhyve/.gitignore b/vendor/github.com/hooklift/xhyve/.gitignore
new file mode 100644
index 0000000..de9792a
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/.gitignore
@@ -0,0 +1,3 @@
+cmd/xhyve/imgs/stable*
+cmd/xhyve/xhyve
+vendor
diff --git a/vendor/github.com/hooklift/xhyve/.gitmodules b/vendor/github.com/hooklift/xhyve/.gitmodules
new file mode 100644
index 0000000..ec437bb
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "vendor/xhyve"]
+	path = vendor/xhyve
+	url = https://github.com/mist64/xhyve.git
diff --git a/vendor/github.com/hooklift/xhyve/CONTRIBUTING.md b/vendor/github.com/hooklift/xhyve/CONTRIBUTING.md
new file mode 100644
index 0000000..7eee32c
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/CONTRIBUTING.md
@@ -0,0 +1,51 @@
+### Thanks for your interest in contributing to this project and for taking the time to read this guide.
+
+## Code of conduct
+*Taken from http://libvirt.org/governance.html with minor adjustments.*
+
+The open source community covers people from a wide variety of countries, backgrounds and positions. This global diversity is a great strength for this project, but can also lead to communication issues, which may in turn cause unhappiness. To maximize happiness of the project community taken as a whole, all members (whether users, contributors or committers) are expected to abide by the project's code of conduct. At a high level the code can be summarized as "be excellent to each other". Expanding on this:
+
+**Be respectful:** disagreements between people are to be expected and are usually the sign of healthy debate and engagement. Disagreements can lead to frustration and even anger for some members. Turning to personal insults, intimidation or threatening behavior does not improve the situation. Participants should thus take care to ensure all communications / interactions stay professional at all times.
+
+**Be considerate:** remember that the community has members with a diverse background, many of whom have English as a second language. What might appear impolite may simply be a result of a lack of knowledge of the English language. Bear in mind that actions will have an impact on other community members and the project as a whole, so take potential consequences into account before pursuing a course of action.
+
+**Be forgiving:** humans are fallible and as such prone to make mistakes and inexplicably change their positions at times. Don't assume that other members are acting with malicious intent. Be prepared to forgive people who make mistakes and assist each other in learning from them. Playing a blame game doesn't help anyone.
+
+## Issues
+* Before reporting an issue, make sure you search first whether anybody has already reported a similar issue and whether or not it has been fixed.
+* Make sure your issue report sufficiently details the problem.
+* Include code samples reproducing the issue.
+* Please do not derail or troll issues. Keep the discussion on topic and respect the Code of conduct.
+* If you want to tackle any open issue, make sure you let people know you are working on it.
+
+## Development workflow
+Go is unlike any other language in that it forces a specific development workflow and project structure. Trying to fight it is useless, frustrating and time consuming. So, you had better be prepared to adapt your workflow when contributing to Go projects.
+
+### Prerequisites
+* **Go**: To install Go please follow its installation guide at https://golang.org/doc/install
+* **Git:** brew install git
+
+### Pull Requests
+* Please be generous when describing your changes.
+* Although it is highly suggested to include tests, they are not a hard requirement in order to get your contributions accepted.
+* Keep pull requests small so core developers can review them quickly.
+
+### Workflow for third-party code contributions
+* In GitHub, fork `https://github.com/hooklift/xhyve.git` to your own account
+* Get the package using "go get": `go get github.com/hooklift/xhyve`
+* Move to where the package was cloned: `cd $GOPATH/src/github.com/hooklift/xhyve`
+* Add a git remote pointing to your own fork: `git remote add downstream git@github.com:/xhyve.git`
+* Create a branch for making your changes, then commit them.
+* Push the changes to the downstream repository (your own fork): `git push downstream`
+* In GitHub, from your fork, create the Pull Request and send it upstream.
+* You are done.
+
+
+### Workflow for core developers
+Since core developers usually have access to the upstream repository, there is no need for a workflow like the one for third-party contributors.
+
+* Get the package using "go get": `go get github.com/hooklift/xhyve`
+* Create a branch for making your changes, then commit them.
+* Push changes to the repository: `git push origin `
+* In GitHub, create the Pull Request from your branch to master.
+* Before merging into master, wait for at least two developers to code review your contribution.
diff --git a/vendor/github.com/hooklift/xhyve/LICENSE b/vendor/github.com/hooklift/xhyve/LICENSE
new file mode 100644
index 0000000..52d1351
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/LICENSE
@@ -0,0 +1,374 @@
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+
+1.1. "Contributor"
+    means each individual or legal entity that creates, contributes to
+    the creation of, or owns Covered Software.
+
+1.2. "Contributor Version"
+    means the combination of the Contributions of others (if any) used
+    by a Contributor and that particular Contributor's Contribution.
+
+1.3. "Contribution"
+    means Covered Software of a particular Contributor.
+
+1.4. "Covered Software"
+    means Source Code Form to which the initial Contributor has attached
+    the notice in Exhibit A, the Executable Form of such Source Code
+    Form, and Modifications of such Source Code Form, in each case
+    including portions thereof.
+
+1.5. "Incompatible With Secondary Licenses"
+    means
+
+    (a) that the initial Contributor has attached the notice described
+        in Exhibit B to the Covered Software; or
+
+    (b) that the Covered Software was made available under the terms of
+        version 1.1 or earlier of the License, but not also under the
+        terms of a Secondary License.
+
+1.6. "Executable Form"
+    means any form of the work other than Source Code Form.
+
+1.7. "Larger Work"
+    means a work that combines Covered Software with other material, in
+    a separate file or files, that is not Covered Software.
+
+1.8. "License"
+    means this document.
+
+1.9. "Licensable"
"Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. 
You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. 
No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. 
+
+Exhibit A - Source Code Form License Notice
+-------------------------------------------
+
+  This Source Code Form is subject to the terms of the Mozilla Public
+  License, v. 2.0. If a copy of the MPL was not distributed with this
+  file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular
+file, then You may include the notice in a location (such as a LICENSE
+file in a relevant directory) where a recipient would be likely to look
+for such a notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+---------------------------------------------------------
+
+  This Source Code Form is "Incompatible With Secondary Licenses", as
+  defined by the Mozilla Public License, v. 2.0.
+
diff --git a/vendor/github.com/hooklift/xhyve/Makefile b/vendor/github.com/hooklift/xhyve/Makefile
new file mode 100644
index 0000000..f5f5a59
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/Makefile
@@ -0,0 +1,18 @@
+include xhyve.mk
+
+# Downloads xhyve files to make this library Go gettable.
+# It also applies a patch to rename the main function so we can use xhyve
+# as a library instead.
+upstream:
+	git clone https://github.com/mist64/xhyve.git vendor/xhyve
+	-cd vendor/xhyve; patch -N -p1 < ../../xhyve.patch
+	find . \( -name \*.orig -o -name \*.rej \) -delete
+	for file in $(SRC); do \
+		cp -f $$file $$(basename $$file) ; \
+	done
+	cp -r vendor/xhyve/include include
+
+clean:
+	rm -rf *.c vendor include
+
+.PHONY: upstream clean
diff --git a/vendor/github.com/hooklift/xhyve/README.md b/vendor/github.com/hooklift/xhyve/README.md
new file mode 100644
index 0000000..ef4d3a7
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/README.md
@@ -0,0 +1,37 @@
+# libxhyve (OS X only)
+Go bindings to use [xhyve](https://github.com/mist64/xhyve) as a library.
+
+### Prerequisites
+* OS X Yosemite and upwards
+* Go 1.5.x
+
+### Install
+go get github.com/hooklift/xhyve
+
+### Example
+
+[![asciicast](https://asciinema.org/a/bkxdrtso1cod53p5qzbypm4vs.png)](https://asciinema.org/a/bkxdrtso1cod53p5qzbypm4vs)
+
+```go
+package main
+
+import (
+	"os"
+	"github.com/hooklift/xhyve"
+)
+
+func main() {
+	if err := xhyve.Run(os.Args); err != nil {
+		panic(err)
+	}
+}
+```
+
+There is a small CLI that you can use to test the library.
+
+```bash
+cd cmd/xhyve; go build
+sudo ./xhyve -m 1024M -c 1 -A -s 0:0,hostbridge -s 31,lpc \
+-l com1,stdio -s 2:0,virtio-net -U 6BCE442E-4359-4BD9-84F7-EDFB8EC6D2EF \
+-f 'kexec,imgs/vmlinuz,imgs/initrd.gz,earlyprintk=serial console=ttyS0'
+```
diff --git a/vendor/github.com/hooklift/xhyve/acpitbl.c b/vendor/github.com/hooklift/xhyve/acpitbl.c
new file mode 100644
index 0000000..48bfd74
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/acpitbl.c
@@ -0,0 +1,1076 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * Copyright (c) 2015 xhyve developers
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * xhyve ACPI table generator.
+ *
+ * Does not require iasl but DSDT is limited to 1 PCI bus (0) and 8 devices.
+ *
+ * slot 0   hostbridge
+ * slot 31  lpc
+ */
+
+/*
+ * The tables are placed in the guest's ROM area just below 1MB physical,
+ * above the MPTable.
+ *
+ * Layout
+ * ------
+ *  RSDP  ->  0xf2400  (36 bytes fixed)
+ *  RSDT  ->  0xf2440  (36 bytes + 4*7 table addrs, 4 used)
+ *  XSDT  ->  0xf2480  (36 bytes + 8*7 table addrs, 4 used)
+ *  MADT  ->  0xf2500  (depends on #CPUs)
+ *  FADT  ->  0xf2600  (268 bytes)
+ *  HPET  ->  0xf2740  (56 bytes)
+ *  MCFG  ->  0xf2780  (60 bytes)
+ *  FACS  ->  0xf27C0  (64 bytes)
+ *  DSDT  ->  0xf2800  (variable - can go up to 0x100000)
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <xhyve/support/misc.h>
+#include <xhyve/xhyve.h>
+#include <xhyve/acpi.h>
+#include <xhyve/pci_emul.h>
+
+#define XHYVE_ACPI_BASE 0xf2400
+#define XHYVE_ACPI_SIZE 0xdc00
+#define RSDT_OFFSET 0x040
+#define XSDT_OFFSET 0x080
+#define MADT_OFFSET 0x100
+#define FADT_OFFSET 0x200
+#define HPET_OFFSET 0x340
+#define MCFG_OFFSET 0x380
+#define FACS_OFFSET 0x3C0
+#define DSDT_OFFSET 0x400
+
+/* ACPI table base in guest memory */
+static void *tb;
+static int acpi_ncpu;
+static uint32_t hpet_capabilities;
+static void *dsdt;
+
+void
+dsdt_line(UNUSED const char *fmt, ...)
+{ +} + +void +dsdt_fixed_ioport(UNUSED uint16_t iobase, UNUSED uint16_t length) +{ +} + +void +dsdt_fixed_irq(UNUSED uint8_t irq) +{ +} + +void +dsdt_fixed_mem32(UNUSED uint32_t base, UNUSED uint32_t length) +{ +} + +void +dsdt_indent(UNUSED int levels) +{ +} + +void dsdt_unindent(UNUSED int levels) +{ +} + +static uint8_t +acpitbl_checksum(void *table, size_t length) { + unsigned int i; + uint8_t sum; + + for (sum = 0, i = 0; i < length; i++) { + sum += ((uint8_t *) table)[i]; + } + + return (((uint8_t) 0) - sum); +} + +static void +acpitbl_write8(void *base, uint64_t offset, uint8_t val) { + memcpy(((void *) (((uintptr_t) base) + offset)), &val, 1); +} + +static void +acpitbl_write16(void *base, uint64_t offset, uint16_t val) { + memcpy(((void *) (((uintptr_t) base) + offset)), &val, 2); +} + +static void +acpitbl_write32(void *base, uint64_t offset, uint32_t val) { + memcpy(((void *) (((uintptr_t) base) + offset)), &val, 4); +} + +static void +acpitbl_write64(void *base, uint64_t offset, uint64_t val) { + memcpy(((void *) (((uintptr_t) base) + offset)), &val, 8); +} + +static void +acpitbl_build_rdsp(void) { + void *rdsp; + /* + * [000h 0000 8] Signature : "RSD PTR " + * [008h 0008 1] Checksum : 00 + * [009h 0009 6] Oem ID : "BHYVE " + * [00Fh 0015 1] Revision : 02 + * [010h 0016 4] RSDT Address : 00000000 + * [014h 0020 4] Length : 00000024 + * [018h 0024 8] XSDT Address : 0000000000000000 + * [020h 0032 1] Extended Checksum : 00 + * [021h 0033 3] Reserved : 000000 + */ + static const uint8_t rdsp_tmpl[36] = { + 0x52, 0x53, 0x44, 0x20, 0x50, 0x54, 0x52, 0x20, + 0x00, 0x42, 0x48, 0x59, 0x56, 0x45, 0x20, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 + }; + + rdsp = (void *) (((uintptr_t) tb) + 0); + /* copy RDSP template to guest memory */ + memcpy(rdsp, rdsp_tmpl, 36); + /* fixup table */ + acpitbl_write32(rdsp, 0x10, ((uint32_t) (XHYVE_ACPI_BASE + RSDT_OFFSET))); + acpitbl_write64(rdsp, 0x18, ((uint64_t) (XHYVE_ACPI_BASE + XSDT_OFFSET))); + /* write checksum */ + acpitbl_write8(rdsp, 0x8, acpitbl_checksum(rdsp, 20)); + /* write extended checksum */ + acpitbl_write8(rdsp, 0x20, acpitbl_checksum(rdsp, 36)); +} + +static void +acpitbl_build_rsdt(void) { + void *rsdt; + /* + * [000h 0000 4] Signature : "RSDT" + * [004h 0004 4] Table Length : 00000034 + * [008h 0008 1] Revision : 01 + * [009h 0009 1] Checksum : 00 + * [00Ah 0010 6] Oem ID : "BHYVE " + * [010h 0016 8] Oem Table ID : "BVRSDT " + * [018h 0024 4] Oem Revision : 00000001 + * [01Ch 0028 4] Asl Compiler ID : "INTL" + * [020h 0032 4] Asl Compiler Revision : 20140828 + * [024h 0036 4] ACPI Table Address 0 : 00000000 + * [028h 0040 4] ACPI Table Address 1 : 00000000 + * [02Ch 0044 4] ACPI Table Address 2 : 00000000 + * [030h 0048 4] ACPI Table Address 3 : 00000000 + */ + static const uint8_t rsdt_tmpl[52] = { + 0x52, 0x53, 0x44, 0x54, 0x34, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x42, 0x48, 0x59, 0x56, 0x45, 0x20, + 0x42, 0x56, 0x52, 0x53, 0x44, 0x54, 0x20, 0x20, + 0x01, 0x00, 0x00, 0x00, 0x49, 0x4E, 0x54, 0x4C, + 0x28, 0x08, 0x14, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 + }; + + rsdt = (void *) (((uintptr_t) tb) + RSDT_OFFSET); + /* copy RSDT template to guest memory */ + memcpy(rsdt, rsdt_tmpl, 52); + /* fixup table */ + acpitbl_write32(rsdt, 0x24, ((uint32_t) (XHYVE_ACPI_BASE + MADT_OFFSET))); + acpitbl_write32(rsdt, 0x28, ((uint32_t) (XHYVE_ACPI_BASE + FADT_OFFSET))); + 
acpitbl_write32(rsdt, 0x2c, ((uint32_t) (XHYVE_ACPI_BASE + HPET_OFFSET))); + acpitbl_write32(rsdt, 0x30, ((uint32_t) (XHYVE_ACPI_BASE + MCFG_OFFSET))); + /* write checksum */ + acpitbl_write8(rsdt, 0x9, acpitbl_checksum(rsdt, 52)); +} + +static void +acpitbl_build_xsdt(void) { + void *xsdt; + /* + * [000h 0000 4] Signature : "XSDT" + * [004h 0004 4] Table Length : 00000044 + * [008h 0008 1] Revision : 01 + * [009h 0009 1] Checksum : 00 + * [00Ah 0010 6] Oem ID : "BHYVE " + * [010h 0016 8] Oem Table ID : "BVXSDT " + * [018h 0024 4] Oem Revision : 00000001 + * [01Ch 0028 4] Asl Compiler ID : "INTL" + * [020h 0032 4] Asl Compiler Revision : 20140828 + * [024h 0036 8] ACPI Table Address 0 : 0000000000000000 + * [02Ch 0044 8] ACPI Table Address 1 : 0000000000000000 + * [034h 0052 8] ACPI Table Address 2 : 0000000000000000 + * [03Ch 0060 8] ACPI Table Address 3 : 0000000000000000 + */ + static const uint8_t xsdt_tmpl[68] = { + 0x58, 0x53, 0x44, 0x54, 0x44, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x42, 0x48, 0x59, 0x56, 0x45, 0x20, + 0x42, 0x56, 0x58, 0x53, 0x44, 0x54, 0x20, 0x20, + 0x01, 0x00, 0x00, 0x00, 0x49, 0x4E, 0x54, 0x4C, + 0x28, 0x08, 0x14, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 + }; + + xsdt = (void *) (((uintptr_t) tb) + XSDT_OFFSET); + /* copy XSDT template to guest memory */ + memcpy(xsdt, xsdt_tmpl, 68); + /* fixup table */ + acpitbl_write64(xsdt, 0x24, ((uint64_t) (XHYVE_ACPI_BASE + MADT_OFFSET))); + acpitbl_write64(xsdt, 0x2c, ((uint64_t) (XHYVE_ACPI_BASE + FADT_OFFSET))); + acpitbl_write64(xsdt, 0x34, ((uint64_t) (XHYVE_ACPI_BASE + HPET_OFFSET))); + acpitbl_write64(xsdt, 0x3c, ((uint64_t) (XHYVE_ACPI_BASE + MCFG_OFFSET))); + /* write checksum */ + acpitbl_write8(xsdt, 0x9, acpitbl_checksum(xsdt, 68)); +} + +static void +acpitbl_build_madt(void) { + void *madt_head, *madt_apic, *madt_tail; + int i; + /* + * [000h 0000 4] Signature : "APIC" + * [004h 0004 4] Table Length : 00000000 + * [008h 0008 1] Revision : 01 + * [009h 0009 1] Checksum : 4E + * [00Ah 0010 6] Oem ID : "BHYVE " + * [010h 0016 8] Oem Table ID : "BVMADT " + * [018h 0024 4] Oem Revision : 00000001 + * [01Ch 0028 4] Asl Compiler ID : "INTL" + * [020h 0032 4] Asl Compiler Revision : 20140828 + * [024h 0036 4] Local Apic Address : FEE00000 + * [028h 0040 4] Flags (decoded below) : 00000001 + * PC-AT Compatibility : 1 + */ + static const uint8_t madt_head_tmpl[44] = { + 0x41, 0x50, 0x49, 0x43, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x42, 0x48, 0x59, 0x56, 0x45, 0x20, + 0x42, 0x56, 0x4D, 0x41, 0x44, 0x54, 0x20, 0x20, + 0x01, 0x00, 0x00, 0x00, 0x49, 0x4E, 0x54, 0x4C, + 0x28, 0x08, 0x14, 0x20, 0x00, 0x00, 0xE0, 0xFE, + 0x01, 0x00, 0x00, 0x00, + }; + /* + * [+000h +0000 1] Subtable Type : 00 + * [+001h +0001 1] Length : 08 + * [+002h +0002 1] Processor ID : 00 + * [+003h +0003 1] Local Apic ID : 00 + * [+004h +0004 4] Flags (decoded below) : 00000001 + * Processor Enabled : 1 + */ + static const uint8_t madt_apic_tmpl[8] = { + 0x00, 0x08, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + }; + /* + * [+000h +0000 1] Subtable Type : 01 + * [+001h +0001 1] Length : 0C + * [+002h +0002 1] I/O Apic ID : 00 + * [+003h +0003 1] Reserved : 00 + * [+004h +0004 4] Address : FEC00000 + * [+008h +0008 4] Interrupt : 00000000 + * [+00Ch +0012 1] Subtable Type : 02 + * [+00Dh +0013 1] Length : 0A + * [+00Eh +0014 1] Bus : 00 + * [+00Fh +0015 1] Source : 00 + * [+010h +0016 4] Interrupt : 
00000002 + * [+014h +0020 2] Flags (decoded below) : 0005 + * Polarity : 1 + * Trigger Mode : 1 + * [+016h +0022 1] Subtable Type : 02 + * [+017h +0023 1] Length : 0A + * [+018h +0024 1] Bus : 00 + * [+019h +0025 1] Source : 00 + * [+01Ah +0026 4] Interrupt : 00000000 + * [+01Eh +0030 2] Flags (decoded below) : 000F + * Polarity : 3 + * Trigger Mode : 3 + * [+020h +0032 1] Subtable Type : 04 + * [+021h +0033 1] Length : 06 + * [+022h +0034 1] Processor ID : FF + * [+023h +0035 2] Flags (decoded below) : 0005 + * Polarity : 1 + * Trigger Mode : 1 + * [+025h +0037 1] Interrupt Input LINT : 01 + */ + static const uint8_t madt_tail_tmpl[38] = { + 0x01, 0x0C, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xFE, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x0A, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x02, 0x0A, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, + 0x04, 0x06, 0xFF, 0x05, 0x00, 0x01 + }; + + madt_head = (void *) (((uintptr_t) tb) + MADT_OFFSET); + /* copy MADT head template to guest memory */ + memcpy(madt_head, madt_head_tmpl, 44); + + for (i = 0; i < acpi_ncpu; i++) { + madt_apic = (void *) (((uintptr_t) tb) + + ((size_t) ((MADT_OFFSET + 44) + (8 * i)))); + /* copy MADT APIC template to guest memory */ + memcpy(madt_apic, madt_apic_tmpl, 8); + /* fixup table */ + acpitbl_write8(madt_apic, 0x2, ((uint8_t) i)); + acpitbl_write8(madt_apic, 0x3, ((uint8_t) i)); + } + + madt_tail = (void *) (((uintptr_t) tb) + + ((size_t) ((MADT_OFFSET + 44) + (8 * acpi_ncpu)))); + /* copy MADT tail template to guest memory */ + memcpy(madt_tail, madt_tail_tmpl, 38); + /* fixup table */ + acpitbl_write8(madt_tail, 0x2, 0); + acpitbl_write8(madt_tail, 0x19, SCI_INT); + acpitbl_write32(madt_tail, 0x1a, SCI_INT); + /* write checksum */ + acpitbl_write32(madt_head, 0x4, ((uint32_t) (44 + (8 * acpi_ncpu) + 38))); + acpitbl_write8(madt_head, 0x9, + acpitbl_checksum(madt_head, ((size_t) (44 + (8 * acpi_ncpu) + 38)))); +} + +static void +acpitbl_build_fadt(void) { + void *fadt; + /* + * [000h 0000 4] Signature : "FACP" + * [004h 0004 4] Table Length : 0000010C + * [008h 0008 1] Revision : 05 + * [009h 0009 1] Checksum : 00 + * [00Ah 0010 6] Oem ID : "BHYVE " + * [010h 0016 8] Oem Table ID : "BVFACP " + * [018h 0024 4] Oem Revision : 00000001 + * [01Ch 0028 4] Asl Compiler ID : "INTL" + * [020h 0032 4] Asl Compiler Revision : 20140828 + * [024h 0036 4] FACS Address : 00000000 + * [028h 0040 4] DSDT Address : 00000000 + * [02Ch 0044 1] Model : 01 + * [02Dh 0045 1] PM Profile : 00 (Unspecified) + * [02Eh 0046 2] SCI Interrupt : 0000 + * [030h 0048 4] SMI Command Port : 00000000 + * [034h 0052 1] ACPI Enable Value : 00 + * [035h 0053 1] ACPI Disable Value : 00 + * [036h 0054 1] S4BIOS Command : 00 + * [037h 0055 1] P-State Control : 00 + * [038h 0056 4] PM1A Event Block Address : 00000000 + * [03Ch 0060 4] PM1B Event Block Address : 00000000 + * [040h 0064 4] PM1A Control Block Address : 00000000 + * [044h 0068 4] PM1B Control Block Address : 00000000 + * [048h 0072 4] PM2 Control Block Address : 00000000 + * [04Ch 0076 4] PM Timer Block Address : 00000000 + * [050h 0080 4] GPE0 Block Address : 00000000 + * [054h 0084 4] GPE1 Block Address : 00000000 + * [058h 0088 1] PM1 Event Block Length : 04 + * [059h 0089 1] PM1 Control Block Length : 02 + * [05Ah 0090 1] PM2 Control Block Length : 00 + * [05Bh 0091 1] PM Timer Block Length : 04 + * [05Ch 0092 1] GPE0 Block Length : 00 + * [05Dh 0093 1] GPE1 Block Length : 00 + * [05Eh 0094 1] GPE1 Base Offset : 00 + * [05Fh 0095 1] _CST Support : 00 + * [060h 0096 2] C2 Latency : 0000 + * 
[062h 0098 2] C3 Latency : 0000 + * [064h 0100 2] CPU Cache Size : 0000 + * [066h 0102 2] Cache Flush Stride : 0000 + * [068h 0104 1] Duty Cycle Offset : 00 + * [069h 0105 1] Duty Cycle Width : 00 + * [06Ah 0106 1] RTC Day Alarm Index : 00 + * [06Bh 0107 1] RTC Month Alarm Index : 00 + * [06Ch 0108 1] RTC Century Index : 32 + * [06Dh 0109 2] Boot Flags (decoded below) : 0014 + * Legacy Devices Supported (V2) : 0 + * 8042 Present on ports 60/64 (V2) : 0 + * VGA Not Present (V4) : 1 + * MSI Not Supported (V4) : 0 + * PCIe ASPM Not Supported (V4) : 1 + * [06Fh 0111 1] Reserved : 00 + * [070h 0112 4] Flags (decoded below) : 00081525 + * WBINVD instruction is operational (V1) : 1 + * WBINVD flushes all caches (V1) : 0 + * All CPUs support C1 (V1) : 1 + * C2 works on MP system (V1) : 0 + * Control Method Power Button (V1) : 0 + * Control Method Sleep Button (V1) : 1 + * RTC wake not in fixed reg space (V1) : 0 + * RTC can wake system from S4 (V1) : 0 + * 32-bit PM Timer (V1) : 1 + * Docking Supported (V1) : 0 + * Reset Register Supported (V2) : 1 + * Sealed Case (V3) : 0 + * Headless - No Video (V3) : 1 + * Use native instr after SLP_TYPx (V3) : 0 + * PCIEXP_WAK Bits Supported (V4) : 0 + * Use Platform Timer (V4) : 0 + * RTC_STS valid on S4 wake (V4) : 0 + * Remote Power-on capable (V4) : 0 + * Use APIC Cluster Model (V4) : 0 + * Use APIC Physical Destination Mode (V4) : 1 + * [074h 0116 12] Reset Register : + * [074h 0116 1] Space ID : 01 (SystemIO) + * [075h 0117 1] Bit Width : 08 + * [076h 0118 1] Bit Offset : 00 + * [077h 0119 1] Access Width : 01 + * [078h 0120 8] Address : 0000000000000CF9 + * [080h 0128 1] Value to cause reset : 06 + * [081h 0129 3] Reserved : 000001 + * [084h 0132 8] FACS Address : 0000000000000000 + * [08Ch 0140 8] DSDT Address : 0000000000000000 + * [094h 0148 12] PM1A Event Block : + * [094h 0148 1] Space ID : 01 (SystemIO) + * [095h 0149 1] Bit Width : 20 + * [096h 0150 1] Bit Offset : 00 + * [097h 0151 1] Access Width : 02 + * [098h 0152 8] Address : 0000000000000000 + * [0A0h 0160 12] PM1B Event Block : + * [0A0h 0160 1] Space ID : 01 (SystemIO) + * [0A1h 0161 1] Bit Width : 00 + * [0A2h 0162 1] Bit Offset : 00 + * [0A3h 0163 1] Access Width : 00 + * [0A4h 0164 8] Address : 0000000000000000 + * [0ACh 0172 12] PM1A Control Block : + * [0ACh 0172 1] Space ID : 01 (SystemIO) + * [0ADh 0173 1] Bit Width : 10 + * [0AEh 0174 1] Bit Offset : 00 + * [0AFh 0175 1] Access Width : 02 + * [0B0h 0176 8] Address : 0000000000000000 + * [0B8h 0184 12] PM1B Control Block : + * [0B8h 0184 1] Space ID : 01 (SystemIO) + * [0B9h 0185 1] Bit Width : 00 + * [0BAh 0186 1] Bit Offset : 00 + * [0BBh 0187 1] Access Width : 00 + * [0BCh 0188 8] Address : 0000000000000000 + * [0C4h 0196 12] PM2 Control Block : + * [0C4h 0196 1] Space ID : 01 (SystemIO) + * [0C5h 0197 1] Bit Width : 08 + * [0C6h 0198 1] Bit Offset : 00 + * [0C7h 0199 1] Access Width : 00 + * [0C8h 0200 8] Address : 0000000000000000 + * [0D0h 0208 12] PM Timer Block : + * [0D0h 0208 1] Space ID : 01 (SystemIO) + * [0D1h 0209 1] Bit Width : 20 + * [0D2h 0210 1] Bit Offset : 00 + * [0D3h 0211 1] Access Width : 03 + * [0D4h 0212 8] Address : 0000000000000000 + * [0DCh 0220 12] GPE0 Block : + * [0DCh 0220 1] Space ID : 01 (SystemIO) + * [0DDh 0221 1] Bit Width : 00 + * [0DEh 0222 1] Bit Offset : 00 + * [0DFh 0223 1] Access Width : 01 + * [0E0h 0224 8] Address : 0000000000000000 + * [0E8h 0232 12] GPE1 Block : + * [0E8h 0232 1] Space ID : 01 (SystemIO) + * [0E9h 0233 1] Bit Width : 00 + * [0EAh 0234 1] Bit Offset : 00 + * [0EBh 0235 
1] Access Width : 00 + * [0ECh 0236 8] Address : 0000000000000000 + * [0F4h 0244 12] Sleep Control Register : + * [0F4h 0244 1] Space ID : 01 (SystemIO) + * [0F5h 0245 1] Bit Width : 08 + * [0F6h 0246 1] Bit Offset : 00 + * [0F7h 0247 1] Access Width : 01 + * [0F8h 0248 8] Address : 0000000000000000 + * [100h 0256 12] Sleep Status Register : + * [100h 0256 01] Space ID : 01 (SystemIO) + * [101h 0257 01] Bit Width : 08 + * [102h 0258 01] Bit Offset : 00 + * [103h 0259 01] Access Width : 01 + * [104h 0260 08] Address : 0000000000000000 + */ + static const uint8_t fadt_tmpl[268] = { + 0x46, 0x41, 0x43, 0x50, 0x0C, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x42, 0x48, 0x59, 0x56, 0x45, 0x20, + 0x42, 0x56, 0x46, 0x41, 0x43, 0x50, 0x20, 0x20, + 0x01, 0x00, 0x00, 0x00, 0x49, 0x4E, 0x54, 0x4C, + 0x28, 0x08, 0x14, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x02, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x32, 0x14, 0x00, 0x00, + 0x25, 0x15, 0x08, 0x00, 0x01, 0x08, 0x00, 0x01, + 0xF9, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x20, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x10, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x08, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x20, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x08, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x08, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 + }; + + fadt = (void *) (((uintptr_t) tb) + FADT_OFFSET); + /* copy FADT template to guest memory */ + memcpy(fadt, fadt_tmpl, 268); + /* fixup table */ + acpitbl_write32(fadt, 0x24, ((uint32_t) (XHYVE_ACPI_BASE + FACS_OFFSET))); + acpitbl_write32(fadt, 0x28, ((uint32_t) (XHYVE_ACPI_BASE + DSDT_OFFSET))); + acpitbl_write16(fadt, 0x2e, SCI_INT); + acpitbl_write32(fadt, 0x30, SMI_CMD); + acpitbl_write8(fadt, 0x34, BHYVE_ACPI_ENABLE); + acpitbl_write8(fadt, 0x35, BHYVE_ACPI_DISABLE); + acpitbl_write32(fadt, 0x38, PM1A_EVT_ADDR); + acpitbl_write32(fadt, 0x40, PM1A_CNT_ADDR); + acpitbl_write32(fadt, 0x4c, IO_PMTMR); + acpitbl_write64(fadt, 0x84, ((uint64_t) (XHYVE_ACPI_BASE + FACS_OFFSET))); + acpitbl_write64(fadt, 0x8c, ((uint64_t) (XHYVE_ACPI_BASE + DSDT_OFFSET))); + acpitbl_write64(fadt, 0x98, ((uint64_t) PM1A_EVT_ADDR)); + acpitbl_write64(fadt, 0xb0, ((uint64_t) PM1A_CNT_ADDR)); + acpitbl_write64(fadt, 0xd4, ((uint64_t) IO_PMTMR)); + /* write checksum */ + acpitbl_write8(fadt, 0x9, acpitbl_checksum(fadt, 268)); +} + +static void +acpitbl_build_hpet(void) { + void *hpet; + /* + * [000h 0000 4] Signature : "HPET" + * [004h 0004 4] Table Length : 00000038 + * [008h 0008 1] Revision : 01 + * [009h 0009 1] Checksum : 00 + * [00Ah 0010 6] Oem ID : "BHYVE " + * [010h 0016 8] Oem 
Table ID : "BVHPET " + * [018h 0024 4] Oem Revision : 00000001 + * [01Ch 0028 4] Asl Compiler ID : "INTL" + * [020h 0032 4] Asl Compiler Revision : 20140828 + * [024h 0036 4] Hardware Block ID : 00000000 + * [028h 0040 12] Timer Block Register : + * [028h 0040 1] Space ID : 00 (SystemMemory) + * [029h 0041 1] Bit Width : 00 + * [02Ah 0042 1] Bit Offset : 00 + * [02Bh 0043 1] Access Width : 00 + * [02Ch 0044 8] Address : 00000000FED00000 + * [034h 0052 1] Sequence Number : 00 + * [035h 0053 2] Minimum Clock Ticks : 0000 + * [037h 0055 1] Flags (decoded below) : 01 + * 4K Page Protect : 1 + * 64K Page Protect : 0 + */ + static const uint8_t hpet_tmpl[56] = { + 0x48, 0x50, 0x45, 0x54, 0x38, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x42, 0x48, 0x59, 0x56, 0x45, 0x20, + 0x42, 0x56, 0x48, 0x50, 0x45, 0x54, 0x20, 0x20, + 0x01, 0x00, 0x00, 0x00, 0x49, 0x4E, 0x54, 0x4C, + 0x28, 0x08, 0x14, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xFE, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 + }; + + hpet = (void *) (((uintptr_t) tb) + HPET_OFFSET); + /* copy HPET template to guest memory */ + memcpy(hpet, hpet_tmpl, 56); + /* fixup table */ + acpitbl_write32(hpet, 0x24, hpet_capabilities); + /* write checksum */ + acpitbl_write8(hpet, 0x9, acpitbl_checksum(hpet, 56)); +} + +static void +acpitbl_build_mcfg(void) { + void *mcfg; + /* + * [000h 0000 4] Signature : "MCFG" + * [004h 0004 4] Table Length : 0000003C + * [008h 0008 1] Revision : 01 + * [009h 0009 1] Checksum : 00 + * [00Ah 0010 6] Oem ID : "BHYVE " + * [010h 0016 8] Oem Table ID : "BVMCFG " + * [018h 0024 4] Oem Revision : 00000001 + * [01Ch 0028 4] Asl Compiler ID : "INTL" + * [020h 0032 4] Asl Compiler Revision : 20140828 + * [024h 0036 8] Reserved : 0000000000000000 + * [02Ch 0044 8] Base Address : 0000000000000000 + * [034h 0052 2] Segment Group Number : 0000 + * [036h 0054 1] Start Bus Number : 00 + * [037h 0055 1] End Bus Number : FF + * [038h 0056 4] Reserved : 00000000 + */ + static const uint8_t mcfg_tmpl[60] = { + 0x4D, 0x43, 0x46, 0x47, 0x3C, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x42, 0x48, 0x59, 0x56, 0x45, 0x20, + 0x42, 0x56, 0x4D, 0x43, 0x46, 0x47, 0x20, 0x20, + 0x01, 0x00, 0x00, 0x00, 0x49, 0x4E, 0x54, 0x4C, + 0x28, 0x08, 0x14, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0x00, 0x00 + }; + + mcfg = (void *) (((uintptr_t) tb) + MCFG_OFFSET); + /* copy MCFG template to guest memory */ + memcpy(mcfg, mcfg_tmpl, 60); + /* fixup table */ + acpitbl_write64(mcfg, 0x2c, pci_ecfg_base()); + /* write checksum */ + acpitbl_write8(mcfg, 0x9, acpitbl_checksum(mcfg, 60)); +} + +static void +acpitbl_build_facs(void) { + void *facs; + /* + * [000h 0000 4] Signature : "FACS" + * [004h 0004 4] Length : 00000040 + * [008h 0008 4] Hardware Signature : 00000000 + * [00Ch 0012 4] 32 Firmware Waking Vector : 00000000 + * [010h 0016 4] Global Lock : 00000000 + * [014h 0020 4] Flags (decoded below) : 00000000 + * S4BIOS Support Present : 0 + * 64-bit Wake Supported (V2) : 0 + * [018h 0024 8] 64 Firmware Waking Vector : 0000000000000000 + * [020h 0032 1] Version : 02 + * [021h 0033 3] Reserved : 000000 + * [024h 0036 4] OspmFlags (decoded below) : 00000000 + * 64-bit Wake Env Required (V2) : 0 + */ + static const uint8_t facs_tmpl[64] = { + 0x46, 0x41, 0x43, 0x53, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + }; + + facs = (void *) (((uintptr_t) tb) + FACS_OFFSET); + /* copy MCFG template to guest memory */ + memcpy(facs, facs_tmpl, 64); +} + +void dsdt_fixup(int bus, uint16_t iobase, uint16_t iolimit, uint32_t membase32, + uint32_t memlimit32, uint64_t membase64, uint64_t memlimit64) +{ + if (bus != 0) { + fprintf(stderr, "DSDT, unsupported PCI bus (%d)\n", bus); + exit(-1); + } + + acpitbl_write16(dsdt, 0xb6, iobase); + acpitbl_write16(dsdt, 0xb8, (iolimit - 1)); + acpitbl_write16(dsdt, 0xbc, (iolimit - iobase)); + acpitbl_write32(dsdt, 0xc8, membase32); + acpitbl_write32(dsdt, 0xcc, (memlimit32 - 1)); + acpitbl_write32(dsdt, 0xd4, (memlimit32 - membase32)); + acpitbl_write64(dsdt, 0xe6, membase64); + acpitbl_write64(dsdt, 0xee, (memlimit64 - 1)); + acpitbl_write64(dsdt, 0xfe, (memlimit64 - membase64)); +} + +static void +acpitbl_build_dsdt(void) { + static const uint8_t dsdt_tmpl[2604] = { + 0x44, 0x53, 0x44, 0x54, 0x2d, 0x0a, 0x00, 0x00, + 0x02, 0x5d, 0x42, 0x48, 0x59, 0x56, 0x45, 0x20, + 0x42, 0x56, 0x44, 0x53, 0x44, 0x54, 0x20, 0x20, + 0x01, 0x00, 0x00, 0x00, 0x49, 0x4e, 0x54, 0x4c, + 0x28, 0x08, 0x14, 0x20, 0x08, 0x5f, 0x53, 0x35, + 0x5f, 0x12, 0x05, 0x02, 0x0a, 0x05, 0x00, 0x08, + 0x50, 0x49, 0x43, 0x4d, 0x0a, 0x00, 0x14, 0x0c, + 0x5f, 0x50, 0x49, 0x43, 0x01, 0x70, 0x68, 0x50, + 0x49, 0x43, 0x4d, 0x10, 0x4f, 0x9a, 0x5f, 0x53, + 0x42, 0x5f, 0x5b, 0x82, 0x47, 0x9a, 0x50, 0x43, + 0x30, 0x30, 0x08, 0x5f, 0x48, 0x49, 0x44, 0x0c, + 0x41, 0xd0, 0x0a, 0x03, 0x08, 0x5f, 0x41, 0x44, + 0x52, 0x00, 0x14, 0x09, 0x5f, 0x42, 0x42, 0x4e, + 0x00, 0xa4, 0x0a, 0x00, 0x08, 0x5f, 0x43, 0x52, + 0x53, 0x11, 0x46, 0x09, 0x0a, 0x92, 0x88, 0x0d, + 0x00, 0x02, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x47, 0x01, + 0xf8, 0x0c, 0xf8, 0x0c, 0x01, 0x08, 0x88, 0x0d, + 0x00, 0x01, 0x0c, 0x03, 0x00, 0x00, 0x00, 0x00, + 0xf7, 0x0c, 0x00, 0x00, 0xf8, 0x0c, 0x88, 0x0d, + 0x00, 0x01, 0x0c, 0x03, 0x00, 0x00, 0x00, 0x0d, + 0xff, 0x1f, 0x00, 0x00, 0x00, 0x13, 0x88, 0x0d, + 0x00, 0x01, 0x0c, 0x03, 0x00, 0x00, 0x00, 0x20, + 0x1f, 0x20, 0x00, 0x00, 0x20, 0x00, 0x87, 0x17, + 0x00, 0x00, 0x0c, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0xc0, 0xff, 0xff, 0x1f, 0xc0, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, + 0x8a, 0x2b, 0x00, 0x00, 0x0c, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, + 0x0f, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x79, 0x00, + 0x08, 0x50, 0x50, 0x52, 0x54, 0x12, 0x4b, 0x0f, + 0x08, 0x12, 0x1e, 0x04, 0x0c, 0xff, 0xff, 0x01, + 0x00, 0x0a, 0x00, 0x5c, 0x2f, 0x04, 0x5f, 0x53, + 0x42, 0x5f, 0x50, 0x43, 0x30, 0x30, 0x49, 0x53, + 0x41, 0x5f, 0x4c, 0x4e, 0x4b, 0x41, 0x0a, 0x00, + 0x12, 0x1e, 0x04, 0x0c, 0xff, 0xff, 0x02, 0x00, + 0x0a, 0x00, 0x5c, 0x2f, 0x04, 0x5f, 0x53, 0x42, + 0x5f, 0x50, 0x43, 0x30, 0x30, 0x49, 0x53, 0x41, + 0x5f, 0x4c, 0x4e, 0x4b, 0x42, 0x0a, 0x00, 0x12, + 0x1e, 0x04, 0x0c, 0xff, 0xff, 0x03, 0x00, 0x0a, + 0x00, 0x5c, 0x2f, 0x04, 0x5f, 0x53, 0x42, 0x5f, + 0x50, 0x43, 0x30, 0x30, 0x49, 0x53, 0x41, 0x5f, + 0x4c, 0x4e, 0x4b, 0x43, 0x0a, 0x00, 0x12, 0x1e, + 0x04, 0x0c, 0xff, 0xff, 0x04, 0x00, 0x0a, 0x00, + 0x5c, 0x2f, 0x04, 0x5f, 0x53, 0x42, 0x5f, 0x50, + 0x43, 0x30, 0x30, 0x49, 0x53, 0x41, 0x5f, 0x4c, + 
0x4e, 0x4b, 0x44, 0x0a, 0x00, 0x12, 0x1e, 0x04, + 0x0c, 0xff, 0xff, 0x05, 0x00, 0x0a, 0x00, 0x5c, + 0x2f, 0x04, 0x5f, 0x53, 0x42, 0x5f, 0x50, 0x43, + 0x30, 0x30, 0x49, 0x53, 0x41, 0x5f, 0x4c, 0x4e, + 0x4b, 0x45, 0x0a, 0x00, 0x12, 0x1e, 0x04, 0x0c, + 0xff, 0xff, 0x06, 0x00, 0x0a, 0x00, 0x5c, 0x2f, + 0x04, 0x5f, 0x53, 0x42, 0x5f, 0x50, 0x43, 0x30, + 0x30, 0x49, 0x53, 0x41, 0x5f, 0x4c, 0x4e, 0x4b, + 0x46, 0x0a, 0x00, 0x12, 0x1e, 0x04, 0x0c, 0xff, + 0xff, 0x07, 0x00, 0x0a, 0x00, 0x5c, 0x2f, 0x04, + 0x5f, 0x53, 0x42, 0x5f, 0x50, 0x43, 0x30, 0x30, + 0x49, 0x53, 0x41, 0x5f, 0x4c, 0x4e, 0x4b, 0x47, + 0x0a, 0x00, 0x12, 0x1e, 0x04, 0x0c, 0xff, 0xff, + 0x08, 0x00, 0x0a, 0x00, 0x5c, 0x2f, 0x04, 0x5f, + 0x53, 0x42, 0x5f, 0x50, 0x43, 0x30, 0x30, 0x49, + 0x53, 0x41, 0x5f, 0x4c, 0x4e, 0x4b, 0x48, 0x0a, + 0x00, 0x08, 0x41, 0x50, 0x52, 0x54, 0x12, 0x4b, + 0x06, 0x08, 0x12, 0x0c, 0x04, 0x0c, 0xff, 0xff, + 0x01, 0x00, 0x0a, 0x00, 0x00, 0x0a, 0x10, 0x12, + 0x0c, 0x04, 0x0c, 0xff, 0xff, 0x02, 0x00, 0x0a, + 0x00, 0x00, 0x0a, 0x11, 0x12, 0x0c, 0x04, 0x0c, + 0xff, 0xff, 0x03, 0x00, 0x0a, 0x00, 0x00, 0x0a, + 0x12, 0x12, 0x0c, 0x04, 0x0c, 0xff, 0xff, 0x04, + 0x00, 0x0a, 0x00, 0x00, 0x0a, 0x13, 0x12, 0x0c, + 0x04, 0x0c, 0xff, 0xff, 0x05, 0x00, 0x0a, 0x00, + 0x00, 0x0a, 0x14, 0x12, 0x0c, 0x04, 0x0c, 0xff, + 0xff, 0x06, 0x00, 0x0a, 0x00, 0x00, 0x0a, 0x15, + 0x12, 0x0c, 0x04, 0x0c, 0xff, 0xff, 0x07, 0x00, + 0x0a, 0x00, 0x00, 0x0a, 0x16, 0x12, 0x0c, 0x04, + 0x0c, 0xff, 0xff, 0x08, 0x00, 0x0a, 0x00, 0x00, + 0x0a, 0x17, 0x14, 0x18, 0x5f, 0x50, 0x52, 0x54, + 0x00, 0xa0, 0x0a, 0x50, 0x49, 0x43, 0x4d, 0xa4, + 0x41, 0x50, 0x52, 0x54, 0xa1, 0x06, 0xa4, 0x50, + 0x50, 0x52, 0x54, 0x5b, 0x82, 0x4e, 0x75, 0x49, + 0x53, 0x41, 0x5f, 0x08, 0x5f, 0x41, 0x44, 0x52, + 0x0c, 0x00, 0x00, 0x1f, 0x00, 0x5b, 0x80, 0x4c, + 0x50, 0x43, 0x52, 0x02, 0x0a, 0x00, 0x0b, 0x00, + 0x01, 0x5b, 0x81, 0x33, 0x4c, 0x50, 0x43, 0x52, + 0x00, 0x00, 0x40, 0x30, 0x50, 0x49, 0x52, 0x41, + 0x08, 0x50, 0x49, 0x52, 0x42, 0x08, 0x50, 0x49, + 0x52, 0x43, 0x08, 0x50, 0x49, 0x52, 0x44, 0x08, + 0x00, 0x20, 0x50, 0x49, 0x52, 0x45, 0x08, 0x50, + 0x49, 0x52, 0x46, 0x08, 0x50, 0x49, 0x52, 0x47, + 0x08, 0x50, 0x49, 0x52, 0x48, 0x08, 0x14, 0x33, + 0x50, 0x49, 0x52, 0x56, 0x01, 0xa0, 0x09, 0x7b, + 0x68, 0x0a, 0x80, 0x00, 0xa4, 0x0a, 0x00, 0x7b, + 0x68, 0x0a, 0x0f, 0x60, 0xa0, 0x08, 0x95, 0x60, + 0x0a, 0x03, 0xa4, 0x0a, 0x00, 0xa0, 0x08, 0x93, + 0x60, 0x0a, 0x08, 0xa4, 0x0a, 0x00, 0xa0, 0x08, + 0x93, 0x60, 0x0a, 0x0d, 0xa4, 0x0a, 0x00, 0xa4, + 0x0a, 0x01, 0x5b, 0x82, 0x4f, 0x0a, 0x4c, 0x4e, + 0x4b, 0x41, 0x08, 0x5f, 0x48, 0x49, 0x44, 0x0c, + 0x41, 0xd0, 0x0c, 0x0f, 0x08, 0x5f, 0x55, 0x49, + 0x44, 0x0a, 0x01, 0x14, 0x18, 0x5f, 0x53, 0x54, + 0x41, 0x00, 0xa0, 0x0c, 0x50, 0x49, 0x52, 0x56, + 0x50, 0x49, 0x52, 0x41, 0xa4, 0x0a, 0x0b, 0xa1, + 0x04, 0xa4, 0x0a, 0x09, 0x08, 0x5f, 0x50, 0x52, + 0x53, 0x11, 0x09, 0x0a, 0x06, 0x23, 0xf8, 0xde, + 0x18, 0x79, 0x00, 0x08, 0x43, 0x42, 0x30, 0x31, + 0x11, 0x09, 0x0a, 0x06, 0x23, 0x00, 0x00, 0x18, + 0x79, 0x00, 0x8b, 0x43, 0x42, 0x30, 0x31, 0x0a, + 0x01, 0x43, 0x49, 0x52, 0x41, 0x14, 0x2b, 0x5f, + 0x43, 0x52, 0x53, 0x00, 0x7b, 0x50, 0x49, 0x52, + 0x41, 0x0a, 0x8f, 0x60, 0xa0, 0x0e, 0x50, 0x49, + 0x52, 0x56, 0x60, 0x79, 0x0a, 0x01, 0x60, 0x43, + 0x49, 0x52, 0x41, 0xa1, 0x08, 0x70, 0x0a, 0x00, + 0x43, 0x49, 0x52, 0x41, 0xa4, 0x43, 0x42, 0x30, + 0x31, 0x14, 0x0d, 0x5f, 0x44, 0x49, 0x53, 0x00, + 0x70, 0x0a, 0x80, 0x50, 0x49, 0x52, 0x41, 0x14, + 0x1b, 0x5f, 0x53, 0x52, 0x53, 0x01, 0x8b, 0x68, + 0x0a, 0x01, 0x53, 0x49, 0x52, 0x41, 0x82, 0x53, + 
0x49, 0x52, 0x41, 0x60, 0x70, 0x76, 0x60, 0x50, + 0x49, 0x52, 0x41, 0x5b, 0x82, 0x4f, 0x0a, 0x4c, + 0x4e, 0x4b, 0x42, 0x08, 0x5f, 0x48, 0x49, 0x44, + 0x0c, 0x41, 0xd0, 0x0c, 0x0f, 0x08, 0x5f, 0x55, + 0x49, 0x44, 0x0a, 0x02, 0x14, 0x18, 0x5f, 0x53, + 0x54, 0x41, 0x00, 0xa0, 0x0c, 0x50, 0x49, 0x52, + 0x56, 0x50, 0x49, 0x52, 0x42, 0xa4, 0x0a, 0x0b, + 0xa1, 0x04, 0xa4, 0x0a, 0x09, 0x08, 0x5f, 0x50, + 0x52, 0x53, 0x11, 0x09, 0x0a, 0x06, 0x23, 0xf8, + 0xde, 0x18, 0x79, 0x00, 0x08, 0x43, 0x42, 0x30, + 0x32, 0x11, 0x09, 0x0a, 0x06, 0x23, 0x00, 0x00, + 0x18, 0x79, 0x00, 0x8b, 0x43, 0x42, 0x30, 0x32, + 0x0a, 0x01, 0x43, 0x49, 0x52, 0x42, 0x14, 0x2b, + 0x5f, 0x43, 0x52, 0x53, 0x00, 0x7b, 0x50, 0x49, + 0x52, 0x42, 0x0a, 0x8f, 0x60, 0xa0, 0x0e, 0x50, + 0x49, 0x52, 0x56, 0x60, 0x79, 0x0a, 0x01, 0x60, + 0x43, 0x49, 0x52, 0x42, 0xa1, 0x08, 0x70, 0x0a, + 0x00, 0x43, 0x49, 0x52, 0x42, 0xa4, 0x43, 0x42, + 0x30, 0x32, 0x14, 0x0d, 0x5f, 0x44, 0x49, 0x53, + 0x00, 0x70, 0x0a, 0x80, 0x50, 0x49, 0x52, 0x42, + 0x14, 0x1b, 0x5f, 0x53, 0x52, 0x53, 0x01, 0x8b, + 0x68, 0x0a, 0x01, 0x53, 0x49, 0x52, 0x42, 0x82, + 0x53, 0x49, 0x52, 0x42, 0x60, 0x70, 0x76, 0x60, + 0x50, 0x49, 0x52, 0x42, 0x5b, 0x82, 0x4f, 0x0a, + 0x4c, 0x4e, 0x4b, 0x43, 0x08, 0x5f, 0x48, 0x49, + 0x44, 0x0c, 0x41, 0xd0, 0x0c, 0x0f, 0x08, 0x5f, + 0x55, 0x49, 0x44, 0x0a, 0x03, 0x14, 0x18, 0x5f, + 0x53, 0x54, 0x41, 0x00, 0xa0, 0x0c, 0x50, 0x49, + 0x52, 0x56, 0x50, 0x49, 0x52, 0x43, 0xa4, 0x0a, + 0x0b, 0xa1, 0x04, 0xa4, 0x0a, 0x09, 0x08, 0x5f, + 0x50, 0x52, 0x53, 0x11, 0x09, 0x0a, 0x06, 0x23, + 0xf8, 0xde, 0x18, 0x79, 0x00, 0x08, 0x43, 0x42, + 0x30, 0x33, 0x11, 0x09, 0x0a, 0x06, 0x23, 0x00, + 0x00, 0x18, 0x79, 0x00, 0x8b, 0x43, 0x42, 0x30, + 0x33, 0x0a, 0x01, 0x43, 0x49, 0x52, 0x43, 0x14, + 0x2b, 0x5f, 0x43, 0x52, 0x53, 0x00, 0x7b, 0x50, + 0x49, 0x52, 0x43, 0x0a, 0x8f, 0x60, 0xa0, 0x0e, + 0x50, 0x49, 0x52, 0x56, 0x60, 0x79, 0x0a, 0x01, + 0x60, 0x43, 0x49, 0x52, 0x43, 0xa1, 0x08, 0x70, + 0x0a, 0x00, 0x43, 0x49, 0x52, 0x43, 0xa4, 0x43, + 0x42, 0x30, 0x33, 0x14, 0x0d, 0x5f, 0x44, 0x49, + 0x53, 0x00, 0x70, 0x0a, 0x80, 0x50, 0x49, 0x52, + 0x43, 0x14, 0x1b, 0x5f, 0x53, 0x52, 0x53, 0x01, + 0x8b, 0x68, 0x0a, 0x01, 0x53, 0x49, 0x52, 0x43, + 0x82, 0x53, 0x49, 0x52, 0x43, 0x60, 0x70, 0x76, + 0x60, 0x50, 0x49, 0x52, 0x43, 0x5b, 0x82, 0x4f, + 0x0a, 0x4c, 0x4e, 0x4b, 0x44, 0x08, 0x5f, 0x48, + 0x49, 0x44, 0x0c, 0x41, 0xd0, 0x0c, 0x0f, 0x08, + 0x5f, 0x55, 0x49, 0x44, 0x0a, 0x04, 0x14, 0x18, + 0x5f, 0x53, 0x54, 0x41, 0x00, 0xa0, 0x0c, 0x50, + 0x49, 0x52, 0x56, 0x50, 0x49, 0x52, 0x44, 0xa4, + 0x0a, 0x0b, 0xa1, 0x04, 0xa4, 0x0a, 0x09, 0x08, + 0x5f, 0x50, 0x52, 0x53, 0x11, 0x09, 0x0a, 0x06, + 0x23, 0xf8, 0xde, 0x18, 0x79, 0x00, 0x08, 0x43, + 0x42, 0x30, 0x34, 0x11, 0x09, 0x0a, 0x06, 0x23, + 0x00, 0x00, 0x18, 0x79, 0x00, 0x8b, 0x43, 0x42, + 0x30, 0x34, 0x0a, 0x01, 0x43, 0x49, 0x52, 0x44, + 0x14, 0x2b, 0x5f, 0x43, 0x52, 0x53, 0x00, 0x7b, + 0x50, 0x49, 0x52, 0x44, 0x0a, 0x8f, 0x60, 0xa0, + 0x0e, 0x50, 0x49, 0x52, 0x56, 0x60, 0x79, 0x0a, + 0x01, 0x60, 0x43, 0x49, 0x52, 0x44, 0xa1, 0x08, + 0x70, 0x0a, 0x00, 0x43, 0x49, 0x52, 0x44, 0xa4, + 0x43, 0x42, 0x30, 0x34, 0x14, 0x0d, 0x5f, 0x44, + 0x49, 0x53, 0x00, 0x70, 0x0a, 0x80, 0x50, 0x49, + 0x52, 0x44, 0x14, 0x1b, 0x5f, 0x53, 0x52, 0x53, + 0x01, 0x8b, 0x68, 0x0a, 0x01, 0x53, 0x49, 0x52, + 0x44, 0x82, 0x53, 0x49, 0x52, 0x44, 0x60, 0x70, + 0x76, 0x60, 0x50, 0x49, 0x52, 0x44, 0x5b, 0x82, + 0x4f, 0x0a, 0x4c, 0x4e, 0x4b, 0x45, 0x08, 0x5f, + 0x48, 0x49, 0x44, 0x0c, 0x41, 0xd0, 0x0c, 0x0f, + 0x08, 0x5f, 0x55, 0x49, 0x44, 0x0a, 0x05, 0x14, + 
0x18, 0x5f, 0x53, 0x54, 0x41, 0x00, 0xa0, 0x0c, + 0x50, 0x49, 0x52, 0x56, 0x50, 0x49, 0x52, 0x45, + 0xa4, 0x0a, 0x0b, 0xa1, 0x04, 0xa4, 0x0a, 0x09, + 0x08, 0x5f, 0x50, 0x52, 0x53, 0x11, 0x09, 0x0a, + 0x06, 0x23, 0xf8, 0xde, 0x18, 0x79, 0x00, 0x08, + 0x43, 0x42, 0x30, 0x35, 0x11, 0x09, 0x0a, 0x06, + 0x23, 0x00, 0x00, 0x18, 0x79, 0x00, 0x8b, 0x43, + 0x42, 0x30, 0x35, 0x0a, 0x01, 0x43, 0x49, 0x52, + 0x45, 0x14, 0x2b, 0x5f, 0x43, 0x52, 0x53, 0x00, + 0x7b, 0x50, 0x49, 0x52, 0x45, 0x0a, 0x8f, 0x60, + 0xa0, 0x0e, 0x50, 0x49, 0x52, 0x56, 0x60, 0x79, + 0x0a, 0x01, 0x60, 0x43, 0x49, 0x52, 0x45, 0xa1, + 0x08, 0x70, 0x0a, 0x00, 0x43, 0x49, 0x52, 0x45, + 0xa4, 0x43, 0x42, 0x30, 0x35, 0x14, 0x0d, 0x5f, + 0x44, 0x49, 0x53, 0x00, 0x70, 0x0a, 0x80, 0x50, + 0x49, 0x52, 0x45, 0x14, 0x1b, 0x5f, 0x53, 0x52, + 0x53, 0x01, 0x8b, 0x68, 0x0a, 0x01, 0x53, 0x49, + 0x52, 0x45, 0x82, 0x53, 0x49, 0x52, 0x45, 0x60, + 0x70, 0x76, 0x60, 0x50, 0x49, 0x52, 0x45, 0x5b, + 0x82, 0x4f, 0x0a, 0x4c, 0x4e, 0x4b, 0x46, 0x08, + 0x5f, 0x48, 0x49, 0x44, 0x0c, 0x41, 0xd0, 0x0c, + 0x0f, 0x08, 0x5f, 0x55, 0x49, 0x44, 0x0a, 0x06, + 0x14, 0x18, 0x5f, 0x53, 0x54, 0x41, 0x00, 0xa0, + 0x0c, 0x50, 0x49, 0x52, 0x56, 0x50, 0x49, 0x52, + 0x46, 0xa4, 0x0a, 0x0b, 0xa1, 0x04, 0xa4, 0x0a, + 0x09, 0x08, 0x5f, 0x50, 0x52, 0x53, 0x11, 0x09, + 0x0a, 0x06, 0x23, 0xf8, 0xde, 0x18, 0x79, 0x00, + 0x08, 0x43, 0x42, 0x30, 0x36, 0x11, 0x09, 0x0a, + 0x06, 0x23, 0x00, 0x00, 0x18, 0x79, 0x00, 0x8b, + 0x43, 0x42, 0x30, 0x36, 0x0a, 0x01, 0x43, 0x49, + 0x52, 0x46, 0x14, 0x2b, 0x5f, 0x43, 0x52, 0x53, + 0x00, 0x7b, 0x50, 0x49, 0x52, 0x46, 0x0a, 0x8f, + 0x60, 0xa0, 0x0e, 0x50, 0x49, 0x52, 0x56, 0x60, + 0x79, 0x0a, 0x01, 0x60, 0x43, 0x49, 0x52, 0x46, + 0xa1, 0x08, 0x70, 0x0a, 0x00, 0x43, 0x49, 0x52, + 0x46, 0xa4, 0x43, 0x42, 0x30, 0x36, 0x14, 0x0d, + 0x5f, 0x44, 0x49, 0x53, 0x00, 0x70, 0x0a, 0x80, + 0x50, 0x49, 0x52, 0x46, 0x14, 0x1b, 0x5f, 0x53, + 0x52, 0x53, 0x01, 0x8b, 0x68, 0x0a, 0x01, 0x53, + 0x49, 0x52, 0x46, 0x82, 0x53, 0x49, 0x52, 0x46, + 0x60, 0x70, 0x76, 0x60, 0x50, 0x49, 0x52, 0x46, + 0x5b, 0x82, 0x4f, 0x0a, 0x4c, 0x4e, 0x4b, 0x47, + 0x08, 0x5f, 0x48, 0x49, 0x44, 0x0c, 0x41, 0xd0, + 0x0c, 0x0f, 0x08, 0x5f, 0x55, 0x49, 0x44, 0x0a, + 0x07, 0x14, 0x18, 0x5f, 0x53, 0x54, 0x41, 0x00, + 0xa0, 0x0c, 0x50, 0x49, 0x52, 0x56, 0x50, 0x49, + 0x52, 0x47, 0xa4, 0x0a, 0x0b, 0xa1, 0x04, 0xa4, + 0x0a, 0x09, 0x08, 0x5f, 0x50, 0x52, 0x53, 0x11, + 0x09, 0x0a, 0x06, 0x23, 0xf8, 0xde, 0x18, 0x79, + 0x00, 0x08, 0x43, 0x42, 0x30, 0x37, 0x11, 0x09, + 0x0a, 0x06, 0x23, 0x00, 0x00, 0x18, 0x79, 0x00, + 0x8b, 0x43, 0x42, 0x30, 0x37, 0x0a, 0x01, 0x43, + 0x49, 0x52, 0x47, 0x14, 0x2b, 0x5f, 0x43, 0x52, + 0x53, 0x00, 0x7b, 0x50, 0x49, 0x52, 0x47, 0x0a, + 0x8f, 0x60, 0xa0, 0x0e, 0x50, 0x49, 0x52, 0x56, + 0x60, 0x79, 0x0a, 0x01, 0x60, 0x43, 0x49, 0x52, + 0x47, 0xa1, 0x08, 0x70, 0x0a, 0x00, 0x43, 0x49, + 0x52, 0x47, 0xa4, 0x43, 0x42, 0x30, 0x37, 0x14, + 0x0d, 0x5f, 0x44, 0x49, 0x53, 0x00, 0x70, 0x0a, + 0x80, 0x50, 0x49, 0x52, 0x47, 0x14, 0x1b, 0x5f, + 0x53, 0x52, 0x53, 0x01, 0x8b, 0x68, 0x0a, 0x01, + 0x53, 0x49, 0x52, 0x47, 0x82, 0x53, 0x49, 0x52, + 0x47, 0x60, 0x70, 0x76, 0x60, 0x50, 0x49, 0x52, + 0x47, 0x5b, 0x82, 0x4f, 0x0a, 0x4c, 0x4e, 0x4b, + 0x48, 0x08, 0x5f, 0x48, 0x49, 0x44, 0x0c, 0x41, + 0xd0, 0x0c, 0x0f, 0x08, 0x5f, 0x55, 0x49, 0x44, + 0x0a, 0x08, 0x14, 0x18, 0x5f, 0x53, 0x54, 0x41, + 0x00, 0xa0, 0x0c, 0x50, 0x49, 0x52, 0x56, 0x50, + 0x49, 0x52, 0x48, 0xa4, 0x0a, 0x0b, 0xa1, 0x04, + 0xa4, 0x0a, 0x09, 0x08, 0x5f, 0x50, 0x52, 0x53, + 0x11, 0x09, 0x0a, 0x06, 0x23, 0xf8, 0xde, 0x18, + 
0x79, 0x00, 0x08, 0x43, 0x42, 0x30, 0x38, 0x11, + 0x09, 0x0a, 0x06, 0x23, 0x00, 0x00, 0x18, 0x79, + 0x00, 0x8b, 0x43, 0x42, 0x30, 0x38, 0x0a, 0x01, + 0x43, 0x49, 0x52, 0x48, 0x14, 0x2b, 0x5f, 0x43, + 0x52, 0x53, 0x00, 0x7b, 0x50, 0x49, 0x52, 0x48, + 0x0a, 0x8f, 0x60, 0xa0, 0x0e, 0x50, 0x49, 0x52, + 0x56, 0x60, 0x79, 0x0a, 0x01, 0x60, 0x43, 0x49, + 0x52, 0x48, 0xa1, 0x08, 0x70, 0x0a, 0x00, 0x43, + 0x49, 0x52, 0x48, 0xa4, 0x43, 0x42, 0x30, 0x38, + 0x14, 0x0d, 0x5f, 0x44, 0x49, 0x53, 0x00, 0x70, + 0x0a, 0x80, 0x50, 0x49, 0x52, 0x48, 0x14, 0x1b, + 0x5f, 0x53, 0x52, 0x53, 0x01, 0x8b, 0x68, 0x0a, + 0x01, 0x53, 0x49, 0x52, 0x48, 0x82, 0x53, 0x49, + 0x52, 0x48, 0x60, 0x70, 0x76, 0x60, 0x50, 0x49, + 0x52, 0x48, 0x5b, 0x82, 0x48, 0x07, 0x53, 0x49, + 0x4f, 0x5f, 0x08, 0x5f, 0x48, 0x49, 0x44, 0x0c, + 0x41, 0xd0, 0x0c, 0x02, 0x08, 0x5f, 0x43, 0x52, + 0x53, 0x11, 0x42, 0x06, 0x0a, 0x5e, 0x47, 0x01, + 0x60, 0x00, 0x60, 0x00, 0x01, 0x01, 0x47, 0x01, + 0x64, 0x00, 0x64, 0x00, 0x01, 0x01, 0x47, 0x01, + 0x20, 0x02, 0x20, 0x02, 0x01, 0x04, 0x47, 0x01, + 0x24, 0x02, 0x24, 0x02, 0x01, 0x04, 0x86, 0x09, + 0x00, 0x01, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x00, + 0x00, 0x10, 0x47, 0x01, 0xd0, 0x04, 0xd0, 0x04, + 0x01, 0x02, 0x47, 0x01, 0x61, 0x00, 0x61, 0x00, + 0x01, 0x01, 0x47, 0x01, 0x00, 0x04, 0x00, 0x04, + 0x01, 0x08, 0x47, 0x01, 0xb2, 0x00, 0xb2, 0x00, + 0x01, 0x01, 0x47, 0x01, 0x84, 0x00, 0x84, 0x00, + 0x01, 0x01, 0x47, 0x01, 0x72, 0x00, 0x72, 0x00, + 0x01, 0x06, 0x79, 0x00, 0x5b, 0x82, 0x2c, 0x43, + 0x4f, 0x4d, 0x31, 0x08, 0x5f, 0x48, 0x49, 0x44, + 0x0c, 0x41, 0xd0, 0x05, 0x01, 0x08, 0x5f, 0x55, + 0x49, 0x44, 0x0a, 0x01, 0x08, 0x5f, 0x43, 0x52, + 0x53, 0x11, 0x10, 0x0a, 0x0d, 0x47, 0x01, 0xf8, + 0x03, 0xf8, 0x03, 0x01, 0x08, 0x22, 0x10, 0x00, + 0x79, 0x00, 0x5b, 0x82, 0x2c, 0x43, 0x4f, 0x4d, + 0x32, 0x08, 0x5f, 0x48, 0x49, 0x44, 0x0c, 0x41, + 0xd0, 0x05, 0x01, 0x08, 0x5f, 0x55, 0x49, 0x44, + 0x0a, 0x02, 0x08, 0x5f, 0x43, 0x52, 0x53, 0x11, + 0x10, 0x0a, 0x0d, 0x47, 0x01, 0xf8, 0x02, 0xf8, + 0x02, 0x01, 0x08, 0x22, 0x08, 0x00, 0x79, 0x00, + 0x5b, 0x82, 0x25, 0x52, 0x54, 0x43, 0x5f, 0x08, + 0x5f, 0x48, 0x49, 0x44, 0x0c, 0x41, 0xd0, 0x0b, + 0x00, 0x08, 0x5f, 0x43, 0x52, 0x53, 0x11, 0x10, + 0x0a, 0x0d, 0x47, 0x01, 0x70, 0x00, 0x70, 0x00, + 0x01, 0x02, 0x22, 0x00, 0x01, 0x79, 0x00, 0x5b, + 0x82, 0x2b, 0x50, 0x49, 0x43, 0x5f, 0x08, 0x5f, + 0x48, 0x49, 0x44, 0x0b, 0x41, 0xd0, 0x08, 0x5f, + 0x43, 0x52, 0x53, 0x11, 0x18, 0x0a, 0x15, 0x47, + 0x01, 0x20, 0x00, 0x20, 0x00, 0x01, 0x02, 0x47, + 0x01, 0xa0, 0x00, 0xa0, 0x00, 0x01, 0x02, 0x22, + 0x04, 0x00, 0x79, 0x00, 0x5b, 0x82, 0x25, 0x54, + 0x49, 0x4d, 0x52, 0x08, 0x5f, 0x48, 0x49, 0x44, + 0x0c, 0x41, 0xd0, 0x01, 0x00, 0x08, 0x5f, 0x43, + 0x52, 0x53, 0x11, 0x10, 0x0a, 0x0d, 0x47, 0x01, + 0x40, 0x00, 0x40, 0x00, 0x01, 0x04, 0x22, 0x01, + 0x00, 0x79, 0x00, 0x10, 0x39, 0x2e, 0x5f, 0x53, + 0x42, 0x5f, 0x50, 0x43, 0x30, 0x30, 0x5b, 0x82, + 0x2d, 0x48, 0x50, 0x45, 0x54, 0x08, 0x5f, 0x48, + 0x49, 0x44, 0x0c, 0x41, 0xd0, 0x01, 0x03, 0x08, + 0x5f, 0x55, 0x49, 0x44, 0x0a, 0x00, 0x08, 0x5f, + 0x43, 0x52, 0x53, 0x11, 0x11, 0x0a, 0x0e, 0x86, + 0x09, 0x00, 0x01, 0x00, 0x00, 0xd0, 0xfe, 0x00, + 0x04, 0x00, 0x00, 0x79 + }; + + dsdt = (void *) (((uintptr_t) tb) + DSDT_OFFSET); + /* copy DSDT template to guest memory */ + memcpy(dsdt, dsdt_tmpl, 2604); + + pci_write_dsdt(); + + /* write checksum */ + acpitbl_write8(dsdt, 0x9, acpitbl_checksum(dsdt, 2604)); +} + +int +acpi_build(int ncpu) +{ + int err; + + acpi_ncpu = ncpu; + tb = paddr_guest2host(XHYVE_ACPI_BASE, XHYVE_ACPI_SIZE); + if (tb == 
NULL) { + return (EFAULT); + } + + err = xh_vm_get_hpet_capabilities(&hpet_capabilities); + if (err != 0) { + return (err); + } + + acpitbl_build_rdsp(); + acpitbl_build_rsdt(); + acpitbl_build_xsdt(); + acpitbl_build_madt(); + acpitbl_build_fadt(); + acpitbl_build_hpet(); + acpitbl_build_mcfg(); + acpitbl_build_facs(); + acpitbl_build_dsdt(); + + return 0; +} diff --git a/vendor/github.com/hooklift/xhyve/atkbdc.c b/vendor/github.com/hooklift/xhyve/atkbdc.c new file mode 100644 index 0000000..e8df7f9 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/atkbdc.c @@ -0,0 +1,81 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define KBD_DATA_PORT 0x60 +#define KBD_STS_CTL_PORT 0x64 +#define KBD_SYS_FLAG 0x4 +#define KBDC_RESET 0xfe + +static int +atkbdc_data_handler(UNUSED int vcpu, UNUSED int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + if (bytes != 1) + return (-1); + + *eax = 0; + + return (0); +} + +static int +atkbdc_sts_ctl_handler(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + int error, retval; + + if (bytes != 1) + return (-1); + + retval = 0; + if (in) { + *eax = KBD_SYS_FLAG; /* system passed POST */ + } else { + switch (*eax) { + case KBDC_RESET: /* Pulse "reset" line. */ + error = xh_vm_suspend(VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); + break; + } + } + + return (retval); +} + +INOUT_PORT(atkdbc, KBD_DATA_PORT, IOPORT_F_INOUT, atkbdc_data_handler); +SYSRES_IO(KBD_DATA_PORT, 1); +INOUT_PORT(atkbdc, KBD_STS_CTL_PORT, IOPORT_F_INOUT, atkbdc_sts_ctl_handler); +SYSRES_IO(KBD_STS_CTL_PORT, 1); diff --git a/vendor/github.com/hooklift/xhyve/block_if.c b/vendor/github.com/hooklift/xhyve/block_if.c new file mode 100644 index 0000000..4d5df95 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/block_if.c @@ -0,0 +1,843 @@ +/*- + * Copyright (c) 2013 Peter Grehan + * Copyright (c) 2015 xhyve developers + * All rights reserved. 
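A note on the ACPI table build that ends above: every ACPI table carries an 8-bit checksum at offset 9 of its standard header, chosen so that all bytes of the table sum to zero modulo 256. That is what `acpitbl_write8(dsdt, 0x9, acpitbl_checksum(dsdt, 2604))` patches in after `pci_write_dsdt()` has modified the template. `acpitbl_checksum()` itself is not part of this hunk; a minimal sketch of the standard rule, assuming the header's checksum byte is zeroed (or otherwise excluded) before summing, would be:

```c
#include <stddef.h>
#include <stdint.h>

/* Sketch only, not the vendored implementation: return the byte that
 * makes all `len` bytes of `table` sum to zero modulo 256. The caller
 * is assumed to zero the header's checksum field (offset 9) first. */
static uint8_t
acpi_checksum_sketch(const void *table, size_t len)
{
	const uint8_t *p = table;
	uint8_t sum = 0;
	size_t i;

	for (i = 0; i < len; i++)
		sum = (uint8_t) (sum + p[i]);

	return (uint8_t) (0x100 - sum);
}
```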
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define BLOCKIF_SIG 0xb109b109 +/* xhyve: FIXME + * + * // #define BLOCKIF_NUMTHR 8 + * + * OS X does not support preadv/pwritev, we need to serialize reads and writes + * for the time being until we find a better solution. + */ +#define BLOCKIF_NUMTHR 1 + +#define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR) + +enum blockop { + BOP_READ, + BOP_WRITE, + BOP_FLUSH, + BOP_DELETE +}; + +enum blockstat { + BST_FREE, + BST_BLOCK, + BST_PEND, + BST_BUSY, + BST_DONE +}; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct blockif_elem { + TAILQ_ENTRY(blockif_elem) be_link; + struct blockif_req *be_req; + enum blockop be_op; + enum blockstat be_status; + pthread_t be_tid; + off_t be_block; +}; + +struct blockif_ctxt { + int bc_magic; + int bc_fd; + int bc_ischr; + int bc_isgeom; + int bc_candelete; + int bc_rdonly; + off_t bc_size; + int bc_sectsz; + int bc_psectsz; + int bc_psectoff; + int bc_closing; + pthread_t bc_btid[BLOCKIF_NUMTHR]; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; + /* Request elements and free/pending/busy queues */ + TAILQ_HEAD(, blockif_elem) bc_freeq; + TAILQ_HEAD(, blockif_elem) bc_pendq; + TAILQ_HEAD(, blockif_elem) bc_busyq; + struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; +}; + +static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; + +struct blockif_sig_elem { + pthread_mutex_t bse_mtx; + pthread_cond_t bse_cond; + int bse_pending; + struct blockif_sig_elem *bse_next; +}; + +static struct blockif_sig_elem *blockif_bse_head; + +#pragma clang diagnostic pop + +static ssize_t +preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + off_t res; + + res = lseek(fd, offset, SEEK_SET); + assert(res == offset); + return readv(fd, iov, iovcnt); +} + +static ssize_t +pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + off_t res; + + res = lseek(fd, offset, SEEK_SET); + assert(res == offset); + return writev(fd, iov, iovcnt); +} + +static int +blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + struct blockif_elem 
*be, *tbe; + off_t off; + int i; + + be = TAILQ_FIRST(&bc->bc_freeq); + assert(be != NULL); + assert(be->be_status == BST_FREE); + TAILQ_REMOVE(&bc->bc_freeq, be, be_link); + be->be_req = breq; + be->be_op = op; + switch (op) { + case BOP_READ: + case BOP_WRITE: + case BOP_DELETE: + off = breq->br_offset; + for (i = 0; i < breq->br_iovcnt; i++) + off += breq->br_iov[i].iov_len; + break; + case BOP_FLUSH: + off = OFF_MAX; + } + be->be_block = off; + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + if (tbe == NULL) { + TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + } + if (tbe == NULL) + be->be_status = BST_PEND; + else + be->be_status = BST_BLOCK; + TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); + return (be->be_status == BST_PEND); +} + +static int +blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) +{ + struct blockif_elem *be; + + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_status == BST_PEND) + break; + assert(be->be_status == BST_BLOCK); + } + if (be == NULL) + return (0); + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + be->be_status = BST_BUSY; + be->be_tid = t; + TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); + *bep = be; + return (1); +} + +static void +blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) +{ + struct blockif_elem *tbe; + + if (be->be_status == BST_DONE || be->be_status == BST_BUSY) + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + else + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_req->br_offset == be->be_block) + tbe->be_status = BST_PEND; + } + be->be_tid = 0; + be->be_status = BST_FREE; + be->be_req = NULL; + TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); +} + +static void +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) +{ + struct blockif_req *br; + // off_t arg[2]; + ssize_t clen, len, off, boff, voff; + int i, err; + + br = be->be_req; + if (br->br_iovcnt <= 1) + buf = NULL; + err = 0; + switch (be->be_op) { + case BOP_READ: + if (buf == NULL) { + if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + if (pread(bc->bc_fd, buf, ((size_t) len), br->br_offset + off) < 0) + { + err = errno; + break; + } + boff = 0; + do { + clen = MIN((len - boff), + (((ssize_t) br->br_iov[i].iov_len) - voff)); + memcpy(((void *) (((uintptr_t) br->br_iov[i].iov_base) + + ((size_t) voff))), buf + boff, clen); + if (clen < (((ssize_t) br->br_iov[i].iov_len) - voff)) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } + break; + case BOP_WRITE: + if (bc->bc_rdonly) { + err = EROFS; + break; + } + if (buf == NULL) { + if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN((len - boff), + (((ssize_t) br->br_iov[i].iov_len) - voff)); + memcpy((buf + boff), + ((void *) (((uintptr_t) br->br_iov[i].iov_base) + + ((size_t) voff))), clen); + if (clen < (((ssize_t) br->br_iov[i].iov_len) - voff)) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + if 
(pwrite(bc->bc_fd, buf, ((size_t) len), br->br_offset + + off) < 0) { + err = errno; + break; + } + off += len; + br->br_resid -= len; + } + break; + case BOP_FLUSH: + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DKIOCSYNCHRONIZECACHE)) + err = errno; + } else if (fsync(bc->bc_fd)) + err = errno; + break; + case BOP_DELETE: + if (!bc->bc_candelete) { + err = EOPNOTSUPP; + // } else if (bc->bc_rdonly) { + // err = EROFS; + // } else if (bc->bc_ischr) { + // arg[0] = br->br_offset; + // arg[1] = br->br_resid; + // if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) { + // err = errno; + // } else { + // br->br_resid = 0; + // } + } else { + err = EOPNOTSUPP; + } + break; + } + + be->be_status = BST_DONE; + + (*br->br_callback)(br, err); +} + +static void * +blockif_thr(void *arg) +{ + struct blockif_ctxt *bc; + struct blockif_elem *be; + pthread_t t; + uint8_t *buf; + + bc = arg; + if (bc->bc_isgeom) + buf = malloc(MAXPHYS); + else + buf = NULL; + t = pthread_self(); + + pthread_mutex_lock(&bc->bc_mtx); + for (;;) { + while (blockif_dequeue(bc, t, &be)) { + pthread_mutex_unlock(&bc->bc_mtx); + blockif_proc(bc, be, buf); + pthread_mutex_lock(&bc->bc_mtx); + blockif_complete(bc, be); + } + /* Check ctxt status here to see if exit requested */ + if (bc->bc_closing) + break; + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); + } + pthread_mutex_unlock(&bc->bc_mtx); + + if (buf) + free(buf); + pthread_exit(NULL); + return (NULL); +} + +static void +blockif_sigcont_handler(UNUSED int signal, UNUSED enum ev_type type, + UNUSED void *arg) +{ + struct blockif_sig_elem *bse; + + for (;;) { + /* + * Process the entire list even if not intended for + * this thread. + */ + do { + bse = blockif_bse_head; + if (bse == NULL) + return; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)bse, + (uintptr_t)bse->bse_next)); + + pthread_mutex_lock(&bse->bse_mtx); + bse->bse_pending = 0; + pthread_cond_signal(&bse->bse_cond); + pthread_mutex_unlock(&bse->bse_mtx); + } +} + +static void +blockif_init(void) +{ + mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); + (void) signal(SIGCONT, SIG_IGN); +} + +struct blockif_ctxt * +blockif_open(const char *optstr, UNUSED const char *ident) +{ + // char name[MAXPATHLEN]; + char *nopt, *xopts, *cp; + struct blockif_ctxt *bc; + struct stat sbuf; + // struct diocgattr_arg arg; + off_t size, psectsz, psectoff; + int extra, fd, i, sectsz; + int nocache, sync, ro, candelete, geom, ssopt, pssopt; + + pthread_once(&blockif_once, blockif_init); + + fd = -1; + ssopt = 0; + nocache = 0; + sync = 0; + ro = 0; + + pssopt = 0; + /* + * The first element in the optstring is always a pathname. + * Optional elements follow + */ + nopt = xopts = strdup(optstr); + while (xopts != NULL) { + cp = strsep(&xopts, ","); + if (cp == nopt) /* file or device pathname */ + continue; + else if (!strcmp(cp, "nocache")) + nocache = 1; + else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) + sync = 1; + else if (!strcmp(cp, "ro")) + ro = 1; + else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) + ; + else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) + pssopt = ssopt; + else { + fprintf(stderr, "Invalid device option \"%s\"\n", cp); + goto err; + } + } + + extra = 0; + if (nocache) { + perror("xhyve: nocache support unimplemented"); + goto err; + // extra |= O_DIRECT; + } + if (sync) + extra |= O_SYNC; + + fd = open(nopt, (ro ? 
O_RDONLY : O_RDWR) | extra); + if (fd < 0 && !ro) { + /* Attempt a r/w fail with a r/o open */ + fd = open(nopt, O_RDONLY | extra); + ro = 1; + } + + if (fd < 0) { + perror("Could not open backing file"); + goto err; + } + + if (fstat(fd, &sbuf) < 0) { + perror("Could not stat backing file"); + goto err; + } + + /* + * Deal with raw devices + */ + size = sbuf.st_size; + sectsz = DEV_BSIZE; + psectsz = psectoff = 0; + candelete = geom = 0; + if (S_ISCHR(sbuf.st_mode)) { + perror("xhyve: raw device support unimplemented"); + goto err; + // if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + // ioctl(fd, DIOCGSECTORSIZE, §sz)) + // { + // perror("Could not fetch dev blk/sector size"); + // goto err; + // } + // assert(size != 0); + // assert(sectsz != 0); + // if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) + // ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); + // strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); + // arg.len = sizeof(arg.value.i); + // if (ioctl(fd, DIOCGATTR, &arg) == 0) + // candelete = arg.value.i; + // if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) + // geom = 1; + } else + psectsz = sbuf.st_blksize; + + if (ssopt != 0) { + if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || + ssopt > pssopt) { + fprintf(stderr, "Invalid sector size %d/%d\n", + ssopt, pssopt); + goto err; + } + + // /* + // * Some backend drivers (e.g. cd0, ada0) require that the I/O + // * size be a multiple of the device's sector size. + // * + // * Validate that the emulated sector size complies with this + // * requirement. + // */ + // if (S_ISCHR(sbuf.st_mode)) { + // if (ssopt < sectsz || (ssopt % sectsz) != 0) { + // fprintf(stderr, "Sector size %d incompatible " + // "with underlying device sector size %d\n", + // ssopt, sectsz); + // goto err; + // } + // } + + sectsz = ssopt; + psectsz = pssopt; + psectoff = 0; + } + + bc = calloc(1, sizeof(struct blockif_ctxt)); + if (bc == NULL) { + perror("calloc"); + goto err; + } + + bc->bc_magic = (int) BLOCKIF_SIG; + bc->bc_fd = fd; + bc->bc_ischr = S_ISCHR(sbuf.st_mode); + bc->bc_isgeom = geom; + bc->bc_candelete = candelete; + bc->bc_rdonly = ro; + bc->bc_size = size; + bc->bc_sectsz = sectsz; + bc->bc_psectsz = (int) psectsz; + bc->bc_psectoff = (int) psectoff; + pthread_mutex_init(&bc->bc_mtx, NULL); + pthread_cond_init(&bc->bc_cond, NULL); + TAILQ_INIT(&bc->bc_freeq); + TAILQ_INIT(&bc->bc_pendq); + TAILQ_INIT(&bc->bc_busyq); + for (i = 0; i < BLOCKIF_MAXREQ; i++) { + bc->bc_reqs[i].be_status = BST_FREE; + TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); + } + + for (i = 0; i < BLOCKIF_NUMTHR; i++) { + pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); + } + + return (bc); +err: + if (fd >= 0) + close(fd); + return (NULL); +} + +static int +blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + int err; + + err = 0; + + pthread_mutex_lock(&bc->bc_mtx); + if (!TAILQ_EMPTY(&bc->bc_freeq)) { + /* + * Enqueue and inform the block i/o thread + * that there is work available + */ + if (blockif_enqueue(bc, breq, op)) + pthread_cond_signal(&bc->bc_cond); + } else { + /* + * Callers are not allowed to enqueue more than + * the specified blockif queue limit. Return an + * error to indicate that the queue length has been + * exceeded. 
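+	 * (The element pool holds BLOCKIF_MAXREQ = 64 + BLOCKIF_NUMTHR
+	 * requests; blockif_queuesz() below advertises one fewer, so a
+	 * device model that honors it never hits this path.)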
+ */ + err = E2BIG; + } + pthread_mutex_unlock(&bc->bc_mtx); + + return (err); +} + +int +blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + return (blockif_request(bc, breq, BOP_READ)); +} + +int +blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + return (blockif_request(bc, breq, BOP_WRITE)); +} + +int +blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + return (blockif_request(bc, breq, BOP_FLUSH)); +} + +int +blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + return (blockif_request(bc, breq, BOP_DELETE)); +} + +int +blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + struct blockif_elem *be; + + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + + pthread_mutex_lock(&bc->bc_mtx); + /* + * Check pending requests. + */ + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_req == breq) + break; + } + if (be != NULL) { + /* + * Found it. + */ + blockif_complete(bc, be); + pthread_mutex_unlock(&bc->bc_mtx); + + return (0); + } + + /* + * Check in-flight requests. + */ + TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { + if (be->be_req == breq) + break; + } + if (be == NULL) { + /* + * Didn't find it. + */ + pthread_mutex_unlock(&bc->bc_mtx); + return (EINVAL); + } + + /* + * Interrupt the processing thread to force it return + * prematurely via it's normal callback path. + */ + while (be->be_status == BST_BUSY) { + struct blockif_sig_elem bse, *old_head; + + pthread_mutex_init(&bse.bse_mtx, NULL); + pthread_cond_init(&bse.bse_cond, NULL); + + bse.bse_pending = 1; + + do { + old_head = blockif_bse_head; + bse.bse_next = old_head; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)old_head, + (uintptr_t)&bse)); + + pthread_kill(be->be_tid, SIGCONT); + + pthread_mutex_lock(&bse.bse_mtx); + while (bse.bse_pending) + pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); + pthread_mutex_unlock(&bse.bse_mtx); + } + + pthread_mutex_unlock(&bc->bc_mtx); + + /* + * The processing thread has been interrupted. Since it's not + * clear if the callback has been invoked yet, return EBUSY. + */ + return (EBUSY); +} + +int +blockif_close(struct blockif_ctxt *bc) +{ + void *jval; + int err, i; + + err = 0; + + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + + /* + * Stop the block i/o thread + */ + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_closing = 1; + pthread_mutex_unlock(&bc->bc_mtx); + pthread_cond_broadcast(&bc->bc_cond); + for (i = 0; i < BLOCKIF_NUMTHR; i++) + pthread_join(bc->bc_btid[i], &jval); + + /* XXX Cancel queued i/o's ??? */ + + /* + * Release resources + */ + bc->bc_magic = 0; + close(bc->bc_fd); + free(bc); + + return (0); +} + +/* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. 
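+ * For example, a 1 GiB image (2,097,152 512-byte sectors) falls
+ * through to the final 63 sectors-per-track case below and is
+ * reported as C/H/S = 2080/16/63.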
+ */ +void +blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +{ + off_t sectors; /* total sectors of the block dev */ + off_t hcyl; /* cylinders times heads */ + uint16_t secpt; /* sectors per track */ + uint8_t heads; + + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + + sectors = bc->bc_size / bc->bc_sectsz; + + /* Clamp the size to the largest possible with CHS */ + if (sectors > 65535LL*16*255) + sectors = 65535LL*16*255; + + if (sectors >= 65536LL*16*63) { + secpt = 255; + heads = 16; + hcyl = sectors / secpt; + } else { + secpt = 17; + hcyl = sectors / secpt; + heads = (uint8_t) ((hcyl + 1023) / 1024); + + if (heads < 4) + heads = 4; + + if (hcyl >= (heads * 1024) || heads > 16) { + secpt = 31; + heads = 16; + hcyl = sectors / secpt; + } + if (hcyl >= (heads * 1024)) { + secpt = 63; + heads = 16; + hcyl = sectors / secpt; + } + } + + *c = (uint16_t) (hcyl / heads); + *h = heads; + *s = (uint8_t) secpt; +} + +/* + * Accessors + */ +off_t +blockif_size(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + return (bc->bc_size); +} + +int +blockif_sectsz(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + return (bc->bc_sectsz); +} + +void +blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + *size = bc->bc_psectsz; + *off = bc->bc_psectoff; +} + +int +blockif_queuesz(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + return (BLOCKIF_MAXREQ - 1); +} + +int +blockif_is_ro(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + return (bc->bc_rdonly); +} + +int +blockif_candelete(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == ((int) BLOCKIF_SIG)); + return (bc->bc_candelete); +} diff --git a/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/initrd.gz b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/initrd.gz new file mode 100644 index 0000000..c427032 Binary files /dev/null and b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/initrd.gz differ diff --git a/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/tinycore.txt b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/tinycore.txt new file mode 100644 index 0000000..e04ded1 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/tinycore.txt @@ -0,0 +1,9 @@ +These are binaries from + http://tinycorelinux.net +with the following patch applied: + +mkdir initrd +( cd initrd ; zcat ../initrd.gz | sudo cpio -idm ) +sudo sed -i '/^# ttyS0$/s#^..##' initrd/etc/securetty +sudo sed -i '/^tty1:/s#tty1#ttyS0#g' initrd/etc/inittab +( cd initrd ; find | sudo cpio -o -H newc ) | gzip -c > initrd.gz diff --git a/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/userboot.so b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/userboot.so new file mode 100644 index 0000000..75833f8 Binary files /dev/null and b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/userboot.so differ diff --git a/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/userboot.txt b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/userboot.txt new file mode 100644 index 0000000..20aa597 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/userboot.txt @@ -0,0 +1,8 @@ +userboot.so is the FreeBSD user-mode bootloader + +So, this is a bit horrible but it works: + + - userboot is compiled on FreeBSD with '-target x86_64-apple-darwin14' CFLAGS + - same for the dependencies of userboot (stand, ficl, 
zfs) + - you have to use the MachO linker set header (include/xhyve/support/linker_set.h) + - the resulting object files are linked on OS X with 'clang -dead_strip -shared -o userboot.so *.o *.So' diff --git a/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/vmlinuz b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/vmlinuz new file mode 100644 index 0000000..cb74d02 Binary files /dev/null and b/vendor/github.com/hooklift/xhyve/cmd/xhyve/imgs/vmlinuz differ diff --git a/vendor/github.com/hooklift/xhyve/cmd/xhyve/main.go b/vendor/github.com/hooklift/xhyve/cmd/xhyve/main.go new file mode 100644 index 0000000..dc2a70c --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/cmd/xhyve/main.go @@ -0,0 +1,38 @@ +package main + +import ( + "fmt" + "os" + "os/signal" + + "github.com/hooklift/xhyve" +) + +func init() { + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt, os.Kill) + go func() { + s := <-c + fmt.Printf("Signal %s received\n", s) + }() +} + +func main() { + done := make(chan bool) + ptyCh := make(chan string) + + go func() { + if err := xhyve.Run(os.Args, ptyCh); err != nil { + fmt.Println(err) + } + done <- true + }() + + fmt.Printf("Waiting on a pseudo-terminal to be ready... ") + pty := <-ptyCh + fmt.Printf("done\n") + fmt.Printf("Hook up your terminal emulator to %s in order to connect to your VM\n", pty) + + <-done + fmt.Println("Hypervisor goroutine finished!") +} diff --git a/vendor/github.com/hooklift/xhyve/consport.c b/vendor/github.com/hooklift/xhyve/consport.c new file mode 100644 index 0000000..f5d0b73 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/consport.c @@ -0,0 +1,152 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
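The `main.go` wrapper above shows the intended flow: `xhyve.Run()` executes on its own goroutine and reports the pseudo-terminal path on `ptyCh` once the VM console is ready. "Hook up your terminal emulator" then just means opening that device. A bare-bones reader, with a made-up device path standing in for whatever xhyve actually prints, could look like:

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Hypothetical path; substitute the one xhyve prints at startup. */
	const char *pty = "/dev/ttys003";
	char buf[256];
	ssize_t n;
	int fd;

	if ((fd = open(pty, O_RDWR)) < 0) {
		perror("open pty");
		return 1;
	}
	/* Relay whatever the VM writes to its console to stdout. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		(void) write(STDOUT_FILENO, buf, (size_t) n);
	close(fd);
	return 0;
}
```

In practice `screen <pty-path>` does the same job interactively.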
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BVM_CONSOLE_PORT 0x220 +#define BVM_CONS_SIG ('b' << 8 | 'v') + +static struct termios tio_orig, tio_new; + +static void +ttyclose(void) +{ + tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); +} + +static void +ttyopen(void) +{ + tcgetattr(STDIN_FILENO, &tio_orig); + + cfmakeraw(&tio_new); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); + + atexit(ttyclose); +} + +static bool +tty_char_available(void) +{ + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + return (true); + } else { + return (false); + } +} + +static int +ttyread(void) +{ + char rb; + + if (tty_char_available()) { + read(STDIN_FILENO, &rb, 1); + return (rb & 0xff); + } else { + return (-1); + } +} + +static void +ttywrite(unsigned char wb) +{ + (void) write(STDOUT_FILENO, &wb, 1); +} + +static int +console_handler(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + static int opened; + + if (bytes == 2 && in) { + *eax = BVM_CONS_SIG; + return (0); + } + + /* + * Guests might probe this port to look for old ISA devices + * using single-byte reads. Return 0xff for those. + */ + if (bytes == 1 && in) { + *eax = 0xff; + return (0); + } + + if (bytes != 4) + return (-1); + + if (!opened) { + ttyopen(); + opened = 1; + } + + if (in) + *eax = (uint32_t) ttyread(); + else + ttywrite((unsigned char) *eax); + + return (0); +} + +SYSRES_IO(BVM_CONSOLE_PORT, 4); + +static struct inout_port consport = { + "bvmcons", + BVM_CONSOLE_PORT, + 1, + IOPORT_F_INOUT, + console_handler, + NULL +}; + +void +init_bvmcons(void) +{ + register_inout(&consport); +} diff --git a/vendor/github.com/hooklift/xhyve/dbgport.c b/vendor/github.com/hooklift/xhyve/dbgport.c new file mode 100644 index 0000000..54c968d --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/dbgport.c @@ -0,0 +1,142 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
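consport.c above wires the legacy `bvmcons` console to I/O port 0x220. The protocol, as implemented by `console_handler()`: a 2-byte read returns `BVM_CONS_SIG` so a guest can detect the device, a 1-byte read returns 0xff to satisfy old ISA probes, and 4-byte accesses carry the console data, with `(uint32_t)-1` signalling "no input pending". A guest-side sketch of that handshake, assuming ring-0 x86 code (this is not part of xhyve), might read:

```c
#include <stdint.h>

#define BVM_CONSOLE_PORT 0x220
#define BVM_CONS_SIG ('b' << 8 | 'v')

/* x86 port-I/O primitives; only meaningful inside a ring-0 guest. */
static inline uint16_t inw(uint16_t port)
{
	uint16_t v;
	__asm__ volatile ("inw %1, %0" : "=a" (v) : "Nd" (port));
	return v;
}

static inline uint32_t inl(uint16_t port)
{
	uint32_t v;
	__asm__ volatile ("inl %1, %0" : "=a" (v) : "Nd" (port));
	return v;
}

static inline void outl(uint16_t port, uint32_t v)
{
	__asm__ volatile ("outl %0, %1" : : "a" (v), "Nd" (port));
}

/* 2-byte read: console_handler() answers with the signature. */
static int bvmcons_present(void)
{
	return inw(BVM_CONSOLE_PORT) == BVM_CONS_SIG;
}

/* 4-byte write: the low byte is sent to the host tty. */
static void bvmcons_putc(char c)
{
	outl(BVM_CONSOLE_PORT, (uint8_t) c);
}

/* 4-byte read: low byte of input, or -1 when nothing is pending. */
static int bvmcons_getc(void)
{
	uint32_t v = inl(BVM_CONSOLE_PORT);
	return (v == (uint32_t) -1) ? -1 : (int) (v & 0xff);
}
```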
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BVM_DBG_PORT 0x224 +#define BVM_DBG_SIG ('B' << 8 | 'V') + +static int listen_fd, conn_fd; + +static struct sockaddr_in sin; + +static int +dbg_handler(UNUSED int vcpu, int in, UNUSED int port, int bytes, uint32_t *eax, + UNUSED void *arg) +{ + char ch; + int nwritten, nread, printonce; + + if (bytes == 2 && in) { + *eax = BVM_DBG_SIG; + return (0); + } + + if (bytes != 4) + return (-1); + +again: + printonce = 0; + while (conn_fd < 0) { + if (!printonce) { + printf("Waiting for connection from gdb\r\n"); + printonce = 1; + } + conn_fd = accept(listen_fd, NULL, NULL); + if (conn_fd >= 0) + fcntl(conn_fd, F_SETFL, O_NONBLOCK); + else if (errno != EINTR) + perror("accept"); + } + + if (in) { + nread = (int) read(conn_fd, &ch, 1); + if (nread == -1 && errno == EAGAIN) + *eax = (uint32_t) (-1); + else if (nread == 1) + *eax = (uint32_t) ch; + else { + close(conn_fd); + conn_fd = -1; + goto again; + } + } else { + ch = (char) *eax; + nwritten = (int) write(conn_fd, &ch, 1); + if (nwritten != 1) { + close(conn_fd); + conn_fd = -1; + goto again; + } + } + return (0); +} + +static struct inout_port dbgport = { + "bvmdbg", + BVM_DBG_PORT, + 1, + IOPORT_F_INOUT, + dbg_handler, + NULL +}; + +SYSRES_IO(BVM_DBG_PORT, 4); + +void +init_dbgport(int sport) +{ + conn_fd = -1; + + if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(1); + } + + if (listen(listen_fd, 1) < 0) { + perror("listen"); + exit(1); + } + + register_inout(&dbgport); +} diff --git a/vendor/github.com/hooklift/xhyve/fbsd.c b/vendor/github.com/hooklift/xhyve/fbsd.c new file mode 100644 index 0000000..6a2d2bc --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/fbsd.c @@ -0,0 +1,1006 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
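dbgport.c above bridges the guest's debug port (0x224) to a TCP listener created by `init_dbgport()`: each 4-byte port access moves one byte to or from the accepted connection, which is how a gdb stub inside the guest becomes reachable from the host. Assuming such a stub is actually speaking the gdb remote protocol, the host side would normally just run gdb with `target remote :PORT`; the hypothetical client below only demonstrates the transport, and the port number is made up:

```c
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_in sin;
	char buf[64];
	ssize_t n;
	int fd;

	if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
		perror("socket");
		return 1;
	}
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(6466);	/* hypothetical; use the port given to init_dbgport() */
	inet_pton(AF_INET, "127.0.0.1", &sin.sin_addr);
	if (connect(fd, (struct sockaddr *) &sin, sizeof(sin)) < 0) {
		perror("connect");
		return 1;
	}
	/* gdb remote-protocol "last signal" query; '?' checksums to 0x3f. */
	write(fd, "$?#3f", 5);
	if ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		printf("reply: %s\n", buf);
	}
	close(fd);
	return 0;
}
```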
+ * + * $FreeBSD$ + */ + +/*- + * Copyright (c) 2011 Google, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define I386_TSS_SIZE 104 + +#define DESC_PRESENT 0x00000080 +#define DESC_DEF32 0x00004000 +#define DESC_GRAN 0x00008000 +#define DESC_UNUSABLE 0x00010000 + +#define GUEST_CODE_SEL 1 +#define GUEST_DATA_SEL 2 +#define GUEST_TSS_SEL 3 +#define GUEST_GDTR_LIMIT64 (3 * 8 - 1) + +#define BSP 0 +#define NDISKS 32 + +static struct { + char *userboot; + char *bootvolume; + char *kernelenv; + char *cons; +} config; + +static char *host_base; +static struct termios term, oldterm; +static int disk_fd[NDISKS]; +static int ndisks; +static int consin_fd, consout_fd; +static jmp_buf exec_done; + +static uint64_t vcpu_gdt_base, vcpu_cr3, vcpu_rsp, vcpu_rip; + +typedef void (*func_t)(struct loader_callbacks *, void *, int, int); + +static void cb_exit(void); + +static struct segment_descriptor i386_gdt[] = { + { .sd_lolimit = 0, .sd_type = 0, /* NULL */ + .sd_p = 0, .sd_hilimit = 0, .sd_def32 = 0, .sd_gran = 0}, + + { .sd_lolimit = 0xffff, .sd_type = SDT_MEMER, /* CODE */ + .sd_p = 1, .sd_hilimit = 0xf, .sd_def32 = 1, .sd_gran = 1 }, + + { .sd_lolimit = 0xffff, .sd_type = SDT_MEMRW, /* DATA */ + .sd_p = 1, .sd_hilimit = 0xf, .sd_def32 = 1, .sd_gran = 1 }, + + { .sd_lolimit = I386_TSS_SIZE - 1, /* TSS */ + .sd_type = SDT_SYS386TSS, .sd_p = 1 } +}; + +static int +fbsd_set_regs_i386(uint32_t eip, uint32_t gdt_base, uint32_t esp) +{ + uint64_t cr0, rflags, desc_base; + uint32_t desc_access, desc_limit, tss_base; + uint16_t gsel; + struct segment_descriptor *gdt; + int error; + + cr0 = CR0_PE | CR0_NE; + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_CR4, 0)) != 0) + goto done; + + /* + * Forcing EFER to 0 causes bhyve to clear the "IA-32e guest + * mode" entry control. 
+ */ + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_EFER, 0))) + goto done; + + gdt = xh_vm_map_gpa(gdt_base, 0x1000); + if (gdt == NULL) + return (EFAULT); + memcpy(gdt, i386_gdt, sizeof(i386_gdt)); + desc_base = gdt_base; + desc_limit = sizeof(i386_gdt) - 1; + error = xh_vm_set_desc(BSP, VM_REG_GUEST_GDTR, desc_base, desc_limit, 0); + if (error != 0) + goto done; + + /* Place the TSS one page above the GDT. */ + tss_base = gdt_base + 0x1000; + gdt[3].sd_lobase = tss_base; + + rflags = 0x2; + error = xh_vm_set_register(BSP, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + desc_base = 0; + desc_limit = 0xffffffff; + desc_access = DESC_GRAN | DESC_DEF32 | DESC_PRESENT | SDT_MEMERA; + error = xh_vm_set_desc(BSP, VM_REG_GUEST_CS, desc_base, desc_limit, + desc_access); + + desc_access = DESC_GRAN | DESC_DEF32 | DESC_PRESENT | SDT_MEMRWA; + error = xh_vm_set_desc(BSP, VM_REG_GUEST_DS, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_ES, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_FS, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_GS, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_SS, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + desc_base = tss_base; + desc_limit = I386_TSS_SIZE - 1; + desc_access = DESC_PRESENT | SDT_SYS386BSY; + error = xh_vm_set_desc(BSP, VM_REG_GUEST_TR, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_LDTR, 0, 0, DESC_UNUSABLE); + if (error) + goto done; + + gsel = GSEL(GUEST_CODE_SEL, SEL_KPL); + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_CS, gsel)) != 0) + goto done; + + gsel = GSEL(GUEST_DATA_SEL, SEL_KPL); + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_DS, gsel)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_ES, gsel)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_FS, gsel)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_GS, gsel)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_SS, gsel)) != 0) + goto done; + + gsel = GSEL(GUEST_TSS_SEL, SEL_KPL); + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_TR, gsel)) != 0) + goto done; + + /* LDTR is pointing to the null selector */ + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* entry point */ + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_RIP, eip)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_RSP, esp)) != 0) + goto done; + + error = 0; +done: + return (error); +} + +static int +fbsd_set_regs(uint64_t rip, uint64_t cr3, uint64_t gdt_base, uint64_t rsp) +{ + int error; + uint64_t cr0, cr4, efer, rflags, desc_base; + uint32_t desc_access, desc_limit; + uint16_t gsel; + + cr0 = CR0_PE | CR0_PG | CR0_NE; + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + cr4 = CR4_PAE; + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_CR4, cr4)) != 0) + goto done; + + efer = EFER_LME | EFER_LMA; + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_EFER, efer))) + goto done; + + rflags = 0x2; + error = xh_vm_set_register(BSP, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + desc_base = 0; + desc_limit = 0; + desc_access = 0x0000209B; + error = 
xh_vm_set_desc(BSP, VM_REG_GUEST_CS, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + desc_access = 0x00000093; + error = xh_vm_set_desc(BSP, VM_REG_GUEST_DS, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_ES, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_FS, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_GS, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_SS, desc_base, desc_limit, + desc_access); + + if (error) + goto done; + + /* + * XXX TR is pointing to null selector even though we set the + * TSS segment to be usable with a base address and limit of 0. + */ + desc_access = 0x0000008b; + error = xh_vm_set_desc(BSP, VM_REG_GUEST_TR, 0, 0, desc_access); + if (error) + goto done; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_LDTR, 0, 0, DESC_UNUSABLE); + if (error) + goto done; + + gsel = GSEL(GUEST_CODE_SEL, SEL_KPL); + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_CS, gsel)) != 0) + goto done; + + gsel = GSEL(GUEST_DATA_SEL, SEL_KPL); + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_DS, gsel)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_ES, gsel)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_FS, gsel)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_GS, gsel)) != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_SS, gsel)) != 0) + goto done; + + /* XXX TR is pointing to the null selector */ + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_TR, 0)) != 0) + goto done; + + /* LDTR is pointing to the null selector */ + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* entry point */ + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_RIP, rip)) != 0) + goto done; + + /* page table base */ + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_CR3, cr3)) != 0) + goto done; + + desc_base = gdt_base; + desc_limit = GUEST_GDTR_LIMIT64; + error = xh_vm_set_desc(BSP, VM_REG_GUEST_GDTR, desc_base, desc_limit, 0); + if (error != 0) + goto done; + + if ((error = xh_vm_set_register(BSP, VM_REG_GUEST_RSP, rsp)) != 0) + goto done; + + error = 0; +done: + return (error); +} + +/* + * Console i/o callbacks + */ + +static void +cb_putc(UNUSED void *arg, int ch) +{ + char c = (char) ch; + + (void) write(consout_fd, &c, 1); +} + +static int +cb_getc(UNUSED void *arg) +{ + char c; + + if (read(consin_fd, &c, 1) == 1) + return (c); + return (-1); +} + +static int +cb_poll(UNUSED void *arg) +{ + int n; + + if (ioctl(consin_fd, FIONREAD, &n) >= 0) + return (n > 0); + return (0); +} + +/* + * Host filesystem i/o callbacks + */ + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct cb_file { + int cf_isdir; + size_t cf_size; + struct stat cf_stat; + union { + int fd; + DIR *dir; + } cf_u; +}; +#pragma clang diagnostic pop + +static int +cb_open(UNUSED void *arg, const char *filename, void **hp) +{ + struct stat st; + struct cb_file *cf; + char path[PATH_MAX]; + + if (!host_base) + return (ENOENT); + + strlcpy(path, host_base, PATH_MAX); + if (path[strlen(path) - 1] == '/') + path[strlen(path) - 1] = 0; + strlcat(path, filename, PATH_MAX); + cf = malloc(sizeof(struct cb_file)); + if (stat(path, &cf->cf_stat) < 0) { + free(cf); + return (errno); + } + + cf->cf_size = 
(size_t) st.st_size; + if (S_ISDIR(cf->cf_stat.st_mode)) { + cf->cf_isdir = 1; + cf->cf_u.dir = opendir(path); + if (!cf->cf_u.dir) + goto out; + *hp = cf; + return (0); + } + if (S_ISREG(cf->cf_stat.st_mode)) { + cf->cf_isdir = 0; + cf->cf_u.fd = open(path, O_RDONLY); + if (cf->cf_u.fd < 0) + goto out; + *hp = cf; + return (0); + } + +out: + free(cf); + return (EINVAL); +} + +// static int +// cb_close(UNUSED void *arg, void *h) +// { +// struct cb_file *cf = h; +// +// if (cf->cf_isdir) +// closedir(cf->cf_u.dir); +// else +// close(cf->cf_u.fd); +// free(cf); +// +// return (0); +// } + +// static int +// cb_isdir(UNUSED void *arg, void *h) +// { +// struct cb_file *cf = h; +// +// return (cf->cf_isdir); +// } + +// static int +// cb_read(UNUSED void *arg, void *h, void *buf, size_t size, size_t *resid) +// { +// struct cb_file *cf = h; +// ssize_t sz; +// +// if (cf->cf_isdir) +// return (EINVAL); +// sz = read(cf->cf_u.fd, buf, size); +// if (sz < 0) +// return (EINVAL); +// *resid = size - ((size_t) sz); +// return (0); +// } + +//static int +//cb_readdir(UNUSED void *arg, void *h, uint32_t *fileno_return, +// uint8_t *type_return, size_t *namelen_return, char *name) +//{ +// struct cb_file *cf = h; +// struct dirent *dp; +// +// if (!cf->cf_isdir) +// return (EINVAL); +// +// dp = readdir(cf->cf_u.dir); +// if (!dp) +// return (ENOENT); +// +// /* +// * Note: d_namlen is in the range 0..255 and therefore less +// * than PATH_MAX so we don't need to test before copying. +// */ +// *fileno_return = dp->d_fileno; +// *type_return = dp->d_type; +// *namelen_return = dp->d_namlen; +// memcpy(name, dp->d_name, dp->d_namlen); +// name[dp->d_namlen] = 0; +// +// return (0); +//} + +// static int +// cb_seek(UNUSED void *arg, void *h, uint64_t offset, int whence) +// { +// struct cb_file *cf = h; +// +// if (cf->cf_isdir) +// return (EINVAL); +// if (lseek(cf->cf_u.fd, ((off_t) offset), whence) < 0) +// return (errno); +// return (0); +// } + +// static int +// cb_stat(UNUSED void *arg, void *h, int *mode, int *uid, int *gid, +// uint64_t *size) +// { +// struct cb_file *cf = h; +// +// *mode = cf->cf_stat.st_mode; +// *uid = (int) cf->cf_stat.st_uid; +// *gid = (int) cf->cf_stat.st_gid; +// *size = (uint64_t) cf->cf_stat.st_size; +// return (0); +// } + +/* + * Disk image i/o callbacks + */ + +static int +cb_diskread(UNUSED void *arg, int unit, uint64_t from, void *to, size_t size, + size_t *resid) +{ + ssize_t n; + + if (unit < 0 || unit >= ndisks ) + return (EIO); + n = pread(disk_fd[unit], to, size, ((off_t) from)); + if (n < 0) + return (errno); + *resid = size - ((size_t) n); + return (0); +} + +#define DIOCGSECTORSIZE _IOR('d', 128, u_int) +#define DIOCGMEDIASIZE _IOR('d', 129, off_t) + +static int +cb_diskioctl(UNUSED void *arg, int unit, u_long cmd, void *data) +{ + struct stat sb; + + if (unit < 0 || unit >= ndisks) + return (EBADF); + + switch (cmd) { + case DIOCGSECTORSIZE: + *(u_int *)data = 512; + break; + case DIOCGMEDIASIZE: + if (fstat(disk_fd[unit], &sb) == 0) + *(off_t *)data = sb.st_size; + else + return (ENOTTY); + break; + default: + abort(); + return (ENOTTY); + } + + return (0); +} + +/* + * Guest virtual machine i/o callbacks + */ +static int +cb_copyin(UNUSED void *arg, const void *from, uint64_t to, size_t size) +{ + char *ptr; + + to &= 0x7fffffff; + + ptr = xh_vm_map_gpa(to, size); + if (ptr == NULL) + return (EFAULT); + + memcpy(ptr, from, size); + return (0); +} + +static int +cb_copyout(UNUSED void *arg, uint64_t from, void *to, size_t size) +{ + char *ptr; + 
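+	/*
+	 * As in cb_copyin() above: mask off address bits above 2 GB
+	 * before mapping, so loader-supplied addresses fall inside low
+	 * guest physical memory.
+	 */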
+ from &= 0x7fffffff; + + ptr = xh_vm_map_gpa(from, size); + if (ptr == NULL) + return (EFAULT); + + memcpy(to, ptr, size); + return (0); +} + +static void +cb_setreg(UNUSED void *arg, int r, uint64_t v) +{ + int error; + enum vm_reg_name vmreg; + + vmreg = VM_REG_LAST; + + switch (r) { + case 4: + vmreg = VM_REG_GUEST_RSP; + vcpu_rsp = v; + break; + default: + break; + } + + if (vmreg == VM_REG_LAST) { + abort(); + } + + error = xh_vm_set_register(BSP, vmreg, v); + if (error) { + perror("xh_vm_set_register"); + cb_exit(); + } +} + +static void +cb_setmsr(UNUSED void *arg, int r, uint64_t v) +{ + int error; + enum vm_reg_name vmreg; + + vmreg = VM_REG_LAST; + + switch (r) { + case MSR_EFER: + vmreg = VM_REG_GUEST_EFER; + break; + default: + break; + } + + if (vmreg == VM_REG_LAST) { + abort(); + } + + error = xh_vm_set_register(BSP, vmreg, v); + if (error) { + perror("xh_vm_set_msr"); + cb_exit(); + } +} + +static void +cb_setcr(UNUSED void *arg, int r, uint64_t v) +{ + int error; + enum vm_reg_name vmreg; + + vmreg = VM_REG_LAST; + + switch (r) { + case 0: + vmreg = VM_REG_GUEST_CR0; + break; + case 3: + vmreg = VM_REG_GUEST_CR3; + vcpu_cr3 = v; + break; + case 4: + vmreg = VM_REG_GUEST_CR4; + break; + default: + break; + } + + if (vmreg == VM_REG_LAST) { + fprintf(stderr, "test_setcr(%d): not implemented\n", r); + cb_exit(); + } + + error = xh_vm_set_register(BSP, vmreg, v); + if (error) { + perror("vm_set_cr"); + cb_exit(); + } +} + +static void +cb_setgdt(UNUSED void *arg, uint64_t base, size_t size) +{ + int error; + + error = xh_vm_set_desc(BSP, VM_REG_GUEST_GDTR, base, + ((uint32_t) (size - 1)), 0); + + if (error != 0) { + perror("vm_set_desc(gdt)"); + cb_exit(); + } + + vcpu_gdt_base = base; +} + +__attribute__ ((noreturn)) static void +cb_exec(UNUSED void *arg, uint64_t rip) +{ + int error; + + if (vcpu_cr3 == 0) { + error = fbsd_set_regs_i386(((uint32_t) rip), ((uint32_t) vcpu_gdt_base), + ((uint32_t) vcpu_rsp)); + } else { + error = fbsd_set_regs(rip, vcpu_cr3, vcpu_gdt_base, vcpu_rsp); + } + + if (error) { + perror("fbsd_set_regs"); + cb_exit(); + } + + vcpu_rip = rip; + + longjmp(exec_done, 1); +} + +/* + * Misc + */ + +static void +cb_delay(UNUSED void *arg, int usec) +{ + usleep((useconds_t) usec); +} + +__attribute__ ((noreturn)) static void +cb_exit(void) +{ + tcsetattr(consout_fd, TCSAFLUSH, &oldterm); + fprintf(stderr, "fbsd: error\n"); + exit(-1); +} + +static void +cb_getmem(UNUSED void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem) +{ + *ret_lowmem = xh_vm_get_lowmem_size(); + *ret_highmem = xh_vm_get_highmem_size(); +} + +struct env { + const char *str; /* name=value */ + SLIST_ENTRY(env) next; +}; + +static SLIST_HEAD(envhead, env) envhead; + +static void +addenv(const char *str) +{ + struct env *env; + + env = malloc(sizeof(struct env)); + env->str = str; + SLIST_INSERT_HEAD(&envhead, env, next); +} + +static const char * +cb_getenv(UNUSED void *arg, int num) +{ + int i; + struct env *env; + + i = 0; + SLIST_FOREACH(env, &envhead, next) { + if (i == num) + return (env->str); + i++; + } + + return (NULL); +} + +static struct loader_callbacks cb = { + .getc = cb_getc, + .putc = cb_putc, + .poll = cb_poll, + + // .open = cb_open, + // .close = cb_close, + // .isdir = cb_isdir, + // .read = cb_read, + // .readdir = cb_readdir, + // .seek = cb_seek, + // .stat = cb_stat, + + .open = cb_open, + .close = NULL, + .isdir = NULL, + .read = NULL, + .readdir = NULL, + .seek = NULL, + .stat = NULL, + + .diskread = cb_diskread, + .diskioctl = cb_diskioctl, + + .copyin = 
cb_copyin, + .copyout = cb_copyout, + .setreg = cb_setreg, + .setmsr = cb_setmsr, + .setcr = cb_setcr, + .setgdt = cb_setgdt, + .exec = cb_exec, + + .delay = cb_delay, + .exit = cb_exit, + .getmem = cb_getmem, + + .getenv = cb_getenv, +}; + +static int +altcons_open(char *path) +{ + struct stat sb; + int err; + int fd; + + /* + * Allow stdio to be passed in so that the same string + * can be used for the bhyveload console and bhyve com-port + * parameters + */ + if (!strcmp(path, "stdio")) + return (0); + + err = stat(path, &sb); + if (err == 0) { + if (!S_ISCHR(sb.st_mode)) + err = ENOTSUP; + else { + fd = open(path, O_RDWR | O_NONBLOCK); + if (fd < 0) + err = errno; + else + consin_fd = consout_fd = fd; + } + } + + return (err); +} + +static int +disk_open(char *path) +{ + int err, fd; + + if (ndisks >= NDISKS) + return (ERANGE); + + err = 0; + fd = open(path, O_RDONLY); + + if (fd > 0) { + disk_fd[ndisks] = fd; + ndisks++; + } else + err = errno; + + return (err); +} + +void +fbsd_init(char *userboot_path, char *bootvolume_path, char *kernelenv, + char *cons) +{ + config.userboot = userboot_path; + config.bootvolume = bootvolume_path; + config.kernelenv = kernelenv; + config.cons = cons; +} + +uint64_t +fbsd_load(void) +{ + void *h; + int i; + func_t func; + + host_base = NULL; + consin_fd = STDIN_FILENO; + consout_fd = STDOUT_FILENO; + + if (config.cons) { + altcons_open(config.cons); + } + + if (config.bootvolume) { + disk_open(config.bootvolume); + } else { + fprintf(stderr, "fbsd: no boot volume\n"); + exit(-1); + } + + if (config.kernelenv) { + addenv(config.kernelenv); + } + + //host_base = optarg h + + tcgetattr(consout_fd, &term); + oldterm = term; + cfmakeraw(&term); + term.c_cflag |= CLOCAL; + + tcsetattr(consout_fd, TCSAFLUSH, &term); + + h = dlopen(config.userboot, RTLD_LOCAL); + if (!h) { + fprintf(stderr, "%s\n", dlerror()); + exit(-1); + } + + func = (func_t) dlsym(h, "loader_main"); + if (!func) { + fprintf(stderr, "%s\n", dlerror()); + exit(-1); + } + + addenv("smbios.bios.vendor=BHYVE"); + addenv("boot_serial=1"); + + if (!setjmp(exec_done)) { + func(&cb, NULL, USERBOOT_VERSION_3, ndisks); + } + + for (i = 0; i < ndisks; i++) { + close(disk_fd[i]); + } + + if (config.cons) { + assert(consin_fd == consout_fd); + close(consin_fd); + } + + return vcpu_rip; +} diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/acpi.h b/vendor/github.com/hooklift/xhyve/include/xhyve/acpi.h new file mode 100644 index 0000000..11969af --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/acpi.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +/* if set, create AML instead of ASL and calling out to iasl */ +#define ACPITBL_AML 1 + +#define SCI_INT 9 + +#define SMI_CMD 0xb2 +#define BHYVE_ACPI_ENABLE 0xa0 +#define BHYVE_ACPI_DISABLE 0xa1 + +#define PM1A_EVT_ADDR 0x400 +#define PM1A_EVT_ADDR2 0x402 +#define PM1A_CNT_ADDR 0x404 + +#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */ + +int acpi_build(int ncpu); +void dsdt_line(const char *fmt, ...); +void dsdt_fixed_ioport(uint16_t iobase, uint16_t length); +void dsdt_fixed_irq(uint8_t irq); +void dsdt_fixed_mem32(uint32_t base, uint32_t length); +void dsdt_indent(int levels); +void dsdt_unindent(int levels); +void dsdt_fixup(int bus, uint16_t iobase, uint16_t iolimit, uint32_t membase32, + uint32_t memlimit32, uint64_t membase64, uint64_t memlimit64); +void sci_init(void); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/ahci.h b/vendor/github.com/hooklift/xhyve/include/xhyve/ahci.h new file mode 100644 index 0000000..ec1916e --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/ahci.h @@ -0,0 +1,319 @@ +/*- + * Copyright (c) 1998 - 2008 Søren Schmidt + * Copyright (c) 2009-2012 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification, immediately at the beginning of the file. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#pragma once + +/* ATA register defines */ +#define ATA_DATA 0 /* (RW) data */ + +#define ATA_FEATURE 1 /* (W) feature */ +#define ATA_F_DMA 0x01 /* enable DMA */ +#define ATA_F_OVL 0x02 /* enable overlap */ + +#define ATA_COUNT 2 /* (W) sector count */ + +#define ATA_SECTOR 3 /* (RW) sector # */ +#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ +#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ +#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ +#define ATA_D_LBA 0x40 /* use LBA addressing */ +#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ + +#define ATA_COMMAND 7 /* (W) command */ + +#define ATA_ERROR 8 /* (R) error */ +#define ATA_E_ILI 0x01 /* illegal length */ +#define ATA_E_NM 0x02 /* no media */ +#define ATA_E_ABORT 0x04 /* command aborted */ +#define ATA_E_MCR 0x08 /* media change request */ +#define ATA_E_IDNF 0x10 /* ID not found */ +#define ATA_E_MC 0x20 /* media changed */ +#define ATA_E_UNC 0x40 /* uncorrectable data */ +#define ATA_E_ICRC 0x80 /* UDMA crc error */ +#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ + +#define ATA_IREASON 9 /* (R) interrupt reason */ +#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ +#define ATA_I_IN 0x02 /* read (1) | write (0) */ +#define ATA_I_RELEASE 0x04 /* released bus (1) */ +#define ATA_I_TAGMASK 0xf8 /* tag mask */ + +#define ATA_STATUS 10 /* (R) status */ +#define ATA_ALTSTAT 11 /* (R) alternate status */ +#define ATA_S_ERROR 0x01 /* error */ +#define ATA_S_INDEX 0x02 /* index */ +#define ATA_S_CORR 0x04 /* data corrected */ +#define ATA_S_DRQ 0x08 /* data request */ +#define ATA_S_DSC 0x10 /* drive seek completed */ +#define ATA_S_SERVICE 0x10 /* drive needs service */ +#define ATA_S_DWF 0x20 /* drive write fault */ +#define ATA_S_DMA 0x20 /* DMA ready */ +#define ATA_S_READY 0x40 /* drive ready */ +#define ATA_S_BUSY 0x80 /* busy */ + +#define ATA_CONTROL 12 /* (W) control */ +#define ATA_A_IDS 0x02 /* disable interrupts */ +#define ATA_A_RESET 0x04 /* RESET controller */ +#define ATA_A_4BIT 0x08 /* 4 head bits */ +#define ATA_A_HOB 0x80 /* High Order Byte enable */ + +/* SATA register defines */ +#define ATA_SSTATUS 13 +#define ATA_SS_DET_MASK 0x0000000f +#define ATA_SS_DET_NO_DEVICE 0x00000000 +#define ATA_SS_DET_DEV_PRESENT 0x00000001 +#define ATA_SS_DET_PHY_ONLINE 0x00000003 +#define ATA_SS_DET_PHY_OFFLINE 0x00000004 + +#define ATA_SS_SPD_MASK 0x000000f0 +#define ATA_SS_SPD_NO_SPEED 0x00000000 +#define ATA_SS_SPD_GEN1 0x00000010 +#define ATA_SS_SPD_GEN2 0x00000020 +#define ATA_SS_SPD_GEN3 0x00000030 + +#define ATA_SS_IPM_MASK 0x00000f00 +#define ATA_SS_IPM_NO_DEVICE 0x00000000 +#define ATA_SS_IPM_ACTIVE 0x00000100 +#define ATA_SS_IPM_PARTIAL 0x00000200 +#define ATA_SS_IPM_SLUMBER 0x00000600 +#define ATA_SS_IPM_DEVSLEEP 0x00000800 + +#define ATA_SERROR 14 +#define ATA_SE_DATA_CORRECTED 0x00000001 +#define ATA_SE_COMM_CORRECTED 0x00000002 +#define ATA_SE_DATA_ERR 0x00000100 +#define ATA_SE_COMM_ERR 0x00000200 +#define ATA_SE_PROT_ERR 0x00000400 +#define ATA_SE_HOST_ERR 0x00000800 +#define ATA_SE_PHY_CHANGED 0x00010000 +#define ATA_SE_PHY_IERROR 0x00020000 +#define ATA_SE_COMM_WAKE 0x00040000 +#define ATA_SE_DECODE_ERR 0x00080000 +#define ATA_SE_PARITY_ERR 0x00100000 +#define ATA_SE_CRC_ERR 0x00200000 +#define ATA_SE_HANDSHAKE_ERR 0x00400000 +#define ATA_SE_LINKSEQ_ERR 0x00800000 +#define ATA_SE_TRANSPORT_ERR 0x01000000 +#define ATA_SE_UNKNOWN_FIS 0x02000000 +#define ATA_SE_EXCHANGED 0x04000000 + +#define ATA_SCONTROL 15 +#define ATA_SC_DET_MASK 0x0000000f +#define ATA_SC_DET_IDLE 0x00000000 
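+/*
+ * Per the SATA spec, a port COMRESET is requested by writing DET_RESET to
+ * SControl and then restoring DET_IDLE:
+ */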
+#define ATA_SC_DET_RESET 0x00000001 +#define ATA_SC_DET_DISABLE 0x00000004 + +#define ATA_SC_SPD_MASK 0x000000f0 +#define ATA_SC_SPD_NO_SPEED 0x00000000 +#define ATA_SC_SPD_SPEED_GEN1 0x00000010 +#define ATA_SC_SPD_SPEED_GEN2 0x00000020 +#define ATA_SC_SPD_SPEED_GEN3 0x00000030 + +#define ATA_SC_IPM_MASK 0x00000f00 +#define ATA_SC_IPM_NONE 0x00000000 +#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 +#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 +#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400 + +#define ATA_SACTIVE 16 + +#define AHCI_MAX_PORTS 32 +#define AHCI_MAX_SLOTS 32 +#define AHCI_MAX_IRQS 16 + +/* SATA AHCI v1.0 register defines */ +#define AHCI_CAP 0x00 +#define AHCI_CAP_NPMASK 0x0000001f +#define AHCI_CAP_SXS 0x00000020 +#define AHCI_CAP_EMS 0x00000040 +#define AHCI_CAP_CCCS 0x00000080 +#define AHCI_CAP_NCS 0x00001F00 +#define AHCI_CAP_NCS_SHIFT 8 +#define AHCI_CAP_PSC 0x00002000 +#define AHCI_CAP_SSC 0x00004000 +#define AHCI_CAP_PMD 0x00008000 +#define AHCI_CAP_FBSS 0x00010000 +#define AHCI_CAP_SPM 0x00020000 +#define AHCI_CAP_SAM 0x00080000 +#define AHCI_CAP_ISS 0x00F00000 +#define AHCI_CAP_ISS_SHIFT 20 +#define AHCI_CAP_SCLO 0x01000000 +#define AHCI_CAP_SAL 0x02000000 +#define AHCI_CAP_SALP 0x04000000 +#define AHCI_CAP_SSS 0x08000000 +#define AHCI_CAP_SMPS 0x10000000 +#define AHCI_CAP_SSNTF 0x20000000 +#define AHCI_CAP_SNCQ 0x40000000 +#define AHCI_CAP_64BIT 0x80000000 + +#define AHCI_GHC 0x04 +#define AHCI_GHC_AE 0x80000000 +#define AHCI_GHC_MRSM 0x00000004 +#define AHCI_GHC_IE 0x00000002 +#define AHCI_GHC_HR 0x00000001 + +#define AHCI_IS 0x08 +#define AHCI_PI 0x0c +#define AHCI_VS 0x10 + +#define AHCI_CCCC 0x14 +#define AHCI_CCCC_TV_MASK 0xffff0000 +#define AHCI_CCCC_TV_SHIFT 16 +#define AHCI_CCCC_CC_MASK 0x0000ff00 +#define AHCI_CCCC_CC_SHIFT 8 +#define AHCI_CCCC_INT_MASK 0x000000f8 +#define AHCI_CCCC_INT_SHIFT 3 +#define AHCI_CCCC_EN 0x00000001 +#define AHCI_CCCP 0x18 + +#define AHCI_EM_LOC 0x1C +#define AHCI_EM_CTL 0x20 +#define AHCI_EM_MR 0x00000001 +#define AHCI_EM_TM 0x00000100 +#define AHCI_EM_RST 0x00000200 +#define AHCI_EM_LED 0x00010000 +#define AHCI_EM_SAFTE 0x00020000 +#define AHCI_EM_SES2 0x00040000 +#define AHCI_EM_SGPIO 0x00080000 +#define AHCI_EM_SMB 0x01000000 +#define AHCI_EM_XMT 0x02000000 +#define AHCI_EM_ALHD 0x04000000 +#define AHCI_EM_PM 0x08000000 + +#define AHCI_CAP2 0x24 +#define AHCI_CAP2_BOH 0x00000001 +#define AHCI_CAP2_NVMP 0x00000002 +#define AHCI_CAP2_APST 0x00000004 +#define AHCI_CAP2_SDS 0x00000008 +#define AHCI_CAP2_SADM 0x00000010 +#define AHCI_CAP2_DESO 0x00000020 + +#define AHCI_OFFSET 0x100 +#define AHCI_STEP 0x80 + +#define AHCI_P_CLB 0x00 +#define AHCI_P_CLBU 0x04 +#define AHCI_P_FB 0x08 +#define AHCI_P_FBU 0x0c +#define AHCI_P_IS 0x10 +#define AHCI_P_IE 0x14 +#define AHCI_P_IX_DHR 0x00000001 +#define AHCI_P_IX_PS 0x00000002 +#define AHCI_P_IX_DS 0x00000004 +#define AHCI_P_IX_SDB 0x00000008 +#define AHCI_P_IX_UF 0x00000010 +#define AHCI_P_IX_DP 0x00000020 +#define AHCI_P_IX_PC 0x00000040 +#define AHCI_P_IX_MP 0x00000080 + +#define AHCI_P_IX_PRC 0x00400000 +#define AHCI_P_IX_IPM 0x00800000 +#define AHCI_P_IX_OF 0x01000000 +#define AHCI_P_IX_INF 0x04000000 +#define AHCI_P_IX_IF 0x08000000 +#define AHCI_P_IX_HBD 0x10000000 +#define AHCI_P_IX_HBF 0x20000000 +#define AHCI_P_IX_TFE 0x40000000 +#define AHCI_P_IX_CPD 0x80000000 + +#define AHCI_P_CMD 0x18 +#define AHCI_P_CMD_ST 0x00000001 +#define AHCI_P_CMD_SUD 0x00000002 +#define AHCI_P_CMD_POD 0x00000004 +#define AHCI_P_CMD_CLO 0x00000008 +#define AHCI_P_CMD_FRE 0x00000010 +#define AHCI_P_CMD_CCS_MASK 
0x00001f00 +#define AHCI_P_CMD_CCS_SHIFT 8 +#define AHCI_P_CMD_ISS 0x00002000 +#define AHCI_P_CMD_FR 0x00004000 +#define AHCI_P_CMD_CR 0x00008000 +#define AHCI_P_CMD_CPS 0x00010000 +#define AHCI_P_CMD_PMA 0x00020000 +#define AHCI_P_CMD_HPCP 0x00040000 +#define AHCI_P_CMD_MPSP 0x00080000 +#define AHCI_P_CMD_CPD 0x00100000 +#define AHCI_P_CMD_ESP 0x00200000 +#define AHCI_P_CMD_FBSCP 0x00400000 +#define AHCI_P_CMD_APSTE 0x00800000 +#define AHCI_P_CMD_ATAPI 0x01000000 +#define AHCI_P_CMD_DLAE 0x02000000 +#define AHCI_P_CMD_ALPE 0x04000000 +#define AHCI_P_CMD_ASP 0x08000000 +#define AHCI_P_CMD_ICC_MASK 0xf0000000 +#define AHCI_P_CMD_NOOP 0x00000000 +#define AHCI_P_CMD_ACTIVE 0x10000000 +#define AHCI_P_CMD_PARTIAL 0x20000000 +#define AHCI_P_CMD_SLUMBER 0x60000000 +#define AHCI_P_CMD_DEVSLEEP 0x80000000 + +#define AHCI_P_TFD 0x20 +#define AHCI_P_SIG 0x24 +#define AHCI_P_SSTS 0x28 +#define AHCI_P_SCTL 0x2c +#define AHCI_P_SERR 0x30 +#define AHCI_P_SACT 0x34 +#define AHCI_P_CI 0x38 +#define AHCI_P_SNTF 0x3C +#define AHCI_P_FBS 0x40 +#define AHCI_P_FBS_EN 0x00000001 +#define AHCI_P_FBS_DEC 0x00000002 +#define AHCI_P_FBS_SDE 0x00000004 +#define AHCI_P_FBS_DEV 0x00000f00 +#define AHCI_P_FBS_DEV_SHIFT 8 +#define AHCI_P_FBS_ADO 0x0000f000 +#define AHCI_P_FBS_ADO_SHIFT 12 +#define AHCI_P_FBS_DWE 0x000f0000 +#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_P_DEVSLP 0x44 +#define AHCI_P_DEVSLP_ADSE 0x00000001 +#define AHCI_P_DEVSLP_DSP 0x00000002 +#define AHCI_P_DEVSLP_DETO 0x000003fc +#define AHCI_P_DEVSLP_DETO_SHIFT 2 +#define AHCI_P_DEVSLP_MDAT 0x00007c00 +#define AHCI_P_DEVSLP_MDAT_SHIFT 10 +#define AHCI_P_DEVSLP_DITO 0x01ff8000 +#define AHCI_P_DEVSLP_DITO_SHIFT 15 +#define AHCI_P_DEVSLP_DM 0x0e000000 +#define AHCI_P_DEVSLP_DM_SHIFT 25 + +/* Just to be sure, if building as module. */ +#if MAXPHYS < 512 * 1024 +#undef MAXPHYS +#define MAXPHYS 512 * 1024 +#endif +/* Pessimistic prognosis on number of required S/G entries */ +#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) +/* Command list. 32 commands. First, 1Kbyte aligned. */ +#define AHCI_CL_OFFSET 0 +#define AHCI_CL_SIZE 32 +/* Command tables. Up to 32 commands, Each, 128byte aligned. */ +#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) +#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) +/* Total main work area. */ +#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/block_if.h b/vendor/github.com/hooklift/xhyve/include/xhyve/block_if.h new file mode 100644 index 0000000..fecfdbc --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/block_if.h @@ -0,0 +1,69 @@ +/*- + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The block API to be used by bhyve block-device emulations. The routines + * are thread safe, with no assumptions about the context of the completion + * callback - it may occur in the caller's context, or asynchronously in + * another thread. + */ + +#pragma once + +#include +#include + +#define BLOCKIF_IOV_MAX 33 /* not practical to be IOV_MAX */ + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct blockif_req { + struct iovec br_iov[BLOCKIF_IOV_MAX]; + int br_iovcnt; + off_t br_offset; + ssize_t br_resid; + void (*br_callback)(struct blockif_req *req, int err); + void *br_param; +}; +#pragma clang diagnostic pop + +struct blockif_ctxt; +struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); +off_t blockif_size(struct blockif_ctxt *bc); +void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s); +int blockif_sectsz(struct blockif_ctxt *bc); +void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); +int blockif_queuesz(struct blockif_ctxt *bc); +int blockif_is_ro(struct blockif_ctxt *bc); +int blockif_candelete(struct blockif_ctxt *bc); +int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_close(struct blockif_ctxt *bc); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/dbgport.h b/vendor/github.com/hooklift/xhyve/include/xhyve/dbgport.h new file mode 100644 index 0000000..22a23af --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/dbgport.h @@ -0,0 +1,31 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +void init_dbgport(int port); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/firmware/fbsd.h b/vendor/github.com/hooklift/xhyve/include/xhyve/firmware/fbsd.h new file mode 100644 index 0000000..30d4388 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/firmware/fbsd.h @@ -0,0 +1,102 @@ +#pragma once + +#include + +/* + * USERBOOT interface versions + */ +#define USERBOOT_VERSION_1 1 +#define USERBOOT_VERSION_2 2 +#define USERBOOT_VERSION_3 3 + +/* + * Exit codes from the loader + */ +#define USERBOOT_EXIT_QUIT 1 +#define USERBOOT_EXIT_REBOOT 2 + +struct loader_callbacks { + /* Console i/o */ + + /* Wait until a key is pressed on the console and then return it */ + int (*getc)(void *arg); + /* Write the character ch to the console */ + void (*putc)(void *arg, int ch); + /* Return non-zero if a key can be read from the console */ + int (*poll)(void *arg); + + /* Host filesystem i/o */ + + /* Open a file in the host filesystem */ + int (*open)(void *arg, const char *filename, void **h_return); + /* Close a file */ + int (*close)(void *arg, void *h); + /* Return non-zero if the file is a directory */ + int (*isdir)(void *arg, void *h); + /* Read size bytes from a file. The number of bytes remaining in dst after + * reading is returned in *resid_return + */ + int (*read)(void *arg, void *h, void *dst, size_t size, + size_t *resid_return); + /* Read an entry from a directory. The entry's inode number is returned in + * fileno_return, its type in *type_return and the name length in + * *namelen_return. The name itself is copied to the buffer name which must + * be at least PATH_MAX in size. 
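+ *
+ * (The reference cb_readdir() in firmware/fbsd.c returns ENOENT once the
+ * directory has been exhausted; note that this port currently wires
+ * .readdir to NULL.)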
+ */ + int (*readdir)(void *arg, void *h, uint32_t *fileno_return, + uint8_t *type_return, size_t *namelen_return, char *name); + /* Seek to a location within an open file */ + int (*seek)(void *arg, void *h, uint64_t offset, int whence); + /* Return some stat(2) related information about the file */ + int (*stat)(void *arg, void *h, int *mode_return, int *uid_return, + int *gid_return, uint64_t *size_return); + + /* Disk image i/o */ + + /* Read from a disk image at the given offset */ + int (*diskread)(void *arg, int unit, uint64_t offset, void *dst, + size_t size, size_t *resid_return); + + /* Guest virtual machine i/o */ + + /* Copy to the guest address space */ + int (*copyin)(void *arg, const void *from, uint64_t to, size_t size); + /* Copy from the guest address space */ + int (*copyout)(void *arg, uint64_t from, void *to, size_t size); + /* Set a guest register value */ + void (*setreg)(void *arg, int, uint64_t); + /* Set a guest MSR value */ + void (*setmsr)(void *arg, int, uint64_t); + /* Set a guest CR value */ + void (*setcr)(void *arg, int, uint64_t); + /* Set the guest GDT address */ + void (*setgdt)(void *arg, uint64_t, size_t); + /* Transfer control to the guest at the given address */ + void (*exec)(void *arg, uint64_t pc); + + /* Misc */ + + /* Sleep for usec microseconds */ + void (*delay)(void *arg, int usec); + /* Exit with the given exit code */ + void (*exit)(void); + /* Return guest physical memory map details */ + void (*getmem)(void *arg, uint64_t *lowmem, uint64_t *highmem); + /* ioctl interface to the disk device */ + int (*diskioctl)(void *arg, int unit, u_long cmd, void *data); + /* + * Returns an environment variable in the form "name=value". + * + * If there are no more variables that need to be set in the + * loader environment then return NULL. + * + * 'num' is used as a handle for the callback to identify which + * environment variable to return next. It will begin at 0 and + * each invocation will add 1 to the previous value of 'num'. 
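+ *
+ * In other words, a conforming loader can enumerate the environment with
+ * a loop along the lines of the following sketch, where 's' is a
+ * hypothetical const char * holding a "name=value" string:
+ *
+ *   for (num = 0; (s = cb->getenv(arg, num)) != NULL; num++)
+ *           use_env_string(s);   (use_env_string is illustrative only)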
+ */ + const char * (*getenv)(void *arg, int num); +}; + +void fbsd_init(char *userboot_path, char *bootvolume_path, char *kernelenv, + char *cons); +uint64_t fbsd_load(void); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/firmware/kexec.h b/vendor/github.com/hooklift/xhyve/include/xhyve/firmware/kexec.h new file mode 100644 index 0000000..791d963 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/firmware/kexec.h @@ -0,0 +1,89 @@ +#pragma once + +#include + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpacked" +struct setup_header { + uint8_t setup_sects; /* The size of the setup in sectors */ + uint16_t root_flags; /* If set, the root is mounted readonly */ + uint32_t syssize; /* The size of the 32-bit code in 16-byte paras */ + uint16_t ram_size; /* DO NOT USE - for bootsect.S use only */ + uint16_t vid_mode; /* Video mode control */ + uint16_t root_dev; /* Default root device number */ + uint16_t boot_flag; /* 0xAA55 magic number */ + uint16_t jump; /* Jump instruction */ + uint32_t header; /* Magic signature "HdrS" */ + uint16_t version; /* Boot protocol version supported */ + uint32_t realmode_swtch; /* Boot loader hook (see below) */ + uint16_t start_sys_seg; /* The load-low segment (0x1000) (obsolete) */ + uint16_t kernel_version; /* Pointer to kernel version string */ + uint8_t type_of_loader; /* Boot loader identifier */ + uint8_t loadflags; /* Boot protocol option flags */ + uint16_t setup_move_size; /* Move to high memory size (used with hooks) */ + uint32_t code32_start; /* Boot loader hook (see below) */ + uint32_t ramdisk_image; /* initrd load address (set by boot loader) */ + uint32_t ramdisk_size; /* initrd size (set by boot loader) */ + uint32_t bootsect_kludge; /* DO NOT USE - for bootsect.S use only */ + uint16_t heap_end_ptr; /* Free memory after setup end */ + uint8_t ext_loader_ver; /* Extended boot loader version */ + uint8_t ext_loader_type; /* Extended boot loader ID */ + uint32_t cmd_line_ptr; /* 32-bit pointer to the kernel command line */ + uint32_t nitrd_addr_max; /* Highest legal initrd address */ + uint32_t kernel_alignment; /* Physical addr alignment required for kernel */ + uint8_t relocatable_kernel; /* Whether kernel is relocatable or not */ + uint8_t min_alignment; /* Minimum alignment, as a power of two */ + uint16_t xloadflags; /* Boot protocol option flags */ + uint32_t cmdline_size; /* Maximum size of the kernel command line */ + uint32_t hardware_subarch; /* Hardware subarchitecture */ + uint64_t hardware_subarch_data; /* Subarchitecture-specific data */ + uint32_t payload_offset; /* Offset of kernel payload */ + uint32_t payload_length; /* Length of kernel payload */ + uint64_t setup_data; /* 64bit pointer to linked list of struct setup_data */ + uint64_t pref_address; /* Preferred loading address */ + uint32_t init_size; /* Linear memory required during initialization */ + uint32_t handover_offset; /* Offset of handover entry point */ +} __attribute__((packed)); + +struct zero_page { + uint8_t screen_info[64]; + uint8_t apm_bios_info[20]; + uint8_t _0[4]; + uint64_t tboot_addr; + uint8_t ist_info[16]; + uint8_t _1[16]; + uint8_t hd0_info[16]; + uint8_t hd1_info[16]; + uint8_t sys_desc_table[16]; + uint8_t olpc_ofw_header[16]; + uint32_t ext_ramdisk_image; + uint32_t ext_ramdisk_size; + uint32_t ext_cmd_line_ptr; + uint8_t _2[116]; + uint8_t edid_info[128]; + uint8_t efi_info[32]; + uint32_t alt_mem_k; + uint32_t scratch; + uint8_t e820_entries; + uint8_t eddbuf_entries; + 
uint8_t edd_mbr_sig_buf_entries; + uint8_t kbd_status; + uint8_t _3[3]; + uint8_t sentinel; + uint8_t _4[1]; + struct setup_header setup_header; + uint8_t _5[(0x290 - 0x1f1 - sizeof(struct setup_header))]; + uint32_t edd_mbr_sig_buffer[16]; + struct { + uint64_t addr; + uint64_t size; + uint32_t type; + } __attribute__((packed)) e820_map[128]; + uint8_t _6[48]; + uint8_t eddbuf[492]; + uint8_t _7[276]; +} __attribute__((packed)); +#pragma clang diagnostic pop + +void kexec_init(char *kernel_path, char *initrd_path, char *cmdline); +uint64_t kexec(void); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/inout.h b/vendor/github.com/hooklift/xhyve/include/xhyve/inout.h new file mode 100644 index 0000000..f2c6632 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/inout.h @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +struct vm_exit; + +/* + * inout emulation handlers return 0 on success and -1 on failure. + */ +typedef int (*inout_func_t)(int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg); + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct inout_port { + const char *name; + int port; + int size; + int flags; + inout_func_t handler; + void *arg; +}; +#pragma clang diagnostic pop + +#define IOPORT_F_IN 0x1 +#define IOPORT_F_OUT 0x2 +#define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT) + +/* + * The following flags are used internally and must not be used by + * device models. 
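+ *
+ * (IOPORT_F_DEFAULT, for example, marks ports claimed by the catch-all
+ * handler that init_inout() installs.)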
+ */ +#define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */ + +#define INOUT_PORT(name, port, flags, handler) \ + static struct inout_port __CONCAT(__inout_port, port) = { \ + #name, \ + (port), \ + 1, \ + (flags), \ + (handler), \ + 0 \ + }; \ + DATA_SET(inout_port_set, __CONCAT(__inout_port, port)) + +void init_inout(void); +int emulate_inout(int vcpu, struct vm_exit *vmexit, int strict); +int register_inout(struct inout_port *iop); +int unregister_inout(struct inout_port *iop); +void init_bvmcons(void); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/ioapic.h b/vendor/github.com/hooklift/xhyve/include/xhyve/ioapic.h new file mode 100644 index 0000000..7f577ab --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/ioapic.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +/* + * Allocate a PCI IRQ from the I/O APIC. + */ +void ioapic_init(void); +int ioapic_pci_alloc_irq(void); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/mem.h b/vendor/github.com/hooklift/xhyve/include/xhyve/mem.h new file mode 100644 index 0000000..17d68f3 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/mem.h @@ -0,0 +1,61 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +typedef int (*mem_func_t)(int vcpu, int dir, uint64_t addr, int size, + uint64_t *val, void *arg1, long arg2); + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct mem_range { + const char *name; + int flags; + mem_func_t handler; + void *arg1; + long arg2; + uint64_t base; + uint64_t size; +}; +#pragma clang diagnostic pop + +#define MEM_F_READ 0x1 +#define MEM_F_WRITE 0x2 +#define MEM_F_RW 0x3 +#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ + +void init_mem(void); +int emulate_mem(int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging); + +int register_mem(struct mem_range *memp); +int register_mem_fallback(struct mem_range *memp); +int unregister_mem(struct mem_range *memp); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/mevent.h b/vendor/github.com/hooklift/xhyve/include/xhyve/mevent.h new file mode 100644 index 0000000..d09ed2e --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/mevent.h @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#pragma once + +enum ev_type { + EVF_READ, + EVF_WRITE, + EVF_TIMER, + EVF_SIGNAL +}; + +struct mevent; + +struct mevent *mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), void *param); +int mevent_enable(struct mevent *evp); +int mevent_disable(struct mevent *evp); +int mevent_delete(struct mevent *evp); +int mevent_delete_close(struct mevent *evp); +int mevent_dispatch(void); +void mevent_exit(void); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/mptbl.h b/vendor/github.com/hooklift/xhyve/include/xhyve/mptbl.h new file mode 100644 index 0000000..df1b9e2 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/mptbl.h @@ -0,0 +1,32 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +int mptable_build(int ncpu); +void mptable_add_oemtbl(void *tbl, int tblsz); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/pci_emul.h b/vendor/github.com/hooklift/xhyve/include/xhyve/pci_emul.h new file mode 100644 index 0000000..04fcd7f --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/pci_emul.h @@ -0,0 +1,278 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ + +struct pci_devinst; +struct memory_region; + +struct pci_devemu { + /* name of device emulation */ + char *pe_emu; + /* instance creation */ + int (*pe_init)(struct pci_devinst *, char *opts); + /* ACPI DSDT enumeration */ + void (*pe_write_dsdt)(struct pci_devinst *); + /* config space read/write callbacks */ + int (*pe_cfgwrite)(int vcpu, struct pci_devinst *pi, + int offset, int bytes, uint32_t val); + int (*pe_cfgread)(int vcpu, struct pci_devinst *pi, int offset, int bytes, + uint32_t *retval); + /* BAR read/write callbacks */ + void (*pe_barwrite)(int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value); + uint64_t (*pe_barread)(int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size); +}; + +#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x) + +enum pcibar_type { + PCIBAR_NONE, + PCIBAR_IO, + PCIBAR_MEM32, + PCIBAR_MEM64, + PCIBAR_MEMHI64 +}; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct pcibar { + enum pcibar_type type; /* io or memory */ + uint64_t size; + uint64_t addr; +}; +#pragma clang diagnostic pop + +#define PI_NAMESZ 40 + +struct msix_table_entry { + uint64_t addr; + uint32_t msg_data; + uint32_t vector_control; +}; + +/* + * In case the structure is modified to hold extra information, use a define + * for the size that should be emulated. 
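+ *
+ * (MSIX_TABLE_ENTRY_SIZE is 16 because struct msix_table_entry above is
+ * exactly the four 32-bit words the PCI spec mandates per MSI-X table
+ * entry: a 64-bit message address, message data, and vector control.)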
+ */ +#define MSIX_TABLE_ENTRY_SIZE 16 +#define MAX_MSIX_TABLE_ENTRIES 2048 +#define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) + +enum lintr_stat { + IDLE, + ASSERTED, + PENDING +}; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct pci_devinst { + struct pci_devemu *pi_d; + uint8_t pi_bus, pi_slot, pi_func; + char pi_name[PI_NAMESZ]; + int pi_bar_getsize; + int pi_prevcap; + int pi_capend; + + struct { + int8_t pin; + enum lintr_stat state; + int pirq_pin; + int ioapic_irq; + pthread_mutex_t lock; + } pi_lintr; + + struct { + int enabled; + uint64_t addr; + uint64_t msg_data; + int maxmsgnum; + } pi_msi; + + struct { + int enabled; + int table_bar; + int pba_bar; + uint32_t table_offset; + int table_count; + uint32_t pba_offset; + int pba_size; + int function_mask; + struct msix_table_entry *table; /* allocated at runtime */ + } pi_msix; + + void *pi_arg; /* devemu-private data */ + + u_char pi_cfgdata[PCI_REGMAX + 1]; + struct pcibar pi_bar[PCI_BARMAX + 1]; +}; +#pragma clang diagnostic pop + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpacked" +struct msicap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t addrlo; + uint32_t addrhi; + uint16_t msgdata; +} __packed; + +struct msixcap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t table_info; /* bar index and offset within it */ + uint32_t pba_info; /* bar index and offset within it */ +} __packed; + +struct pciecap { + uint8_t capid; + uint8_t nextptr; + uint16_t pcie_capabilities; + + uint32_t dev_capabilities; /* all devices */ + uint16_t dev_control; + uint16_t dev_status; + + uint32_t link_capabilities; /* devices with links */ + uint16_t link_control; + uint16_t link_status; + + uint32_t slot_capabilities; /* ports with slots */ + uint16_t slot_control; + uint16_t slot_status; + + uint16_t root_control; /* root ports */ + uint16_t root_capabilities; + uint32_t root_status; + + uint32_t dev_capabilities2; /* all devices */ + uint16_t dev_control2; + uint16_t dev_status2; + + uint32_t link_capabilities2; /* devices with links */ + uint16_t link_control2; + uint16_t link_status2; + + uint32_t slot_capabilities2; /* ports with slots */ + uint16_t slot_control2; + uint16_t slot_status2; +} __packed; +#pragma clang diagnostic pop + +typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, + int ioapic_irq, void *arg); + +int init_pci(void); +void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void pci_callback(void); +int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, + enum pcibar_type type, uint64_t size); +int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, + uint64_t hostbase, enum pcibar_type type, uint64_t size); +int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); +int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); +void pci_generate_msi(struct pci_devinst *pi, int msgnum); +void pci_generate_msix(struct pci_devinst *pi, int msgnum); +void pci_lintr_assert(struct pci_devinst *pi); +void pci_lintr_deassert(struct pci_devinst *pi); +void pci_lintr_request(struct pci_devinst *pi); +int pci_msi_enabled(struct pci_devinst *pi); +int pci_msix_enabled(struct pci_devinst *pi); +int pci_msix_table_bar(struct pci_devinst *pi); +int pci_msix_pba_bar(struct pci_devinst *pi); +int pci_msi_msgnum(struct pci_devinst *pi); +int pci_parse_slot(char 
*opt); +void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); +int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); +int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value); +uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); +int pci_count_lintr(int bus); +void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); +void pci_write_dsdt(void); +uint64_t pci_ecfg_base(void); +int pci_bus_configured(int bus); + +static __inline void +pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) +{ + assert(offset <= PCI_REGMAX); + *(uint8_t *)(((uintptr_t) &pi->pi_cfgdata) + ((unsigned) offset)) = val; +} + +static __inline void +pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + *(uint16_t *)(((uintptr_t) &pi->pi_cfgdata) + ((unsigned) offset)) = val; +} + +static __inline void +pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + *(uint32_t *)(((uintptr_t) &pi->pi_cfgdata) + ((unsigned) offset)) = val; +} + +static __inline uint8_t +pci_get_cfgdata8(struct pci_devinst *pi, int offset) +{ + assert(offset <= PCI_REGMAX); + return (*(uint8_t *)(((uintptr_t) &pi->pi_cfgdata) + ((unsigned) offset))); +} + +static __inline uint16_t +pci_get_cfgdata16(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + return (*(uint16_t *)(((uintptr_t) &pi->pi_cfgdata) + ((unsigned) offset))); +} + +static __inline uint32_t +pci_get_cfgdata32(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + return (*(uint32_t *)(((uintptr_t) &pi->pi_cfgdata) + ((unsigned) offset))); +} diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/pci_irq.h b/vendor/github.com/hooklift/xhyve/include/xhyve/pci_irq.h new file mode 100644 index 0000000..76b72dd --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/pci_irq.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#pragma once + +struct pci_devinst; + +void pci_irq_assert(struct pci_devinst *pi); +void pci_irq_deassert(struct pci_devinst *pi); +void pci_irq_init(void); +void pci_irq_reserve(int irq); +void pci_irq_use(int irq); +int pirq_alloc_pin(void); +int pirq_irq(int pin); +uint8_t pirq_read(int pin); +void pirq_write(int pin, uint8_t val); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/pci_lpc.h b/vendor/github.com/hooklift/xhyve/include/xhyve/pci_lpc.h new file mode 100644 index 0000000..7e8ed3c --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/pci_lpc.h @@ -0,0 +1,73 @@ +/*- + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +typedef void (*lpc_write_dsdt_t)(void); + +struct lpc_dsdt { + lpc_write_dsdt_t handler; +}; + +#define LPC_DSDT(handler) \ + static struct lpc_dsdt __CONCAT(__lpc_dsdt, handler) = { \ + (handler), \ + }; \ + DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, handler)) + +enum lpc_sysres_type { + LPC_SYSRES_IO, + LPC_SYSRES_MEM +}; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct lpc_sysres { + enum lpc_sysres_type type; + uint32_t base; + uint32_t length; +}; +#pragma clang diagnostic pop + +#define LPC_SYSRES(type, base, length) \ + static struct lpc_sysres __CONCAT(__lpc_sysres, base) = {\ + (type), \ + (base), \ + (length) \ + }; \ + DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, base)) + +#define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length) +#define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) + +int lpc_device_parse(const char *opt); +char *lpc_pirq_name(int pin); +void lpc_pirq_routed(void); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/rtc.h b/vendor/github.com/hooklift/xhyve/include/xhyve/rtc.h new file mode 100644 index 0000000..0437564 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/rtc.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2013 Peter Grehan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +void rtc_init(int use_localtime); + diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/smbiostbl.h b/vendor/github.com/hooklift/xhyve/include/xhyve/smbiostbl.h new file mode 100644 index 0000000..a860419 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/smbiostbl.h @@ -0,0 +1,31 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +int smbios_build(void); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/acpi_hpet.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/acpi_hpet.h new file mode 100644 index 0000000..034d6af --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/acpi_hpet.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 2005 Poul-Henning Kamp + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#define HPET_MEM_WIDTH 0x400 /* Expected memory region size */ + +/* General registers */ +#define HPET_CAPABILITIES 0x0 /* General capabilities and ID */ +#define HPET_CAP_VENDOR_ID 0xffff0000 +#define HPET_CAP_LEG_RT 0x00008000 +#define HPET_CAP_COUNT_SIZE 0x00002000 /* 1 = 64-bit, 0 = 32-bit */ +#define HPET_CAP_NUM_TIM 0x00001f00 +#define HPET_CAP_REV_ID 0x000000ff +#define HPET_PERIOD 0x4 /* Period (1/hz) of timer */ +#define HPET_CONFIG 0x10 /* General configuration register */ +#define HPET_CNF_LEG_RT 0x00000002 +#define HPET_CNF_ENABLE 0x00000001 +#define HPET_ISR 0x20 /* General interrupt status register */ +#define HPET_MAIN_COUNTER 0xf0 /* Main counter register */ + +/* Timer registers */ +#define HPET_TIMER_CAP_CNF(x) ((x) * 0x20 + 0x100) +#define HPET_TCAP_INT_ROUTE 0xffffffff00000000 +#define HPET_TCAP_FSB_INT_DEL 0x00008000 +#define HPET_TCNF_FSB_EN 0x00004000 +#define HPET_TCNF_INT_ROUTE 0x00003e00 +#define HPET_TCNF_32MODE 0x00000100 +#define HPET_TCNF_VAL_SET 0x00000040 +#define HPET_TCAP_SIZE 0x00000020 /* 1 = 64-bit, 0 = 32-bit */ +#define HPET_TCAP_PER_INT 0x00000010 /* Supports periodic interrupts */ +#define HPET_TCNF_TYPE 0x00000008 /* 1 = periodic, 0 = one-shot */ +#define HPET_TCNF_INT_ENB 0x00000004 +#define HPET_TCNF_INT_TYPE 0x00000002 /* 1 = level triggered, 0 = edge */ +#define HPET_TIMER_COMPARATOR(x) ((x) * 0x20 + 0x108) +#define HPET_TIMER_FSB_VAL(x) ((x) * 0x20 + 0x110) +#define HPET_TIMER_FSB_ADDR(x) ((x) * 0x20 + 0x114) + +#define HPET_MIN_CYCLES 128 /* Period considered reliable. */ diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/apicreg.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/apicreg.h new file mode 100644 index 0000000..a36ccd5 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/apicreg.h @@ -0,0 +1,509 @@ +/*- + * Copyright (c) 1996, by Peter Wemm and Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. The name of the developer may NOT be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+/*
+ * Local && I/O APIC definitions.
+ */
+
+/*
+ * Pentium P54C+ Built-in APIC
+ * (Advanced Programmable Interrupt Controller)
+ *
+ * Base Address of Built-in APIC in memory location
+ * is 0xfee00000.
+ *
+ * Map of APIC Registers:
+ *
+ * Offset (hex)    Description                     Read/Write state
+ * 000             Reserved
+ * 010             Reserved
+ * 020 ID          Local APIC ID                   R/W
+ * 030 VER         Local APIC Version              R
+ * 040             Reserved
+ * 050             Reserved
+ * 060             Reserved
+ * 070             Reserved
+ * 080             Task Priority Register          R/W
+ * 090             Arbitration Priority Register   R
+ * 0A0             Processor Priority Register     R
+ * 0B0             EOI Register                    W
+ * 0C0 RRR         Remote read                     R
+ * 0D0             Logical Destination             R/W
+ * 0E0             Destination Format Register     0..27 R; 28..31 R/W
+ * 0F0 SVR         Spurious Interrupt Vector Reg.  0..3 R; 4..9 R/W
+ * 100             ISR  000-031                    R
+ * 110             ISR  032-063                    R
+ * 120             ISR  064-095                    R
+ * 130             ISR  096-127                    R
+ * 140             ISR  128-159                    R
+ * 150             ISR  160-191                    R
+ * 160             ISR  192-223                    R
+ * 170             ISR  224-255                    R
+ * 180             TMR  000-031                    R
+ * 190             TMR  032-063                    R
+ * 1A0             TMR  064-095                    R
+ * 1B0             TMR  096-127                    R
+ * 1C0             TMR  128-159                    R
+ * 1D0             TMR  160-191                    R
+ * 1E0             TMR  192-223                    R
+ * 1F0             TMR  224-255                    R
+ * 200             IRR  000-031                    R
+ * 210             IRR  032-063                    R
+ * 220             IRR  064-095                    R
+ * 230             IRR  096-127                    R
+ * 240             IRR  128-159                    R
+ * 250             IRR  160-191                    R
+ * 260             IRR  192-223                    R
+ * 270             IRR  224-255                    R
+ * 280             Error Status Register           R
+ * 290             Reserved
+ * 2A0             Reserved
+ * 2B0             Reserved
+ * 2C0             Reserved
+ * 2D0             Reserved
+ * 2E0             Reserved
+ * 2F0             Local Vector Table (CMCI)       R/W
+ * 300 ICR_LOW     Interrupt Command Reg. (0-31)   R/W
+ * 310 ICR_HI      Interrupt Command Reg. (32-63)  R/W
+ * 320             Local Vector Table (Timer)      R/W
+ * 330             Local Vector Table (Thermal)    R/W (PIV+)
+ * 340             Local Vector Table (Performance) R/W (P6+)
+ * 350 LVT1        Local Vector Table (LINT0)      R/W
+ * 360 LVT2        Local Vector Table (LINT1)      R/W
+ * 370 LVT3        Local Vector Table (ERROR)      R/W
+ * 380             Initial Count Reg. for Timer    R/W
+ * 390             Current Count of Timer          R
+ * 3A0             Reserved
+ * 3B0             Reserved
+ * 3C0             Reserved
+ * 3D0             Reserved
+ * 3E0             Timer Divide Configuration Reg. R/W
+ * 3F0             Reserved
+ */
+
+
+/******************************************************************************
+ * global defines, etc.
+ */ + + +/****************************************************************************** + * LOCAL APIC structure + */ + +#define PAD3 int : 32; int : 32; int : 32 +#define PAD4 int : 32; int : 32; int : 32; int : 32 + +struct LAPIC { + /* reserved */ PAD4; + /* reserved */ PAD4; + uint32_t id; PAD3; + uint32_t version; PAD3; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + uint32_t tpr; PAD3; + uint32_t apr; PAD3; + uint32_t ppr; PAD3; + uint32_t eoi; PAD3; + /* reserved */ PAD4; + uint32_t ldr; PAD3; + uint32_t dfr; PAD3; + uint32_t svr; PAD3; + uint32_t isr0; PAD3; + uint32_t isr1; PAD3; + uint32_t isr2; PAD3; + uint32_t isr3; PAD3; + uint32_t isr4; PAD3; + uint32_t isr5; PAD3; + uint32_t isr6; PAD3; + uint32_t isr7; PAD3; + uint32_t tmr0; PAD3; + uint32_t tmr1; PAD3; + uint32_t tmr2; PAD3; + uint32_t tmr3; PAD3; + uint32_t tmr4; PAD3; + uint32_t tmr5; PAD3; + uint32_t tmr6; PAD3; + uint32_t tmr7; PAD3; + uint32_t irr0; PAD3; + uint32_t irr1; PAD3; + uint32_t irr2; PAD3; + uint32_t irr3; PAD3; + uint32_t irr4; PAD3; + uint32_t irr5; PAD3; + uint32_t irr6; PAD3; + uint32_t irr7; PAD3; + uint32_t esr; PAD3; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + uint32_t lvt_cmci; PAD3; + uint32_t icr_lo; PAD3; + uint32_t icr_hi; PAD3; + uint32_t lvt_timer; PAD3; + uint32_t lvt_thermal; PAD3; + uint32_t lvt_pcint; PAD3; + uint32_t lvt_lint0; PAD3; + uint32_t lvt_lint1; PAD3; + uint32_t lvt_error; PAD3; + uint32_t icr_timer; PAD3; + uint32_t ccr_timer; PAD3; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + uint32_t dcr_timer; PAD3; + /* reserved */ PAD4; +}; + +typedef struct LAPIC lapic_t; + +enum LAPIC_REGISTERS { + LAPIC_ID = 0x2, + LAPIC_VERSION = 0x3, + LAPIC_TPR = 0x8, + LAPIC_APR = 0x9, + LAPIC_PPR = 0xa, + LAPIC_EOI = 0xb, + LAPIC_LDR = 0xd, + LAPIC_DFR = 0xe, /* Not in x2APIC */ + LAPIC_SVR = 0xf, + LAPIC_ISR0 = 0x10, + LAPIC_ISR1 = 0x11, + LAPIC_ISR2 = 0x12, + LAPIC_ISR3 = 0x13, + LAPIC_ISR4 = 0x14, + LAPIC_ISR5 = 0x15, + LAPIC_ISR6 = 0x16, + LAPIC_ISR7 = 0x17, + LAPIC_TMR0 = 0x18, + LAPIC_TMR1 = 0x19, + LAPIC_TMR2 = 0x1a, + LAPIC_TMR3 = 0x1b, + LAPIC_TMR4 = 0x1c, + LAPIC_TMR5 = 0x1d, + LAPIC_TMR6 = 0x1e, + LAPIC_TMR7 = 0x1f, + LAPIC_IRR0 = 0x20, + LAPIC_IRR1 = 0x21, + LAPIC_IRR2 = 0x22, + LAPIC_IRR3 = 0x23, + LAPIC_IRR4 = 0x24, + LAPIC_IRR5 = 0x25, + LAPIC_IRR6 = 0x26, + LAPIC_IRR7 = 0x27, + LAPIC_ESR = 0x28, + LAPIC_LVT_CMCI = 0x2f, + LAPIC_ICR_LO = 0x30, + LAPIC_ICR_HI = 0x31, /* Not in x2APIC */ + LAPIC_LVT_TIMER = 0x32, + LAPIC_LVT_THERMAL = 0x33, + LAPIC_LVT_PCINT = 0x34, + LAPIC_LVT_LINT0 = 0x35, + LAPIC_LVT_LINT1 = 0x36, + LAPIC_LVT_ERROR = 0x37, + LAPIC_ICR_TIMER = 0x38, + LAPIC_CCR_TIMER = 0x39, + LAPIC_DCR_TIMER = 0x3e, + LAPIC_SELF_IPI = 0x3f, /* Only in x2APIC */ +}; + +/* + * The LAPIC_SELF_IPI register only exists in x2APIC mode. The + * formula below is applicable only to reserve the memory region, + * i.e. for xAPIC mode, where LAPIC_SELF_IPI finely serves as the + * address past end of the region. 
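+ *
+ * As a worked example of that formula (illustrative only): a register
+ * index from enum LAPIC_REGISTERS maps to its memory-mapped byte offset
+ * by multiplying with LAPIC_MEM_MUL (0x10), so LAPIC_ID (index 0x2)
+ * lands at offset 0x2 * 0x10 = 0x20, matching the register map above,
+ * and LAPIC_MEM_REGION works out to 0x3f * 0x10 = 0x3f0 bytes.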
+ */ +#define LAPIC_MEM_REGION (LAPIC_SELF_IPI * 0x10) + +#define LAPIC_MEM_MUL 0x10 + +/****************************************************************************** + * I/O APIC structure + */ + +struct IOAPIC { + uint32_t ioregsel; PAD3; + uint32_t iowin; PAD3; +}; + +typedef struct IOAPIC ioapic_t; + +#undef PAD4 +#undef PAD3 + + +/****************************************************************************** + * various code 'logical' values + */ + +/****************************************************************************** + * LOCAL APIC defines + */ + +/* default physical locations of LOCAL (CPU) APICs */ +#define DEFAULT_APIC_BASE 0xfee00000 + +/* constants relating to APIC ID registers */ +#define APIC_ID_MASK 0xff000000 +#define APIC_ID_SHIFT 24 +#define APIC_ID_CLUSTER 0xf0 +#define APIC_ID_CLUSTER_ID 0x0f +#define APIC_MAX_CLUSTER 0xe +#define APIC_MAX_INTRACLUSTER_ID 3 +#define APIC_ID_CLUSTER_SHIFT 4 + +/* fields in VER */ +#define APIC_VER_VERSION 0x000000ff +#define APIC_VER_MAXLVT 0x00ff0000 +#define MAXLVTSHIFT 16 +#define APIC_VER_EOI_SUPPRESSION 0x01000000 + +/* fields in LDR */ +#define APIC_LDR_RESERVED 0x00ffffff + +/* fields in DFR */ +#define APIC_DFR_RESERVED 0x0fffffff +#define APIC_DFR_MODEL_MASK 0xf0000000 +#define APIC_DFR_MODEL_FLAT 0xf0000000 +#define APIC_DFR_MODEL_CLUSTER 0x00000000 + +/* fields in SVR */ +#define APIC_SVR_VECTOR 0x000000ff +#define APIC_SVR_VEC_PROG 0x000000f0 +#define APIC_SVR_VEC_FIX 0x0000000f +#define APIC_SVR_ENABLE 0x00000100 +# define APIC_SVR_SWDIS 0x00000000 +# define APIC_SVR_SWEN 0x00000100 +#define APIC_SVR_FOCUS 0x00000200 +# define APIC_SVR_FEN 0x00000000 +# define APIC_SVR_FDIS 0x00000200 +#define APIC_SVR_EOI_SUPPRESSION 0x00001000 + +/* fields in TPR */ +#define APIC_TPR_PRIO 0x000000ff +# define APIC_TPR_INT 0x000000f0 +# define APIC_TPR_SUB 0x0000000f + +/* fields in ESR */ +#define APIC_ESR_SEND_CS_ERROR 0x00000001 +#define APIC_ESR_RECEIVE_CS_ERROR 0x00000002 +#define APIC_ESR_SEND_ACCEPT 0x00000004 +#define APIC_ESR_RECEIVE_ACCEPT 0x00000008 +#define APIC_ESR_SEND_ILLEGAL_VECTOR 0x00000020 +#define APIC_ESR_RECEIVE_ILLEGAL_VECTOR 0x00000040 +#define APIC_ESR_ILLEGAL_REGISTER 0x00000080 + +/* fields in ICR_LOW */ +#define APIC_VECTOR_MASK 0x000000ff + +#define APIC_DELMODE_MASK 0x00000700 +# define APIC_DELMODE_FIXED 0x00000000 +# define APIC_DELMODE_LOWPRIO 0x00000100 +# define APIC_DELMODE_SMI 0x00000200 +# define APIC_DELMODE_RR 0x00000300 +# define APIC_DELMODE_NMI 0x00000400 +# define APIC_DELMODE_INIT 0x00000500 +# define APIC_DELMODE_STARTUP 0x00000600 +# define APIC_DELMODE_RESV 0x00000700 + +#define APIC_DESTMODE_MASK 0x00000800 +# define APIC_DESTMODE_PHY 0x00000000 +# define APIC_DESTMODE_LOG 0x00000800 + +#define APIC_DELSTAT_MASK 0x00001000 +# define APIC_DELSTAT_IDLE 0x00000000 +# define APIC_DELSTAT_PEND 0x00001000 + +#define APIC_RESV1_MASK 0x00002000 + +#define APIC_LEVEL_MASK 0x00004000 +# define APIC_LEVEL_DEASSERT 0x00000000 +# define APIC_LEVEL_ASSERT 0x00004000 + +#define APIC_TRIGMOD_MASK 0x00008000 +# define APIC_TRIGMOD_EDGE 0x00000000 +# define APIC_TRIGMOD_LEVEL 0x00008000 + +#define APIC_RRSTAT_MASK 0x00030000 +# define APIC_RRSTAT_INVALID 0x00000000 +# define APIC_RRSTAT_INPROG 0x00010000 +# define APIC_RRSTAT_VALID 0x00020000 +# define APIC_RRSTAT_RESV 0x00030000 + +#define APIC_DEST_MASK 0x000c0000 +# define APIC_DEST_DESTFLD 0x00000000 +# define APIC_DEST_SELF 0x00040000 +# define APIC_DEST_ALLISELF 0x00080000 +# define APIC_DEST_ALLESELF 0x000c0000 + +#define APIC_RESV2_MASK 
0xfff00000 + +#define APIC_ICRLO_RESV_MASK (APIC_RESV1_MASK | APIC_RESV2_MASK) + +/* fields in LVT1/2 */ +#define APIC_LVT_VECTOR 0x000000ff +#define APIC_LVT_DM 0x00000700 +# define APIC_LVT_DM_FIXED 0x00000000 +# define APIC_LVT_DM_SMI 0x00000200 +# define APIC_LVT_DM_NMI 0x00000400 +# define APIC_LVT_DM_INIT 0x00000500 +# define APIC_LVT_DM_EXTINT 0x00000700 +#define APIC_LVT_DS 0x00001000 +#define APIC_LVT_IIPP 0x00002000 +#define APIC_LVT_IIPP_INTALO 0x00002000 +#define APIC_LVT_IIPP_INTAHI 0x00000000 +#define APIC_LVT_RIRR 0x00004000 +#define APIC_LVT_TM 0x00008000 +#define APIC_LVT_M 0x00010000 + + +/* fields in LVT Timer */ +#define APIC_LVTT_VECTOR 0x000000ff +#define APIC_LVTT_DS 0x00001000 +#define APIC_LVTT_M 0x00010000 +#define APIC_LVTT_TM 0x00020000 +# define APIC_LVTT_TM_ONE_SHOT 0x00000000 +# define APIC_LVTT_TM_PERIODIC 0x00020000 + + +/* APIC timer current count */ +#define APIC_TIMER_MAX_COUNT 0xffffffff + +/* fields in TDCR */ +#define APIC_TDCR_2 0x00 +#define APIC_TDCR_4 0x01 +#define APIC_TDCR_8 0x02 +#define APIC_TDCR_16 0x03 +#define APIC_TDCR_32 0x08 +#define APIC_TDCR_64 0x09 +#define APIC_TDCR_128 0x0a +#define APIC_TDCR_1 0x0b + +/* LVT table indices */ +#define APIC_LVT_LINT0 0 +#define APIC_LVT_LINT1 1 +#define APIC_LVT_TIMER 2 +#define APIC_LVT_ERROR 3 +#define APIC_LVT_PMC 4 +#define APIC_LVT_THERMAL 5 +#define APIC_LVT_CMCI 6 +#define APIC_LVT_MAX APIC_LVT_CMCI + +/****************************************************************************** + * I/O APIC defines + */ + +/* default physical locations of an IO APIC */ +#define DEFAULT_IO_APIC_BASE 0xfec00000 + +/* window register offset */ +#define IOAPIC_WINDOW 0x10 +#define IOAPIC_EOIR 0x40 + +/* indexes into IO APIC */ +#define IOAPIC_ID 0x00 +#define IOAPIC_VER 0x01 +#define IOAPIC_ARB 0x02 +#define IOAPIC_REDTBL 0x10 +#define IOAPIC_REDTBL0 IOAPIC_REDTBL +#define IOAPIC_REDTBL1 (IOAPIC_REDTBL+0x02) +#define IOAPIC_REDTBL2 (IOAPIC_REDTBL+0x04) +#define IOAPIC_REDTBL3 (IOAPIC_REDTBL+0x06) +#define IOAPIC_REDTBL4 (IOAPIC_REDTBL+0x08) +#define IOAPIC_REDTBL5 (IOAPIC_REDTBL+0x0a) +#define IOAPIC_REDTBL6 (IOAPIC_REDTBL+0x0c) +#define IOAPIC_REDTBL7 (IOAPIC_REDTBL+0x0e) +#define IOAPIC_REDTBL8 (IOAPIC_REDTBL+0x10) +#define IOAPIC_REDTBL9 (IOAPIC_REDTBL+0x12) +#define IOAPIC_REDTBL10 (IOAPIC_REDTBL+0x14) +#define IOAPIC_REDTBL11 (IOAPIC_REDTBL+0x16) +#define IOAPIC_REDTBL12 (IOAPIC_REDTBL+0x18) +#define IOAPIC_REDTBL13 (IOAPIC_REDTBL+0x1a) +#define IOAPIC_REDTBL14 (IOAPIC_REDTBL+0x1c) +#define IOAPIC_REDTBL15 (IOAPIC_REDTBL+0x1e) +#define IOAPIC_REDTBL16 (IOAPIC_REDTBL+0x20) +#define IOAPIC_REDTBL17 (IOAPIC_REDTBL+0x22) +#define IOAPIC_REDTBL18 (IOAPIC_REDTBL+0x24) +#define IOAPIC_REDTBL19 (IOAPIC_REDTBL+0x26) +#define IOAPIC_REDTBL20 (IOAPIC_REDTBL+0x28) +#define IOAPIC_REDTBL21 (IOAPIC_REDTBL+0x2a) +#define IOAPIC_REDTBL22 (IOAPIC_REDTBL+0x2c) +#define IOAPIC_REDTBL23 (IOAPIC_REDTBL+0x2e) + +/* fields in VER */ +#define IOART_VER_VERSION 0x000000ff +#define IOART_VER_MAXREDIR 0x00ff0000 +#define MAXREDIRSHIFT 16 + +/* + * fields in the IO APIC's redirection table entries + */ +#define IOART_DEST APIC_ID_MASK /* broadcast addr: all APICs */ + +#define IOART_RESV 0x00fe0000 /* reserved */ + +#define IOART_INTMASK 0x00010000 /* R/W: INTerrupt mask */ +# define IOART_INTMCLR 0x00000000 /* clear, allow INTs */ +# define IOART_INTMSET 0x00010000 /* set, inhibit INTs */ + +#define IOART_TRGRMOD 0x00008000 /* R/W: trigger mode */ +# define IOART_TRGREDG 0x00000000 /* edge */ +# define IOART_TRGRLVL 0x00008000 /* 
level */ + +#define IOART_REM_IRR 0x00004000 /* RO: remote IRR */ + +#define IOART_INTPOL 0x00002000 /* R/W: INT input pin polarity */ +# define IOART_INTAHI 0x00000000 /* active high */ +# define IOART_INTALO 0x00002000 /* active low */ + +#define IOART_DELIVS 0x00001000 /* RO: delivery status */ + +#define IOART_DESTMOD 0x00000800 /* R/W: destination mode */ +# define IOART_DESTPHY 0x00000000 /* physical */ +# define IOART_DESTLOG 0x00000800 /* logical */ + +#define IOART_DELMOD 0x00000700 /* R/W: delivery mode */ +# define IOART_DELFIXED 0x00000000 /* fixed */ +# define IOART_DELLOPRI 0x00000100 /* lowest priority */ +# define IOART_DELSMI 0x00000200 /* System Management INT */ +# define IOART_DELRSV1 0x00000300 /* reserved */ +# define IOART_DELNMI 0x00000400 /* NMI signal */ +# define IOART_DELINIT 0x00000500 /* INIT signal */ +# define IOART_DELRSV2 0x00000600 /* reserved */ +# define IOART_DELEXINT 0x00000700 /* External INTerrupt */ + +#define IOART_INTVEC 0x000000ff /* R/W: INTerrupt vector field */ diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/ata.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/ata.h new file mode 100644 index 0000000..7d0a7fd --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/ata.h @@ -0,0 +1,643 @@ +/*- + * Copyright (c) 2000 - 2008 Søren Schmidt + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification, immediately at the beginning of the file. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#pragma once
+
+#include <sys/types.h>
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#pragma clang diagnostic ignored "-Wpacked"
+
+/* ATA/ATAPI device parameters */
+struct ata_params {
+/*000*/ u_int16_t config; /* configuration info */
+#define ATA_PROTO_MASK 0x8003
+#define ATA_PROTO_ATAPI 0x8000
+#define ATA_PROTO_ATAPI_12 0x8000
+#define ATA_PROTO_ATAPI_16 0x8001
+#define ATA_PROTO_CFA 0x848a
+#define ATA_ATAPI_TYPE_MASK 0x1f00
+#define ATA_ATAPI_TYPE_DIRECT 0x0000 /* disk/floppy */
+#define ATA_ATAPI_TYPE_TAPE 0x0100 /* streaming tape */
+#define ATA_ATAPI_TYPE_CDROM 0x0500 /* CD-ROM device */
+#define ATA_ATAPI_TYPE_OPTICAL 0x0700 /* optical disk */
+#define ATA_DRQ_MASK 0x0060
+#define ATA_DRQ_SLOW 0x0000 /* cpu 3 ms delay */
+#define ATA_DRQ_INTR 0x0020 /* interrupt 10 ms delay */
+#define ATA_DRQ_FAST 0x0040 /* accel 50 us delay */
+#define ATA_RESP_INCOMPLETE 0x0004
+
+/*001*/ u_int16_t cylinders; /* # of cylinders */
+/*002*/ u_int16_t specconf; /* specific configuration */
+/*003*/ u_int16_t heads; /* # heads */
+	u_int16_t obsolete4;
+	u_int16_t obsolete5;
+/*006*/ u_int16_t sectors; /* # sectors/track */
+/*007*/ u_int16_t vendor7[3];
+/*010*/ u_int8_t serial[20]; /* serial number */
+/*020*/ u_int16_t retired20;
+	u_int16_t retired21;
+	u_int16_t obsolete22;
+/*023*/ u_int8_t revision[8]; /* firmware revision */
+/*027*/ u_int8_t model[40]; /* model name */
+/*047*/ u_int16_t sectors_intr; /* sectors per interrupt */
+/*048*/ u_int16_t usedmovsd; /* double word read/write? */
+/*049*/ u_int16_t capabilities1;
+#define ATA_SUPPORT_DMA 0x0100
+#define ATA_SUPPORT_LBA 0x0200
+#define ATA_SUPPORT_IORDY 0x0400
+#define ATA_SUPPORT_IORDYDIS 0x0800
+#define ATA_SUPPORT_OVERLAP 0x4000
+
+/*050*/ u_int16_t capabilities2;
+/*051*/ u_int16_t retired_piomode; /* PIO modes 0-2 */
+#define ATA_RETIRED_PIO_MASK 0x0300
+
+/*052*/ u_int16_t retired_dmamode; /* DMA modes */
+#define ATA_RETIRED_DMA_MASK 0x0003
+
+/*053*/ u_int16_t atavalid; /* fields valid */
+#define ATA_FLAG_54_58 0x0001 /* words 54-58 valid */
+#define ATA_FLAG_64_70 0x0002 /* words 64-70 valid */
+#define ATA_FLAG_88 0x0004 /* word 88 valid */
+
+/*054*/ u_int16_t current_cylinders;
+/*055*/ u_int16_t current_heads;
+/*056*/ u_int16_t current_sectors;
+/*057*/ u_int16_t current_size_1;
+/*058*/ u_int16_t current_size_2;
+/*059*/ u_int16_t multi;
+#define ATA_MULTI_VALID 0x0100
+
+/*060*/ u_int16_t lba_size_1;
+	u_int16_t lba_size_2;
+	u_int16_t obsolete62;
+/*063*/ u_int16_t mwdmamodes; /* multiword DMA modes */
+/*064*/ u_int16_t apiomodes; /* advanced PIO modes */
+
+/*065*/ u_int16_t mwdmamin; /* min. M/W DMA time/word ns */
+/*066*/ u_int16_t mwdmarec; /* rec. M/W DMA time ns */
+/*067*/ u_int16_t pioblind; /* min. PIO cycle w/o flow */
+/*068*/ u_int16_t pioiordy; /* min.
PIO cycle IORDY flow */ +/*069*/ u_int16_t support3; +#define ATA_SUPPORT_RZAT 0x0020 +#define ATA_SUPPORT_DRAT 0x4000 + u_int16_t reserved70; +/*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */ +/*072*/ u_int16_t rlsservice; /* rel time (us) for service */ + u_int16_t reserved73; + u_int16_t reserved74; +/*075*/ u_int16_t queue; +#define ATA_QUEUE_LEN(x) ((x) & 0x001f) + +/*76*/ u_int16_t satacapabilities; +#define ATA_SATA_GEN1 0x0002 +#define ATA_SATA_GEN2 0x0004 +#define ATA_SATA_GEN3 0x0008 +#define ATA_SUPPORT_NCQ 0x0100 +#define ATA_SUPPORT_IFPWRMNGTRCV 0x0200 +#define ATA_SUPPORT_PHYEVENTCNT 0x0400 +#define ATA_SUPPORT_NCQ_UNLOAD 0x0800 +#define ATA_SUPPORT_NCQ_PRIO 0x1000 +#define ATA_SUPPORT_HAPST 0x2000 +#define ATA_SUPPORT_DAPST 0x4000 +#define ATA_SUPPORT_READLOGDMAEXT 0x8000 + +/*77*/ u_int16_t satacapabilities2; +#define ATA_SATA_CURR_GEN_MASK 0x0006 +#define ATA_SUPPORT_NCQ_STREAM 0x0010 +#define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020 +#define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040 +/*78*/ u_int16_t satasupport; +#define ATA_SUPPORT_NONZERO 0x0002 +#define ATA_SUPPORT_AUTOACTIVATE 0x0004 +#define ATA_SUPPORT_IFPWRMNGT 0x0008 +#define ATA_SUPPORT_INORDERDATA 0x0010 +#define ATA_SUPPORT_ASYNCNOTIF 0x0020 +#define ATA_SUPPORT_SOFTSETPRESERVE 0x0040 +/*79*/ u_int16_t sataenabled; +#define ATA_ENABLED_DAPST 0x0080 + +/*080*/ u_int16_t version_major; +/*081*/ u_int16_t version_minor; + + struct { +/*082/085*/ u_int16_t command1; +#define ATA_SUPPORT_SMART 0x0001 +#define ATA_SUPPORT_SECURITY 0x0002 +#define ATA_SUPPORT_REMOVABLE 0x0004 +#define ATA_SUPPORT_POWERMGT 0x0008 +#define ATA_SUPPORT_PACKET 0x0010 +#define ATA_SUPPORT_WRITECACHE 0x0020 +#define ATA_SUPPORT_LOOKAHEAD 0x0040 +#define ATA_SUPPORT_RELEASEIRQ 0x0080 +#define ATA_SUPPORT_SERVICEIRQ 0x0100 +#define ATA_SUPPORT_RESET 0x0200 +#define ATA_SUPPORT_PROTECTED 0x0400 +#define ATA_SUPPORT_WRITEBUFFER 0x1000 +#define ATA_SUPPORT_READBUFFER 0x2000 +#define ATA_SUPPORT_NOP 0x4000 + +/*083/086*/ u_int16_t command2; +#define ATA_SUPPORT_MICROCODE 0x0001 +#define ATA_SUPPORT_QUEUED 0x0002 +#define ATA_SUPPORT_CFA 0x0004 +#define ATA_SUPPORT_APM 0x0008 +#define ATA_SUPPORT_NOTIFY 0x0010 +#define ATA_SUPPORT_STANDBY 0x0020 +#define ATA_SUPPORT_SPINUP 0x0040 +#define ATA_SUPPORT_MAXSECURITY 0x0100 +#define ATA_SUPPORT_AUTOACOUSTIC 0x0200 +#define ATA_SUPPORT_ADDRESS48 0x0400 +#define ATA_SUPPORT_OVERLAY 0x0800 +#define ATA_SUPPORT_FLUSHCACHE 0x1000 +#define ATA_SUPPORT_FLUSHCACHE48 0x2000 + +/*084/087*/ u_int16_t extension; +#define ATA_SUPPORT_SMARTLOG 0x0001 +#define ATA_SUPPORT_SMARTTEST 0x0002 +#define ATA_SUPPORT_MEDIASN 0x0004 +#define ATA_SUPPORT_MEDIAPASS 0x0008 +#define ATA_SUPPORT_STREAMING 0x0010 +#define ATA_SUPPORT_GENLOG 0x0020 +#define ATA_SUPPORT_WRITEDMAFUAEXT 0x0040 +#define ATA_SUPPORT_WRITEDMAQFUAEXT 0x0080 +#define ATA_SUPPORT_64BITWWN 0x0100 +#define ATA_SUPPORT_UNLOAD 0x2000 + } __packed support, enabled; + +/*088*/ u_int16_t udmamodes; /* UltraDMA modes */ +/*089*/ u_int16_t erase_time; /* time req'd in 2min units */ +/*090*/ u_int16_t enhanced_erase_time; /* time req'd in 2min units */ +/*091*/ u_int16_t apm_value; +/*092*/ u_int16_t master_passwd_revision; /* password revision code */ +/*093*/ u_int16_t hwres; +#define ATA_CABLE_ID 0x2000 + +/*094*/ u_int16_t acoustic; +#define ATA_ACOUSTIC_CURRENT(x) ((x) & 0x00ff) +#define ATA_ACOUSTIC_VENDOR(x) (((x) & 0xff00) >> 8) + +/*095*/ u_int16_t stream_min_req_size; +/*096*/ u_int16_t stream_transfer_time; +/*097*/ u_int16_t stream_access_latency; 
+/*098*/ u_int32_t stream_granularity; +/*100*/ u_int16_t lba_size48_1; + u_int16_t lba_size48_2; + u_int16_t lba_size48_3; + u_int16_t lba_size48_4; + u_int16_t reserved104; +/*105*/ u_int16_t max_dsm_blocks; +/*106*/ u_int16_t pss; +#define ATA_PSS_LSPPS 0x000F +#define ATA_PSS_LSSABOVE512 0x1000 +#define ATA_PSS_MULTLS 0x2000 +#define ATA_PSS_VALID_MASK 0xC000 +#define ATA_PSS_VALID_VALUE 0x4000 +/*107*/ u_int16_t isd; +/*108*/ u_int16_t wwn[4]; + u_int16_t reserved112[5]; +/*117*/ u_int16_t lss_1; +/*118*/ u_int16_t lss_2; +/*119*/ u_int16_t support2; +#define ATA_SUPPORT_WRITEREADVERIFY 0x0002 +#define ATA_SUPPORT_WRITEUNCORREXT 0x0004 +#define ATA_SUPPORT_RWLOGDMAEXT 0x0008 +#define ATA_SUPPORT_MICROCODE3 0x0010 +#define ATA_SUPPORT_FREEFALL 0x0020 +/*120*/ u_int16_t enabled2; + u_int16_t reserved121[6]; +/*127*/ u_int16_t removable_status; +/*128*/ u_int16_t security_status; +#define ATA_SECURITY_LEVEL 0x0100 /* 0: high, 1: maximum */ +#define ATA_SECURITY_ENH_SUPP 0x0020 /* enhanced erase supported */ +#define ATA_SECURITY_COUNT_EXP 0x0010 /* count expired */ +#define ATA_SECURITY_FROZEN 0x0008 /* security config is frozen */ +#define ATA_SECURITY_LOCKED 0x0004 /* drive is locked */ +#define ATA_SECURITY_ENABLED 0x0002 /* ATA Security is enabled */ +#define ATA_SECURITY_SUPPORTED 0x0001 /* ATA Security is supported */ + + u_int16_t reserved129[31]; +/*160*/ u_int16_t cfa_powermode1; + u_int16_t reserved161; +/*162*/ u_int16_t cfa_kms_support; +/*163*/ u_int16_t cfa_trueide_modes; +/*164*/ u_int16_t cfa_memory_modes; + u_int16_t reserved165[4]; +/*169*/ u_int16_t support_dsm; +#define ATA_SUPPORT_DSM_TRIM 0x0001 + u_int16_t reserved170[6]; +/*176*/ u_int8_t media_serial[60]; +/*206*/ u_int16_t sct; + u_int16_t reserved206[2]; +/*209*/ u_int16_t lsalign; +/*210*/ u_int16_t wrv_sectors_m3_1; + u_int16_t wrv_sectors_m3_2; +/*212*/ u_int16_t wrv_sectors_m2_1; + u_int16_t wrv_sectors_m2_2; +/*214*/ u_int16_t nv_cache_caps; +/*215*/ u_int16_t nv_cache_size_1; + u_int16_t nv_cache_size_2; +/*217*/ u_int16_t media_rotation_rate; +#define ATA_RATE_NOT_REPORTED 0x0000 +#define ATA_RATE_NON_ROTATING 0x0001 + u_int16_t reserved218; +/*219*/ u_int16_t nv_cache_opt; +/*220*/ u_int16_t wrv_mode; + u_int16_t reserved221; +/*222*/ u_int16_t transport_major; +/*223*/ u_int16_t transport_minor; + u_int16_t reserved224[31]; +/*255*/ u_int16_t integrity; +} __packed; + +/* ATA Dataset Management */ +#define ATA_DSM_BLK_SIZE 512 +#define ATA_DSM_BLK_RANGES 64 +#define ATA_DSM_RANGE_SIZE 8 +#define ATA_DSM_RANGE_MAX 65535 + +/* + * ATA Device Register + * + * bit 7 Obsolete (was 1 in early ATA specs) + * bit 6 Sets LBA/CHS mode. 1=LBA, 0=CHS + * bit 5 Obsolete (was 1 in early ATA specs) + * bit 4 1 = Slave Drive, 0 = Master Drive + * bit 3-0 In LBA mode, 27-24 of address. 
In CHS mode, head number
+*/
+
+#define ATA_DEV_MASTER 0x00
+#define ATA_DEV_SLAVE 0x10
+#define ATA_DEV_LBA 0x40
+
+/* ATA limits */
+#define ATA_MAX_28BIT_LBA 268435455UL
+
+/* ATA Status Register */
+#define ATA_STATUS_ERROR 0x01
+#define ATA_STATUS_DEVICE_FAULT 0x20
+
+/* ATA Error Register */
+#define ATA_ERROR_ABORT 0x04
+#define ATA_ERROR_ID_NOT_FOUND 0x10
+
+/* ATA HPA Features */
+#define ATA_HPA_FEAT_MAX_ADDR 0x00
+#define ATA_HPA_FEAT_SET_PWD 0x01
+#define ATA_HPA_FEAT_LOCK 0x02
+#define ATA_HPA_FEAT_UNLOCK 0x03
+#define ATA_HPA_FEAT_FREEZE 0x04
+
+/* ATA transfer modes */
+#define ATA_MODE_MASK 0x0f
+#define ATA_DMA_MASK 0xf0
+#define ATA_PIO 0x00
+#define ATA_PIO0 0x08
+#define ATA_PIO1 0x09
+#define ATA_PIO2 0x0a
+#define ATA_PIO3 0x0b
+#define ATA_PIO4 0x0c
+#define ATA_PIO_MAX 0x0f
+#define ATA_DMA 0x10
+#define ATA_WDMA0 0x20
+#define ATA_WDMA1 0x21
+#define ATA_WDMA2 0x22
+#define ATA_UDMA0 0x40
+#define ATA_UDMA1 0x41
+#define ATA_UDMA2 0x42
+#define ATA_UDMA3 0x43
+#define ATA_UDMA4 0x44
+#define ATA_UDMA5 0x45
+#define ATA_UDMA6 0x46
+#define ATA_SA150 0x47
+#define ATA_SA300 0x48
+#define ATA_DMA_MAX 0x4f
+
+
+/* ATA commands */
+#define ATA_NOP 0x00 /* NOP */
+#define ATA_NF_FLUSHQUEUE 0x00 /* flush queued cmd's */
+#define ATA_NF_AUTOPOLL 0x01 /* start autopoll function */
+#define ATA_DATA_SET_MANAGEMENT 0x06
+#define ATA_DSM_TRIM 0x01
+#define ATA_DEVICE_RESET 0x08 /* reset device */
+#define ATA_READ 0x20 /* read */
+#define ATA_READ48 0x24 /* read 48bit LBA */
+#define ATA_READ_DMA48 0x25 /* read DMA 48bit LBA */
+#define ATA_READ_DMA_QUEUED48 0x26 /* read DMA QUEUED 48bit LBA */
+#define ATA_READ_NATIVE_MAX_ADDRESS48 0x27 /* read native max addr 48bit */
+#define ATA_READ_MUL48 0x29 /* read multi 48bit LBA */
+#define ATA_READ_STREAM_DMA48 0x2a /* read DMA stream 48bit LBA */
+#define ATA_READ_LOG_EXT 0x2f /* read log ext - PIO Data-In */
+#define ATA_READ_STREAM48 0x2b /* read stream 48bit LBA */
+#define ATA_WRITE 0x30 /* write */
+#define ATA_WRITE48 0x34 /* write 48bit LBA */
+#define ATA_WRITE_DMA48 0x35 /* write DMA 48bit LBA */
+#define ATA_WRITE_DMA_QUEUED48 0x36 /* write DMA QUEUED 48bit LBA */
+#define ATA_SET_MAX_ADDRESS48 0x37 /* set max address 48bit */
+#define ATA_WRITE_MUL48 0x39 /* write multi 48bit LBA */
+#define ATA_WRITE_STREAM_DMA48 0x3a
+#define ATA_WRITE_STREAM48 0x3b
+#define ATA_WRITE_DMA_FUA48 0x3d
+#define ATA_WRITE_DMA_QUEUED_FUA48 0x3e
+#define ATA_WRITE_LOG_EXT 0x3f
+#define ATA_READ_VERIFY 0x40
+#define ATA_READ_VERIFY48 0x42
+#define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */
+#define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */
+#define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */
+#define ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */
+#define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */
+#define ATA_SFPDMA_DSM 0x00 /* Data set management */
+#define ATA_SFPDMA_DSM_TRIM 0x01 /* Set trim bit in auxiliary */
+#define ATA_SFPDMA_HYBRID_EVICT 0x01 /* Hybrid Evict */
+#define ATA_SFPDMA_WLDMA 0x02 /* Write Log DMA EXT */
+#define ATA_RECV_FPDMA_QUEUED 0x65 /* receive DMA NCQ */
+#define ATA_SEP_ATTN 0x67 /* SEP request */
+#define ATA_SEEK 0x70 /* seek */
+#define ATA_PACKET_CMD 0xa0 /* packet command */
+#define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params */
+#define ATA_SERVICE 0xa2 /* service command */
+#define ATA_SMART_CMD 0xb0 /* SMART command */
+#define ATA_CFA_ERASE 0xc0 /* CFA erase */
+#define ATA_READ_MUL 0xc4 /* read multi */
+#define ATA_WRITE_MUL 0xc5 /* write multi */
+#define ATA_SET_MULTI
0xc6 /* set multi size */ +#define ATA_READ_DMA_QUEUED 0xc7 /* read DMA QUEUED */ +#define ATA_READ_DMA 0xc8 /* read DMA */ +#define ATA_WRITE_DMA 0xca /* write DMA */ +#define ATA_WRITE_DMA_QUEUED 0xcc /* write DMA QUEUED */ +#define ATA_WRITE_MUL_FUA48 0xce +#define ATA_STANDBY_IMMEDIATE 0xe0 /* standby immediate */ +#define ATA_IDLE_IMMEDIATE 0xe1 /* idle immediate */ +#define ATA_STANDBY_CMD 0xe2 /* standby */ +#define ATA_IDLE_CMD 0xe3 /* idle */ +#define ATA_READ_BUFFER 0xe4 /* read buffer */ +#define ATA_READ_PM 0xe4 /* read portmultiplier */ +#define ATA_SLEEP 0xe6 /* sleep */ +#define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ +#define ATA_WRITE_PM 0xe8 /* write portmultiplier */ +#define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ +#define ATA_ATA_IDENTIFY 0xec /* get ATA params */ +#define ATA_SETFEATURES 0xef /* features command */ +#define ATA_SF_SETXFER 0x03 /* set transfer mode */ +#define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ +#define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */ +#define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */ +#define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */ +#define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */ +#define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */ +#define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */ +#define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */ +#define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */ +#define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */ +#define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */ +#define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */ +#define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */ +#define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */ +#define ATA_SECURITY_ERASE_UNIT 0xf4 /* erase all blocks on drive */ +#define ATA_SECURITY_FREEZE_LOCK 0xf5 /* freeze security config */ +#define ATA_SECURITY_DISABLE_PASSWORD 0xf6 /* disable drive password */ +#define ATA_READ_NATIVE_MAX_ADDRESS 0xf8 /* read native max address */ +#define ATA_SET_MAX_ADDRESS 0xf9 /* set max address */ + + +/* ATAPI commands */ +#define ATAPI_TEST_UNIT_READY 0x00 /* check if device is ready */ +#define ATAPI_REZERO 0x01 /* rewind */ +#define ATAPI_REQUEST_SENSE 0x03 /* get sense data */ +#define ATAPI_FORMAT 0x04 /* format unit */ +#define ATAPI_READ 0x08 /* read data */ +#define ATAPI_WRITE 0x0a /* write data */ +#define ATAPI_WEOF 0x10 /* write filemark */ +#define ATAPI_WF_WRITE 0x01 +#define ATAPI_SPACE 0x11 /* space command */ +#define ATAPI_SP_FM 0x01 +#define ATAPI_SP_EOD 0x03 +#define ATAPI_INQUIRY 0x12 /* get inquiry data */ +#define ATAPI_MODE_SELECT 0x15 /* mode select */ +#define ATAPI_ERASE 0x19 /* erase */ +#define ATAPI_MODE_SENSE 0x1a /* mode sense */ +#define ATAPI_START_STOP 0x1b /* start/stop unit */ +#define ATAPI_SS_LOAD 0x01 +#define ATAPI_SS_RETENSION 0x02 +#define ATAPI_SS_EJECT 0x04 +#define ATAPI_PREVENT_ALLOW 0x1e /* media removal */ +#define ATAPI_READ_FORMAT_CAPACITIES 0x23 /* get format capacities */ +#define ATAPI_READ_CAPACITY 0x25 /* get volume capacity */ +#define ATAPI_READ_BIG 0x28 /* read data */ +#define ATAPI_WRITE_BIG 0x2a /* write data */ +#define ATAPI_LOCATE 0x2b /* locate to position */ +#define ATAPI_READ_POSITION 0x34 /* read position */ +#define ATAPI_SYNCHRONIZE_CACHE 0x35 /* flush buf, close channel */ +#define ATAPI_WRITE_BUFFER 0x3b /* write device buffer */ +#define ATAPI_READ_BUFFER 0x3c /* read device buffer */ +#define ATAPI_READ_SUBCHANNEL 0x42 /* get subchannel info */ 
+#define ATAPI_READ_TOC 0x43 /* get table of contents */
+#define ATAPI_PLAY_10 0x45 /* play by lba */
+#define ATAPI_PLAY_MSF 0x47 /* play by MSF address */
+#define ATAPI_PLAY_TRACK 0x48 /* play by track number */
+#define ATAPI_PAUSE 0x4b /* pause audio operation */
+#define ATAPI_READ_DISK_INFO 0x51 /* get disk info structure */
+#define ATAPI_READ_TRACK_INFO 0x52 /* get track info structure */
+#define ATAPI_RESERVE_TRACK 0x53 /* reserve track */
+#define ATAPI_SEND_OPC_INFO 0x54 /* send OPC structure */
+#define ATAPI_MODE_SELECT_BIG 0x55 /* set device parameters */
+#define ATAPI_REPAIR_TRACK 0x58 /* repair track */
+#define ATAPI_READ_MASTER_CUE 0x59 /* read master CUE info */
+#define ATAPI_MODE_SENSE_BIG 0x5a /* get device parameters */
+#define ATAPI_CLOSE_TRACK 0x5b /* close track/session */
+#define ATAPI_READ_BUFFER_CAPACITY 0x5c /* get buffer capacity */
+#define ATAPI_SEND_CUE_SHEET 0x5d /* send CUE sheet */
+#define ATAPI_SERVICE_ACTION_IN 0x96 /* get service data */
+#define ATAPI_BLANK 0xa1 /* blank the media */
+#define ATAPI_SEND_KEY 0xa3 /* send DVD key structure */
+#define ATAPI_REPORT_KEY 0xa4 /* get DVD key structure */
+#define ATAPI_PLAY_12 0xa5 /* play by lba */
+#define ATAPI_LOAD_UNLOAD 0xa6 /* changer control command */
+#define ATAPI_READ_STRUCTURE 0xad /* get DVD structure */
+#define ATAPI_PLAY_CD 0xb4 /* universal play command */
+#define ATAPI_SET_SPEED 0xbb /* set drive speed */
+#define ATAPI_MECH_STATUS 0xbd /* get changer status */
+#define ATAPI_READ_CD 0xbe /* read data */
+#define ATAPI_POLL_DSC 0xff /* poll DSC status bit */
+
+
+struct ata_ioc_devices {
+	int channel;
+	char name[2][32];
+	struct ata_params params[2];
+};
+
+/* pr channel ATA ioctl calls */
+#define IOCATAGMAXCHANNEL _IOR('a', 1, int)
+#define IOCATAREINIT _IOW('a', 2, int)
+#define IOCATAATTACH _IOW('a', 3, int)
+#define IOCATADETACH _IOW('a', 4, int)
+#define IOCATADEVICES _IOWR('a', 5, struct ata_ioc_devices)
+
+/* ATAPI request sense structure */
+struct atapi_sense {
+	u_int8_t error; /* current or deferred errors */
+#define ATA_SENSE_VALID 0x80
+
+	u_int8_t segment; /* segment number */
+	u_int8_t key; /* sense key */
+#define ATA_SENSE_KEY_MASK 0x0f /* sense key mask */
+#define ATA_SENSE_NO_SENSE 0x00 /* no specific sense key info */
+#define ATA_SENSE_RECOVERED_ERROR 0x01 /* command OK, data recovered */
+#define ATA_SENSE_NOT_READY 0x02 /* no access to drive */
+#define ATA_SENSE_MEDIUM_ERROR 0x03 /* non-recovered data error */
+#define ATA_SENSE_HARDWARE_ERROR 0x04 /* non-recoverable HW failure */
+#define ATA_SENSE_ILLEGAL_REQUEST 0x05 /* invalid command param(s) */
+#define ATA_SENSE_UNIT_ATTENTION 0x06 /* media changed */
+#define ATA_SENSE_DATA_PROTECT 0x07 /* write protect */
+#define ATA_SENSE_BLANK_CHECK 0x08 /* blank check */
+#define ATA_SENSE_VENDOR_SPECIFIC 0x09 /* vendor specific skey */
+#define ATA_SENSE_COPY_ABORTED 0x0a /* copy aborted */
+#define ATA_SENSE_ABORTED_COMMAND 0x0b /* command aborted, try again */
+#define ATA_SENSE_EQUAL 0x0c /* equal */
+#define ATA_SENSE_VOLUME_OVERFLOW 0x0d /* volume overflow */
+#define ATA_SENSE_MISCOMPARE 0x0e /* data don't match the medium */
+#define ATA_SENSE_RESERVED 0x0f
+#define ATA_SENSE_ILI 0x20
+#define ATA_SENSE_EOM 0x40
+#define ATA_SENSE_FILEMARK 0x80
+
+	u_int32_t cmd_info; /* cmd information */
+	u_int8_t sense_length; /* additional sense len (n-7) */
+	u_int32_t cmd_specific_info; /* additional cmd spec info */
+	u_int8_t asc; /* additional sense code */
+	u_int8_t ascq; /* additional sense code
qual */ + u_int8_t replaceable_unit_code; /* replaceable unit code */ + u_int8_t specific; /* sense key specific */ +#define ATA_SENSE_SPEC_VALID 0x80 +#define ATA_SENSE_SPEC_MASK 0x7f + + u_int8_t specific1; /* sense key specific */ + u_int8_t specific2; /* sense key specific */ +} __packed; + +struct ata_ioc_request { + union { + struct { + u_int8_t command; + u_int8_t feature; + u_int64_t lba; + u_int16_t count; + } ata; + struct { + char ccb[16]; + struct atapi_sense sense; + } atapi; + } u; + caddr_t data; + int count; + int flags; +#define ATA_CMD_CONTROL 0x01 +#define ATA_CMD_READ 0x02 +#define ATA_CMD_WRITE 0x04 +#define ATA_CMD_ATAPI 0x08 + + int timeout; + int error; +}; + +struct ata_security_password { + u_int16_t ctrl; +#define ATA_SECURITY_PASSWORD_USER 0x0000 +#define ATA_SECURITY_PASSWORD_MASTER 0x0001 +#define ATA_SECURITY_ERASE_NORMAL 0x0000 +#define ATA_SECURITY_ERASE_ENHANCED 0x0002 +#define ATA_SECURITY_LEVEL_HIGH 0x0000 +#define ATA_SECURITY_LEVEL_MAXIMUM 0x0100 + + u_int8_t password[32]; + u_int16_t revision; + u_int16_t reserved[238]; +}; + +/* pr device ATA ioctl calls */ +#define IOCATAREQUEST _IOWR('a', 100, struct ata_ioc_request) +#define IOCATAGPARM _IOR('a', 101, struct ata_params) +#define IOCATAGMODE _IOR('a', 102, int) +#define IOCATASMODE _IOW('a', 103, int) + +#define IOCATAGSPINDOWN _IOR('a', 104, int) +#define IOCATASSPINDOWN _IOW('a', 105, int) + + +struct ata_ioc_raid_config { + int lun; + int type; +#define AR_JBOD 0x0001 +#define AR_SPAN 0x0002 +#define AR_RAID0 0x0004 +#define AR_RAID1 0x0008 +#define AR_RAID01 0x0010 +#define AR_RAID3 0x0020 +#define AR_RAID4 0x0040 +#define AR_RAID5 0x0080 + + int interleave; + int status; +#define AR_READY 1 +#define AR_DEGRADED 2 +#define AR_REBUILDING 4 + + int progress; + int total_disks; + int disks[16]; +}; + +struct ata_ioc_raid_status { + int lun; + int type; + int interleave; + int status; + int progress; + int total_disks; + struct { + int state; +#define AR_DISK_ONLINE 0x01 +#define AR_DISK_PRESENT 0x02 +#define AR_DISK_SPARE 0x04 + int lun; + } disks[16]; +}; + +/* ATA RAID ioctl calls */ +#define IOCATARAIDCREATE _IOWR('a', 200, struct ata_ioc_raid_config) +#define IOCATARAIDDELETE _IOW('a', 201, int) +#define IOCATARAIDSTATUS _IOWR('a', 202, struct ata_ioc_raid_status) +#define IOCATARAIDADDSPARE _IOW('a', 203, struct ata_ioc_raid_config) +#define IOCATARAIDREBUILD _IOW('a', 204, int) + +#pragma clang diagnostic pop diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/atomic.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/atomic.h new file mode 100644 index 0000000..b2a8903 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/atomic.h @@ -0,0 +1,443 @@ +/*- + * Copyright (c) 1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#pragma once
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+#define __compiler_membar() __asm __volatile(" " : : : "memory")
+
+#define mb() __asm __volatile("mfence;" : : : "memory")
+#define wmb() __asm __volatile("sfence;" : : : "memory")
+#define rmb() __asm __volatile("lfence;" : : : "memory")
+
+/*
+ * Various simple operations on memory, each of which is atomic in the
+ * presence of interrupts and multiple processors.
+ *
+ * atomic_set_char(P, V) (*(u_char *)(P) |= (V))
+ * atomic_clear_char(P, V) (*(u_char *)(P) &= ~(V))
+ * atomic_add_char(P, V) (*(u_char *)(P) += (V))
+ * atomic_subtract_char(P, V) (*(u_char *)(P) -= (V))
+ *
+ * atomic_set_short(P, V) (*(u_short *)(P) |= (V))
+ * atomic_clear_short(P, V) (*(u_short *)(P) &= ~(V))
+ * atomic_add_short(P, V) (*(u_short *)(P) += (V))
+ * atomic_subtract_short(P, V) (*(u_short *)(P) -= (V))
+ *
+ * atomic_set_int(P, V) (*(u_int *)(P) |= (V))
+ * atomic_clear_int(P, V) (*(u_int *)(P) &= ~(V))
+ * atomic_add_int(P, V) (*(u_int *)(P) += (V))
+ * atomic_subtract_int(P, V) (*(u_int *)(P) -= (V))
+ * atomic_swap_int(P, V) (return (*(u_int *)(P)); *(u_int *)(P) = (V);)
+ * atomic_readandclear_int(P) (return (*(u_int *)(P)); *(u_int *)(P) = 0;)
+ *
+ * atomic_set_long(P, V) (*(u_long *)(P) |= (V))
+ * atomic_clear_long(P, V) (*(u_long *)(P) &= ~(V))
+ * atomic_add_long(P, V) (*(u_long *)(P) += (V))
+ * atomic_subtract_long(P, V) (*(u_long *)(P) -= (V))
+ * atomic_swap_long(P, V) (return (*(u_long *)(P)); *(u_long *)(P) = (V);)
+ * atomic_readandclear_long(P) (return (*(u_long *)(P)); *(u_long *)(P) = 0;)
+ */
+
+#define MPLOCKED "lock ; "
+
+/*
+ * The assembly is volatilized to avoid code chunk removal by the compiler.
+ * GCC aggressively reorders operations and memory clobbering is necessary
+ * in order to avoid that for memory barriers.
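+ *
+ * As a sketch of what one instantiation expands to (see the invocations
+ * further below), ATOMIC_ASM(add, int, "addl %1,%0", "ir", v) emits
+ *
+ *	static __inline void
+ *	atomic_add_int(volatile u_int *p, u_int v)
+ *	{
+ *		__asm __volatile("lock ; addl %1,%0"
+ *		: "+m" (*p)
+ *		: "ir" (v)
+ *		: "cc");
+ *	}
+ *
+ * plus an atomic_add_barr_int() variant that additionally clobbers
+ * "memory" so it also acts as a compiler barrier.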
+ */ +#define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ +static __inline void \ +atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ +{ \ + __asm __volatile(MPLOCKED OP \ + : "+m" (*p) \ + : CONS (V) \ + : "cc"); \ +} \ + \ +static __inline void \ +atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ +{ \ + __asm __volatile(MPLOCKED OP \ + : "+m" (*p) \ + : CONS (V) \ + : "memory", "cc"); \ +} \ +struct __hack + +/* + * Atomic compare and set, used by the mutex functions + * + * if (*dst == expect) *dst = src (all 32 bit words) + * + * Returns 0 on failure, non-zero on success + */ + +static __inline int +atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src) +{ + u_char res; + + __asm __volatile( + " " MPLOCKED " " + " cmpxchgl %3,%1 ; " + " sete %0 ; " + "# atomic_cmpset_int" + : "=q" (res), /* 0 */ + "+m" (*dst), /* 1 */ + "+a" (expect) /* 2 */ + : "r" (src) /* 3 */ + : "memory", "cc"); + return (res); +} + +static __inline int +atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) +{ + u_char res; + + __asm __volatile( + " " MPLOCKED " " + " cmpxchgq %3,%1 ; " + " sete %0 ; " + "# atomic_cmpset_long" + : "=q" (res), /* 0 */ + "+m" (*dst), /* 1 */ + "+a" (expect) /* 2 */ + : "r" (src) /* 3 */ + : "memory", "cc"); + return (res); +} + +/* + * Atomically add the value of v to the integer pointed to by p and return + * the previous value of *p. + */ +static __inline u_int +atomic_fetchadd_int(volatile u_int *p, u_int v) +{ + + __asm __volatile( + " " MPLOCKED " " + " xaddl %0,%1 ; " + "# atomic_fetchadd_int" + : "+r" (v), /* 0 */ + "+m" (*p) /* 1 */ + : : "cc"); + return (v); +} + +/* + * Atomically add the value of v to the long integer pointed to by p and return + * the previous value of *p. + */ +static __inline u_long +atomic_fetchadd_long(volatile u_long *p, u_long v) +{ + + __asm __volatile( + " " MPLOCKED " " + " xaddq %0,%1 ; " + "# atomic_fetchadd_long" + : "+r" (v), /* 0 */ + "+m" (*p) /* 1 */ + : : "cc"); + return (v); +} + +static __inline int +atomic_testandset_int(volatile u_int *p, u_int v) +{ + u_char res; + + __asm __volatile( + " " MPLOCKED " " + " btsl %2,%1 ; " + " setc %0 ; " + "# atomic_testandset_int" + : "=q" (res), /* 0 */ + "+m" (*p) /* 1 */ + : "Ir" (v & 0x1f) /* 2 */ + : "cc"); + return (res); +} + +static __inline int +atomic_testandset_long(volatile u_long *p, u_int v) +{ + u_char res; + + __asm __volatile( + " " MPLOCKED " " + " btsq %2,%1 ; " + " setc %0 ; " + "# atomic_testandset_long" + : "=q" (res), /* 0 */ + "+m" (*p) /* 1 */ + : "Jr" ((u_long)(v & 0x3f)) /* 2 */ + : "cc"); + return (res); +} + +/* + * We assume that a = b will do atomic loads and stores. Due to the + * IA32 memory model, a simple store guarantees release semantics. + * + * However, loads may pass stores, so for atomic_load_acq we have to + * ensure a Store/Load barrier to do the load in SMP kernels. We use + * "lock cmpxchg" as recommended by the AMD Software Optimization + * Guide, and not mfence. For UP kernels, however, the cache of the + * single processor is always consistent, so we only need to take care + * of the compiler. 
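+ *
+ * A minimal usage sketch (illustrative, with hypothetical variables):
+ * a producer publishes a payload with release semantics and a consumer
+ * polls with acquire semantics, so the consumer is guaranteed to observe
+ * the payload once it sees the flag:
+ *
+ *	producer:	payload = compute();
+ *			atomic_store_rel_int(&flag, 1);
+ *
+ *	consumer:	while (atomic_load_acq_int(&flag) == 0)
+ *				;
+ *			consume(payload);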
+ */ +#define ATOMIC_STORE(TYPE) \ +static __inline void \ +atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ +{ \ + __compiler_membar(); \ + *p = v; \ +} \ +struct __hack + +#define ATOMIC_LOAD(TYPE, LOP) \ +static __inline u_##TYPE \ +atomic_load_acq_##TYPE(volatile u_##TYPE *p) \ +{ \ + u_##TYPE res; \ + \ + __asm __volatile(MPLOCKED LOP \ + : "=a" (res), /* 0 */ \ + "+m" (*p) /* 1 */ \ + : : "memory", "cc"); \ + return (res); \ +} \ +struct __hack + +ATOMIC_ASM(set, char, "orb %b1,%0", "iq", v); +ATOMIC_ASM(clear, char, "andb %b1,%0", "iq", ~v); +ATOMIC_ASM(add, char, "addb %b1,%0", "iq", v); +ATOMIC_ASM(subtract, char, "subb %b1,%0", "iq", v); + +ATOMIC_ASM(set, short, "orw %w1,%0", "ir", v); +ATOMIC_ASM(clear, short, "andw %w1,%0", "ir", ~v); +ATOMIC_ASM(add, short, "addw %w1,%0", "ir", v); +ATOMIC_ASM(subtract, short, "subw %w1,%0", "ir", v); + +ATOMIC_ASM(set, int, "orl %1,%0", "ir", v); +ATOMIC_ASM(clear, int, "andl %1,%0", "ir", ~v); +ATOMIC_ASM(add, int, "addl %1,%0", "ir", v); +ATOMIC_ASM(subtract, int, "subl %1,%0", "ir", v); + +ATOMIC_ASM(set, long, "orq %1,%0", "ir", v); +ATOMIC_ASM(clear, long, "andq %1,%0", "ir", ~v); +ATOMIC_ASM(add, long, "addq %1,%0", "ir", v); +ATOMIC_ASM(subtract, long, "subq %1,%0", "ir", v); + +ATOMIC_LOAD(char, "cmpxchgb %b0,%1"); +ATOMIC_LOAD(short, "cmpxchgw %w0,%1"); +ATOMIC_LOAD(int, "cmpxchgl %0,%1"); +ATOMIC_LOAD(long, "cmpxchgq %0,%1"); + +ATOMIC_STORE(char); +ATOMIC_STORE(short); +ATOMIC_STORE(int); +ATOMIC_STORE(long); + +#undef ATOMIC_ASM +#undef ATOMIC_LOAD +#undef ATOMIC_STORE + +/* Read the current value and store a new value in the destination. */ + +static __inline u_int +atomic_swap_int(volatile u_int *p, u_int v) +{ + + __asm __volatile( + " xchgl %1,%0 ; " + "# atomic_swap_int" + : "+r" (v), /* 0 */ + "+m" (*p)); /* 1 */ + return (v); +} + +static __inline u_long +atomic_swap_long(volatile u_long *p, u_long v) +{ + + __asm __volatile( + " xchgq %1,%0 ; " + "# atomic_swap_long" + : "+r" (v), /* 0 */ + "+m" (*p)); /* 1 */ + return (v); +} + +#define atomic_set_acq_char atomic_set_barr_char +#define atomic_set_rel_char atomic_set_barr_char +#define atomic_clear_acq_char atomic_clear_barr_char +#define atomic_clear_rel_char atomic_clear_barr_char +#define atomic_add_acq_char atomic_add_barr_char +#define atomic_add_rel_char atomic_add_barr_char +#define atomic_subtract_acq_char atomic_subtract_barr_char +#define atomic_subtract_rel_char atomic_subtract_barr_char + +#define atomic_set_acq_short atomic_set_barr_short +#define atomic_set_rel_short atomic_set_barr_short +#define atomic_clear_acq_short atomic_clear_barr_short +#define atomic_clear_rel_short atomic_clear_barr_short +#define atomic_add_acq_short atomic_add_barr_short +#define atomic_add_rel_short atomic_add_barr_short +#define atomic_subtract_acq_short atomic_subtract_barr_short +#define atomic_subtract_rel_short atomic_subtract_barr_short + +#define atomic_set_acq_int atomic_set_barr_int +#define atomic_set_rel_int atomic_set_barr_int +#define atomic_clear_acq_int atomic_clear_barr_int +#define atomic_clear_rel_int atomic_clear_barr_int +#define atomic_add_acq_int atomic_add_barr_int +#define atomic_add_rel_int atomic_add_barr_int +#define atomic_subtract_acq_int atomic_subtract_barr_int +#define atomic_subtract_rel_int atomic_subtract_barr_int +#define atomic_cmpset_acq_int atomic_cmpset_int +#define atomic_cmpset_rel_int atomic_cmpset_int + +#define atomic_set_acq_long atomic_set_barr_long +#define atomic_set_rel_long atomic_set_barr_long +#define 
atomic_clear_acq_long atomic_clear_barr_long +#define atomic_clear_rel_long atomic_clear_barr_long +#define atomic_add_acq_long atomic_add_barr_long +#define atomic_add_rel_long atomic_add_barr_long +#define atomic_subtract_acq_long atomic_subtract_barr_long +#define atomic_subtract_rel_long atomic_subtract_barr_long +#define atomic_cmpset_acq_long atomic_cmpset_long +#define atomic_cmpset_rel_long atomic_cmpset_long + +#define atomic_readandclear_int(p) atomic_swap_int(p, 0) +#define atomic_readandclear_long(p) atomic_swap_long(p, 0) + +/* Operations on 8-bit bytes. */ +#define atomic_set_8 atomic_set_char +#define atomic_set_acq_8 atomic_set_acq_char +#define atomic_set_rel_8 atomic_set_rel_char +#define atomic_clear_8 atomic_clear_char +#define atomic_clear_acq_8 atomic_clear_acq_char +#define atomic_clear_rel_8 atomic_clear_rel_char +#define atomic_add_8 atomic_add_char +#define atomic_add_acq_8 atomic_add_acq_char +#define atomic_add_rel_8 atomic_add_rel_char +#define atomic_subtract_8 atomic_subtract_char +#define atomic_subtract_acq_8 atomic_subtract_acq_char +#define atomic_subtract_rel_8 atomic_subtract_rel_char +#define atomic_load_acq_8 atomic_load_acq_char +#define atomic_store_rel_8 atomic_store_rel_char + +/* Operations on 16-bit words. */ +#define atomic_set_16 atomic_set_short +#define atomic_set_acq_16 atomic_set_acq_short +#define atomic_set_rel_16 atomic_set_rel_short +#define atomic_clear_16 atomic_clear_short +#define atomic_clear_acq_16 atomic_clear_acq_short +#define atomic_clear_rel_16 atomic_clear_rel_short +#define atomic_add_16 atomic_add_short +#define atomic_add_acq_16 atomic_add_acq_short +#define atomic_add_rel_16 atomic_add_rel_short +#define atomic_subtract_16 atomic_subtract_short +#define atomic_subtract_acq_16 atomic_subtract_acq_short +#define atomic_subtract_rel_16 atomic_subtract_rel_short +#define atomic_load_acq_16 atomic_load_acq_short +#define atomic_store_rel_16 atomic_store_rel_short + +/* Operations on 32-bit double words. */ +#define atomic_set_32 atomic_set_int +#define atomic_set_acq_32 atomic_set_acq_int +#define atomic_set_rel_32 atomic_set_rel_int +#define atomic_clear_32 atomic_clear_int +#define atomic_clear_acq_32 atomic_clear_acq_int +#define atomic_clear_rel_32 atomic_clear_rel_int +#define atomic_add_32 atomic_add_int +#define atomic_add_acq_32 atomic_add_acq_int +#define atomic_add_rel_32 atomic_add_rel_int +#define atomic_subtract_32 atomic_subtract_int +#define atomic_subtract_acq_32 atomic_subtract_acq_int +#define atomic_subtract_rel_32 atomic_subtract_rel_int +#define atomic_load_acq_32 atomic_load_acq_int +#define atomic_store_rel_32 atomic_store_rel_int +#define atomic_cmpset_32 atomic_cmpset_int +#define atomic_cmpset_acq_32 atomic_cmpset_acq_int +#define atomic_cmpset_rel_32 atomic_cmpset_rel_int +#define atomic_swap_32 atomic_swap_int +#define atomic_readandclear_32 atomic_readandclear_int +#define atomic_fetchadd_32 atomic_fetchadd_int +#define atomic_testandset_32 atomic_testandset_int + +/* Operations on 64-bit quad words. 
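+ * These are plain aliases: u_long is 64 bits wide on x86-64, so e.g.
+ * atomic_cmpset_64() resolves to atomic_cmpset_long() above and compiles
+ * down to a single lock-prefixed cmpxchgq instruction.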
*/ +#define atomic_set_64 atomic_set_long +#define atomic_set_acq_64 atomic_set_acq_long +#define atomic_set_rel_64 atomic_set_rel_long +#define atomic_clear_64 atomic_clear_long +#define atomic_clear_acq_64 atomic_clear_acq_long +#define atomic_clear_rel_64 atomic_clear_rel_long +#define atomic_add_64 atomic_add_long +#define atomic_add_acq_64 atomic_add_acq_long +#define atomic_add_rel_64 atomic_add_rel_long +#define atomic_subtract_64 atomic_subtract_long +#define atomic_subtract_acq_64 atomic_subtract_acq_long +#define atomic_subtract_rel_64 atomic_subtract_rel_long +#define atomic_load_acq_64 atomic_load_acq_long +#define atomic_store_rel_64 atomic_store_rel_long +#define atomic_cmpset_64 atomic_cmpset_long +#define atomic_cmpset_acq_64 atomic_cmpset_acq_long +#define atomic_cmpset_rel_64 atomic_cmpset_rel_long +#define atomic_swap_64 atomic_swap_long +#define atomic_readandclear_64 atomic_readandclear_long +#define atomic_testandset_64 atomic_testandset_long + +/* Operations on pointers. */ +#define atomic_set_ptr atomic_set_long +#define atomic_set_acq_ptr atomic_set_acq_long +#define atomic_set_rel_ptr atomic_set_rel_long +#define atomic_clear_ptr atomic_clear_long +#define atomic_clear_acq_ptr atomic_clear_acq_long +#define atomic_clear_rel_ptr atomic_clear_rel_long +#define atomic_add_ptr atomic_add_long +#define atomic_add_acq_ptr atomic_add_acq_long +#define atomic_add_rel_ptr atomic_add_rel_long +#define atomic_subtract_ptr atomic_subtract_long +#define atomic_subtract_acq_ptr atomic_subtract_acq_long +#define atomic_subtract_rel_ptr atomic_subtract_rel_long +#define atomic_load_acq_ptr atomic_load_acq_long +#define atomic_store_rel_ptr atomic_store_rel_long +#define atomic_cmpset_ptr atomic_cmpset_long +#define atomic_cmpset_acq_ptr atomic_cmpset_acq_long +#define atomic_cmpset_rel_ptr atomic_cmpset_rel_long +#define atomic_swap_ptr atomic_swap_long +#define atomic_readandclear_ptr atomic_readandclear_long diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/bitset.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/bitset.h new file mode 100644 index 0000000..83de227 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/bitset.h @@ -0,0 +1,215 @@ +/*- + * Copyright (c) 2008, Jeffrey Roberson + * All rights reserved. + * + * Copyright (c) 2008 Nokia Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +/* + * Macros addressing word and bit within it, tuned to make compiler + * optimize cases when SETSIZE fits into single machine word. + */ +#define _BITSET_BITS (sizeof(long) * NBBY) + +#define __bitset_words(_s) (howmany(_s, _BITSET_BITS)) + +#define __bitset_mask(_s, n) \ + (1L << ((__bitset_words((_s)) == 1) ? \ + (size_t)(n) : ((n) % _BITSET_BITS))) + +#define __bitset_word(_s, n) \ + ((__bitset_words((_s)) == 1) ? 0 : ((n) / _BITSET_BITS)) + +#define BITSET_DEFINE(t, _s) \ +struct t { \ + long __bits[__bitset_words((_s))]; \ +} + +#define BITSET_T_INITIALIZER(x) \ + { .__bits = { x } } + +#define BITSET_FSET(n) \ + [ 0 ... ((n) - 1) ] = (-1L) + +#define BIT_CLR(_s, n, p) \ + ((p)->__bits[__bitset_word(_s, n)] &= ~__bitset_mask((_s), (n))) + +#define BIT_COPY(_s, f, t) (void)(*(t) = *(f)) + +#define BIT_ISSET(_s, n, p) \ + ((((p)->__bits[__bitset_word(_s, n)] & __bitset_mask((_s), (n))) != 0)) + +#define BIT_SET(_s, n, p) \ + ((p)->__bits[__bitset_word(_s, n)] |= __bitset_mask((_s), (n))) + +#define BIT_ZERO(_s, p) do { \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + (p)->__bits[__i] = 0L; \ +} while (0) + +#define BIT_FILL(_s, p) do { \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + (p)->__bits[__i] = -1L; \ +} while (0) + +#define BIT_SETOF(_s, n, p) do { \ + BIT_ZERO(_s, p); \ + (p)->__bits[__bitset_word(_s, n)] = __bitset_mask((_s), (n)); \ +} while (0) + +/* Is p empty. */ +#define BIT_EMPTY(_s, p) __extension__ ({ \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + if ((p)->__bits[__i]) \ + break; \ + __i == __bitset_words((_s)); \ +}) + +/* Is p full set. */ +#define BIT_ISFULLSET(_s, p) __extension__ ({ \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + if ((p)->__bits[__i] != (long)-1) \ + break; \ + __i == __bitset_words((_s)); \ +}) + +/* Is c a subset of p. */ +#define BIT_SUBSET(_s, p, c) __extension__ ({ \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + if (((c)->__bits[__i] & \ + (p)->__bits[__i]) != \ + (c)->__bits[__i]) \ + break; \ + __i == __bitset_words((_s)); \ +}) + +/* Are there any common bits between b & c? */ +#define BIT_OVERLAP(_s, p, c) __extension__ ({ \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + if (((c)->__bits[__i] & \ + (p)->__bits[__i]) != 0) \ + break; \ + __i != __bitset_words((_s)); \ +}) + +/* Compare two sets, returns 0 if equal 1 otherwise. 
*/ +#define BIT_CMP(_s, p, c) __extension__ ({ \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + if (((c)->__bits[__i] != \ + (p)->__bits[__i])) \ + break; \ + __i != __bitset_words((_s)); \ +}) + +#define BIT_OR(_s, d, s) do { \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + (d)->__bits[__i] |= (s)->__bits[__i]; \ +} while (0) + +#define BIT_AND(_s, d, s) do { \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + (d)->__bits[__i] &= (s)->__bits[__i]; \ +} while (0) + +#define BIT_NAND(_s, d, s) do { \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + (d)->__bits[__i] &= ~(s)->__bits[__i]; \ +} while (0) + +#define BIT_CLR_ATOMIC(_s, n, p) \ + atomic_clear_long(((volatile u_long *) \ + &(p)->__bits[__bitset_word(_s, n)]), __bitset_mask((_s), n)) + +#define BIT_SET_ATOMIC(_s, n, p) \ + atomic_set_long(((volatile u_long *) &(p)->__bits[__bitset_word(_s, n)]), \ + __bitset_mask((_s), n)) + +#define BIT_SET_ATOMIC_ACQ(_s, n, p) \ + atomic_set_acq_long(&(p)->__bits[__bitset_word(_s, n)], \ + __bitset_mask((_s), n)) + +/* Convenience functions catering special cases. */ +#define BIT_AND_ATOMIC(_s, d, s) do { \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + atomic_clear_long(&(d)->__bits[__i], \ + ~(s)->__bits[__i]); \ +} while (0) + +#define BIT_OR_ATOMIC(_s, d, s) do { \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + atomic_set_long(&(d)->__bits[__i], \ + (s)->__bits[__i]); \ +} while (0) + +#define BIT_COPY_STORE_REL(_s, f, t) do { \ + size_t __i; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + atomic_store_rel_long(&(t)->__bits[__i], \ + (f)->__bits[__i]); \ +} while (0) + +#define BIT_FFS(_s, p) __extension__ ({ \ + size_t __i; \ + int __bit; \ + \ + __bit = 0; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) { \ + if ((p)->__bits[__i] != 0) { \ + __bit = ffsl((p)->__bits[__i]); \ + __bit += __i * _BITSET_BITS; \ + break; \ + } \ + } \ + __bit; \ +}) + +#define BIT_COUNT(_s, p) __extension__ ({ \ + size_t __i; \ + int __count; \ + \ + __count = 0; \ + for (__i = 0; __i < __bitset_words((_s)); __i++) \ + __count += __bitcountl((p)->__bits[__i]); \ + __count; \ +}) diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/cpuset.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/cpuset.h new file mode 100644 index 0000000..46539d3 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/cpuset.h @@ -0,0 +1,150 @@ +/*- + * Copyright (c) 2008, Jeffrey Roberson + * All rights reserved. + * + * Copyright (c) 2008 Nokia Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +#define CPU_MAXSIZE 32 + +#ifndef CPU_SETSIZE +#define CPU_SETSIZE CPU_MAXSIZE +#endif + +// #define _NCPUBITS _BITSET_BITS +// #define _NCPUWORDS __bitset_words(CPU_SETSIZE) + +BITSET_DEFINE(_cpuset, CPU_SETSIZE); +typedef struct _cpuset cpuset_t; + +// #define CPUSET_FSET BITSET_FSET(_NCPUWORDS) +// #define CPUSET_T_INITIALIZER BITSET_T_INITIALIZER + +// #define CPUSETBUFSIZ ((2 + sizeof(long) * 2) * _NCPUWORDS) + +#define CPU_CLR(n, p) BIT_CLR(CPU_SETSIZE, n, p) +// #define CPU_COPY(f, t) BIT_COPY(CPU_SETSIZE, f, t) +#define CPU_ISSET(n, p) BIT_ISSET(CPU_SETSIZE, n, p) +#define CPU_SET(n, p) BIT_SET(CPU_SETSIZE, n, p) +#define CPU_ZERO(p) BIT_ZERO(CPU_SETSIZE, p) +// #define CPU_FILL(p) BIT_FILL(CPU_SETSIZE, p) +#define CPU_SETOF(n, p) BIT_SETOF(CPU_SETSIZE, n, p) +#define CPU_EMPTY(p) BIT_EMPTY(CPU_SETSIZE, p) +// #define CPU_ISFULLSET(p) BIT_ISFULLSET(CPU_SETSIZE, p) +// #define CPU_SUBSET(p, c) BIT_SUBSET(CPU_SETSIZE, p, c) +// #define CPU_OVERLAP(p, c) BIT_OVERLAP(CPU_SETSIZE, p, c) +#define CPU_CMP(p, c) BIT_CMP(CPU_SETSIZE, p, c) +// #define CPU_OR(d, s) BIT_OR(CPU_SETSIZE, d, s) +#define CPU_AND(d, s) BIT_AND(CPU_SETSIZE, d, s) +// #define CPU_NAND(d, s) BIT_NAND(CPU_SETSIZE, d, s) +#define CPU_CLR_ATOMIC(n, p) BIT_CLR_ATOMIC(CPU_SETSIZE, n, p) +#define CPU_SET_ATOMIC(n, p) BIT_SET_ATOMIC(CPU_SETSIZE, n, p) +// #define CPU_SET_ATOMIC_ACQ(n, p) BIT_SET_ATOMIC_ACQ(CPU_SETSIZE, n, p) +// #define CPU_AND_ATOMIC(n, p) BIT_AND_ATOMIC(CPU_SETSIZE, n, p) +// #define CPU_OR_ATOMIC(d, s) BIT_OR_ATOMIC(CPU_SETSIZE, d, s) +// #define CPU_COPY_STORE_REL(f, t) BIT_COPY_STORE_REL(CPU_SETSIZE, f, t) +#define CPU_FFS(p) BIT_FFS(CPU_SETSIZE, p) +// #define CPU_COUNT(p) BIT_COUNT(CPU_SETSIZE, p) + +// /* +// * Valid cpulevel_t values. +// */ +// #define CPU_LEVEL_ROOT 1 /* All system cpus. */ +// #define CPU_LEVEL_CPUSET 2 /* Available cpus for which. */ +// #define CPU_LEVEL_WHICH 3 /* Actual mask/id for which. */ + +// /* +// * Valid cpuwhich_t values. +// */ +// #define CPU_WHICH_TID 1 /* Specifies a thread id. */ +// #define CPU_WHICH_PID 2 /* Specifies a process id. */ +// #define CPU_WHICH_CPUSET 3 /* Specifies a set id. */ +// #define CPU_WHICH_IRQ 4 /* Specifies an irq #. */ +// #define CPU_WHICH_JAIL 5 /* Specifies a jail id. */ +// #define CPU_WHICH_DOMAIN 6 /* Specifies a NUMA domain id. */ + +// /* +// * Reserved cpuset identifiers. +// */ +// #define CPUSET_INVALID -1 +// #define CPUSET_DEFAULT 0 + +// #ifdef _KERNEL +// LIST_HEAD(setlist, cpuset); + +// /* +// * cpusets encapsulate cpu binding information for one or more threads. +// * +// * a - Accessed with atomics. +// * s - Set at creation, never modified. Only a ref required to read. +// * c - Locked internally by a cpuset lock. +// * +// * The bitmask is only modified while holding the cpuset lock. It may be +// * read while only a reference is held but the consumer must be prepared +// * to deal with inconsistent results. 
+// */
+// struct cpuset {
+// 	cpuset_t	cs_mask;	/* bitmask of valid cpus. */
+// 	volatile u_int	cs_ref;		/* (a) Reference count. */
+// 	int		cs_flags;	/* (s) Flags from below. */
+// 	cpusetid_t	cs_id;		/* (s) Id or INVALID. */
+// 	struct cpuset	*cs_parent;	/* (s) Pointer to our parent. */
+// 	LIST_ENTRY(cpuset)	cs_link;	/* (c) All identified sets. */
+// 	LIST_ENTRY(cpuset)	cs_siblings;	/* (c) Sibling set link. */
+// 	struct setlist	cs_children;	/* (c) List of children. */
+// };
+
+// #define CPU_SET_ROOT 0x0001 /* Set is a root set. */
+// #define CPU_SET_RDONLY 0x0002 /* No modification allowed. */
+
+// extern cpuset_t *cpuset_root;
+// struct prison;
+// struct proc;
+
+// struct cpuset *cpuset_thread0(void);
+// struct cpuset *cpuset_ref(struct cpuset *);
+// void cpuset_rel(struct cpuset *);
+// int cpuset_setthread(lwpid_t id, cpuset_t *);
+// int cpuset_setithread(lwpid_t id, int cpu);
+// int cpuset_create_root(struct prison *, struct cpuset **);
+// int cpuset_setproc_update_set(struct proc *, struct cpuset *);
+// char *cpusetobj_strprint(char *, const cpuset_t *);
+// int cpusetobj_strscan(cpuset_t *, const char *);
+
+// #else
+// __BEGIN_DECLS
+// int cpuset(cpusetid_t *);
+// int cpuset_setid(cpuwhich_t, id_t, cpusetid_t);
+// int cpuset_getid(cpulevel_t, cpuwhich_t, id_t, cpusetid_t *);
+// int cpuset_getaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, cpuset_t *);
+// int cpuset_setaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, const cpuset_t *);
+// __END_DECLS
+// #endif
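Editor's aside: the `CPU_*` macros in the vendored cpuset.h above are thin wrappers over the generic `BIT_*` macros from bitset.h. A minimal usage sketch, for orientation only, not part of the vendored sources; it assumes both support headers are on the include path and resolve their own dependencies:

```c
#include <stdio.h>
#include <xhyve/support/bitset.h>
#include <xhyve/support/cpuset.h>

int main(void) {
	cpuset_t active;           /* BITSET_DEFINE'd set of CPU_SETSIZE (32) bits */

	CPU_ZERO(&active);         /* clear every word of the backing array */
	CPU_SET(0, &active);       /* mark vCPU 0 */
	CPU_SET(3, &active);       /* mark vCPU 3 */

	if (!CPU_EMPTY(&active))
		/* CPU_FFS returns the 1-based index of the lowest set bit */
		printf("lowest active vCPU: %d\n", CPU_FFS(&active) - 1);

	CPU_CLR(3, &active);
	return 0;
}
```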
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/i8253reg.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/i8253reg.h
new file mode 100644
index 0000000..2934e95
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/i8253reg.h
@@ -0,0 +1,81 @@
+/*-
+ * Copyright (c) 1993 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Header: timerreg.h,v 1.2 93/02/28 15:08:58 mccanne Exp
+ * $FreeBSD$
+ */
+
+/*
+ * Register definitions for the Intel 8253 Programmable Interval Timer.
+ *
+ * This chip has three independent 16-bit down counters that can be
+ * read on the fly.  There are three mode registers and three countdown
+ * registers.  The countdown registers are addressed directly, via the
+ * first three I/O ports.  The three mode registers are accessed via
+ * the fourth I/O port, with two bits in the mode byte indicating the
+ * register.  (Why are hardware interfaces always so braindead?).
+ *
+ * To write a value into the countdown register, the mode register
+ * is first programmed with a command indicating which byte of
+ * the two byte register is to be modified.  The three possibilities
+ * are load msb (TMR_MR_MSB), load lsb (TMR_MR_LSB), or load lsb then
+ * msb (TMR_MR_BOTH).
+ *
+ * To read the current value ("on the fly") from the countdown register,
+ * you write a "latch" command into the mode register, then read the stable
+ * value from the corresponding I/O port.  For example, you write
+ * TMR_MR_LATCH into the corresponding mode register.  Presumably,
+ * after doing this, a write operation to the I/O port would result
+ * in undefined behavior (but hopefully not fry the chip).
+ * Reading in this manner has no side effects.
+ */
+
+/*
+ * Macros for specifying values to be written into a mode register.
+ */
+
+#pragma once
+
+#define TIMER_REG_CNTR0 0 /* timer 0 counter port */
+#define TIMER_REG_CNTR1 1 /* timer 1 counter port */
+#define TIMER_REG_CNTR2 2 /* timer 2 counter port */
+#define TIMER_REG_MODE 3 /* timer mode port */
+#define TIMER_SEL0 0x00 /* select counter 0 */
+#define TIMER_SEL1 0x40 /* select counter 1 */
+#define TIMER_SEL2 0x80 /* select counter 2 */
+#define TIMER_INTTC 0x00 /* mode 0, intr on terminal cnt */
+#define TIMER_ONESHOT 0x02 /* mode 1, one shot */
+#define TIMER_RATEGEN 0x04 /* mode 2, rate generator */
+#define TIMER_SQWAVE 0x06 /* mode 3, square wave */
+#define TIMER_SWSTROBE 0x08 /* mode 4, s/w triggered strobe */
+#define TIMER_HWSTROBE 0x0a /* mode 5, h/w triggered strobe */
+#define TIMER_LATCH 0x00 /* latch counter for reading */
+#define TIMER_LSB 0x10 /* r/w counter LSB */
+#define TIMER_MSB 0x20 /* r/w counter MSB */
+#define TIMER_16BIT 0x30 /* r/w counter 16 bits, LSB first */
+#define TIMER_BCD 0x01 /* count in BCD */
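Editor's aside: the `TIMER_*` constants above encode the 8253 mode-register commands described in the header comment. A hedged sketch of the classic programming sequence follows; `outb` and the `0x40` base port are assumptions for illustration (a device model such as xhyve traps these guest accesses rather than issuing them itself):

```c
#include <stdint.h>
#include <xhyve/support/i8253reg.h>

/* Hypothetical port-write helper, not part of the vendored sources. */
extern void outb(uint16_t port, uint8_t val);

#define IO_TIMER1 0x40 /* conventional base I/O port of the first 8253 (assumption) */

static void pit_set_rate(uint16_t divisor) {
	/* Select counter 0, load LSB then MSB, mode 2 (rate generator). */
	outb(IO_TIMER1 + TIMER_REG_MODE, TIMER_SEL0 | TIMER_16BIT | TIMER_RATEGEN);
	outb(IO_TIMER1 + TIMER_REG_CNTR0, divisor & 0xff);        /* LSB first */
	outb(IO_TIMER1 + TIMER_REG_CNTR0, (divisor >> 8) & 0xff); /* then MSB */
}
```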
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/i8259.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/i8259.h
new file mode 100644
index 0000000..ac1e498
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/i8259.h
@@ -0,0 +1,83 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Register definitions for the i8259A programmable interrupt controller.
+ */
+
+#pragma once
+
+/* Initialization control word 1. Written to even address. */
+#define ICW1_IC4 0x01 /* ICW4 present */
+#define ICW1_SNGL 0x02 /* 1 = single, 0 = cascaded */
+#define ICW1_ADI 0x04 /* 1 = 4, 0 = 8 byte vectors */
+#define ICW1_LTIM 0x08 /* 1 = level trigger, 0 = edge */
+#define ICW1_RESET 0x10 /* must be 1 */
+/* 0x20 - 0x80 - in 8080/8085 mode only */
+
+/* Initialization control word 2. Written to the odd address. */
+/* No definitions, it is the base vector of the IDT for 8086 mode */
+
+/* Initialization control word 3. Written to the odd address. */
+/* For a master PIC, bitfield indicating a slave 8259 on given input */
+/* For slave, lower 3 bits are the slave's ID binary id on master */
+
+/* Initialization control word 4. Written to the odd address. */
+#define ICW4_8086 0x01 /* 1 = 8086, 0 = 8080 */
+#define ICW4_AEOI 0x02 /* 1 = Auto EOI */
+#define ICW4_MS 0x04 /* 1 = buffered master, 0 = slave */
+#define ICW4_BUF 0x08 /* 1 = enable buffer mode */
+#define ICW4_SFNM 0x10 /* 1 = special fully nested mode */
+
+/* Operation control words. Written after initialization. */
+
+/* Operation control word type 1 */
+/*
+ * No definitions. Written to the odd address. Bitmask for interrupts.
+ * 1 = disabled.
+ */
+
+/* Operation control word type 2. Bit 3 (0x08) must be zero. Even address. */
+#define OCW2_L0 0x01 /* Level */
+#define OCW2_L1 0x02
+#define OCW2_L2 0x04
+/* 0x08 must be 0 to select OCW2 vs OCW3 */
+/* 0x10 must be 0 to select OCW2 vs ICW1 */
+#define OCW2_EOI 0x20 /* 1 = EOI */
+#define OCW2_SL 0x40 /* EOI mode */
+#define OCW2_R 0x80 /* EOI mode */
+
+/* Operation control word type 3. Bit 3 (0x08) must be set. Even address. */
+#define OCW3_RIS 0x01 /* 1 = read IS, 0 = read IR */
+#define OCW3_RR 0x02 /* register read */
+#define OCW3_P 0x04 /* poll mode command */
+/* 0x08 must be 1 to select OCW3 vs OCW2 */
+#define OCW3_SEL 0x08 /* must be 1 */
+/* 0x10 must be 0 to select OCW3 vs ICW1 */
+#define OCW3_SMM 0x20 /* special mode mask */
+#define OCW3_ESMM 0x40 /* enable SMM */
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/linker_set.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/linker_set.h
new file mode 100644
index 0000000..657435d
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/linker_set.h
@@ -0,0 +1,96 @@
+/*-
+ * Copyright (c) 1999 John D. Polstra
+ * Copyright (c) 1999,2001 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/* xhyve: sort of working linker sets for MachO */
+
+#pragma once
+
+#define __GLOBL1(sym) __asm__(".globl " #sym)
+#define __GLOBL(sym) __GLOBL1(sym)
+#define __section(x) __attribute__((__section__(x)))
+
+/*
+ * The following macros are used to declare global sets of objects, which
+ * are collected by the linker into a `linker_set' as defined below.
+ * For ELF, this is done by constructing a separate segment for each set.
+ */
+
+#define __MAKE_SET_CONST const
+
+/*
+ * Private macros, not to be used outside this header file.
+ */
+#define __MAKE_SET(set, sym)				\
+	__GLOBL(__CONCAT(__start_set_,set));		\
+	__GLOBL(__CONCAT(__stop_set_,set));		\
+	static void const * __MAKE_SET_CONST		\
+	__set_##set##_sym_##sym __section("__"#set",__set") \
+	__used = &(sym)
+
+/*
+ * Public macros.
+ */
+#define TEXT_SET(set, sym) __MAKE_SET(set, sym)
+#define DATA_SET(set, sym) __MAKE_SET(set, sym)
+#define BSS_SET(set, sym) __MAKE_SET(set, sym)
+#define ABS_SET(set, sym) __MAKE_SET(set, sym)
+#define SET_ENTRY(set, sym) __MAKE_SET(set, sym)
+
+/*
+ * Initialize before referring to a given linker set.
+ */
+#define SET_DECLARE(set, ptype)				\
+	extern ptype __weak *__CONCAT(__start_set_,set)	\
+	__asm("segment$start$__"#set);			\
+	extern ptype __weak *__CONCAT(__stop_set_,set)	\
+	__asm("segment$end$__"#set)
+
+#define SET_BEGIN(set)					\
+	(&__CONCAT(__start_set_,set))
+#define SET_LIMIT(set)					\
+	(&__CONCAT(__stop_set_,set))
+
+/*
+ * Iterate over all the elements of a set.
+ *
+ * Sets always contain addresses of things, and "pvar" points to words
+ * containing those addresses. Thus it must be declared as "type **pvar",
+ * and the address of each set item is obtained inside the loop by "*pvar".
+ */
+#define SET_FOREACH(pvar, set)				\
+	for (pvar = SET_BEGIN(set); pvar < SET_LIMIT(set); pvar++)
+
+#define SET_ITEM(set, i)				\
+	((SET_BEGIN(set))[i])
+
+/*
+ * Provide a count of the items in a set.
+ */
+#define SET_COUNT(set)					\
+	(SET_LIMIT(set) - SET_BEGIN(set))
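Editor's aside: a hypothetical sketch of how the Mach-O linker-set macros above are meant to be used. The `init_hooks` set, `struct init_hook`, and `hello_hook` are invented for illustration and are not part of the vendored sources:

```c
#include <stdio.h>
#include <xhyve/support/linker_set.h>

/* A made-up registry of init hooks, collected by the linker. */
struct init_hook {
	const char *name;
	void (*fn)(void);
};

static void hello(void) { printf("hello hook\n"); }

static struct init_hook hello_hook = { "hello", hello };
/* Drops &hello_hook into the "__init_hooks,__set" Mach-O section. */
DATA_SET(init_hooks, hello_hook);

/* Binds the set's start/end symbols via segment$start/segment$end. */
SET_DECLARE(init_hooks, struct init_hook);

int main(void) {
	struct init_hook **hpp;

	/* Walks the pointer words collected between the two boundary symbols. */
	SET_FOREACH(hpp, init_hooks) {
		(*hpp)->fn();
	}
	return 0;
}
```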
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/md5.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/md5.h
new file mode 100644
index 0000000..4825027
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/md5.h
@@ -0,0 +1,52 @@
+/* MD5.H - header file for MD5C.C
+ * $FreeBSD$
+ */
+
+/*-
+  Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+rights reserved.
+
+License to copy and use this software is granted provided that it
+is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+Algorithm" in all material mentioning or referencing this software
+or this function.
+
+License is also granted to make and use derivative works provided
+that such works are identified as "derived from the RSA Data
+Security, Inc. MD5 Message-Digest Algorithm" in all material
+mentioning or referencing the derived work.
+
+RSA Data Security, Inc. makes no representations concerning either
+the merchantability of this software or the suitability of this
+software for any particular purpose. It is provided "as is"
+without express or implied warranty of any kind.
+
+These notices must be retained in any copies of any part of this
+documentation and/or software.
+ */
+
+#pragma once
+
+#include
+
+#define MD5_BLOCK_LENGTH 64
+#define MD5_DIGEST_LENGTH 16
+#define MD5_DIGEST_STRING_LENGTH (MD5_DIGEST_LENGTH * 2 + 1)
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+/* MD5 context. */
+typedef struct MD5Context {
+	u_int32_t state[4];	/* state (ABCD) */
+	u_int32_t count[2];	/* number of bits, modulo 2^64 (lsb first) */
+	unsigned char buffer[64];	/* input buffer */
+} MD5_CTX;
+#pragma clang diagnostic pop
+
+void MD5Init(MD5_CTX *);
+void MD5Update(MD5_CTX *, const void *, unsigned int);
+void MD5Final(unsigned char [16], MD5_CTX *);
+char * MD5End(MD5_CTX *, char *);
+char * MD5File(const char *, char *);
+char * MD5FileChunk(const char *, char *, off_t, off_t);
+char * MD5Data(const void *, unsigned int, char *);
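Editor's aside: the prototypes above follow the usual Init/Update/Final digest pattern. A minimal sketch, assuming the matching md5c.c implementation is linked in:

```c
#include <stdio.h>
#include <string.h>
#include <xhyve/support/md5.h>

int main(void) {
	MD5_CTX ctx;
	unsigned char digest[MD5_DIGEST_LENGTH];
	const char *msg = "xhyve";

	MD5Init(&ctx);                                      /* reset state (ABCD) */
	MD5Update(&ctx, msg, (unsigned int)strlen(msg));    /* absorb input bytes */
	MD5Final(digest, &ctx);                             /* pad and produce digest */

	for (int i = 0; i < MD5_DIGEST_LENGTH; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return 0;
}
```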
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/misc.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/misc.h
new file mode 100644
index 0000000..d3497b3
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/misc.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include
+#include
+#include
+
+#define UNUSED __attribute__ ((unused))
+#define CTASSERT(x) _Static_assert ((x), "CTASSERT")
+#define XHYVE_PAGE_SIZE 0x1000
+#define XHYVE_PAGE_MASK (XHYVE_PAGE_SIZE - 1)
+#define XHYVE_PAGE_SHIFT 12
+#define __aligned(x) __attribute__ ((aligned ((x))))
+#define __packed __attribute__ ((packed))
+#define nitems(x) (sizeof((x)) / sizeof((x)[0]))
+#define powerof2(x) ((((x)-1)&(x))==0)
+#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */
+#define nitems(x) (sizeof((x)) / sizeof((x)[0]))
+#define min(x, y) (((x) < (y)) ? (x) : (y))
+
+#define xhyve_abort(...) \
+	do { \
+		fprintf(stderr, __VA_ARGS__); \
+		abort(); \
+	} while (0)
+
+#define xhyve_warn(...) \
+	do { \
+		fprintf(stderr, __VA_ARGS__); \
+	} while (0)
+
+#ifdef XHYVE_CONFIG_ASSERT
+#define KASSERT(exp, msg) if (!(exp)) xhyve_abort msg
+#define KWARN(exp, msg) if (!(exp)) xhyve_warn msg
+#else
+#define KASSERT(exp, msg) if (0) xhyve_abort msg
+#define KWARN(exp, msg) if (0) xhyve_warn msg
+#endif
+
+#define FALSE 0
+#define TRUE 1
+
+#define XHYVE_PROT_READ 1
+#define XHYVE_PROT_WRITE 2
+#define XHYVE_PROT_EXECUTE 4
+
+#define VM_SUCCESS 0
+
+/* sys/sys/types.h */
+typedef unsigned char u_char;
+typedef unsigned short u_short;
+typedef unsigned int u_int;
+typedef unsigned long u_long;
+
+static inline void cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p) {
+	__asm__ __volatile__ ("cpuid"
+		: "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
+		: "0" (ax), "c" (cx));
+}
+
+static inline void do_cpuid(unsigned ax, unsigned *p) {
+	__asm__ __volatile__ ("cpuid"
+		: "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
+		: "0" (ax));
+}
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/mptable.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/mptable.h
new file mode 100644
index 0000000..f8ff7f5
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/mptable.h
@@ -0,0 +1,194 @@
+/*-
+ * Copyright (c) 1996, by Steve Passe
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. The name of the developer may NOT be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +enum busTypes { + NOBUS = 0, + CBUS = 1, + CBUSII = 2, + EISA = 3, + ISA = 6, + MCA = 9, + PCI = 13, + XPRESS = 18, + MAX_BUSTYPE = 18, + UNKNOWN_BUSTYPE = 0xff +}; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpacked" + +/* MP Floating Pointer Structure */ +typedef struct MPFPS { + uint8_t signature[4]; + uint32_t pap; + uint8_t length; + uint8_t spec_rev; + uint8_t checksum; + uint8_t config_type; + uint8_t mpfb2; + uint8_t mpfb3; + uint8_t mpfb4; + uint8_t mpfb5; +} __packed *mpfps_t; + +#define MPFB2_IMCR_PRESENT 0x80 +#define MPFB2_MUL_CLK_SRCS 0x40 + +/* MP Configuration Table Header */ +typedef struct MPCTH { + uint8_t signature[4]; + uint16_t base_table_length; + uint8_t spec_rev; + uint8_t checksum; + uint8_t oem_id[8]; + uint8_t product_id[12]; + uint32_t oem_table_pointer; + uint16_t oem_table_size; + uint16_t entry_count; + uint32_t apic_address; + uint16_t extended_table_length; + uint8_t extended_table_checksum; + uint8_t reserved; +} __packed *mpcth_t; + +/* Base table entries */ + +#define MPCT_ENTRY_PROCESSOR 0 +#define MPCT_ENTRY_BUS 1 +#define MPCT_ENTRY_IOAPIC 2 +#define MPCT_ENTRY_INT 3 +#define MPCT_ENTRY_LOCAL_INT 4 + +typedef struct PROCENTRY { + uint8_t type; + uint8_t apic_id; + uint8_t apic_version; + uint8_t cpu_flags; + uint32_t cpu_signature; + uint32_t feature_flags; + uint32_t reserved1; + uint32_t reserved2; +} __packed *proc_entry_ptr; + +#define PROCENTRY_FLAG_EN 0x01 +#define PROCENTRY_FLAG_BP 0x02 + +typedef struct BUSENTRY { + uint8_t type; + uint8_t bus_id; + uint8_t bus_type[6]; +} __packed *bus_entry_ptr; + +typedef struct IOAPICENTRY { + uint8_t type; + uint8_t apic_id; + uint8_t apic_version; + uint8_t apic_flags; + uint32_t apic_address; +} __packed *io_apic_entry_ptr; + +#define IOAPICENTRY_FLAG_EN 0x01 + +typedef struct INTENTRY { + uint8_t type; + uint8_t int_type; + uint16_t int_flags; + uint8_t src_bus_id; + uint8_t src_bus_irq; + uint8_t dst_apic_id; + uint8_t dst_apic_int; +} __packed *int_entry_ptr; + +#define INTENTRY_TYPE_INT 0 +#define INTENTRY_TYPE_NMI 1 +#define INTENTRY_TYPE_SMI 2 +#define INTENTRY_TYPE_EXTINT 3 + +#define INTENTRY_FLAGS_POLARITY 0x3 +#define INTENTRY_FLAGS_POLARITY_CONFORM 0x0 +#define INTENTRY_FLAGS_POLARITY_ACTIVEHI 0x1 +#define INTENTRY_FLAGS_POLARITY_ACTIVELO 0x3 +#define INTENTRY_FLAGS_TRIGGER 0xc +#define INTENTRY_FLAGS_TRIGGER_CONFORM 0x0 +#define INTENTRY_FLAGS_TRIGGER_EDGE 0x4 +#define INTENTRY_FLAGS_TRIGGER_LEVEL 0xc + +/* Extended table entries */ + +typedef struct EXTENTRY { + uint8_t type; + uint8_t length; +} __packed *ext_entry_ptr; + +#define MPCT_EXTENTRY_SAS 0x80 +#define MPCT_EXTENTRY_BHD 0x81 +#define MPCT_EXTENTRY_CBASM 0x82 + +typedef struct SASENTRY { + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t address_type; + uint64_t address_base; + uint64_t address_length; +} __packed *sas_entry_ptr; + +#define SASENTRY_TYPE_IO 0 +#define SASENTRY_TYPE_MEMORY 1 +#define SASENTRY_TYPE_PREFETCH 2 + +typedef struct BHDENTRY { + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t bus_info; + uint8_t parent_bus; + uint8_t reserved[3]; +} __packed *bhd_entry_ptr; + +#define BHDENTRY_INFO_SUBTRACTIVE_DECODE 0x1 + +typedef struct CBASMENTRY { + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t address_mod; + uint32_t predefined_range; +} __packed *cbasm_entry_ptr; + +#define CBASMENTRY_ADDRESS_MOD_ADD 0x0 +#define CBASMENTRY_ADDRESS_MOD_SUBTRACT 0x1 + +#define CBASMENTRY_RANGE_ISA_IO 0 
+#define CBASMENTRY_RANGE_VGA_IO 1 + +#pragma clang diagnostic pop diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/ns16550.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/ns16550.h new file mode 100644 index 0000000..2a023d0 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/ns16550.h @@ -0,0 +1,242 @@ +/*- + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ns16550.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD$ + */ + +/* + * NS8250... UART registers. + */ + +/* 8250 registers #[0-6]. 
*/ + +#pragma once + +#define com_data 0 /* data register (R/W) */ +#define REG_DATA com_data + +#define com_ier 1 /* interrupt enable register (W) */ +#define REG_IER com_ier +#define IER_ERXRDY 0x1 +#define IER_ETXRDY 0x2 +#define IER_ERLS 0x4 +#define IER_EMSC 0x8 + +#define IER_BITS "\20\1ERXRDY\2ETXRDY\3ERLS\4EMSC" + +#define com_iir 2 /* interrupt identification register (R) */ +#define REG_IIR com_iir +#define IIR_IMASK 0xf +#define IIR_RXTOUT 0xc +#define IIR_BUSY 0x7 +#define IIR_RLS 0x6 +#define IIR_RXRDY 0x4 +#define IIR_TXRDY 0x2 +#define IIR_NOPEND 0x1 +#define IIR_MLSC 0x0 +#define IIR_FIFO_MASK 0xc0 /* set if FIFOs are enabled */ + +#define IIR_BITS "\20\1NOPEND\2TXRDY\3RXRDY" + +#define com_lcr 3 /* line control register (R/W) */ +#define com_cfcr com_lcr /* character format control register (R/W) */ +#define REG_LCR com_lcr +#define LCR_DLAB 0x80 +#define CFCR_DLAB LCR_DLAB +#define LCR_EFR_ENABLE 0xbf /* magic to enable EFR on 16650 up */ +#define CFCR_EFR_ENABLE LCR_EFR_ENABLE +#define LCR_SBREAK 0x40 +#define CFCR_SBREAK LCR_SBREAK +#define LCR_PZERO 0x30 +#define CFCR_PZERO LCR_PZERO +#define LCR_PONE 0x20 +#define CFCR_PONE LCR_PONE +#define LCR_PEVEN 0x10 +#define CFCR_PEVEN LCR_PEVEN +#define LCR_PODD 0x00 +#define CFCR_PODD LCR_PODD +#define LCR_PENAB 0x08 +#define CFCR_PENAB LCR_PENAB +#define LCR_STOPB 0x04 +#define CFCR_STOPB LCR_STOPB +#define LCR_8BITS 0x03 +#define CFCR_8BITS LCR_8BITS +#define LCR_7BITS 0x02 +#define CFCR_7BITS LCR_7BITS +#define LCR_6BITS 0x01 +#define CFCR_6BITS LCR_6BITS +#define LCR_5BITS 0x00 +#define CFCR_5BITS LCR_5BITS + +#define com_mcr 4 /* modem control register (R/W) */ +#define REG_MCR com_mcr +#define MCR_PRESCALE 0x80 /* only available on 16650 up */ +#define MCR_LOOPBACK 0x10 +#define MCR_IE 0x08 +#define MCR_IENABLE MCR_IE +#define MCR_DRS 0x04 +#define MCR_RTS 0x02 +#define MCR_DTR 0x01 + +#define MCR_BITS "\20\1DTR\2RTS\3DRS\4IE\5LOOPBACK\10PRESCALE" + +#define com_lsr 5 /* line status register (R/W) */ +#define REG_LSR com_lsr +#define LSR_RCV_FIFO 0x80 +#define LSR_TEMT 0x40 +#define LSR_TSRE LSR_TEMT +#define LSR_THRE 0x20 +#define LSR_TXRDY LSR_THRE +#define LSR_BI 0x10 +#define LSR_FE 0x08 +#define LSR_PE 0x04 +#define LSR_OE 0x02 +#define LSR_RXRDY 0x01 +#define LSR_RCV_MASK 0x1f + +#define LSR_BITS "\20\1RXRDY\2OE\3PE\4FE\5BI\6THRE\7TEMT\10RCV_FIFO" + +#define com_msr 6 /* modem status register (R/W) */ +#define REG_MSR com_msr +#define MSR_DCD 0x80 +#define MSR_RI 0x40 +#define MSR_DSR 0x20 +#define MSR_CTS 0x10 +#define MSR_DDCD 0x08 +#define MSR_TERI 0x04 +#define MSR_DDSR 0x02 +#define MSR_DCTS 0x01 + +#define MSR_BITS "\20\1DCTS\2DDSR\3TERI\4DDCD\5CTS\6DSR\7RI\10DCD" + +/* 8250 multiplexed registers #[0-1]. Access enabled by LCR[7]. */ +#define com_dll 0 /* divisor latch low (R/W) */ +#define com_dlbl com_dll +#define com_dlm 1 /* divisor latch high (R/W) */ +#define com_dlbh com_dlm +#define REG_DLL com_dll +#define REG_DLH com_dlm + +/* 16450 register #7. Not multiplexed. */ +#define com_scr 7 /* scratch register (R/W) */ + +/* 16550 register #2. Not multiplexed. 
*/ +#define com_fcr 2 /* FIFO control register (W) */ +#define com_fifo com_fcr +#define REG_FCR com_fcr +#define FCR_ENABLE 0x01 +#define FIFO_ENABLE FCR_ENABLE +#define FCR_RCV_RST 0x02 +#define FIFO_RCV_RST FCR_RCV_RST +#define FCR_XMT_RST 0x04 +#define FIFO_XMT_RST FCR_XMT_RST +#define FCR_DMA 0x08 +#define FIFO_DMA_MODE FCR_DMA +#define FCR_RX_LOW 0x00 +#define FIFO_RX_LOW FCR_RX_LOW +#define FCR_RX_MEDL 0x40 +#define FIFO_RX_MEDL FCR_RX_MEDL +#define FCR_RX_MEDH 0x80 +#define FIFO_RX_MEDH FCR_RX_MEDH +#define FCR_RX_HIGH 0xc0 +#define FIFO_RX_HIGH FCR_RX_HIGH + +#define FCR_BITS "\20\1ENABLE\2RCV_RST\3XMT_RST\4DMA" + +/* 16650 registers #2,[4-7]. Access enabled by LCR_EFR_ENABLE. */ + +#define com_efr 2 /* enhanced features register (R/W) */ +#define REG_EFR com_efr +#define EFR_CTS 0x80 +#define EFR_AUTOCTS EFR_CTS +#define EFR_RTS 0x40 +#define EFR_AUTORTS EFR_RTS +#define EFR_EFE 0x10 /* enhanced functions enable */ + +#define com_xon1 4 /* XON 1 character (R/W) */ +#define com_xon2 5 /* XON 2 character (R/W) */ +#define com_xoff1 6 /* XOFF 1 character (R/W) */ +#define com_xoff2 7 /* XOFF 2 character (R/W) */ + +#define DW_REG_USR 31 /* DesignWare derived Uart Status Reg */ +#define com_usr 39 /* Octeon 16750/16550 Uart Status Reg */ +#define REG_USR com_usr +#define USR_BUSY 1 /* Uart Busy. Serial transfer in progress */ +#define USR_TXFIFO_NOTFULL 2 /* Uart TX FIFO Not full */ + +/* 16950 register #1. Access enabled by ACR[7]. Also requires !LCR[7]. */ +#define com_asr 1 /* additional status register (R[0-7]/W[0-1]) */ + +/* 16950 register #3. R/W access enabled by ACR[7]. */ +#define com_rfl 3 /* receiver fifo level (R) */ + +/* + * 16950 register #4. Access enabled by ACR[7]. Also requires + * !LCR_EFR_ENABLE. + */ +#define com_tfl 4 /* transmitter fifo level (R) */ + +/* + * 16950 register #5. Accessible if !LCR_EFR_ENABLE. Read access also + * requires ACR[6]. + */ +#define com_icr 5 /* index control register (R/W) */ + +/* + * 16950 register #7. It is the same as com_scr except it has a different + * abbreviation in the manufacturer's data sheet and it also serves as an + * index into the Indexed Control register set. + */ +#define com_spr com_scr /* scratch pad (and index) register (R/W) */ +#define REG_SPR com_scr + +/* + * 16950 indexed control registers #[0-0x13]. Access is via index in SPR, + * data in ICR (if ICR is accessible). + */ + +#define com_acr 0 /* additional control register (R/W) */ +#define ACR_ASE 0x80 /* ASR/RFL/TFL enable */ +#define ACR_ICRE 0x40 /* ICR enable */ +#define ACR_TLE 0x20 /* TTL/RTL enable */ + +#define com_cpr 1 /* clock prescaler register (R/W) */ +#define com_tcr 2 /* times clock register (R/W) */ +#define com_ttl 4 /* transmitter trigger level (R/W) */ +#define com_rtl 5 /* receiver trigger level (R/W) */ +/* ... */ + +/* Hardware extension mode register for RSB-2000/3000. */ +#define com_emr com_msr +#define EMR_EXBUFF 0x04 +#define EMR_CTSFLW 0x08 +#define EMR_DSRFLW 0x10 +#define EMR_RTSFLW 0x20 +#define EMR_DTRFLW 0x40 +#define EMR_EFMODE 0x80 diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/pcireg.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/pcireg.h new file mode 100644 index 0000000..720a497 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/pcireg.h @@ -0,0 +1,945 @@ +/*- + * Copyright (c) 1997, Stefan Esser + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#pragma once + +/* + * PCIM_xxx: mask to locate subfield in register + * PCIR_xxx: config register offset + * PCIC_xxx: device class + * PCIS_xxx: device subclass + * PCIP_xxx: device programming interface + * PCIV_xxx: PCI vendor ID (only required to fixup ancient devices) + * PCID_xxx: device ID + * PCIY_xxx: capability identification number + * PCIZ_xxx: extended capability identification number + */ + +/* some PCI bus constants */ +#define PCI_DOMAINMAX 65535 /* highest supported domain number */ +#define PCI_BUSMAX 255 /* highest supported bus number */ +#define PCI_SLOTMAX 31 /* highest supported slot number */ +#define PCI_FUNCMAX 7 /* highest supported function number */ +#define PCI_REGMAX 255 /* highest supported config register addr. */ +#define PCIE_REGMAX 4095 /* highest supported config register addr. 
*/ +#define PCI_MAXHDRTYPE 2 + +#define PCIE_ARI_SLOTMAX 0 +#define PCIE_ARI_FUNCMAX 255 + +#define PCI_RID_BUS_SHIFT 8 +#define PCI_RID_SLOT_SHIFT 3 +#define PCI_RID_FUNC_SHIFT 0 + +#define PCI_RID(bus, slot, func) \ + ((((bus) & PCI_BUSMAX) << PCI_RID_BUS_SHIFT) | \ + (((slot) & PCI_SLOTMAX) << PCI_RID_SLOT_SHIFT) | \ + (((func) & PCI_FUNCMAX) << PCI_RID_FUNC_SHIFT)) + +#define PCI_ARI_RID(bus, func) \ + ((((bus) & PCI_BUSMAX) << PCI_RID_BUS_SHIFT) | \ + (((func) & PCIE_ARI_FUNCMAX) << PCI_RID_FUNC_SHIFT)) + +#define PCI_RID2BUS(rid) (((rid) >> PCI_RID_BUS_SHIFT) & PCI_BUSMAX) +#define PCI_RID2SLOT(rid) (((rid) >> PCI_RID_SLOT_SHIFT) & PCI_SLOTMAX) +#define PCI_RID2FUNC(rid) (((rid) >> PCI_RID_FUNC_SHIFT) & PCI_FUNCMAX) + +#define PCIE_ARI_RID2SLOT(rid) (0) +#define PCIE_ARI_RID2FUNC(rid) \ + (((rid) >> PCI_RID_FUNC_SHIFT) & PCIE_ARI_FUNCMAX) + +#define PCIE_ARI_SLOT(func) (((func) >> PCI_RID_SLOT_SHIFT) & PCI_SLOTMAX) +#define PCIE_ARI_FUNC(func) (((func) >> PCI_RID_FUNC_SHIFT) & PCI_FUNCMAX) + +/* PCI config header registers for all devices */ + +#define PCIR_DEVVENDOR 0x00 +#define PCIR_VENDOR 0x00 +#define PCIR_DEVICE 0x02 +#define PCIR_COMMAND 0x04 +#define PCIM_CMD_PORTEN 0x0001 +#define PCIM_CMD_MEMEN 0x0002 +#define PCIM_CMD_BUSMASTEREN 0x0004 +#define PCIM_CMD_SPECIALEN 0x0008 +#define PCIM_CMD_MWRICEN 0x0010 +#define PCIM_CMD_PERRESPEN 0x0040 +#define PCIM_CMD_SERRESPEN 0x0100 +#define PCIM_CMD_BACKTOBACK 0x0200 +#define PCIM_CMD_INTxDIS 0x0400 +#define PCIR_STATUS 0x06 +#define PCIM_STATUS_INTxSTATE 0x0008 +#define PCIM_STATUS_CAPPRESENT 0x0010 +#define PCIM_STATUS_66CAPABLE 0x0020 +#define PCIM_STATUS_BACKTOBACK 0x0080 +#define PCIM_STATUS_MDPERR 0x0100 +#define PCIM_STATUS_SEL_FAST 0x0000 +#define PCIM_STATUS_SEL_MEDIMUM 0x0200 +#define PCIM_STATUS_SEL_SLOW 0x0400 +#define PCIM_STATUS_SEL_MASK 0x0600 +#define PCIM_STATUS_STABORT 0x0800 +#define PCIM_STATUS_RTABORT 0x1000 +#define PCIM_STATUS_RMABORT 0x2000 +#define PCIM_STATUS_SERR 0x4000 +#define PCIM_STATUS_PERR 0x8000 +#define PCIR_REVID 0x08 +#define PCIR_PROGIF 0x09 +#define PCIR_SUBCLASS 0x0a +#define PCIR_CLASS 0x0b +#define PCIR_CACHELNSZ 0x0c +#define PCIR_LATTIMER 0x0d +#define PCIR_HDRTYPE 0x0e +#define PCIM_HDRTYPE 0x7f +#define PCIM_HDRTYPE_NORMAL 0x00 +#define PCIM_HDRTYPE_BRIDGE 0x01 +#define PCIM_HDRTYPE_CARDBUS 0x02 +#define PCIM_MFDEV 0x80 +#define PCIR_BIST 0x0f + +/* Capability Register Offsets */ + +#define PCICAP_ID 0x0 +#define PCICAP_NEXTPTR 0x1 + +/* Capability Identification Numbers */ + +#define PCIY_PMG 0x01 /* PCI Power Management */ +#define PCIY_AGP 0x02 /* AGP */ +#define PCIY_VPD 0x03 /* Vital Product Data */ +#define PCIY_SLOTID 0x04 /* Slot Identification */ +#define PCIY_MSI 0x05 /* Message Signaled Interrupts */ +#define PCIY_CHSWP 0x06 /* CompactPCI Hot Swap */ +#define PCIY_PCIX 0x07 /* PCI-X */ +#define PCIY_HT 0x08 /* HyperTransport */ +#define PCIY_VENDOR 0x09 /* Vendor Unique */ +#define PCIY_DEBUG 0x0a /* Debug port */ +#define PCIY_CRES 0x0b /* CompactPCI central resource control */ +#define PCIY_HOTPLUG 0x0c /* PCI Hot-Plug */ +#define PCIY_SUBVENDOR 0x0d /* PCI-PCI bridge subvendor ID */ +#define PCIY_AGP8X 0x0e /* AGP 8x */ +#define PCIY_SECDEV 0x0f /* Secure Device */ +#define PCIY_EXPRESS 0x10 /* PCI Express */ +#define PCIY_MSIX 0x11 /* MSI-X */ +#define PCIY_SATA 0x12 /* SATA */ +#define PCIY_PCIAF 0x13 /* PCI Advanced Features */ + +/* Extended Capability Register Fields */ + +#define PCIR_EXTCAP 0x100 +#define PCIM_EXTCAP_ID 0x0000ffff +#define PCIM_EXTCAP_VER 0x000f0000 
+#define PCIM_EXTCAP_NEXTPTR 0xfff00000
+#define PCI_EXTCAP_ID(ecap) ((ecap) & PCIM_EXTCAP_ID)
+#define PCI_EXTCAP_VER(ecap) (((ecap) & PCIM_EXTCAP_VER) >> 16)
+#define PCI_EXTCAP_NEXTPTR(ecap) (((ecap) & PCIM_EXTCAP_NEXTPTR) >> 20)
+
+/* Extended Capability Identification Numbers */
+
+#define PCIZ_AER 0x0001 /* Advanced Error Reporting */
+#define PCIZ_VC 0x0002 /* Virtual Channel if MFVC Ext Cap not set */
+#define PCIZ_SERNUM 0x0003 /* Device Serial Number */
+#define PCIZ_PWRBDGT 0x0004 /* Power Budgeting */
+#define PCIZ_RCLINK_DCL 0x0005 /* Root Complex Link Declaration */
+#define PCIZ_RCLINK_CTL 0x0006 /* Root Complex Internal Link Control */
+#define PCIZ_RCEC_ASSOC 0x0007 /* Root Complex Event Collector Association */
+#define PCIZ_MFVC 0x0008 /* Multi-Function Virtual Channel */
+#define PCIZ_VC2 0x0009 /* Virtual Channel if MFVC Ext Cap set */
+#define PCIZ_RCRB 0x000a /* RCRB Header */
+#define PCIZ_VENDOR 0x000b /* Vendor Unique */
+#define PCIZ_CAC 0x000c /* Configuration Access Correction -- obsolete */
+#define PCIZ_ACS 0x000d /* Access Control Services */
+#define PCIZ_ARI 0x000e /* Alternative Routing-ID Interpretation */
+#define PCIZ_ATS 0x000f /* Address Translation Services */
+#define PCIZ_SRIOV 0x0010 /* Single Root IO Virtualization */
+#define PCIZ_MRIOV 0x0011 /* Multiple Root IO Virtualization */
+#define PCIZ_MULTICAST 0x0012 /* Multicast */
+#define PCIZ_PAGE_REQ 0x0013 /* Page Request */
+#define PCIZ_AMD 0x0014 /* Reserved for AMD */
+#define PCIZ_RESIZE_BAR 0x0015 /* Resizable BAR */
+#define PCIZ_DPA 0x0016 /* Dynamic Power Allocation */
+#define PCIZ_TPH_REQ 0x0017 /* TPH Requester */
+#define PCIZ_LTR 0x0018 /* Latency Tolerance Reporting */
+#define PCIZ_SEC_PCIE 0x0019 /* Secondary PCI Express */
+#define PCIZ_PMUX 0x001a /* Protocol Multiplexing */
+#define PCIZ_PASID 0x001b /* Process Address Space ID */
+#define PCIZ_LN_REQ 0x001c /* LN Requester */
+#define PCIZ_DPC 0x001d /* Downstream Port Containment */
+#define PCIZ_L1PM 0x001e /* L1 PM Substates */
+
+/* config registers for header type 0 devices */
+
+#define PCIR_BARS 0x10
+#define PCIR_BAR(x) (PCIR_BARS + (x) * 4)
+#define PCIR_MAX_BAR_0 5
+#define PCI_RID2BAR(rid) (((rid) - PCIR_BARS) / 4)
+#define PCI_BAR_IO(x) (((x) & PCIM_BAR_SPACE) == PCIM_BAR_IO_SPACE)
+#define PCI_BAR_MEM(x) (((x) & PCIM_BAR_SPACE) == PCIM_BAR_MEM_SPACE)
+#define PCIM_BAR_SPACE 0x00000001
+#define PCIM_BAR_MEM_SPACE 0
+#define PCIM_BAR_IO_SPACE 1
+#define PCIM_BAR_MEM_TYPE 0x00000006
+#define PCIM_BAR_MEM_32 0
+#define PCIM_BAR_MEM_1MB 2 /* Locate below 1MB in PCI <= 2.1 */
+#define PCIM_BAR_MEM_64 4
+#define PCIM_BAR_MEM_PREFETCH 0x00000008
+#define PCIM_BAR_MEM_BASE 0xfffffffffffffff0ULL
+#define PCIM_BAR_IO_RESERVED 0x00000002
+#define PCIM_BAR_IO_BASE 0xfffffffc
+#define PCIR_CIS 0x28
+#define PCIM_CIS_ASI_MASK 0x00000007
+#define PCIM_CIS_ASI_CONFIG 0
+#define PCIM_CIS_ASI_BAR0 1
+#define PCIM_CIS_ASI_BAR1 2
+#define PCIM_CIS_ASI_BAR2 3
+#define PCIM_CIS_ASI_BAR3 4
+#define PCIM_CIS_ASI_BAR4 5
+#define PCIM_CIS_ASI_BAR5 6
+#define PCIM_CIS_ASI_ROM 7
+#define PCIM_CIS_ADDR_MASK 0x0ffffff8
+#define PCIM_CIS_ROM_MASK 0xf0000000
+#define PCIM_CIS_CONFIG_MASK 0xff
+#define PCIR_SUBVEND_0 0x2c
+#define PCIR_SUBDEV_0 0x2e
+#define PCIR_BIOS 0x30
+#define PCIM_BIOS_ENABLE 0x01
+#define PCIM_BIOS_ADDR_MASK 0xfffff800
+#define PCIR_CAP_PTR 0x34
+#define PCIR_INTLINE 0x3c
+#define PCIR_INTPIN 0x3d
+#define PCIR_MINGNT 0x3e
+#define PCIR_MAXLAT 0x3f
+
+/* config registers for header type 1 (PCI-to-PCI bridge) devices */
+ +#define PCIR_MAX_BAR_1 1 +#define PCIR_SECSTAT_1 0x1e + +#define PCIR_PRIBUS_1 0x18 +#define PCIR_SECBUS_1 0x19 +#define PCIR_SUBBUS_1 0x1a +#define PCIR_SECLAT_1 0x1b + +#define PCIR_IOBASEL_1 0x1c +#define PCIR_IOLIMITL_1 0x1d +#define PCIR_IOBASEH_1 0x30 +#define PCIR_IOLIMITH_1 0x32 +#define PCIM_BRIO_16 0x0 +#define PCIM_BRIO_32 0x1 +#define PCIM_BRIO_MASK 0xf + +#define PCIR_MEMBASE_1 0x20 +#define PCIR_MEMLIMIT_1 0x22 + +#define PCIR_PMBASEL_1 0x24 +#define PCIR_PMLIMITL_1 0x26 +#define PCIR_PMBASEH_1 0x28 +#define PCIR_PMLIMITH_1 0x2c +#define PCIM_BRPM_32 0x0 +#define PCIM_BRPM_64 0x1 +#define PCIM_BRPM_MASK 0xf + +#define PCIR_BIOS_1 0x38 +#define PCIR_BRIDGECTL_1 0x3e + +/* config registers for header type 2 (CardBus) devices */ + +#define PCIR_MAX_BAR_2 0 +#define PCIR_CAP_PTR_2 0x14 +#define PCIR_SECSTAT_2 0x16 + +#define PCIR_PRIBUS_2 0x18 +#define PCIR_SECBUS_2 0x19 +#define PCIR_SUBBUS_2 0x1a +#define PCIR_SECLAT_2 0x1b + +#define PCIR_MEMBASE0_2 0x1c +#define PCIR_MEMLIMIT0_2 0x20 +#define PCIR_MEMBASE1_2 0x24 +#define PCIR_MEMLIMIT1_2 0x28 +#define PCIR_IOBASE0_2 0x2c +#define PCIR_IOLIMIT0_2 0x30 +#define PCIR_IOBASE1_2 0x34 +#define PCIR_IOLIMIT1_2 0x38 + +#define PCIR_BRIDGECTL_2 0x3e + +#define PCIR_SUBVEND_2 0x40 +#define PCIR_SUBDEV_2 0x42 + +#define PCIR_PCCARDIF_2 0x44 + +/* PCI device class, subclass and programming interface definitions */ + +#define PCIC_OLD 0x00 +#define PCIS_OLD_NONVGA 0x00 +#define PCIS_OLD_VGA 0x01 + +#define PCIC_STORAGE 0x01 +#define PCIS_STORAGE_SCSI 0x00 +#define PCIS_STORAGE_IDE 0x01 +#define PCIP_STORAGE_IDE_MODEPRIM 0x01 +#define PCIP_STORAGE_IDE_PROGINDPRIM 0x02 +#define PCIP_STORAGE_IDE_MODESEC 0x04 +#define PCIP_STORAGE_IDE_PROGINDSEC 0x08 +#define PCIP_STORAGE_IDE_MASTERDEV 0x80 +#define PCIS_STORAGE_FLOPPY 0x02 +#define PCIS_STORAGE_IPI 0x03 +#define PCIS_STORAGE_RAID 0x04 +#define PCIS_STORAGE_ATA_ADMA 0x05 +#define PCIS_STORAGE_SATA 0x06 +#define PCIP_STORAGE_SATA_AHCI_1_0 0x01 +#define PCIS_STORAGE_SAS 0x07 +#define PCIS_STORAGE_NVM 0x08 +#define PCIP_STORAGE_NVM_NVMHCI_1_0 0x01 +#define PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0 0x02 +#define PCIS_STORAGE_OTHER 0x80 + +#define PCIC_NETWORK 0x02 +#define PCIS_NETWORK_ETHERNET 0x00 +#define PCIS_NETWORK_TOKENRING 0x01 +#define PCIS_NETWORK_FDDI 0x02 +#define PCIS_NETWORK_ATM 0x03 +#define PCIS_NETWORK_ISDN 0x04 +#define PCIS_NETWORK_WORLDFIP 0x05 +#define PCIS_NETWORK_PICMG 0x06 +#define PCIS_NETWORK_OTHER 0x80 + +#define PCIC_DISPLAY 0x03 +#define PCIS_DISPLAY_VGA 0x00 +#define PCIS_DISPLAY_XGA 0x01 +#define PCIS_DISPLAY_3D 0x02 +#define PCIS_DISPLAY_OTHER 0x80 + +#define PCIC_MULTIMEDIA 0x04 +#define PCIS_MULTIMEDIA_VIDEO 0x00 +#define PCIS_MULTIMEDIA_AUDIO 0x01 +#define PCIS_MULTIMEDIA_TELE 0x02 +#define PCIS_MULTIMEDIA_HDA 0x03 +#define PCIS_MULTIMEDIA_OTHER 0x80 + +#define PCIC_MEMORY 0x05 +#define PCIS_MEMORY_RAM 0x00 +#define PCIS_MEMORY_FLASH 0x01 +#define PCIS_MEMORY_OTHER 0x80 + +#define PCIC_BRIDGE 0x06 +#define PCIS_BRIDGE_HOST 0x00 +#define PCIS_BRIDGE_ISA 0x01 +#define PCIS_BRIDGE_EISA 0x02 +#define PCIS_BRIDGE_MCA 0x03 +#define PCIS_BRIDGE_PCI 0x04 +#define PCIP_BRIDGE_PCI_SUBTRACTIVE 0x01 +#define PCIS_BRIDGE_PCMCIA 0x05 +#define PCIS_BRIDGE_NUBUS 0x06 +#define PCIS_BRIDGE_CARDBUS 0x07 +#define PCIS_BRIDGE_RACEWAY 0x08 +#define PCIS_BRIDGE_PCI_TRANSPARENT 0x09 +#define PCIS_BRIDGE_INFINIBAND 0x0a +#define PCIS_BRIDGE_OTHER 0x80 + +#define PCIC_SIMPLECOMM 0x07 +#define PCIS_SIMPLECOMM_UART 0x00 +#define PCIP_SIMPLECOMM_UART_8250 0x00 +#define 
PCIP_SIMPLECOMM_UART_16450A 0x01 +#define PCIP_SIMPLECOMM_UART_16550A 0x02 +#define PCIP_SIMPLECOMM_UART_16650A 0x03 +#define PCIP_SIMPLECOMM_UART_16750A 0x04 +#define PCIP_SIMPLECOMM_UART_16850A 0x05 +#define PCIP_SIMPLECOMM_UART_16950A 0x06 +#define PCIS_SIMPLECOMM_PAR 0x01 +#define PCIS_SIMPLECOMM_MULSER 0x02 +#define PCIS_SIMPLECOMM_MODEM 0x03 +#define PCIS_SIMPLECOMM_GPIB 0x04 +#define PCIS_SIMPLECOMM_SMART_CARD 0x05 +#define PCIS_SIMPLECOMM_OTHER 0x80 + +#define PCIC_BASEPERIPH 0x08 +#define PCIS_BASEPERIPH_PIC 0x00 +#define PCIP_BASEPERIPH_PIC_8259A 0x00 +#define PCIP_BASEPERIPH_PIC_ISA 0x01 +#define PCIP_BASEPERIPH_PIC_EISA 0x02 +#define PCIP_BASEPERIPH_PIC_IO_APIC 0x10 +#define PCIP_BASEPERIPH_PIC_IOX_APIC 0x20 +#define PCIS_BASEPERIPH_DMA 0x01 +#define PCIS_BASEPERIPH_TIMER 0x02 +#define PCIS_BASEPERIPH_RTC 0x03 +#define PCIS_BASEPERIPH_PCIHOT 0x04 +#define PCIS_BASEPERIPH_SDHC 0x05 +#define PCIS_BASEPERIPH_IOMMU 0x06 +#define PCIS_BASEPERIPH_OTHER 0x80 + +#define PCIC_INPUTDEV 0x09 +#define PCIS_INPUTDEV_KEYBOARD 0x00 +#define PCIS_INPUTDEV_DIGITIZER 0x01 +#define PCIS_INPUTDEV_MOUSE 0x02 +#define PCIS_INPUTDEV_SCANNER 0x03 +#define PCIS_INPUTDEV_GAMEPORT 0x04 +#define PCIS_INPUTDEV_OTHER 0x80 + +#define PCIC_DOCKING 0x0a +#define PCIS_DOCKING_GENERIC 0x00 +#define PCIS_DOCKING_OTHER 0x80 + +#define PCIC_PROCESSOR 0x0b +#define PCIS_PROCESSOR_386 0x00 +#define PCIS_PROCESSOR_486 0x01 +#define PCIS_PROCESSOR_PENTIUM 0x02 +#define PCIS_PROCESSOR_ALPHA 0x10 +#define PCIS_PROCESSOR_POWERPC 0x20 +#define PCIS_PROCESSOR_MIPS 0x30 +#define PCIS_PROCESSOR_COPROC 0x40 + +#define PCIC_SERIALBUS 0x0c +#define PCIS_SERIALBUS_FW 0x00 +#define PCIS_SERIALBUS_ACCESS 0x01 +#define PCIS_SERIALBUS_SSA 0x02 +#define PCIS_SERIALBUS_USB 0x03 +#define PCIP_SERIALBUS_USB_UHCI 0x00 +#define PCIP_SERIALBUS_USB_OHCI 0x10 +#define PCIP_SERIALBUS_USB_EHCI 0x20 +#define PCIP_SERIALBUS_USB_XHCI 0x30 +#define PCIP_SERIALBUS_USB_DEVICE 0xfe +#define PCIS_SERIALBUS_FC 0x04 +#define PCIS_SERIALBUS_SMBUS 0x05 +#define PCIS_SERIALBUS_INFINIBAND 0x06 +#define PCIS_SERIALBUS_IPMI 0x07 +#define PCIP_SERIALBUS_IPMI_SMIC 0x00 +#define PCIP_SERIALBUS_IPMI_KCS 0x01 +#define PCIP_SERIALBUS_IPMI_BT 0x02 +#define PCIS_SERIALBUS_SERCOS 0x08 +#define PCIS_SERIALBUS_CANBUS 0x09 + +#define PCIC_WIRELESS 0x0d +#define PCIS_WIRELESS_IRDA 0x00 +#define PCIS_WIRELESS_IR 0x01 +#define PCIS_WIRELESS_RF 0x10 +#define PCIS_WIRELESS_BLUETOOTH 0x11 +#define PCIS_WIRELESS_BROADBAND 0x12 +#define PCIS_WIRELESS_80211A 0x20 +#define PCIS_WIRELESS_80211B 0x21 +#define PCIS_WIRELESS_OTHER 0x80 + +#define PCIC_INTELLIIO 0x0e +#define PCIS_INTELLIIO_I2O 0x00 + +#define PCIC_SATCOM 0x0f +#define PCIS_SATCOM_TV 0x01 +#define PCIS_SATCOM_AUDIO 0x02 +#define PCIS_SATCOM_VOICE 0x03 +#define PCIS_SATCOM_DATA 0x04 + +#define PCIC_CRYPTO 0x10 +#define PCIS_CRYPTO_NETCOMP 0x00 +#define PCIS_CRYPTO_ENTERTAIN 0x10 +#define PCIS_CRYPTO_OTHER 0x80 + +#define PCIC_DASP 0x11 +#define PCIS_DASP_DPIO 0x00 +#define PCIS_DASP_PERFCNTRS 0x01 +#define PCIS_DASP_COMM_SYNC 0x10 +#define PCIS_DASP_MGMT_CARD 0x20 +#define PCIS_DASP_OTHER 0x80 + +#define PCIC_OTHER 0xff + +/* Bridge Control Values. 
*/
+#define PCIB_BCR_PERR_ENABLE 0x0001
+#define PCIB_BCR_SERR_ENABLE 0x0002
+#define PCIB_BCR_ISA_ENABLE 0x0004
+#define PCIB_BCR_VGA_ENABLE 0x0008
+#define PCIB_BCR_MASTER_ABORT_MODE 0x0020
+#define PCIB_BCR_SECBUS_RESET 0x0040
+#define PCIB_BCR_SECBUS_BACKTOBACK 0x0080
+#define PCIB_BCR_PRI_DISCARD_TIMEOUT 0x0100
+#define PCIB_BCR_SEC_DISCARD_TIMEOUT 0x0200
+#define PCIB_BCR_DISCARD_TIMER_STATUS 0x0400
+#define PCIB_BCR_DISCARD_TIMER_SERREN 0x0800
+
+/* PCI power management */
+#define PCIR_POWER_CAP 0x2
+#define PCIM_PCAP_SPEC 0x0007
+#define PCIM_PCAP_PMEREQCLK 0x0008
+#define PCIM_PCAP_DEVSPECINIT 0x0020
+#define PCIM_PCAP_AUXPWR_0 0x0000
+#define PCIM_PCAP_AUXPWR_55 0x0040
+#define PCIM_PCAP_AUXPWR_100 0x0080
+#define PCIM_PCAP_AUXPWR_160 0x00c0
+#define PCIM_PCAP_AUXPWR_220 0x0100
+#define PCIM_PCAP_AUXPWR_270 0x0140
+#define PCIM_PCAP_AUXPWR_320 0x0180
+#define PCIM_PCAP_AUXPWR_375 0x01c0
+#define PCIM_PCAP_AUXPWRMASK 0x01c0
+#define PCIM_PCAP_D1SUPP 0x0200
+#define PCIM_PCAP_D2SUPP 0x0400
+#define PCIM_PCAP_D0PME 0x0800
+#define PCIM_PCAP_D1PME 0x1000
+#define PCIM_PCAP_D2PME 0x2000
+#define PCIM_PCAP_D3PME_HOT 0x4000
+#define PCIM_PCAP_D3PME_COLD 0x8000
+
+#define PCIR_POWER_STATUS 0x4
+#define PCIM_PSTAT_D0 0x0000
+#define PCIM_PSTAT_D1 0x0001
+#define PCIM_PSTAT_D2 0x0002
+#define PCIM_PSTAT_D3 0x0003
+#define PCIM_PSTAT_DMASK 0x0003
+#define PCIM_PSTAT_NOSOFTRESET 0x0008
+#define PCIM_PSTAT_PMEENABLE 0x0100
+#define PCIM_PSTAT_D0POWER 0x0000
+#define PCIM_PSTAT_D1POWER 0x0200
+#define PCIM_PSTAT_D2POWER 0x0400
+#define PCIM_PSTAT_D3POWER 0x0600
+#define PCIM_PSTAT_D0HEAT 0x0800
+#define PCIM_PSTAT_D1HEAT 0x0a00
+#define PCIM_PSTAT_D2HEAT 0x0c00
+#define PCIM_PSTAT_D3HEAT 0x0e00
+#define PCIM_PSTAT_DATASELMASK 0x1e00
+#define PCIM_PSTAT_DATAUNKN 0x0000
+#define PCIM_PSTAT_DATADIV10 0x2000
+#define PCIM_PSTAT_DATADIV100 0x4000
+#define PCIM_PSTAT_DATADIV1000 0x6000
+#define PCIM_PSTAT_DATADIVMASK 0x6000
+#define PCIM_PSTAT_PME 0x8000
+
+#define PCIR_POWER_BSE 0x6
+#define PCIM_PMCSR_BSE_D3B3 0x00
+#define PCIM_PMCSR_BSE_D3B2 0x40
+#define PCIM_PMCSR_BSE_BPCCE 0x80
+
+#define PCIR_POWER_DATA 0x7
+
+/* VPD capability registers */
+#define PCIR_VPD_ADDR 0x2
+#define PCIR_VPD_DATA 0x4
+
+/* PCI Message Signalled Interrupts (MSI) */
+#define PCIR_MSI_CTRL 0x2
+#define PCIM_MSICTRL_VECTOR 0x0100
+#define PCIM_MSICTRL_64BIT 0x0080
+#define PCIM_MSICTRL_MME_MASK 0x0070
+#define PCIM_MSICTRL_MME_1 0x0000
+#define PCIM_MSICTRL_MME_2 0x0010
+#define PCIM_MSICTRL_MME_4 0x0020
+#define PCIM_MSICTRL_MME_8 0x0030
+#define PCIM_MSICTRL_MME_16 0x0040
+#define PCIM_MSICTRL_MME_32 0x0050
+#define PCIM_MSICTRL_MMC_MASK 0x000E
+#define PCIM_MSICTRL_MMC_1 0x0000
+#define PCIM_MSICTRL_MMC_2 0x0002
+#define PCIM_MSICTRL_MMC_4 0x0004
+#define PCIM_MSICTRL_MMC_8 0x0006
+#define PCIM_MSICTRL_MMC_16 0x0008
+#define PCIM_MSICTRL_MMC_32 0x000A
+#define PCIM_MSICTRL_MSI_ENABLE 0x0001
+#define PCIR_MSI_ADDR 0x4
+#define PCIR_MSI_ADDR_HIGH 0x8
+#define PCIR_MSI_DATA 0x8
+#define PCIR_MSI_DATA_64BIT 0xc
+#define PCIR_MSI_MASK 0x10
+#define PCIR_MSI_PENDING 0x14
+
+/* PCI-X definitions */
+
+/* For header type 0 devices */
+#define PCIXR_COMMAND 0x2
+#define PCIXM_COMMAND_DPERR_E 0x0001 /* Data Parity Error Recovery */
+#define PCIXM_COMMAND_ERO 0x0002 /* Enable Relaxed Ordering */
+#define PCIXM_COMMAND_MAX_READ 0x000c /* Maximum Burst Read Count */
+#define PCIXM_COMMAND_MAX_READ_512 0x0000
+#define PCIXM_COMMAND_MAX_READ_1024 0x0004
+#define PCIXM_COMMAND_MAX_READ_2048 0x0008
+#define 
PCIXM_COMMAND_MAX_READ_4096 0x000c +#define PCIXM_COMMAND_MAX_SPLITS 0x0070 /* Maximum Split Transactions */ +#define PCIXM_COMMAND_MAX_SPLITS_1 0x0000 +#define PCIXM_COMMAND_MAX_SPLITS_2 0x0010 +#define PCIXM_COMMAND_MAX_SPLITS_3 0x0020 +#define PCIXM_COMMAND_MAX_SPLITS_4 0x0030 +#define PCIXM_COMMAND_MAX_SPLITS_8 0x0040 +#define PCIXM_COMMAND_MAX_SPLITS_12 0x0050 +#define PCIXM_COMMAND_MAX_SPLITS_16 0x0060 +#define PCIXM_COMMAND_MAX_SPLITS_32 0x0070 +#define PCIXM_COMMAND_VERSION 0x3000 +#define PCIXR_STATUS 0x4 +#define PCIXM_STATUS_DEVFN 0x000000FF +#define PCIXM_STATUS_BUS 0x0000FF00 +#define PCIXM_STATUS_64BIT 0x00010000 +#define PCIXM_STATUS_133CAP 0x00020000 +#define PCIXM_STATUS_SC_DISCARDED 0x00040000 +#define PCIXM_STATUS_UNEXP_SC 0x00080000 +#define PCIXM_STATUS_COMPLEX_DEV 0x00100000 +#define PCIXM_STATUS_MAX_READ 0x00600000 +#define PCIXM_STATUS_MAX_READ_512 0x00000000 +#define PCIXM_STATUS_MAX_READ_1024 0x00200000 +#define PCIXM_STATUS_MAX_READ_2048 0x00400000 +#define PCIXM_STATUS_MAX_READ_4096 0x00600000 +#define PCIXM_STATUS_MAX_SPLITS 0x03800000 +#define PCIXM_STATUS_MAX_SPLITS_1 0x00000000 +#define PCIXM_STATUS_MAX_SPLITS_2 0x00800000 +#define PCIXM_STATUS_MAX_SPLITS_3 0x01000000 +#define PCIXM_STATUS_MAX_SPLITS_4 0x01800000 +#define PCIXM_STATUS_MAX_SPLITS_8 0x02000000 +#define PCIXM_STATUS_MAX_SPLITS_12 0x02800000 +#define PCIXM_STATUS_MAX_SPLITS_16 0x03000000 +#define PCIXM_STATUS_MAX_SPLITS_32 0x03800000 +#define PCIXM_STATUS_MAX_CUM_READ 0x1C000000 +#define PCIXM_STATUS_RCVD_SC_ERR 0x20000000 +#define PCIXM_STATUS_266CAP 0x40000000 +#define PCIXM_STATUS_533CAP 0x80000000 + +/* For header type 1 devices (PCI-X bridges) */ +#define PCIXR_SEC_STATUS 0x2 +#define PCIXM_SEC_STATUS_64BIT 0x0001 +#define PCIXM_SEC_STATUS_133CAP 0x0002 +#define PCIXM_SEC_STATUS_SC_DISC 0x0004 +#define PCIXM_SEC_STATUS_UNEXP_SC 0x0008 +#define PCIXM_SEC_STATUS_SC_OVERRUN 0x0010 +#define PCIXM_SEC_STATUS_SR_DELAYED 0x0020 +#define PCIXM_SEC_STATUS_BUS_MODE 0x03c0 +#define PCIXM_SEC_STATUS_VERSION 0x3000 +#define PCIXM_SEC_STATUS_266CAP 0x4000 +#define PCIXM_SEC_STATUS_533CAP 0x8000 +#define PCIXR_BRIDGE_STATUS 0x4 +#define PCIXM_BRIDGE_STATUS_DEVFN 0x000000FF +#define PCIXM_BRIDGE_STATUS_BUS 0x0000FF00 +#define PCIXM_BRIDGE_STATUS_64BIT 0x00010000 +#define PCIXM_BRIDGE_STATUS_133CAP 0x00020000 +#define PCIXM_BRIDGE_STATUS_SC_DISCARDED 0x00040000 +#define PCIXM_BRIDGE_STATUS_UNEXP_SC 0x00080000 +#define PCIXM_BRIDGE_STATUS_SC_OVERRUN 0x00100000 +#define PCIXM_BRIDGE_STATUS_SR_DELAYED 0x00200000 +#define PCIXM_BRIDGE_STATUS_DEVID_MSGCAP 0x20000000 +#define PCIXM_BRIDGE_STATUS_266CAP 0x40000000 +#define PCIXM_BRIDGE_STATUS_533CAP 0x80000000 + +/* HT (HyperTransport) Capability definitions */ +#define PCIR_HT_COMMAND 0x2 +#define PCIM_HTCMD_CAP_MASK 0xf800 /* Capability type. 
*/ +#define PCIM_HTCAP_SLAVE 0x0000 /* 000xx */ +#define PCIM_HTCAP_HOST 0x2000 /* 001xx */ +#define PCIM_HTCAP_SWITCH 0x4000 /* 01000 */ +#define PCIM_HTCAP_INTERRUPT 0x8000 /* 10000 */ +#define PCIM_HTCAP_REVISION_ID 0x8800 /* 10001 */ +#define PCIM_HTCAP_UNITID_CLUMPING 0x9000 /* 10010 */ +#define PCIM_HTCAP_EXT_CONFIG_SPACE 0x9800 /* 10011 */ +#define PCIM_HTCAP_ADDRESS_MAPPING 0xa000 /* 10100 */ +#define PCIM_HTCAP_MSI_MAPPING 0xa800 /* 10101 */ +#define PCIM_HTCAP_DIRECT_ROUTE 0xb000 /* 10110 */ +#define PCIM_HTCAP_VCSET 0xb800 /* 10111 */ +#define PCIM_HTCAP_RETRY_MODE 0xc000 /* 11000 */ +#define PCIM_HTCAP_X86_ENCODING 0xc800 /* 11001 */ +#define PCIM_HTCAP_GEN3 0xd000 /* 11010 */ +#define PCIM_HTCAP_FLE 0xd800 /* 11011 */ +#define PCIM_HTCAP_PM 0xe000 /* 11100 */ +#define PCIM_HTCAP_HIGH_NODE_COUNT 0xe800 /* 11101 */ + +/* HT MSI Mapping Capability definitions. */ +#define PCIM_HTCMD_MSI_ENABLE 0x0001 +#define PCIM_HTCMD_MSI_FIXED 0x0002 +#define PCIR_HTMSI_ADDRESS_LO 0x4 +#define PCIR_HTMSI_ADDRESS_HI 0x8 + +/* PCI Vendor capability definitions */ +#define PCIR_VENDOR_LENGTH 0x2 +#define PCIR_VENDOR_DATA 0x3 + +/* PCI EHCI Debug Port definitions */ +#define PCIR_DEBUG_PORT 0x2 +#define PCIM_DEBUG_PORT_OFFSET 0x1FFF +#define PCIM_DEBUG_PORT_BAR 0xe000 + +/* PCI-PCI Bridge Subvendor definitions */ +#define PCIR_SUBVENDCAP_ID 0x4 + +/* PCI Express definitions */ +#define PCIER_FLAGS 0x2 +#define PCIEM_FLAGS_VERSION 0x000F +#define PCIEM_FLAGS_TYPE 0x00F0 +#define PCIEM_TYPE_ENDPOINT 0x0000 +#define PCIEM_TYPE_LEGACY_ENDPOINT 0x0010 +#define PCIEM_TYPE_ROOT_PORT 0x0040 +#define PCIEM_TYPE_UPSTREAM_PORT 0x0050 +#define PCIEM_TYPE_DOWNSTREAM_PORT 0x0060 +#define PCIEM_TYPE_PCI_BRIDGE 0x0070 +#define PCIEM_TYPE_PCIE_BRIDGE 0x0080 +#define PCIEM_TYPE_ROOT_INT_EP 0x0090 +#define PCIEM_TYPE_ROOT_EC 0x00a0 +#define PCIEM_FLAGS_SLOT 0x0100 +#define PCIEM_FLAGS_IRQ 0x3e00 +#define PCIER_DEVICE_CAP 0x4 +#define PCIEM_CAP_MAX_PAYLOAD 0x00000007 +#define PCIEM_CAP_PHANTHOM_FUNCS 0x00000018 +#define PCIEM_CAP_EXT_TAG_FIELD 0x00000020 +#define PCIEM_CAP_L0S_LATENCY 0x000001c0 +#define PCIEM_CAP_L1_LATENCY 0x00000e00 +#define PCIEM_CAP_ROLE_ERR_RPT 0x00008000 +#define PCIEM_CAP_SLOT_PWR_LIM_VAL 0x03fc0000 +#define PCIEM_CAP_SLOT_PWR_LIM_SCALE 0x0c000000 +#define PCIEM_CAP_FLR 0x10000000 +#define PCIER_DEVICE_CTL 0x8 +#define PCIEM_CTL_COR_ENABLE 0x0001 +#define PCIEM_CTL_NFER_ENABLE 0x0002 +#define PCIEM_CTL_FER_ENABLE 0x0004 +#define PCIEM_CTL_URR_ENABLE 0x0008 +#define PCIEM_CTL_RELAXED_ORD_ENABLE 0x0010 +#define PCIEM_CTL_MAX_PAYLOAD 0x00e0 +#define PCIEM_CTL_EXT_TAG_FIELD 0x0100 +#define PCIEM_CTL_PHANTHOM_FUNCS 0x0200 +#define PCIEM_CTL_AUX_POWER_PM 0x0400 +#define PCIEM_CTL_NOSNOOP_ENABLE 0x0800 +#define PCIEM_CTL_MAX_READ_REQUEST 0x7000 +#define PCIEM_CTL_BRDG_CFG_RETRY 0x8000 /* PCI-E - PCI/PCI-X bridges */ +#define PCIEM_CTL_INITIATE_FLR 0x8000 /* FLR capable endpoints */ +#define PCIER_DEVICE_STA 0xa +#define PCIEM_STA_CORRECTABLE_ERROR 0x0001 +#define PCIEM_STA_NON_FATAL_ERROR 0x0002 +#define PCIEM_STA_FATAL_ERROR 0x0004 +#define PCIEM_STA_UNSUPPORTED_REQ 0x0008 +#define PCIEM_STA_AUX_POWER 0x0010 +#define PCIEM_STA_TRANSACTION_PND 0x0020 +#define PCIER_LINK_CAP 0xc +#define PCIEM_LINK_CAP_MAX_SPEED 0x0000000f +#define PCIEM_LINK_CAP_MAX_WIDTH 0x000003f0 +#define PCIEM_LINK_CAP_ASPM 0x00000c00 +#define PCIEM_LINK_CAP_L0S_EXIT 0x00007000 +#define PCIEM_LINK_CAP_L1_EXIT 0x00038000 +#define PCIEM_LINK_CAP_CLOCK_PM 0x00040000 +#define PCIEM_LINK_CAP_SURPRISE_DOWN 0x00080000 +#define 
PCIEM_LINK_CAP_DL_ACTIVE 0x00100000 +#define PCIEM_LINK_CAP_LINK_BW_NOTIFY 0x00200000 +#define PCIEM_LINK_CAP_ASPM_COMPLIANCE 0x00400000 +#define PCIEM_LINK_CAP_PORT 0xff000000 +#define PCIER_LINK_CTL 0x10 +#define PCIEM_LINK_CTL_ASPMC_DIS 0x0000 +#define PCIEM_LINK_CTL_ASPMC_L0S 0x0001 +#define PCIEM_LINK_CTL_ASPMC_L1 0x0002 +#define PCIEM_LINK_CTL_ASPMC 0x0003 +#define PCIEM_LINK_CTL_RCB 0x0008 +#define PCIEM_LINK_CTL_LINK_DIS 0x0010 +#define PCIEM_LINK_CTL_RETRAIN_LINK 0x0020 +#define PCIEM_LINK_CTL_COMMON_CLOCK 0x0040 +#define PCIEM_LINK_CTL_EXTENDED_SYNC 0x0080 +#define PCIEM_LINK_CTL_ECPM 0x0100 +#define PCIEM_LINK_CTL_HAWD 0x0200 +#define PCIEM_LINK_CTL_LBMIE 0x0400 +#define PCIEM_LINK_CTL_LABIE 0x0800 +#define PCIER_LINK_STA 0x12 +#define PCIEM_LINK_STA_SPEED 0x000f +#define PCIEM_LINK_STA_WIDTH 0x03f0 +#define PCIEM_LINK_STA_TRAINING_ERROR 0x0400 +#define PCIEM_LINK_STA_TRAINING 0x0800 +#define PCIEM_LINK_STA_SLOT_CLOCK 0x1000 +#define PCIEM_LINK_STA_DL_ACTIVE 0x2000 +#define PCIEM_LINK_STA_LINK_BW_MGMT 0x4000 +#define PCIEM_LINK_STA_LINK_AUTO_BW 0x8000 +#define PCIER_SLOT_CAP 0x14 +#define PCIEM_SLOT_CAP_APB 0x00000001 +#define PCIEM_SLOT_CAP_PCP 0x00000002 +#define PCIEM_SLOT_CAP_MRLSP 0x00000004 +#define PCIEM_SLOT_CAP_AIP 0x00000008 +#define PCIEM_SLOT_CAP_PIP 0x00000010 +#define PCIEM_SLOT_CAP_HPS 0x00000020 +#define PCIEM_SLOT_CAP_HPC 0x00000040 +#define PCIEM_SLOT_CAP_SPLV 0x00007f80 +#define PCIEM_SLOT_CAP_SPLS 0x00018000 +#define PCIEM_SLOT_CAP_EIP 0x00020000 +#define PCIEM_SLOT_CAP_NCCS 0x00040000 +#define PCIEM_SLOT_CAP_PSN 0xfff80000 +#define PCIER_SLOT_CTL 0x18 +#define PCIEM_SLOT_CTL_ABPE 0x0001 +#define PCIEM_SLOT_CTL_PFDE 0x0002 +#define PCIEM_SLOT_CTL_MRLSCE 0x0004 +#define PCIEM_SLOT_CTL_PDCE 0x0008 +#define PCIEM_SLOT_CTL_CCIE 0x0010 +#define PCIEM_SLOT_CTL_HPIE 0x0020 +#define PCIEM_SLOT_CTL_AIC 0x00c0 +#define PCIEM_SLOT_CTL_PIC 0x0300 +#define PCIEM_SLOT_CTL_PCC 0x0400 +#define PCIEM_SLOT_CTL_EIC 0x0800 +#define PCIEM_SLOT_CTL_DLLSCE 0x1000 +#define PCIER_SLOT_STA 0x1a +#define PCIEM_SLOT_STA_ABP 0x0001 +#define PCIEM_SLOT_STA_PFD 0x0002 +#define PCIEM_SLOT_STA_MRLSC 0x0004 +#define PCIEM_SLOT_STA_PDC 0x0008 +#define PCIEM_SLOT_STA_CC 0x0010 +#define PCIEM_SLOT_STA_MRLSS 0x0020 +#define PCIEM_SLOT_STA_PDS 0x0040 +#define PCIEM_SLOT_STA_EIS 0x0080 +#define PCIEM_SLOT_STA_DLLSC 0x0100 +#define PCIER_ROOT_CTL 0x1c +#define PCIEM_ROOT_CTL_SERR_CORR 0x0001 +#define PCIEM_ROOT_CTL_SERR_NONFATAL 0x0002 +#define PCIEM_ROOT_CTL_SERR_FATAL 0x0004 +#define PCIEM_ROOT_CTL_PME 0x0008 +#define PCIEM_ROOT_CTL_CRS_VIS 0x0010 +#define PCIER_ROOT_CAP 0x1e +#define PCIEM_ROOT_CAP_CRS_VIS 0x0001 +#define PCIER_ROOT_STA 0x20 +#define PCIEM_ROOT_STA_PME_REQID_MASK 0x0000ffff +#define PCIEM_ROOT_STA_PME_STATUS 0x00010000 +#define PCIEM_ROOT_STA_PME_PEND 0x00020000 +#define PCIER_DEVICE_CAP2 0x24 +#define PCIEM_CAP2_ARI 0x20 +#define PCIER_DEVICE_CTL2 0x28 +#define PCIEM_CTL2_COMP_TIMEOUT_VAL 0x000f +#define PCIEM_CTL2_COMP_TIMEOUT_DIS 0x0010 +#define PCIEM_CTL2_ARI 0x0020 +#define PCIEM_CTL2_ATOMIC_REQ_ENABLE 0x0040 +#define PCIEM_CTL2_ATOMIC_EGR_BLOCK 0x0080 +#define PCIEM_CTL2_ID_ORDERED_REQ_EN 0x0100 +#define PCIEM_CTL2_ID_ORDERED_CMP_EN 0x0200 +#define PCIEM_CTL2_LTR_ENABLE 0x0400 +#define PCIEM_CTL2_OBFF 0x6000 +#define PCIEM_OBFF_DISABLE 0x0000 +#define PCIEM_OBFF_MSGA_ENABLE 0x2000 +#define PCIEM_OBFF_MSGB_ENABLE 0x4000 +#define PCIEM_OBFF_WAKE_ENABLE 0x6000 +#define PCIEM_CTL2_END2END_TLP 0x8000 +#define PCIER_DEVICE_STA2 0x2a +#define PCIER_LINK_CAP2 0x2c +#define 
PCIER_LINK_CTL2 0x30 +#define PCIER_LINK_STA2 0x32 +#define PCIER_SLOT_CAP2 0x34 +#define PCIER_SLOT_CTL2 0x38 +#define PCIER_SLOT_STA2 0x3a + +/* MSI-X definitions */ +#define PCIR_MSIX_CTRL 0x2 +#define PCIM_MSIXCTRL_MSIX_ENABLE 0x8000 +#define PCIM_MSIXCTRL_FUNCTION_MASK 0x4000 +#define PCIM_MSIXCTRL_TABLE_SIZE 0x07FF +#define PCIR_MSIX_TABLE 0x4 +#define PCIR_MSIX_PBA 0x8 +#define PCIM_MSIX_BIR_MASK 0x7 +#define PCIM_MSIX_BIR_BAR_10 0 +#define PCIM_MSIX_BIR_BAR_14 1 +#define PCIM_MSIX_BIR_BAR_18 2 +#define PCIM_MSIX_BIR_BAR_1C 3 +#define PCIM_MSIX_BIR_BAR_20 4 +#define PCIM_MSIX_BIR_BAR_24 5 +#define PCIM_MSIX_VCTRL_MASK 0x1 + +/* PCI Advanced Features definitions */ +#define PCIR_PCIAF_CAP 0x3 +#define PCIM_PCIAFCAP_TP 0x01 +#define PCIM_PCIAFCAP_FLR 0x02 +#define PCIR_PCIAF_CTRL 0x4 +#define PCIR_PCIAFCTRL_FLR 0x01 +#define PCIR_PCIAF_STATUS 0x5 +#define PCIR_PCIAFSTATUS_TP 0x01 + +/* Advanced Error Reporting */ +#define PCIR_AER_UC_STATUS 0x04 +#define PCIM_AER_UC_TRAINING_ERROR 0x00000001 +#define PCIM_AER_UC_DL_PROTOCOL_ERROR 0x00000010 +#define PCIM_AER_UC_SURPRISE_LINK_DOWN 0x00000020 +#define PCIM_AER_UC_POISONED_TLP 0x00001000 +#define PCIM_AER_UC_FC_PROTOCOL_ERROR 0x00002000 +#define PCIM_AER_UC_COMPLETION_TIMEOUT 0x00004000 +#define PCIM_AER_UC_COMPLETER_ABORT 0x00008000 +#define PCIM_AER_UC_UNEXPECTED_COMPLETION 0x00010000 +#define PCIM_AER_UC_RECEIVER_OVERFLOW 0x00020000 +#define PCIM_AER_UC_MALFORMED_TLP 0x00040000 +#define PCIM_AER_UC_ECRC_ERROR 0x00080000 +#define PCIM_AER_UC_UNSUPPORTED_REQUEST 0x00100000 +#define PCIM_AER_UC_ACS_VIOLATION 0x00200000 +#define PCIM_AER_UC_INTERNAL_ERROR 0x00400000 +#define PCIM_AER_UC_MC_BLOCKED_TLP 0x00800000 +#define PCIM_AER_UC_ATOMIC_EGRESS_BLK 0x01000000 +#define PCIM_AER_UC_TLP_PREFIX_BLOCKED 0x02000000 +#define PCIR_AER_UC_MASK 0x08 /* Shares bits with UC_STATUS */ +#define PCIR_AER_UC_SEVERITY 0x0c /* Shares bits with UC_STATUS */ +#define PCIR_AER_COR_STATUS 0x10 +#define PCIM_AER_COR_RECEIVER_ERROR 0x00000001 +#define PCIM_AER_COR_BAD_TLP 0x00000040 +#define PCIM_AER_COR_BAD_DLLP 0x00000080 +#define PCIM_AER_COR_REPLAY_ROLLOVER 0x00000100 +#define PCIM_AER_COR_REPLAY_TIMEOUT 0x00001000 +#define PCIM_AER_COR_ADVISORY_NF_ERROR 0x00002000 +#define PCIM_AER_COR_INTERNAL_ERROR 0x00004000 +#define PCIM_AER_COR_HEADER_LOG_OVFLOW 0x00008000 +#define PCIR_AER_COR_MASK 0x14 /* Shares bits with COR_STATUS */ +#define PCIR_AER_CAP_CONTROL 0x18 +#define PCIM_AER_FIRST_ERROR_PTR 0x0000001f +#define PCIM_AER_ECRC_GEN_CAPABLE 0x00000020 +#define PCIM_AER_ECRC_GEN_ENABLE 0x00000040 +#define PCIM_AER_ECRC_CHECK_CAPABLE 0x00000080 +#define PCIM_AER_ECRC_CHECK_ENABLE 0x00000100 +#define PCIM_AER_MULT_HDR_CAPABLE 0x00000200 +#define PCIM_AER_MULT_HDR_ENABLE 0x00000400 +#define PCIM_AER_TLP_PREFIX_LOG_PRESENT 0x00000800 +#define PCIR_AER_HEADER_LOG 0x1c +#define PCIR_AER_ROOTERR_CMD 0x2c /* Only for root complex ports */ +#define PCIM_AER_ROOTERR_COR_ENABLE 0x00000001 +#define PCIM_AER_ROOTERR_NF_ENABLE 0x00000002 +#define PCIM_AER_ROOTERR_F_ENABLE 0x00000004 +#define PCIR_AER_ROOTERR_STATUS 0x30 /* Only for root complex ports */ +#define PCIM_AER_ROOTERR_COR_ERR 0x00000001 +#define PCIM_AER_ROOTERR_MULTI_COR_ERR 0x00000002 +#define PCIM_AER_ROOTERR_UC_ERR 0x00000004 +#define PCIM_AER_ROOTERR_MULTI_UC_ERR 0x00000008 +#define PCIM_AER_ROOTERR_FIRST_UC_FATAL 0x00000010 +#define PCIM_AER_ROOTERR_NF_ERR 0x00000020 +#define PCIM_AER_ROOTERR_F_ERR 0x00000040 +#define PCIM_AER_ROOTERR_INT_MESSAGE 0xf8000000 +#define PCIR_AER_COR_SOURCE_ID 0x34 /* Only for 
root complex ports */ +#define PCIR_AER_ERR_SOURCE_ID 0x36 /* Only for root complex ports */ +#define PCIR_AER_TLP_PREFIX_LOG 0x38 /* Only for TLP prefix functions */ + +/* Virtual Channel definitions */ +#define PCIR_VC_CAP1 0x04 +#define PCIM_VC_CAP1_EXT_COUNT 0x00000007 +#define PCIM_VC_CAP1_LOWPRI_EXT_COUNT 0x00000070 +#define PCIR_VC_CAP2 0x08 +#define PCIR_VC_CONTROL 0x0C +#define PCIR_VC_STATUS 0x0E +#define PCIR_VC_RESOURCE_CAP(n) (0x10 + (n) * 0x0C) +#define PCIR_VC_RESOURCE_CTL(n) (0x14 + (n) * 0x0C) +#define PCIR_VC_RESOURCE_STA(n) (0x18 + (n) * 0x0C) + +/* Serial Number definitions */ +#define PCIR_SERIAL_LOW 0x04 +#define PCIR_SERIAL_HIGH 0x08 + +/* SR-IOV definitions */ +#define PCIR_SRIOV_CTL 0x08 +#define PCIM_SRIOV_VF_EN 0x01 +#define PCIM_SRIOV_VF_MSE 0x08 /* Memory space enable. */ +#define PCIM_SRIOV_ARI_EN 0x10 +#define PCIR_SRIOV_TOTAL_VFS 0x0E +#define PCIR_SRIOV_NUM_VFS 0x10 +#define PCIR_SRIOV_VF_OFF 0x14 +#define PCIR_SRIOV_VF_STRIDE 0x16 +#define PCIR_SRIOV_VF_DID 0x1A +#define PCIR_SRIOV_PAGE_CAP 0x1C +#define PCIR_SRIOV_PAGE_SIZE 0x20 + +#define PCI_SRIOV_BASE_PAGE_SHIFT 12 + +#define PCIR_SRIOV_BARS 0x24 +#define PCIR_SRIOV_BAR(x) (PCIR_SRIOV_BARS + (x) * 4) diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/psl.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/psl.h new file mode 100644 index 0000000..a4779c0 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/psl.h @@ -0,0 +1,89 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)psl.h 5.2 (Berkeley) 1/18/91 + * $FreeBSD$ + */ + +#pragma once + +/* + * 386 processor status longword. 
+ */ +#define PSL_C 0x00000001 /* carry bit */ +#define PSL_PF 0x00000004 /* parity bit */ +#define PSL_AF 0x00000010 /* bcd carry bit */ +#define PSL_Z 0x00000040 /* zero bit */ +#define PSL_N 0x00000080 /* negative bit */ +#define PSL_T 0x00000100 /* trace enable bit */ +#define PSL_I 0x00000200 /* interrupt enable bit */ +#define PSL_D 0x00000400 /* string instruction direction bit */ +#define PSL_V 0x00000800 /* overflow bit */ +#define PSL_IOPL 0x00003000 /* i/o privilege level */ +#define PSL_NT 0x00004000 /* nested task bit */ +#define PSL_RF 0x00010000 /* resume flag bit */ +#define PSL_VM 0x00020000 /* virtual 8086 mode bit */ +#define PSL_AC 0x00040000 /* alignment checking */ +#define PSL_VIF 0x00080000 /* virtual interrupt enable */ +#define PSL_VIP 0x00100000 /* virtual interrupt pending */ +#define PSL_ID 0x00200000 /* identification bit */ + +/* + * The i486 manual says that we are not supposed to change reserved flags, + * but this is too much trouble since the reserved flags depend on the cpu + * and setting them to their historical values works in practice. + */ +#define PSL_RESERVED_DEFAULT 0x00000002 + +/* + * Initial flags for kernel and user mode. The kernel later inherits + * PSL_I and some other flags from user mode. + */ +#define PSL_KERNEL PSL_RESERVED_DEFAULT +#define PSL_USER (PSL_RESERVED_DEFAULT | PSL_I) + +/* + * Bits that can be changed in user mode on 486's. We allow these bits + * to be changed using ptrace(), sigreturn() and procfs. Setting PS_NT + * is undesirable but it may as well be allowed since users can inflict + * it on the kernel directly. Changes to PSL_AC are silently ignored on + * 386's. + * + * Users are allowed to change the privileged flag PSL_RF. The cpu sets PSL_RF + * in tf_eflags for faults. Debuggers should sometimes set it there too. + * tf_eflags is kept in the signal context during signal handling and there is + * no other place to remember it, so the PSL_RF bit may be corrupted by the + * signal handler without us knowing. Corruption of the PSL_RF bit at worst + * causes one more or one less debugger trap, so allowing it is fairly + * harmless. + */ +#define PSL_USERCHANGE (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_T \ + | PSL_D | PSL_V | PSL_NT | PSL_RF | PSL_AC | PSL_ID) diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/rtc.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/rtc.h new file mode 100644 index 0000000..fead20c --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/rtc.h @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)rtc.h 7.1 (Berkeley) 5/12/91
+ * $FreeBSD$
+ */
+
+/*
+ * MC146818 RTC Register locations
+ */
+
+#pragma once
+
+#define RTC_SEC 0x00 /* seconds */
+#define RTC_SECALRM 0x01 /* seconds alarm */
+#define RTC_MIN 0x02 /* minutes */
+#define RTC_MINALRM 0x03 /* minutes alarm */
+#define RTC_HRS 0x04 /* hours */
+#define RTC_HRSALRM 0x05 /* hours alarm */
+#define RTC_WDAY 0x06 /* week day */
+#define RTC_DAY 0x07 /* day of month */
+#define RTC_MONTH 0x08 /* month of year */
+#define RTC_YEAR 0x09 /* year */
+
+#define RTC_STATUSA 0x0a /* status register A */
+#define RTCSA_TUP 0x80 /* time update, don't look now */
+#define RTCSA_RESET 0x70 /* reset divider */
+#define RTCSA_DIVIDER 0x20 /* divider correct for 32768 Hz */
+#define RTCSA_8192 0x03 /* 8192 Hz interrupt */
+#define RTCSA_4096 0x04
+#define RTCSA_2048 0x05
+#define RTCSA_1024 0x06 /* default for profiling */
+#define RTCSA_PROF RTCSA_1024
+#define RTC_PROFRATE 1024
+#define RTCSA_512 0x07
+#define RTCSA_256 0x08
+#define RTCSA_128 0x09
+#define RTCSA_NOPROF RTCSA_128
+#define RTC_NOPROFRATE 128
+#define RTCSA_64 0x0a
+#define RTCSA_32 0x0b /* 32 Hz interrupt */
+
+#define RTC_STATUSB 0x0b /* status register B */
+#define RTCSB_DST 0x01 /* USA Daylight Savings Time enable */
+#define RTCSB_24HR 0x02 /* 0 = 12 hours, 1 = 24 hours */
+#define RTCSB_BCD 0x04 /* 0 = BCD, 1 = Binary coded time */
+#define RTCSB_SQWE 0x08 /* 1 = output square wave at SQW pin */
+#define RTCSB_UINTR 0x10 /* 1 = enable update-ended interrupt */
+#define RTCSB_AINTR 0x20 /* 1 = enable alarm interrupt */
+#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
+#define RTCSB_HALT 0x80 /* stop clock updates */
+
+#define RTC_INTR 0x0c /* status register C (R) interrupt source */
+#define RTCIR_UPDATE 0x10 /* update intr */
+#define RTCIR_ALARM 0x20 /* alarm intr */
+#define RTCIR_PERIOD 0x40 /* periodic intr */
+#define RTCIR_INT 0x80 /* interrupt output signal */
+
+#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
+#define RTCSD_PWR 0x80 /* clock power OK */
+
+#define RTC_DIAG 0x0e /* status register E - bios diagnostic */
+#define RTCDG_BITS "\020\010clock_battery\007ROM_cksum\006config_unit\005memory_size\004fixed_disk\003invalid_time"
+
+#define RTC_RESET 0x0f /* status register F - reset code byte */
+#define RTCRS_RST 0x00 /* normal reset */
+#define RTCRS_LOAD 0x04 /* load system */
+
+#define RTC_FDISKETTE 0x10 /* diskette drive type in upper/lower nibble */
+#define RTCFDT_NONE 0 /* none present */
+#define RTCFDT_360K 0x10 /* 360K */
+#define RTCFDT_12M 0x20 /* 1.2M */
+#define RTCFDT_720K 0x30 /* 720K */
+#define RTCFDT_144M 0x40 /* 1.44M */
+#define RTCFDT_288M_1 0x50 /* 2.88M, some BIOSes */
+#define RTCFDT_288M 0x60 /* 2.88M */
+
+#define RTC_BASELO 0x15 /* low byte of basemem size */
+#define RTC_BASEHI 0x16 /* high byte of basemem size */
+#define RTC_EXTLO 0x17 /* low byte of extended mem size */
+#define RTC_EXTHI 0x18 /* high byte of extended mem size */
+
+#define RTC_CENTURY 0x32 /* current century */
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/segments.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/segments.h
new file mode 100644
index 0000000..f91c4a3
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/segments.h
@@ -0,0 +1,280 @@
+/*-
+ * Copyright (c) 1989, 1990 William F. Jolitz
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)segments.h 7.1 (Berkeley) 5/9/91
+ * $FreeBSD$
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <xhyve/support/misc.h>
+
+// /*
+// * X86 Segmentation Data Structures and definitions
+// */
+
+/* Selectors */
+#define SEL_RPL_MASK 3 /* requester priv level */
+#define ISPL(s) ((s) & 3) /* priority level of a selector */
+#define SEL_KPL 0 /* kernel priority level */
+#define SEL_UPL 3 /* user priority level */
+#define ISLDT(s) ((s) & SEL_LDT) /* is it local or global */
+#define SEL_LDT 4 /* local descriptor table */
+#define IDXSEL(s) (((s)>>3) & 0x1fff) /* index of selector */
+#define LSEL(s,r) (((s)<<3) | SEL_LDT | r) /* a local selector */
+#define GSEL(s,r) (((s)<<3) | r) /* a global selector */
+
+/*
+ * User segment descriptors (%cs, %ds etc for i386 apps. 64 bit wide)
+ * For long-mode apps, %cs only has the conforming bit in sd_type, the sd_dpl,
+ * sd_p, sd_l and sd_def32 which must be zero). %ds only has sd_p.
+ */ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpacked" +struct segment_descriptor { + unsigned sd_lolimit:16; /* segment extent (lsb) */ + unsigned sd_lobase:24; /* segment base address (lsb) */ + unsigned sd_type:5; /* segment type */ + unsigned sd_dpl:2; /* segment descriptor priority level */ + unsigned sd_p:1; /* segment descriptor present */ + unsigned sd_hilimit:4; /* segment extent (msb) */ + unsigned sd_xx:2; /* unused */ + unsigned sd_def32:1; /* default 32 vs 16 bit size */ + unsigned sd_gran:1; /* limit granularity (byte/page units)*/ + unsigned sd_hibase:8; /* segment base address (msb) */ +} __packed; +#pragma clang diagnostic pop + +struct user_segment_descriptor { + uint64_t sd_lolimit:16; /* segment extent (lsb) */ + uint64_t sd_lobase:24; /* segment base address (lsb) */ + uint64_t sd_type:5; /* segment type */ + uint64_t sd_dpl:2; /* segment descriptor priority level */ + uint64_t sd_p:1; /* segment descriptor present */ + uint64_t sd_hilimit:4; /* segment extent (msb) */ + uint64_t sd_xx:1; /* unused */ + uint64_t sd_long:1; /* long mode (cs only) */ + uint64_t sd_def32:1; /* default 32 vs 16 bit size */ + uint64_t sd_gran:1; /* limit granularity (byte/page units)*/ + uint64_t sd_hibase:8; /* segment base address (msb) */ +}; + +#define USD_GETBASE(sd) (((sd)->sd_lobase) | (sd)->sd_hibase << 24) +#define USD_SETBASE(sd, b) (sd)->sd_lobase = (b); \ + (sd)->sd_hibase = ((b) >> 24); +#define USD_GETLIMIT(sd) (((sd)->sd_lolimit) | (sd)->sd_hilimit << 16) +#define USD_SETLIMIT(sd, l) (sd)->sd_lolimit = (l); \ + (sd)->sd_hilimit = ((l) >> 16); + +// #ifdef __i386__ +// /* +// * Gate descriptors (e.g. indirect descriptors) +// */ +// struct gate_descriptor { +// unsigned gd_looffset:16; /* gate offset (lsb) */ +// unsigned gd_selector:16; /* gate segment selector */ +// unsigned gd_stkcpy:5; /* number of stack wds to cpy */ +// unsigned gd_xx:3; /* unused */ +// unsigned gd_type:5; /* segment type */ +// unsigned gd_dpl:2; /* segment descriptor priority level */ +// unsigned gd_p:1; /* segment descriptor present */ +// unsigned gd_hioffset:16; /* gate offset (msb) */ +// } __packed; + +// /* +// * Generic descriptor +// */ +// union descriptor { +// struct segment_descriptor sd; +// struct gate_descriptor gd; +// }; +// #else +// /* +// * Gate descriptors (e.g. indirect descriptors, trap, interrupt etc. 128 bit) +// * Only interrupt and trap gates have gd_ist. 
+// */ +// struct gate_descriptor { +// uint64_t gd_looffset:16; /* gate offset (lsb) */ +// uint64_t gd_selector:16; /* gate segment selector */ +// uint64_t gd_ist:3; /* IST table index */ +// uint64_t gd_xx:5; /* unused */ +// uint64_t gd_type:5; /* segment type */ +// uint64_t gd_dpl:2; /* segment descriptor priority level */ +// uint64_t gd_p:1; /* segment descriptor present */ +// uint64_t gd_hioffset:48; /* gate offset (msb) */ +// uint64_t sd_xx1:32; +// } __packed; + +// /* +// * Generic descriptor +// */ +// union descriptor { +// struct user_segment_descriptor sd; +// struct gate_descriptor gd; +// }; +// #endif + + /* system segments and gate types */ +#define SDT_SYSNULL 0 /* system null */ +#define SDT_SYS286TSS 1 /* system 286 TSS available */ +#define SDT_SYSLDT 2 /* system local descriptor table */ +#define SDT_SYS286BSY 3 /* system 286 TSS busy */ +#define SDT_SYS286CGT 4 /* system 286 call gate */ +#define SDT_SYSTASKGT 5 /* system task gate */ +#define SDT_SYS286IGT 6 /* system 286 interrupt gate */ +#define SDT_SYS286TGT 7 /* system 286 trap gate */ +#define SDT_SYSNULL2 8 /* system null again */ +#define SDT_SYS386TSS 9 /* system 386 TSS available */ +#define SDT_SYSTSS 9 /* system available 64 bit TSS */ +#define SDT_SYSNULL3 10 /* system null again */ +#define SDT_SYS386BSY 11 /* system 386 TSS busy */ +#define SDT_SYSBSY 11 /* system busy 64 bit TSS */ +#define SDT_SYS386CGT 12 /* system 386 call gate */ +#define SDT_SYSCGT 12 /* system 64 bit call gate */ +#define SDT_SYSNULL4 13 /* system null again */ +#define SDT_SYS386IGT 14 /* system 386 interrupt gate */ +#define SDT_SYSIGT 14 /* system 64 bit interrupt gate */ +#define SDT_SYS386TGT 15 /* system 386 trap gate */ +#define SDT_SYSTGT 15 /* system 64 bit trap gate */ + +// /* memory segment types */ +// #define SDT_MEMRO 16 memory read only +// #define SDT_MEMROA 17 /* memory read only accessed */ +#define SDT_MEMRW 18 /* memory read write */ +#define SDT_MEMRWA 19 /* memory read write accessed */ +// #define SDT_MEMROD 20 /* memory read only expand dwn limit */ +// #define SDT_MEMRODA 21 /* memory read only expand dwn limit accessed */ +// #define SDT_MEMRWD 22 /* memory read write expand dwn limit */ +// #define SDT_MEMRWDA 23 /* memory read write expand dwn limit accessed*/ +// #define SDT_MEME 24 /* memory execute only */ +// #define SDT_MEMEA 25 /* memory execute only accessed */ +#define SDT_MEMER 26 /* memory execute read */ +#define SDT_MEMERA 27 /* memory execute read accessed */ +// #define SDT_MEMEC 28 /* memory execute only conforming */ +// #define SDT_MEMEAC 29 /* memory execute only accessed conforming */ +// #define SDT_MEMERC 30 /* memory execute read conforming */ +// #define SDT_MEMERAC 31 /* memory execute read accessed conforming */ + +// /* +// * Size of IDT table +// */ +// #define NIDT 256 /* 32 reserved, 0x80 syscall, most are h/w */ +// #define NRSVIDT 32 /* reserved entries for cpu exceptions */ + +/* + * Entries in the Interrupt Descriptor Table (IDT) + */ +#define IDT_DE 0 /* #DE: Divide Error */ +#define IDT_DB 1 /* #DB: Debug */ +#define IDT_NMI 2 /* Nonmaskable External Interrupt */ +#define IDT_BP 3 /* #BP: Breakpoint */ +#define IDT_OF 4 /* #OF: Overflow */ +#define IDT_BR 5 /* #BR: Bound Range Exceeded */ +#define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */ +#define IDT_NM 7 /* #NM: No Math Coprocessor */ +#define IDT_DF 8 /* #DF: Double Fault */ +#define IDT_FPUGP 9 /* Coprocessor Segment Overrun */ +#define IDT_TS 10 /* #TS: Invalid TSS */ +#define IDT_NP 11 /* #NP: 
Segment Not Present */ +#define IDT_SS 12 /* #SS: Stack Segment Fault */ +#define IDT_GP 13 /* #GP: General Protection Fault */ +#define IDT_PF 14 /* #PF: Page Fault */ +#define IDT_MF 16 /* #MF: FPU Floating-Point Error */ +#define IDT_AC 17 /* #AC: Alignment Check */ +#define IDT_MC 18 /* #MC: Machine Check */ +#define IDT_XF 19 /* #XF: SIMD Floating-Point Exception */ +#define IDT_IO_INTS NRSVIDT /* Base of IDT entries for I/O interrupts. */ +#define IDT_SYSCALL 0x80 /* System Call Interrupt Vector */ +#define IDT_DTRACE_RET 0x92 /* DTrace pid provider Interrupt Vector */ +#define IDT_EVTCHN 0x93 /* Xen HVM Event Channel Interrupt Vector */ + +// #if defined(__i386__) +// /* +// * Entries in the Global Descriptor Table (GDT) +// * Note that each 4 entries share a single 32 byte L1 cache line. +// * Some of the fast syscall instructions require a specific order here. +// */ +// #define GNULL_SEL 0 /* Null Descriptor */ +// #define GPRIV_SEL 1 /* SMP Per-Processor Private Data */ +// #define GUFS_SEL 2 /* User %fs Descriptor (order critical: 1) */ +// #define GUGS_SEL 3 /* User %gs Descriptor (order critical: 2) */ +// #define GCODE_SEL 4 /* Kernel Code Descriptor (order critical: 1) */ +// #define GDATA_SEL 5 /* Kernel Data Descriptor (order critical: 2) */ +// #define GUCODE_SEL 6 /* User Code Descriptor (order critical: 3) */ +// #define GUDATA_SEL 7 /* User Data Descriptor (order critical: 4) */ +// #define GBIOSLOWMEM_SEL 8 /* BIOS low memory access (must be entry 8) */ +// #define GPROC0_SEL 9 /* Task state process slot zero and up */ +// #define GLDT_SEL 10 /* Default User LDT */ +// #define GUSERLDT_SEL 11 /* User LDT */ +// #define GPANIC_SEL 12 /* Task state to consider panic from */ +// #define GBIOSCODE32_SEL 13 /* BIOS interface (32bit Code) */ +// #define GBIOSCODE16_SEL 14 /* BIOS interface (16bit Code) */ +// #define GBIOSDATA_SEL 15 /* BIOS interface (Data) */ +// #define GBIOSUTIL_SEL 16 /* BIOS interface (Utility) */ +// #define GBIOSARGS_SEL 17 /* BIOS interface (Arguments) */ +// #define GNDIS_SEL 18 /* For the NDIS layer */ +// #define NGDT 19 + +// /* +// * Entries in the Local Descriptor Table (LDT) +// */ +// #define LSYS5CALLS_SEL 0 /* forced by intel BCS */ +// #define LSYS5SIGR_SEL 1 +// #define L43BSDCALLS_SEL 2 /* notyet */ +// #define LUCODE_SEL 3 +// #define LSOL26CALLS_SEL 4 /* Solaris >= 2.6 system call gate */ +// #define LUDATA_SEL 5 +// /* separate stack, es,fs,gs sels ? 
*/ +// /* #define LPOSIXCALLS_SEL 5*/ /* notyet */ +// #define LBSDICALLS_SEL 16 /* BSDI system call gate */ +// #define NLDT (LBSDICALLS_SEL + 1) + +// #else /* !__i386__ */ +// /* +// * Entries in the Global Descriptor Table (GDT) +// */ +// #define GNULL_SEL 0 /* Null Descriptor */ +// #define GNULL2_SEL 1 /* Null Descriptor */ +// #define GUFS32_SEL 2 /* User 32 bit %fs Descriptor */ +// #define GUGS32_SEL 3 /* User 32 bit %gs Descriptor */ +// #define GCODE_SEL 4 /* Kernel Code Descriptor */ +// #define GDATA_SEL 5 /* Kernel Data Descriptor */ +// #define GUCODE32_SEL 6 /* User 32 bit code Descriptor */ +// #define GUDATA_SEL 7 /* User 32/64 bit Data Descriptor */ +// #define GUCODE_SEL 8 /* User 64 bit Code Descriptor */ +// #define GPROC0_SEL 9 /* TSS for entering kernel etc */ +// /* slot 10 is second half of GPROC0_SEL */ +// #define GUSERLDT_SEL 11 /* LDT */ +// /* slot 12 is second half of GUSERLDT_SEL */ +// #define NGDT 13 +// #endif /* __i386__ */ diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/specialreg.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/specialreg.h new file mode 100644 index 0000000..70a6b44 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/specialreg.h @@ -0,0 +1,845 @@ +/*- + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD$ + */ + +#pragma once + +/* + * Bits in 386 special registers: + */ +#define CR0_PE 0x00000001 /* Protected mode Enable */ +#define CR0_MP 0x00000002 /* "Math" (fpu) Present */ +#define CR0_EM 0x00000004 /* EMulate FPU instructions. 
(trap ESC only) */
+#define CR0_TS 0x00000008 /* Task Switched (if MP, trap ESC and WAIT) */
+#define CR0_PG 0x80000000 /* PaGing enable */
+
+/*
+ * Bits in 486 special registers:
+ */
+#define CR0_ET 0x00000010 /* Extension type */
+#define CR0_NE 0x00000020 /* Numeric Error enable (EX16 vs IRQ13) */
+#define CR0_WP 0x00010000 /* Write Protect (honor page protect in
+ all modes) */
+#define CR0_AM 0x00040000 /* Alignment Mask (set to enable AC flag) */
+#define CR0_NW 0x20000000 /* Not Write-through */
+#define CR0_CD 0x40000000 /* Cache Disable */
+
+#define CR3_PCID_SAVE 0x8000000000000000
+#define CR3_PCID_MASK 0xfff
+
+/*
+ * Bits in PPro special registers
+ */
+#define CR4_VME 0x00000001 /* Virtual 8086 mode extensions */
+#define CR4_PVI 0x00000002 /* Protected-mode virtual interrupts */
+#define CR4_TSD 0x00000004 /* Time stamp disable */
+#define CR4_DE 0x00000008 /* Debugging extensions */
+#define CR4_PSE 0x00000010 /* Page size extensions */
+#define CR4_PAE 0x00000020 /* Physical address extension */
+#define CR4_MCE 0x00000040 /* Machine check enable */
+#define CR4_PGE 0x00000080 /* Page global enable */
+#define CR4_PCE 0x00000100 /* Performance monitoring counter enable */
+#define CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */
+#define CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */
+#define CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */
+#define CR4_FSGSBASE 0x00010000 /* Enable FS/GS BASE accessing instructions */
+#define CR4_PCIDE 0x00020000 /* Enable Context ID */
+#define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */
+#define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */
+
+/*
+ * Bits in AMD64 special registers. EFER is 64 bits wide.
+ */
+#define EFER_SCE 0x000000001 /* System Call Extensions (R/W) */
+#define EFER_LME 0x000000100 /* Long mode enable (R/W) */
+#define EFER_LMA 0x000000400 /* Long mode active (R) */
+#define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */
+#define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */
+#define EFER_LMSLE 0x000002000 /* Long Mode Segment Limit Enable */
+#define EFER_FFXSR 0x000004000 /* Fast FXSAVE/FXRSTOR */
+#define EFER_TCE 0x000008000 /* Translation Cache Extension */
+
+/*
+ * Intel Extended Features registers
+ */
+#define XCR0 0 /* XFEATURE_ENABLED_MASK register */
+
+#define XFEATURE_ENABLED_X87 0x00000001
+#define XFEATURE_ENABLED_SSE 0x00000002
+#define XFEATURE_ENABLED_YMM_HI128 0x00000004
+#define XFEATURE_ENABLED_AVX XFEATURE_ENABLED_YMM_HI128
+#define XFEATURE_ENABLED_BNDREGS 0x00000008
+#define XFEATURE_ENABLED_BNDCSR 0x00000010
+#define XFEATURE_ENABLED_OPMASK 0x00000020
+#define XFEATURE_ENABLED_ZMM_HI256 0x00000040
+#define XFEATURE_ENABLED_HI16_ZMM 0x00000080
+
+#define XFEATURE_AVX \
+ (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX)
+#define XFEATURE_AVX512 \
+ (XFEATURE_ENABLED_OPMASK | XFEATURE_ENABLED_ZMM_HI256 | \
+ XFEATURE_ENABLED_HI16_ZMM)
+#define XFEATURE_MPX \
+ (XFEATURE_ENABLED_BNDREGS | XFEATURE_ENABLED_BNDCSR)
+
+/*
+ * CPUID instruction features register
+ */
+#define CPUID_FPU 0x00000001
+#define CPUID_VME 0x00000002
+#define CPUID_DE 0x00000004
+#define CPUID_PSE 0x00000008
+#define CPUID_TSC 0x00000010
+#define CPUID_MSR 0x00000020
+#define CPUID_PAE 0x00000040
+#define CPUID_MCE 0x00000080
+#define CPUID_CX8 0x00000100
+#define CPUID_APIC 0x00000200
+#define CPUID_B10 0x00000400
+#define CPUID_SEP 0x00000800
+#define CPUID_MTRR 0x00001000
+#define CPUID_PGE 0x00002000
+#define 
CPUID_MCA 0x00004000 +#define CPUID_CMOV 0x00008000 +#define CPUID_PAT 0x00010000 +#define CPUID_PSE36 0x00020000 +#define CPUID_PSN 0x00040000 +#define CPUID_CLFSH 0x00080000 +#define CPUID_B20 0x00100000 +#define CPUID_DS 0x00200000 +#define CPUID_ACPI 0x00400000 +#define CPUID_MMX 0x00800000 +#define CPUID_FXSR 0x01000000 +#define CPUID_SSE 0x02000000 +#define CPUID_XMM 0x02000000 +#define CPUID_SSE2 0x04000000 +#define CPUID_SS 0x08000000 +#define CPUID_HTT 0x10000000 +#define CPUID_TM 0x20000000 +#define CPUID_IA64 0x40000000 +#define CPUID_PBE 0x80000000 + +#define CPUID2_SSE3 0x00000001 +#define CPUID2_PCLMULQDQ 0x00000002 +#define CPUID2_DTES64 0x00000004 +#define CPUID2_MON 0x00000008 +#define CPUID2_DS_CPL 0x00000010 +#define CPUID2_VMX 0x00000020 +#define CPUID2_SMX 0x00000040 +#define CPUID2_EST 0x00000080 +#define CPUID2_TM2 0x00000100 +#define CPUID2_SSSE3 0x00000200 +#define CPUID2_CNXTID 0x00000400 +#define CPUID2_SDBG 0x00000800 +#define CPUID2_FMA 0x00001000 +#define CPUID2_CX16 0x00002000 +#define CPUID2_XTPR 0x00004000 +#define CPUID2_PDCM 0x00008000 +#define CPUID2_PCID 0x00020000 +#define CPUID2_DCA 0x00040000 +#define CPUID2_SSE41 0x00080000 +#define CPUID2_SSE42 0x00100000 +#define CPUID2_X2APIC 0x00200000 +#define CPUID2_MOVBE 0x00400000 +#define CPUID2_POPCNT 0x00800000 +#define CPUID2_TSCDLT 0x01000000 +#define CPUID2_AESNI 0x02000000 +#define CPUID2_XSAVE 0x04000000 +#define CPUID2_OSXSAVE 0x08000000 +#define CPUID2_AVX 0x10000000 +#define CPUID2_F16C 0x20000000 +#define CPUID2_RDRAND 0x40000000 +#define CPUID2_HV 0x80000000 + +/* + * Important bits in the Thermal and Power Management flags + * CPUID.6 EAX and ECX. + */ +#define CPUTPM1_SENSOR 0x00000001 +#define CPUTPM1_TURBO 0x00000002 +#define CPUTPM1_ARAT 0x00000004 +#define CPUTPM2_EFFREQ 0x00000001 + +/* + * Important bits in the AMD extended cpuid flags + */ +#define AMDID_SYSCALL 0x00000800 +#define AMDID_MP 0x00080000 +#define AMDID_NX 0x00100000 +#define AMDID_EXT_MMX 0x00400000 +#define AMDID_FFXSR 0x02000000 +#define AMDID_PAGE1GB 0x04000000 +#define AMDID_RDTSCP 0x08000000 +#define AMDID_LM 0x20000000 +#define AMDID_EXT_3DNOW 0x40000000 +#define AMDID_3DNOW 0x80000000 + +#define AMDID2_LAHF 0x00000001 +#define AMDID2_CMP 0x00000002 +#define AMDID2_SVM 0x00000004 +#define AMDID2_EXT_APIC 0x00000008 +#define AMDID2_CR8 0x00000010 +#define AMDID2_ABM 0x00000020 +#define AMDID2_SSE4A 0x00000040 +#define AMDID2_MAS 0x00000080 +#define AMDID2_PREFETCH 0x00000100 +#define AMDID2_OSVW 0x00000200 +#define AMDID2_IBS 0x00000400 +#define AMDID2_XOP 0x00000800 +#define AMDID2_SKINIT 0x00001000 +#define AMDID2_WDT 0x00002000 +#define AMDID2_LWP 0x00008000 +#define AMDID2_FMA4 0x00010000 +#define AMDID2_TCE 0x00020000 +#define AMDID2_NODE_ID 0x00080000 +#define AMDID2_TBM 0x00200000 +#define AMDID2_TOPOLOGY 0x00400000 +#define AMDID2_PCXC 0x00800000 +#define AMDID2_PNXC 0x01000000 +#define AMDID2_DBE 0x04000000 +#define AMDID2_PTSC 0x08000000 +#define AMDID2_PTSCEL2I 0x10000000 + +/* + * CPUID instruction 1 eax info + */ +#define CPUID_STEPPING 0x0000000f +#define CPUID_MODEL 0x000000f0 +#define CPUID_FAMILY 0x00000f00 +#define CPUID_EXT_MODEL 0x000f0000 +#define CPUID_EXT_FAMILY 0x0ff00000 +// #ifdef __i386__ +// #define CPUID_TO_MODEL(id) \ +// ((((id) & CPUID_MODEL) >> 4) | \ +// ((((id) & CPUID_FAMILY) >= 0x600) ? \ +// (((id) & CPUID_EXT_MODEL) >> 12) : 0)) +// #define CPUID_TO_FAMILY(id) \ +// ((((id) & CPUID_FAMILY) >> 8) + \ +// ((((id) & CPUID_FAMILY) == 0xf00) ? 
\ +// (((id) & CPUID_EXT_FAMILY) >> 20) : 0)) +// #else +// #define CPUID_TO_MODEL(id) \ +// ((((id) & CPUID_MODEL) >> 4) | \ +// (((id) & CPUID_EXT_MODEL) >> 12)) +// #define CPUID_TO_FAMILY(id) \ +// ((((id) & CPUID_FAMILY) >> 8) + \ +// (((id) & CPUID_EXT_FAMILY) >> 20)) +// #endif + +/* + * CPUID instruction 1 ebx info + */ +#define CPUID_BRAND_INDEX 0x000000ff +#define CPUID_CLFUSH_SIZE 0x0000ff00 +#define CPUID_HTT_CORES 0x00ff0000 +#define CPUID_LOCAL_APIC_ID 0xff000000 + +/* + * CPUID instruction 5 info + */ +#define CPUID5_MON_MIN_SIZE 0x0000ffff /* eax */ +#define CPUID5_MON_MAX_SIZE 0x0000ffff /* ebx */ +#define CPUID5_MON_MWAIT_EXT 0x00000001 /* ecx */ +#define CPUID5_MWAIT_INTRBREAK 0x00000002 /* ecx */ + +/* + * MWAIT cpu power states. Lower 4 bits are sub-states. + */ +#define MWAIT_C0 0xf0 +#define MWAIT_C1 0x00 +#define MWAIT_C2 0x10 +#define MWAIT_C3 0x20 +#define MWAIT_C4 0x30 + +/* + * MWAIT extensions. + */ +/* Interrupt breaks MWAIT even when masked. */ +#define MWAIT_INTRBREAK 0x00000001 + +/* + * CPUID instruction 6 ecx info + */ +#define CPUID_PERF_STAT 0x00000001 +#define CPUID_PERF_BIAS 0x00000008 + +/* + * CPUID instruction 0xb ebx info. + */ +#define CPUID_TYPE_INVAL 0 +#define CPUID_TYPE_SMT 1 +#define CPUID_TYPE_CORE 2 + +/* + * CPUID instruction 0xd Processor Extended State Enumeration Sub-leaf 1 + */ +#define CPUID_EXTSTATE_XSAVEOPT 0x00000001 +#define CPUID_EXTSTATE_XSAVEC 0x00000002 +#define CPUID_EXTSTATE_XINUSE 0x00000004 +#define CPUID_EXTSTATE_XSAVES 0x00000008 + +/* + * AMD extended function 8000_0007h edx info + */ +#define AMDPM_TS 0x00000001 +#define AMDPM_FID 0x00000002 +#define AMDPM_VID 0x00000004 +#define AMDPM_TTP 0x00000008 +#define AMDPM_TM 0x00000010 +#define AMDPM_STC 0x00000020 +#define AMDPM_100MHZ_STEPS 0x00000040 +#define AMDPM_HW_PSTATE 0x00000080 +#define AMDPM_TSC_INVARIANT 0x00000100 +#define AMDPM_CPB 0x00000200 + +/* + * AMD extended function 8000_0008h ecx info + */ +#define AMDID_CMP_CORES 0x000000ff +#define AMDID_COREID_SIZE 0x0000f000 +#define AMDID_COREID_SIZE_SHIFT 12 + +/* + * CPUID instruction 7 Structured Extended Features, leaf 0 ebx info + */ +#define CPUID_STDEXT_FSGSBASE 0x00000001 +#define CPUID_STDEXT_TSC_ADJUST 0x00000002 +#define CPUID_STDEXT_BMI1 0x00000008 +#define CPUID_STDEXT_HLE 0x00000010 +#define CPUID_STDEXT_AVX2 0x00000020 +#define CPUID_STDEXT_SMEP 0x00000080 +#define CPUID_STDEXT_BMI2 0x00000100 +#define CPUID_STDEXT_ERMS 0x00000200 +#define CPUID_STDEXT_INVPCID 0x00000400 +#define CPUID_STDEXT_RTM 0x00000800 +#define CPUID_STDEXT_MPX 0x00004000 +#define CPUID_STDEXT_AVX512F 0x00010000 +#define CPUID_STDEXT_RDSEED 0x00040000 +#define CPUID_STDEXT_ADX 0x00080000 +#define CPUID_STDEXT_SMAP 0x00100000 +#define CPUID_STDEXT_CLFLUSHOPT 0x00800000 +#define CPUID_STDEXT_PROCTRACE 0x02000000 +#define CPUID_STDEXT_AVX512PF 0x04000000 +#define CPUID_STDEXT_AVX512ER 0x08000000 +#define CPUID_STDEXT_AVX512CD 0x10000000 +#define CPUID_STDEXT_SHA 0x20000000 + +/* + * CPUID manufacturers identifiers + */ +#define AMD_VENDOR_ID "AuthenticAMD" +#define CENTAUR_VENDOR_ID "CentaurHauls" +#define CYRIX_VENDOR_ID "CyrixInstead" +#define INTEL_VENDOR_ID "GenuineIntel" +#define NEXGEN_VENDOR_ID "NexGenDriven" +#define NSC_VENDOR_ID "Geode by NSC" +#define RISE_VENDOR_ID "RiseRiseRise" +#define SIS_VENDOR_ID "SiS SiS SiS " +#define TRANSMETA_VENDOR_ID "GenuineTMx86" +#define UMC_VENDOR_ID "UMC UMC UMC " + +/* + * Model-specific registers for the i386 family + */ +#define MSR_P5_MC_ADDR 0x000 +#define MSR_P5_MC_TYPE 0x001 
+#define MSR_TSC 0x010 +#define MSR_P5_CESR 0x011 +#define MSR_P5_CTR0 0x012 +#define MSR_P5_CTR1 0x013 +#define MSR_IA32_PLATFORM_ID 0x017 +#define MSR_APICBASE 0x01b +#define MSR_EBL_CR_POWERON 0x02a +#define MSR_TEST_CTL 0x033 +#define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_BIOS_UPDT_TRIG 0x079 +#define MSR_BBL_CR_D0 0x088 +#define MSR_BBL_CR_D1 0x089 +#define MSR_BBL_CR_D2 0x08a +#define MSR_BIOS_SIGN 0x08b +#define MSR_PERFCTR0 0x0c1 +#define MSR_PERFCTR1 0x0c2 +#define MSR_PLATFORM_INFO 0x0ce +#define MSR_MPERF 0x0e7 +#define MSR_APERF 0x0e8 +#define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */ +#define MSR_MTRRcap 0x0fe +#define MSR_BBL_CR_ADDR 0x116 +#define MSR_BBL_CR_DECC 0x118 +#define MSR_BBL_CR_CTL 0x119 +#define MSR_BBL_CR_TRIG 0x11a +#define MSR_BBL_CR_BUSY 0x11b +#define MSR_BBL_CR_CTL3 0x11e +#define MSR_SYSENTER_CS_MSR 0x174 +#define MSR_SYSENTER_ESP_MSR 0x175 +#define MSR_SYSENTER_EIP_MSR 0x176 +#define MSR_MCG_CAP 0x179 +#define MSR_MCG_STATUS 0x17a +#define MSR_MCG_CTL 0x17b +#define MSR_EVNTSEL0 0x186 +#define MSR_EVNTSEL1 0x187 +#define MSR_THERM_CONTROL 0x19a +#define MSR_THERM_INTERRUPT 0x19b +#define MSR_THERM_STATUS 0x19c +#define MSR_IA32_MISC_ENABLE 0x1a0 +#define MSR_IA32_TEMPERATURE_TARGET 0x1a2 +#define MSR_TURBO_RATIO_LIMIT 0x1ad +#define MSR_TURBO_RATIO_LIMIT1 0x1ae +#define MSR_DEBUGCTLMSR 0x1d9 +#define MSR_LASTBRANCHFROMIP 0x1db +#define MSR_LASTBRANCHTOIP 0x1dc +#define MSR_LASTINTFROMIP 0x1dd +#define MSR_LASTINTTOIP 0x1de +#define MSR_ROB_CR_BKUPTMPDR6 0x1e0 +#define MSR_MTRRVarBase 0x200 +#define MSR_MTRR64kBase 0x250 +#define MSR_MTRR16kBase 0x258 +#define MSR_MTRR4kBase 0x268 +#define MSR_PAT 0x277 +#define MSR_MC0_CTL2 0x280 +#define MSR_MTRRdefType 0x2ff +#define MSR_MC0_CTL 0x400 +#define MSR_MC0_STATUS 0x401 +#define MSR_MC0_ADDR 0x402 +#define MSR_MC0_MISC 0x403 +#define MSR_MC1_CTL 0x404 +#define MSR_MC1_STATUS 0x405 +#define MSR_MC1_ADDR 0x406 +#define MSR_MC1_MISC 0x407 +#define MSR_MC2_CTL 0x408 +#define MSR_MC2_STATUS 0x409 +#define MSR_MC2_ADDR 0x40a +#define MSR_MC2_MISC 0x40b +#define MSR_MC3_CTL 0x40c +#define MSR_MC3_STATUS 0x40d +#define MSR_MC3_ADDR 0x40e +#define MSR_MC3_MISC 0x40f +#define MSR_MC4_CTL 0x410 +#define MSR_MC4_STATUS 0x411 +#define MSR_MC4_ADDR 0x412 +#define MSR_MC4_MISC 0x413 +#define MSR_RAPL_POWER_UNIT 0x606 +#define MSR_PKG_ENERGY_STATUS 0x611 +#define MSR_DRAM_ENERGY_STATUS 0x619 +#define MSR_PP0_ENERGY_STATUS 0x639 +#define MSR_PP1_ENERGY_STATUS 0x641 + +/* + * VMX MSRs + */ +#define MSR_VMX_BASIC 0x480 +#define MSR_VMX_PINBASED_CTLS 0x481 +#define MSR_VMX_PROCBASED_CTLS 0x482 +#define MSR_VMX_EXIT_CTLS 0x483 +#define MSR_VMX_ENTRY_CTLS 0x484 +#define MSR_VMX_CR0_FIXED0 0x486 +#define MSR_VMX_CR0_FIXED1 0x487 +#define MSR_VMX_CR4_FIXED0 0x488 +#define MSR_VMX_CR4_FIXED1 0x489 +#define MSR_VMX_PROCBASED_CTLS2 0x48b +#define MSR_VMX_EPT_VPID_CAP 0x48c +#define MSR_VMX_TRUE_PINBASED_CTLS 0x48d +#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48e +#define MSR_VMX_TRUE_EXIT_CTLS 0x48f +#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 + +/* + * X2APIC MSRs + */ +#define MSR_APIC_000 0x800 +#define MSR_APIC_ID 0x802 +#define MSR_APIC_VERSION 0x803 +#define MSR_APIC_TPR 0x808 +#define MSR_APIC_EOI 0x80b +#define MSR_APIC_LDR 0x80d +#define MSR_APIC_SVR 0x80f +#define MSR_APIC_ISR0 0x810 +#define MSR_APIC_ISR1 0x811 +#define MSR_APIC_ISR2 0x812 +#define MSR_APIC_ISR3 0x813 +#define MSR_APIC_ISR4 0x814 +#define MSR_APIC_ISR5 0x815 +#define MSR_APIC_ISR6 0x816 +#define MSR_APIC_ISR7 0x817 +#define MSR_APIC_TMR0 
0x818 +#define MSR_APIC_IRR0 0x820 +#define MSR_APIC_ESR 0x828 +#define MSR_APIC_LVT_CMCI 0x82F +#define MSR_APIC_ICR 0x830 +#define MSR_APIC_LVT_TIMER 0x832 +#define MSR_APIC_LVT_THERMAL 0x833 +#define MSR_APIC_LVT_PCINT 0x834 +#define MSR_APIC_LVT_LINT0 0x835 +#define MSR_APIC_LVT_LINT1 0x836 +#define MSR_APIC_LVT_ERROR 0x837 +#define MSR_APIC_ICR_TIMER 0x838 +#define MSR_APIC_CCR_TIMER 0x839 +#define MSR_APIC_DCR_TIMER 0x83e +#define MSR_APIC_SELF_IPI 0x83f + +#define MSR_IA32_XSS 0xda0 + +#define MSR_IA32_TSC_AUX 0xc0000103 + +/* + * Constants related to MSR's. + */ +#define APICBASE_RESERVED 0x000002ff +#define APICBASE_BSP 0x00000100 +#define APICBASE_X2APIC 0x00000400 +#define APICBASE_ENABLED 0x00000800 +#define APICBASE_ADDRESS 0xfffff000 + +/* MSR_IA32_FEATURE_CONTROL related */ +#define IA32_FEATURE_CONTROL_LOCK 0x01 /* lock bit */ +#define IA32_FEATURE_CONTROL_SMX_EN 0x02 /* enable VMX inside SMX */ +#define IA32_FEATURE_CONTROL_VMX_EN 0x04 /* enable VMX outside SMX */ + +/* + * PAT modes. + */ +#define PAT_UNCACHEABLE 0x00 +#define PAT_WRITE_COMBINING 0x01 +#define PAT_WRITE_THROUGH 0x04 +#define PAT_WRITE_PROTECTED 0x05 +#define PAT_WRITE_BACK 0x06 +#define PAT_UNCACHED 0x07 +#define PAT_VALUE(i, m) ((long long)(m) << (8 * (i))) +#define PAT_MASK(i) PAT_VALUE(i, 0xff) + +/* + * Constants related to MTRRs + */ +#define MTRR_UNCACHEABLE 0x00 +#define MTRR_WRITE_COMBINING 0x01 +#define MTRR_WRITE_THROUGH 0x04 +#define MTRR_WRITE_PROTECTED 0x05 +#define MTRR_WRITE_BACK 0x06 +#define MTRR_N64K 8 /* numbers of fixed-size entries */ +#define MTRR_N16K 16 +#define MTRR_N4K 64 +#define MTRR_CAP_WC 0x0000000000000400 +#define MTRR_CAP_FIXED 0x0000000000000100 +#define MTRR_CAP_VCNT 0x00000000000000ff +#define MTRR_DEF_ENABLE 0x0000000000000800 +#define MTRR_DEF_FIXED_ENABLE 0x0000000000000400 +#define MTRR_DEF_TYPE 0x00000000000000ff +#define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000 +#define MTRR_PHYSBASE_TYPE 0x00000000000000ff +#define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000 +#define MTRR_PHYSMASK_VALID 0x0000000000000800 + +/* + * Cyrix configuration registers, accessible as IO ports. + */ +#define CCR0 0xc0 /* Configuration control register 0 */ +#define CCR0_NC0 0x01 /* First 64K of each 1M memory region is + non-cacheable */ +#define CCR0_NC1 0x02 /* 640K-1M region is non-cacheable */ +#define CCR0_A20M 0x04 /* Enables A20M# input pin */ +#define CCR0_KEN 0x08 /* Enables KEN# input pin */ +#define CCR0_FLUSH 0x10 /* Enables FLUSH# input pin */ +#define CCR0_BARB 0x20 /* Flushes internal cache when entering hold + state */ +#define CCR0_CO 0x40 /* Cache org: 1=direct mapped, 0=2x set + assoc */ +#define CCR0_SUSPEND 0x80 /* Enables SUSP# and SUSPA# pins */ + +#define CCR1 0xc1 /* Configuration control register 1 */ +#define CCR1_RPL 0x01 /* Enables RPLSET and RPLVAL# pins */ +#define CCR1_SMI 0x02 /* Enables SMM pins */ +#define CCR1_SMAC 0x04 /* System management memory access */ +#define CCR1_MMAC 0x08 /* Main memory access */ +#define CCR1_NO_LOCK 0x10 /* Negate LOCK# */ +#define CCR1_SM3 0x80 /* SMM address space address region 3 */ + +#define CCR2 0xc2 +#define CCR2_WB 0x02 /* Enables WB cache interface pins */ +#define CCR2_SADS 0x02 /* Slow ADS */ +#define CCR2_LOCK_NW 0x04 /* LOCK NW Bit */ +#define CCR2_SUSP_HLT 0x08 /* Suspend on HALT */ +#define CCR2_WT1 0x10 /* WT region 1 */ +#define CCR2_WPR1 0x10 /* Write-protect region 1 */ +#define CCR2_BARB 0x20 /* Flushes write-back cache when entering + hold state. 
*/
+#define CCR2_BWRT 0x40 /* Enables burst write cycles */
+#define CCR2_USE_SUSP 0x80 /* Enables suspend pins */
+
+#define CCR3 0xc3
+#define CCR3_SMILOCK 0x01 /* SMM register lock */
+#define CCR3_NMI 0x02 /* Enables NMI during SMM */
+#define CCR3_LINBRST 0x04 /* Linear address burst cycles */
+#define CCR3_SMMMODE 0x08 /* SMM Mode */
+#define CCR3_MAPEN0 0x10 /* Enables Map0 */
+#define CCR3_MAPEN1 0x20 /* Enables Map1 */
+#define CCR3_MAPEN2 0x40 /* Enables Map2 */
+#define CCR3_MAPEN3 0x80 /* Enables Map3 */
+
+#define CCR4 0xe8
+#define CCR4_IOMASK 0x07
+#define CCR4_MEM 0x08 /* Enables memory bypassing */
+#define CCR4_DTE 0x10 /* Enables directory table entry cache */
+#define CCR4_FASTFPE 0x20 /* Fast FPU exception */
+#define CCR4_CPUID 0x80 /* Enables CPUID instruction */
+
+#define CCR5 0xe9
+#define CCR5_WT_ALLOC 0x01 /* Write-through allocate */
+#define CCR5_SLOP 0x02 /* LOOP instruction slowed down */
+#define CCR5_LBR1 0x10 /* Local bus region 1 */
+#define CCR5_ARREN 0x20 /* Enables ARR region */
+
+#define CCR6 0xea
+
+#define CCR7 0xeb
+
+/* Performance Control Register (5x86 only). */
+#define PCR0 0x20
+#define PCR0_RSTK 0x01 /* Enables return stack */
+#define PCR0_BTB 0x02 /* Enables branch target buffer */
+#define PCR0_LOOP 0x04 /* Enables loop */
+#define PCR0_AIS 0x08 /* Enables all instructions stalled to
+ serialize pipe. */
+#define PCR0_MLR 0x10 /* Enables reordering of misaligned loads */
+#define PCR0_BTBRT 0x40 /* Enables BTB test register. */
+#define PCR0_LSSER 0x80 /* Disable reorder */
+
+/* Device Identification Registers */
+#define DIR0 0xfe
+#define DIR1 0xff
+
+/*
+ * Machine Check register constants.
+ */
+#define MCG_CAP_COUNT 0x000000ff
+#define MCG_CAP_CTL_P 0x00000100
+#define MCG_CAP_EXT_P 0x00000200
+#define MCG_CAP_CMCI_P 0x00000400
+#define MCG_CAP_TES_P 0x00000800
+#define MCG_CAP_EXT_CNT 0x00ff0000
+#define MCG_CAP_SER_P 0x01000000
+#define MCG_STATUS_RIPV 0x00000001
+#define MCG_STATUS_EIPV 0x00000002
+#define MCG_STATUS_MCIP 0x00000004
+#define MCG_CTL_ENABLE 0xffffffffffffffff
+#define MCG_CTL_DISABLE 0x0000000000000000
+#define MSR_MC_CTL(x) (MSR_MC0_CTL + (x) * 4)
+#define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4)
+#define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4)
+#define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4)
+#define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */
+#define MC_STATUS_MCA_ERROR 0x000000000000ffff
+#define MC_STATUS_MODEL_ERROR 0x00000000ffff0000
+#define MC_STATUS_OTHER_INFO 0x01ffffff00000000
+#define MC_STATUS_COR_COUNT 0x001fffc000000000 /* If MCG_CAP_CMCI_P */
+#define MC_STATUS_TES_STATUS 0x0060000000000000 /* If MCG_CAP_TES_P */
+#define MC_STATUS_AR 0x0080000000000000 /* If MCG_CAP_TES_P */
+#define MC_STATUS_S 0x0100000000000000 /* If MCG_CAP_TES_P */
+#define MC_STATUS_PCC 0x0200000000000000
+#define MC_STATUS_ADDRV 0x0400000000000000
+#define MC_STATUS_MISCV 0x0800000000000000
+#define MC_STATUS_EN 0x1000000000000000
+#define MC_STATUS_UC 0x2000000000000000
+#define MC_STATUS_OVER 0x4000000000000000
+#define MC_STATUS_VAL 0x8000000000000000
+#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */
+#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */
+#define MC_CTL2_THRESHOLD 0x0000000000007fff
+#define MC_CTL2_CMCI_EN 0x0000000040000000
+
+/*
+ * The following four 3-byte registers control the non-cacheable regions.
+ * These registers must be written as three separate bytes.
+ *
+ * NCRx+0: A31-A24 of starting address
+ * NCRx+1: A23-A16 of starting address
+ * NCRx+2: A15-A12 of starting address | NCR_SIZE_xx.
+ *
+ * The non-cacheable region's starting address must be aligned to the
+ * size indicated by the NCR_SIZE_xx field.
+ */
+#define NCR1 0xc4
+#define NCR2 0xc7
+#define NCR3 0xca
+#define NCR4 0xcd
+
+#define NCR_SIZE_0K 0
+#define NCR_SIZE_4K 1
+#define NCR_SIZE_8K 2
+#define NCR_SIZE_16K 3
+#define NCR_SIZE_32K 4
+#define NCR_SIZE_64K 5
+#define NCR_SIZE_128K 6
+#define NCR_SIZE_256K 7
+#define NCR_SIZE_512K 8
+#define NCR_SIZE_1M 9
+#define NCR_SIZE_2M 10
+#define NCR_SIZE_4M 11
+#define NCR_SIZE_8M 12
+#define NCR_SIZE_16M 13
+#define NCR_SIZE_32M 14
+#define NCR_SIZE_4G 15
+
+/*
+ * The address region registers are used to specify the location and
+ * size for the eight address regions.
+ *
+ * ARRx + 0: A31-A24 of start address
+ * ARRx + 1: A23-A16 of start address
+ * ARRx + 2: A15-A12 of start address | ARR_SIZE_xx
+ */
+#define ARR0 0xc4
+#define ARR1 0xc7
+#define ARR2 0xca
+#define ARR3 0xcd
+#define ARR4 0xd0
+#define ARR5 0xd3
+#define ARR6 0xd6
+#define ARR7 0xd9
+
+#define ARR_SIZE_0K 0
+#define ARR_SIZE_4K 1
+#define ARR_SIZE_8K 2
+#define ARR_SIZE_16K 3
+#define ARR_SIZE_32K 4
+#define ARR_SIZE_64K 5
+#define ARR_SIZE_128K 6
+#define ARR_SIZE_256K 7
+#define ARR_SIZE_512K 8
+#define ARR_SIZE_1M 9
+#define ARR_SIZE_2M 10
+#define ARR_SIZE_4M 11
+#define ARR_SIZE_8M 12
+#define ARR_SIZE_16M 13
+#define ARR_SIZE_32M 14
+#define ARR_SIZE_4G 15
+
+/*
+ * The region control registers specify the attributes associated with
+ * the ARRx address regions.
+ */
+#define RCR0 0xdc
+#define RCR1 0xdd
+#define RCR2 0xde
+#define RCR3 0xdf
+#define RCR4 0xe0
+#define RCR5 0xe1
+#define RCR6 0xe2
+#define RCR7 0xe3
+
+#define RCR_RCD 0x01 /* Disables caching for ARRx (x = 0-6). */
+#define RCR_RCE 0x01 /* Enables caching for ARR7. */
+#define RCR_WWO 0x02 /* Weak write ordering. */
+#define RCR_WL 0x04 /* Weak locking. */
+#define RCR_WG 0x08 /* Write gathering. */
+#define RCR_WT 0x10 /* Write-through. */
+#define RCR_NLB 0x20 /* LBA# pin is not asserted. */
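+
+/*
+ * Editor's illustration (not part of the original header): programming
+ * an address region register means writing its three bytes separately,
+ * per the layout described above. "cyrix_write" is a placeholder for a
+ * helper that reaches the Cyrix configuration registers through the
+ * 0x22 (index) / 0x23 (data) I/O port pair:
+ *
+ *	uint32_t start = 0x000a0000;	// must be aligned to the region size
+ *	cyrix_write(ARR0 + 0, (start >> 24) & 0xff);	// A31-A24
+ *	cyrix_write(ARR0 + 1, (start >> 16) & 0xff);	// A23-A16
+ *	cyrix_write(ARR0 + 2, ((start >> 8) & 0xf0) | ARR_SIZE_64K);
+ *						// A15-A12 | size
+ */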
+
+/* AMD Write Allocate Top-Of-Memory and Control Register */
+#define AMD_WT_ALLOC_TME 0x40000 /* top-of-memory enable */
+#define AMD_WT_ALLOC_PRE 0x20000 /* programmable range enable */
+#define AMD_WT_ALLOC_FRE 0x10000 /* fixed (A0000-FFFFF) range enable */
+
+/* AMD64 MSR's */
+#define MSR_EFER 0xc0000080 /* extended features */
+#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target/cs/ss */
+#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target rip */
+#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target rip */
+#define MSR_SF_MASK 0xc0000084 /* syscall flags mask */
+#define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */
+#define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */
+#define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */
+#define MSR_PERFEVSEL0 0xc0010000
+#define MSR_PERFEVSEL1 0xc0010001
+#define MSR_PERFEVSEL2 0xc0010002
+#define MSR_PERFEVSEL3 0xc0010003
+#define MSR_K7_PERFCTR0 0xc0010004
+#define MSR_K7_PERFCTR1 0xc0010005
+#define MSR_K7_PERFCTR2 0xc0010006
+#define MSR_K7_PERFCTR3 0xc0010007
+#define MSR_SYSCFG 0xc0010010
+#define MSR_HWCR 0xc0010015
+#define MSR_IORRBASE0 0xc0010016
+#define MSR_IORRMASK0 0xc0010017
+#define MSR_IORRBASE1 0xc0010018
+#define MSR_IORRMASK1 0xc0010019
+#define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */
+#define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */
+#define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */
+#define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */
+#define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */
+#define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */
+#define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */
+#define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */
+#define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */
+#define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */
+#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */
+#define MSR_MC0_CTL_MASK 0xc0010044
+#define MSR_VM_CR 0xc0010114 /* SVM: feature control */
+#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */
+
+/* MSR_VM_CR related */
+#define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */
+
+/* VIA ACE crypto featureset: for via_feature_rng */
+#define VIA_HAS_RNG 1 /* cpu has RNG */
+
+/* VIA ACE crypto featureset: for via_feature_xcrypt */
+#define VIA_HAS_AES 1 /* cpu has AES */
+#define VIA_HAS_SHA 2 /* cpu has SHA1 & SHA256 */
+#define VIA_HAS_MM 4 /* cpu has RSA instructions */
+#define VIA_HAS_AESCTR 8 /* cpu has AES-CTR instructions */
+
+/* Centaur Extended Feature flags */
+#define VIA_CPUID_HAS_RNG 0x000004
+#define VIA_CPUID_DO_RNG 0x000008
+#define VIA_CPUID_HAS_ACE 0x000040
+#define VIA_CPUID_DO_ACE 0x000080
+#define VIA_CPUID_HAS_ACE2 0x000100
+#define VIA_CPUID_DO_ACE2 0x000200
+#define VIA_CPUID_HAS_PHE 0x000400
+#define VIA_CPUID_DO_PHE 0x000800
+#define VIA_CPUID_HAS_PMM 0x001000
+#define VIA_CPUID_DO_PMM 0x002000
+
+/* VIA ACE xcrypt-* instruction context control options */
+#define VIA_CRYPT_CWLO_ROUND_M 0x0000000f
+#define VIA_CRYPT_CWLO_ALG_M 0x00000070
+#define VIA_CRYPT_CWLO_ALG_AES 0x00000000
+#define VIA_CRYPT_CWLO_KEYGEN_M 0x00000080
+#define VIA_CRYPT_CWLO_KEYGEN_HW 0x00000000
+#define VIA_CRYPT_CWLO_KEYGEN_SW 0x00000080
+#define VIA_CRYPT_CWLO_NORMAL 0x00000000
+#define VIA_CRYPT_CWLO_INTERMEDIATE 0x00000100
+#define VIA_CRYPT_CWLO_ENCRYPT 0x00000000
+#define VIA_CRYPT_CWLO_DECRYPT 0x00000200
+#define VIA_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */
+#define VIA_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */
+#define VIA_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 14 rds */
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/timerreg.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/timerreg.h
new file mode 100644
index 0000000..4cb33b5
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/timerreg.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (C) 2005 TAKAHASHI Yoshihiro. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * The outputs of the three timers are connected as follows:
+ *
+ * timer 0 -> irq 0
+ * timer 1 -> dma chan 0 (for dram refresh)
+ * timer 2 -> speaker (via keyboard controller)
+ *
+ * Timer 0 is used to call hardclock.
+ * Timer 2 is used to generate console beeps.
+ */
+
+#pragma once
+
+#include
+
+#define IO_TIMER1 0x40 /* 8253 Timer #1 */
+#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
+#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
+#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
+#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/tree.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/tree.h
new file mode 100644
index 0000000..9c372cc
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/tree.h
@@ -0,0 +1,751 @@
+/* $NetBSD: tree.h,v 1.8 2004/03/28 19:38:30 provos Exp $ */
+/* $OpenBSD: tree.h,v 1.7 2002/10/17 21:51:54 art Exp $ */
+/* $FreeBSD: src/sys/sys/tree.h,v 1.7 2007/12/28 07:03:26 jasone Exp $ */
+
+/*-
+ * Copyright 2002 Niels Provos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" + +/* + * This file defines data structures for different types of trees: + * splay trees and red-black trees. + * + * A splay tree is a self-organizing data structure. Every operation + * on the tree causes a splay to happen. The splay moves the requested + * node to the root of the tree and partly rebalances it. + * + * This has the benefit that request locality causes faster lookups as + * the requested nodes move to the top of the tree. On the other hand, + * every lookup causes memory writes. + * + * The Balance Theorem bounds the total access time for m operations + * and n inserts on an initially empty tree as O((m + n)lg n). The + * amortized cost for a sequence of m accesses to a splay tree is O(lg n); + * + * A red-black tree is a binary search tree with the node color as an + * extra attribute. It fulfills a set of conditions: + * - every search path from the root to a leaf consists of the + * same number of black nodes, + * - each red node (except for the root) has a black parent, + * - each leaf node is black. + * + * Every operation on a red-black tree is bounded as O(lg n). + * The maximum height of a red-black tree is 2lg (n+1). 
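+ *
+ * Typical red-black tree usage, following the tree(3) manual page
+ * (editor's sketch, not part of the original header; "struct node",
+ * "nodecmp", and "nodetree" are illustrative names):
+ *
+ *	struct node {
+ *		RB_ENTRY(node) link;	// embedded linkage
+ *		int key;
+ *	};
+ *
+ *	static int
+ *	nodecmp(struct node *a, struct node *b)
+ *	{
+ *		return (a->key < b->key ? -1 : a->key > b->key);
+ *	}
+ *
+ *	RB_HEAD(nodetree, node) head = RB_INITIALIZER(&head);
+ *	RB_GENERATE(nodetree, node, link, nodecmp)
+ *
+ *	// RB_INSERT(nodetree, &head, n) links n into the tree;
+ *	// RB_FOREACH(n, nodetree, &head) walks it in cmp order.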
+ */ + +#define SPLAY_HEAD(name, type) \ +struct name { \ + struct type *sph_root; /* root of the tree */ \ +} + +#define SPLAY_INITIALIZER(root) \ + { NULL } + +#define SPLAY_INIT(root) do { \ + (root)->sph_root = NULL; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ENTRY(type) \ +struct { \ + struct type *spe_left; /* left element */ \ + struct type *spe_right; /* right element */ \ +} + +#define SPLAY_LEFT(elm, field) (elm)->field.spe_left +#define SPLAY_RIGHT(elm, field) (elm)->field.spe_right +#define SPLAY_ROOT(head) (head)->sph_root +#define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL) + +/* SPLAY_ROTATE_{LEFT,RIGHT} expect that tmp hold SPLAY_{RIGHT,LEFT} */ +#define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ROTATE_LEFT(head, tmp, field) do { \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_LINKLEFT(head, tmp, field) do { \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_LINKRIGHT(head, tmp, field) do { \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ASSEMBLE(head, node, left, right, field) do { \ + SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \ + SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(node, field); \ +} while (/*CONSTCOND*/ 0) + +/* Generates prototypes and inline functions */ + +#define SPLAY_PROTOTYPE(name, type, field, cmp) \ +void name##_SPLAY(struct name *, struct type *); \ +void name##_SPLAY_MINMAX(struct name *, int); \ +struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ +struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ + \ +/* Finds the node with the same key as elm */ \ +static __inline struct type * \ +name##_SPLAY_FIND(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) \ + return(NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) \ + return (head->sph_root); \ + return (NULL); \ +} \ + \ +static __inline struct type * \ +name##_SPLAY_NEXT(struct name *head, struct type *elm) \ +{ \ + name##_SPLAY(head, elm); \ + if (SPLAY_RIGHT(elm, field) != NULL) { \ + elm = SPLAY_RIGHT(elm, field); \ + while (SPLAY_LEFT(elm, field) != NULL) { \ + elm = SPLAY_LEFT(elm, field); \ + } \ + } else \ + elm = NULL; \ + return (elm); \ +} \ + \ +static __inline struct type * \ +name##_SPLAY_MIN_MAX(struct name *head, int val) \ +{ \ + name##_SPLAY_MINMAX(head, val); \ + return (SPLAY_ROOT(head)); \ +} + +/* Main splay operation. 
+ * Moves node close to the key of elm to top + */ +#define SPLAY_GENERATE(name, type, field, cmp) \ +struct type * \ +name##_SPLAY_INSERT(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) { \ + SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \ + } else { \ + int __comp; \ + name##_SPLAY(head, elm); \ + __comp = (cmp)(elm, (head)->sph_root); \ + if(__comp < 0) { \ + SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\ + SPLAY_RIGHT(elm, field) = (head)->sph_root; \ + SPLAY_LEFT((head)->sph_root, field) = NULL; \ + } else if (__comp > 0) { \ + SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT(elm, field) = (head)->sph_root; \ + SPLAY_RIGHT((head)->sph_root, field) = NULL; \ + } else \ + return ((head)->sph_root); \ + } \ + (head)->sph_root = (elm); \ + return (NULL); \ +} \ + \ +struct type * \ +name##_SPLAY_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *__tmp; \ + if (SPLAY_EMPTY(head)) \ + return (NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) { \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\ + } else { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\ + name##_SPLAY(head, elm); \ + SPLAY_RIGHT((head)->sph_root, field) = __tmp; \ + } \ + return (elm); \ + } \ + return (NULL); \ +} \ + \ +void \ +name##_SPLAY(struct name *head, struct type *elm) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ + int __comp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while ((__comp = (cmp)(elm, (head)->sph_root)) != 0) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) > 0){ \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} \ + \ +/* Splay with either the minimum or the maximum element \ + * Used to find minimum or maximum element in tree. 
\ + */ \ +void name##_SPLAY_MINMAX(struct name *head, int __comp) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while (1) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp > 0) { \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} + +#define SPLAY_NEGINF -1 +#define SPLAY_INF 1 + +#define SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y) +#define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y) +#define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y) +#define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y) +#define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF)) +#define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_INF)) + +#define SPLAY_FOREACH(x, name, head) \ + for ((x) = SPLAY_MIN(name, head); \ + (x) != NULL; \ + (x) = SPLAY_NEXT(name, head, x)) + +/* Macros that define a red-black tree */ +#define RB_HEAD(name, type) \ +struct name { \ + struct type *rbh_root; /* root of the tree */ \ +} + +#define RB_INITIALIZER(root) \ + { NULL } + +#define RB_INIT(root) do { \ + (root)->rbh_root = NULL; \ +} while (/*CONSTCOND*/ 0) + +/* + * Undef for Linux + */ +#undef RB_BLACK +#undef RB_RED +#undef RB_ROOT + +#define RB_BLACK 0 +#define RB_RED 1 +#define RB_ENTRY(type) \ +struct { \ + struct type *rbe_left; /* left element */ \ + struct type *rbe_right; /* right element */ \ + struct type *rbe_parent; /* parent element */ \ + int rbe_color; /* node color */ \ +} + +#define RB_LEFT(elm, field) (elm)->field.rbe_left +#define RB_RIGHT(elm, field) (elm)->field.rbe_right +#define RB_PARENT(elm, field) (elm)->field.rbe_parent +#define RB_COLOR(elm, field) (elm)->field.rbe_color +#define RB_ROOT(head) (head)->rbh_root +#define RB_EMPTY(head) (RB_ROOT(head) == NULL) + +#define RB_SET(elm, parent, field) do { \ + RB_PARENT(elm, field) = parent; \ + RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ + RB_COLOR(elm, field) = RB_RED; \ +} while (/*CONSTCOND*/ 0) + +#define RB_SET_BLACKRED(black, red, field) do { \ + RB_COLOR(black, field) = RB_BLACK; \ + RB_COLOR(red, field) = RB_RED; \ +} while (/*CONSTCOND*/ 0) + +#ifndef RB_AUGMENT +#define RB_AUGMENT(x) do {} while (0) +#endif + +#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ + (tmp) = RB_RIGHT(elm, field); \ + if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field)) != NULL) { \ + RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \ + } \ + RB_AUGMENT(elm); \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ + if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_LEFT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ + RB_AUGMENT(tmp); \ + if ((RB_PARENT(tmp, field))) \ + RB_AUGMENT(RB_PARENT(tmp, field)); \ +} while (/*CONSTCOND*/ 0) + +#define 
RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ + (tmp) = RB_LEFT(elm, field); \ + if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field)) != NULL) { \ + RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); \ + } \ + RB_AUGMENT(elm); \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ + if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_RIGHT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ + RB_AUGMENT(tmp); \ + if ((RB_PARENT(tmp, field))) \ + RB_AUGMENT(RB_PARENT(tmp, field)); \ +} while (/*CONSTCOND*/ 0) + +/* Generates prototypes and inline functions */ +#define RB_PROTOTYPE(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp,) +#define RB_PROTOTYPE_STATIC(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __unused static) +#define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \ +attr void name##_RB_INSERT_COLOR(struct name *, struct type *); \ +attr void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *); \ +attr struct type *name##_RB_REMOVE(struct name *, struct type *); \ +attr struct type *name##_RB_INSERT(struct name *, struct type *); \ +attr struct type *name##_RB_FIND(struct name *, struct type *); \ +attr struct type *name##_RB_NFIND(struct name *, struct type *); \ +attr struct type *name##_RB_NEXT(struct type *); \ +attr struct type *name##_RB_PREV(struct type *); \ +attr struct type *name##_RB_MINMAX(struct name *, int) \ + +/* Main rb operation. + * Moves node close to the key of elm to top + */ +#define RB_GENERATE(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp,) +#define RB_GENERATE_STATIC(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp, __unused static) +#define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \ +attr void \ +name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ +{ \ + struct type *parent, *gparent, *tmp; \ + while ((parent = RB_PARENT(elm, field)) != NULL && \ + RB_COLOR(parent, field) == RB_RED) { \ + gparent = RB_PARENT(parent, field); \ + if (parent == RB_LEFT(gparent, field)) { \ + tmp = RB_RIGHT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_RIGHT(parent, field) == elm) { \ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_RIGHT(head, gparent, tmp, field); \ + } else { \ + tmp = RB_LEFT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_LEFT(parent, field) == elm) { \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_LEFT(head, gparent, tmp, field); \ + } \ + } \ + RB_COLOR(head->rbh_root, field) = RB_BLACK; \ +} \ + \ +attr void \ +name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \ +{ \ + struct type *tmp; \ + while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \ + elm != RB_ROOT(head)) { \ + if (RB_LEFT(parent, field) == elm) { \ + tmp = RB_RIGHT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ 
+ RB_SET_BLACKRED(tmp, parent, field); \ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\ + struct type *oleft; \ + if ((oleft = RB_LEFT(tmp, field)) \ + != NULL) \ + RB_COLOR(oleft, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_RIGHT(head, tmp, oleft, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_RIGHT(tmp, field)) \ + RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } else { \ + tmp = RB_LEFT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ + RB_SET_BLACKRED(tmp, parent, field); \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\ + struct type *oright; \ + if ((oright = RB_RIGHT(tmp, field)) \ + != NULL) \ + RB_COLOR(oright, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_LEFT(head, tmp, oright, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_LEFT(tmp, field)) \ + RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } \ + } \ + if (elm) \ + RB_COLOR(elm, field) = RB_BLACK; \ +} \ + \ +attr struct type * \ +name##_RB_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *child, *parent, *old = elm; \ + int color; \ + if (RB_LEFT(elm, field) == NULL) \ + child = RB_RIGHT(elm, field); \ + else if (RB_RIGHT(elm, field) == NULL) \ + child = RB_LEFT(elm, field); \ + else { \ + struct type *left; \ + elm = RB_RIGHT(elm, field); \ + while ((left = RB_LEFT(elm, field)) != NULL) \ + elm = left; \ + child = RB_RIGHT(elm, field); \ + parent = RB_PARENT(elm, field); \ + color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ + if (RB_PARENT(elm, field) == old) \ + parent = elm; \ + (elm)->field = (old)->field; \ + if (RB_PARENT(old, field)) { \ + if (RB_LEFT(RB_PARENT(old, field), field) == old)\ + RB_LEFT(RB_PARENT(old, field), field) = elm;\ + else \ + RB_RIGHT(RB_PARENT(old, field), field) = elm;\ + RB_AUGMENT(RB_PARENT(old, field)); \ + } else \ + RB_ROOT(head) = elm; \ + RB_PARENT(RB_LEFT(old, field), field) = elm; \ + if (RB_RIGHT(old, field)) \ + RB_PARENT(RB_RIGHT(old, field), field) = elm; \ + if (parent) { \ + left = parent; \ + do { \ + RB_AUGMENT(left); \ + } while ((left = RB_PARENT(left, 
field)) != NULL); \ + } \ + goto color; \ + } \ + parent = RB_PARENT(elm, field); \ + color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ +color: \ + if (color == RB_BLACK) \ + name##_RB_REMOVE_COLOR(head, parent, child); \ + return (old); \ +} \ + \ +/* Inserts a node into the RB tree */ \ +attr struct type * \ +name##_RB_INSERT(struct name *head, struct type *elm) \ +{ \ + struct type *tmp; \ + struct type *parent = NULL; \ + int comp = 0; \ + tmp = RB_ROOT(head); \ + while (tmp) { \ + parent = tmp; \ + comp = (cmp)(elm, parent); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + RB_SET(elm, parent, field); \ + if (parent != NULL) { \ + if (comp < 0) \ + RB_LEFT(parent, field) = elm; \ + else \ + RB_RIGHT(parent, field) = elm; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = elm; \ + name##_RB_INSERT_COLOR(head, elm); \ + return (NULL); \ +} \ + \ +/* Finds the node with the same key as elm */ \ +attr struct type * \ +name##_RB_FIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (NULL); \ +} \ + \ +/* Finds the first node greater than or equal to the search key */ \ +attr struct type * \ +name##_RB_NFIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *res = NULL; \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) { \ + res = tmp; \ + tmp = RB_LEFT(tmp, field); \ + } \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (res); \ +} \ + \ +/* ARGSUSED */ \ +attr struct type * \ +name##_RB_NEXT(struct type *elm) \ +{ \ + if (RB_RIGHT(elm, field)) { \ + elm = RB_RIGHT(elm, field); \ + while (RB_LEFT(elm, field)) \ + elm = RB_LEFT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +/* ARGSUSED */ \ +attr struct type * \ +name##_RB_PREV(struct type *elm) \ +{ \ + if (RB_LEFT(elm, field)) { \ + elm = RB_LEFT(elm, field); \ + while (RB_RIGHT(elm, field)) \ + elm = RB_RIGHT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +attr struct type * \ +name##_RB_MINMAX(struct name *head, int val) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *parent = NULL; \ + while (tmp) { \ + parent = tmp; \ + if (val < 0) \ + tmp = RB_LEFT(tmp, field); \ + else \ + tmp = RB_RIGHT(tmp, field); \ + } \ + return (parent); \ +} + +#define RB_NEGINF -1 +#define RB_INF 1 + +#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) +#define 
RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) +#define RB_FIND(name, x, y) name##_RB_FIND(x, y) +#define RB_NFIND(name, x, y) name##_RB_NFIND(x, y) +#define RB_NEXT(name, x, y) name##_RB_NEXT(y) +#define RB_PREV(name, x, y) name##_RB_PREV(y) +#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) +#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) + +#define RB_FOREACH(x, name, head) \ + for ((x) = RB_MIN(name, head); \ + (x) != NULL; \ + (x) = name##_RB_NEXT(x)) + +#define RB_FOREACH_REVERSE(x, name, head) \ + for ((x) = RB_MAX(name, head); \ + (x) != NULL; \ + (x) = name##_RB_PREV(x)) + +#pragma clang diagnostic pop diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/support/uuid.h b/vendor/github.com/hooklift/xhyve/include/xhyve/support/uuid.h new file mode 100644 index 0000000..5d4c1f3 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/support/uuid.h @@ -0,0 +1,153 @@ +/*- + * Copyright (c) 2002,2005 Marcel Moolenaar + * Copyright (c) 2002 Hiten Mahesh Pandya + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include + +#define _UUID_NODE_LEN 6 + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint8_t clock_seq_hi_and_reserved; + uint8_t clock_seq_low; + uint8_t node[_UUID_NODE_LEN]; +}; +#pragma clang diagnostic pop + +typedef struct uuid uuid_internal_t; + +/* + * This implementation mostly conforms to the DCE 1.1 specification. + * See Also: + * uuidgen(1), uuidgen(2), uuid(3) + */ + +/* Status codes returned by the functions. */ +#define uuid_s_ok 0 +#define uuid_s_bad_version 1 +#define uuid_s_invalid_string_uuid 2 +#define uuid_s_no_memory 3 + +/* + * uuid_create_nil() - create a nil UUID. 
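+ *
+ * Illustrative call (editor's sketch, not in the original header):
+ *
+ *	uuid_t u;
+ *	uint32_t status;
+ *	uuid_create_nil(&u, &status);	// u becomes all-zero
+ *	// status is now uuid_s_ok
+ *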
+ * See also:
+ * http://www.opengroup.org/onlinepubs/009629399/uuid_create_nil.htm
+ */
+static inline void
+uuid_create_nil(uuid_t *u, uint32_t *status)
+{
+ if (status)
+ *status = uuid_s_ok;
+
+ bzero(u, sizeof(*u));
+}
+
+static inline void
+uuid_enc_le(void *buf, uuid_t *uuid)
+{
+ uuid_internal_t *u = (uuid_internal_t *) ((void *) uuid);
+ uint8_t *p = buf;
+ int i;
+
+ /* time_mid lands at offset 4 and time_hi_and_version at offset 6;
+  * on x86 the in-memory fields are already little-endian */
+ memcpy(p, &u->time_low, 4);
+ memcpy(p + 4, &u->time_mid, 2);
+ memcpy(p + 6, &u->time_hi_and_version, 2);
+ p[8] = u->clock_seq_hi_and_reserved;
+ p[9] = u->clock_seq_low;
+ for (i = 0; i < _UUID_NODE_LEN; i++)
+ p[10 + i] = u->node[i];
+}
+
+/*
+ * uuid_from_string() - convert a string representation of a UUID into
+ * a binary representation.
+ * See also:
+ * http://www.opengroup.org/onlinepubs/009629399/uuid_from_string.htm
+ *
+ * NOTE: The sequence field is in big-endian, while the time fields are in
+ * native byte order.
+ */
+static inline void
+uuid_from_string(const char *s, uuid_t *uuid, uint32_t *status)
+{
+ uuid_internal_t *u = (uuid_internal_t *) ((void *) uuid);
+ int n;
+
+ /* Short-circuit 2 special cases: NULL pointer and empty string. */
+ if (s == NULL || *s == '\0') {
+ uuid_create_nil(((uuid_t *) u), status);
+ return;
+ }
+
+ /* Assume the worst. */
+ if (status != NULL)
+ *status = uuid_s_invalid_string_uuid;
+
+ /* The UUID string representation has a fixed length. */
+ if (strlen(s) != 36)
+ return;
+
+ /*
+ * We only work with "new" UUIDs. New UUIDs have the form:
+ * 01234567-89ab-cdef-0123-456789abcdef
+ * The so called "old" UUIDs, which we don't support, have the form:
+ * 0123456789ab.cd.ef.01.23.45.67.89.ab
+ */
+ if (s[8] != '-')
+ return;
+
+ n = sscanf(s,
+ "%8x-%4hx-%4hx-%2hhx%2hhx-%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx",
+ &u->time_low, &u->time_mid, &u->time_hi_and_version,
+ &u->clock_seq_hi_and_reserved, &u->clock_seq_low, &u->node[0],
+ &u->node[1], &u->node[2], &u->node[3], &u->node[4], &u->node[5]);
+
+ /* Make sure we have all conversions. */
+ if (n != 11)
+ return;
+
+ /* We have a successful scan. Check semantics... */
+ n = u->clock_seq_hi_and_reserved;
+ if ((n & 0x80) != 0x00 && /* variant 0? */
+ (n & 0xc0) != 0x80 && /* variant 1? */
+ (n & 0xe0) != 0xc0) { /* variant 2? */
+ if (status != NULL)
+ *status = uuid_s_bad_version;
+ } else {
+ if (status != NULL)
+ *status = uuid_s_ok;
+ }
+}
diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/uart_emul.h b/vendor/github.com/hooklift/xhyve/include/xhyve/uart_emul.h
new file mode 100644
index 0000000..d7ad2f3
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/include/xhyve/uart_emul.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2013 Neel Natu
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#define UART_IO_BAR_SIZE 8 + +struct uart_softc; + +typedef void (*uart_intr_func_t)(void *arg); +struct uart_softc *uart_init(uart_intr_func_t intr_assert, + uart_intr_func_t intr_deassert, void *arg); + +int uart_legacy_alloc(int unit, int *ioaddr, int *irq); +uint8_t uart_read(struct uart_softc *sc, int offset); +void uart_write(struct uart_softc *sc, int offset, uint8_t value); +int uart_set_backend(struct uart_softc *sc, const char *opt); + +extern void go_set_pty_name(char *name); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/virtio.h b/vendor/github.com/hooklift/xhyve/include/xhyve/virtio.h new file mode 100644 index 0000000..feb0fbb --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/virtio.h @@ -0,0 +1,480 @@ +/*- + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +/* + * These are derived from several virtio specifications. + * + * Some useful links: + * https://github.com/rustyrussell/virtio-spec + * http://people.redhat.com/pbonzini/virtio-spec.pdf + */ + +/* + * A virtual device has zero or more "virtual queues" (virtqueue). 
+ * Each virtqueue uses at least two 4096-byte pages, laid out thus:
+ *
+ *	+-----------------------------------------------+
+ *	|    "desc":  <N> descriptors, 16 bytes each    |
+ *	|   -----------------------------------------   |
+ *	|   "avail":   2 uint16; <N> uint16; 1 uint16   |
+ *	|   -----------------------------------------   |
+ *	|              pad to 4k boundary               |
+ *	+-----------------------------------------------+
+ *	|   "used": 2 x uint16; <N> elems; 1 uint16     |
+ *	|   -----------------------------------------   |
+ *	|              pad to 4k boundary               |
+ *	+-----------------------------------------------+
+ *
+ * The number <N> that appears here is always a power of two and is
+ * limited to no more than 32768 (as it must fit in a 16-bit field).
+ * If <N> is sufficiently large, the above will occupy more than
+ * two pages. In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set in <flags>:
+ *	NEXT	descriptor is chained, so use its "next" field
+ *	WRITE	descriptor is for host to write into guest RAM
+ *		(else host is to read from guest RAM)
+ *	INDIRECT	descriptor address field is (guest physical)
+ *		address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>. If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and <len> must be a multiple of
+ * 16). Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially). However, INDIRECT must not be set
+ * in the indirect descriptors. Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval). (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver. These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>. The <ring> entries are simply indices
+ * into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value). However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
+ * The <N> "vring_used" elements consist of a 32-bit <index> and a
+ * 32-bit <length> (vu_tlen below). The <index> is simply the index
+ * of the head of a descriptor chain the guest made available
+ * earlier, and the <length> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
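+ *
+ * As an editor's illustration (not part of the original comment), a
+ * hypervisor-side walk of one direct descriptor chain, as described
+ * above, would look like ("process" is a placeholder):
+ *
+ *	uint16_t i = head_idx;
+ *	for (;;) {
+ *		volatile struct virtio_desc *vd = &vq->vq_desc[i];
+ *		process(vd->vd_addr, vd->vd_len, vd->vd_flags);
+ *		if (!(vd->vd_flags & VRING_DESC_F_NEXT))
+ *			break;
+ *		i = vd->vd_next;	// must stay in [0 .. N)
+ *	}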
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation. Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */
+#define VRING_ALIGN 4096
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpacked"
+
+struct virtio_desc { /* AKA vring_desc */
+ uint64_t vd_addr; /* guest physical address */
+ uint32_t vd_len; /* length of scatter/gather seg */
+ uint16_t vd_flags; /* VRING_F_DESC_* */
+ uint16_t vd_next; /* next desc if F_NEXT */
+} __packed;
+
+struct virtio_used { /* AKA vring_used_elem */
+ uint32_t vu_idx; /* head of used descriptor chain */
+ uint32_t vu_tlen; /* length written-to */
+} __packed;
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+struct vring_avail {
+ uint16_t va_flags; /* VRING_AVAIL_F_* */
+ uint16_t va_idx; /* counts to 65535, then cycles */
+ uint16_t va_ring[]; /* size N, reported in QNUM value */
+/* uint16_t va_used_event; -- after N ring entries */
+} __packed;
+
+#define VRING_USED_F_NO_NOTIFY 1
+struct vring_used {
+ uint16_t vu_flags; /* VRING_USED_F_* */
+ uint16_t vu_idx; /* counts to 65535, then cycles */
+ struct virtio_used vu_ring[]; /* size N */
+/* uint16_t vu_avail_event; -- after N ring entries */
+} __packed;
+
+#pragma clang diagnostic pop
+
+/*
+ * The address of any given virtual queue is determined by a single
+ * Page Frame Number register. The guest writes the PFN into the
+ * PCI config space. However, a device that has two or more
+ * virtqueues can have a different PFN, and size, for each queue.
+ * The number of queues is determinable via the PCI config space
+ * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means
+ * queue #0, 1 means queue #1, etc. Once a queue is selected, the
+ * remaining PFN and QNUM registers refer to that queue.
+ *
+ * QNUM is a read-only register containing a nonzero power of two
+ * that indicates the (hypervisor's) queue size. Or, if reading it
+ * produces zero, the hypervisor does not have a corresponding
+ * queue. (The number of possible queues depends on the virtual
+ * device. The block device has just one; the network device
+ * provides either two -- 0 = receive, 1 = transmit -- or three,
+ * with 2 = control.)
+ *
+ * PFN is a read/write register giving the physical page address of
+ * the virtqueue in guest memory (the guest must allocate enough space
+ * based on the hypervisor's provided QNUM).
+ *
+ * QNOTIFY is effectively write-only: when the guest writes a queue
+ * number to the register, the hypervisor should scan the specified
+ * virtqueue. (Reading QNOTIFY currently always gets 0).
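+ *
+ * Putting the registers together (editor's sketch in pseudocode, not
+ * part of the original comment; cfg_read/cfg_write stand for guest
+ * accesses to the config-space BAR at the offsets defined below):
+ *
+ *	cfg_write16(VTCFG_R_QSEL, 0);		// select queue 0
+ *	qsz = cfg_read16(VTCFG_R_QNUM);		// hypervisor's queue size
+ *	// allocate vring_size(qsz) bytes, page-aligned, at guest-physical gpa
+ *	cfg_write32(VTCFG_R_PFN, gpa >> VRING_PFN);	// publish the ring
+ *	// ...add descriptors to the avail ring...
+ *	cfg_write16(VTCFG_R_QNOTIFY, 0);	// kick queue 0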
+ */
+
+/*
+ * PFN register shift amount
+ */
+#define VRING_PFN 12
+
+/*
+ * Virtio device types
+ *
+ * XXX Should really be merged with <dev/virtio/virtio.h> defines
+ */
+#define VIRTIO_TYPE_NET 1
+#define VIRTIO_TYPE_BLOCK 2
+#define VIRTIO_TYPE_CONSOLE 3
+#define VIRTIO_TYPE_ENTROPY 4
+#define VIRTIO_TYPE_BALLOON 5
+#define VIRTIO_TYPE_IOMEMORY 6
+#define VIRTIO_TYPE_RPMSG 7
+#define VIRTIO_TYPE_SCSI 8
+#define VIRTIO_TYPE_9P 9
+
+/* experimental IDs start at 65535 and work down */
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+#define VIRTIO_DEV_RANDOM 0x1002
+
+/*
+ * PCI config space constants.
+ *
+ * If MSI-X is enabled, the ISR register is generally not used,
+ * and the configuration vector and queue vector appear at offsets
+ * 20 and 22 with the remaining configuration registers at 24.
+ * If MSI-X is not enabled, those two registers disappear and
+ * the remaining configuration registers start at offset 20.
+ */
+#define VTCFG_R_HOSTCAP 0
+#define VTCFG_R_GUESTCAP 4
+#define VTCFG_R_PFN 8
+#define VTCFG_R_QNUM 12
+#define VTCFG_R_QSEL 14
+#define VTCFG_R_QNOTIFY 16
+#define VTCFG_R_STATUS 18
+#define VTCFG_R_ISR 19
+#define VTCFG_R_CFGVEC 20
+#define VTCFG_R_QVEC 22
+#define VTCFG_R_CFG0 20 /* No MSI-X */
+#define VTCFG_R_CFG1 24 /* With MSI-X */
+#define VTCFG_R_MSIX 20
+
+/*
+ * Bits in VTCFG_R_STATUS. Guests need not actually set any of these,
+ * but a guest writing 0 to this register means "please reset".
+ */
+#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */
+#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */
+#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */
+#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */
+
+/*
+ * Bits in VTCFG_R_ISR. These apply only if not using MSI-X.
+ *
+ * (We don't [yet?] ever use CONF_CHANGED.)
+ */
+#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */
+#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */
+
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/*
+ * Feature flags.
+ * Note: bits 0 through 23 are reserved to each device type.
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24)
+#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28)
+#define VIRTIO_RING_F_EVENT_IDX (1 << 29)
+
+/* From section 2.3, "Virtqueue Configuration", of the virtio specification */
+static inline size_t
+vring_size(u_int qsz)
+{
+ size_t size;
+
+ /* constant 3 below = va_flags, va_idx, va_used_event */
+ size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz);
+ size = roundup2(size, ((size_t) VRING_ALIGN));
+
+ /* constant 3 below = vu_flags, vu_idx, vu_avail_event */
+ size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz;
+ size = roundup2(size, ((size_t) VRING_ALIGN));
+
+ return (size);
+}
+
+struct pci_devinst;
+struct vqueue_info;
+
+/*
+ * A virtual device, with some number (possibly 0) of virtual
+ * queues and some size (possibly 0) of configuration-space
+ * registers private to the device. The virtio_softc should come
+ * at the front of each "derived class", so that a pointer to the
+ * virtio_softc is also a pointer to the more specific, derived-
+ * from-virtio driver's softc.
+ *
+ * Note: inside each hypervisor virtio driver, changes to these
+ * data structures must be locked against other threads, if any.
+ * Except for PCI config space register read/write, we assume each + * driver does the required locking, but we need a pointer to the + * lock (if there is one) for PCI config space read/write ops. + * + * When the guest reads or writes the device's config space, the + * generic layer checks for operations on the special registers + * described above. If the offset of the register(s) being read + * or written is past the CFG area (CFG0 or CFG1), the request is + * passed on to the virtual device, after subtracting off the + * generic-layer size. (So, drivers can just use the offset as + * an offset into "struct config", for instance.) + * + * (The virtio layer also makes sure that the read or write is to/ + * from a "good" config offset, hence vc_cfgsize, and on BAR #0. + * However, the driver must verify the read or write size and offset + * and that no one is writing a readonly register.) + * + * The BROKED flag ("this thing done gone and broked") is for future + * use. + */ +#define VIRTIO_USE_MSIX 0x01 +#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ +#define VIRTIO_BROKED 0x08 /* ??? */ + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" + +struct virtio_softc { + struct virtio_consts *vs_vc; /* constants (see below) */ + int vs_flags; /* VIRTIO_* flags from above */ + pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ + struct pci_devinst *vs_pi; /* PCI device instance */ + uint32_t vs_negotiated_caps; /* negotiated capabilities */ + struct vqueue_info *vs_queues; /* one per vc_nvq */ + int vs_curq; /* current queue */ + uint8_t vs_status; /* value from last status write */ + uint8_t vs_isr; /* ISR flags, if not MSI-X */ + uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ +}; + +#define VS_LOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_lock(vs->vs_mtx); \ +} while (0) + +#define VS_UNLOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_unlock(vs->vs_mtx); \ +} while (0) + +struct virtio_consts { + /* name of driver (for diagnostics) */ + const char *vc_name; + /* number of virtual queues */ + int vc_nvq; + /* size of dev-specific config regs */ + size_t vc_cfgsize; + /* called on virtual device reset */ + void (*vc_reset)(void *); + /* called on QNOTIFY if no VQ notify */ + void (*vc_qnotify)(void *, struct vqueue_info *); + /* called to read config regs */ + int (*vc_cfgread)(void *, int, int, uint32_t *); + /* called to write config regs */ + int (*vc_cfgwrite)(void *, int, int, uint32_t); + /* called to apply negotiated features */ + void (*vc_apply_features)(void *, uint64_t); + /* hypervisor-provided capabilities */ + uint64_t vc_hv_caps; +}; + +/* + * Data structure allocated (statically) per virtual queue. + * + * Drivers may change vq_qsize after a reset. When the guest OS + * requests a device reset, the hypervisor first calls + * vs->vs_vc->vc_reset(); then the data structure below is + * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). + * + * The remaining fields should only be fussed-with by the generic + * code. + * + * Note: the addresses of vq_desc, vq_avail, and vq_used are all + * computable from each other, but it's a lot simpler if we just + * keep a pointer to each one. The event indices are similarly + * (but more easily) computable, and this time we'll compute them: + * they're just XX_ring[N]. + */ +#define VQ_ALLOC 0x01 /* set once we have a pfn */ +#define VQ_BROKED 0x02 /* ??? 
*/ +struct vqueue_info { + /* size of this queue (a power of 2) */ + uint16_t vq_qsize; + /* called instead of vc_notify, if not NULL */ + void (*vq_notify)(void *, struct vqueue_info *); + /* backpointer to softc */ + struct virtio_softc *vq_vs; + /* we're the num'th queue in the softc */ + uint16_t vq_num; + /* flags (see above) */ + uint16_t vq_flags; + /* a recent value of vq_avail->va_idx */ + uint16_t vq_last_avail; + /* saved vq_used->vu_idx; see vq_endchains */ + uint16_t vq_save_used; + /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ + uint16_t vq_msix_idx; + /* PFN of virt queue (not shifted!) */ + uint32_t vq_pfn; + /* descriptor array */ + volatile struct virtio_desc *vq_desc; + /* the "avail" ring */ + volatile struct vring_avail *vq_avail; + /* the "used" ring */ + volatile struct vring_used *vq_used; +}; + +#pragma clang diagnostic pop + +/* as noted above, these are sort of backwards, name-wise */ +#define VQ_AVAIL_EVENT_IDX(vq) \ + (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) +#define VQ_USED_EVENT_IDX(vq) \ + ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) + +/* + * Is this ring ready for I/O? + */ +static inline int +vq_ring_ready(struct vqueue_info *vq) +{ + return (vq->vq_flags & VQ_ALLOC); +} + +/* + * Are there "available" descriptors? (This does not count + * how many, just returns True if there are some.) + */ +static inline int +vq_has_descs(struct vqueue_info *vq) +{ + return (vq_ring_ready(vq) && vq->vq_last_avail != + vq->vq_avail->va_idx); +} + +/* + * Deliver an interrupt to guest on the given virtual queue + * (if possible, or a generic MSI interrupt if not using MSI-X). + */ +static inline void +vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) +{ + if (pci_msix_enabled(vs->vs_pi)) + pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); + else { + VS_LOCK(vs); + vs->vs_isr |= VTCFG_ISR_QUEUES; + pci_generate_msi(vs->vs_pi, 0); + pci_lintr_assert(vs->vs_pi); + VS_UNLOCK(vs); + } +} + +struct iovec; +void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues); +int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); +void vi_reset_dev(struct virtio_softc *); +void vi_set_io_bar(struct virtio_softc *, int); +int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, + int n_iov, uint16_t *flags); +void vq_retchain(struct vqueue_info *vq); +void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); +void vq_endchains(struct vqueue_info *vq, int used_all_avail); +uint64_t vi_pci_read(int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size); +void vi_pci_write(int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, + int size, uint64_t value); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmcs.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmcs.h new file mode 100644 index 0000000..4ce70e2 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmcs.h @@ -0,0 +1,386 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include +#include + +int vmcs_getreg(int vcpuid, int ident, uint64_t *rv); +int vmcs_setreg(int vcpuid, int ident, uint64_t val); +int vmcs_getdesc(int vcpuid, int ident, struct seg_desc *desc); +int vmcs_setdesc(int vcpuid, int ident, struct seg_desc *desc); + +static __inline uint64_t +vmcs_read(int vcpuid, uint32_t encoding) +{ + uint64_t val; + + hv_vmx_vcpu_read_vmcs(((hv_vcpuid_t) vcpuid), encoding, &val); + return (val); +} + +static __inline void +vmcs_write(int vcpuid, uint32_t encoding, uint64_t val) +{ + if (encoding == 0x00004002) { + if (val == 0x0000000000000004) { + abort(); + } + } + hv_vmx_vcpu_write_vmcs(((hv_vcpuid_t) vcpuid), encoding, val); +} + +#define vmexit_instruction_length(vcpuid) \ + vmcs_read(vcpuid, VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip(vcpuid) \ + vmcs_read(vcpuid, VMCS_GUEST_RIP) +#define vmcs_instruction_error(vcpuid) \ + vmcs_read(vcpuid, VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason(vcpuid) \ + (vmcs_read(vcpuid, VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification(vcpuid) \ + vmcs_read(vcpuid, VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3(vcpuid) \ + vmcs_read(vcpuid, VMCS_GUEST_CR3) +#define vmcs_gpa(vcpuid) \ + vmcs_read(vcpuid, VMCS_GUEST_PHYSICAL_ADDRESS) +#define vmcs_gla(vcpuid) \ + vmcs_read(vcpuid, VMCS_GUEST_LINEAR_ADDRESS) +#define vmcs_idt_vectoring_info(vcpuid) \ + vmcs_read(vcpuid, VMCS_IDT_VECTORING_INFO) +#define vmcs_idt_vectoring_err(vcpuid) \ + vmcs_read(vcpuid, VMCS_IDT_VECTORING_ERROR) + +#define VMCS_INITIAL 0xffffffffffffffff + +#define VMCS_IDENT(encoding) ((int) (((unsigned) (encoding)) | 0x80000000)) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
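+ *
+ * The encoding also carries each field's access width and type (per
+ * the Intel SDM): bits 14:13 give the width (0 = 16-bit, 1 = 64-bit,
+ * 2 = 32-bit, 3 = natural width) and bits 11:10 the type (0 = control,
+ * 1 = read-only data, 2 = guest state, 3 = host state). For example,
+ * VMCS_EXIT_REASON (0x00004402) decodes as a 32-bit read-only data
+ * field, and VMCS_GUEST_CR0 (0x00006800) as a natural-width
+ * guest-state field.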
+ */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 +#define VMCS_PIR_VECTOR 0x00000002 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E +#define VMCS_GUEST_INTR_STATUS 0x00000810 + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_PIR_DESC 0x00002016 +#define VMCS_EPTP 0x0000201A +#define VMCS_EOI_EXIT0 0x0000201C +#define VMCS_EOI_EXIT1 0x0000201E +#define VMCS_EOI_EXIT2 0x00002020 +#define VMCS_EOI_EXIT3 0x00002022 +#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTR_INFO 0x00004404 +#define VMCS_EXIT_INTR_ERRCODE 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define 
VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define 
EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE_DURING_ENTRY 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_VIRTUALIZED_EOI 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 + +/* + * NMI unblocking due to IRET. + * + * Applies to VM-exits due to hardware exception or EPT fault. + */ +#define EXIT_QUAL_NMIUDTI (1U << 12) +/* + * VMCS interrupt information fields + */ +#define VMCS_INTR_VALID (1U << 31) +#define VMCS_INTR_T_MASK 0x700U /* Interruption-info type */ +#define VMCS_INTR_T_HWINTR (0U << 8) +#define VMCS_INTR_T_NMI (2U << 8) +#define VMCS_INTR_T_HWEXCEPTION (3U << 8) +#define VMCS_INTR_T_SWINTR (4U << 8) +#define VMCS_INTR_T_PRIV_SWEXCEPTION (5U << 8) +#define VMCS_INTR_T_SWEXCEPTION (6U << 8) +#define VMCS_INTR_DEL_ERRCODE (1U << 11) + +/* + * VMCS IDT-Vectoring information fields + */ +#define VMCS_IDT_VEC_VALID (1U << 31) +#define VMCS_IDT_VEC_ERRCODE_VALID (1U << 11) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1U << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1U << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1U << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1U << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +/* + * Exit qualification for EPT violation + */ +#define EPT_VIOLATION_DATA_READ (1UL << 0) +#define EPT_VIOLATION_DATA_WRITE (1UL << 1) +#define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GPA_READABLE (1UL << 3) +#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) +#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) +#define EPT_VIOLATION_GLA_VALID (1UL << 7) +#define EPT_VIOLATION_XLAT_VALID (1UL << 8) + +/* + * Exit qualification for APIC-access VM exit + */ +#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) +#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) + +/* + * Exit qualification for APIC-write VM exit + */ +#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmx.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmx.h new file mode 100644 index 0000000..9402892 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmx.h @@ -0,0 +1,94 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include +#include + +struct vmxcap { + int set; + uint32_t proc_ctls; + uint32_t proc_ctls2; +}; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct vmxstate { + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; +#pragma clang diagnostic pop + +struct apic_page { + uint32_t reg[XHYVE_PAGE_SIZE / 4]; +}; +CTASSERT(sizeof(struct apic_page) == XHYVE_PAGE_SIZE); + +/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ +struct pir_desc { + uint64_t pir[4]; + uint64_t pending; + uint64_t unused[3]; +} __aligned(64); +CTASSERT(sizeof(struct pir_desc) == 64); + +/* Index into the 'guest_msrs[]' array */ +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + IDX_MSR_KGSBASE, + IDX_MSR_PAT, + GUEST_MSR_NUM /* must be the last enumeration */ +}; + +/* virtual machine softc */ +struct vmx { + struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ + uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + struct vm *vm; +}; + +#define VMX_GUEST_VMEXIT 0 +#define VMX_VMRESUME_ERROR 1 +#define VMX_VMLAUNCH_ERROR 2 +#define VMX_INVEPT_ERROR 3 +void vmx_call_isr(uintptr_t entry); + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +extern char vmx_exit_guest[]; diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmx_controls.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmx_controls.h new file mode 100644 index 0000000..06ed294 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmx_controls.h @@ -0,0 +1,95 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1u << 0) +#define PINBASED_NMI_EXITING (1u << 3) +#define PINBASED_VIRTUAL_NMI (1u << 5) +#define PINBASED_PREMPTION_TIMER (1u << 6) +#define PINBASED_POSTED_INTERRUPT (1u << 7) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1u << 2) +#define PROCBASED_TSC_OFFSET (1u << 3) +#define PROCBASED_HLT_EXITING (1u << 7) +#define PROCBASED_INVLPG_EXITING (1u << 9) +#define PROCBASED_MWAIT_EXITING (1u << 10) +#define PROCBASED_RDPMC_EXITING (1u << 11) +#define PROCBASED_RDTSC_EXITING (1u << 12) +#define PROCBASED_CR3_LOAD_EXITING (1u << 15) +#define PROCBASED_CR3_STORE_EXITING (1u << 16) +#define PROCBASED_CR8_LOAD_EXITING (1u << 19) +#define PROCBASED_CR8_STORE_EXITING (1u << 20) +#define PROCBASED_USE_TPR_SHADOW (1u << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1u << 22) +#define PROCBASED_MOV_DR_EXITING (1u << 23) +#define PROCBASED_IO_EXITING (1u << 24) +#define PROCBASED_IO_BITMAPS (1u << 25) +#define PROCBASED_MTF (1u << 27) +#define PROCBASED_MSR_BITMAPS (1u << 28) +#define PROCBASED_MONITOR_EXITING (1u << 29) +#define PROCBASED_PAUSE_EXITING (1u << 30) +#define PROCBASED_SECONDARY_CONTROLS (1U << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1u << 0) +#define PROCBASED2_ENABLE_EPT (1u << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1u << 2) +#define PROCBASED2_ENABLE_RDTSCP (1u << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1u << 4) +#define PROCBASED2_ENABLE_VPID (1u << 5) +#define PROCBASED2_WBINVD_EXITING (1u << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1u << 7) +#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1u << 8) +#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1u << 9) +#define PROCBASED2_PAUSE_LOOP_EXITING (1u << 10) +#define PROCBASED2_RDRAND_EXITING (1u << 11) +#define PROCBASED2_ENABLE_INVPCID (1u << 12) +#define PROCBASED2_RDSEED_EXITING (1u << 16) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1u << 2) +#define VM_EXIT_HOST_LMA (1u << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1u << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1u << 15) +#define VM_EXIT_SAVE_PAT (1u << 18) +#define VM_EXIT_LOAD_PAT (1u << 19) +#define VM_EXIT_SAVE_EFER (1u << 20) +#define VM_EXIT_LOAD_EFER (1u << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1u << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1u << 2) +#define VM_ENTRY_GUEST_LMA (1u << 9) +#define VM_ENTRY_INTO_SMM (1u << 
10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1u << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1u << 13) +#define VM_ENTRY_LOAD_PAT (1u << 14) +#define VM_ENTRY_LOAD_EFER (1u << 15) diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmx_msr.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmx_msr.h new file mode 100644 index 0000000..58d532d --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/intel/vmx_msr.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include +#include +#include + +struct vmx; + +void vmx_msr_init(void); +void vmx_msr_guest_init(struct vmx *vmx, int vcpuid); +int vmx_rdmsr(struct vmx *, int vcpuid, u_int num, uint64_t *val); +int vmx_wrmsr(struct vmx *, int vcpuid, u_int num, uint64_t val); + +int vmx_set_ctlreg(hv_vmx_capability_t cap_field, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vatpic.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vatpic.h new file mode 100644 index 0000000..a17b9db --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vatpic.h @@ -0,0 +1,59 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include + +#define IO_ICU1 0x020 /* 8259A Interrupt Controller #1 */ +#define IO_ICU2 0x0a0 /* 8259A Interrupt Controller #2 */ + +#define ICU_IMR_OFFSET 1 + +#define IO_ELCR1 0x4d0 +#define IO_ELCR2 0x4d1 + +struct vatpic *vatpic_init(struct vm *vm); +void vatpic_cleanup(struct vatpic *vatpic); + +int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); + +int vatpic_assert_irq(struct vm *vm, int irq); +int vatpic_deassert_irq(struct vm *vm, int irq); +int vatpic_pulse_irq(struct vm *vm, int irq); +int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger); + +void vatpic_pending_intr(struct vm *vm, int *vecptr); +void vatpic_intr_accepted(struct vm *vm, int vector); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vatpit.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vatpit.h new file mode 100644 index 0000000..87c61a1 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vatpit.h @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +//#include + +#define NMISC_PORT 0x61 + +struct vm; +struct vatpit; + +struct vatpit *vatpit_init(struct vm *vm); +void vatpit_cleanup(struct vatpit *vatpit); + +int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); +int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vhpet.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vhpet.h new file mode 100644 index 0000000..ea09e6e --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vhpet.h @@ -0,0 +1,46 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +#define VHPET_BASE 0xfed00000 +#define VHPET_SIZE 0x400 + +struct vm; +struct vhpet; + +struct vhpet *vhpet_init(struct vm *vm); +void vhpet_cleanup(struct vhpet *vhpet); +int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, + int size, void *arg); +int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, + int size, void *arg); +int vhpet_getcap(uint32_t *cap); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vioapic.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vioapic.h new file mode 100644 index 0000000..b5df1ba --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vioapic.h @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +#define VIOAPIC_BASE 0xfec00000 +#define VIOAPIC_SIZE 0x1000 + +struct vm; +struct vioapic; + +struct vioapic *vioapic_init(struct vm *vm); +void vioapic_cleanup(struct vioapic *vioapic); + +int vioapic_assert_irq(struct vm *vm, int irq); +int vioapic_deassert_irq(struct vm *vm, int irq); +int vioapic_pulse_irq(struct vm *vm, int irq); + +int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, + uint64_t wval, int size, void *arg); +int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, + uint64_t *rval, int size, void *arg); + +int vioapic_pincount(struct vm *vm); +void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vlapic.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vlapic.h new file mode 100644 index 0000000..e7520c6 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vlapic.h @@ -0,0 +1,110 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include + +struct vm; + +int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu); +int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu); + +/* + * Returns 0 if there is no eligible vector that can be delivered to the + * guest at this time and non-zero otherwise. 
+ * + * If an eligible vector number is found and 'vecptr' is not NULL then it will + * be stored in the location pointed to by 'vecptr'. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + */ +int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'vlapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); + +/* + * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. + */ +int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); + +/* + * Post an interrupt to the vcpu running on 'hostcpu'. This will use a + * hardware assist if available (e.g. Posted Interrupt) or fall back to + * sending an 'ipinum' to interrupt the 'hostcpu'. + */ +void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); + +void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); +void vlapic_fire_cmci(struct vlapic *vlapic); +int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); + +uint64_t vlapic_get_apicbase(struct vlapic *vlapic); +int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); +void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); +bool vlapic_enabled(struct vlapic *vlapic); + +void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec); + +/* Reset the trigger-mode bits for all vectors to be edge-triggered */ +void vlapic_reset_tmr(struct vlapic *vlapic); + +/* + * Set the trigger-mode bit associated with 'vector' to level-triggered if + * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to + * this 'vlapic'. + */ +void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector); + +void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); +uint64_t vlapic_get_cr8(struct vlapic *vlapic); + +/* APIC write handlers */ +void vlapic_id_write_handler(struct vlapic *vlapic); +void vlapic_ldr_write_handler(struct vlapic *vlapic); +void vlapic_dfr_write_handler(struct vlapic *vlapic); +void vlapic_svr_write_handler(struct vlapic *vlapic); +void vlapic_esr_write_handler(struct vlapic *vlapic); +int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); +void vlapic_icrtmr_write_handler(struct vlapic *vlapic); +void vlapic_dcr_write_handler(struct vlapic *vlapic); +void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); +void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vlapic_priv.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vlapic_priv.h new file mode 100644 index 0000000..62d2811 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vlapic_priv.h @@ -0,0 +1,189 @@ +/*- + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include +#include +#include + +/* + * APIC Register: Offset Description + */ +#define APIC_OFFSET_ID 0x20 /* Local APIC ID */ +#define APIC_OFFSET_VER 0x30 /* Local APIC Version */ +#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */ +#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */ +#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */ +#define APIC_OFFSET_EOI 0xB0 /* EOI Register */ +#define APIC_OFFSET_RRR 0xC0 /* Remote read */ +#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */ +#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */ +#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */ +#define APIC_OFFSET_ISR0 0x100 /* In Service Register */ +#define APIC_OFFSET_ISR1 0x110 +#define APIC_OFFSET_ISR2 0x120 +#define APIC_OFFSET_ISR3 0x130 +#define APIC_OFFSET_ISR4 0x140 +#define APIC_OFFSET_ISR5 0x150 +#define APIC_OFFSET_ISR6 0x160 +#define APIC_OFFSET_ISR7 0x170 +#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */ +#define APIC_OFFSET_TMR1 0x190 +#define APIC_OFFSET_TMR2 0x1A0 +#define APIC_OFFSET_TMR3 0x1B0 +#define APIC_OFFSET_TMR4 0x1C0 +#define APIC_OFFSET_TMR5 0x1D0 +#define APIC_OFFSET_TMR6 0x1E0 +#define APIC_OFFSET_TMR7 0x1F0 +#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */ +#define APIC_OFFSET_IRR1 0x210 +#define APIC_OFFSET_IRR2 0x220 +#define APIC_OFFSET_IRR3 0x230 +#define APIC_OFFSET_IRR4 0x240 +#define APIC_OFFSET_IRR5 0x250 +#define APIC_OFFSET_IRR6 0x260 +#define APIC_OFFSET_IRR7 0x270 +#define APIC_OFFSET_ESR 0x280 /* Error Status Register */ +#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */ +#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */ +#define APIC_OFFSET_ICR_HI 0x310 +#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */ +#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */ +#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */ +#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */ +#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */ +#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */ +#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */ +#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */ +#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */ +#define APIC_OFFSET_SELF_IPI 0x3F0 /* Self IPI register */ + +#define VLAPIC_CTR0(vlapic, format) \ + VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define 
VLAPIC_CTR1(vlapic, format, p1) \ + VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + +#define VLAPIC_CTR3(vlapic, format, p1, p2, p3) \ + VCPU_CTR3((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2, p3) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *x = &(vlapic)->apic_page->irr0; \ + x[0] = x[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", x[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", x[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", x[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", x[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", x[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", x[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", x[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", x[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *x = &(vlapic)->apic_page->isr0; \ + x[0] = x[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", x[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", x[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", x[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", x[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", x[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", x[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", x[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", x[7 << 2]); \ +} while (0) + +enum boot_state { + BS_INIT, + BS_SIPI, + BS_RUNNING +}; + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI + +struct vlapic; + +struct vlapic_ops { + int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level); + int (*pending_intr)(struct vlapic *vlapic, int *vecptr); + void (*intr_accepted)(struct vlapic *vlapic, int vector); + void (*post_intr)(struct vlapic *vlapic, int hostcpu); + void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); + void (*enable_x2apic_mode)(struct vlapic *vlapic); +}; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct vlapic { + struct vm *vm; + int vcpuid; + struct LAPIC *apic_page; + struct vlapic_ops ops; + uint32_t esr_pending; + int esr_firing; + struct callout callout; /* vlapic timer */ + struct bintime timer_fire_bt; /* callout expiry time */ + struct bintime timer_freq_bt; /* timer frequency */ + struct bintime timer_period_bt; /* timer period */ + OSSpinLock timer_lock; + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. + */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; + uint64_t msr_apicbase; + enum boot_state boot_state; + /* + * Copies of some registers in the virtual APIC page. We do this for + * a couple of different reasons: + * - to be able to detect what changed (e.g. svr_last) + * - to maintain a coherent snapshot of the register (e.g. 
lvt_last) + */ + uint32_t svr_last; + uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; +}; +#pragma clang diagnostic pop + +void vlapic_init(struct vlapic *vlapic); +void vlapic_cleanup(struct vlapic *vlapic); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vpmtmr.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vpmtmr.h new file mode 100644 index 0000000..06caf53 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vpmtmr.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +#define IO_PMTMR 0x408 + +struct vm; +struct vpmtmr; + +struct vpmtmr *vpmtmr_init(struct vm *vm); +void vpmtmr_cleanup(struct vpmtmr *pmtmr); + +int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vrtc.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vrtc.h new file mode 100644 index 0000000..334a628 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/io/vrtc.h @@ -0,0 +1,52 @@ +/*- + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include + +#define IO_RTC 0x070 /* 4990A RTC */ + +struct vm; +struct vrtc; + +struct vrtc *vrtc_init(struct vm *vm); +void vrtc_cleanup(struct vrtc *vrtc); +void vrtc_reset(struct vrtc *vrtc); + +time_t vrtc_get_time(struct vm *vm); +int vrtc_set_time(struct vm *vm, time_t secs); +int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); +int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); + +int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); +int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm.h new file mode 100644 index 0000000..7b72e8a --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm.h @@ -0,0 +1,314 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +struct vm; +struct vm_exception; +struct vm_memory_segment; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vhpet; +struct vioapic; +struct vlapic; +struct vmspace; +struct vm_object; +struct vm_guest_paging; +struct pmap; + +typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_cleanup_func_t)(void); +typedef void *(*vmi_vm_init_func_t)(struct vm *vm); +typedef int (*vmi_vcpu_init_func_t)(void *vmi, int vcpu); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + void *rendezvous_cookie, void *suspend_cookie); +typedef void (*vmi_vm_cleanup_func_t)(void *vmi); +typedef void (*vmi_vcpu_cleanup_func_t)(void *vmi, int vcpu); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +typedef void (*vmi_interrupt)(int vcpu); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + vmi_vm_init_func_t vm_init; /* vm-specific initialization */ + vmi_vcpu_init_func_t vcpu_init; + vmi_run_func_t vmrun; + vmi_vm_cleanup_func_t vm_cleanup; + vmi_vcpu_cleanup_func_t vcpu_cleanup; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; + vmi_vlapic_init vlapic_init; + vmi_vlapic_cleanup vlapic_cleanup; + vmi_interrupt vcpu_interrupt; +}; + +extern struct vmm_ops vmm_ops_intel; + +int vmm_init(void); +int vmm_cleanup(void); +int vm_create(struct vm **retvm); +int vcpu_create(struct vm *vm, int vcpu); +void vm_destroy(struct vm *vm); +void vcpu_destroy(struct vm *vm, int vcpu); +int vm_reinit(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, uint64_t gpa, size_t len); +void *vm_gpa2hva(struct vm *vm, uint64_t gpa, uint64_t len); +int vm_gpabase2memseg(struct vm *vm, uint64_t gpabase, + struct vm_memory_segment *seg); +int vm_get_memobj(struct vm *vm, uint64_t gpa, size_t len, uint64_t *offset, + void **object); +bool vm_mem_allocated(struct vm *vm, uint64_t gpa); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); +int vm_run(struct vm *vm, int vcpu, struct vm_exit *vm_exit); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); +int vm_inject_nmi(struct vm 
*vm, int vcpu); +int vm_nmi_pending(struct vm *vm, int vcpuid); +void vm_nmi_clear(struct vm *vm, int vcpuid); +int vm_inject_extint(struct vm *vm, int vcpu); +int vm_extint_pending(struct vm *vm, int vcpuid); +void vm_extint_clear(struct vm *vm, int vcpuid); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +struct vioapic *vm_ioapic(struct vm *vm); +struct vhpet *vm_hpet(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); +int vm_apicid2vcpuid(struct vm *vm, int apicid); +int vm_activate_cpu(struct vm *vm, int vcpu); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); + +/* + * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. + * The rendezvous 'func(arg)' is not allowed to do anything that will + * cause the thread to be put to sleep. + * + * If the rendezvous is being initiated from a vcpu context then the + * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. + * + * The caller cannot hold any locks when initiating the rendezvous. + * + * The implementation of this API may cause vcpus other than those specified + * by 'dest' to be stalled. The caller should not rely on any vcpus making + * forward progress when the rendezvous is in progress. + */ +typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); +void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, + vm_rendezvous_func_t func, void *arg); +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); + +static __inline int +vcpu_rendezvous_pending(void *rendezvous_cookie) +{ + + return (*(uintptr_t *)rendezvous_cookie != 0); +} + +static __inline int +vcpu_suspended(void *suspend_cookie) +{ + + return (*(int *)suspend_cookie); +} + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu) +{ + return (vcpu_get_state(vm, vcpu) == VCPU_RUNNING); +} + +void *vcpu_stats(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +struct vatpic *vm_atpic(struct vm *vm); +struct vatpit *vm_atpit(struct vm *vm); +struct vpmtmr *vm_pmtmr(struct vm *vm); +struct vrtc *vm_rtc(struct vm *vm); + +/* + * Inject exception 'vector' into the guest vcpu. This function returns 0 on + * success and non-zero on failure. + * + * Wrapper functions like 'vm_inject_gp()' should be preferred to calling + * this function directly because they enforce the trap-like or fault-like + * behavior of an exception. + * + * This function should only be called in the context of the thread that is + * executing this vcpu. + */ +int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, + uint32_t errcode, int restart_instruction); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. 
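
The rendezvous contract spelled out above is easiest to read against a concrete caller: choose the destination set, pass `vcpuid == -1` when initiating from a non-vcpu thread, hold no locks, and keep the callback non-sleeping. A minimal sketch under those rules; the callback and its bookkeeping are invented for illustration.

```c
#include <xhyve/vmm/vmm.h>	/* assumed vendored include path */

/* Hypothetical per-vcpu action; must not sleep or take locks. */
static void
poke_vcpu(struct vm *vm, int vcpuid, void *arg)
{
	int *count = arg;

	(void)vm; (void)vcpuid;
	__atomic_fetch_add(count, 1, __ATOMIC_SEQ_CST);
}

static void
poke_all(struct vm *vm)
{
	int count = 0;

	/* Initiated off any vcpu thread, hence vcpuid == -1. */
	vm_smp_rendezvous(vm, -1, vm_active_cpus(vm), poke_vcpu, &count);
}
```
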
+ * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. + */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout. + * + * retval is_fault Intepretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Unrecoverable error + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. + */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo, int *is_fault); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len); + +int vcpu_trace_exceptions(void); + +/* APIs to inject faults into the guest */ +void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, + int errcode); + +static __inline void +vm_inject_ud(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +static __inline void +vm_inject_gp(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +static __inline void +vm_inject_ac(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +static __inline void +vm_inject_ss(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); + +int vm_restart_instruction(void *vm, int vcpuid); + +#pragma clang diagnostic pop diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_api.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_api.h new file mode 100644 index 0000000..1cfd7f0 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_api.h @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include +#include + +struct iovec; + +/* + * Different styles of mapping the memory assigned to a VM into the address + * space of the controlling process. + */ +enum vm_mmap_style { + VM_MMAP_NONE, /* no mapping */ + VM_MMAP_ALL, /* fully and statically mapped */ + VM_MMAP_SPARSE, /* mappings created on-demand */ +}; + +int xh_vm_create(void); +void xh_vm_destroy(void); +int xh_vcpu_create(int vcpu); +void xh_vcpu_destroy(int vcpu); +int xh_vm_get_memory_seg(uint64_t gpa, size_t *ret_len); +int xh_vm_setup_memory(size_t len, enum vm_mmap_style vms); +void *xh_vm_map_gpa(uint64_t gpa, size_t len); +int xh_vm_gla2gpa(int vcpu, struct vm_guest_paging *paging, uint64_t gla, + int prot, uint64_t *gpa, int *fault); +uint32_t xh_vm_get_lowmem_limit(void); +void xh_vm_set_lowmem_limit(uint32_t limit); +void xh_vm_set_memflags(int flags); +size_t xh_vm_get_lowmem_size(void); +size_t xh_vm_get_highmem_size(void); +int xh_vm_set_desc(int vcpu, int reg, uint64_t base, uint32_t limit, + uint32_t access); +int xh_vm_get_desc(int vcpu, int reg, uint64_t *base, uint32_t *limit, + uint32_t *access); +int xh_vm_get_seg_desc(int vcpu, int reg, struct seg_desc *seg_desc); +int xh_vm_set_register(int vcpu, int reg, uint64_t val); +int xh_vm_get_register(int vcpu, int reg, uint64_t *retval); +int xh_vm_run(int vcpu, struct vm_exit *ret_vmexit); +int xh_vm_suspend(enum vm_suspend_how how); +int xh_vm_reinit(void); +int xh_vm_apicid2vcpu(int apicid); +int xh_vm_inject_exception(int vcpu, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction); +int xh_vm_lapic_irq(int vcpu, int vector); +int xh_vm_lapic_local_irq(int vcpu, int vector); +int xh_vm_lapic_msi(uint64_t addr, uint64_t msg); +int xh_vm_ioapic_assert_irq(int irq); +int xh_vm_ioapic_deassert_irq(int irq); +int xh_vm_ioapic_pulse_irq(int irq); +int xh_vm_ioapic_pincount(int *pincount); +int xh_vm_isa_assert_irq(int atpic_irq, int ioapic_irq); +int xh_vm_isa_deassert_irq(int atpic_irq, int ioapic_irq); +int xh_vm_isa_pulse_irq(int atpic_irq, int ioapic_irq); +int xh_vm_isa_set_irq_trigger(int atpic_irq, enum vm_intr_trigger trigger); +int xh_vm_inject_nmi(int vcpu); +int xh_vm_capability_name2type(const char *capname); +const char *xh_vm_capability_type2name(int type); +int xh_vm_get_capability(int vcpu, enum vm_cap_type cap, int *retval); +int xh_vm_set_capability(int vcpu, 
enum vm_cap_type cap, int val); +int xh_vm_get_intinfo(int vcpu, uint64_t *i1, uint64_t *i2); +int xh_vm_set_intinfo(int vcpu, uint64_t exit_intinfo); +uint64_t *xh_vm_get_stats(int vcpu, struct timeval *ret_tv, int *ret_entries); +const char *xh_vm_get_stat_desc(int index); +int xh_vm_get_x2apic_state(int vcpu, enum x2apic_state *s); +int xh_vm_set_x2apic_state(int vcpu, enum x2apic_state s); +int xh_vm_get_hpet_capabilities(uint32_t *capabilities); +int xh_vm_copy_setup(int vcpu, struct vm_guest_paging *pg, uint64_t gla, + size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); +void xh_vm_copyin(struct iovec *iov, void *dst, size_t len); +void xh_vm_copyout(const void *src, struct iovec *iov, size_t len); +int xh_vm_rtc_write(int offset, uint8_t value); +int xh_vm_rtc_read(int offset, uint8_t *retval); +int xh_vm_rtc_settime(time_t secs); +int xh_vm_rtc_gettime(time_t *secs); +int xh_vcpu_reset(int vcpu); +int xh_vm_active_cpus(cpuset_t *cpus); +int xh_vm_suspended_cpus(cpuset_t *cpus); +int xh_vm_activate_cpu(int vcpu); +int xh_vm_restart_instruction(int vcpu); +int xh_vm_emulate_instruction(int vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_callout.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_callout.h new file mode 100644 index 0000000..4068bbd --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_callout.h @@ -0,0 +1,117 @@ +#pragma once + +#include +#include +#include +#include + +#define SBT_1S ((sbintime_t)1 << 32) +#define SBT_1M (SBT_1S * 60) +#define SBT_1MS (SBT_1S / 1000) +#define SBT_1US (SBT_1S / 1000000) +#define SBT_1NS (SBT_1S / 1000000000) +#define SBT_MAX 0x7fffffffffffffffLL + +#define FREQ2BT(freq, bt) \ +{ \ + (bt)->sec = 0; \ + (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ +} + +#define BT2FREQ(bt) \ + (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ + ((bt)->frac >> 1)) + +struct bintime { + uint64_t sec; + uint64_t frac; +}; + +typedef int64_t sbintime_t; + +static inline sbintime_t bttosbt(const struct bintime bt) { + return (sbintime_t) ((bt.sec << 32) + (bt.frac >> 32)); +} + +static inline void bintime_mul(struct bintime *bt, unsigned int x) { + uint64_t p1, p2; + + p1 = (bt->frac & 0xffffffffull) * x; + p2 = (bt->frac >> 32) * x + (p1 >> 32); + bt->sec *= x; + bt->sec += (p2 >> 32); + bt->frac = (p2 << 32) | (p1 & 0xffffffffull); +} + +static inline void bintime_add(struct bintime *_bt, const struct bintime *_bt2) +{ + uint64_t _u; + + _u = _bt->frac; + _bt->frac += _bt2->frac; + if (_u > _bt->frac) + _bt->sec++; + _bt->sec += _bt2->sec; +} + +static inline void bintime_sub(struct bintime *_bt, const struct bintime *_bt2) +{ + uint64_t _u; + + _u = _bt->frac; + _bt->frac -= _bt2->frac; + if (_u < _bt->frac) + _bt->sec--; + _bt->sec -= _bt2->sec; +} + +#define bintime_cmp(a, b, cmp) \ + (((a)->sec == (b)->sec) ? 
\ + ((a)->frac cmp (b)->frac) : \ + ((a)->sec cmp (b)->sec)) + + +void binuptime(struct bintime *bt); +void getmicrotime(struct timeval *tv); + +static inline sbintime_t sbinuptime(void) { + struct bintime _bt; + + binuptime(&_bt); + return (bttosbt(_bt)); +} + +struct callout { + pthread_cond_t wait; + struct callout *prev; + struct callout *next; + uint64_t timeout; + void *argument; + void (*callout)(void *); + int flags; + int queued; +}; + +#define C_ABSOLUTE 0x0200 /* event time is absolute */ +#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ +#define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */ +#define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ +#define CALLOUT_RETURNUNLOCKED 0x0010 /* handler returns with mtx unlocked */ +#define CALLOUT_COMPLETED 0x0020 /* callout thread finished */ +#define CALLOUT_WAITING 0x0040 /* thread waiting for callout to finish */ +//#define CALLOUT_QUEUED 0x0080 + +void callout_system_init(void); +void callout_init(struct callout *c, int mpsafe); +int callout_reset_sbt(struct callout *c, sbintime_t sbt, + sbintime_t precision, void (*ftn)(void *), void *arg, + int flags); + +int callout_stop_safe(struct callout *c, int drain); + +#define callout_active(c) ((c)->flags & CALLOUT_ACTIVE) +#define callout_deactivate(c) ((c)->flags &= ~CALLOUT_ACTIVE) +#define callout_pending(c) ((c)->flags & CALLOUT_PENDING) +#define callout_completed(c) ((c)->flags & CALLOUT_COMPLETED) +#define callout_drain(c) callout_stop_safe(c, 1) +#define callout_stop(c) callout_stop_safe(c, 0) diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_common.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_common.h new file mode 100644 index 0000000..f5c5326 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_common.h @@ -0,0 +1,322 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
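
vmm_callout.h is the userspace stand-in for FreeBSD's kernel callout(9) that the timer devices in this port (vhpet, vatpit, vrtc) are built on: timeouts are `sbintime_t` values, and a periodic timer is simply a handler that re-arms itself. A minimal sketch, with the 10 ms period and the handler body as placeholders:

```c
#include <xhyve/vmm/vmm_callout.h>	/* assumed vendored include path */

static struct callout tick_callout;

/* Re-arming handler: fires roughly every 10 ms until callout_stop(). */
static void
tick(void *arg)
{
	callout_reset_sbt(&tick_callout, 10 * SBT_1MS, 0, tick, arg, 0);
}

static void
start_tick(void)
{
	callout_system_init();		/* once per process */
	callout_init(&tick_callout, 1);	/* mpsafe */
	callout_reset_sbt(&tick_callout, 10 * SBT_1MS, 0, tick, NULL, 0);
}
```
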
+ * + * $FreeBSD$ + */ + +#pragma once + +#include + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" + +#define VM_MAXCPU 16 /* maximum virtual cpus */ + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, + VM_SUSPEND_LAST +}; + +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_MAX +}; + +enum vm_intr_trigger { + EDGE_TRIGGER, + LEVEL_TRIGGER +}; + +enum x2apic_state { + X2APIC_DISABLED, + X2APIC_ENABLED, + X2APIC_STATE_LAST +}; + +enum vm_cpu_mode { + CPU_MODE_REAL, + CPU_MODE_PROTECTED, + CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ + CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ +}; + +enum vm_paging_mode { + PAGING_MODE_FLAT, + PAGING_MODE_32, + PAGING_MODE_PAE, + PAGING_MODE_64, +}; + +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; + +#define SEG_DESC_TYPE(access) ((access) & 0x001f) +#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) +#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) +#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) +#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) +#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) + +struct vm_guest_paging { + uint64_t cr3; + int cpl; + enum vm_cpu_mode cpu_mode; + enum vm_paging_mode paging_mode; +}; + +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_GUEST_CR2, + VM_REG_GUEST_PDPTE0, + VM_REG_GUEST_PDPTE1, + VM_REG_GUEST_PDPTE2, + VM_REG_GUEST_PDPTE3, + VM_REG_GUEST_INTR_SHADOW, + VM_REG_LAST +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_PAGING, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ + VM_EXITCODE_RENDEZVOUS, + VM_EXITCODE_IOAPIC_EOI, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_INOUT_STR, + VM_EXITCODE_TASK_SWITCH, + VM_EXITCODE_MONITOR, + VM_EXITCODE_MWAIT, + VM_EXITCODE_MAX +}; + +struct vm_inout { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ +}; + +struct vm_inout_str { + struct vm_inout inout; /* must be the first element */ + struct vm_guest_paging paging; + uint64_t rflags; + uint64_t cr0; + uint64_t index; + uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ + int addrsize; + enum vm_reg_name seg_name; + struct seg_desc seg_desc; +}; + +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. 
MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + uint8_t addrsize:4, opsize:4; /* address and operand sizes */ + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1, + rex_present:1, + repz_present:1, /* REP/REPE/REPZ prefix */ + repnz_present:1, /* REPNE/REPNZ prefix */ + opsize_override:1, /* Operand size override */ + addrsize_override:1, /* Address size override */ + segment_override:1; /* Segment override */ + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + uint8_t disp_bytes; + uint8_t imm_bytes; + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + int segment_register; /* VM_REG_GUEST_xyz */ + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + uint8_t decoded; /* set to 1 if successfully decoded */ + struct vie_op op; /* opcode description */ +}; + +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; + struct vm_guest_paging paging; +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct vm_inout inout; + struct vm_inout_str inout_str; + struct { + uint64_t gpa; + int fault_type; + } paging; + struct { + uint64_t gpa; + uint64_t gla; + uint64_t cs_base; + int cs_d; /* CS.D */ + struct vm_guest_paging paging; + struct vie vie; + } inst_emul; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int status; /* vmx inst status */ + /* + * 'exit_reason' and 'exit_qualification' are valid + * only if 'status' is zero. + */ + uint32_t exit_reason; + uint64_t exit_qualification; + /* + * 'inst_error' and 'inst_type' are valid + * only if 'status' is non-zero. 
+ */ + int inst_type; + int inst_error; + } vmx; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + struct { + int vcpu; + uint64_t rip; + } spinup_ap; + struct { + uint64_t rflags; + } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; + } u; +}; + +/* FIXME remove */ +struct vm_memory_segment { + uint64_t gpa; /* in */ + size_t len; +}; + +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); + +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +uint64_t vie_size2mask(int size); + +int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, + uint64_t *gla); + +int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, + uint64_t rflags, uint64_t gla); + +#pragma clang diagnostic pop diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_host.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_host.h new file mode 100644 index 0000000..493d399 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_host.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct xsave_limits { + int xsave_enabled; + uint64_t xcr0_allowed; + uint32_t xsave_max_size; +}; +#pragma clang diagnostic pop + +void vmm_host_state_init(void); + +const struct xsave_limits *vmm_get_xsave_limits(void); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_instruction_emul.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_instruction_emul.h new file mode 100644 index 0000000..1097b0c --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_instruction_emul.h @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. 
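
With `struct vm_exit` now complete, the shape of a vcpu loop over the single-VM API from vmm_api.h earlier in this diff becomes clear: run until an exit, switch on the exitcode, handle it, repeat. A minimal sketch (the handler stubs and the abort-on-error policy are illustrative, not xhyve's actual loop):

```c
#include <stdlib.h>
#include <xhyve/vmm/vmm_api.h>	/* assumed vendored include paths */
#include <xhyve/vmm/vmm_common.h>

static void
vcpu_loop(int vcpu)
{
	struct vm_exit vmexit;

	for (;;) {
		if (xh_vm_run(vcpu, &vmexit) != 0)
			abort();	/* illustrative error policy */

		switch (vmexit.exitcode) {
		case VM_EXITCODE_INOUT:
			/* port I/O: vmexit.u.inout carries port/bytes/in/eax */
			break;
		case VM_EXITCODE_INST_EMUL:
			/* MMIO: decode vmexit.u.inst_emul.vie, then emulate */
			break;
		case VM_EXITCODE_HLT:
			return;		/* illustrative: park this vcpu */
		default:
			break;
		}
	}
}
```
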
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +/* + * Emulate the decoded 'vie' instruction. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * s + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t mrr, + mem_region_write_t mrw, void *mrarg); + +int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size); + +/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */ +int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); + +/* + * APIs to fetch and decode the instruction from nested page fault handler. + * + * 'vie' must be initialized before calling 'vmm_fetch_instruction()' + */ +int vmm_fetch_instruction(struct vm *vm, int cpuid, + struct vm_guest_paging *guest_paging, + uint64_t rip, int inst_length, struct vie *vie, + int *is_fault); + +/* + * Translate the guest linear address 'gla' to a guest physical address. + * + * retval is_fault Interpretation + * 0 0 'gpa' contains result of the translation + * 0 1 An exception was injected into the guest + * EFAULT N/A An unrecoverable hypervisor error occurred + */ +int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); + +/* + * Decode the instruction fetched into 'vie' so it can be emulated. + * + * 'gla' is the guest linear address provided by the hardware assist + * that caused the nested page table fault. It is used to verify that + * the software instruction decoding is in agreement with the hardware. + * + * Some hardware assists do not provide the 'gla' to the hypervisor. + * To skip the 'gla' verification for this or any other reason pass + * in VIE_INVALID_GLA instead. 
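
The comments above describe a three-step pipeline for handling a nested-page-fault (MMIO) exit: fetch the instruction bytes at the faulting %rip, decode them into the vie, then emulate against the device's read/write callbacks. A minimal sketch of that sequence under the documented contract (`vie_init()` before fetching, VIE_INVALID_GLA when the assist supplies no linear address); the callbacks and error policy are placeholders.

```c
#include <errno.h>
#include <stdint.h>
#include <xhyve/vmm/vmm.h>	/* assumed vendored include paths */
#include <xhyve/vmm/vmm_instruction_emul.h>

/* Placeholder device callbacks; bodies omitted. */
static int mmio_read(void *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize, void *arg);
static int mmio_write(void *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize, void *arg);

static int
handle_mmio_fault(struct vm *vm, int vcpuid, uint64_t rip, uint64_t gpa,
    int cs_d, struct vm_guest_paging *paging)
{
	struct vie vie;
	int error, fault;

	vie_init(&vie, NULL, 0);	/* initialize before fetching */
	error = vmm_fetch_instruction(vm, vcpuid, paging, rip,
	    VIE_INST_SIZE, &vie, &fault);
	if (error || fault)
		return (error);

	/* No hardware-provided linear address here, so skip the check. */
	error = vmm_decode_instruction(vm, vcpuid, VIE_INVALID_GLA,
	    paging->cpu_mode, cs_d, &vie);
	if (error)
		return (EINVAL);

	return (vmm_emulate_instruction(vm, vcpuid, gpa, &vie, paging,
	    mmio_read, mmio_write, NULL));
}
```
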
+ */ +#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ +int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_ioport.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_ioport.h new file mode 100644 index 0000000..e6f1675 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_ioport.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +struct vm; +struct vm_exit; + +typedef int (*ioport_handler_func_t)(struct vm *vm, int vcpuid, + bool in, int port, int bytes, uint32_t *val); + +int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_ktr.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_ktr.h new file mode 100644 index 0000000..acfe5b6 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_ktr.h @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +#ifdef XHYVE_CONFIG_TRACE +#define vmmtrace printf +#else +#define vmmtrace if (0) printf +#endif + +struct vm; +extern const char *vm_name(struct vm *vm); + +#define VCPU_CTR0(vm, vcpuid, format) \ +vmmtrace("vm %s[%d]: " format "\n", vm_name((vm)), (vcpuid)) + +#define VCPU_CTR1(vm, vcpuid, format, p1) \ +vmmtrace("vm %s[%d]: " format "\n", vm_name((vm)), (vcpuid), (p1)) + +#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \ +vmmtrace("vm %s[%d]: " format "\n", vm_name((vm)), (vcpuid), (p1), (p2)) + +#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ +vmmtrace("vm %s[%d]: " format "\n", vm_name((vm)), (vcpuid), (p1), (p2), (p3)) + +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ +vmmtrace("vm %s[%d]: " format "\n", vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + +#define VM_CTR0(vm, format) \ +vmmtrace("vm %s: " format "\n", vm_name((vm))) + +#define VM_CTR1(vm, format, p1) \ +vmmtrace("vm %s: " format "\n", vm_name((vm)), (p1)) + +#define VM_CTR2(vm, format, p1, p2) \ +vmmtrace("vm %s: " format "\n", vm_name((vm)), (p1), (p2)) + +#define VM_CTR3(vm, format, p1, p2, p3) \ +vmmtrace("vm %s: " format "\n", vm_name((vm)), (p1), (p2), (p3)) + +#define VM_CTR4(vm, format, p1, p2, p3, p4) \ +vmmtrace("vm %s: " format "\n", vm_name((vm)), (p1), (p2), (p3), (p4)) diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_lapic.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_lapic.h new file mode 100644 index 0000000..e7df042 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_lapic.h @@ -0,0 +1,76 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
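
vmm_ktr.h mirrors the kernel KTR interface as plain printf tracing: the macros are active only when XHYVE_CONFIG_TRACE is defined, and the trailing digit in each name encodes the number of format arguments. A small illustrative call site (the message and its context are invented):

```c
#include <stdint.h>
#include <xhyve/vmm/vmm_ktr.h>	/* assumed vendored include path */

static void
note_wrmsr(struct vm *vm, int vcpuid, uint32_t num)
{
	/* Prints "vm <name>[<vcpuid>]: ..." under XHYVE_CONFIG_TRACE;
	 * compiles away otherwise. */
	VCPU_CTR1(vm, vcpuid, "unhandled wrmsr 0x%x", num);
}
```
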
+ * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include + +struct vm; + +bool lapic_msr(u_int num); +int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, + bool *retu); +int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval, + bool *retu); + +int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); +int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, + uint64_t wval, int size, void *arg); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector, bool trig); + +#define LAPIC_TRIG_LEVEL true +#define LAPIC_TRIG_EDGE false +static __inline int +lapic_intr_level(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_LEVEL)); +} + +static __inline int +lapic_intr_edge(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE)); +} + +/* + * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can + * be set to -1 to trigger the interrupt on all CPUs. + */ +int lapic_set_local_intr(struct vm *vm, int cpu, int vector); + +int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_mem.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_mem.h new file mode 100644 index 0000000..6efbcf4 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_mem.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +struct vmspace; + +int vmm_mem_init(void); +void *vmm_mem_alloc(uint64_t gpa, size_t size); +void vmm_mem_free(uint64_t gpa, size_t size, void *object); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_stat.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_stat.h new file mode 100644 index 0000000..dbd1dc9 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_stat.h @@ -0,0 +1,185 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. 
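
vmm_lapic.h, just above, gives device models three ways to raise guest interrupts: fixed vectors latched into a vcpu's IRR (edge- or level-triggered), LVT-local interrupts (`cpu == -1` broadcasts), and MSI, where the address/data pair encodes destination and vector exactly as on bare metal. A minimal sketch; the vector numbers and target vcpu are illustrative.

```c
#include <xhyve/vmm/vmm_lapic.h>	/* assumed vendored include path */

static void
raise_irqs(struct vm *vm)
{
	/* Edge-triggered fixed interrupt, vector 0x30, to vcpu 0. */
	(void) lapic_intr_edge(vm, 0, 0x30);

	/* MSI: address 0xfee00000 targets APIC ID 0; the data word's
	 * low byte (0x31) selects the vector. */
	(void) lapic_intr_msi(vm, 0xfee00000ULL, 0x31);
}
```
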
+ * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +struct vm; + +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, + VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */ + VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ +}; + +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, + struct vmm_stat_type *stat); + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + vmm_stat_func_t func; + enum vmm_stat_scope scope; +}; +#pragma clang diagnostic pop + +void vmm_stat_register(void *arg); + +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, func, scope } \ + } + //}; \ + // SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) +#define VMM_STAT_INTEL(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL) + +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, size_t buflen); + +static void __inline +vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t x) +{ 
+#ifdef XHYVE_CONFIG_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#else + (void) vm; + (void) vcpu; + (void) vst; + (void) statidx; + (void) x; +#endif +} + +static void __inline +vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t val) +{ +#ifdef XHYVE_CONFIG_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] = val; +#else + (void) vm; + (void) vcpu; + (void) vst; + (void) statidx; + (void) val; +#endif +} + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef XHYVE_CONFIG_STATS + vmm_stat_array_incr(vm, vcpu, vst, 0, x); +#else + (void) vm; + (void) vcpu; + (void) vst; + (void) x; +#endif +} + +static void __inline +vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef XHYVE_CONFIG_STATS + vmm_stat_array_set(vm, vcpu, vst, 0, val); +#else + (void) vm; + (void) vcpu; + (void) vst; + (void) val; +#endif +} + +VMM_STAT_DECLARE(VCPU_MIGRATIONS); +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_EXTINT); +VMM_STAT_DECLARE(VMEXIT_HLT); +VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); +VMM_STAT_DECLARE(VMEXIT_RDMSR); +VMM_STAT_DECLARE(VMEXIT_WRMSR); +VMM_STAT_DECLARE(VMEXIT_MTRAP); +VMM_STAT_DECLARE(VMEXIT_PAUSE); +VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); +VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); +VMM_STAT_DECLARE(VMEXIT_INOUT); +VMM_STAT_DECLARE(VMEXIT_CPUID); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_INST_EMUL); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_ASTPENDING); +VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); +VMM_STAT_DECLARE(VMEXIT_EXCEPTION); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_util.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_util.h new file mode 100644 index 0000000..ca71faf --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/vmm_util.h @@ -0,0 +1,33 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
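
The statistics machinery above splits declaration (VMM_STAT_DECLARE in headers) from definition (VMM_STAT and friends in one .c file), and every helper collapses to a no-op unless XHYVE_CONFIG_STATS is set. Note also that the SYSINIT hook is commented out in this port, so a stat type has to be registered explicitly. A minimal sketch with an invented counter name:

```c
#include <xhyve/vmm/vmm_stat.h>	/* assumed vendored include path */

/* Definition (one .c file); a shared header would carry
 * VMM_STAT_DECLARE(VMEXIT_EXAMPLE). */
VMM_STAT(VMEXIT_EXAMPLE, "illustrative exit counter");

static void
stats_init(void)
{
	/* The SYSINIT that would self-register is commented out in
	 * this port, so register the stat type explicitly, once. */
	vmm_stat_register(VMEXIT_EXAMPLE);
}

static void
count_example_exit(struct vm *vm, int vcpu)
{
	/* No-op unless built with XHYVE_CONFIG_STATS. */
	vmm_stat_incr(vm, vcpu, VMEXIT_EXAMPLE, 1);
}
```
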
+ * + * $FreeBSD$ + */ + +#pragma once + +struct trapframe; + +void dump_trapframe(struct trapframe *tf); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/x86.h b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/x86.h new file mode 100644 index 0000000..9b76974 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/vmm/x86.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/xhyve.h b/vendor/github.com/hooklift/xhyve/include/xhyve/xhyve.h new file mode 100644 index 0000000..f9ae309 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/xhyve.h @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include + +#ifndef CTASSERT /* Allow lint to override */ +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] +#endif + +#define VMEXIT_CONTINUE (0) +#define VMEXIT_ABORT (-1) + +extern int guest_ncpus; +extern char *guest_uuid_str; +extern char *vmname; +extern bool exit_mevent_dispatch_loop; + +void xh_vm_inject_fault(int vcpu, int vector, int errcode_valid, + uint32_t errcode); + +static __inline void +vm_inject_ud(int vcpuid) +{ + xh_vm_inject_fault(vcpuid, IDT_UD, 0, 0); +} + +static __inline void +vm_inject_gp(int vcpuid) +{ + xh_vm_inject_fault(vcpuid, IDT_GP, 1, 0); +} + +static __inline void +vm_inject_ac(int vcpuid, uint32_t errcode) +{ + xh_vm_inject_fault(vcpuid, IDT_AC, 1, errcode); +} + +static __inline void +vm_inject_ss(int vcpuid, uint32_t errcode) +{ + xh_vm_inject_fault(vcpuid, IDT_SS, 1, errcode); +} + +void *paddr_guest2host(uintptr_t addr, size_t len); + +void vcpu_set_capabilities(int cpu); +void vcpu_add(int fromcpu, int newcpu, uint64_t rip); +int fbsdrun_vmexit_on_hlt(void); +int fbsdrun_vmexit_on_pause(void); +int fbsdrun_virtio_msix(void); +int run_xhyve(int argc, char *argv[]); +extern void go_callback_exit(int status); diff --git a/vendor/github.com/hooklift/xhyve/include/xhyve/xmsr.h b/vendor/github.com/hooklift/xhyve/include/xhyve/xmsr.h new file mode 100644 index 0000000..ff793ff --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/include/xhyve/xmsr.h @@ -0,0 +1,35 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
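
xhyve.h, completed just above, is the embedding surface this vendoring exists for: rather than running an xhyve binary, a host program builds an argv and calls `run_xhyve()` directly, with `go_callback_exit()` as the hook back into the embedding runtime. A minimal C sketch; the option list is illustrative only, since a bootable configuration also needs firmware and device-slot options.

```c
#include <stdio.h>
#include <xhyve/xhyve.h>	/* assumed vendored include path */

int
main(void)
{
	/* Illustrative argv; a real invocation adds -f/-s options. */
	char *argv[] = { "xhyve", "-c", "1", "-m", "1G", NULL };
	int rc = run_xhyve(5, argv);

	if (rc != 0)
		fprintf(stderr, "run_xhyve failed: %d\n", rc);
	return (rc);
}
```
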
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +int init_msr(void); +int emulate_wrmsr(int vcpu, uint32_t code, uint64_t val); +int emulate_rdmsr(int vcpu, uint32_t code, uint64_t *val); diff --git a/vendor/github.com/hooklift/xhyve/inout.c b/vendor/github.com/hooklift/xhyve/inout.c new file mode 100644 index 0000000..1ba8e99 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/inout.c @@ -0,0 +1,318 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +SET_DECLARE(inout_port_set, struct inout_port); + +#define MAX_IOPORTS (1 << 16) + +#define VERIFY_IOPORT(port, size) \ + assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS) + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +static struct { + const char *name; + int flags; + inout_func_t handler; + void *arg; +} inout_handlers[MAX_IOPORTS]; +#pragma clang diagnostic pop + +static int +default_inout(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + if (in) { + switch (bytes) { + case 4: + *eax = 0xffffffff; + break; + case 2: + *eax = 0xffff; + break; + case 1: + *eax = 0xff; + break; + } + } + + return (0); +} + +static void +register_default_iohandler(int start, int size) +{ + struct inout_port iop; + + VERIFY_IOPORT(start, size); + + bzero(&iop, sizeof(iop)); + iop.name = "default"; + iop.port = start; + iop.size = size; + iop.flags = (int) (IOPORT_F_INOUT | IOPORT_F_DEFAULT); + iop.handler = default_inout; + + register_inout(&iop); +} + +static int +update_register(int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + switch (size) { + case 1: + case 2: + error = xh_vm_get_register(vcpuid, reg, &origval); + if (error) + return (error); + val &= vie_size2mask(size); + val |= origval & ~vie_size2mask(size); + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + return xh_vm_set_register(vcpuid, reg, val); +} + +int +emulate_inout(int vcpu, struct vm_exit *vmexit, int strict) +{ + int addrsize, bytes, flags, in, port, prot, rep; + uint32_t eax, val; + inout_func_t handler; + void *arg; + int error, fault, retval; + enum vm_reg_name idxreg; + uint64_t gla, index, iterations, count; + struct vm_inout_str *vis; + struct iovec iov[2]; + + bytes = vmexit->u.inout.bytes; + in = vmexit->u.inout.in; + port = vmexit->u.inout.port; + + assert(port < MAX_IOPORTS); + assert(bytes == 1 || bytes == 2 || bytes == 4); + + handler = inout_handlers[port].handler; + + if (strict && handler == default_inout) + return (-1); + + flags = inout_handlers[port].flags; + arg = inout_handlers[port].arg; + + if (in) { + if (!(flags & IOPORT_F_IN)) + return (-1); + } else { + if (!(flags & IOPORT_F_OUT)) + return (-1); + } + + retval = 0; + if (vmexit->u.inout.string) { + vis = &vmexit->u.inout_str; + rep = vis->inout.rep; + addrsize = vis->addrsize; + prot = in ? XHYVE_PROT_WRITE : XHYVE_PROT_READ; + assert(addrsize == 2 || addrsize == 4 || addrsize == 8); + + /* Index register */ + idxreg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + index = vis->index & vie_size2mask(addrsize); + + /* Count register */ + count = vis->count & vie_size2mask(addrsize); + + /* Limit number of back-to-back in/out emulations to 16 */ + iterations = min(count, 16); + while (iterations > 0) { + assert(retval == 0); + if (vie_calculate_gla(vis->paging.cpu_mode, + vis->seg_name, &vis->seg_desc, index, bytes, + addrsize, prot, &gla)) { + vm_inject_gp(vcpu); + break; + } + + error = xh_vm_copy_setup(vcpu, &vis->paging, gla, + ((size_t) bytes), prot, iov, nitems(iov), &fault); + if (error) { + retval = -1; /* Unrecoverable error */ + break; + } else if (fault) { + retval = 0; /* Resume guest to handle fault */ + break; + } + + if (vie_alignment_check(vis->paging.cpl, bytes, + vis->cr0, vis->rflags, gla)) { + vm_inject_ac(vcpu, 0); + break; + } + + val = 0; + if (!in) + xh_vm_copyin(iov, &val, ((size_t) bytes)); + + retval = handler(vcpu, in, port, bytes, &val, arg); + if (retval != 0) + break; + + if (in) + xh_vm_copyout(&val, iov, ((size_t) bytes)); + + /* Update index */ + if (vis->rflags & PSL_D) + index -= ((uint64_t) bytes); + else + index += ((uint64_t) bytes); + + count--; + iterations--; + } + + /* Update index register */ + error = update_register(vcpu, idxreg, index, addrsize); + assert(error == 0); + + /* + * Update count register only if the instruction had a repeat + * prefix. + */ + if (rep) { + error = update_register(vcpu, VM_REG_GUEST_RCX, count, addrsize); + assert(error == 0); + } + + /* Restart the instruction if more iterations remain */ + if (retval == 0 && count != 0) { + error = xh_vm_restart_instruction(vcpu); + assert(error == 0); + } + } else { + eax = vmexit->u.inout.eax; + val = eax & vie_size2mask(bytes); + retval = handler(vcpu, in, port, bytes, &val, arg); + if (retval == 0 && in) { + eax &= ~vie_size2mask(bytes); + eax |= val & vie_size2mask(bytes); + error = xh_vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); + assert(error == 0); + } + } + return (retval); +} + +void +init_inout(void) +{ + struct inout_port **iopp, *iop; + + /* + * Set up the default handler for all ports + */ + register_default_iohandler(0, MAX_IOPORTS); + + /* + * Overwrite with specified handlers + */ + SET_FOREACH(iopp, inout_port_set) { + iop = *iopp; + assert(iop->port < MAX_IOPORTS); + inout_handlers[iop->port].name = iop->name; + inout_handlers[iop->port].flags = iop->flags; + inout_handlers[iop->port].handler = iop->handler; + inout_handlers[iop->port].arg = NULL; + } +} + +int +register_inout(struct inout_port *iop) +{ + int i; + + VERIFY_IOPORT(iop->port, iop->size); + + /* + * Verify that the new registration is not overwriting an already + * allocated i/o range. 
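+ * + * A caller typically fills in a struct inout_port and passes it here; + * a minimal sketch, with a hypothetical handler and port number, using + * only the handler signature seen in default_inout() above: + * + * static int + * my_dbg_handler(int vcpu, int in, int port, int bytes, + * uint32_t *eax, void *arg) + * { + * if (in) + * *eax = 0; -- reads return zero + * return (0); -- zero means handled + * } + * + * struct inout_port iop; + * bzero(&iop, sizeof(iop)); + * iop.name = "mydbg"; + * iop.port = 0x402; + * iop.size = 1; + * iop.flags = IOPORT_F_INOUT; + * iop.handler = my_dbg_handler; + * register_inout(&iop); + *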
+ */ + if ((((unsigned) iop->flags) & IOPORT_F_DEFAULT) == 0) { + for (i = iop->port; i < iop->port + iop->size; i++) { + if ((((unsigned) inout_handlers[i].flags) & IOPORT_F_DEFAULT) == 0) + return (-1); + } + } + + for (i = iop->port; i < iop->port + iop->size; i++) { + inout_handlers[i].name = iop->name; + inout_handlers[i].flags = iop->flags; + inout_handlers[i].handler = iop->handler; + inout_handlers[i].arg = iop->arg; + } + + return (0); +} + +int +unregister_inout(struct inout_port *iop) +{ + + VERIFY_IOPORT(iop->port, iop->size); + assert(inout_handlers[iop->port].name == iop->name); + + register_default_iohandler(iop->port, iop->size); + + return (0); +} diff --git a/vendor/github.com/hooklift/xhyve/ioapic.c b/vendor/github.com/hooklift/xhyve/ioapic.c new file mode 100644 index 0000000..7c7c42d --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/ioapic.c @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +/* + * Assign PCI INTx interrupts to I/O APIC pins in a round-robin + * fashion. Note that we have no idea what the HPET is using, but the + * HPET is also programmable whereas this is intended for hardwired + * PCI interrupts. + * + * This assumes a single I/O APIC where pins >= 16 are permitted for + * PCI devices. + */ +static int pci_pins; + +void +ioapic_init(void) +{ + + if (xh_vm_ioapic_pincount(&pci_pins) < 0) { + pci_pins = 0; + return; + } + + /* Ignore the first 16 pins. */ + if (pci_pins <= 16) { + pci_pins = 0; + return; + } + pci_pins -= 16; +} + +int +ioapic_pci_alloc_irq(void) +{ + static int last_pin; + + if (pci_pins == 0) + return (-1); + return (16 + (last_pin++ % pci_pins)); +} diff --git a/vendor/github.com/hooklift/xhyve/kexec.c b/vendor/github.com/hooklift/xhyve/kexec.c new file mode 100644 index 0000000..3344d8d --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/kexec.c @@ -0,0 +1,262 @@ +/*- + * Copyright (c) 2015 xhyve developers + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY ???, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#ifndef ALIGNUP +#define ALIGNUP(x, a) (((x - 1) & ~(a - 1)) + a) +#endif + +#define BASE_GDT 0x2000ull +#define BASE_ZEROPAGE 0x3000ull +#define BASE_CMDLINE 0x4000ull +#define BASE_KERNEL 0x100000ull +#define HDRS 0x53726448 /* SrdH */ + +static struct { + uintptr_t base; + size_t size; +} memory, kernel, ramdisk; + +static struct { + char *kernel; + char *initrd; + char *cmdline; +} config; + +static int +kexec_load_kernel(char *path, char *cmdline) { + uint64_t kernel_offset, kernel_size, kernel_init_size, kernel_start, mem_k; + size_t sz, cmdline_len; + volatile struct zero_page *zp; + FILE *f; + + if ((memory.size < (BASE_ZEROPAGE + sizeof(struct zero_page))) || + ((BASE_ZEROPAGE + sizeof(struct zero_page)) > BASE_CMDLINE)) + { + return -1; + } + + zp = ((struct zero_page *) (memory.base + ((off_t) BASE_ZEROPAGE))); + + memset(((void *) ((uintptr_t) zp)), 0, sizeof(struct zero_page)); + + if (!(f = fopen(path, "r"))) { + return -1; + } + + fseek(f, 0L, SEEK_END); + sz = (size_t) ftell(f); + + if (sz < (0x01f1 + sizeof(struct setup_header))) { + fclose(f); + return -1; + } + + fseek(f, 0x01f1, SEEK_SET); + + if (!fread(((void *) ((uintptr_t) &zp->setup_header)), 1, + sizeof(zp->setup_header), f)) + { + fclose(f); + return -1; + } + + if ((zp->setup_header.setup_sects == 0) || /* way way too old */ + (zp->setup_header.boot_flag != 0xaa55) || /* no boot magic */ + (zp->setup_header.header != HDRS) || /* way too old */ + (zp->setup_header.version < 0x020a) || /* too old */ + (!(zp->setup_header.loadflags & 1)) || /* no bzImage */ + (sz < (((zp->setup_header.setup_sects + 1) * 512) + + (zp->setup_header.syssize * 16)))) /* too small */ + { + /* we can't boot this kernel */ + fclose(f); + return -1; + } + + kernel_offset = ((zp->setup_header.setup_sects + 1) * 512); + kernel_size = (sz - kernel_offset); + kernel_init_size = ALIGNUP(zp->setup_header.init_size, 0x1000ull); + kernel_start = (zp->setup_header.relocatable_kernel) ? + ALIGNUP(BASE_KERNEL, zp->setup_header.kernel_alignment) : + zp->setup_header.pref_address; + + if ((kernel_start < BASE_KERNEL) || + (kernel_size > kernel_init_size) || /* XXX: always true? 
*/ + ((kernel_start + kernel_init_size) > memory.size)) /* oom */ + { + fclose(f); + return -1; + } + + /* copy kernel */ + fseek(f, ((long) kernel_offset), SEEK_SET); + if (!fread(((void *) (memory.base + kernel_start)), 1, kernel_size, f)) { + fclose(f); + return -1; + } + + fclose(f); + + /* copy cmdline */ + cmdline_len = strlen(cmdline); + if (((cmdline_len + 1)> zp->setup_header.cmdline_size) || + ((BASE_CMDLINE + (cmdline_len + 1)) > kernel_start)) + { + return -1; + } + + memcpy(((void *) (memory.base + BASE_CMDLINE)), cmdline, cmdline_len); + memset(((void *) (memory.base + BASE_CMDLINE + cmdline_len)), '\0', 1); + zp->setup_header.cmd_line_ptr = ((uint32_t) BASE_CMDLINE); + zp->ext_cmd_line_ptr = ((uint32_t) (BASE_CMDLINE >> 32)); + + zp->setup_header.hardware_subarch = 0; /* PC */ + zp->setup_header.type_of_loader = 0xd; /* kexec */ + + mem_k = (memory.size - 0x100000) >> 10; /* assume memory base is at 0 */ + zp->alt_mem_k = (mem_k > 0xffffffff) ? 0xffffffff : ((uint32_t) mem_k); + + zp->e820_map[0].addr = 0x0000000000000000; + zp->e820_map[0].size = 0x000000000009fc00; + zp->e820_map[0].type = 1; + zp->e820_map[1].addr = 0x0000000000100000; + zp->e820_map[1].size = (memory.size - 0x0000000000100000); + zp->e820_map[1].type = 1; + zp->e820_entries = 2; + + kernel.base = kernel_start; + kernel.size = kernel_init_size; + + return 0; +} + +static int +kexec_load_ramdisk(char *path) { + uint64_t ramdisk_start; + volatile struct zero_page *zp; + size_t sz; + FILE *f; + + zp = ((struct zero_page *) (memory.base + BASE_ZEROPAGE)); + + if (!(f = fopen(path, "r"))) {; + return -1; + } + + fseek(f, 0L, SEEK_END); + sz = (size_t) ftell(f); + fseek(f, 0, SEEK_SET); + + ramdisk_start = ALIGNUP((kernel.base + kernel.size), 0x1000ull); + + if ((ramdisk_start + sz) > memory.size) { + /* not enough memory */ + fclose(f); + return -1; + } + + /* copy ramdisk */ + if (!fread(((void *) (memory.base + ramdisk_start)), 1, sz, f)) { + fclose(f); + return -1; + } + + fclose(f); + + zp->setup_header.ramdisk_image = ((uint32_t) ramdisk_start); + zp->ext_ramdisk_image = ((uint32_t) (ramdisk_start >> 32)); + zp->setup_header.ramdisk_size = ((uint32_t) sz); + zp->ext_ramdisk_size = ((uint32_t) (sz >> 32)); + + ramdisk.base = ramdisk_start; + ramdisk.size = sz; + + return 0; +} + +void +kexec_init(char *kernel_path, char *initrd_path, char *cmdline) { + config.kernel = kernel_path; + config.initrd = initrd_path; + config.cmdline = cmdline; +} + +uint64_t +kexec(void) +{ + uint64_t *gdt_entry; + void *gpa_map; + + gpa_map = xh_vm_map_gpa(0, xh_vm_get_lowmem_size()); + memory.base = (uintptr_t) gpa_map; + memory.size = xh_vm_get_lowmem_size(); + + if (kexec_load_kernel(config.kernel, + config.cmdline ? 
config.cmdline : "auto")) + { + fprintf(stderr, "kexec: failed to load kernel %s\n", config.kernel); + abort(); + } + + if (config.initrd && kexec_load_ramdisk(config.initrd)) { + fprintf(stderr, "kexec: failed to load initrd %s\n", config.initrd); + abort(); + } + + gdt_entry = ((uint64_t *) (memory.base + BASE_GDT)); + gdt_entry[0] = 0x0000000000000000; /* null */ + gdt_entry[1] = 0x0000000000000000; /* null */ + gdt_entry[2] = 0x00cf9a000000ffff; /* code */ + gdt_entry[3] = 0x00cf92000000ffff; /* data */ + + xh_vcpu_reset(0); + + xh_vm_set_desc(0, VM_REG_GUEST_GDTR, BASE_GDT, 0x1f, 0); + xh_vm_set_desc(0, VM_REG_GUEST_CS, 0, 0xffffffff, 0xc09b); + xh_vm_set_desc(0, VM_REG_GUEST_DS, 0, 0xffffffff, 0xc093); + xh_vm_set_desc(0, VM_REG_GUEST_ES, 0, 0xffffffff, 0xc093); + xh_vm_set_desc(0, VM_REG_GUEST_SS, 0, 0xffffffff, 0xc093); + xh_vm_set_register(0, VM_REG_GUEST_CS, 0x10); + xh_vm_set_register(0, VM_REG_GUEST_DS, 0x18); + xh_vm_set_register(0, VM_REG_GUEST_ES, 0x18); + xh_vm_set_register(0, VM_REG_GUEST_SS, 0x18); + xh_vm_set_register(0, VM_REG_GUEST_CR0, 0x21); /* enable protected mode */ + xh_vm_set_register(0, VM_REG_GUEST_RBP, 0); + xh_vm_set_register(0, VM_REG_GUEST_RDI, 0); + xh_vm_set_register(0, VM_REG_GUEST_RBX, 0); + xh_vm_set_register(0, VM_REG_GUEST_RFLAGS, 0x2); + xh_vm_set_register(0, VM_REG_GUEST_RSI, BASE_ZEROPAGE); + xh_vm_set_register(0, VM_REG_GUEST_RIP, kernel.base); + + return kernel.base; +} diff --git a/vendor/github.com/hooklift/xhyve/md5c.c b/vendor/github.com/hooklift/xhyve/md5c.c new file mode 100644 index 0000000..dbbf1d1 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/md5c.c @@ -0,0 +1,284 @@ +/*- + * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + * + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. + * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + * + * This code is the same as the code published by RSA Inc. It has been + * edited for clarity and style only. + */ + +#include +#include +#include + +static void MD5Transform(u_int32_t [4], const unsigned char [64]); + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. 
*/ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* + * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. + * Rotation is separate from addition to prevent recomputation. + */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. Begins an MD5 operation, writing a new context. */ + +void +MD5Init(context) + MD5_CTX *context; +{ + + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* + * MD5 block update operation. Continues an MD5 message-digest + * operation, processing another message block, and updating the + * context. + */ + +void +MD5Update(context, in, inputLen) + MD5_CTX *context; + const void *in; + unsigned int inputLen; +{ + unsigned int i, index, partLen; + const unsigned char *input = in; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((u_int32_t)inputLen << 3)) + < ((u_int32_t)inputLen << 3)) + context->count[1]++; + context->count[1] += ((u_int32_t)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. */ + if (inputLen >= partLen) { + memcpy((void *)&context->buffer[index], (const void *)input, + partLen); + MD5Transform(context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform(context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy((void *)&context->buffer[index], (const void *)&input[i], + inputLen-i); +} + +/* + * MD5 padding. Adds padding followed by original length. + */ + +static void +MD5Pad(MD5_CTX *context) +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + memcpy(bits, context->count, 8); + + /* Pad out to 56 mod 64. */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD5Update(context, PADDING, padLen); + + /* Append length (before padding) */ + MD5Update(context, bits, 8); +} + +/* + * MD5 finalization. Ends an MD5 message-digest operation, writing the + * the message digest and zeroizing the context. + */ + +void +MD5Final (digest, context) + unsigned char digest[16]; + MD5_CTX *context; +{ + /* Do padding. */ + MD5Pad(context); + + /* Store state in digest */ + memcpy(digest, context->state, 16); + + /* Zeroize sensitive information. */ + memset((void *)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
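+ * + * The externally visible sequence is MD5Init / MD5Update / MD5Final; + * a minimal usage sketch (buffer and length are hypothetical): + * + * MD5_CTX ctx; + * unsigned char digest[16]; + * MD5Init(&ctx); + * MD5Update(&ctx, buf, buflen); -- may be called repeatedly + * MD5Final(digest, &ctx); -- digest now holds the 16 bytes + *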
*/ + +static void +MD5Transform (state, block) + u_int32_t state[4]; + const unsigned char block[64]; +{ + u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + memcpy(x, block, 64); + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 
6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. */ + memset((void *)x, 0, sizeof (x)); +} diff --git a/vendor/github.com/hooklift/xhyve/mem.c b/vendor/github.com/hooklift/xhyve/mem.c new file mode 100644 index 0000000..358d6d6 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/mem.c @@ -0,0 +1,291 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Memory ranges are represented with an RB tree. On insertion, the range + * is checked for overlaps. On lookup, the key has the same base and limit + * so it can be searched within the range. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct mmio_rb_range { + RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ + struct mem_range mr_param; + uint64_t mr_base; + uint64_t mr_end; +}; +#pragma clang diagnostic pop + +struct mmio_rb_tree; +RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +static RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback; + +/* + * Per-vCPU cache. Since most accesses from a vCPU will be to + * consecutive addresses in a range, it makes sense to cache the + * result of a lookup. 
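+ * + * Device models hang MMIO windows off these trees via register_mem(); + * a rough sketch (the window, names, and the handler's parameter types + * are inferred from the mem_read/mem_write thunks below and are not + * authoritative): + * + * static int + * mydev_handler(int vcpu, int dir, uint64_t addr, int size, + * uint64_t *val, void *arg1, long arg2) + * { + * if (dir == MEM_F_READ) + * *val = 0; + * return (0); + * } + * + * struct mem_range mr; + * memset(&mr, 0, sizeof(mr)); + * mr.name = "mydev"; + * mr.base = 0xd0000000UL; + * mr.size = 4096; + * mr.flags = MEM_F_READ | MEM_F_WRITE; + * mr.handler = mydev_handler; + * register_mem(&mr); + *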
+ */ +static struct mmio_rb_range *mmio_hint[VM_MAXCPU]; + +static pthread_rwlock_t mmio_rwlock; + +static int +mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) +{ + if (a->mr_end < b->mr_base) + return (-1); + else if (a->mr_base > b->mr_end) + return (1); + return (0); +} + +static int +mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, + struct mmio_rb_range **entry) +{ + struct mmio_rb_range find, *res; + + find.mr_base = find.mr_end = addr; + + res = RB_FIND(mmio_rb_tree, rbt, &find); + + if (res != NULL) { + *entry = res; + return (0); + } + + return (ENOENT); +} + +static int +mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) +{ + struct mmio_rb_range *overlap; + + overlap = RB_INSERT(mmio_rb_tree, rbt, new); + + if (overlap != NULL) { +#ifdef RB_DEBUG + printf("overlap detected: new %lx:%lx, tree %lx:%lx\n", + new->mr_base, new->mr_end, + overlap->mr_base, overlap->mr_end); +#endif + + return (EEXIST); + } + + return (0); +} + +#if 0 +static void +mmio_rb_dump(struct mmio_rb_tree *rbt) +{ + struct mmio_rb_range *np; + + pthread_rwlock_rdlock(&mmio_rwlock); + RB_FOREACH(np, mmio_rb_tree, rbt) { + printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, + np->mr_param.name); + } + pthread_rwlock_unlock(&mmio_rwlock); +} +#endif + +RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare) + +static int +mem_read(UNUSED void *unused, int vcpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(vcpu, MEM_F_READ, gpa, size, rval, mr->arg1, + mr->arg2); + return (error); +} + +static int +mem_write(UNUSED void* unused, int vcpu, uint64_t gpa, uint64_t wval, int size, + void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(vcpu, MEM_F_WRITE, gpa, size, &wval, mr->arg1, + mr->arg2); + return (error); +} + +int +emulate_mem(int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging) + +{ + struct mmio_rb_range *entry; + int err, immutable; + + pthread_rwlock_rdlock(&mmio_rwlock); + /* + * First check the per-vCPU cache + */ + if (mmio_hint[vcpu] && + paddr >= mmio_hint[vcpu]->mr_base && + paddr <= mmio_hint[vcpu]->mr_end) { + entry = mmio_hint[vcpu]; + } else + entry = NULL; + + if (entry == NULL) { + if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) { + /* Update the per-vCPU cache */ + mmio_hint[vcpu] = entry; + } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { + pthread_rwlock_unlock(&mmio_rwlock); + return (ESRCH); + } + } + + assert(entry != NULL); + + /* + * An 'immutable' memory range is guaranteed to be never removed + * so there is no need to hold 'mmio_rwlock' while calling the + * handler. + * + * XXX writes to the PCIR_COMMAND register can cause register_mem() + * to be called. If the guest is using PCI extended config space + * to modify the PCIR_COMMAND register then register_mem() can + * deadlock on 'mmio_rwlock'. However by registering the extended + * config space window as 'immutable' the deadlock can be avoided. 
+ */ + immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); + if (immutable) + pthread_rwlock_unlock(&mmio_rwlock); + + err = xh_vm_emulate_instruction(vcpu, paddr, vie, paging, mem_read, + mem_write, &entry->mr_param); + + if (!immutable) + pthread_rwlock_unlock(&mmio_rwlock); + + return (err); +} + +static int +register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) +{ + struct mmio_rb_range *entry, *mrp; + int err; + + err = 0; + + mrp = malloc(sizeof(struct mmio_rb_range)); + + if (mrp != NULL) { + mrp->mr_param = *memp; + mrp->mr_base = memp->base; + mrp->mr_end = memp->base + memp->size - 1; + pthread_rwlock_wrlock(&mmio_rwlock); + if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) + err = mmio_rb_add(rbt, mrp); + pthread_rwlock_unlock(&mmio_rwlock); + if (err) + free(mrp); + } else + err = ENOMEM; + + return (err); +} + +int +register_mem(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_root, memp)); +} + +int +register_mem_fallback(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_fallback, memp)); +} + +int +unregister_mem(struct mem_range *memp) +{ + struct mem_range *mr; + struct mmio_rb_range *entry = NULL; + int err, i; + + pthread_rwlock_wrlock(&mmio_rwlock); + err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); + if (err == 0) { + mr = &entry->mr_param; + assert(mr->name == memp->name); + assert(mr->base == memp->base && mr->size == memp->size); + assert((mr->flags & MEM_F_IMMUTABLE) == 0); + RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); + + /* flush Per-vCPU cache */ + for (i=0; i < VM_MAXCPU; i++) { + if (mmio_hint[i] == entry) + mmio_hint[i] = NULL; + } + } + pthread_rwlock_unlock(&mmio_rwlock); + + if (entry) + free(entry); + + return (err); +} + +void +init_mem(void) +{ + + RB_INIT(&mmio_rb_root); + RB_INIT(&mmio_rb_fallback); + pthread_rwlock_init(&mmio_rwlock, NULL); +} diff --git a/vendor/github.com/hooklift/xhyve/mevent.c b/vendor/github.com/hooklift/xhyve/mevent.c new file mode 100644 index 0000000..977abcb --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/mevent.c @@ -0,0 +1,463 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* + * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEVENT_MAX 64 + +#define MEV_ADD 1 +#define MEV_ENABLE 2 +#define MEV_DISABLE 3 +#define MEV_DEL_PENDING 4 + +extern char *vmname; +extern bool exit_mevent_dispatch_loop; + +static pthread_t mevent_tid; +static int mevent_timid = 43; +static int mevent_pipefd[2]; +static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct mevent { + void (*me_func)(int, enum ev_type, void *); +#define me_msecs me_fd + int me_fd; + int me_timid; + enum ev_type me_type; + void *me_param; + int me_cq; + int me_state; + int me_closefd; + LIST_ENTRY(mevent) me_list; +}; +#pragma clang diagnostic pop + +static LIST_HEAD(listhead, mevent) global_head, change_head; + +static void +mevent_qlock(void) +{ + pthread_mutex_lock(&mevent_lmutex); +} + +static void +mevent_qunlock(void) +{ + pthread_mutex_unlock(&mevent_lmutex); +} + +static void +mevent_pipe_read(int fd, UNUSED enum ev_type type, UNUSED void *param) +{ + char buf[MEVENT_MAX]; + ssize_t status; + + /* + * Drain the pipe read side. The fd is non-blocking so this is + * safe to do. + */ + do { + status = read(fd, buf, sizeof(buf)); + } while (status == MEVENT_MAX); +} + +void +mevent_notify(void) +{ + char c; + + /* + * If calling from outside the i/o thread, write a byte on the + * pipe to force the i/o thread to exit the blocking kevent call. + */ + if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { + write(mevent_pipefd[1], &c, 1); + } +} + +void mevent_exit(void) +{ + char c; + write(mevent_pipefd[1], &c, 1); +} + +static int +mevent_kq_filter(struct mevent *mevp) +{ + int retval; + + retval = 0; + + if (mevp->me_type == EVF_READ) + retval = EVFILT_READ; + + if (mevp->me_type == EVF_WRITE) + retval = EVFILT_WRITE; + + if (mevp->me_type == EVF_TIMER) + retval = EVFILT_TIMER; + + if (mevp->me_type == EVF_SIGNAL) + retval = EVFILT_SIGNAL; + + return (retval); +} + +static int +mevent_kq_flags(struct mevent *mevp) +{ + int ret; + + switch (mevp->me_state) { + case MEV_ADD: + ret = EV_ADD; /* implicitly enabled */ + break; + case MEV_ENABLE: + ret = EV_ENABLE; + break; + case MEV_DISABLE: + ret = EV_DISABLE; + break; + case MEV_DEL_PENDING: + ret = EV_DELETE; + break; + default: + assert(0); + break; + } + + return (ret); +} + +static int +mevent_kq_fflags(UNUSED struct mevent *mevp) +{ + /* XXX nothing yet, perhaps EV_EOF for reads ? 
*/ + return (0); +} + +static int +mevent_build(UNUSED int mfd, struct kevent *kev) +{ + struct mevent *mevp, *tmpp; + int i; + + i = 0; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + close(mevp->me_fd); + } else { + if (mevp->me_type == EVF_TIMER) { + kev[i].ident = (uintptr_t) mevp->me_timid; + kev[i].data = mevp->me_msecs; + } else { + kev[i].ident = (uintptr_t) mevp->me_fd; + kev[i].data = 0; + } + kev[i].filter = (int16_t) mevent_kq_filter(mevp); + kev[i].flags = (uint16_t) mevent_kq_flags(mevp); + kev[i].fflags = (uint32_t) mevent_kq_fflags(mevp); + kev[i].udata = mevp; + i++; + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + + assert(i < MEVENT_MAX); + } + + mevent_qunlock(); + + return (i); +} + +static void +mevent_handle(struct kevent *kev, int numev) +{ + struct mevent *mevp; + int i; + + for (i = 0; i < numev; i++) { + mevp = kev[i].udata; + + /* XXX check for EV_ERROR ? */ + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + } +} + +struct mevent * +mevent_add(int tfd, enum ev_type type, + void (*func)(int, enum ev_type, void *), void *param) +{ + struct mevent *lp, *mevp; + + if (tfd < 0 || func == NULL) { + return (NULL); + } + + mevp = NULL; + + mevent_qlock(); + + /* + * Verify that the fd/type tuple is not present in any list + */ + LIST_FOREACH(lp, &global_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + LIST_FOREACH(lp, &change_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + /* + * Allocate an entry, populate it, and add it to the change list. + */ + mevp = calloc(1, sizeof(struct mevent)); + if (mevp == NULL) { + goto exit; + } + + if (type == EVF_TIMER) { + mevp->me_msecs = tfd; + mevp->me_timid = mevent_timid++; + } else + mevp->me_fd = tfd; + mevp->me_type = type; + mevp->me_func = func; + mevp->me_param = param; + + LIST_INSERT_HEAD(&change_head, mevp, me_list); + mevp->me_cq = 1; + mevp->me_state = MEV_ADD; + mevent_notify(); + +exit: + mevent_qunlock(); + + return (mevp); +} + +static int +mevent_update(struct mevent *evp, int newstate) +{ + /* + * It's not possible to enable/disable a deleted event + */ + if (evp->me_state == MEV_DEL_PENDING) + return (EINVAL); + + /* + * No update needed if state isn't changing + */ + if (evp->me_state == newstate) + return (0); + + mevent_qlock(); + + evp->me_state = newstate; + + /* + * Place the entry onto the changed list if not already there. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + + mevent_qunlock(); + + return (0); +} + +int +mevent_enable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_ENABLE)); +} + +int +mevent_disable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_DISABLE)); +} + +static int +mevent_delete_event(struct mevent *evp, int closefd) +{ + mevent_qlock(); + + /* + * Place the entry onto the changed list if not already there, and + * mark as to be deleted. 
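+ * + * From a caller's perspective the lifecycle is: mevent_add() to + * register, mevent_enable()/mevent_disable() to toggle, and one of the + * delete calls to tear down; sketched with a hypothetical fd and + * callback: + * + * struct mevent *mev; + * mev = mevent_add(fd, EVF_READ, my_callback, my_arg); + * mevent_disable(mev); + * mevent_enable(mev); + * mevent_delete_close(mev); -- also closes fd once processed + *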
+ */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + evp->me_state = MEV_DEL_PENDING; + + if (closefd) + evp->me_closefd = 1; + + mevent_qunlock(); + + return (0); +} + +int +mevent_delete(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 0)); +} + +int +mevent_delete_close(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 1)); +} + +static void +mevent_set_name(void) +{ +} + +int mevent_dispatch(void) +{ + struct kevent changelist[MEVENT_MAX]; + struct kevent eventlist[MEVENT_MAX]; + struct mevent *pipev; + int mfd; + int numev; + int ret; + + mevent_tid = pthread_self(); + mevent_set_name(); + + mfd = kqueue(); + assert(mfd > 0); + + /* + * Open the pipe that will be used for other threads to force + * the blocking kqueue call to exit by writing to it. Set the + * descriptor to non-blocking. + */ + ret = pipe(mevent_pipefd); + if (ret < 0) { + perror("pipe"); + return EPIPE; + //exit(0); + } + + /* + * Add internal event handler for the pipe write fd + */ + pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); + assert(pipev != NULL); + + for (;;) { + if (exit_mevent_dispatch_loop) { + break; + } + /* + * Build changelist if required. + * XXX the changelist can be put into the blocking call + * to eliminate the extra syscall. Currently better for + * debug. + */ + numev = mevent_build(mfd, changelist); + if (numev) { + ret = kevent(mfd, changelist, numev, NULL, 0, NULL); + if (ret == -1) { + perror("Error return from kevent change"); + } + } + + /* + * Block awaiting events + */ + ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); + if (ret == -1 && errno != EINTR) { + perror("Error return from kevent monitor"); + } + + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); + } + return 0; +} diff --git a/vendor/github.com/hooklift/xhyve/mptbl.c b/vendor/github.com/hooklift/xhyve/mptbl.c new file mode 100644 index 0000000..df2b10b --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/mptbl.c @@ -0,0 +1,377 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +// #include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MPTABLE_BASE 0xF0000 + +/* floating pointer length + maximum length of configuration table */ +#define MPTABLE_MAX_LENGTH (65536 + 16) + +#define LAPIC_PADDR 0xFEE00000 +#define LAPIC_VERSION 16 + +#define IOAPIC_PADDR 0xFEC00000 +#define IOAPIC_VERSION 0x11 + +#define MP_SPECREV 4 +#define MPFP_SIG "_MP_" + +/* Configuration header defines */ +#define MPCH_SIG "PCMP" +#define MPCH_OEMID "BHyVe " +#define MPCH_OEMID_LEN 8 +#define MPCH_PRODID "Hypervisor " +#define MPCH_PRODID_LEN 12 + +/* Processor entry defines */ +#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */ +#define MPEP_SIG_MODEL 26 +#define MPEP_SIG_STEPPING 5 +#define MPEP_SIG \ + ((MPEP_SIG_FAMILY << 8) | \ + (MPEP_SIG_MODEL << 4) | \ + (MPEP_SIG_STEPPING)) + +#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */ + +/* Number of local intr entries */ +#define MPEII_NUM_LOCAL_IRQ 2 + +/* Bus entry defines */ +#define MPE_NUM_BUSES 2 +#define MPE_BUSNAME_LEN 6 +#define MPE_BUSNAME_ISA "ISA " +#define MPE_BUSNAME_PCI "PCI " + +static void *oem_tbl_start; +static int oem_tbl_size; + +static uint8_t +mpt_compute_checksum(void *base, size_t len) +{ + uint8_t *bytes; + uint8_t sum; + + for(bytes = base, sum = 0; len > 0; len--) { + sum += *bytes++; + } + + return ((uint8_t) (256 - sum)); +} + +static void +mpt_build_mpfp(mpfps_t mpfp, uint64_t gpa) +{ + memset(mpfp, 0, sizeof(*mpfp)); + memcpy(mpfp->signature, MPFP_SIG, 4); + mpfp->pap = (uint32_t) (gpa + sizeof(*mpfp)); + mpfp->length = 1; + mpfp->spec_rev = MP_SPECREV; + mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp)); +} + +static void +mpt_build_mpch(mpcth_t mpch) +{ + + memset(mpch, 0, sizeof(*mpch)); + memcpy(mpch->signature, MPCH_SIG, 4); + mpch->spec_rev = MP_SPECREV; + memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN); + memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN); + mpch->apic_address = LAPIC_PADDR; +} + +static void +mpt_build_proc_entries(proc_entry_ptr mpep, int ncpu) +{ + int i; + + for (i = 0; i < ncpu; i++) { + memset(mpep, 0, sizeof(*mpep)); + mpep->type = MPCT_ENTRY_PROCESSOR; + mpep->apic_id = (uint8_t) i; // XXX + mpep->apic_version = LAPIC_VERSION; + mpep->cpu_flags = PROCENTRY_FLAG_EN; + if (i == 0) + mpep->cpu_flags |= PROCENTRY_FLAG_BP; + mpep->cpu_signature = MPEP_SIG; + mpep->feature_flags = MPEP_FEATURES; + mpep++; + } +} + +static void +mpt_build_localint_entries(int_entry_ptr mpie) +{ + + /* Hardcode LINT0 as ExtINT on all CPUs. */ + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_LOCAL_INT; + mpie->int_type = INTENTRY_TYPE_EXTINT; + mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | + INTENTRY_FLAGS_TRIGGER_CONFORM; + mpie->dst_apic_id = 0xff; + mpie->dst_apic_int = 0; + mpie++; + + /* Hardcode LINT1 as NMI on all CPUs. 
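+ * This mirrors the MP specification's default configuration, in which + * LINT0 is wired up as ExtINT and LINT1 as NMI on every local APIC.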
*/ + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_LOCAL_INT; + mpie->int_type = INTENTRY_TYPE_NMI; + mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | + INTENTRY_FLAGS_TRIGGER_CONFORM; + mpie->dst_apic_id = 0xff; + mpie->dst_apic_int = 1; +} + +static void +mpt_build_bus_entries(bus_entry_ptr mpeb) +{ + + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->type = MPCT_ENTRY_BUS; + mpeb->bus_id = 0; + memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN); + mpeb++; + + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->type = MPCT_ENTRY_BUS; + mpeb->bus_id = 1; + memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN); +} + +static void +mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id) +{ + + memset(mpei, 0, sizeof(*mpei)); + mpei->type = MPCT_ENTRY_IOAPIC; + mpei->apic_id = (uint8_t) id; + mpei->apic_version = IOAPIC_VERSION; + mpei->apic_flags = IOAPICENTRY_FLAG_EN; + mpei->apic_address = IOAPIC_PADDR; +} + +static int +mpt_count_ioint_entries(void) +{ + int bus, count; + + count = 0; + for (bus = 0; bus <= PCI_BUSMAX; bus++) + count += pci_count_lintr(bus); + + /* + * Always include entries for the first 16 pins along with a entry + * for each active PCI INTx pin. + */ + return (16 + count); +} + +static void +mpt_generate_pci_int(int bus, int slot, int pin, UNUSED int pirq_pin, + int ioapic_irq, void *arg) +{ + int_entry_ptr *mpiep, mpie; + + mpiep = arg; + mpie = *mpiep; + memset(mpie, 0, sizeof(*mpie)); + + /* + * This is always after another I/O interrupt entry, so cheat + * and fetch the I/O APIC ID from the prior entry. + */ + mpie->type = MPCT_ENTRY_INT; + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_id = (uint8_t) bus; + mpie->src_bus_irq = (uint8_t) (slot << 2 | (pin - 1)); + mpie->dst_apic_id = mpie[-1].dst_apic_id; + mpie->dst_apic_int = (uint8_t) ioapic_irq; + + *mpiep = mpie + 1; +} + +static void +mpt_build_ioint_entries(int_entry_ptr mpie, int id) +{ + int pin, bus; + + /* + * The following config is taken from kernel mptable.c + * mptable_parse_default_config_ints(...), for now + * just use the default config, tweek later if needed. + */ + + /* First, generate the first 16 pins. */ + for (pin = 0; pin < 16; pin++) { + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_INT; + mpie->src_bus_id = 1; + mpie->dst_apic_id = (uint8_t) id; + + /* + * All default configs route IRQs from bus 0 to the first 16 + * pins of the first I/O APIC with an APIC ID of 2. + */ + mpie->dst_apic_int = (uint8_t) pin; + switch (pin) { + case 0: + /* Pin 0 is an ExtINT pin. */ + mpie->int_type = INTENTRY_TYPE_EXTINT; + break; + case 2: + /* IRQ 0 is routed to pin 2. */ + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = 0; + break; + case SCI_INT: + /* ACPI SCI is level triggered and active-lo. */ + mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO | + INTENTRY_FLAGS_TRIGGER_LEVEL; + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = SCI_INT; + break; + default: + /* All other pins are identity mapped. */ + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = (uint8_t) pin; + break; + } + mpie++; + } + + /* Next, generate entries for any PCI INTx interrupts. 
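+ * Per mpt_generate_pci_int() above, each of these entries encodes its + * source as src_bus_irq = (slot << 2) | (pin - 1); for example, a + * device in slot 3 asserting INTA# (pin 1) yields src_bus_irq 0x0c.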
*/ + for (bus = 0; bus <= PCI_BUSMAX; bus++) + pci_walk_lintr(bus, mpt_generate_pci_int, &mpie); +} + +void +mptable_add_oemtbl(void *tbl, int tblsz) +{ + + oem_tbl_start = tbl; + oem_tbl_size = tblsz; +} + +int +mptable_build(int ncpu) +{ + mpcth_t mpch; + bus_entry_ptr mpeb; + io_apic_entry_ptr mpei; + proc_entry_ptr mpep; + mpfps_t mpfp; + int_entry_ptr mpie; + int ioints, bus; + char *curraddr; + char *startaddr; + + startaddr = paddr_guest2host(MPTABLE_BASE, MPTABLE_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "mptable requires mapped mem\n"); + return (ENOMEM); + } + + /* + * There is no way to advertise multiple PCI hierarchies via MPtable + * so require that there is no PCI hierarchy with a non-zero bus + * number. + */ + for (bus = 1; bus <= PCI_BUSMAX; bus++) { + if (pci_bus_configured(bus)) { + fprintf(stderr, "MPtable is incompatible with " + "multiple PCI hierarchies.\r\n"); + fprintf(stderr, "MPtable generation can be disabled " + "by passing the -Y option to bhyve(8).\r\n"); + return (EINVAL); + } + } + + curraddr = startaddr; + mpfp = (mpfps_t)curraddr; + mpt_build_mpfp(mpfp, MPTABLE_BASE); + curraddr += sizeof(*mpfp); + + mpch = (mpcth_t)curraddr; + mpt_build_mpch(mpch); + curraddr += sizeof(*mpch); + + mpep = (proc_entry_ptr)curraddr; + mpt_build_proc_entries(mpep, ncpu); + curraddr += sizeof(*mpep) * ((uint64_t) ncpu); + mpch->entry_count += ncpu; + + mpeb = (bus_entry_ptr) curraddr; + mpt_build_bus_entries(mpeb); + curraddr += sizeof(*mpeb) * MPE_NUM_BUSES; + mpch->entry_count += MPE_NUM_BUSES; + + mpei = (io_apic_entry_ptr)curraddr; + mpt_build_ioapic_entries(mpei, 0); + curraddr += sizeof(*mpei); + mpch->entry_count++; + + mpie = (int_entry_ptr) curraddr; + ioints = mpt_count_ioint_entries(); + mpt_build_ioint_entries(mpie, 0); + curraddr += sizeof(*mpie) * ((uint64_t) ioints); + mpch->entry_count += ioints; + + mpie = (int_entry_ptr)curraddr; + mpt_build_localint_entries(mpie); + curraddr += sizeof(*mpie) * MPEII_NUM_LOCAL_IRQ; + mpch->entry_count += MPEII_NUM_LOCAL_IRQ; + + if (oem_tbl_start) { + mpch->oem_table_pointer = + (uint32_t) (curraddr - startaddr + MPTABLE_BASE); + mpch->oem_table_size = (uint16_t) oem_tbl_size; + memcpy(curraddr, oem_tbl_start, oem_tbl_size); + } + + mpch->base_table_length = (uint16_t) (curraddr - (char *)mpch); + mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length); + + return (0); +} diff --git a/vendor/github.com/hooklift/xhyve/pci_ahci.c b/vendor/github.com/hooklift/xhyve/pci_ahci.c new file mode 100644 index 0000000..4a30213 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pci_ahci.c @@ -0,0 +1,2403 @@ +/*- + * Copyright (c) 2013 Zhixiang Yu + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +// #include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ + +#define PxSIG_ATA 0x00000101 /* ATA drive */ +#define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ + +enum sata_fis_type { + FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ + FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ + FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ + FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ + FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ + FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ + FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ + FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ +}; + +/* + * SCSI opcodes + */ +#define TEST_UNIT_READY 0x00 +#define REQUEST_SENSE 0x03 +#define INQUIRY 0x12 +#define START_STOP_UNIT 0x1B +#define PREVENT_ALLOW 0x1E +#define READ_CAPACITY 0x25 +#define READ_10 0x28 +// #define POSITION_TO_ELEMENT 0x2B +#define READ_TOC 0x43 +#define GET_EVENT_STATUS_NOTIFICATION 0x4A +#define MODE_SENSE_10 0x5A +#define REPORT_LUNS 0xA0 +#define READ_12 0xA8 +// #define READ_CD 0xBE + +/* + * SCSI mode page codes + */ +#define MODEPAGE_RW_ERROR_RECOVERY 0x01 +#define MODEPAGE_CD_CAPABILITIES 0x2A + +/* + * ATA commands + */ +#define ATA_SF_ENAB_SATA_SF 0x10 +#define ATA_SATA_SF_AN 0x05 +// #define ATA_SF_DIS_SATA_SF 0x90 + +/* + * Debug printf + */ +#ifdef AHCI_DEBUG +static FILE *dbg; +#define DPRINTF(format, ...) \ + do { \ + fprintf(dbg, format, __VA_ARGS__); \ + fflush(dbg); \ + } while(0) +#else +#define DPRINTF(format, ...) +#endif +#define WPRINTF(format, ...) 
printf(format, __VA_ARGS__) + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" + +struct ahci_ioreq { + struct blockif_req io_req; + struct ahci_port *io_pr; + STAILQ_ENTRY(ahci_ioreq) io_flist; + TAILQ_ENTRY(ahci_ioreq) io_blist; + uint8_t *cfis; + uint32_t len; + uint32_t done; + int slot; + int more; +}; + +#define AHCI_PORT_IDENT 20 + 1 +struct ahci_port { + struct blockif_ctxt *bctx; + struct pci_ahci_softc *pr_sc; + uint8_t *cmd_lst; + uint8_t *rfis; + char ident[AHCI_PORT_IDENT]; + int atapi; + int reset; + int waitforclear; + int mult_sectors; + uint8_t xfermode; + uint8_t err_cfis[20]; + uint8_t sense_key; + uint8_t asc; + u_int ccs; + uint32_t pending; + + uint32_t clb; + uint32_t clbu; + uint32_t fb; + uint32_t fbu; + uint32_t is; + uint32_t ie; + uint32_t cmd; + uint32_t unused0; + uint32_t tfd; + uint32_t sig; + uint32_t ssts; + uint32_t sctl; + uint32_t serr; + uint32_t sact; + uint32_t ci; + uint32_t sntf; + uint32_t fbs; + + /* + * i/o request info + */ + struct ahci_ioreq *ioreq; + int ioqsz; + STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; + TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; +}; + +struct ahci_cmd_hdr { + uint16_t flags; + uint16_t prdtl; + uint32_t prdbc; + uint64_t ctba; + uint32_t reserved[4]; +}; + +struct ahci_prdt_entry { + uint64_t dba; + uint32_t reserved; +#define DBCMASK 0x3fffff + uint32_t dbc; +}; + +struct pci_ahci_softc { + struct pci_devinst *asc_pi; + pthread_mutex_t mtx; + int ports; + uint32_t cap; + uint32_t ghc; + uint32_t is; + uint32_t pi; + uint32_t vs; + uint32_t ccc_ctl; + uint32_t ccc_pts; + uint32_t em_loc; + uint32_t em_ctl; + uint32_t cap2; + uint32_t bohc; + uint32_t lintr; + struct ahci_port port[MAX_PORTS]; +}; + +#pragma clang diagnostic pop + +static void ahci_handle_port(struct ahci_port *p); + +static inline void lba_to_msf(uint8_t *buf, int lba) +{ + lba += 150; + buf[0] = (uint8_t) ((lba / 75) / 60); + buf[1] = (lba / 75) % 60; + buf[2] = lba % 75; +} + +/* + * generate HBA intr depending on whether or not ports within + * the controller have an interrupt pending. + */ +static void +ahci_generate_intr(struct pci_ahci_softc *sc) +{ + struct pci_devinst *pi; + int i; + + pi = sc->asc_pi; + + for (i = 0; i < sc->ports; i++) { + struct ahci_port *pr; + pr = &sc->port[i]; + if (pr->is & pr->ie) + sc->is |= (1 << i); + } + + DPRINTF("%s %x\n", __func__, sc->is); + + if (sc->is && (sc->ghc & AHCI_GHC_IE)) { + if (pci_msi_enabled(pi)) { + /* + * Generate an MSI interrupt on every edge + */ + pci_generate_msi(pi, 0); + } else if (!sc->lintr) { + /* + * Only generate a pin-based interrupt if one wasn't + * in progress + */ + sc->lintr = 1; + pci_lintr_assert(pi); + } + } else if (sc->lintr) { + /* + * No interrupts: deassert pin-based signal if it had + * been asserted + */ + pci_lintr_deassert(pi); + sc->lintr = 0; + } +} + +static void +ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) +{ + int offset, len, irq; + + if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) + return; + + switch (ft) { + case FIS_TYPE_REGD2H: + offset = 0x40; + len = 20; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; + break; + case FIS_TYPE_SETDEVBITS: + offset = 0x58; + len = 8; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; + break; + case FIS_TYPE_PIOSETUP: + offset = 0x20; + len = 20; + irq = (fis[1] & (1 << 6)) ? 
AHCI_P_IX_PS : 0; + break; + case FIS_TYPE_REGH2D: + case FIS_TYPE_DMAACT: + case FIS_TYPE_DMASETUP: + case FIS_TYPE_DATA: + case FIS_TYPE_BIST: + WPRINTF("unsupported fis type %d\n", ft); + return; + } + if (fis[2] & ATA_S_ERROR) { + p->waitforclear = 1; + irq |= AHCI_P_IX_TFE; + } + memcpy(p->rfis + offset, fis, len); + if (irq) { + p->is |= ((unsigned) irq); + ahci_generate_intr(p->pr_sc); + } +} + +static void +ahci_write_fis_piosetup(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_PIOSETUP; + ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); +} + +static void +ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) +{ + uint8_t fis[8]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + tfd &= 0x77; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_SETDEVBITS; + fis[1] = (1 << 6); + fis[2] = (uint8_t) tfd; + fis[3] = error; + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = (uint8_t) slot; + p->err_cfis[2] = (uint8_t) tfd; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else { + *(uint32_t *)((void *) (fis + 4)) = (1 << slot); + p->sact &= ~(1 << slot); + } + p->tfd &= ~((unsigned) 0x77); + p->tfd |= tfd; + ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); +} + +static void +ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) +{ + uint8_t fis[20]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = (1 << 6); + fis[2] = tfd & 0xff; + fis[3] = error; + fis[4] = cfis[4]; + fis[5] = cfis[5]; + fis[6] = cfis[6]; + fis[7] = cfis[7]; + fis[8] = cfis[8]; + fis[9] = cfis[9]; + fis[10] = cfis[10]; + fis[11] = cfis[11]; + fis[12] = cfis[12]; + fis[13] = cfis[13]; + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = 0x80; + p->err_cfis[2] = tfd & 0xff; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else + p->ci &= ~(1 << slot); + p->tfd = tfd; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) +{ + uint8_t fis[20]; + + p->tfd = ATA_S_READY | ATA_S_DSC; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = 0; /* No interrupt */ + fis[2] = (uint8_t) p->tfd; /* Status */ + fis[3] = 0; /* No error */ + p->ci &= ~(1 << slot); + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_write_reset_fis_d2h(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[3] = 1; + fis[4] = 1; + if (p->atapi) { + fis[5] = 0x14; + fis[6] = 0xeb; + } + fis[12] = 1; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_check_stopped(struct ahci_port *p) +{ + /* + * If we are no longer processing the command list and nothing + * is in-flight, clear the running bit, the current command + * slot, the command issue and active bits. + */ + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (p->pending == 0) { + p->ccs = 0; + p->cmd &= ~((unsigned) (AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK)); + p->ci = 0; + p->sact = 0; + p->waitforclear = 0; + } + } +} + +static void +ahci_port_stop(struct ahci_port *p) +{ + struct ahci_ioreq *aior; + uint8_t *cfis; + int slot; + int ncq; + int error; + + ncq = 0; + + TAILQ_FOREACH(aior, &p->iobhd, io_blist) { + /* + * Try to cancel the outstanding blockif request. 
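+		 * (If blockif_cancel() fails, presumably because the request
+		 * is already being serviced by the blockif thread, we skip it
+		 * here and let its completion callback finish the bookkeeping.)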
+ */ + error = blockif_cancel(p->bctx, &aior->io_req); + if (error != 0) + continue; + + slot = aior->slot; + cfis = aior->cfis; + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) + { + ncq = 1; + } + + if (ncq) + p->sact &= ~(1 << slot); + else + p->ci &= ~(1 << slot); + + /* + * This command is now done. + */ + p->pending &= ~(1 << slot); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + } + + ahci_check_stopped(p); +} + +static void +ahci_port_reset(struct ahci_port *pr) +{ + pr->serr = 0; + pr->sact = 0; + pr->xfermode = ATA_UDMA6; + pr->mult_sectors = 128; + + if (!pr->bctx) { + pr->ssts = ATA_SS_DET_NO_DEVICE; + pr->sig = 0xFFFFFFFF; + pr->tfd = 0x7F; + return; + } + pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; + if (pr->sctl & ATA_SC_SPD_MASK) + pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); + else + pr->ssts |= ATA_SS_SPD_GEN3; + pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; + if (!pr->atapi) { + pr->sig = PxSIG_ATA; + pr->tfd |= ATA_S_READY; + } else + pr->sig = PxSIG_ATAPI; + ahci_write_reset_fis_d2h(pr); +} + +static void +ahci_reset(struct pci_ahci_softc *sc) +{ + int i; + + sc->ghc = AHCI_GHC_AE; + sc->is = 0; + + if (sc->lintr) { + pci_lintr_deassert(sc->asc_pi); + sc->lintr = 0; + } + + for (i = 0; i < sc->ports; i++) { + sc->port[i].ie = 0; + sc->port[i].is = 0; + sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); + if (sc->port[i].bctx) + sc->port[i].cmd |= AHCI_P_CMD_CPS; + sc->port[i].sctl = 0; + ahci_port_reset(&sc->port[i]); + } +} + +static void +ata_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i ^ 1] = (uint8_t) *src++; + else + dest[i ^ 1] = ' '; + } +} + +static void +atapi_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i] = (uint8_t) *src++; + else + dest[i] = ' '; + } +} + +/* + * Build up the iovec based on the PRDT, 'done' and 'len'. + */ +static void +ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, + struct ahci_prdt_entry *prdt, uint16_t prdtl) +{ + struct blockif_req *breq = &aior->io_req; + int i, j, skip, todo, left, extra; + uint32_t dbcsz; + + /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ + skip = (int) aior->done; + left = (int) (aior->len - aior->done); + todo = 0; + for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; + i++, prdt++) { + dbcsz = (prdt->dbc & DBCMASK) + 1; + /* Skip already done part of the PRDT */ + if (dbcsz <= ((uint32_t) skip)) { + skip -= dbcsz; + continue; + } + dbcsz -= ((unsigned) skip); + if (dbcsz > ((uint32_t) left)) { + dbcsz = ((uint32_t) left); + } + breq->br_iov[j].iov_base = + paddr_guest2host((prdt->dba + ((uint64_t) skip)), dbcsz); + breq->br_iov[j].iov_len = dbcsz; + todo += dbcsz; + left -= dbcsz; + skip = 0; + j++; + } + + /* If we got limited by IOV length, round I/O down to sector size. 
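+	 * The trimmed tail is not lost: 'done' advances only by what was
+	 * actually queued, so 'more' normally remains set and the completion
+	 * callback reissues the transfer at the new offset. Worked example,
+	 * assuming 512-byte sectors: todo = 4123 gets cut back by
+	 * extra = 4123 % 512 = 27 bytes, and those 27 bytes ride along in
+	 * the follow-up request.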
*/ + if (j == BLOCKIF_IOV_MAX) { + extra = todo % blockif_sectsz(p->bctx); + todo -= extra; + assert(todo > 0); + while (extra > 0) { + if (breq->br_iov[j - 1].iov_len > ((size_t) extra)) { + breq->br_iov[j - 1].iov_len -= ((size_t) extra); + break; + } + extra -= breq->br_iov[j - 1].iov_len; + j--; + } + } + + breq->br_iovcnt = j; + breq->br_resid = todo; + aior->done += ((unsigned) todo); + aior->more = (aior->done < aior->len && i < prdtl); +} + +static void +ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + struct ahci_prdt_entry *prdt; + struct ahci_cmd_hdr *hdr; + uint64_t lba; + uint32_t len; + int err, first, ncq, readop; + + prdt = (struct ahci_prdt_entry *)((void *) (cfis + 0x80)); + hdr = (struct ahci_cmd_hdr *)((void *) (p->cmd_lst + slot * AHCI_CL_SIZE)); + ncq = 0; + readop = 1; + first = (done == 0); + + if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || + cfis[2] == ATA_WRITE_FPDMA_QUEUED) + readop = 0; + + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = (uint32_t) (cfis[11] << 8 | cfis[3]); + if (!len) + len = 65536; + ncq = 1; + } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = (uint32_t) (cfis[13] << 8 | cfis[12]); + if (!len) + len = 65536; + } else { + lba = (uint64_t) (((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | + (cfis[5] << 8) | cfis[4]); + len = cfis[12]; + if (!len) + len = 256; + } + lba *= (uint64_t) blockif_sectsz(p->bctx); + len *= (uint32_t) blockif_sectsz(p->bctx); + + /* Pull request off free list */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + breq = &aior->io_req; + breq->br_offset = (off_t) (lba + done); + ahci_build_iov(p, aior, prdt, hdr->prdtl); + + /* Mark this command in-flight. */ + p->pending |= 1 << slot; + + /* Stuff request onto busy list. */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); + + if (readop) + err = blockif_read(p->bctx, breq); + else + err = blockif_write(p->bctx, breq); + assert(err == 0); +} + +static void +ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + int err; + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = 0; + aior->done = 0; + aior->more = 0; + breq = &aior->io_req; + + /* + * Mark this command in-flight. 
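+	 * ('pending' is a bitmap of slots with outstanding requests;
+	 * ahci_handle_port() masks it against PxCI so a busy slot is never
+	 * issued twice.)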
+ */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + err = blockif_flush(p->bctx, breq); + assert(err == 0); +} + +static inline void +read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *to; + int i, len; + + hdr = (struct ahci_cmd_hdr *)((void *) (p->cmd_lst + slot * AHCI_CL_SIZE)); + len = size; + to = buf; + prdt = (struct ahci_prdt_entry *)((void *) (cfis + 0x80)); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(prdt->dba, dbcsz); + sublen = ((len < ((int) dbcsz)) ? len : ((int) dbcsz)); + memcpy(to, ptr, sublen); + len -= sublen; + to = (uint8_t *) (((uintptr_t) to) + ((uintptr_t) sublen)); + prdt++; + } +} + +static void +ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + uint8_t *entry; + uint64_t elba; + uint32_t len, elen; + int err, first, ncq; + uint8_t buf[512]; + + first = (done == 0); + if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { + len = (uint32_t) ((((uint16_t) cfis[13]) << 8) | cfis[12]); + len *= 512; + ncq = 0; + } else { /* ATA_SEND_FPDMA_QUEUED */ + len = (uint32_t) ((((uint16_t) cfis[11]) << 8) | cfis[3]); + len *= 512; + ncq = 1; + } + read_prdt(p, slot, cfis, buf, sizeof(buf)); + +next: + entry = &buf[done]; + elba = ((uint64_t)entry[5] << 40) | + ((uint64_t)entry[4] << 32) | + ((uint64_t)entry[3] << 24) | + ((uint64_t)entry[2] << 16) | + ((uint64_t)entry[1] << 8) | + entry[0]; + elen = (uint32_t) ((((uint16_t) entry[7]) << 8) | entry[6]); + done += 8; + if (elen == 0) { + if (done >= len) { + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + p->pending &= ~(1 << slot); + ahci_check_stopped(p); + if (!first) + ahci_handle_port(p); + return; + } + goto next; + } + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + aior->more = (len != done); + + breq = &aior->io_req; + breq->br_offset = (off_t) (elba * ((uint64_t) blockif_sectsz(p->bctx))); + breq->br_resid = elen * ((unsigned) blockif_sectsz(p->bctx)); + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); + + err = blockif_delete(p->bctx, breq); + assert(err == 0); +} + +static inline void +write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *from; + int i, len; + + hdr = (struct ahci_cmd_hdr *)((void *) (p->cmd_lst + slot * AHCI_CL_SIZE)); + len = size; + from = buf; + prdt = (struct ahci_prdt_entry *)((void *) (cfis + 0x80)); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(prdt->dba, dbcsz); + sublen = (len < ((int) dbcsz)) ? 
len : ((int) dbcsz); + memcpy(ptr, from, sublen); + len -= sublen; + from = (void *) (((uintptr_t) from) + ((uintptr_t) sublen)); + prdt++; + } + hdr->prdbc = (uint32_t) (size - len); +} + +static void +ahci_checksum(uint8_t *buf, int size) +{ + int i; + uint8_t sum = 0; + + for (i = 0; i < size - 1; i++) + sum += buf[i]; + buf[size - 1] = (uint8_t) (0x100 - sum); +} + +static void +ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + uint8_t buf[512]; + + hdr = (struct ahci_cmd_hdr *)((void *) (p->cmd_lst + slot * AHCI_CL_SIZE)); + if (p->atapi || hdr->prdtl == 0 || cfis[4] != 0x10 || + cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + memset(buf, 0, sizeof(buf)); + memcpy(buf, p->err_cfis, sizeof(p->err_cfis)); + ahci_checksum(buf, sizeof(buf)); + + if (cfis[2] == ATA_READ_LOG_EXT) + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); +} + +static void +handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + + hdr = (struct ahci_cmd_hdr *)((void *) (p->cmd_lst + slot * AHCI_CL_SIZE)); + if (p->atapi || hdr->prdtl == 0) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + } else { + uint16_t buf[256]; + uint64_t sectors; + int sectsz, psectsz, psectoff, candelete, ro; + uint16_t cyl; + uint8_t sech, heads; + + ro = blockif_is_ro(p->bctx); + candelete = blockif_candelete(p->bctx); + sectsz = blockif_sectsz(p->bctx); + sectors = (uint64_t) (blockif_size(p->bctx) / sectsz); + blockif_chs(p->bctx, &cyl, &heads, &sech); + blockif_psectsz(p->bctx, &psectsz, &psectoff); + memset(buf, 0, sizeof(buf)); + buf[0] = 0x0040; + buf[1] = cyl; + buf[3] = heads; + buf[6] = sech; + ata_string((uint8_t *)(buf+10), p->ident, 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); + buf[47] = (0x8000 | 128); + buf[48] = 0x1; + buf[49] = (1 << 8 | 1 << 9 | 1 << 11); + buf[50] = (1 << 14); + buf[53] = (1 << 1 | 1 << 2); + if (p->mult_sectors) + buf[59] = (uint16_t) (0x100 | p->mult_sectors); + if (sectors <= 0x0fffffff) { + buf[60] = (uint16_t) sectors; + buf[61] = (uint16_t)(sectors >> 16); + } else { + buf[60] = 0xffff; + buf[61] = 0x0fff; + } + buf[63] = 0x7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); + buf[64] = 0x3; + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[69] = 0; + buf[75] = 31; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | + ATA_SUPPORT_NCQ); + buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | + (p->ssts & ATA_SS_SPD_MASK) >> 3); + buf[80] = 0x3f0; + buf[81] = 0x28; + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); + buf[84] = (1 << 14); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); + buf[87] = (1 << 14); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[100] = (uint16_t) sectors; + buf[101] = (uint16_t) (sectors >> 16); + buf[102] = (uint16_t) (sectors >> 32); + buf[103] = (sectors >> 
48); + if (candelete && !ro) { + buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; + buf[105] = 1; + buf[169] = ATA_SUPPORT_DSM_TRIM; + } + buf[106] = 0x4000; + buf[209] = 0x4000; + if (psectsz > sectsz) { + buf[106] |= 0x2000; + buf[106] |= ffsl(psectsz / sectsz) - 1; + buf[209] |= (psectoff / sectsz); + } + if (sectsz > 512) { + buf[106] |= 0x1000; + buf[117] = (uint16_t) (sectsz / 2); + buf[118] = (uint16_t) ((sectsz / 2) >> 16); + } + buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); + } +} + +static void +handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + if (!p->atapi) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + } else { + uint16_t buf[256]; + + memset(buf, 0, sizeof(buf)); + buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); + ata_string((uint8_t *)(buf+10), p->ident, 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); + buf[49] = (1 << 9 | 1 << 8); + buf[50] = (1 << 14 | 1); + buf[53] = (1 << 2 | 1 << 1); + buf[62] = 0x3f; + buf[63] = 7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); + buf[64] = 3; + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); + buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); + buf[78] = (1 << 5); + buf[80] = 0x3f0; + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); + buf[83] = (1 << 14); + buf[84] = (1 << 14); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); + buf[87] = (1 << 14); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); + } +} + +static void +atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[36]; + uint8_t *acmd; + int len; + uint32_t tfd; + + acmd = cfis + 0x40; + + if (acmd[1] & 1) { /* VPD */ + if (acmd[2] == 0) { /* Supported VPD pages */ + buf[0] = 0x05; + buf[1] = 0; + buf[2] = 0; + buf[3] = 1; + buf[4] = 0; + len = 4 + buf[3]; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (uint32_t) ((p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + } else { + buf[0] = 0x05; + buf[1] = 0x80; + buf[2] = 0x00; + buf[3] = 0x21; + buf[4] = 31; + buf[5] = 0; + buf[6] = 0; + buf[7] = 0; + atapi_string(buf + 8, "BHYVE", 8); + atapi_string(buf + 16, "BHYVE DVD-ROM", 16); + atapi_string(buf + 32, "001", 4); + len = sizeof(buf); + } + + if (len > acmd[4]) + len = acmd[4]; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, len); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static __inline void +be16enc(void *pp, uint16_t u) +{ + unsigned char *p = (unsigned char *)pp; + + p[0] = (u >> 8) & 0xff; + p[1] = u & 0xff; +} + +static __inline uint16_t +be16dec(const void *pp) +{ + 
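+	/* decode a big-endian 16-bit value (SCSI/ATAPI wire byte order) */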
unsigned char const *p = (unsigned char const *)pp; + + return ((uint16_t) ((((uint32_t) p[0]) << 8) | ((uint32_t) p[1]))); +} + +static __inline void +be32enc(void *pp, uint32_t u) +{ + unsigned char *p = (unsigned char *)pp; + + p[0] = (u >> 24) & 0xff; + p[1] = (u >> 16) & 0xff; + p[2] = (u >> 8) & 0xff; + p[3] = u & 0xff; +} + +static __inline uint32_t +be32dec(const void *pp) +{ + unsigned char const *p = (unsigned char const *)pp; + + return (uint32_t) ((((uint64_t) p[0]) << 24) | + (((uint64_t) p[1]) << 16) | (((uint64_t) p[2]) << 8) | + ((uint64_t) p[3])); +} + +static void +atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[8]; + uint64_t sectors; + + sectors = (uint64_t) (blockif_size(p->bctx) / 2048); + be32enc(buf, ((uint32_t) (sectors - 1))); + be32enc(buf + 4, 2048); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + uint8_t format; + int len; + + acmd = cfis + 0x40; + + len = be16dec(acmd + 7); + format = acmd[9] >> 6; + switch (format) { + case 0: + { + int msf, size; + uint64_t sectors; + uint8_t start_track, buf[20], *bp; + + msf = (acmd[1] >> 1) & 1; + start_track = acmd[6]; + if (start_track > 1 && start_track != 0xaa) { + uint32_t tfd; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (uint32_t) ((p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + if (start_track <= 1) { + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 1; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + } + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 0xaa; + *bp++ = 0; + sectors = (uint64_t) (blockif_size(p->bctx) / blockif_sectsz(p->bctx)); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, ((int) sectors)); + bp += 3; + } else { + be32enc(bp, ((uint32_t) sectors)); + bp += 4; + } + size = (int) (bp - buf); + be16enc(buf, ((uint16_t) (size - 2))); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 1: + { + uint8_t buf[12]; + + memset(buf, 0, sizeof(buf)); + buf[1] = 0xa; + buf[2] = 0x1; + buf[3] = 0x1; + if (((size_t) len) > sizeof(buf)) + len = sizeof(buf); + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 2: + { + int msf, size; + uint64_t sectors; + uint8_t start_track, *bp, buf[50]; + + msf = (acmd[1] >> 1) & 1; + start_track = acmd[6]; + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa2; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + sectors = (uint64_t) (blockif_size(p->bctx) / blockif_sectsz(p->bctx)); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, ((int) sectors)); + bp += 3; + } else { 
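+				/* non-MSF form: absolute LBA as a
+				 * big-endian 32-bit value */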
+ be32enc(bp, ((uint32_t) sectors)); + bp += 4; + } + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + + size = (int) (bp - buf); + be16enc(buf, ((uint16_t) (size - 2))); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + default: + { + uint32_t tfd; + + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (uint32_t) ((p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + break; + } + } +} + +static void +atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[16]; + + memset(buf, 0, sizeof(buf)); + buf[3] = 8; + + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + struct blockif_req *breq; + struct pci_ahci_softc *sc; + uint8_t *acmd; + uint64_t lba; + uint32_t len; + int err; + + sc = p->pr_sc; + acmd = cfis + 0x40; + hdr = (struct ahci_cmd_hdr *)((void *) (p->cmd_lst + slot * AHCI_CL_SIZE)); + prdt = (struct ahci_prdt_entry *)((void *) (cfis + 0x80)); + + lba = be32dec(acmd + 2); + if (acmd[0] == READ_10) + len = be16dec(acmd + 7); + else + len = be32dec(acmd + 6); + if (len == 0) { + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + } + lba *= 2048; + len *= 2048; + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + breq = &aior->io_req; + breq->br_offset = (off_t) (lba + ((uint64_t) done)); + ahci_build_iov(p, aior, prdt, hdr->prdtl); + + /* Mark this command in-flight. */ + p->pending |= 1 << slot; + + /* Stuff request onto busy list. 
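+	 * The busy list is what ahci_port_stop() walks to cancel
+	 * outstanding i/o when the guest clears PxCMD.ST.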
*/ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + err = blockif_read(p->bctx, breq); + assert(err == 0); +} + +static void +atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[64]; + uint8_t *acmd; + int len; + + acmd = cfis + 0x40; + len = acmd[4]; + if (((size_t) len) > sizeof(buf)) + len = sizeof(buf); + memset(buf, 0, len); + buf[0] = 0x70 | (1 << 7); + buf[2] = p->sense_key; + buf[7] = 10; + buf[12] = p->asc; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd = cfis + 0x40; + uint32_t tfd; + + tfd = 0; + + switch (acmd[4] & 3) { + case 0: + case 1: + case 3: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + tfd = ATA_S_READY | ATA_S_DSC; + break; + case 2: + /* TODO eject media */ + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x53; + tfd = (uint32_t) ((p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); + break; + } + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + uint32_t tfd; + uint8_t pc, code; + int len; + + tfd = 0; + + acmd = cfis + 0x40; + len = be16dec(acmd + 7); + pc = acmd[2] >> 6; + code = acmd[2] & 0x3f; + + switch (pc) { + case 0: + switch (code) { + case MODEPAGE_RW_ERROR_RECOVERY: + { + uint8_t buf[16]; + + if (((size_t) len) > sizeof(buf)) { + len = sizeof(buf); + } + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 16 - 2); + buf[2] = 0x70; + buf[8] = 0x01; + buf[9] = 16 - 10; + buf[11] = 0x05; + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + break; + } + case MODEPAGE_CD_CAPABILITIES: + { + uint8_t buf[30]; + + if (((size_t) len) > sizeof(buf)) { + len = sizeof(buf); + } + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 30 - 2); + buf[2] = 0x70; + buf[8] = 0x2A; + buf[9] = 30 - 10; + buf[10] = 0x08; + buf[12] = 0x71; + be16enc(&buf[18], 2); + be16enc(&buf[20], 512); + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + break; + } + default: + goto error; + } + break; + case 3: + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x39; + tfd = (uint32_t) ((p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); + break; +error: + case 1: + case 2: + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (uint32_t) ((p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); + break; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +atapi_get_event_status_notification(struct ahci_port *p, int slot, + uint8_t *cfis) +{ + uint8_t *acmd; + uint32_t tfd; + + acmd = cfis + 0x40; + + /* we don't support asynchronous operation */ + if (!(acmd[1] & 1)) { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (uint32_t) ((p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); + } else { + uint8_t buf[8]; + int len; + + len = be16dec(acmd + 7); + if (((size_t) len) > sizeof(buf)) { + len = sizeof(buf); + } + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 8 - 2); + buf[2] = 0x04; + buf[3] = 0x10; + buf[5] = 0x02; + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t 
*acmd; + + acmd = cfis + 0x40; + +#ifdef AHCI_DEBUG + { + int i; + DPRINTF("ACMD:"); + for (i = 0; i < 16; i++) + DPRINTF("%02x ", acmd[i]); + DPRINTF("\n"); + } +#endif + + switch (acmd[0]) { + case TEST_UNIT_READY: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case INQUIRY: + atapi_inquiry(p, slot, cfis); + break; + case READ_CAPACITY: + atapi_read_capacity(p, slot, cfis); + break; + case PREVENT_ALLOW: + /* TODO */ + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case READ_TOC: + atapi_read_toc(p, slot, cfis); + break; + case REPORT_LUNS: + atapi_report_luns(p, slot, cfis); + break; + case READ_10: + case READ_12: + atapi_read(p, slot, cfis, 0); + break; + case REQUEST_SENSE: + atapi_request_sense(p, slot, cfis); + break; + case START_STOP_UNIT: + atapi_start_stop_unit(p, slot, cfis); + break; + case MODE_SENSE_10: + atapi_mode_sense(p, slot, cfis); + break; + case GET_EVENT_STATUS_NOTIFICATION: + atapi_get_event_status_notification(p, slot, cfis); + break; + default: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x20; + ahci_write_fis_d2h(p, slot, cfis, ((uint32_t) (p->sense_key << 12)) | + ((uint32_t) (ATA_S_READY | ATA_S_ERROR))); + break; + } +} + +static void +ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) +{ + + p->tfd |= ATA_S_BUSY; + switch (cfis[2]) { + case ATA_ATA_IDENTIFY: + handle_identify(p, slot, cfis); + break; + case ATA_SETFEATURES: + { + switch (cfis[3]) { + case ATA_SF_ENAB_SATA_SF: + switch (cfis[12]) { + case ATA_SATA_SF_AN: + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + break; + case ATA_SF_ENAB_WCACHE: + case ATA_SF_DIS_WCACHE: + case ATA_SF_ENAB_RCACHE: + case ATA_SF_DIS_RCACHE: + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + case ATA_SF_SETXFER: + { + switch (cfis[12] & 0xf8) { + case ATA_PIO: + case ATA_PIO0: + break; + case ATA_WDMA0: + case ATA_UDMA0: + p->xfermode = (cfis[12] & 0x7); + break; + } + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + } + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + ahci_write_fis_d2h(p, slot, cfis, p->tfd); + break; + } + case ATA_SET_MULTI: + if (cfis[12] != 0 && + (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + } else { + p->mult_sectors = cfis[12]; + p->tfd = ATA_S_DSC | ATA_S_READY; + } + ahci_write_fis_d2h(p, slot, cfis, p->tfd); + break; + case ATA_READ: + case ATA_WRITE: + case ATA_READ48: + case ATA_WRITE48: + case ATA_READ_MUL: + case ATA_WRITE_MUL: + case ATA_READ_MUL48: + case ATA_WRITE_MUL48: + case ATA_READ_DMA: + case ATA_WRITE_DMA: + case ATA_READ_DMA48: + case ATA_WRITE_DMA48: + case ATA_READ_FPDMA_QUEUED: + case ATA_WRITE_FPDMA_QUEUED: + ahci_handle_rw(p, slot, cfis, 0); + break; + case ATA_FLUSHCACHE: + case ATA_FLUSHCACHE48: + ahci_handle_flush(p, slot, cfis); + break; + case ATA_DATA_SET_MANAGEMENT: + if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && + cfis[13] == 0 && cfis[12] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_SEND_FPDMA_QUEUED: + if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && + cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && + cfis[11] 
== 0 && cfis[13] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_READ_LOG_EXT: + case ATA_READ_LOG_DMA_EXT: + ahci_handle_read_log(p, slot, cfis); + break; + case ATA_NOP: + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_STANDBY_CMD: + case ATA_STANDBY_IMMEDIATE: + case ATA_IDLE_CMD: + case ATA_IDLE_IMMEDIATE: + case ATA_SLEEP: + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case ATA_ATAPI_IDENTIFY: + handle_atapi_identify(p, slot, cfis); + break; + case ATA_PACKET_CMD: + if (!p->atapi) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + } else + handle_packet_cmd(p, slot, cfis); + break; + default: + WPRINTF("Unsupported cmd:%02x\n", cfis[2]); + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + } +} + +static void +ahci_handle_slot(struct ahci_port *p, int slot) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + struct pci_ahci_softc *sc; + uint8_t *cfis; + int cfl; + + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)((void *) (p->cmd_lst + slot * AHCI_CL_SIZE)); + cfl = (hdr->flags & 0x1f) * 4; + cfis = paddr_guest2host(hdr->ctba, + 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); + prdt = (struct ahci_prdt_entry *)((void *) (cfis + 0x80)); + +#ifdef AHCI_DEBUG + DPRINTF("\ncfis:"); + for (i = 0; i < cfl; i++) { + if (i % 10 == 0) + DPRINTF("\n"); + DPRINTF("%02x ", cfis[i]); + } + DPRINTF("\n"); + + for (i = 0; i < hdr->prdtl; i++) { + DPRINTF("%d@%08"PRIx64"\n", prdt->dbc & 0x3fffff, prdt->dba); + prdt++; + } +#endif + + if (cfis[0] != FIS_TYPE_REGH2D) { + WPRINTF("Not a H2D FIS:%02x\n", cfis[0]); + return; + } + + if (cfis[1] & 0x80) { + ahci_handle_cmd(p, slot, cfis); + } else { + if (cfis[15] & (1 << 2)) + p->reset = 1; + else if (p->reset) { + p->reset = 0; + ahci_port_reset(p); + } + p->ci &= ~(1 << slot); + } +} + +static void +ahci_handle_port(struct ahci_port *p) +{ + + if (!(p->cmd & AHCI_P_CMD_ST)) + return; + + /* + * Search for any new commands to issue ignoring those that + * are already in-flight. Stop if device is busy or in error. + */ + for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { + if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) + break; + if (p->waitforclear) + break; + if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { + p->cmd &= ~((unsigned) AHCI_P_CMD_CCS_MASK); + p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; + ahci_handle_slot(p, ((int) p->ccs)); + } + } +} + +/* + * blockif callback routine - this runs in the context of the blockif + * i/o thread, so the mutex needs to be acquired. 
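+ * The softc mutex is the same one taken by pci_ahci_read() and
+ * pci_ahci_write(), so completions are serialized against vCPU-initiated
+ * register accesses.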
+ */ +static void +ata_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint32_t tfd; + uint8_t *cfis; + int slot, ncq, dsm; + + DPRINTF("%s %d\n", __func__, err); + + ncq = dsm = 0; + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)((void *) (p->cmd_lst + slot * AHCI_CL_SIZE)); + + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) + ncq = 1; + if (cfis[2] == ATA_DATA_SET_MANAGEMENT || + (cfis[2] == ATA_SEND_FPDMA_QUEUED && + (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) + dsm = 1; + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + if (dsm) + ahci_handle_dsm_trim(p, slot, cfis, aior->done); + else + ahci_handle_rw(p, slot, cfis, aior->done); + goto out; + } + + if (!err) + tfd = ATA_S_READY | ATA_S_DSC; + else + tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + if (ncq) + ahci_write_fis_sdb(p, slot, cfis, tfd); + else + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. + */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); + ahci_handle_port(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +atapi_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint8_t *cfis; + uint32_t tfd; + int slot; + + DPRINTF("%s %d\n", __func__, err); + + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *) + ((void *) (p->cmd_lst + aior->slot * AHCI_CL_SIZE)); + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + atapi_read(p, slot, cfis, aior->done); + goto out; + } + + if (!err) { + tfd = ATA_S_READY | ATA_S_DSC; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x21; + tfd = (uint32_t) ((p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. 
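+	 * Clearing the slot's 'pending' bit below lets ahci_handle_port()
+	 * pick up any commands the guest queued while this one was in
+	 * flight.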
+ */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); + ahci_handle_port(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +pci_ahci_ioreq_init(struct ahci_port *pr) +{ + struct ahci_ioreq *vr; + int i; + + pr->ioqsz = blockif_queuesz(pr->bctx); + pr->ioreq = calloc(((size_t) pr->ioqsz), sizeof(struct ahci_ioreq)); + STAILQ_INIT(&pr->iofhd); + + /* + * Add all i/o request entries to the free queue + */ + for (i = 0; i < pr->ioqsz; i++) { + vr = &pr->ioreq[i]; + vr->io_pr = pr; + if (!pr->atapi) + vr->io_req.br_callback = ata_ioreq_cb; + else + vr->io_req.br_callback = atapi_ioreq_cb; + vr->io_req.br_param = vr; + STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); + } + + TAILQ_INIT(&pr->iobhd); +} + +static void +pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + int port = (int) ((offset - AHCI_OFFSET) / AHCI_STEP); + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + struct ahci_port *p = &sc->port[port]; + + DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + port, offset, value); + + switch (offset) { + case AHCI_P_CLB: + p->clb = (uint32_t) value; + break; + case AHCI_P_CLBU: + p->clbu = (uint32_t) value; + break; + case AHCI_P_FB: + p->fb = (uint32_t) value; + break; + case AHCI_P_FBU: + p->fbu = (uint32_t) value; + break; + case AHCI_P_IS: + p->is &= ~value; + break; + case AHCI_P_IE: + p->ie = value & 0xFDC000FF; + ahci_generate_intr(sc); + break; + case AHCI_P_CMD: + { + p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); + p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; + + if (!(value & AHCI_P_CMD_ST)) { + ahci_port_stop(p); + } else { + uint64_t clb; + + p->cmd |= AHCI_P_CMD_CR; + clb = (uint64_t)p->clbu << 32 | p->clb; + p->cmd_lst = paddr_guest2host(clb, AHCI_CL_SIZE * AHCI_MAX_SLOTS); + } + + if (value & AHCI_P_CMD_FRE) { + uint64_t fb; + + p->cmd |= AHCI_P_CMD_FR; + fb = (uint64_t)p->fbu << 32 | p->fb; + /* we don't support FBSCP, so rfis size is 256Bytes */ + p->rfis = paddr_guest2host(fb, 256); + } else { + p->cmd &= ~((unsigned) AHCI_P_CMD_FR); + } + + if (value & AHCI_P_CMD_CLO) { + p->tfd &= ~((unsigned) (ATA_S_BUSY | ATA_S_DRQ)); + p->cmd &= ~((unsigned) AHCI_P_CMD_CLO); + } + + if (value & AHCI_P_CMD_ICC_MASK) { + p->cmd &= ~AHCI_P_CMD_ICC_MASK; + } + + ahci_handle_port(p); + break; + } + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset); + break; + case AHCI_P_SCTL: + p->sctl = (uint32_t) value; + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (value & ATA_SC_DET_RESET) + ahci_port_reset(p); + } + break; + case AHCI_P_SERR: + p->serr &= ~value; + break; + case AHCI_P_SACT: + p->sact |= value; + break; + case AHCI_P_CI: + p->ci |= value; + ahci_handle_port(p); + break; + case AHCI_P_SNTF: + case AHCI_P_FBS: + default: + break; + } +} + +static void +pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + offset, value); + + switch (offset) { + case AHCI_CAP: + case AHCI_PI: + case AHCI_VS: + case AHCI_CAP2: + DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", 
offset); + break; + case AHCI_GHC: + if (value & AHCI_GHC_HR) + ahci_reset(sc); + else if (value & AHCI_GHC_IE) { + sc->ghc |= AHCI_GHC_IE; + ahci_generate_intr(sc); + } + break; + case AHCI_IS: + sc->is &= ~value; + ahci_generate_intr(sc); + break; + default: + break; + } +} + +static void +pci_ahci_write(UNUSED int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + + assert(baridx == 5); + assert((offset % 4) == 0 && size == 4); + + pthread_mutex_lock(&sc->mtx); + + if (offset < AHCI_OFFSET) + pci_ahci_host_write(sc, offset, value); + else if (offset < ((uint64_t) (AHCI_OFFSET + (sc->ports * AHCI_STEP)))) + pci_ahci_port_write(sc, offset, value); + else + WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n", offset); + + pthread_mutex_unlock(&sc->mtx); +} + +static uint64_t +pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + + switch (offset) { + case AHCI_CAP: + case AHCI_GHC: + case AHCI_IS: + case AHCI_PI: + case AHCI_VS: + case AHCI_CCCC: + case AHCI_CCCP: + case AHCI_EM_LOC: + case AHCI_EM_CTL: + case AHCI_CAP2: + { + uint32_t *p = &sc->cap; + p += (offset - AHCI_CAP) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n", + offset, value); + + return (value); +} + +static uint64_t +pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + int port = (int) ((offset - AHCI_OFFSET) / AHCI_STEP); + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + + switch (offset) { + case AHCI_P_CLB: + case AHCI_P_CLBU: + case AHCI_P_FB: + case AHCI_P_FBU: + case AHCI_P_IS: + case AHCI_P_IE: + case AHCI_P_CMD: + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + case AHCI_P_SCTL: + case AHCI_P_SERR: + case AHCI_P_SACT: + case AHCI_P_CI: + case AHCI_P_SNTF: + case AHCI_P_FBS: + { + uint32_t *p= &sc->port[port].clb; + p += (offset - AHCI_P_CLB) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + + DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n", + port, offset, value); + + return value; +} + +static uint64_t +pci_ahci_read(UNUSED int vcpu, struct pci_devinst *pi, int baridx, + uint64_t regoff, int size) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + uint64_t offset; + uint32_t value; + + assert(baridx == 5); + assert(size == 1 || size == 2 || size == 4); + assert((regoff & ((uint64_t) (size - 1))) == 0); + + pthread_mutex_lock(&sc->mtx); + + /* round down to a multiple of 4 bytes */ + offset = regoff & ~((uint64_t) 0x3); + if (offset < AHCI_OFFSET) + value = (uint32_t) pci_ahci_host_read(sc, offset); + else if (offset < ((uint64_t) (AHCI_OFFSET + (sc->ports * AHCI_STEP)))) + value = (uint32_t) pci_ahci_port_read(sc, offset); + else { + value = 0; + WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", + regoff); + } + value >>= 8 * (regoff & 0x3); + + pthread_mutex_unlock(&sc->mtx); + + return (value); +} + +static int +pci_ahci_init(struct pci_devinst *pi, char *opts, int atapi) +{ + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; + struct pci_ahci_softc *sc; + int ret, slots; + MD5_CTX mdctx; + u_char digest[16]; + + ret = 0; + + if (opts == NULL) { + fprintf(stderr, "pci_ahci: backing device required\n"); + return (1); + } + +#ifdef AHCI_DEBUG + dbg = fopen("/tmp/log", "w+"); +#endif + + sc = calloc(1, sizeof(struct pci_ahci_softc)); + pi->pi_arg = sc; + sc->asc_pi = pi; + sc->ports = 
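+	    /* the controller models the full six-port ICH8; only port 0
+	     * is wired to a backing device, and PI is set accordingly
+	     * further down */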
MAX_PORTS; + + /* + * Only use port 0 for a backing device. All other ports will be + * marked as unused + */ + sc->port[0].atapi = atapi; + + /* + * Attempt to open the backing image. Use the PCI + * slot/func for the identifier string. + */ + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + ret = 1; + goto open_fail; + } + sc->port[0].bctx = bctxt; + sc->port[0].pr_sc = sc; + + /* + * Create an identifier for the backing file. Use parts of the + * md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, ((unsigned int) strlen(opts))); + MD5Final(digest, &mdctx); + snprintf(sc->port[0].ident, AHCI_PORT_IDENT, "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); + + /* + * Allocate blockif request structures and add them + * to the free list + */ + pci_ahci_ioreq_init(&sc->port[0]); + + pthread_mutex_init(&sc->mtx, NULL); + + /* Intel ICH8 AHCI */ + slots = sc->port[0].ioqsz; + if (slots > 32) + slots = 32; + --slots; + sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | + AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | + AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| + AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | + (((unsigned) slots) << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | + (((unsigned) sc->ports) - 1); + + /* Only port 0 implemented */ + sc->pi = 1; + sc->vs = 0x10300; + sc->cap2 = AHCI_CAP2_APST; + ahci_reset(sc); + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); + pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); + pci_emul_add_msicap(pi, 1); + pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, + ((uint64_t) (AHCI_OFFSET + sc->ports * AHCI_STEP))); + + pci_lintr_request(pi); + +open_fail: + if (ret) { + if (sc->port[0].bctx != NULL) + blockif_close(sc->port[0].bctx); + free(sc); + } + + return (ret); +} + +static int +pci_ahci_hd_init(struct pci_devinst *pi, char *opts) +{ + return (pci_ahci_init(pi, opts, 0)); +} + +static int +pci_ahci_atapi_init(struct pci_devinst *pi, char *opts) +{ + return (pci_ahci_init(pi, opts, 1)); +} + +/* + * Use separate emulation names to distinguish drive and atapi devices + */ +static struct pci_devemu pci_de_ahci_hd = { + .pe_emu = "ahci-hd", + .pe_init = pci_ahci_hd_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci_hd); + +static struct pci_devemu pci_de_ahci_cd = { + .pe_emu = "ahci-cd", + .pe_init = pci_ahci_atapi_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/vendor/github.com/hooklift/xhyve/pci_emul.c b/vendor/github.com/hooklift/xhyve/pci_emul.c new file mode 100644 index 0000000..5b34694 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pci_emul.c @@ -0,0 +1,2141 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT0 0x0cfc +#define CONF1_DATA_PORT1 0x0cfd +#define CONF1_DATA_PORT2 0x0cfe +#define CONF1_DATA_PORT3 0x0cff + +#define CONF1_ENABLE 0x80000000ul + +#define MAXBUSES (PCI_BUSMAX + 1) +#define MAXSLOTS (PCI_SLOTMAX + 1) +#define MAXFUNCS (PCI_FUNCMAX + 1) + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct funcinfo { + char *fi_name; + char *fi_param; + struct pci_devinst *fi_devi; +}; + +struct intxinfo { + int ii_count; + int ii_pirq_pin; + int ii_ioapic_irq; +}; + +struct slotinfo { + struct intxinfo si_intpins[4]; + struct funcinfo si_funcs[MAXFUNCS]; +}; + +struct businfo { + uint16_t iobase, iolimit; /* I/O window */ + uint32_t membase32, memlimit32; /* mmio window below 4GB */ + uint64_t membase64, memlimit64; /* mmio window above 4GB */ + struct slotinfo slotinfo[MAXSLOTS]; +}; +#pragma clang diagnostic pop + +static struct businfo *pci_businfo[MAXBUSES]; + +SET_DECLARE(pci_devemu_set, struct pci_devemu); + +static uint64_t pci_emul_iobase; +static uint64_t pci_emul_membase32; +static uint64_t pci_emul_membase64; + +#define PCI_EMUL_IOBASE 0x2000 +#define PCI_EMUL_IOLIMIT 0x10000 + +#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE + +#define PCI_EMUL_MEMBASE64 0xD000000000UL +#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL + +static struct pci_devemu *pci_emul_finddev(char *name); +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void pci_cfgrw(int vcpu, int in, int bus, int slot, int func, int coff, + int bytes, uint32_t *val); + +static __inline void +CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) +{ + + if (bytes == 1) + pci_set_cfgdata8(pi, coff, ((uint8_t) val)); + else if (bytes == 2) + pci_set_cfgdata16(pi, coff, ((uint16_t) val)); + else + pci_set_cfgdata32(pi, coff, val); +} + +static __inline uint32_t +CFGREAD(struct pci_devinst *pi, int coff, int bytes) +{ + + if (bytes == 1) + return (pci_get_cfgdata8(pi, coff)); + else if (bytes == 2) + return (pci_get_cfgdata16(pi, coff)); + else + return (pci_get_cfgdata32(pi, coff)); +} + +/* + * I/O access + */ + +/* + * Slot options are in the form: + * 
+ *  <bus>:<slot>:<func>,<emul>[,<config>]
+ *  <slot>[:<func>],<emul>[,<config>]
+ *
+ *  slot is 0..31
+ *  func is 0..7
+ *  emul is a string describing the type of PCI device e.g. virtio-net
+ *  config is an optional string, depending on the device, that can be
+ *  used for configuration.
+ *  Examples are:
+ *   1,virtio-net,tap0
+ *   3:0,dummy
+ */
+static void
+pci_parse_slot_usage(char *aopt)
+{
+	fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt);
+}
+
+int
+pci_parse_slot(char *opt)
+{
+	struct businfo *bi;
+	struct slotinfo *si;
+	char *emul, *config, *str, *cp;
+	int error, bnum, snum, fnum;
+
+	error = -1;
+	str = strdup(opt);
+
+	emul = config = NULL;
+	if ((cp = strchr(str, ',')) != NULL) {
+		*cp = '\0';
+		emul = cp + 1;
+		if ((cp = strchr(emul, ',')) != NULL) {
+			*cp = '\0';
+			config = cp + 1;
+		}
+	} else {
+		pci_parse_slot_usage(opt);
+		goto done;
+	}
+
+	/* <bus>:<slot>:<func> */
+	if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) {
+		bnum = 0;
+		/* <slot>:<func> */
+		if (sscanf(str, "%d:%d", &snum, &fnum) != 2) {
+			fnum = 0;
+			/* <slot> */
+			if (sscanf(str, "%d", &snum) != 1) {
+				snum = -1;
+			}
+		}
+	}
+
+	if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS ||
+	    fnum < 0 || fnum >= MAXFUNCS) {
+		pci_parse_slot_usage(opt);
+		goto done;
+	}
+
+	if (pci_businfo[bnum] == NULL)
+		pci_businfo[bnum] = calloc(1, sizeof(struct businfo));
+
+	bi = pci_businfo[bnum];
+	si = &bi->slotinfo[snum];
+
+	if (si->si_funcs[fnum].fi_name != NULL) {
+		fprintf(stderr, "pci slot %d:%d already occupied!\n",
+			snum, fnum);
+		goto done;
+	}
+
+	if (pci_emul_finddev(emul) == NULL) {
+		fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n",
+			snum, fnum, emul);
+		goto done;
+	}
+
+	error = 0;
+	si->si_funcs[fnum].fi_name = emul;
+	si->si_funcs[fnum].fi_param = config;
+
+done:
+	if (error)
+		free(str);
+
+	return (error);
+}
+
+static int
+pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
+{
+
+	if (offset < pi->pi_msix.pba_offset)
+		return (0);
+
+	if (offset >= pi->pi_msix.pba_offset + ((unsigned) pi->pi_msix.pba_size)) {
+		return (0);
+	}
+
+	return (1);
+}
+
+int
+pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+	uint64_t value)
+{
+	int msix_entry_offset;
+	int tab_index;
+	char *dest;
+
+	/* support only 4 or 8 byte writes */
+	if (size != 4 && size != 8)
+		return (-1);
+
+	/*
+	 * Return if table index is beyond what device supports
+	 */
+	tab_index = (int) (offset / MSIX_TABLE_ENTRY_SIZE);
+	if (tab_index >= pi->pi_msix.table_count)
+		return (-1);
+
+	msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+	/* support only aligned writes */
+	if ((msix_entry_offset % size) != 0)
+		return (-1);
+
+	dest = (char *)(pi->pi_msix.table + tab_index);
+	dest += msix_entry_offset;
+
+	if (size == 4)
+		*((uint32_t *)((void *) dest)) = (uint32_t) value;
+	else
+		*((uint64_t *)((void *) dest)) = value;
+
+	return (0);
+}
+
+uint64_t
+pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
+{
+	char *dest;
+	int msix_entry_offset;
+	int tab_index;
+	uint64_t retval = ~((uint64_t) 0);
+
+	/*
+	 * The PCI standard only allows 4 and 8 byte accesses to the MSI-X
+	 * table but we also allow 1 byte access to accommodate reads from
+	 * ddb.
+ */ + if (size != 1 && size != 4 && size != 8) + return (retval); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned reads */ + if ((msix_entry_offset % size) != 0) { + return (retval); + } + + tab_index = (int) (offset / MSIX_TABLE_ENTRY_SIZE); + + if (tab_index < pi->pi_msix.table_count) { + /* valid MSI-X Table access */ + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 1) + retval = *((uint8_t *)((void *) dest)); + else if (size == 4) + retval = *((uint32_t *)((void *) dest)); + else + retval = *((uint64_t *)((void *) dest)); + } else if (pci_valid_pba_offset(pi, offset)) { + /* return 0 for PBA access */ + retval = 0; + } + + return (retval); +} + +int +pci_msix_table_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.table_bar); + else + return (-1); +} + +int +pci_msix_pba_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.pba_bar); + else + return (-1); +} + +static int +pci_emul_io_handler(int vcpu, int in, int port, int bytes, uint32_t *eax, + void *arg) +{ + struct pci_devinst *pdi = arg; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int i; + + for (i = 0; i <= PCI_BARMAX; i++) { + if ((pdi->pi_bar[i].type == PCIBAR_IO) && + (((uint64_t) port) >= pdi->pi_bar[i].addr) && + (((uint64_t) (port + bytes)) <= + (pdi->pi_bar[i].addr + pdi->pi_bar[i].size))) + { + offset = ((uint64_t) port) - pdi->pi_bar[i].addr; + if (in) + *eax = (uint32_t) (*pe->pe_barread)(vcpu, pdi, i, offset, + bytes); + else + (*pe->pe_barwrite)(vcpu, pdi, i, offset, bytes, *eax); + return (0); + } + } + return (-1); +} + +static int +pci_emul_mem_handler(int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + struct pci_devinst *pdi = arg1; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int bidx = (int) arg2; + + assert(bidx <= PCI_BARMAX); + assert((pdi->pi_bar[bidx].type == PCIBAR_MEM32) || + (pdi->pi_bar[bidx].type == PCIBAR_MEM64)); + assert((addr >= pdi->pi_bar[bidx].addr) && + ((addr + ((uint64_t) size)) <= + (pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size))); + + offset = addr - pdi->pi_bar[bidx].addr; + + if (dir == MEM_F_WRITE) { + if (size == 8) { + (*pe->pe_barwrite)(vcpu, pdi, bidx, offset, 4, *val & 0xffffffff); + (*pe->pe_barwrite)(vcpu, pdi, bidx, offset + 4, 4, *val >> 32); + } else { + (*pe->pe_barwrite)(vcpu, pdi, bidx, offset, size, *val); + } + } else { + if (size == 8) { + *val = (*pe->pe_barread)(vcpu, pdi, bidx, offset, 4); + *val |= (*pe->pe_barread)(vcpu, pdi, bidx, offset + 4, 4) << 32; + } else { + *val = (*pe->pe_barread)(vcpu, pdi, bidx, offset, size); + } + } + + return (0); +} + + +static int +pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, + uint64_t *addr) +{ + uint64_t base; + + assert((size & (size - 1)) == 0); /* must be a power of 2 */ + + base = roundup2(*baseptr, size); + + if (base + size <= limit) { + *addr = base; + *baseptr = base + size; + return (0); + } else + return (-1); +} + +int +pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, + uint64_t size) +{ + + return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); +} + +/* + * Register (or unregister) the MMIO or I/O region associated with the BAR + * register 'idx' of an emulated pci device. 
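+ *
+ * In outline, matching the switch below: PCIBAR_IO ranges go through
+ * register_inout()/unregister_inout() and PCIBAR_MEM32/PCIBAR_MEM64
+ * ranges through register_mem()/unregister_mem(), with the range taken
+ * from pi->pi_bar[idx].addr and pi->pi_bar[idx].size in both cases.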
+ */
+static void
+modify_bar_registration(struct pci_devinst *pi, int idx, int registration)
+{
+ int error;
+ struct inout_port iop;
+ struct mem_range mr;
+
+ switch (pi->pi_bar[idx].type) {
+ case PCIBAR_IO:
+ bzero(&iop, sizeof(struct inout_port));
+ iop.name = pi->pi_name;
+ iop.port = (int) pi->pi_bar[idx].addr;
+ iop.size = (int)pi->pi_bar[idx].size;
+ if (registration) {
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = pci_emul_io_handler;
+ iop.arg = pi;
+ error = register_inout(&iop);
+ } else
+ error = unregister_inout(&iop);
+ break;
+ case PCIBAR_MEM32:
+ case PCIBAR_MEM64:
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = pi->pi_name;
+ mr.base = pi->pi_bar[idx].addr;
+ mr.size = pi->pi_bar[idx].size;
+ if (registration) {
+ mr.flags = MEM_F_RW;
+ mr.handler = pci_emul_mem_handler;
+ mr.arg1 = pi;
+ mr.arg2 = idx;
+ error = register_mem(&mr);
+ } else
+ error = unregister_mem(&mr);
+ break;
+ case PCIBAR_NONE:
+ case PCIBAR_MEMHI64:
+ error = EINVAL;
+ break;
+ }
+ assert(error == 0);
+}
+
+static void
+unregister_bar(struct pci_devinst *pi, int idx)
+{
+
+ modify_bar_registration(pi, idx, 0);
+}
+
+static void
+register_bar(struct pci_devinst *pi, int idx)
+{
+
+ modify_bar_registration(pi, idx, 1);
+}
+
+/* Are we decoding i/o port accesses for the emulated pci device? */
+static int
+porten(struct pci_devinst *pi)
+{
+ uint16_t cmd;
+
+ cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
+
+ return (cmd & PCIM_CMD_PORTEN);
+}
+
+/* Are we decoding memory accesses for the emulated pci device? */
+static int
+memen(struct pci_devinst *pi)
+{
+ uint16_t cmd;
+
+ cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
+
+ return (cmd & PCIM_CMD_MEMEN);
+}
+
+/*
+ * Update the MMIO or I/O address that is decoded by the BAR register.
+ *
+ * If the pci device has enabled the address space decoding then intercept
+ * the address range decoded by the BAR register.
+ */
+static void
+update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type)
+{
+ int decode;
+
+ if (pi->pi_bar[idx].type == PCIBAR_IO)
+ decode = porten(pi);
+ else
+ decode = memen(pi);
+
+ if (decode)
+ unregister_bar(pi, idx);
+
+ switch (type) {
+ case PCIBAR_IO:
+ case PCIBAR_MEM32:
+ pi->pi_bar[idx].addr = addr;
+ break;
+ case PCIBAR_MEM64:
+ pi->pi_bar[idx].addr &= ~0xffffffffUL;
+ pi->pi_bar[idx].addr |= addr;
+ break;
+ case PCIBAR_MEMHI64:
+ pi->pi_bar[idx].addr &= 0xffffffff;
+ pi->pi_bar[idx].addr |= addr;
+ break;
+ default:
+ assert(0);
+ }
+
+ if (decode)
+ register_bar(pi, idx);
+}
+
+int
+pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+ enum pcibar_type type, uint64_t size)
+{
+ int error;
+ uint64_t *baseptr, limit, addr, mask, lobits, bar;
+
+ assert(idx >= 0 && idx <= PCI_BARMAX);
+
+ addr = 0;
+ limit = 0;
+
+ if ((size & (size - 1)) != 0)
+ size = 1UL << flsl((long) size); /* round up to a power of 2 */
+
+ /* Enforce minimum BAR sizes required by the PCI standard */
+ if (type == PCIBAR_IO) {
+ if (size < 4)
+ size = 4;
+ } else {
+ if (size < 16)
+ size = 16;
+ }
+
+ switch (type) {
+ case PCIBAR_NONE:
+ baseptr = NULL;
+ addr = mask = lobits = 0;
+ break;
+ case PCIBAR_IO:
+ baseptr = &pci_emul_iobase;
+ limit = PCI_EMUL_IOLIMIT;
+ mask = PCIM_BAR_IO_BASE;
+ lobits = PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM64:
+ /*
+ * XXX
+ * Some drivers do not work well if the 64-bit BAR is allocated
+ * above 4GB. Allow for this by allocating small requests under
+ * 4GB unless the allocation size is larger than some arbitrary
+ * number (32MB currently).
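+ *
+ * For example (sizes hypothetical): a 16MB 64-bit BAR is carved out
+ * of the 32-bit window below 4GB, while a 64MB one lands in the
+ * 64-bit window at PCI_EMUL_MEMBASE64 (0xD000000000) and is
+ * additionally marked prefetchable.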
+ */ + if (size > 32 * 1024 * 1024) { + /* + * XXX special case for device requiring peer-peer DMA + */ + if (size == 0x100000000UL) + baseptr = &hostbase; + else + baseptr = &pci_emul_membase64; + limit = PCI_EMUL_MEMLIMIT64; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + break; + } else { + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; + } + break; + case PCIBAR_MEM32: + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + break; + case PCIBAR_MEMHI64: + printf("pci_emul_alloc_base: invalid bar type %d\n", type); + assert(0); + } + + if (baseptr != NULL) { + error = pci_emul_alloc_resource(baseptr, limit, size, &addr); + if (error != 0) + return (error); + } + + pdi->pi_bar[idx].type = type; + pdi->pi_bar[idx].addr = addr; + pdi->pi_bar[idx].size = size; + + /* Initialize the BAR register in config space */ + bar = (addr & mask) | lobits; + pci_set_cfgdata32(pdi, PCIR_BAR(idx), ((uint32_t) bar)); + + if (type == PCIBAR_MEM64) { + assert(idx + 1 <= PCI_BARMAX); + pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; + pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); + } + + register_bar(pdi, idx); + + return (0); +} + +#define CAP_START_OFFSET 0x40 +static int +pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) +{ + int i, capoff, reallen; + uint16_t sts; + + assert(caplen > 0); + + reallen = roundup2(caplen, 4); /* dword aligned */ + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) + capoff = CAP_START_OFFSET; + else + capoff = pi->pi_capend + 1; + + /* Check if we have enough space */ + if (capoff + reallen > PCI_REGMAX + 1) + return (-1); + + /* Set the previous capability pointer */ + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { + pci_set_cfgdata8(pi, PCIR_CAP_PTR, ((uint8_t) capoff)); + pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); + } else + pci_set_cfgdata8(pi, pi->pi_prevcap + 1, ((uint8_t) capoff)); + + /* Copy the capability */ + for (i = 0; i < caplen; i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + /* Set the next capability pointer */ + pci_set_cfgdata8(pi, capoff + 1, 0); + + pi->pi_prevcap = capoff; + pi->pi_capend = capoff + reallen - 1; + return (0); +} + +static struct pci_devemu * +pci_emul_finddev(char *name) +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + if (!strcmp(pdp->pe_emu, name)) { + return (pdp); + } + } + + return (NULL); +} + +static int +pci_emul_init(struct pci_devemu *pde, int bus, int slot, + int func, struct funcinfo *fi) +{ + struct pci_devinst *pdi; + int err; + + pdi = calloc(1, sizeof(struct pci_devinst)); + + pdi->pi_bus = (uint8_t) bus; + pdi->pi_slot = (uint8_t) slot; + pdi->pi_func = (uint8_t) func; + pthread_mutex_init(&pdi->pi_lintr.lock, NULL); + pdi->pi_lintr.pin = 0; + pdi->pi_lintr.state = IDLE; + pdi->pi_lintr.pirq_pin = 0; + pdi->pi_lintr.ioapic_irq = 0; + pdi->pi_d = pde; + snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); + + /* Disable legacy interrupts */ + pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); + pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); + + pci_set_cfgdata8(pdi, PCIR_COMMAND, + PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + + err = (*pde->pe_init)(pdi, fi->fi_param); + if (err == 0) + fi->fi_devi = pdi; + else + free(pdi); + + return (err); 
+} + +void +pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) +{ + int mmc; + + CTASSERT(sizeof(struct msicap) == 14); + + /* Number of msi messages must be a power of 2 between 1 and 32 */ + assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); + mmc = ffs(msgnum) - 1; + + bzero(msicap, sizeof(struct msicap)); + msicap->capid = PCIY_MSI; + msicap->nextptr = (uint8_t) nextptr; + msicap->msgctrl = (uint16_t) (PCIM_MSICTRL_64BIT | (mmc << 1)); +} + +int +pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) +{ + struct msicap msicap; + + pci_populate_msicap(&msicap, msgnum, 0); + + return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); +} + +static void +pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, + uint32_t msix_tab_size) +{ + CTASSERT(sizeof(struct msixcap) == 12); + + assert(msix_tab_size % 4096 == 0); + + bzero(msixcap, sizeof(struct msixcap)); + msixcap->capid = PCIY_MSIX; + + /* + * Message Control Register, all fields set to + * zero except for the Table Size. + * Note: Table size N is encoded as N-1 + */ + msixcap->msgctrl = (uint16_t) (msgnum - 1); + + /* + * MSI-X BAR setup: + * - MSI-X table start at offset 0 + * - PBA table starts at a 4K aligned offset after the MSI-X table + */ + msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; + msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); +} + +static void +pci_msix_table_init(struct pci_devinst *pi, int table_entries) +{ + int i, table_size; + + assert(table_entries > 0); + assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); + + table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, ((size_t) table_size)); + + /* set mask bit of vector control register */ + for (i = 0; i < table_entries; i++) + pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; +} + +int +pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) +{ + uint32_t tab_size; + struct msixcap msixcap; + + assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); + assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); + + tab_size = (uint32_t) (msgnum * MSIX_TABLE_ENTRY_SIZE); + + /* Align table size to nearest 4K */ + tab_size = roundup2(tab_size, 4096u); + + pi->pi_msix.table_bar = barnum; + pi->pi_msix.pba_bar = barnum; + pi->pi_msix.table_offset = 0; + pi->pi_msix.table_count = msgnum; + pi->pi_msix.pba_offset = tab_size; + pi->pi_msix.pba_size = PBA_SIZE(msgnum); + + pci_msix_table_init(pi, msgnum); + + pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); + + /* allocate memory for MSI-X Table and PBA */ + pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, + (tab_size + ((uint32_t) pi->pi_msix.pba_size))); + + return (pci_emul_add_capability(pi, (u_char *)&msixcap, + sizeof(msixcap))); +} + +void +msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask; + int off, table_bar; + + off = offset - capoff; + table_bar = pi->pi_msix.table_bar; + /* Message Control Register */ + if (off == 2 && bytes == 2) { + rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; + pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +void +msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + 
uint16_t msgctrl, rwmask, msgdata, mme; + uint32_t addrlo; + + /* + * If guest is writing to the message control register make sure + * we do not overwrite read-only fields. + */ + if ((offset - capoff) == 2 && bytes == 2) { + rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + addrlo = pci_get_cfgdata32(pi, capoff + 4); + if (msgctrl & PCIM_MSICTRL_64BIT) + msgdata = pci_get_cfgdata16(pi, capoff + 12); + else + msgdata = pci_get_cfgdata16(pi, capoff + 8); + + mme = msgctrl & PCIM_MSICTRL_MME_MASK; + pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; + if (pi->pi_msi.enabled) { + pi->pi_msi.addr = addrlo; + pi->pi_msi.msg_data = msgdata; + pi->pi_msi.maxmsgnum = 1 << (mme >> 4); + } else { + pi->pi_msi.maxmsgnum = 0; + } + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +static void +pciecap_cfgwrite(struct pci_devinst *pi, UNUSED int capoff, int offset, + int bytes, uint32_t val) +{ + + /* XXX don't write to the readonly parts */ + CFGWRITE(pi, offset, val, bytes); +} + +#define PCIECAP_VERSION 0x2 +int +pci_emul_add_pciecap(struct pci_devinst *pi, int type) +{ + int err; + struct pciecap pciecap; + + CTASSERT(sizeof(struct pciecap) == 60); + + if (type != PCIEM_TYPE_ROOT_PORT) + return (-1); + + bzero(&pciecap, sizeof(pciecap)); + + pciecap.capid = PCIY_EXPRESS; + pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT; + pciecap.link_capabilities = 0x411; /* gen1, x1 */ + pciecap.link_status = 0x11; /* gen1, x1 */ + + err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); + return (err); +} + +/* + * This function assumes that 'coff' is in the capabilities region of the + * config space. + */ +static void +pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) +{ + int capid; + uint8_t capoff, nextoff; + + /* Do not allow un-aligned writes */ + if ((offset & (bytes - 1)) != 0) + return; + + /* Find the capability that we want to update */ + capoff = CAP_START_OFFSET; + while (1) { + nextoff = pci_get_cfgdata8(pi, capoff + 1); + if (nextoff == 0) + break; + if (offset >= capoff && offset < nextoff) + break; + + capoff = nextoff; + } + assert(offset >= capoff); + + /* + * Capability ID and Next Capability Pointer are readonly. + * However, some o/s's do 4-byte writes that include these. + * For this case, trim the write back to 2 bytes and adjust + * the data. + */ + if (offset == capoff || offset == capoff + 1) { + if (offset == capoff && bytes == 4) { + bytes = 2; + offset += 2; + val >>= 16; + } else + return; + } + + capid = pci_get_cfgdata8(pi, capoff); + switch (capid) { + case PCIY_MSI: + msicap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_MSIX: + msixcap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_EXPRESS: + pciecap_cfgwrite(pi, capoff, offset, bytes, val); + break; + default: + break; + } +} + +static int +pci_emul_iscap(struct pci_devinst *pi, int offset) +{ + uint16_t sts; + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { + if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) + return (1); + } + return (0); +} + +static int +pci_emul_fallback_handler(UNUSED int vcpu, int dir, UNUSED uint64_t addr, + UNUSED int size, uint64_t *val, UNUSED void *arg1, UNUSED long arg2) +{ + /* + * Ignore writes; return 0xff's for reads. The mem read code + * will take care of truncating to the correct size. 
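+ *
+ * E.g. a 2-byte read of an unbacked address yields 0xffff and a
+ * 4-byte read 0xffffffff, matching what a master-aborted access
+ * returns on real hardware.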
+ */ + if (dir == MEM_F_READ) { + *val = 0xffffffffffffffff; + } + + return (0); +} + +static int +pci_emul_ecfg_handler(int vcpu, int dir, uint64_t addr, int bytes, + uint64_t *val, UNUSED void *arg1, UNUSED long arg2) +{ + int bus, slot, func, coff, in; + + coff = addr & 0xfff; + func = (addr >> 12) & 0x7; + slot = (addr >> 15) & 0x1f; + bus = (addr >> 20) & 0xff; + in = (dir == MEM_F_READ); + if (in) + *val = ~0UL; + pci_cfgrw(vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); + return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + + return (PCI_EMUL_ECFG_BASE); +} + +#define BUSIO_ROUNDUP 32 +#define BUSMEM_ROUNDUP (1024 * 1024) + +int +init_pci(void) +{ + struct mem_range mr; + struct pci_devemu *pde; + struct businfo *bi; + struct slotinfo *si; + struct funcinfo *fi; + size_t lowmem; + int bus, slot, func; + int error; + + pci_emul_iobase = PCI_EMUL_IOBASE; + pci_emul_membase32 = xh_vm_get_lowmem_limit(); + pci_emul_membase64 = PCI_EMUL_MEMBASE64; + + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + /* + * Keep track of the i/o and memory resources allocated to + * this bus. + */ + bi->iobase = (uint16_t) pci_emul_iobase; + bi->membase32 = (uint32_t) pci_emul_membase32; + bi->membase64 = (uint32_t) pci_emul_membase64; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_name == NULL) + continue; + pde = pci_emul_finddev(fi->fi_name); + assert(pde != NULL); + error = pci_emul_init(pde, bus, slot, func, fi); + if (error) + return (error); + } + } + + /* + * Add some slop to the I/O and memory resources decoded by + * this bus to give a guest some flexibility if it wants to + * reprogram the BARs. + */ + pci_emul_iobase += BUSIO_ROUNDUP; + pci_emul_iobase = roundup2(pci_emul_iobase, ((uint32_t) BUSIO_ROUNDUP)); + bi->iolimit = (uint16_t) pci_emul_iobase; + + pci_emul_membase32 += BUSMEM_ROUNDUP; + pci_emul_membase32 = roundup2(pci_emul_membase32, + ((uint64_t) BUSMEM_ROUNDUP)); + bi->memlimit32 = (uint32_t) pci_emul_membase32; + + pci_emul_membase64 += BUSMEM_ROUNDUP; + pci_emul_membase64 = roundup2(pci_emul_membase64, + ((uint64_t) BUSMEM_ROUNDUP)); + bi->memlimit64 = pci_emul_membase64; + } + + /* + * PCI backends are initialized before routing INTx interrupts + * so that LPC devices are able to reserve ISA IRQs before + * routing PIRQ pins. + */ + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_devi == NULL) + continue; + pci_lintr_route(fi->fi_devi); + } + } + } + lpc_pirq_routed(); + + /* + * The guest physical memory map looks like the following: + * [0, lowmem) guest system memory + * [lowmem, lowmem_limit) memory hole (may be absent) + * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) + * [0xE0000000, 0xF0000000) PCI extended config window + * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware + * [4GB, 4GB + highmem) + */ + + /* + * Accesses to memory addresses that are not allocated to system + * memory or PCI devices return 0xff's. 
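+ *
+ * (The ECFG window registered below is the exception: an access to
+ * e.g. 0xE0008000 is decoded by pci_emul_ecfg_handler above as bus 0,
+ * slot 1, func 0, config offset 0.)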
+ */ + lowmem = xh_vm_get_lowmem_size(); + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI hole"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = lowmem; + mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; + mr.handler = pci_emul_fallback_handler; + error = register_mem_fallback(&mr); + assert(error == 0); + + /* PCI extended config space */ + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI ECFG"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = PCI_EMUL_ECFG_BASE; + mr.size = PCI_EMUL_ECFG_SIZE; + mr.handler = pci_emul_ecfg_handler; + error = register_mem(&mr); + assert(error == 0); + + return (0); +} + +static void +pci_apic_prt_entry(UNUSED int bus, int slot, int pin, UNUSED int pirq_pin, + int ioapic_irq, UNUSED void *arg) +{ + + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" Zero,"); + dsdt_line(" 0x%X", ioapic_irq); + dsdt_line(" },"); +} + +static void +pci_pirq_prt_entry(UNUSED int bus, int slot, int pin, int pirq_pin, + UNUSED int ioapic_irq, UNUSED void *arg) +{ + char *name; + + name = lpc_pirq_name(pirq_pin); + if (name == NULL) + return; + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" %s,", name); + dsdt_line(" 0x00"); + dsdt_line(" },"); + free(name); +} + +/* + * A bhyve virtual machine has a flat PCI hierarchy with a root port + * corresponding to each PCI bus. + */ +#if ACPITBL_AML +static void +pci_bus_write_dsdt(int bus) +{ + struct businfo *bi; + + /* + * If there are no devices on this 'bus' then just return. + */ + if ((bi = pci_businfo[bus]) == NULL) { + /* + * Bus 0 is special because it decodes the I/O ports used + * for PCI config space access even if there are no devices + * on it. + */ + if (bus != 0) + return; + } + + dsdt_fixup(bus, bi->iobase, bi->iolimit, bi->membase32, bi->memlimit32, + bi->membase64, bi->memlimit64); + + (void) pci_pirq_prt_entry; + (void) pci_apic_prt_entry; +} +#else +static void +pci_bus_write_dsdt(int bus) +{ + struct businfo *bi; + struct slotinfo *si; + struct pci_devinst *pi; + int count, func, slot; + + /* + * If there are no devices on this 'bus' then just return. + */ + if ((bi = pci_businfo[bus]) == NULL) { + /* + * Bus 0 is special because it decodes the I/O ports used + * for PCI config space access even if there are no devices + * on it. 
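+ *
+ * (Concretely: the 0xCF8/0xCFC config access ports -- CONF1_ADDR_PORT
+ * and CONF1_DATA_PORT0 above -- always belong to bus 0, whether or
+ * not any device was configured on it.)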
+ */ + if (bus != 0) + return; + } + + dsdt_line(" Device (PC%02X)", bus); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); + dsdt_line(" Name (_ADR, Zero)"); + + dsdt_line(" Method (_BBN, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" Return (0x%08X)", bus); + dsdt_line(" }"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " + "MaxFixed, PosDecode,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bus); + dsdt_line(" 0x%04X, // Range Maximum", bus); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0001, // Length"); + dsdt_line(" ,, )"); + + if (bus == 0) { + dsdt_indent(3); + dsdt_fixed_ioport(0xCF8, 8); + dsdt_unindent(3); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0000, // Range Minimum"); + dsdt_line(" 0x0CF7, // Range Maximum"); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0CF8, // Length"); + dsdt_line(" ,, , TypeStatic)"); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0D00, // Range Minimum"); + dsdt_line(" 0x%04X, // Range Maximum", + PCI_EMUL_IOBASE - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + PCI_EMUL_IOBASE - 0x0D00); + dsdt_line(" ,, , TypeStatic)"); + + if (bi == NULL) { + dsdt_line(" })"); + goto done; + } + } + assert(bi != NULL); + + /* i/o window */ + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); + dsdt_line(" 0x%04X, // Range Maximum", + bi->iolimit - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + bi->iolimit - bi->iobase); + dsdt_line(" ,, , TypeStatic)"); + + /* mmio window (32-bit) */ + dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x00000000, // Granularity"); + dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); + dsdt_line(" 0x%08X, // Range Maximum\n", + bi->memlimit32 - 1); + dsdt_line(" 0x00000000, // Translation Offset"); + dsdt_line(" 0x%08X, // Length\n", + bi->memlimit32 - bi->membase32); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + + /* mmio window (64-bit) */ + dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x0000000000000000, // Granularity"); + dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); + dsdt_line(" 0x%016lX, // Range Maximum\n", + bi->memlimit64 - 1); + dsdt_line(" 0x0000000000000000, // Translation Offset"); + dsdt_line(" 0x%016lX, // Length\n", + bi->memlimit64 - bi->membase64); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + dsdt_line(" })"); + + count = pci_count_lintr(bus); + if (count != 0) { + dsdt_indent(2); + dsdt_line("Name (PPRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Name (APRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_apic_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Method (_PRT, 0, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" If (PICM)"); + dsdt_line(" {"); + dsdt_line(" Return (APRT)"); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" 
Return (PPRT)"); + dsdt_line(" }"); + dsdt_line("}"); + dsdt_unindent(2); + } + + dsdt_indent(2); + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + pi = si->si_funcs[func].fi_devi; + if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) + pi->pi_d->pe_write_dsdt(pi); + } + } + dsdt_unindent(2); +done: + dsdt_line(" }"); +} +#endif + +#if ACPITBL_AML +void +pci_write_dsdt(void) +{ + int bus; + + for (bus = 0; bus < MAXBUSES; bus++) { + pci_bus_write_dsdt(bus); + } +} +#else +void +pci_write_dsdt(void) +{ + int bus; + + dsdt_indent(1); + dsdt_line("Name (PICM, 0x00)"); + dsdt_line("Method (_PIC, 1, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" Store (Arg0, PICM)"); + dsdt_line("}"); + dsdt_line(""); + dsdt_line("Scope (_SB)"); + dsdt_line("{"); + for (bus = 0; bus < MAXBUSES; bus++) + pci_bus_write_dsdt(bus); + dsdt_line("}"); + dsdt_unindent(1); +} +#endif + +int +pci_bus_configured(int bus) +{ + assert(bus >= 0 && bus < MAXBUSES); + return (pci_businfo[bus] != NULL); +} + +int +pci_msi_enabled(struct pci_devinst *pi) +{ + return (pi->pi_msi.enabled); +} + +static int +pci_msi_maxmsgnum(struct pci_devinst *pi) +{ + if (pi->pi_msi.enabled) + return (pi->pi_msi.maxmsgnum); + else + return (0); +} + +int +pci_msix_enabled(struct pci_devinst *pi) +{ + + return (pi->pi_msix.enabled && !pi->pi_msi.enabled); +} + +void +pci_generate_msix(struct pci_devinst *pi, int index) +{ + struct msix_table_entry *mte; + + if (!pci_msix_enabled(pi)) + return; + + if (pi->pi_msix.function_mask) + return; + + if (index >= pi->pi_msix.table_count) + return; + + mte = &pi->pi_msix.table[index]; + if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* XXX Set PBA bit if interrupt is disabled */ + xh_vm_lapic_msi(mte->addr, mte->msg_data); + } +} + +void +pci_generate_msi(struct pci_devinst *pi, int index) +{ + + if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { + xh_vm_lapic_msi(pi->pi_msi.addr, pi->pi_msi.msg_data + + ((uint64_t) index)); + } +} + +static bool +pci_lintr_permitted(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || + (cmd & PCIM_CMD_INTxDIS))); +} + +void +pci_lintr_request(struct pci_devinst *pi) +{ + struct businfo *bi; + struct slotinfo *si; + int bestpin, bestcount, pin; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + + /* + * Just allocate a pin from our slot. The pin will be + * assigned IRQs later when interrupts are routed. + */ + si = &bi->slotinfo[pi->pi_slot]; + bestpin = 0; + bestcount = si->si_intpins[0].ii_count; + for (pin = 1; pin < 4; pin++) { + if (si->si_intpins[pin].ii_count < bestcount) { + bestpin = pin; + bestcount = si->si_intpins[pin].ii_count; + } + } + + si->si_intpins[bestpin].ii_count++; + pi->pi_lintr.pin = (int8_t) (bestpin + 1); + pci_set_cfgdata8(pi, PCIR_INTPIN, ((uint8_t) (bestpin + 1))); +} + +static void +pci_lintr_route(struct pci_devinst *pi) +{ + struct businfo *bi; + struct intxinfo *ii; + + if (pi->pi_lintr.pin == 0) + return; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; + + /* + * Attempt to allocate an I/O APIC pin for this intpin if one + * is not yet assigned. + */ + if (ii->ii_ioapic_irq == 0) + ii->ii_ioapic_irq = ioapic_pci_alloc_irq(); + assert(ii->ii_ioapic_irq > 0); + + /* + * Attempt to allocate a PIRQ pin for this intpin if one is + * not yet assigned. 
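+ *
+ * (Net effect of the two allocations: the intpin carries an I/O APIC
+ * IRQ, used by the APRT routing table, and a PIRQ link, used by PPRT;
+ * pirq_irq(ii->ii_pirq_pin) becomes the guest-visible PCIR_INTLINE
+ * value below.)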
+ */ + if (ii->ii_pirq_pin == 0) + ii->ii_pirq_pin = pirq_alloc_pin(); + assert(ii->ii_pirq_pin > 0); + + pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; + pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; + pci_set_cfgdata8(pi, PCIR_INTLINE, ((uint8_t) pirq_irq(ii->ii_pirq_pin))); +} + +void +pci_lintr_assert(struct pci_devinst *pi) +{ + + assert(pi->pi_lintr.pin > 0); + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == IDLE) { + if (pci_lintr_permitted(pi)) { + pi->pi_lintr.state = ASSERTED; + pci_irq_assert(pi); + } else + pi->pi_lintr.state = PENDING; + } + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +void +pci_lintr_deassert(struct pci_devinst *pi) +{ + + assert(pi->pi_lintr.pin > 0); + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == ASSERTED) { + pi->pi_lintr.state = IDLE; + pci_irq_deassert(pi); + } else if (pi->pi_lintr.state == PENDING) + pi->pi_lintr.state = IDLE; + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +static void +pci_lintr_update(struct pci_devinst *pi) +{ + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { + pci_irq_deassert(pi); + pi->pi_lintr.state = PENDING; + } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { + pi->pi_lintr.state = ASSERTED; + pci_irq_assert(pi); + } + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +int +pci_count_lintr(int bus) +{ + int count, slot, pin; + struct slotinfo *slotinfo; + + count = 0; + if (pci_businfo[bus] != NULL) { + for (slot = 0; slot < MAXSLOTS; slot++) { + slotinfo = &pci_businfo[bus]->slotinfo[slot]; + for (pin = 0; pin < 4; pin++) { + if (slotinfo->si_intpins[pin].ii_count != 0) + count++; + } + } + } + return (count); +} + +void +pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) +{ + struct businfo *bi; + struct slotinfo *si; + struct intxinfo *ii; + int slot, pin; + + if ((bi = pci_businfo[bus]) == NULL) + return; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (pin = 0; pin < 4; pin++) { + ii = &si->si_intpins[pin]; + if (ii->ii_count != 0) + cb(bus, slot, pin + 1, ii->ii_pirq_pin, + ii->ii_ioapic_irq, arg); + } + } +} + +/* + * Return 1 if the emulated device in 'slot' is a multi-function device. + * Return 0 otherwise. + */ +static int +pci_emul_is_mfdev(int bus, int slot) +{ + struct businfo *bi; + struct slotinfo *si; + int f, numfuncs; + + numfuncs = 0; + if ((bi = pci_businfo[bus]) != NULL) { + si = &bi->slotinfo[slot]; + for (f = 0; f < MAXFUNCS; f++) { + if (si->si_funcs[f].fi_devi != NULL) { + numfuncs++; + } + } + } + return (numfuncs > 1); +} + +/* + * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on + * whether or not is a multi-function being emulated in the pci 'slot'. + */ +static void +pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) +{ + int mfdev; + + if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { + mfdev = pci_emul_is_mfdev(bus, slot); + switch (bytes) { + case 1: + case 2: + *rv &= ~((uint32_t) PCIM_MFDEV); + if (mfdev) { + *rv |= PCIM_MFDEV; + } + break; + case 4: + *rv &= ~((uint32_t) (PCIM_MFDEV << 16)); + if (mfdev) { + *rv |= (PCIM_MFDEV << 16); + } + break; + } + } +} + +static void +pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) +{ + int i, rshift; + uint32_t cmd, cmd2, changed, old, readonly; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ + + /* + * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. 
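+ *
+ * (Derivation of the mask below, for a write starting at
+ * PCIR_COMMAND: ~0xFFFFF880 == 0x0000077F, so command bits 0-6 and
+ * 8-10 -- I/O space enable through INTx disable -- remain writable,
+ * while command bit 7, bits 11-15 and the entire status word are
+ * treated as read-only.)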
+ *
+ * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are
+ * 'write 1 to clear'. However these bits are not set to '1' by
+ * any device emulation so it is simpler to treat them as readonly.
+ */
+ rshift = (coff & 0x3) * 8;
+ readonly = 0xFFFFF880 >> rshift;
+
+ old = CFGREAD(pi, coff, bytes);
+ new &= ~readonly;
+ new |= (old & readonly);
+ CFGWRITE(pi, coff, new, bytes); /* update config */
+
+ cmd2 = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */
+ changed = cmd ^ cmd2;
+
+ /*
+ * If the MMIO or I/O address space decoding has changed then
+ * register/unregister all BARs that decode that address space.
+ */
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ switch (pi->pi_bar[i].type) {
+ case PCIBAR_NONE:
+ case PCIBAR_MEMHI64:
+ break;
+ case PCIBAR_IO:
+ /* I/O address space decoding changed? */
+ if (changed & PCIM_CMD_PORTEN) {
+ if (porten(pi))
+ register_bar(pi, i);
+ else
+ unregister_bar(pi, i);
+ }
+ break;
+ case PCIBAR_MEM32:
+ case PCIBAR_MEM64:
+ /* MMIO address space decoding changed? */
+ if (changed & PCIM_CMD_MEMEN) {
+ if (memen(pi))
+ register_bar(pi, i);
+ else
+ unregister_bar(pi, i);
+ }
+ break;
+ }
+ }
+
+ /*
+ * If INTx has been unmasked and is pending, assert the
+ * interrupt.
+ */
+ pci_lintr_update(pi);
+}
+
+static void
+pci_cfgrw(int vcpu, int in, int bus, int slot, int func, int coff, int bytes,
+ uint32_t *eax)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct pci_devinst *pi;
+ struct pci_devemu *pe;
+ int idx, needcfg;
+ uint64_t addr, bar, mask;
+
+ if ((bi = pci_businfo[bus]) != NULL) {
+ si = &bi->slotinfo[slot];
+ pi = si->si_funcs[func].fi_devi;
+ } else
+ pi = NULL;
+
+ /*
+ * Just return if there is no device at this slot:func or if the
+ * guest is doing an un-aligned access.
+ */
+ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
+ (coff & (bytes - 1)) != 0) {
+ if (in)
+ *eax = 0xffffffff;
+ return;
+ }
+
+ /*
+ * Ignore all writes beyond the standard config space and return all
+ * ones on reads.
+ */
+ if (coff >= PCI_REGMAX + 1) {
+ if (in) {
+ *eax = 0xffffffff;
+ /*
+ * Extended capabilities begin at offset 256 in config
+ * space. Absence of extended capabilities is signaled
+ * with all 0s in the extended capability header at
+ * offset 256.
+ */
+ if (coff <= PCI_REGMAX + 4)
+ *eax = 0x00000000;
+ }
+ return;
+ }
+
+ pe = pi->pi_d;
+
+ /*
+ * Config read
+ */
+ if (in) {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgread != NULL) {
+ needcfg = pe->pe_cfgread(vcpu, pi, coff, bytes,
+ eax);
+ } else {
+ needcfg = 1;
+ }
+
+ if (needcfg)
+ *eax = CFGREAD(pi, coff, bytes);
+
+ pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
+ } else {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgwrite != NULL &&
+ (*pe->pe_cfgwrite)(vcpu, pi, coff, bytes, *eax) == 0)
+ return;
+
+ /*
+ * Special handling for write to BAR registers
+ */
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+ /*
+ * Ignore writes to BAR registers that are not
+ * 4-byte aligned.
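+ *
+ * (Context for the masking below: guests size a BAR by writing all 1s
+ * and reading the register back; with e.g. a 4KB memory BAR,
+ * mask == ~(size - 1) turns that probe into a readback of 0xfffff000
+ * plus the low type bits.)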
+ */ + if (bytes != 4 || (coff & 0x3) != 0) + return; + idx = (coff - PCIR_BAR(0)) / 4; + mask = ~(pi->pi_bar[idx].size - 1); + switch (pi->pi_bar[idx].type) { + case PCIBAR_NONE: + pi->pi_bar[idx].addr = bar = 0; + break; + case PCIBAR_IO: + addr = *eax & mask; + addr &= 0xffff; + bar = addr | PCIM_BAR_IO_SPACE; + /* + * Register the new BAR value for interception + */ + if (addr != pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_IO); + } + break; + case PCIBAR_MEM32: + addr = bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + if (addr != pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_MEM32); + } + break; + case PCIBAR_MEM64: + addr = bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + if (addr != (uint32_t)pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_MEM64); + } + break; + case PCIBAR_MEMHI64: + mask = ~(pi->pi_bar[idx - 1].size - 1); + addr = ((uint64_t)*eax << 32) & mask; + bar = addr >> 32; + if (bar != pi->pi_bar[idx - 1].addr >> 32) { + update_bar_address(pi, addr, idx - 1, + PCIBAR_MEMHI64); + } + break; + } + pci_set_cfgdata32(pi, coff, ((uint32_t) bar)); + + } else if (pci_emul_iscap(pi, coff)) { + pci_emul_capwrite(pi, coff, bytes, *eax); + } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { + pci_emul_cmdsts_write(pi, coff, *eax, bytes); + } else { + CFGWRITE(pi, coff, *eax, bytes); + } + } +} + +static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + uint32_t x; + + if (bytes != 4) { + if (in) + *eax = (bytes == 2) ? 0xffff : 0xff; + return (0); + } + + if (in) { + x = (uint32_t) ((cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | + cfgoff); + + if (cfgenable) + x |= CONF1_ENABLE; + *eax = x; + } else { + x = *eax; + cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; + cfgoff = x & PCI_REGMAX; + cfgfunc = (x >> 8) & PCI_FUNCMAX; + cfgslot = (x >> 11) & PCI_SLOTMAX; + cfgbus = (x >> 16) & PCI_BUSMAX; + } + + return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(int vcpu, int in, int port, int bytes, uint32_t *eax, + UNUSED void *arg) +{ + int coff; + + assert(bytes == 1 || bytes == 2 || bytes == 4); + + coff = cfgoff + (port - CONF1_DATA_PORT0); + if (cfgenable) { + pci_cfgrw(vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, + eax); + } else { + /* Ignore accesses to cfgdata if not enabled by cfgaddr */ + if (in) + *eax = 0xffffffff; + } + return (0); +} + +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT0, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT1, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT2, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT3, IOPORT_F_INOUT, pci_emul_cfgdata); + +#define PCI_EMUL_TEST +#ifdef PCI_EMUL_TEST +/* + * Define a dummy test device + */ +#define DIOSZ 8 +#define DMEMSZ 4096 +struct pci_emul_dsoftc { + uint8_t ioregs[DIOSZ]; + uint8_t memregs[2][DMEMSZ]; +}; + +#define PCI_EMUL_MSI_MSGS 4 + +static int +pci_emul_dinit(struct pci_devinst *pi, UNUSED char *opts) +{ + int error; + struct pci_emul_dsoftc *sc; + + sc = calloc(1, sizeof(struct pci_emul_dsoftc)); + + pi->pi_arg = sc; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); + pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); + + error = pci_emul_add_msicap(pi, 
PCI_EMUL_MSI_MSGS); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + return (0); +} + +static void +pci_emul_diow(UNUSED int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + int i; + struct pci_emul_dsoftc *sc = pi->pi_arg; + + if (baridx == 0) { + if (offset + ((uint64_t) size) > DIOSZ) { + printf("diow: iow too large, offset %llu size %d\n", + offset, size); + return; + } + + if (size == 1) { + sc->ioregs[offset] = value & 0xff; + } else if (size == 2) { + *(uint16_t *)((void *) &sc->ioregs[offset]) = value & 0xffff; + } else if (size == 4) { + *(uint32_t *)((void *) &sc->ioregs[offset]) = (uint32_t) value; + } else { + printf("diow: iow unknown size %d\n", size); + } + + /* + * Special magic value to generate an interrupt + */ + if (offset == 4 && size == 4 && pci_msi_enabled(pi)) + pci_generate_msi(pi, ((int) (value % + ((uint64_t) pci_msi_maxmsgnum(pi))))); + + if (value == 0xabcdef) { + for (i = 0; i < pci_msi_maxmsgnum(pi); i++) + pci_generate_msi(pi, i); + } + } + + if (baridx == 1 || baridx == 2) { + if (offset + ((uint16_t) size) > DMEMSZ) { + printf("diow: memw too large, offset %llu size %d\n", + offset, size); + return; + } + + i = baridx - 1; /* 'memregs' index */ + + if (size == 1) { + sc->memregs[i][offset] = (uint8_t) value; + + } else if (size == 2) { + *(uint16_t *)((void *) &sc->memregs[i][offset]) = (uint16_t) value; + } else if (size == 4) { + *(uint32_t *)((void *) &sc->memregs[i][offset]) = (uint32_t) value; + } else if (size == 8) { + *(uint64_t *)((void *) &sc->memregs[i][offset]) = value; + } else { + printf("diow: memw unknown size %d\n", size); + } + + /* + * magic interrupt ?? 
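+ *
+ * (Not implemented. For reference, the I/O BAR path above already
+ * provides one: a 4-byte write at offset 4 fires a single MSI, and
+ * writing 0xabcdef fires every configured message -- handy when the
+ * device is created via a "3:0,dummy" slot option, cf.
+ * pci_parse_slot().)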
+ */ + } + + if (baridx > 2) { + printf("diow: unknown bar idx %d\n", baridx); + } +} + +static uint64_t +pci_emul_dior(UNUSED int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_emul_dsoftc *sc = pi->pi_arg; + uint32_t value; + int i; + + value = 0; + + if (baridx == 0) { + if (offset + ((uint64_t) size) > DIOSZ) { + printf("dior: ior too large, offset %llu size %d\n", + offset, size); + return (0); + } + + if (size == 1) { + value = sc->ioregs[offset]; + } else if (size == 2) { + value = *(uint16_t *)((void *) &sc->ioregs[offset]); + } else if (size == 4) { + value = *(uint32_t *)((void *) &sc->ioregs[offset]); + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + if (baridx == 1 || baridx == 2) { + if (offset + ((uint64_t) size) > DMEMSZ) { + printf("dior: memr too large, offset %llu size %d\n", + offset, size); + return (0); + } + + i = baridx - 1; /* 'memregs' index */ + + if (size == 1) { + value = sc->memregs[i][offset]; + } else if (size == 2) { + value = *(uint16_t *) ((void *) &sc->memregs[i][offset]); + } else if (size == 4) { + value = *(uint32_t *) ((void *) &sc->memregs[i][offset]); + } else if (size == 8) { + value = (uint32_t) *(uint64_t *) ((void *) &sc->memregs[i][offset]); + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + + if (baridx > 2) { + printf("dior: unknown bar idx %d\n", baridx); + return (0); + } + + return (value); +} + +static struct pci_devemu pci_dummy = { + .pe_emu = "dummy", + .pe_init = pci_emul_dinit, + .pe_barwrite = pci_emul_diow, + .pe_barread = pci_emul_dior +}; +PCI_EMUL_SET(pci_dummy); + +#endif /* PCI_EMUL_TEST */ diff --git a/vendor/github.com/hooklift/xhyve/pci_hostbridge.c b/vendor/github.com/hooklift/xhyve/pci_hostbridge.c new file mode 100644 index 0000000..397fc5f --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pci_hostbridge.c @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include + +static int +pci_hostbridge_init(struct pci_devinst *pi, UNUSED char *opts) +{ + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */ + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); + + return (0); +} + +static int +pci_amd_hostbridge_init(struct pci_devinst *pi, char *opts) +{ + (void) pci_hostbridge_init(pi, opts); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */ + + return (0); +} + +static struct pci_devemu pci_de_amd_hostbridge = { + .pe_emu = "amd_hostbridge", + .pe_init = pci_amd_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_amd_hostbridge); + +static struct pci_devemu pci_de_hostbridge = { + .pe_emu = "hostbridge", + .pe_init = pci_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_hostbridge); diff --git a/vendor/github.com/hooklift/xhyve/pci_irq.c b/vendor/github.com/hooklift/xhyve/pci_irq.c new file mode 100644 index 0000000..4ce0cd5 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pci_irq.c @@ -0,0 +1,339 @@ +/*- + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Implement an 8 pin PCI interrupt router compatible with the router + * present on Intel's ICH10 chip. + */ + +/* Fields in each PIRQ register. */ +#define PIRQ_DIS 0x80 +#define PIRQ_IRQ 0x0f + +/* Only IRQs 3-7, 9-12, and 14-15 are permitted. */ +#define PERMITTED_IRQS 0xdef8 +#define IRQ_PERMITTED(irq) (((1U << (irq)) & PERMITTED_IRQS) != 0) + +/* IRQ count to disable an IRQ. 
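+ *
+ * (0xff doubles as a sentinel in irq_counts[]: pci_irq_reserve()
+ * stores it for IRQs claimed outright by LPC devices, so that
+ * pirq_alloc_pin() below never routes a PIRQ there.)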
*/ +#define IRQ_DISABLED 0xff + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +static struct pirq { + uint8_t reg; + int use_count; + int active_count; + pthread_mutex_t lock; +} pirqs[8]; +#pragma clang diagnostic pop + +static u_char irq_counts[16]; +static int pirq_cold = 1; + +/* + * Returns true if this pin is enabled with a valid IRQ. Setting the + * register to a reserved IRQ causes interrupts to not be asserted as + * if the pin was disabled. + */ +static bool +pirq_valid_irq(int reg) +{ + if (reg & PIRQ_DIS) + return (false); + return (IRQ_PERMITTED(reg & PIRQ_IRQ)); +} + +uint8_t +pirq_read(int pin) +{ + assert((pin > 0) && (((unsigned) pin) <= nitems(pirqs))); + return (pirqs[pin - 1].reg); +} + +void +pirq_write(int pin, uint8_t val) +{ + struct pirq *pirq; + + assert((pin > 0) && (((unsigned) pin) <= nitems(pirqs))); + pirq = &pirqs[pin - 1]; + pthread_mutex_lock(&pirq->lock); + if (pirq->reg != (val & (PIRQ_DIS | PIRQ_IRQ))) { + if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg)) + xh_vm_isa_deassert_irq(pirq->reg & PIRQ_IRQ, -1); + pirq->reg = val & (PIRQ_DIS | PIRQ_IRQ); + if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg)) + xh_vm_isa_assert_irq(pirq->reg & PIRQ_IRQ, -1); + } + pthread_mutex_unlock(&pirq->lock); +} + +void +pci_irq_reserve(int irq) +{ + assert((irq >= 0) && (((unsigned) irq) < nitems(irq_counts))); + assert(pirq_cold); + assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED); + irq_counts[irq] = IRQ_DISABLED; +} + +void +pci_irq_use(int irq) +{ + assert((irq >= 0) && (((unsigned) irq) < nitems(irq_counts))); + assert(pirq_cold); + assert(irq_counts[irq] != IRQ_DISABLED); + irq_counts[irq]++; +} + +void +pci_irq_init(void) +{ + unsigned i; + + for (i = 0; i < nitems(pirqs); i++) { + pirqs[i].reg = PIRQ_DIS; + pirqs[i].use_count = 0; + pirqs[i].active_count = 0; + pthread_mutex_init(&pirqs[i].lock, NULL); + } + for (i = 0; i < nitems(irq_counts); i++) { + if (IRQ_PERMITTED(i)) + irq_counts[i] = 0; + else + irq_counts[i] = IRQ_DISABLED; + } +} + +void +pci_irq_assert(struct pci_devinst *pi) +{ + struct pirq *pirq; + + if (pi->pi_lintr.pirq_pin > 0) { + assert(((unsigned) pi->pi_lintr.pirq_pin) <= nitems(pirqs)); + pirq = &pirqs[pi->pi_lintr.pirq_pin - 1]; + pthread_mutex_lock(&pirq->lock); + pirq->active_count++; + if (pirq->active_count == 1 && pirq_valid_irq(pirq->reg)) { + xh_vm_isa_assert_irq(pirq->reg & PIRQ_IRQ, pi->pi_lintr.ioapic_irq); + pthread_mutex_unlock(&pirq->lock); + return; + } + pthread_mutex_unlock(&pirq->lock); + } + xh_vm_ioapic_assert_irq(pi->pi_lintr.ioapic_irq); +} + +void +pci_irq_deassert(struct pci_devinst *pi) +{ + struct pirq *pirq; + + if (pi->pi_lintr.pirq_pin > 0) { + assert(((unsigned) pi->pi_lintr.pirq_pin) <= nitems(pirqs)); + pirq = &pirqs[pi->pi_lintr.pirq_pin - 1]; + pthread_mutex_lock(&pirq->lock); + pirq->active_count--; + if (pirq->active_count == 0 && pirq_valid_irq(pirq->reg)) { + xh_vm_isa_deassert_irq(pirq->reg & PIRQ_IRQ, + pi->pi_lintr.ioapic_irq); + pthread_mutex_unlock(&pirq->lock); + return; + } + pthread_mutex_unlock(&pirq->lock); + } + xh_vm_ioapic_deassert_irq(pi->pi_lintr.ioapic_irq); +} + +int +pirq_alloc_pin(void) +{ + int best_count, best_irq, best_pin, irq, pin; + + pirq_cold = 0; + + /* First, find the least-used PIRQ pin. 
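+ *
+ * Both passes here are least-loaded searches: this one over the eight
+ * pirqs[].use_count values, the next over the permitted entries of
+ * irq_counts[] (taken only if the chosen pin has no IRQ yet). E.g.
+ * starting from all-zero counts, the caller gets pin 1, routed to
+ * IRQ 3, the lowest permitted IRQ.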
*/ + best_pin = 0; + best_count = pirqs[0].use_count; + for (pin = 1; ((unsigned) pin) < nitems(pirqs); pin++) { + if (pirqs[pin].use_count < best_count) { + best_pin = pin; + best_count = pirqs[pin].use_count; + } + } + pirqs[best_pin].use_count++; + + /* Second, route this pin to an IRQ. */ + if (pirqs[best_pin].reg == PIRQ_DIS) { + best_irq = -1; + best_count = 0; + for (irq = 0; ((unsigned) irq) < nitems(irq_counts); irq++) { + if (irq_counts[irq] == IRQ_DISABLED) + continue; + if (best_irq == -1 || irq_counts[irq] < best_count) { + best_irq = irq; + best_count = irq_counts[irq]; + } + } + assert(best_irq >= 0); + irq_counts[best_irq]++; + pirqs[best_pin].reg = (uint8_t) best_irq; + xh_vm_isa_set_irq_trigger(best_irq, LEVEL_TRIGGER); + } + + return (best_pin + 1); +} + +int +pirq_irq(int pin) +{ + assert((pin > 0) && (((unsigned) pin) <= nitems(pirqs))); + return (pirqs[pin - 1].reg & PIRQ_IRQ); +} + +/* XXX: Generate $PIR table. */ + +static void +pirq_dsdt(void) +{ + char *irq_prs, *old; + int irq, pin; + + irq_prs = NULL; + for (irq = 0; ((unsigned) irq) < nitems(irq_counts); irq++) { + if (!IRQ_PERMITTED(irq)) + continue; + if (irq_prs == NULL) + asprintf(&irq_prs, "%d", irq); + else { + old = irq_prs; + asprintf(&irq_prs, "%s,%d", old, irq); + free(old); + } + } + + /* + * A helper method to validate a link register's value. This + * duplicates pirq_valid_irq(). + */ + dsdt_line(""); + dsdt_line("Method (PIRV, 1, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" If (And (Arg0, 0x%02X))", PIRQ_DIS); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" And (Arg0, 0x%02X, Local0)", PIRQ_IRQ); + dsdt_line(" If (LLess (Local0, 0x03))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" If (LEqual (Local0, 0x08))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" If (LEqual (Local0, 0x0D))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" Return (0x01)"); + dsdt_line("}"); + + for (pin = 0; ((unsigned) pin) < nitems(pirqs); pin++) { + dsdt_line(""); + dsdt_line("Device (LNK%c)", 'A' + pin); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0C0F\"))"); + dsdt_line(" Name (_UID, 0x%02X)", pin + 1); + dsdt_line(" Method (_STA, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" If (PIRV (PIR%c))", 'A' + pin); + dsdt_line(" {"); + dsdt_line(" Return (0x0B)"); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Return (0x09)"); + dsdt_line(" }"); + dsdt_line(" }"); + dsdt_line(" Name (_PRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_line(" IRQ (Level, ActiveLow, Shared, )"); + dsdt_line(" {%s}", irq_prs); + dsdt_line(" })"); + dsdt_line(" Name (CB%02X, ResourceTemplate ()", pin + 1); + dsdt_line(" {"); + dsdt_line(" IRQ (Level, ActiveLow, Shared, )"); + dsdt_line(" {}"); + dsdt_line(" })"); + dsdt_line(" CreateWordField (CB%02X, 0x01, CIR%c)", + pin + 1, 'A' + pin); + dsdt_line(" Method (_CRS, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" And (PIR%c, 0x%02X, Local0)", 'A' + pin, + PIRQ_DIS | PIRQ_IRQ); + dsdt_line(" If (PIRV (Local0))"); + dsdt_line(" {"); + dsdt_line(" ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Store (0x00, CIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Return (CB%02X)", pin + 1); + dsdt_line(" }"); + dsdt_line(" Method (_DIS, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" Store (0x80, PIR%c)", 'A' + pin); + 
dsdt_line(" }");
+ dsdt_line(" Method (_SRS, 1, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin);
+ dsdt_line(" FindSetRightBit (SIR%c, Local0)", 'A' + pin);
+ dsdt_line(" Store (Decrement (Local0), PIR%c)", 'A' + pin);
+ dsdt_line(" }");
+ dsdt_line("}");
+ }
+ free(irq_prs);
+}
+LPC_DSDT(pirq_dsdt);
diff --git a/vendor/github.com/hooklift/xhyve/pci_lpc.c b/vendor/github.com/hooklift/xhyve/pci_lpc.c
new file mode 100644
index 0000000..9d9da1c
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/pci_lpc.c
@@ -0,0 +1,427 @@
+/*-
+ * Copyright (c) 2013 Neel Natu
+ * Copyright (c) 2013 Tycho Nightingale
+ * Copyright (c) 2015 xhyve developers
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define IO_ICU1 0x20
+#define IO_ICU2 0xA0
+
+SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt);
+SET_DECLARE(lpc_sysres_set, struct lpc_sysres);
+
+#define ELCR_PORT 0x4d0
+SYSRES_IO(ELCR_PORT, 2);
+
+#define IO_TIMER1_PORT 0x40
+
+#define NMISC_PORT 0x61
+SYSRES_IO(NMISC_PORT, 1);
+
+static struct pci_devinst *lpc_bridge;
+
+#define LPC_UART_NUM 2
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+static struct lpc_uart_softc {
+ struct uart_softc *uart_softc;
+ const char *opts;
+ int iobase;
+ int irq;
+ int enabled;
+} lpc_uart_softc[LPC_UART_NUM];
+#pragma clang diagnostic pop
+
+static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" };
+
+/*
+ * LPC device configuration is in the following form:
+ * <lpc_device_name>[,<options>]
+ * For e.g.
"com1,stdio" + */ +int +lpc_device_parse(const char *opts) +{ + int unit, error; + char *str, *cpy, *lpcdev; + + error = -1; + str = cpy = strdup(opts); + lpcdev = strsep(&str, ","); + if (lpcdev != NULL) { + for (unit = 0; unit < LPC_UART_NUM; unit++) { + if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { + lpc_uart_softc[unit].opts = str; + error = 0; + goto done; + } + } + } + +done: + if (error) + free(cpy); + + return (error); +} + +static void +lpc_uart_intr_assert(void *arg) +{ + struct lpc_uart_softc *sc = arg; + + assert(sc->irq >= 0); + + xh_vm_isa_pulse_irq(sc->irq, sc->irq); +} + +static void +lpc_uart_intr_deassert(UNUSED void *arg) +{ + /* + * The COM devices on the LPC bus generate edge triggered interrupts, + * so nothing more to do here. + */ +} + +static int +lpc_uart_io_handler(UNUSED int vcpu, int in, int port, int bytes, uint32_t *eax, + void *arg) +{ + int offset; + struct lpc_uart_softc *sc = arg; + + offset = port - sc->iobase; + + switch (bytes) { + case 1: + if (in) + *eax = uart_read(sc->uart_softc, offset); + else + uart_write(sc->uart_softc, offset, ((uint8_t) *eax)); + break; + case 2: + if (in) { + *eax = (uint32_t) uart_read(sc->uart_softc, offset); + *eax |= (uint32_t) (uart_read(sc->uart_softc, offset + 1) << 8); + } else { + uart_write(sc->uart_softc, offset, ((uint8_t) *eax)); + uart_write(sc->uart_softc, offset + 1, ((uint8_t) (*eax >> 8))); + } + break; + default: + return (-1); + } + + return (0); +} + +static int +lpc_init(void) +{ + struct lpc_uart_softc *sc; + struct inout_port iop; + const char *name; + int unit, error; + + /* COM1 and COM2 */ + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + name = lpc_uart_names[unit]; + + if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { + fprintf(stderr, "Unable to allocate resources for " + "LPC device %s\n", name); + return (-1); + } + pci_irq_reserve(sc->irq); + + sc->uart_softc = uart_init(lpc_uart_intr_assert, + lpc_uart_intr_deassert, sc); + + if (uart_set_backend(sc->uart_softc, sc->opts) != 0) { + fprintf(stderr, "Unable to initialize backend '%s' " + "for LPC device %s\n", sc->opts, name); + return (-1); + } + + bzero(&iop, sizeof(struct inout_port)); + iop.name = name; + iop.port = sc->iobase; + iop.size = UART_IO_BAR_SIZE; + iop.flags = IOPORT_F_INOUT; + iop.handler = lpc_uart_io_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + sc->enabled = 1; + } + + return (0); +} + +static void +pci_lpc_write_dsdt(struct pci_devinst *pi) +{ + struct lpc_dsdt **ldpp, *ldp; + + dsdt_line(""); + dsdt_line("Device (ISA)"); + dsdt_line("{"); + dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); + dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); + dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); + dsdt_line(" {"); + dsdt_line(" Offset (0x60),"); + dsdt_line(" PIRA, 8,"); + dsdt_line(" PIRB, 8,"); + dsdt_line(" PIRC, 8,"); + dsdt_line(" PIRD, 8,"); + dsdt_line(" Offset (0x68),"); + dsdt_line(" PIRE, 8,"); + dsdt_line(" PIRF, 8,"); + dsdt_line(" PIRG, 8,"); + dsdt_line(" PIRH, 8"); + dsdt_line(" }"); + dsdt_line(""); + + dsdt_indent(1); + SET_FOREACH(ldpp, lpc_dsdt_set) { + ldp = *ldpp; + ldp->handler(); + } + + dsdt_line(""); + dsdt_line("Device (PIC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_ICU1, 2); + dsdt_fixed_ioport(IO_ICU2, 2); + dsdt_fixed_irq(2); + dsdt_unindent(2); + 
dsdt_line(" })"); + dsdt_line("}"); + + dsdt_line(""); + dsdt_line("Device (TIMR)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_TIMER1_PORT, 4); + dsdt_fixed_irq(0); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + dsdt_unindent(1); + + dsdt_line("}"); +} + +static void +pci_lpc_sysres_dsdt(void) +{ + struct lpc_sysres **lspp, *lsp; + + dsdt_line(""); + dsdt_line("Device (SIO)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + + dsdt_indent(2); + SET_FOREACH(lspp, lpc_sysres_set) { + lsp = *lspp; + switch (lsp->type) { + case LPC_SYSRES_IO: + dsdt_fixed_ioport(((uint16_t) lsp->base), ((uint16_t) lsp->length)); + break; + case LPC_SYSRES_MEM: + dsdt_fixed_mem32(lsp->base, lsp->length); + break; + } + } + dsdt_unindent(2); + + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(pci_lpc_sysres_dsdt); + +static void +pci_lpc_uart_dsdt(void) +{ + struct lpc_uart_softc *sc; + int unit; + + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + if (!sc->enabled) + continue; + dsdt_line(""); + dsdt_line("Device (%s)", lpc_uart_names[unit]); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); + dsdt_line(" Name (_UID, %d)", unit + 1); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(((uint16_t) sc->iobase), UART_IO_BAR_SIZE); + dsdt_fixed_irq(((uint8_t) sc->irq)); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + } +} +LPC_DSDT(pci_lpc_uart_dsdt); + +static int +pci_lpc_cfgwrite(UNUSED int vcpu, struct pci_devinst *pi, int coff, int bytes, + uint32_t val) +{ + int pirq_pin; + + if (bytes == 1) { + pirq_pin = 0; + if (coff >= 0x60 && coff <= 0x63) + pirq_pin = coff - 0x60 + 1; + if (coff >= 0x68 && coff <= 0x6b) + pirq_pin = coff - 0x68 + 5; + if (pirq_pin != 0) { + pirq_write(pirq_pin, ((uint8_t) val)); + pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); + return (0); + } + } + return (-1); +} + +static void +pci_lpc_write(UNUSED int vcpu, UNUSED struct pci_devinst *pi, UNUSED int baridx, + UNUSED uint64_t offset, UNUSED int size, UNUSED uint64_t value) +{ +} + +static uint64_t +pci_lpc_read(UNUSED int vcpu, UNUSED struct pci_devinst *pi, UNUSED int baridx, + UNUSED uint64_t offset, UNUSED int size) +{ + return (0); +} + +#define LPC_DEV 0x7000 +#define LPC_VENDOR 0x8086 + +static int +pci_lpc_init(struct pci_devinst *pi, UNUSED char *opts) +{ + + /* + * Do not allow more than one LPC bridge to be configured. + */ + if (lpc_bridge != NULL) { + fprintf(stderr, "Only one LPC bridge is allowed.\n"); + return (-1); + } + + /* + * Enforce that the LPC can only be configured on bus 0. This + * simplifies the ACPI DSDT because it can provide a decode for + * all legacy i/o ports behind bus 0. 
+ */ + if (pi->pi_bus != 0) { + fprintf(stderr, "LPC bridge can be present only on bus 0.\n"); + return (-1); + } + + if (lpc_init() != 0) + return (-1); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); + pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); + + lpc_bridge = pi; + + return (0); +} + +char * +lpc_pirq_name(int pin) +{ + char *name; + + if (lpc_bridge == NULL) + return (NULL); + asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1); + return (name); +} + +void +lpc_pirq_routed(void) +{ + int pin; + + if (lpc_bridge == NULL) + return; + + for (pin = 0; pin < 4; pin++) + pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1)); + for (pin = 0; pin < 4; pin++) + pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); +} + +static struct pci_devemu pci_de_lpc = { + .pe_emu = "lpc", + .pe_init = pci_lpc_init, + .pe_write_dsdt = pci_lpc_write_dsdt, + .pe_cfgwrite = pci_lpc_cfgwrite, + .pe_barwrite = pci_lpc_write, + .pe_barread = pci_lpc_read +}; +PCI_EMUL_SET(pci_de_lpc); diff --git a/vendor/github.com/hooklift/xhyve/pci_uart.c b/vendor/github.com/hooklift/xhyve/pci_uart.c new file mode 100644 index 0000000..01edcb4 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pci_uart.c @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include + +/* + * Pick a PCI vid/did of a chip with a single uart at + * BAR0, that most versions of FreeBSD can understand: + * Siig CyberSerial 1-port. 
+ */ +#define COM_VENDOR 0x131f +#define COM_DEV 0x2000 + +static void +pci_uart_intr_assert(void *arg) +{ + struct pci_devinst *pi = arg; + + pci_lintr_assert(pi); +} + +static void +pci_uart_intr_deassert(void *arg) +{ + struct pci_devinst *pi = arg; + + pci_lintr_deassert(pi); +} + +static void +pci_uart_write(UNUSED int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, + int size, uint64_t value) +{ + + assert(baridx == 0); + assert(size == 1); + + uart_write(pi->pi_arg, ((int) offset), ((uint8_t) value)); +} + +static uint64_t +pci_uart_read(UNUSED int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + uint8_t val; + + assert(baridx == 0); + assert(size == 1); + + val = uart_read(pi->pi_arg, ((int) offset)); + return (val); +} + +static int +pci_uart_init(struct pci_devinst *pi, char *opts) +{ + struct uart_softc *sc; + + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE); + pci_lintr_request(pi); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV); + pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); + + sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi); + pi->pi_arg = sc; + + if (uart_set_backend(sc, opts) != 0) { + fprintf(stderr, "Unable to initialize backend '%s' for " + "pci uart at %d:%d\n", opts, pi->pi_slot, pi->pi_func); + return (-1); + } + + return (0); +} + +static struct pci_devemu pci_de_com = { + .pe_emu = "uart", + .pe_init = pci_uart_init, + .pe_barwrite = pci_uart_write, + .pe_barread = pci_uart_read +}; +PCI_EMUL_SET(pci_de_com); diff --git a/vendor/github.com/hooklift/xhyve/pci_virtio_block.c b/vendor/github.com/hooklift/xhyve/pci_virtio_block.c new file mode 100644 index 0000000..763a1e9 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pci_virtio_block.c @@ -0,0 +1,431 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
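/*
 * Illustration (not part of the vendored code): each emulation above fills
 * a struct pci_devemu with a name plus init/BAR callbacks, and
 * PCI_EMUL_SET() adds it to a linker set searched by name when a device is
 * created. A simplified model of that lookup, using a plain table instead
 * of a linker set (the table contents here are assumptions):
 */
#include <stdio.h>
#include <string.h>

struct devemu {
	const char *name;
	int (*init)(const char *opts);
};

static int
demo_uart_init(const char *opts)
{
	printf("uart init, opts=%s\n", opts ? opts : "(null)");
	return (0);
}

static const struct devemu emus[] = {
	{ "uart", demo_uart_init },
};

static const struct devemu *
find_emu(const char *name)
{
	size_t i;

	for (i = 0; i < sizeof(emus) / sizeof(emus[0]); i++)
		if (strcmp(emus[i].name, name) == 0)
			return (&emus[i]);
	return (NULL);
}

int
main(void)
{
	const struct devemu *de = find_emu("uart");

	if (de != NULL)
		de->init("stdio");
	return (0);
}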
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VTBLK_RINGSZ 64 + +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 +#define VTBLK_S_UNSUPP 2 + +#define VTBLK_BLK_ID_BYTES 20 + 1 + +/* Capability bits */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ + +/* + * Host capabilities + */ +#define VTBLK_S_HOSTCAPS \ + (VTBLK_F_SEG_MAX | \ + VTBLK_F_BLK_SIZE | \ + VTBLK_F_FLUSH | \ + VTBLK_F_TOPOLOGY | \ + VIRTIO_RING_F_INDIRECT_DESC) /* indirect descriptors */ + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpacked" +/* + * Config space "registers" + */ +struct vtblk_config { + uint64_t vbc_capacity; + uint32_t vbc_size_max; + uint32_t vbc_seg_max; + struct { + uint16_t cylinders; + uint8_t heads; + uint8_t sectors; + } vbc_geometry; + uint32_t vbc_blk_size; + struct { + uint8_t physical_block_exp; + uint8_t alignment_offset; + uint16_t min_io_size; + uint32_t opt_io_size; + } vbc_topology; + uint8_t vbc_writeback; +} __packed; + +/* + * Fixed-size block header + */ +struct virtio_blk_hdr { +#define VBH_OP_READ 0 +#define VBH_OP_WRITE 1 +#define VBH_OP_FLUSH 4 +#define VBH_OP_FLUSH_OUT 5 +#define VBH_OP_IDENT 8 +#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +#pragma clang diagnostic pop + +/* + * Debug printf + */ +static int pci_vtblk_debug; +#define DPRINTF(params) if (pci_vtblk_debug) printf params + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct pci_vtblk_ioreq { + struct blockif_req io_req; + struct pci_vtblk_softc *io_sc; + uint8_t *io_status; + uint16_t io_idx; +}; + +/* + * Per-device softc + */ +struct pci_vtblk_softc { + struct virtio_softc vbsc_vs; + pthread_mutex_t vsc_mtx; + struct vqueue_info vbsc_vq; + struct vtblk_config vbsc_cfg; + struct blockif_ctxt *bc; + char vbsc_ident[VTBLK_BLK_ID_BYTES]; + struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; +}; + +#pragma clang diagnostic pop + +static void pci_vtblk_reset(void *); +static void pci_vtblk_notify(void *, struct vqueue_info *); +static int pci_vtblk_cfgread(void *, int, int, uint32_t *); +static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtblk_vi_consts = { + "vtblk", /* our name */ + 1, /* we support 1 virtqueue */ + sizeof(struct vtblk_config), /* config reg size */ + pci_vtblk_reset, /* reset */ + pci_vtblk_notify, /* device-wide qnotify */ + pci_vtblk_cfgread, /* read PCI config */ + pci_vtblk_cfgwrite, /* write PCI config */ + NULL, /* apply negotiated features */ + VTBLK_S_HOSTCAPS, /* our capabilities */ +}; + +static void +pci_vtblk_reset(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device reset requested !\n")); + vi_reset_dev(&sc->vbsc_vs); +} + +/* xhyve: FIXME + * + * pci_vtblk_done seems to deadlock when called from pci_vtblk_proc? 
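/*
 * Illustration (not part of the vendored code): the fixed header's
 * vbh_type field above carries one of the VBH_OP_* opcodes, possibly OR'd
 * with the barrier flag (which this device does not advertise). A
 * standalone decoder for those opcodes:
 */
#include <stdio.h>
#include <stdint.h>

#define VBH_OP_READ       0
#define VBH_OP_WRITE      1
#define VBH_OP_FLUSH      4
#define VBH_OP_FLUSH_OUT  5
#define VBH_OP_IDENT      8
#define VBH_FLAG_BARRIER  0x80000000u

static const char *
vbh_op_name(uint32_t vbh_type)
{
	switch (vbh_type & ~VBH_FLAG_BARRIER) {	/* mask off barrier bit */
	case VBH_OP_READ:      return "read";
	case VBH_OP_WRITE:     return "write";
	case VBH_OP_FLUSH:
	case VBH_OP_FLUSH_OUT: return "flush";
	case VBH_OP_IDENT:     return "ident";
	default:               return "unsupported";
	}
}

int
main(void)
{
	printf("%s\n", vbh_op_name(VBH_OP_WRITE | VBH_FLAG_BARRIER));
	return (0);
}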
+ */ +static void +pci_vtblk_done_locked(struct blockif_req *br, int err) +{ + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + /* convert errno into a virtio block error return */ + if (err == EOPNOTSUPP || err == ENOSYS) + *io->io_status = VTBLK_S_UNSUPP; + else if (err != 0) + *io->io_status = VTBLK_S_IOERR; + else + *io->io_status = VTBLK_S_OK; + + /* + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. + */ + //pthread_mutex_lock(&sc->vsc_mtx); + vq_relchain(&sc->vbsc_vq, io->io_idx, 1); + vq_endchains(&sc->vbsc_vq, 0); + //pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_vtblk_done(struct blockif_req *br, int err) { + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtblk_done_locked(br, err); + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) +{ + struct virtio_blk_hdr *vbh; + struct pci_vtblk_ioreq *io; + int i, n; + int err; + ssize_t iolen; + int writeop, type; + struct iovec iov[BLOCKIF_IOV_MAX + 2]; + uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; + + n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); + + /* + * The first descriptor will be the read-only fixed header, + * and the last is for status (hence +2 above and below). + * The remaining iov's are the actual data I/O vectors. + * + * XXX - note - this fails on crash dump, which does a + * VIRTIO_BLK_T_FLUSH with a zero transfer length + */ + assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); + + io = &sc->vbsc_ios[idx]; + assert((flags[0] & VRING_DESC_F_WRITE) == 0); + assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); + vbh = iov[0].iov_base; + memcpy(&io->io_req.br_iov, &iov[1], + sizeof(struct iovec) * (((size_t) n) - 2)); + io->io_req.br_iovcnt = n - 2; + io->io_req.br_offset = (off_t) (vbh->vbh_sector * DEV_BSIZE); + io->io_status = iov[--n].iov_base; + assert(iov[n].iov_len == 1); + assert(flags[n] & VRING_DESC_F_WRITE); + + /* + * XXX + * The guest should not be setting the BARRIER flag because + * we don't advertise the capability. + */ + type = vbh->vbh_type & ~VBH_FLAG_BARRIER; + writeop = (type == VBH_OP_WRITE); + + iolen = 0; + for (i = 1; i < n; i++) { + /* + * - write op implies read-only descriptor, + * - read/ident op implies write-only descriptor, + * therefore test the inverse of the descriptor bit + * to the op. + */ + assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); + iolen += iov[i].iov_len; + } + io->io_req.br_resid = iolen; + + DPRINTF(("virtio-block: %s op, %zd bytes, %d segs\n\r", + writeop ? "write" : "read/ident", iolen, i - 1)); + + switch (type) { + case VBH_OP_READ: + err = blockif_read(sc->bc, &io->io_req); + break; + case VBH_OP_WRITE: + err = blockif_write(sc->bc, &io->io_req); + break; + case VBH_OP_FLUSH: + case VBH_OP_FLUSH_OUT: + err = blockif_flush(sc->bc, &io->io_req); + break; + case VBH_OP_IDENT: + /* Assume a single buffer */ + /* S/n equal to buffer is not zero-terminated. 
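/*
 * Illustration (not part of the vendored code): every virtio-block request
 * above arrives as a descriptor chain of [fixed header][data iovecs...]
 * [1-byte status], and the completion path's only output is that status
 * byte. The errno-to-status translation from pci_vtblk_done_locked(),
 * isolated:
 */
#include <assert.h>
#include <errno.h>

#define VTBLK_S_OK     0
#define VTBLK_S_IOERR  1
#define VTBLK_S_UNSUPP 2

static unsigned char
errno_to_vtblk_status(int err)
{
	if (err == EOPNOTSUPP || err == ENOSYS)
		return (VTBLK_S_UNSUPP);	/* operation not supported */
	if (err != 0)
		return (VTBLK_S_IOERR);		/* any other failure */
	return (VTBLK_S_OK);
}

int
main(void)
{
	assert(errno_to_vtblk_status(0) == VTBLK_S_OK);
	assert(errno_to_vtblk_status(EIO) == VTBLK_S_IOERR);
	assert(errno_to_vtblk_status(ENOSYS) == VTBLK_S_UNSUPP);
	return (0);
}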
*/ + memset(iov[1].iov_base, 0, iov[1].iov_len); + strncpy(iov[1].iov_base, sc->vbsc_ident, + MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); + /* xhyve: FIXME */ + pci_vtblk_done_locked(&io->io_req, 0); + return; + default: + /* xhyve: FIXME */ + pci_vtblk_done_locked(&io->io_req, EOPNOTSUPP); + return; + } + assert(err == 0); +} + +static void +pci_vtblk_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtblk_softc *sc = vsc; + + while (vq_has_descs(vq)) + pci_vtblk_proc(sc, vq); +} + +static int +pci_vtblk_init(struct pci_devinst *pi, char *opts) +{ + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; + MD5_CTX mdctx; + u_char digest[16]; + struct pci_vtblk_softc *sc; + off_t size; + int i, sectsz, sts, sto; + + if (opts == NULL) { + printf("virtio-block: backing device required\n"); + return (1); + } + + /* + * The supplied backing file has to exist + */ + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + perror("Could not open backing file"); + return (1); + } + + size = blockif_size(bctxt); + sectsz = blockif_sectsz(bctxt); + blockif_psectsz(bctxt, &sts, &sto); + + sc = calloc(1, sizeof(struct pci_vtblk_softc)); + sc->bc = bctxt; + for (i = 0; i < VTBLK_RINGSZ; i++) { + struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; + io->io_req.br_callback = pci_vtblk_done; + io->io_req.br_param = io; + io->io_sc = sc; + io->io_idx = (uint16_t) i; + } + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* init virtio softc and virtqueues */ + vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); + sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; + /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ + + /* + * Create an identifier for the backing file. Use parts of the + * md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, ((unsigned) strlen(opts))); + MD5Final(digest, &mdctx); + snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); + + /* setup virtio block config space */ + sc->vbsc_cfg.vbc_capacity = + (uint64_t) (size / DEV_BSIZE); /* 512-byte units */ + sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ + sc->vbsc_cfg.vbc_seg_max = BLOCKIF_IOV_MAX; + sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geometry.heads = 0; + sc->vbsc_cfg.vbc_geometry.sectors = 0; + sc->vbsc_cfg.vbc_blk_size = (uint32_t) sectsz; + sc->vbsc_cfg.vbc_topology.physical_block_exp = + (uint8_t) ((sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0); + sc->vbsc_cfg.vbc_topology.alignment_offset = + (uint8_t) ((sto != 0) ? ((sts - sto) / sectsz) : 0); + sc->vbsc_cfg.vbc_topology.min_io_size = 0; + sc->vbsc_cfg.vbc_topology.opt_io_size = 0; + sc->vbsc_cfg.vbc_writeback = 0; + + /* + * Should we move some of this into virtio.c? Could + * have the device, class, and subdev_0 as fields in + * the virtio constants structure. 
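/*
 * Illustration (not part of the vendored code): the topology fields above
 * are derived from the backing store's logical sector size (sectsz),
 * physical sector size (sts), and physical stride offset (sto). A worked
 * example with 512-byte logical / 4096-byte physical sectors: the ratio is
 * 8, ffsll(8) is 4, so physical_block_exp is 3 (2^3 logical per physical).
 * ffsll() lives in <strings.h> on BSD/macOS; glibc declares it in
 * <string.h> as a GNU extension.
 */
#include <stdio.h>
#include <strings.h>

int
main(void)
{
	int sectsz = 512, sts = 4096, sto = 0;
	unsigned char physical_block_exp, alignment_offset;

	physical_block_exp =
	    (unsigned char) ((sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0);
	alignment_offset =
	    (unsigned char) ((sto != 0) ? ((sts - sto) / sectsz) : 0);

	printf("exp=%u offset=%u\n", physical_block_exp, alignment_offset);
	return (0);
}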
+ */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { + blockif_close(sc->bc); + free(sc); + return (1); + } + vi_set_io_bar(&sc->vbsc_vs, 0); + return (0); +} + +static int +pci_vtblk_cfgwrite(UNUSED void *vsc, int offset, UNUSED int size, + UNUSED uint32_t value) +{ + DPRINTF(("vtblk: write to readonly reg %d\n\r", offset)); + return (1); +} + +static int +pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtblk_softc *sc = vsc; + void *ptr; + + /* our caller has already verified offset and size */ + ptr = (uint8_t *)&sc->vbsc_cfg + offset; + memcpy(retval, ptr, size); + return (0); +} + +static struct pci_devemu pci_de_vblk = { + .pe_emu = "virtio-blk", + .pe_init = pci_vtblk_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vblk); diff --git a/vendor/github.com/hooklift/xhyve/pci_virtio_net_tap.c b/vendor/github.com/hooklift/xhyve/pci_virtio_net_tap.c new file mode 100644 index 0000000..4268528 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pci_virtio_net_tap.c @@ -0,0 +1,770 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define USE_MEVENT 0 + +#define VTNET_RINGSZ 1024 +#define VTNET_MAXSEGS 32 + +/* + * Host capabilities. Note that we only offer a few of these. 
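/*
 * Illustration (not part of the vendored code): pci_vtblk_cfgread() above
 * services config-space reads with a memcpy out of the vtblk_config
 * struct; the caller has already validated offset and size. The same
 * read-window pattern, standalone (the struct here is a cut-down stand-in
 * for vtblk_config):
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct cfg {
	uint64_t capacity;	/* offset 0 */
	uint32_t size_max;	/* offset 8 */
	uint32_t seg_max;	/* offset 12 */
};

static uint32_t
cfg_read(const struct cfg *c, int offset, int size)
{
	uint32_t v = 0;

	/* offset/size assumed pre-validated, as in the original */
	memcpy(&v, (const uint8_t *)c + offset, (size_t) size);
	return (v);
}

int
main(void)
{
	struct cfg c = { .capacity = 2048, .size_max = 0, .seg_max = 32 };

	printf("seg_max = %u\n", cfg_read(&c, 12, 4));
	return (0);
}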
+ */ +// #define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ +// #define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +// #define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ +// #define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ +// #define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ +// #define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ +// #define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ +// #define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ +// #define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ +// #define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ +// #define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ +// #define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ +// #define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ +// #define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ +// #define VIRTIO_NET_F_GUEST_ANNOUNCE (1 << 21) /* guest can send gratuit. pkts */ + +#define VTNET_S_HOSTCAPS \ + (VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ + VIRTIO_F_NOTIFY_ON_EMPTY) + +#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */ + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpacked" + +/* + * PCI config-space "registers" + */ +struct virtio_net_config { + uint8_t mac[6]; + uint16_t status; +} __packed; + +/* + * Queue definitions. 
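/*
 * Illustration (not part of the vendored code): the ETHER_IS_MULTICAST
 * macro above keys off the I/G bit -- the least significant bit of the
 * first address octet, which is set for multicast and broadcast frames.
 * Standalone check:
 */
#include <assert.h>
#include <stdint.h>

#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01)

int
main(void)
{
	uint8_t bcast[6]   = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
	uint8_t unicast[6] = { 0x00, 0xa0, 0x98, 0x12, 0x34, 0x56 };

	assert(ETHER_IS_MULTICAST(bcast));
	assert(!ETHER_IS_MULTICAST(unicast));
	return (0);
}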
+ */ +#define VTNET_RXQ 0 +#define VTNET_TXQ 1 +// #define VTNET_CTLQ 2 /* NB: not yet supported */ +#define VTNET_MAXQ 3 + +/* + * Fixed network header size + */ +struct virtio_net_rxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +} __packed; + +#pragma clang diagnostic pop + +/* + * Debug printf + */ +static int pci_vtnet_debug; +#define DPRINTF(params) if (pci_vtnet_debug) printf params +#define WPRINTF(params) printf params + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +/* + * Per-device softc + */ +struct pci_vtnet_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; + pthread_mutex_t vsc_mtx; + struct mevent *vsc_mevp; + int vsc_tapfd; + int vsc_rx_ready; + volatile int resetting;/* set and checked outside lock */ + uint64_t vsc_features; /* negotiated features */ + struct virtio_net_config vsc_config; + pthread_mutex_t rx_mtx; + int rx_in_progress; + int rx_vhdrlen; + int rx_merge; /* merged rx bufs in use */ + pthread_t tx_tid; + pthread_mutex_t tx_mtx; + pthread_cond_t tx_cond; + int tx_in_progress; +}; +#pragma clang diagnostic pop + +static void pci_vtnet_reset(void *); +/* static void pci_vtnet_notify(void *, struct vqueue_info *); */ +static int pci_vtnet_cfgread(void *, int, int, uint32_t *); +static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); +static void pci_vtnet_neg_features(void *, uint64_t); + +static struct virtio_consts vtnet_vi_consts = { + "vtnet", /* our name */ + VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ + sizeof(struct virtio_net_config), /* config reg size */ + pci_vtnet_reset, /* reset */ + NULL, /* device-wide qnotify -- not used */ + pci_vtnet_cfgread, /* read PCI config */ + pci_vtnet_cfgwrite, /* write PCI config */ + pci_vtnet_neg_features, /* apply negotiated features */ + VTNET_S_HOSTCAPS, /* our capabilities */ +}; + +/* + * If the transmit thread is active then stall until it is done. + */ +static void +pci_vtnet_txwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->tx_mtx); + while (sc->tx_in_progress) { + pthread_mutex_unlock(&sc->tx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->tx_mtx); + } + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * If the receive thread is active then stall until it is done. + */ +static void +pci_vtnet_rxwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->rx_mtx); + while (sc->rx_in_progress) { + pthread_mutex_unlock(&sc->rx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->rx_mtx); + } + pthread_mutex_unlock(&sc->rx_mtx); +} + +static void +pci_vtnet_reset(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device reset requested !\n")); + + sc->resetting = 1; + + /* + * Wait for the transmit and receive threads to finish their + * processing. + */ + pci_vtnet_txwait(sc); + pci_vtnet_rxwait(sc); + + sc->vsc_rx_ready = 0; + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); + + /* now reset rings, MSI-X vectors, and negotiated capabilities */ + vi_reset_dev(&sc->vsc_vs); + + sc->resetting = 0; +} + +/* + * Called to send a buffer chain out to the tap device + */ +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + static char pad[60]; /* all zero bytes */ + + if (sc->vsc_tapfd == -1) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. 
It is guaranteed that + * there is always an extra iov available by the caller. + */ + if (len < 60) { + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = (size_t) (60 - len); + iovcnt++; + } + (void) writev(sc->vsc_tapfd, iov, iovcnt); +} + +/* + * Called when there is read activity on the tap file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. + * MP note: the dummybuf is only used for discarding frames, so there + * is no need for it to be per-vtnet or locked. + */ +static uint8_t dummybuf[2048]; + +static __inline struct iovec * +rx_iov_trim(struct iovec *iov, int *niov, int tlen) +{ + struct iovec *riov; + + /* XXX short-cut: assume first segment is >= tlen */ + assert(iov[0].iov_len >= ((size_t) tlen)); + + iov[0].iov_len -= ((size_t) tlen); + if (iov[0].iov_len == 0) { + assert(*niov > 1); + *niov -= 1; + riov = &iov[1]; + } else { + iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + + ((size_t) tlen)); + riov = &iov[0]; + } + + return (riov); +} + +static void +pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) +{ + struct iovec iov[VTNET_MAXSEGS], *riov; + struct vqueue_info *vq; + void *vrx; + int len, n; + uint16_t idx; + + /* + * Should never be called without a valid tap fd + */ + assert(sc->vsc_tapfd != -1); + + /* + * But, will be called when the rx ring hasn't yet + * been set up or the guest is resetting the device. + */ + if (!sc->vsc_rx_ready || sc->resetting) { + /* + * Drop the packet and try later. + */ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); + return; + } + + /* + * Check for available rx buffers + */ + vq = &sc->vsc_queues[VTNET_RXQ]; + if (!vq_has_descs(vq)) { + /* + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. + */ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); + vq_endchains(vq, 1); + return; + } + + do { + /* + * Get descriptor chain. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); + + len = (int) readv(sc->vsc_tapfd, riov, n); + + if (len < 0 && errno == EWOULDBLOCK) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; + } + + /* + * The only valid field in the rx packet header is the + * number of buffers if merged rx bufs were negotiated. + */ + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } + + /* + * Release this chain and handle more chains. + */ + vq_relchain(vq, idx, ((uint32_t) (len + sc->rx_vhdrlen))); + } while (vq_has_descs(vq)); + + /* Interrupt if needed, including for NOTIFY_ON_EMPTY. 
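/*
 * Illustration (not part of the vendored code): rx_iov_trim() above peels
 * tlen header bytes off the front of an iovec array in place, returning a
 * pointer to the first segment that still holds data. A standalone run of
 * the same logic, showing the fully-consumed-first-segment case:
 */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/uio.h>

static struct iovec *
trim(struct iovec *iov, int *niov, size_t tlen)
{
	assert(iov[0].iov_len >= tlen);
	iov[0].iov_len -= tlen;
	if (iov[0].iov_len == 0) {
		assert(*niov > 1);
		*niov -= 1;		/* drop the emptied segment */
		return (&iov[1]);
	}
	iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
	return (&iov[0]);
}

int
main(void)
{
	char buf[1024];
	struct iovec iov[2] = {
		{ .iov_base = buf, .iov_len = 10 },	/* 10-byte rx header */
		{ .iov_base = buf + 10, .iov_len = sizeof(buf) - 10 },
	};
	int n = 2;
	struct iovec *riov = trim(iov, &n, 10);

	/* header segment fully consumed: data now starts at old iov[1] */
	printf("n=%d first_len=%zu\n", n, riov[0].iov_len);
	return (0);
}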
*/ + vq_endchains(vq, 1); +} + +#if USE_MEVENT +static void +pci_vtnet_tap_callback(UNUSED int fd, UNUSED enum ev_type type, void *param) +{ + struct pci_vtnet_softc *sc = param; + + pthread_mutex_lock(&sc->rx_mtx); + sc->rx_in_progress = 1; + pci_vtnet_tap_rx(sc); + sc->rx_in_progress = 0; + pthread_mutex_unlock(&sc->rx_mtx); + +} + +#else /* !USE_MEVENT */ + +static void * +pci_vtnet_tap_select_func(void *vsc) { + struct pci_vtnet_softc *sc; + fd_set rfd; + + sc = vsc; + + assert(sc); + assert(sc->vsc_tapfd != -1); + + FD_ZERO(&rfd); + FD_SET(sc->vsc_tapfd, &rfd); + + while (1) { + if (select((sc->vsc_tapfd + 1), &rfd, NULL, NULL, NULL) == -1) { + abort(); + } + + pthread_mutex_lock(&sc->rx_mtx); + sc->rx_in_progress = 1; + pci_vtnet_tap_rx(sc); + sc->rx_in_progress = 0; + pthread_mutex_unlock(&sc->rx_mtx); + } + + return (NULL); +} +#endif + +static void +pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * A qnotify means that the rx process can now begin + */ + if (sc->vsc_rx_ready == 0) { + sc->vsc_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + } +} + +static void +pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) +{ + struct iovec iov[VTNET_MAXSEGS + 1]; + int i, n; + int plen, tlen; + uint16_t idx; + + /* + * Obtain chain of descriptors. The first one is + * really the header descriptor, so we need to sum + * up two lengths: packet length and transfer length. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + plen = 0; + tlen = (int) iov[0].iov_len; + for (i = 1; i < n; i++) { + plen += iov[i].iov_len; + tlen += iov[i].iov_len; + } + + DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); + pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen); + + /* chain is processed, release it and set tlen */ + vq_relchain(vq, idx, ((uint32_t) tlen)); +} + +static void +pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * Any ring entries to process? + */ + if (!vq_has_descs(vq)) + return; + + /* Signal the tx thread for processing */ + pthread_mutex_lock(&sc->tx_mtx); + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + if (sc->tx_in_progress == 0) + pthread_cond_signal(&sc->tx_cond); + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * Thread which will handle processing of TX desc + */ +static void * +pci_vtnet_tx_thread(void *param) +{ + struct pci_vtnet_softc *sc = param; + struct vqueue_info *vq; + int error; + + vq = &sc->vsc_queues[VTNET_TXQ]; + + /* + * Let us wait till the tx queue pointers get initialised & + * first tx signaled + */ + pthread_mutex_lock(&sc->tx_mtx); + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + + for (;;) { + /* note - tx mutex is locked here */ + while (sc->resetting || !vq_has_descs(vq)) { + vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; + mb(); + if (!sc->resetting && vq_has_descs(vq)) + break; + + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + } + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + sc->tx_in_progress = 1; + pthread_mutex_unlock(&sc->tx_mtx); + + do { + /* + * Run through entries, placing them into + * iovecs and sending when an end-of-packet + * is found + */ + pci_vtnet_proctx(sc, vq); + } while (vq_has_descs(vq)); + + /* + * Generate an interrupt if needed. 
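/*
 * Illustration (not part of the vendored code): the tx path above is a
 * classic condition-variable handoff -- pci_vtnet_ping_txq() signals
 * tx_cond under tx_mtx, and the tx thread sleeps on it whenever the ring
 * is empty. A minimal standalone version of that rendezvous:
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int work;

static void *
consumer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&mtx);
	while (work == 0)
		pthread_cond_wait(&cond, &mtx);	/* releases mtx while asleep */
	printf("got %d work item(s)\n", work);
	pthread_mutex_unlock(&mtx);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, consumer, NULL);
	pthread_mutex_lock(&mtx);
	work = 1;			/* cf. vq_has_descs() becoming true */
	pthread_cond_signal(&cond);	/* cf. pci_vtnet_ping_txq() */
	pthread_mutex_unlock(&mtx);
	pthread_join(tid, NULL);
	return (0);
}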
+ */ + vq_endchains(vq, 1); + + pthread_mutex_lock(&sc->tx_mtx); + } +} + +#ifdef notyet +static void +pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) +{ + + DPRINTF(("vtnet: control qnotify!\n\r")); +} +#endif + +static int +pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + fprintf(stderr, "Invalid MAC %s\n", mac_str); + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + + return (0); +} + + +static int +pci_vtnet_init(struct pci_devinst *pi, char *opts) +{ + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + struct pci_vtnet_softc *sc; + char *devname; + char *vtopts; + int mac_provided; +#if !USE_MEVENT + pthread_t sthrd; +#endif + + sc = calloc(1, sizeof(struct pci_vtnet_softc)); + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; + sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; +#ifdef notyet + sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; +#endif + + /* + * Attempt to open the tap device and read the MAC address + * if specified + */ + mac_provided = 0; + sc->vsc_tapfd = -1; + if (opts != NULL) { + char tbuf[80]; + int err; + + devname = vtopts = strdup(opts); + (void) strsep(&vtopts, ","); + + if (vtopts != NULL) { + err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac); + if (err != 0) { + free(devname); + return (err); + } + mac_provided = 1; + } + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, devname, sizeof(tbuf)); + + free(devname); + + sc->vsc_tapfd = open(tbuf, O_RDWR); + if (sc->vsc_tapfd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + } else { + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + +#if USE_MEVENT + sc->vsc_mevp = mevent_add(sc->vsc_tapfd, + EVF_READ, + pci_vtnet_tap_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + +#else /* !USE_MEVENT */ + if (pthread_create(&sthrd, NULL, pci_vtnet_tap_select_func, sc)) { + WPRINTF(("Could not create tap receive thread\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } +#endif + } + } + + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + if (!mac_provided) { + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->pi_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, ((unsigned int) strlen(nstr))); + MD5Final(digest, &mdctx); + + sc->vsc_config.mac[0] = 0x00; + sc->vsc_config.mac[1] = 0xa0; + sc->vsc_config.mac[2] = 0x98; + sc->vsc_config.mac[3] = digest[0]; + sc->vsc_config.mac[4] = digest[1]; + sc->vsc_config.mac[5] = digest[2]; + } + + /* initialize config space */ + pci_set_cfgdata16(pi, 
PCIR_DEVICE, VIRTIO_DEV_NET); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + /* Link is up if we managed to open tap device. */ + sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0); + + /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + + /* use BAR 0 to map config regs in IO space */ + vi_set_io_bar(&sc->vsc_vs, 0); + + sc->resetting = 0; + + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); + sc->rx_in_progress = 0; + pthread_mutex_init(&sc->rx_mtx, NULL); + + /* + * Initialize tx semaphore & spawn TX processing thread. + * As of now, only one thread for TX desc processing is + * spawned. + */ + sc->tx_in_progress = 0; + pthread_mutex_init(&sc->tx_mtx, NULL); + pthread_cond_init(&sc->tx_cond, NULL); + pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); + return (0); +} + +static int +pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + if (offset < 6) { + assert(offset + size <= 6); + /* + * The driver is allowed to change the MAC address + */ + ptr = &sc->vsc_config.mac[offset]; + memcpy(ptr, &value, size); + } else { + /* silently ignore other writes */ + DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); + } + + return (0); +} + +static int +pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static void +pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtnet_softc *sc = vsc; + + sc->vsc_features = negotiated_features; + + if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) { + sc->rx_merge = 0; + /* non-merge rx header is 2 bytes shorter */ + sc->rx_vhdrlen -= 2; + } +} + +static struct pci_devemu pci_de_vnet_tap = { + .pe_emu = "virtio-tap", + .pe_init = pci_vtnet_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vnet_tap); diff --git a/vendor/github.com/hooklift/xhyve/pci_virtio_net_vmnet.c b/vendor/github.com/hooklift/xhyve/pci_virtio_net_vmnet.c new file mode 100644 index 0000000..a309cb1 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pci_virtio_net_vmnet.c @@ -0,0 +1,820 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
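/*
 * Illustration (not part of the vendored code): when no mac= option is
 * given, pci_vtnet_init() in the tap variant above builds the guest MAC
 * from the fixed NetApp OUI 00:a0:98 plus the first three bytes of an MD5
 * of "<slot>-<func>-<vmname>". Layout only -- the digest bytes below are
 * stand-ins so the sketch stays dependency-free:
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint8_t digest[3] = { 0xde, 0xad, 0xbe };	/* stand-in for MD5 */
	uint8_t mac[6];

	mac[0] = 0x00; mac[1] = 0xa0; mac[2] = 0x98;	/* NetApp OUI */
	mac[3] = digest[0];				/* per-VM suffix */
	mac[4] = digest[1];
	mac[5] = digest[2];

	printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
	    mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	return (0);
}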
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * + * The vmnet support is ported from the MirageOS project: + * + * https://github.com/mirage/ocaml-vmnet + * + * Copyright (C) 2014 Anil Madhavapeddy + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VTNET_RINGSZ 1024 +#define VTNET_MAXSEGS 32 + +/* + * Host capabilities. Note that we only offer a few of these. + */ +// #define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ +// #define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +// #define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ +// #define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ +// #define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ +// #define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ +// #define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ +// #define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ +// #define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ +// #define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ +// #define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ +// #define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ +// #define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ +// #define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ +// #define VIRTIO_NET_F_GUEST_ANNOUNCE (1 << 21) /* guest can send gratuit. pkts */ + +#define VTNET_S_HOSTCAPS \ + (VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ + VIRTIO_F_NOTIFY_ON_EMPTY) + +// #define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? 
*/ + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpacked" + +/* + * PCI config-space "registers" + */ +struct virtio_net_config { + uint8_t mac[6]; + uint16_t status; +} __packed; + +/* + * Queue definitions. + */ +#define VTNET_RXQ 0 +#define VTNET_TXQ 1 +// #define VTNET_CTLQ 2 /* NB: not yet supported */ +#define VTNET_MAXQ 3 + +/* + * Fixed network header size + */ +struct virtio_net_rxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +} __packed; + +#pragma clang diagnostic pop + +/* + * Debug printf + */ +static int pci_vtnet_debug; +#define DPRINTF(params) if (pci_vtnet_debug) printf params + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +/* + * Per-device softc + */ +struct pci_vtnet_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; + pthread_mutex_t vsc_mtx; + struct vmnet_state *vms; + int vsc_rx_ready; + volatile int resetting;/* set and checked outside lock */ + uint64_t vsc_features; /* negotiated features */ + struct virtio_net_config vsc_config; + pthread_mutex_t rx_mtx; + int rx_in_progress; + int rx_vhdrlen; + int rx_merge; /* merged rx bufs in use */ + pthread_t tx_tid; + pthread_mutex_t tx_mtx; + pthread_cond_t tx_cond; + int tx_in_progress; +}; + +static void pci_vtnet_reset(void *); +/* static void pci_vtnet_notify(void *, struct vqueue_info *); */ +static int pci_vtnet_cfgread(void *, int, int, uint32_t *); +static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); +static void pci_vtnet_neg_features(void *, uint64_t); + +static struct virtio_consts vtnet_vi_consts = { + "vtnet", /* our name */ + VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ + sizeof(struct virtio_net_config), /* config reg size */ + pci_vtnet_reset, /* reset */ + NULL, /* device-wide qnotify -- not used */ + pci_vtnet_cfgread, /* read PCI config */ + pci_vtnet_cfgwrite, /* write PCI config */ + pci_vtnet_neg_features, /* apply negotiated features */ + VTNET_S_HOSTCAPS, /* our capabilities */ +}; + +struct vmnet_state { + interface_ref iface; + uint8_t mac[6]; + unsigned int mtu; + unsigned int max_packet_size; +}; + +#pragma clang diagnostic pop + +static void pci_vtnet_tap_callback(struct pci_vtnet_softc *sc); + +static int +vmn_create(struct pci_vtnet_softc *sc) +{ + xpc_object_t interface_desc; + uuid_t uuid; + __block interface_ref iface; + __block vmnet_return_t iface_status; + dispatch_semaphore_t iface_created; + dispatch_queue_t if_create_q; + dispatch_queue_t if_q; + struct vmnet_state *vms; + uint32_t uuid_status; + + interface_desc = xpc_dictionary_create(NULL, NULL, 0); + xpc_dictionary_set_uint64(interface_desc, vmnet_operation_mode_key, + VMNET_SHARED_MODE); + + if (guest_uuid_str != NULL) { + uuid_from_string(guest_uuid_str, &uuid, &uuid_status); + if (uuid_status != uuid_s_ok) { + return (-1); + } + } else { + uuid_generate_random(uuid); + } + + xpc_dictionary_set_uuid(interface_desc, vmnet_interface_id_key, uuid); + iface = NULL; + iface_status = 0; + + vms = malloc(sizeof(struct vmnet_state)); + + if (!vms) { + return (-1); + } + + if_create_q = dispatch_queue_create("org.xhyve.vmnet.create", + DISPATCH_QUEUE_SERIAL); + + iface_created = dispatch_semaphore_create(0); + + iface = vmnet_start_interface(interface_desc, if_create_q, + ^(vmnet_return_t status, xpc_object_t interface_param) + { + iface_status = status; + if (status != VMNET_SUCCESS || 
!interface_param) { + dispatch_semaphore_signal(iface_created); + return; + } + + if (sscanf(xpc_dictionary_get_string(interface_param, + vmnet_mac_address_key), + "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + &vms->mac[0], &vms->mac[1], &vms->mac[2], &vms->mac[3], + &vms->mac[4], &vms->mac[5]) != 6) + { + assert(0); + } + + vms->mtu = (unsigned) + xpc_dictionary_get_uint64(interface_param, vmnet_mtu_key); + vms->max_packet_size = (unsigned) + xpc_dictionary_get_uint64(interface_param, + vmnet_max_packet_size_key); + dispatch_semaphore_signal(iface_created); + }); + + dispatch_semaphore_wait(iface_created, DISPATCH_TIME_FOREVER); + dispatch_release(if_create_q); + + if (iface == NULL || iface_status != VMNET_SUCCESS) { + printf("virtio_net: Could not create vmnet interface, " + "permission denied or no entitlement?\n"); + free(vms); + return (-1); + } + + vms->iface = iface; + sc->vms = vms; + + if_q = dispatch_queue_create("org.xhyve.vmnet.iface_q", 0); + + vmnet_interface_set_event_callback(iface, VMNET_INTERFACE_PACKETS_AVAILABLE, + if_q, ^(UNUSED interface_event_t event_id, UNUSED xpc_object_t event) + { + pci_vtnet_tap_callback(sc); + }); + + return (0); +} + +static ssize_t +vmn_read(struct vmnet_state *vms, struct iovec *iov, int n) { + vmnet_return_t r; + struct vmpktdesc v; + int pktcnt; + int i; + + v.vm_pkt_size = 0; + + for (i = 0; i < n; i++) { + v.vm_pkt_size += iov[i].iov_len; + } + + assert(v.vm_pkt_size >= vms->max_packet_size); + + v.vm_pkt_iov = iov; + v.vm_pkt_iovcnt = (uint32_t) n; + v.vm_flags = 0; /* TODO no clue what this is */ + + pktcnt = 1; + + r = vmnet_read(vms->iface, &v, &pktcnt); + + assert(r == VMNET_SUCCESS); + + if (pktcnt < 1) { + return (-1); + } + + return ((ssize_t) v.vm_pkt_size); +} + +static void +vmn_write(struct vmnet_state *vms, struct iovec *iov, int n) { + vmnet_return_t r; + struct vmpktdesc v; + int pktcnt; + int i; + + v.vm_pkt_size = 0; + + for (i = 0; i < n; i++) { + v.vm_pkt_size += iov[i].iov_len; + } + + assert(v.vm_pkt_size <= vms->max_packet_size); + + v.vm_pkt_iov = iov; + v.vm_pkt_iovcnt = (uint32_t) n; + v.vm_flags = 0; /* TODO no clue what this is */ + + pktcnt = 1; + + r = vmnet_write(vms->iface, &v, &pktcnt); + + assert(r == VMNET_SUCCESS); +} + +/* + * If the transmit thread is active then stall until it is done. + */ +static void +pci_vtnet_txwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->tx_mtx); + while (sc->tx_in_progress) { + pthread_mutex_unlock(&sc->tx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->tx_mtx); + } + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * If the receive thread is active then stall until it is done. + */ +static void +pci_vtnet_rxwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->rx_mtx); + while (sc->rx_in_progress) { + pthread_mutex_unlock(&sc->rx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->rx_mtx); + } + pthread_mutex_unlock(&sc->rx_mtx); +} + +static void +pci_vtnet_reset(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device reset requested !\n")); + + sc->resetting = 1; + + /* + * Wait for the transmit and receive threads to finish their + * processing. 
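/*
 * Illustration (not part of the vendored code): vmn_read()/vmn_write()
 * above wrap an iovec array in a struct vmpktdesc whose vm_pkt_size must
 * equal the sum of the segment lengths (asserted >= max_packet_size for
 * reads and <= for writes). The sizing step alone:
 */
#include <stdio.h>
#include <sys/uio.h>

static size_t
iov_total(const struct iovec *iov, int n)
{
	size_t total = 0;
	int i;

	for (i = 0; i < n; i++)
		total += iov[i].iov_len;
	return (total);
}

int
main(void)
{
	char a[1500], b[546];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};

	printf("vm_pkt_size = %zu\n", iov_total(iov, 2));
	return (0);
}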
+ */ + pci_vtnet_txwait(sc); + pci_vtnet_rxwait(sc); + + sc->vsc_rx_ready = 0; + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); + + /* now reset rings, MSI-X vectors, and negotiated capabilities */ + vi_reset_dev(&sc->vsc_vs); + + sc->resetting = 0; +} + +/* + * Called to send a buffer chain out to the tap device + */ +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + static char pad[60]; /* all zero bytes */ + + if (!sc->vms) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. + */ + if (len < 60) { + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = (size_t) (60 - len); + iovcnt++; + } + vmn_write(sc->vms, iov, iovcnt); +} + +/* + * Called when there is read activity on the tap file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. + * MP note: the dummybuf is only used for discarding frames, so there + * is no need for it to be per-vtnet or locked. + */ +static uint8_t dummybuf[2048]; + +static __inline struct iovec * +rx_iov_trim(struct iovec *iov, int *niov, int tlen) +{ + struct iovec *riov; + + /* XXX short-cut: assume first segment is >= tlen */ + assert(iov[0].iov_len >= ((size_t) tlen)); + + iov[0].iov_len -= ((size_t) tlen); + if (iov[0].iov_len == 0) { + assert(*niov > 1); + *niov -= 1; + riov = &iov[1]; + } else { + iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + + ((size_t) tlen)); + riov = &iov[0]; + } + + return (riov); +} + +static void +pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) +{ + struct iovec iov[VTNET_MAXSEGS], *riov; + struct vqueue_info *vq; + void *vrx; + int len, n; + uint16_t idx; + + /* + * Should never be called without a valid tap fd + */ + assert(sc->vms); + + /* + * But, will be called when the rx ring hasn't yet + * been set up or the guest is resetting the device. + */ + if (!sc->vsc_rx_ready || sc->resetting) { + /* + * Drop the packet and try later. + */ + iov[0].iov_base = dummybuf; + iov[0].iov_len = sizeof(dummybuf); + (void) vmn_read(sc->vms, iov, 1); + return; + } + + /* + * Check for available rx buffers + */ + vq = &sc->vsc_queues[VTNET_RXQ]; + if (!vq_has_descs(vq)) { + /* + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. + */ + iov[0].iov_base = dummybuf; + iov[0].iov_len = sizeof(dummybuf); + (void) vmn_read(sc->vms, iov, 1); + vq_endchains(vq, 1); + return; + } + + do { + /* + * Get descriptor chain. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); + + len = (int) vmn_read(sc->vms, riov, n); + + if (len < 0 && errno == EWOULDBLOCK) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; + } + + /* + * The only valid field in the rx packet header is the + * number of buffers if merged rx bufs were negotiated. + */ + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } + + /* + * Release this chain and handle more chains. 
+ */ + vq_relchain(vq, idx, ((uint32_t) (len + sc->rx_vhdrlen))); + } while (vq_has_descs(vq)); + + /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ + vq_endchains(vq, 1); +} + +static void +pci_vtnet_tap_callback(struct pci_vtnet_softc *sc) +{ + pthread_mutex_lock(&sc->rx_mtx); + sc->rx_in_progress = 1; + pci_vtnet_tap_rx(sc); + sc->rx_in_progress = 0; + pthread_mutex_unlock(&sc->rx_mtx); + +} + +static void +pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * A qnotify means that the rx process can now begin + */ + if (sc->vsc_rx_ready == 0) { + sc->vsc_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + } +} + +static void +pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) +{ + struct iovec iov[VTNET_MAXSEGS + 1]; + int i, n; + int plen, tlen; + uint16_t idx; + + /* + * Obtain chain of descriptors. The first one is + * really the header descriptor, so we need to sum + * up two lengths: packet length and transfer length. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + plen = 0; + tlen = (int) iov[0].iov_len; + for (i = 1; i < n; i++) { + plen += iov[i].iov_len; + tlen += iov[i].iov_len; + } + + DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); + pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen); + + /* chain is processed, release it and set tlen */ + vq_relchain(vq, idx, ((uint32_t) tlen)); +} + +static void +pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * Any ring entries to process? + */ + if (!vq_has_descs(vq)) + return; + + /* Signal the tx thread for processing */ + pthread_mutex_lock(&sc->tx_mtx); + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + if (sc->tx_in_progress == 0) + pthread_cond_signal(&sc->tx_cond); + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * Thread which will handle processing of TX desc + */ +static void * +pci_vtnet_tx_thread(void *param) +{ + struct pci_vtnet_softc *sc = param; + struct vqueue_info *vq; + int error; + + vq = &sc->vsc_queues[VTNET_TXQ]; + + /* + * Let us wait till the tx queue pointers get initialised & + * first tx signaled + */ + pthread_mutex_lock(&sc->tx_mtx); + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + + for (;;) { + /* note - tx mutex is locked here */ + while (sc->resetting || !vq_has_descs(vq)) { + vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; + mb(); + if (!sc->resetting && vq_has_descs(vq)) + break; + + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + } + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + sc->tx_in_progress = 1; + pthread_mutex_unlock(&sc->tx_mtx); + + do { + /* + * Run through entries, placing them into + * iovecs and sending when an end-of-packet + * is found + */ + pci_vtnet_proctx(sc, vq); + } while (vq_has_descs(vq)); + + /* + * Generate an interrupt if needed. 
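 */

The loop above is a lost-wakeup-safe consumer: pci_vtnet_ping_txq() signals only while tx_in_progress is clear, and the thread re-enables guest notifications and re-checks the ring before sleeping. The same protocol distilled to plain pthreads, with hypothetical names standing in for the softc fields:

```c
#include <pthread.h>
#include <stdbool.h>

struct txq {
	pthread_mutex_t	mtx;
	pthread_cond_t	cond;
	bool		busy;		/* mirrors sc->tx_in_progress */
	int		pending;	/* mirrors vq_has_descs() */
};

/* Producer side: queue work, wake the worker only if it is idle. */
static void
txq_notify(struct txq *q)
{
	pthread_mutex_lock(&q->mtx);
	q->pending++;
	if (!q->busy)
		pthread_cond_signal(&q->cond);
	pthread_mutex_unlock(&q->mtx);
}

/* Consumer side: mark ourselves idle before sleeping and re-check the
 * queue after waking. Work queued while 'busy' is never lost because
 * the next loop iteration re-reads 'pending' under the same mutex. */
static void *
txq_worker(void *arg)
{
	struct txq *q = arg;

	pthread_mutex_lock(&q->mtx);
	for (;;) {
		while (q->pending == 0) {
			q->busy = false;
			pthread_cond_wait(&q->cond, &q->mtx);
		}
		q->busy = true;
		q->pending--;
		pthread_mutex_unlock(&q->mtx);
		/* ... process one descriptor chain here ... */
		pthread_mutex_lock(&q->mtx);
	}
}
```

/*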
+ */ + vq_endchains(vq, 1); + + pthread_mutex_lock(&sc->tx_mtx); + } +} + +#ifdef notyet +static void +pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) +{ + DPRINTF(("vtnet: control qnotify!\n\r")); +} +#endif + +static int +pci_vtnet_init(struct pci_devinst *pi, UNUSED char *opts) +{ + struct pci_vtnet_softc *sc; + int mac_provided; + + sc = calloc(1, sizeof(struct pci_vtnet_softc)); + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; + sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; +#ifdef notyet + sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; +#endif + + /* + * Attempt to open the tap device and read the MAC address + * if specified + */ + mac_provided = 0; + + if (vmn_create(sc) == -1) { + return (-1); + } + + sc->vsc_config.mac[0] = sc->vms->mac[0]; + sc->vsc_config.mac[1] = sc->vms->mac[1]; + sc->vsc_config.mac[2] = sc->vms->mac[2]; + sc->vsc_config.mac[3] = sc->vms->mac[3]; + sc->vsc_config.mac[4] = sc->vms->mac[4]; + sc->vsc_config.mac[5] = sc->vms->mac[5]; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + /* Link is up if we managed to open tap device. */ + sc->vsc_config.status = 1; + + /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + + /* use BAR 0 to map config regs in IO space */ + vi_set_io_bar(&sc->vsc_vs, 0); + + sc->resetting = 0; + + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); + sc->rx_in_progress = 0; + pthread_mutex_init(&sc->rx_mtx, NULL); + + /* + * Initialize tx semaphore & spawn TX processing thread. + * As of now, only one thread for TX desc processing is + * spawned. 
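 */

The MAC bytes copied into vsc_config above originate in the sscanf() call inside the vmnet interface-creation handler; the %hhx conversion stores each colon-separated group directly into uint8_t storage. The parse in isolation, as a sketch:

```c
#include <stdint.h>
#include <stdio.h>

/* Parse "aa:bb:cc:dd:ee:ff" into six bytes; returns 0 on success,
 * -1 when the string does not yield exactly six groups. */
static int
mac_parse(const char *s, uint8_t mac[6])
{
	int n = sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
	    &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]);

	return (n == 6 ? 0 : -1);
}
```

/*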
+ */ + sc->tx_in_progress = 0; + pthread_mutex_init(&sc->tx_mtx, NULL); + pthread_cond_init(&sc->tx_cond, NULL); + pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); + return (0); +} + +static int +pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + if (offset < 6) { + assert(offset + size <= 6); + /* + * The driver is allowed to change the MAC address + */ + ptr = &sc->vsc_config.mac[offset]; + memcpy(ptr, &value, size); + } else { + /* silently ignore other writes */ + DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); + } + + return (0); +} + +static int +pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static void +pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtnet_softc *sc = vsc; + + sc->vsc_features = negotiated_features; + + if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) { + sc->rx_merge = 0; + /* non-merge rx header is 2 bytes shorter */ + sc->rx_vhdrlen -= 2; + } +} + +static struct pci_devemu pci_de_vnet_vmnet = { + .pe_emu = "virtio-net", + .pe_init = pci_vtnet_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vnet_vmnet); diff --git a/vendor/github.com/hooklift/xhyve/pci_virtio_rnd.c b/vendor/github.com/hooklift/xhyve/pci_virtio_rnd.c new file mode 100644 index 0000000..b392c46 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pci_virtio_rnd.c @@ -0,0 +1,190 @@ +/*- + * Copyright (c) 2014 Nahanni Systems Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * virtio entropy device emulation. + * Randomness is sourced from /dev/random which does not block + * once it has been seeded at bootup. 
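 */

Looking back at pci_vtnet_neg_features() for a moment: the 2-byte adjustment there reflects the trailing num_buffers field that the virtio spec appends to the rx header only when VIRTIO_NET_F_MRG_RXBUF (feature bit 15) is negotiated. A sketch with spec-derived layouts; the struct and macro names here are illustrative, not the vendored headers':

```c
#include <stddef.h>
#include <stdint.h>

#define VNET_F_MRG_RXBUF (1u << 15)	/* virtio-net feature bit 15 */

/* Basic rx header (10 bytes)... */
struct vnet_hdr {
	uint8_t  flags;
	uint8_t  gso_type;
	uint16_t hdr_len;
	uint16_t gso_size;
	uint16_t csum_start;
	uint16_t csum_offset;
};

/* ...and the merged-buffer variant (12 bytes); the delta is exactly
 * the 'rx_vhdrlen -= 2' seen above. */
struct vnet_hdr_mrg {
	struct vnet_hdr hdr;
	uint16_t num_buffers;
};

static size_t
rx_hdr_len(uint64_t negotiated_features)
{
	return ((negotiated_features & VNET_F_MRG_RXBUF) ?
	    sizeof(struct vnet_hdr_mrg) : sizeof(struct vnet_hdr));
}
```

/*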
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VTRND_RINGSZ 64 + + +static int pci_vtrnd_debug; +#define DPRINTF(params) if (pci_vtrnd_debug) printf params +#define WPRINTF(params) printf params + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +/* + * Per-device softc + */ +struct pci_vtrnd_softc { + struct virtio_softc vrsc_vs; + struct vqueue_info vrsc_vq; + pthread_mutex_t vrsc_mtx; + uint64_t vrsc_cfg; + int vrsc_fd; +}; +#pragma clang diagnostic pop + +static void pci_vtrnd_reset(void *); +static void pci_vtrnd_notify(void *, struct vqueue_info *); + +static struct virtio_consts vtrnd_vi_consts = { + "vtrnd", /* our name */ + 1, /* we support 1 virtqueue */ + 0, /* config reg size */ + pci_vtrnd_reset, /* reset */ + pci_vtrnd_notify, /* device-wide qnotify */ + NULL, /* read virtio config */ + NULL, /* write virtio config */ + NULL, /* apply negotiated features */ + 0, /* our capabilities */ +}; + + +static void +pci_vtrnd_reset(void *vsc) +{ + struct pci_vtrnd_softc *sc; + + sc = vsc; + + DPRINTF(("vtrnd: device reset requested !\n")); + vi_reset_dev(&sc->vrsc_vs); +} + + +static void +pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) +{ + struct iovec iov; + struct pci_vtrnd_softc *sc; + int len; + uint16_t idx; + + sc = vsc; + + if (sc->vrsc_fd < 0) { + vq_endchains(vq, 0); + return; + } + + while (vq_has_descs(vq)) { + vq_getchain(vq, &idx, &iov, 1, NULL); + + len = (int) read(sc->vrsc_fd, iov.iov_base, iov.iov_len); + + DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", len)); + + /* Catastrophe if unable to read from /dev/random */ + assert(len > 0); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, ((uint32_t) len)); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + + +static int +pci_vtrnd_init(struct pci_devinst *pi, UNUSED char *opts) +{ + struct pci_vtrnd_softc *sc; + int fd; + int len; + uint8_t v; + + /* + * Should always be able to open /dev/random. + */ + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + + assert(fd >= 0); + + /* + * Check that device is seeded and non-blocking. 
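 */

What the probe below does, as a standalone program: open /dev/random non-blocking and attempt a one-byte read; a short read would mean the pool cannot yet back the guest's entropy device. A minimal sketch:

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	unsigned char v;
	int fd = open("/dev/random", O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open /dev/random");
		return (1);
	}
	if (read(fd, &v, sizeof(v)) != (ssize_t) sizeof(v)) {
		fprintf(stderr, "/dev/random not ready\n");
		close(fd);
		return (1);
	}
	close(fd);
	printf("entropy source ready\n");
	return (0);
}
```

/*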
+ */ + len = (int) read(fd, &v, sizeof(v)); + if (len <= 0) { + WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); + return (1); + } + + sc = calloc(1, sizeof(struct pci_vtrnd_softc)); + + vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); + sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; + + sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; + + /* keep /dev/random opened while emulating */ + sc->vrsc_fd = fd; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vrsc_vs, 0); + + return (0); +} + + +static struct pci_devemu pci_de_vrnd = { + .pe_emu = "virtio-rnd", + .pe_init = pci_vtrnd_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vrnd); diff --git a/vendor/github.com/hooklift/xhyve/pm.c b/vendor/github.com/hooklift/xhyve/pm.c new file mode 100644 index 0000000..56139b0 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/pm.c @@ -0,0 +1,302 @@ +/*- + * Copyright (c) 2013 Hudson River Trading LLC + * Written by: John H. Baldwin + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER; +static struct mevent *power_button; +static sig_t old_power_handler; + +/* + * Reset Control register at I/O port 0xcf9. Bit 2 forces a system + * reset when it transitions from 0 to 1. Bit 1 selects the type of + * reset to attempt: 0 selects a "soft" reset, and 1 selects a "hard" + * reset. + */ +static int +reset_handler(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + int error; + + static uint8_t reset_control; + + if (bytes != 1) + return (-1); + if (in) + *eax = reset_control; + else { + reset_control = (uint8_t) *eax; + + /* Treat hard and soft resets the same. 
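 */

Spelled out, a value the guest writes to the register decodes as below; the bit names are illustrative, following common chipset documentation rather than a header in this tree:

```c
#include <stdbool.h>
#include <stdint.h>

struct cf9_bits {
	bool reset_requested;	/* bit 2: initiates the reset on a 0->1 write */
	bool hard_reset;	/* bit 1: hard vs. soft reset type */
};

static struct cf9_bits
cf9_decode(uint8_t reset_control)
{
	return ((struct cf9_bits) {
	    .reset_requested = (reset_control & 0x4) != 0,
	    .hard_reset = (reset_control & 0x2) != 0,
	});
}
```

/*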
*/ + if (reset_control & 0x4) { + error = xh_vm_suspend(VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); + } + } + return (0); +} +INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler); + +/* + * ACPI's SCI is a level-triggered interrupt. + */ +static int sci_active; + +static void +sci_assert(void) +{ + if (sci_active) + return; + xh_vm_isa_assert_irq(SCI_INT, SCI_INT); + sci_active = 1; +} + +static void +sci_deassert(void) +{ + if (!sci_active) + return; + xh_vm_isa_deassert_irq(SCI_INT, SCI_INT); + sci_active = 0; +} + +/* + * Power Management 1 Event Registers + * + * The only power management event supported is a power button upon + * receiving SIGTERM. + */ +static uint16_t pm1_enable, pm1_status; + +#define PM1_TMR_STS 0x0001 +#define PM1_BM_STS 0x0010 +#define PM1_GBL_STS 0x0020 +#define PM1_PWRBTN_STS 0x0100 +#define PM1_SLPBTN_STS 0x0200 +#define PM1_RTC_STS 0x0400 +#define PM1_WAK_STS 0x8000 + +#define PM1_TMR_EN 0x0001 +#define PM1_GBL_EN 0x0020 +#define PM1_PWRBTN_EN 0x0100 +#define PM1_SLPBTN_EN 0x0200 +#define PM1_RTC_EN 0x0400 + +static void +sci_update(void) +{ + int need_sci; + + /* See if the SCI should be active or not. */ + need_sci = 0; + if ((pm1_enable & PM1_TMR_EN) && (pm1_status & PM1_TMR_STS)) + need_sci = 1; + if ((pm1_enable & PM1_GBL_EN) && (pm1_status & PM1_GBL_STS)) + need_sci = 1; + if ((pm1_enable & PM1_PWRBTN_EN) && (pm1_status & PM1_PWRBTN_STS)) + need_sci = 1; + if ((pm1_enable & PM1_SLPBTN_EN) && (pm1_status & PM1_SLPBTN_STS)) + need_sci = 1; + if ((pm1_enable & PM1_RTC_EN) && (pm1_status & PM1_RTC_STS)) + need_sci = 1; + if (need_sci) + sci_assert(); + else + sci_deassert(); +} + +static int +pm1_status_handler(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + + if (bytes != 2) + return (-1); + + pthread_mutex_lock(&pm_lock); + if (in) + *eax = pm1_status; + else { + /* + * Writes are only permitted to clear certain bits by + * writing 1 to those flags. + */ + pm1_status &= ~(*eax & (PM1_WAK_STS | PM1_RTC_STS | + PM1_SLPBTN_STS | PM1_PWRBTN_STS | PM1_BM_STS)); + sci_update(); + } + pthread_mutex_unlock(&pm_lock); + return (0); +} + +static int +pm1_enable_handler(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + + if (bytes != 2) + return (-1); + + pthread_mutex_lock(&pm_lock); + if (in) + *eax = pm1_enable; + else { + /* + * Only permit certain bits to be set. We never use + * the global lock, but ACPI-CA whines profusely if it + * can't set GBL_EN. + */ + pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN); + sci_update(); + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler); +INOUT_PORT(pm1_enable, PM1A_EVT_ADDR2, IOPORT_F_INOUT, pm1_enable_handler); + +static void +power_button_handler(UNUSED int signal, UNUSED enum ev_type type, + UNUSED void *arg) +{ + pthread_mutex_lock(&pm_lock); + if (!(pm1_status & PM1_PWRBTN_STS)) { + pm1_status |= PM1_PWRBTN_STS; + sci_update(); + } + pthread_mutex_unlock(&pm_lock); +} + +/* + * Power Management 1 Control Register + * + * This is mostly unimplemented except that we wish to handle writes that + * set SPL_EN to handle S5 (soft power off). 
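 */

Because each PM1 enable bit occupies the same position as its status bit, the five tests in sci_update() above are equivalent to a single mask-and-test; a sketch, with the mask value derived from the defines above:

```c
#include <stdint.h>

/* TMR | GBL | PWRBTN | SLPBTN | RTC; EN and STS share bit positions. */
#define PM1_SCI_EVENTS 0x0721

/* The SCI line should be asserted iff some event is both enabled and
 * pending, which is all sci_update() decides. */
static int
sci_needed(uint16_t pm1_status, uint16_t pm1_enable)
{
	return ((pm1_status & pm1_enable & PM1_SCI_EVENTS) != 0);
}
```

/*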
+ */ +static uint16_t pm1_control; + +#define PM1_SCI_EN 0x0001 +#define PM1_SLP_TYP 0x1c00 +#define PM1_SLP_EN 0x2000 +#define PM1_ALWAYS_ZERO 0xc003 + +static int +pm1_control_handler(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + int error; + + if (bytes != 2) + return (-1); + if (in) + *eax = pm1_control; + else { + /* + * Various bits are write-only or reserved, so force them + * to zero in pm1_control. Always preserve SCI_EN as OSPM + * can never change it. + */ + pm1_control = (uint16_t) ((pm1_control & PM1_SCI_EN) | + (*eax & ~((unsigned) (PM1_SLP_EN | PM1_ALWAYS_ZERO)))); + + /* + * If SLP_EN is set, check for S5. Bhyve's _S5_ method + * says that '5' should be stored in SLP_TYP for S5. + */ + if (*eax & PM1_SLP_EN) { + if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { + error = xh_vm_suspend(VM_SUSPEND_POWEROFF); + assert(error == 0 || errno == EALREADY); + } + } + } + return (0); +} +INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler); +SYSRES_IO(PM1A_EVT_ADDR, 8); + +/* + * ACPI SMI Command Register + * + * This write-only register is used to enable and disable ACPI. + */ +static int +smi_cmd_handler(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + assert(!in); + if (bytes != 1) + return (-1); + + pthread_mutex_lock(&pm_lock); + switch (*eax) { + case BHYVE_ACPI_ENABLE: + pm1_control |= PM1_SCI_EN; + if (power_button == NULL) { + power_button = mevent_add(SIGTERM, EVF_SIGNAL, + power_button_handler, NULL); + old_power_handler = signal(SIGTERM, SIG_IGN); + } + break; + case BHYVE_ACPI_DISABLE: + pm1_control &= ~PM1_SCI_EN; + if (power_button != NULL) { + mevent_delete(power_button); + power_button = NULL; + signal(SIGTERM, old_power_handler); + } + break; + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler); +SYSRES_IO(SMI_CMD, 1); + +void +sci_init(void) +{ + /* + * Mark ACPI's SCI as level trigger and bump its use count + * in the PIRQ router. + */ + pci_irq_use(SCI_INT); + xh_vm_isa_set_irq_trigger(SCI_INT, LEVEL_TRIGGER); +} diff --git a/vendor/github.com/hooklift/xhyve/post.c b/vendor/github.com/hooklift/xhyve/post.c new file mode 100644 index 0000000..75bbc54 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/post.c @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include + +static int +post_data_handler(UNUSED int vcpu, int in, UNUSED int port, int bytes, + uint32_t *eax, UNUSED void *arg) +{ + assert(in == 1); + + if (bytes != 1) + return (-1); + + *eax = 0xff; /* return some garbage */ + return (0); +} + +INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler); +SYSRES_IO(0x84, 1); diff --git a/vendor/github.com/hooklift/xhyve/rtc.c b/vendor/github.com/hooklift/xhyve/rtc.c new file mode 100644 index 0000000..a71fdef --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/rtc.c @@ -0,0 +1,120 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include + +#define IO_RTC 0x70 + +#define RTC_LMEM_LSB 0x34 +#define RTC_LMEM_MSB 0x35 +#define RTC_HMEM_LSB 0x5b +#define RTC_HMEM_SB 0x5c +#define RTC_HMEM_MSB 0x5d + +#define m_64KB (64*1024) +#define m_16MB (16*1024*1024) + +/* + * Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970 + */ +static time_t +rtc_time(int use_localtime) +{ + struct tm tm; + time_t t; + + time(&t); + if (use_localtime) { + localtime_r(&t, &tm); + t = timegm(&tm); + } + return (t); +} + +void +rtc_init(int use_localtime) +{ + size_t himem; + size_t lomem; + int err; + + /* XXX init diag/reset code/equipment/checksum ? */ + + /* + * Report guest memory size in nvram cells as required by UEFI. + * Little-endian encoding. 
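 */

The localtime_r()/timegm() pair in rtc_time() above deliberately re-encodes broken-down local time as if it were UTC, which adds the host's UTC offset to the epoch value handed to the guest; a guest that interprets the RTC as UTC then displays host wall-clock time. A standalone illustration (timegm() is non-standard but available on macOS and the BSDs):

```c
#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct tm tm;
	time_t now, shifted;

	time(&now);
	localtime_r(&now, &tm);
	shifted = timegm(&tm);	/* local fields re-read as UTC */
	printf("offset applied to the guest RTC: %ld seconds\n",
	    (long) (shifted - now));
	return (0);
}
```

/*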
+ * 0x34/0x35 - 64KB chunks above 16MB, below 4GB + * 0x5b/0x5c/0x5d - 64KB chunks above 4GB + */ + lomem = (xh_vm_get_lowmem_size() - m_16MB) / m_64KB; + err = xh_vm_rtc_write(RTC_LMEM_LSB, ((uint8_t) lomem)); + assert(err == 0); + err = xh_vm_rtc_write(RTC_LMEM_MSB, ((uint8_t) (lomem >> 8))); + assert(err == 0); + + himem = xh_vm_get_highmem_size() / m_64KB; + err = xh_vm_rtc_write(RTC_HMEM_LSB, ((uint8_t) himem)); + assert(err == 0); + err = xh_vm_rtc_write(RTC_HMEM_SB, ((uint8_t) (himem >> 8))); + assert(err == 0); + err = xh_vm_rtc_write(RTC_HMEM_MSB, ((uint8_t) (himem >> 16))); + assert(err == 0); + + err = xh_vm_rtc_settime(rtc_time(use_localtime)); + assert(err == 0); +} + +static void +rtc_dsdt(void) +{ + dsdt_line(""); + dsdt_line("Device (RTC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_RTC, 2); + dsdt_fixed_irq(8); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(rtc_dsdt); + +/* + * Reserve the extended RTC I/O ports although they are not emulated at this + * time. + */ +SYSRES_IO(0x72, 6); diff --git a/vendor/github.com/hooklift/xhyve/smbiostbl.c b/vendor/github.com/hooklift/xhyve/smbiostbl.c new file mode 100644 index 0000000..19c1caa --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/smbiostbl.c @@ -0,0 +1,828 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpacked" + +#define GB (1024ULL*1024*1024) + +#define SMBIOS_BASE 0xF1000 + +/* BHYVE_ACPI_BASE - SMBIOS_BASE) */ +#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000) + +#define SMBIOS_TYPE_BIOS 0 +#define SMBIOS_TYPE_SYSTEM 1 +#define SMBIOS_TYPE_CHASSIS 3 +#define SMBIOS_TYPE_PROCESSOR 4 +#define SMBIOS_TYPE_MEMARRAY 16 +#define SMBIOS_TYPE_MEMDEVICE 17 +#define SMBIOS_TYPE_MEMARRAYMAP 19 +#define SMBIOS_TYPE_BOOT 32 +#define SMBIOS_TYPE_EOT 127 + +struct smbios_structure { + uint8_t type; + uint8_t length; + uint16_t handle; +} __packed; + +typedef int (*initializer_func_t)(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_template_entry { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; +}; + +/* + * SMBIOS Structure Table Entry Point + */ +#define SMBIOS_ENTRY_EANCHOR "_SM_" +#define SMBIOS_ENTRY_EANCHORLEN 4 +#define SMBIOS_ENTRY_IANCHOR "_DMI_" +#define SMBIOS_ENTRY_IANCHORLEN 5 + +struct smbios_entry_point { + char eanchor[4]; /* anchor tag */ + uint8_t echecksum; /* checksum of entry point structure */ + uint8_t eplen; /* length in bytes of entry point */ + uint8_t major; /* major version of the SMBIOS spec */ + uint8_t minor; /* minor version of the SMBIOS spec */ + uint16_t maxssize; /* maximum size in bytes of a struct */ + uint8_t revision; /* entry point structure revision */ + uint8_t format[5]; /* entry point rev-specific data */ + char ianchor[5]; /* intermediate anchor tag */ + uint8_t ichecksum; /* intermediate checksum */ + uint16_t stlen; /* len in bytes of structure table */ + uint32_t staddr; /* physical addr of structure table */ + uint16_t stnum; /* number of structure table entries */ + uint8_t bcdrev; /* BCD value representing DMI ver */ +} __packed; + +/* + * BIOS Information + */ +#define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */ +#define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */ +#define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */ +#define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */ +// #define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */ +#define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */ + +#define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */ + +#define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */ +#define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */ + +struct smbios_table_type0 { + struct smbios_structure header; + uint8_t vendor; /* vendor string */ + uint8_t version; /* version string */ + uint16_t segment; /* address segment location */ + uint8_t rel_date; /* release date */ + uint8_t size; /* rom size */ + uint64_t cflags; /* characteristics */ + uint8_t xc_bytes[2]; /* characteristics ext bytes */ + uint8_t sb_major_rel; /* system bios version */ + uint8_t sb_minor_rele; + uint8_t ecfw_major_rel; /* embedded ctrl fw version */ + uint8_t ecfw_minor_rel; +} __packed; + +/* + * System Information + */ +#define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */ + +struct smbios_table_type1 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t product; /* product name string */ + uint8_t version; /* version string */ + uint8_t serial; /* serial number string */ + uint8_t uuid[16]; /* 
uuid byte array */ + uint8_t wakeup; /* wake-up event */ + uint8_t sku; /* sku number string */ + uint8_t family; /* family name string */ +} __packed; + +/* + * System Enclosure or Chassis + */ +#define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_CHST_SAFE 0x03 /* safe */ + +#define SMBIOS_CHSC_NONE 0x03 /* none */ + +struct smbios_table_type3 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t type; /* type */ + uint8_t version; /* version string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t bustate; /* boot-up state */ + uint8_t psstate; /* power supply state */ + uint8_t tstate; /* thermal state */ + uint8_t security; /* security status */ + uint8_t uheight; /* height in 'u's */ + uint8_t cords; /* number of power cords */ + uint8_t elems; /* number of element records */ + uint8_t elemlen; /* length of records */ + uint8_t sku; /* sku number string */ +} __packed; + +/* + * Processor Information + */ +#define SMBIOS_PRT_CENTRAL 0x03 /* central processor */ + +#define SMBIOS_PRF_OTHER 0x01 /* other */ + +#define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */ +#define SMBIOS_PRS_ENABLED 0x1 /* enabled */ + +#define SMBIOS_PRU_NONE 0x06 /* none */ + +#define SMBIOS_PFL_64B 0x04 /* 64-bit capable */ + +struct smbios_table_type4 { + struct smbios_structure header; + uint8_t socket; /* socket designation string */ + uint8_t type; /* processor type */ + uint8_t family; /* processor family */ + uint8_t manufacturer; /* manufacturer string */ + uint64_t cpuid; /* processor cpuid */ + uint8_t version; /* version string */ + uint8_t voltage; /* voltage */ + uint16_t clkspeed; /* ext clock speed in mhz */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint16_t curspeed; /* current speed in mhz */ + uint8_t status; /* status */ + uint8_t upgrade; /* upgrade */ + uint16_t l1handle; /* l1 cache handle */ + uint16_t l2handle; /* l2 cache handle */ + uint16_t l3handle; /* l3 cache handle */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /* part number string */ + uint8_t cores; /* cores per socket */ + uint8_t ecores; /* enabled cores */ + uint8_t threads; /* threads per socket */ + uint16_t cflags; /* processor characteristics */ + uint16_t family2; /* processor family 2 */ +} __packed; + +/* + * Physical Memory Array + */ +#define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */ + +#define SMBIOS_MAU_SYSTEM 0x03 /* system memory */ + +#define SMBIOS_MAE_NONE 0x03 /* none */ + +struct smbios_table_type16 { + struct smbios_structure header; + uint8_t location; /* physical device location */ + uint8_t use; /* device functional purpose */ + uint8_t ecc; /* err detect/correct method */ + uint32_t size; /* max mem capacity in kb */ + uint16_t errhand; /* handle of error (if any) */ + uint16_t ndevs; /* num of slots or sockets */ + uint64_t xsize; /* max mem capacity in bytes */ +} __packed; + +/* + * Memory Device + */ +#define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */ + +struct smbios_table_type17 { + struct smbios_structure header; + uint16_t arrayhand; /* handle of physl mem array */ + uint16_t errhand; /* handle of mem error data */ + uint16_t twidth; /* total width in bits */ + uint16_t dwidth; /* data width in bits */ + uint16_t size; /* size in bytes */ + uint8_t form; /* form factor */ + uint8_t set; /* set */ + uint8_t dloc; /* 
device locator string */ + uint8_t bloc; /* phys bank locator string */ + uint8_t type; /* memory type */ + uint16_t flags; /* memory characteristics */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint8_t manufacturer; /* manufacturer string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /* part number string */ + uint8_t attributes; /* attributes */ + uint32_t xsize; /* extended size in mbs */ + uint16_t curspeed; /* current speed in mhz */ + uint16_t minvoltage; /* minimum voltage */ + uint16_t maxvoltage; /* maximum voltage */ + uint16_t curvoltage; /* configured voltage */ +} __packed; + +/* + * Memory Array Mapped Address + */ +struct smbios_table_type19 { + struct smbios_structure header; + uint32_t saddr; /* start phys addr in kb */ + uint32_t eaddr; /* end phys addr in kb */ + uint16_t arrayhand; /* physical mem array handle */ + uint8_t width; /* num of dev in row */ + uint64_t xsaddr; /* start phys addr in bytes */ + uint64_t xeaddr; /* end phys addr in bytes */ +} __packed; + +/* + * System Boot Information + */ +#define SMBIOS_BOOT_NORMAL 0 /* no errors detected */ + +struct smbios_table_type32 { + struct smbios_structure header; + uint8_t reserved[6]; + uint8_t status; /* boot status */ +} __packed; + +/* + * End-of-Table + */ +struct smbios_table_type127 { + struct smbios_structure header; +} __packed; + +#pragma clang diagnostic pop + +static struct smbios_table_type0 smbios_type0_template = { + { SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 }, + 1, /* bios vendor string */ + 2, /* bios version string */ + 0xF000, /* bios address segment location */ + 3, /* bios release date */ + 0x0, /* bios size (64k * (n + 1) is the size in bytes) */ + SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW | + SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD, + { SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM }, + 0x0, /* bios major release */ + 0x0, /* bios minor release */ + 0xff, /* embedded controller firmware major release */ + 0xff /* embedded controller firmware minor release */ +}; + +static const char *smbios_type0_strings[] = { + "BHYVE", /* vendor string */ + "1.00", /* bios version string */ + "03/14/2014", /* bios release date string */ + NULL +}; + +static struct smbios_table_type1 smbios_type1_template = { + { SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 }, + 1, /* manufacturer string */ + 2, /* product string */ + 3, /* version string */ + 4, /* serial number string */ + { 0 }, + SMBIOS_WAKEUP_SWITCH, + 5, /* sku string */ + 6 /* family string */ +}; + +static int smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static const char *smbios_type1_strings[] = { + " ", /* manufacturer string */ + "BHYVE", /* product name string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* sku string */ + " ", /* family name string */ + NULL +}; + +static struct smbios_table_type3 smbios_type3_template = { + { SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 }, + 1, /* manufacturer string */ + SMBIOS_CHT_UNKNOWN, + 2, /* version string */ + 3, /* serial number string */ + 4, /* asset tag string */ + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHSC_NONE, + 0, /* height in 'u's (0=enclosure height unspecified) */ + 0, /* number of power cords (0=number unspecified) */ + 0, /* number of contained element records */ + 0, /* length of records 
*/ + 5 /* sku number string */ +}; + +static const char *smbios_type3_strings[] = { + " ", /* manufacturer string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* sku number string */ + NULL +}; + +static struct smbios_table_type4 smbios_type4_template = { + { SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 }, + 1, /* socket designation string */ + SMBIOS_PRT_CENTRAL, + SMBIOS_PRF_OTHER, + 2, /* manufacturer string */ + 0, /* cpuid */ + 3, /* version string */ + 0, /* voltage */ + 0, /* external clock frequency in mhz (0=unknown) */ + 0, /* maximum frequency in mhz (0=unknown) */ + 0, /* current frequency in mhz (0=unknown) */ + SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED, + SMBIOS_PRU_NONE, + (uint16_t) (-1), /* l1 cache handle */ + (uint16_t) (-1), /* l2 cache handle */ + (uint16_t) (-1), /* l3 cache handle */ + 4, /* serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* cores per socket (0=unknown) */ + 0, /* enabled cores per socket (0=unknown) */ + 0, /* threads per socket (0=unknown) */ + SMBIOS_PFL_64B, + SMBIOS_PRF_OTHER +}; + +static const char *smbios_type4_strings[] = { + " ", /* socket designation string */ + " ", /* manufacturer string */ + " ", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static struct smbios_table_type16 smbios_type16_template = { + { SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 }, + SMBIOS_MAL_SYSMB, + SMBIOS_MAU_SYSTEM, + SMBIOS_MAE_NONE, + 0x80000000, /* max mem capacity in kb (0x80000000=use extended) */ + (uint16_t) (-1), /* handle of error (if any) */ + 0, /* number of slots or sockets (TBD) */ + 0 /* extended maximum memory capacity in bytes (TBD) */ +}; + +static int smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static struct smbios_table_type17 smbios_type17_template = { + { SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 }, + (uint16_t) (-1), /* handle of physical memory array */ + (uint16_t) (-1), /* handle of memory error data */ + 64, /* total width in bits including ecc */ + 64, /* data width in bits */ + 0x7fff, /* size in bytes (0x7fff=use extended)*/ + SMBIOS_MDFF_UNKNOWN, + 0, /* set (0x00=none, 0xff=unknown) */ + 1, /* device locator string */ + 2, /* physical bank locator string */ + SMBIOS_MDT_UNKNOWN, + SMBIOS_MDF_UNKNOWN, + 0, /* maximum memory speed in mhz (0=unknown) */ + 3, /* manufacturer string */ + 4, /* serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* attributes (0=unknown rank information) */ + 0, /* extended size in mb (TBD) */ + 0, /* current speed in mhz (0=unknown) */ + 0, /* minimum voltage in mv (0=unknown) */ + 0, /* maximum voltage in mv (0=unknown) */ + 0 /* configured voltage in mv (0=unknown) */ +}; + +static const char *smbios_type17_strings[] = { + " ", /* device locator string */ + " ", /* physical bank locator string */ + " ", /* manufacturer string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type17_initializer(struct smbios_structure *template_entry, + 
const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static struct smbios_table_type19 smbios_type19_template = { + { SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 }, + 0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */ + 0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */ + (uint16_t) (-1), /* physical memory array handle */ + 1, /* number of devices that form a row */ + 0, /* extended starting phys addr in bytes (TDB) */ + 0 /* extended ending phys addr in bytes (TDB) */ +}; + +static int smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static struct smbios_table_type32 smbios_type32_template = { + { SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 }, + { 0, 0, 0, 0, 0, 0 }, + SMBIOS_BOOT_NORMAL +}; + +static struct smbios_table_type127 smbios_type127_template = { + { SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 } +}; + +static int smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static struct smbios_template_entry smbios_template[] = { + { (struct smbios_structure *)&smbios_type0_template, + smbios_type0_strings, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type1_template, + smbios_type1_strings, + smbios_type1_initializer }, + { (struct smbios_structure *)&smbios_type3_template, + smbios_type3_strings, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type4_template, + smbios_type4_strings, + smbios_type4_initializer }, + { (struct smbios_structure *)&smbios_type16_template, + NULL, + smbios_type16_initializer }, + { (struct smbios_structure *)&smbios_type17_template, + smbios_type17_strings, + smbios_type17_initializer }, + { (struct smbios_structure *)&smbios_type19_template, + NULL, + smbios_type19_initializer }, + { (struct smbios_structure *)&smbios_type32_template, + NULL, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type127_template, + NULL, + smbios_generic_initializer }, + { NULL,NULL, NULL } +}; + +static uint64_t guest_lomem, guest_himem; +static uint16_t type16_handle; + +static int +smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, + UNUSED uint16_t *size) +{ + struct smbios_structure *entry; + + memcpy(curaddr, template_entry, template_entry->length); + entry = (struct smbios_structure *)curaddr; + entry->handle = *n + 1; + curaddr += entry->length; + if (template_strings != NULL) { + int i; + + for (i = 0; template_strings[i] != NULL; i++) { + const char *string; + int len; + + string = template_strings[i]; + len = (int) (strlen(string) + 1); + memcpy(curaddr, string, len); + curaddr += len; + } + *curaddr = '\0'; + curaddr++; + } else { + /* Minimum string section is double nul */ + *curaddr = '\0'; + curaddr++; + *curaddr = '\0'; + curaddr++; + } + (*n)++; + *endaddr = curaddr; + + return (0); +} + +static int +smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type1 *type1; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type1 = (struct smbios_table_type1 *)curaddr; + + if (guest_uuid_str 
!= NULL) { + uuid_t uuid; + uint32_t status; + + uuid_from_string(guest_uuid_str, &uuid, &status); + if (status != uuid_s_ok) + return (-1); + + uuid_enc_le(&type1->uuid, &uuid); + } else { + MD5_CTX mdctx; + u_char digest[16]; + char hostname[MAXHOSTNAMELEN]; + + /* + * Universally unique and yet reproducible are an + * oxymoron, however reproducible is desirable in + * this case. + */ + if (gethostname(hostname, sizeof(hostname))) + return (-1); + + MD5Init(&mdctx); + MD5Update(&mdctx, vmname, ((unsigned) strlen(vmname))); + MD5Update(&mdctx, hostname, ((unsigned) sizeof(hostname))); + MD5Final(digest, &mdctx); + + /* + * Set the variant and version number. + */ + digest[6] &= 0x0F; + digest[6] |= 0x30; /* version 3 */ + digest[8] &= 0x3F; + digest[8] |= 0x80; + + memcpy(&type1->uuid, digest, sizeof (digest)); + } + + return (0); +} + +static int +smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + int i; + + for (i = 0; i < guest_ncpus; i++) { + struct smbios_table_type4 *type4; + char *p; + int nstrings, len; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type4 = (struct smbios_table_type4 *)curaddr; + p = curaddr + sizeof (struct smbios_table_type4); + nstrings = 0; + while (p < *endaddr - 1) { + if (*p++ == '\0') + nstrings++; + } + len = sprintf(*endaddr - 1, "CPU #%d", i) + 1; + *endaddr += len - 1; + *(*endaddr) = '\0'; + (*endaddr)++; + type4->socket = (uint8_t) (nstrings + 1); + curaddr = *endaddr; + } + + return (0); +} + +static int +smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type16 *type16; + + type16_handle = *n; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type16 = (struct smbios_table_type16 *)curaddr; + type16->xsize = guest_lomem + guest_himem; + type16->ndevs = guest_himem > 0 ? 
2 : 1; + + return (0); +} + +static int +smbios_type17_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type17 *type17; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = (uint32_t) guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = (uint32_t) guest_himem; + } + + return (0); +} + +static int +smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type19 *type19; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 0; + type19->xeaddr = guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 4*GB; + type19->xeaddr = guest_himem; + } + + return (0); +} + +static void +smbios_ep_initializer(struct smbios_entry_point *smbios_ep, uint32_t staddr) +{ + memset(smbios_ep, 0, sizeof(*smbios_ep)); + memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR, + SMBIOS_ENTRY_EANCHORLEN); + smbios_ep->eplen = 0x1F; + assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen); + smbios_ep->major = 2; + smbios_ep->minor = 6; + smbios_ep->revision = 0; + memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR, + SMBIOS_ENTRY_IANCHORLEN); + smbios_ep->staddr = staddr; + smbios_ep->bcdrev = 0x24; +} + +static void +smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len, + uint16_t num, uint16_t maxssize) +{ + uint8_t checksum; + int i; + + smbios_ep->maxssize = maxssize; + smbios_ep->stlen = len; + smbios_ep->stnum = num; + + checksum = 0; + for (i = 0x10; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->ichecksum = checksum; + + checksum = 0; + for (i = 0; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->echecksum = checksum; +} + +int +smbios_build(void) +{ + struct smbios_entry_point *smbios_ep; + uint16_t n; + uint16_t maxssize; + char *curaddr, *startaddr, *ststartaddr; + int i; + int err; + + guest_lomem = xh_vm_get_lowmem_size(); + guest_himem = xh_vm_get_highmem_size(); + + startaddr = paddr_guest2host(SMBIOS_BASE, SMBIOS_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "smbios table requires mapped mem\n"); + return (ENOMEM); + } + + curaddr = startaddr; + + smbios_ep = (struct smbios_entry_point *)curaddr; + smbios_ep_initializer(smbios_ep, SMBIOS_BASE + + sizeof(struct smbios_entry_point)); + curaddr += sizeof(struct smbios_entry_point); + ststartaddr = curaddr; + + n = 0; + maxssize = 0; + for (i = 0; smbios_template[i].entry != NULL; i++) { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; + char *endaddr; + uint16_t size; + + entry = smbios_template[i].entry; + strings = smbios_template[i].strings; + initializer = 
smbios_template[i].initializer; + + err = (*initializer)(entry, strings, curaddr, &endaddr, + &n, &size); + if (err != 0) + return (err); + + if (size > maxssize) + maxssize = size; + + curaddr = endaddr; + } + + assert(curaddr - startaddr < SMBIOS_MAX_LENGTH); + smbios_ep_finalizer(smbios_ep, ((uint16_t) (curaddr - ststartaddr)), n, + maxssize); + + return (0); +} diff --git a/vendor/github.com/hooklift/xhyve/task_switch.c b/vendor/github.com/hooklift/xhyve/task_switch.c new file mode 100644 index 0000000..23b6768 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/task_switch.c @@ -0,0 +1,938 @@ +/*- + * Copyright (c) 2014 Neel Natu + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Using 'struct i386tss' is tempting but causes myriad sign extension + * issues because all of its fields are defined as signed integers. 
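 */

The sign-extension hazard called out above is easy to reproduce: widening a signed 32-bit register image to a 64-bit guest-linear address drags the sign bit along, which is why this file defines its own all-unsigned struct tss32. A minimal demonstration:

```c
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	int32_t  signed_esp   = (int32_t) 0x80000000;	/* i386tss-style field */
	uint32_t unsigned_esp = 0x80000000u;		/* tss32-style field */

	uint64_t bad  = (uint64_t) signed_esp;		/* 0xffffffff80000000 */
	uint64_t good = (uint64_t) unsigned_esp;	/* 0x0000000080000000 */

	printf("signed:   0x%016" PRIx64 "\nunsigned: 0x%016" PRIx64 "\n",
	    bad, good);
	return (0);
}
```

/*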
+ */ +struct tss32 { + uint16_t tss_link; + uint16_t rsvd1; + uint32_t tss_esp0; + uint16_t tss_ss0; + uint16_t rsvd2; + uint32_t tss_esp1; + uint16_t tss_ss1; + uint16_t rsvd3; + uint32_t tss_esp2; + uint16_t tss_ss2; + uint16_t rsvd4; + uint32_t tss_cr3; + uint32_t tss_eip; + uint32_t tss_eflags; + uint32_t tss_eax; + uint32_t tss_ecx; + uint32_t tss_edx; + uint32_t tss_ebx; + uint32_t tss_esp; + uint32_t tss_ebp; + uint32_t tss_esi; + uint32_t tss_edi; + uint16_t tss_es; + uint16_t rsvd5; + uint16_t tss_cs; + uint16_t rsvd6; + uint16_t tss_ss; + uint16_t rsvd7; + uint16_t tss_ds; + uint16_t rsvd8; + uint16_t tss_fs; + uint16_t rsvd9; + uint16_t tss_gs; + uint16_t rsvd10; + uint16_t tss_ldt; + uint16_t rsvd11; + uint16_t tss_trap; + uint16_t tss_iomap; +}; +CTASSERT(sizeof(struct tss32) == 104); + +#define SEL_START(sel) (((sel) & ~0x7)) +#define SEL_LIMIT(sel) (((sel) | 0x7)) +#define TSS_BUSY(type) (((type) & 0x2) != 0) + +static uint64_t +GETREG(int vcpu, int reg) +{ + uint64_t val; + int error; + + error = xh_vm_get_register(vcpu, reg, &val); + assert(error == 0); + return (val); +} + +static void +SETREG(int vcpu, int reg, uint64_t val) +{ + int error; + + error = xh_vm_set_register(vcpu, reg, val); + assert(error == 0); +} + +static struct seg_desc +usd_to_seg_desc(struct user_segment_descriptor *usd) +{ + struct seg_desc seg_desc; + + seg_desc.base = (u_int)USD_GETBASE(usd); + if (usd->sd_gran) + seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; + else + seg_desc.limit = (u_int)USD_GETLIMIT(usd); + seg_desc.access = (uint32_t) (usd->sd_type | (usd->sd_dpl << 5) | (usd->sd_p << 7)); + seg_desc.access |= (uint32_t) (usd->sd_xx << 12); + seg_desc.access |= (uint32_t) (usd->sd_def32 << 14); + seg_desc.access |= (uint32_t) (usd->sd_gran << 15); + + return (seg_desc); +} + +/* + * Inject an exception with an error code that is a segment selector. + * The format of the error code is described in section 6.13, "Error Code", + * Intel SDM volume 3. + * + * Bit 0 (EXT) denotes whether the exception occurred during delivery + * of an external event like an interrupt. + * + * Bit 1 (IDT) indicates whether the selector points to a gate descriptor + * in the IDT. + * + * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). + */ +static void +sel_exception(int vcpu, int vector, uint16_t sel, int ext) +{ + /* + * Bit 2 from the selector is retained as-is in the error code. + * + * Bit 1 can be safely cleared because none of the selectors + * encountered during task switch emulation refer to a task + * gate in the IDT. + * + * Bit 0 is set depending on the value of 'ext'. + */ + sel &= ~0x3; + if (ext) + sel |= 0x1; + xh_vm_inject_fault(vcpu, vector, 1, sel); +} + +/* + * Return 0 if the selector 'sel' in within the limits of the GDT/LDT + * and non-zero otherwise. + */ +static int +desc_table_limit_check(int vcpu, uint16_t sel) +{ + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = xh_vm_get_desc(vcpu, reg, &base, &limit, &access); + assert(error == 0); + + if (reg == VM_REG_GUEST_LDTR) { + if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) + return (-1); + } + + if (limit < SEL_LIMIT(sel)) + return (-1); + else + return (0); +} + +/* + * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced + * by the selector 'sel'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. 
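 */

For reference while reading these helpers: bits 15:3 of a selector index the descriptor table, bit 2 picks the LDT over the GDT (the ISLDT() test), and bits 1:0 carry the RPL. SEL_START() and SEL_LIMIT() above are then just the first and last byte offsets of the selected 8-byte descriptor, which is what the table limit is checked against. A sketch (the helper name is ours):

```c
#include <stdint.h>
#include <stdio.h>

static void
sel_explain(uint16_t sel)
{
	printf("index=%u table=%s rpl=%u descriptor bytes [%u..%u]\n",
	    (unsigned) (sel >> 3), (sel & 0x4) ? "LDT" : "GDT",
	    (unsigned) (sel & 0x3), sel & ~0x7u, sel | 0x7u);
}
```

/*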
+ */ +static int +desc_table_rw(int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, bool doread, + int *faultptr) +{ + struct iovec iov[2]; + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = xh_vm_get_desc(vcpu, reg, &base, &limit, &access); + assert(error == 0); + assert(limit >= SEL_LIMIT(sel)); + + error = xh_vm_copy_setup(vcpu, paging, base + SEL_START(sel), + sizeof(*desc), doread ? XHYVE_PROT_READ : XHYVE_PROT_WRITE, iov, nitems(iov), + faultptr); + if (error || *faultptr) + return (error); + + if (doread) + xh_vm_copyin(iov, desc, sizeof(*desc)); + else + xh_vm_copyout(desc, iov, sizeof(*desc)); + return (0); +} + +static int +desc_table_read(int vcpu, struct vm_guest_paging *paging, uint16_t sel, + struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr)); +} + +static int +desc_table_write(int vcpu, struct vm_guest_paging *paging, uint16_t sel, + struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr)); +} + +/* + * Read the TSS descriptor referenced by 'sel' into 'desc'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +read_tss_descriptor(int vcpu, struct vm_task_switch *ts, uint16_t sel, + struct user_segment_descriptor *desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + int error; + + assert(!ISLDT(sel)); + assert(IDXSEL(sel) != 0); + + /* Fetch the new TSS descriptor */ + if (desc_table_limit_check(vcpu, sel)) { + if (ts->reason == TSR_IRET) + sel_exception(vcpu, IDT_TS, sel, ts->ext); + else + sel_exception(vcpu, IDT_GP, sel, ts->ext); + return (1); + } + + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr); + return (error); +} + +static bool +code_desc(int sd_type) +{ + /* code descriptor */ + return ((sd_type & 0x18) == 0x18); +} + +static bool +stack_desc(int sd_type) +{ + /* writable data descriptor */ + return ((sd_type & 0x1A) == 0x12); +} + +static bool +data_desc(int sd_type) +{ + /* data descriptor or a readable code descriptor */ + return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); +} + +static bool +ldt_desc(int sd_type) +{ + + return (sd_type == SDT_SYSLDT); +} + +CTASSERT(sizeof(struct user_segment_descriptor) == 8); +/* + * Validate the descriptor 'seg_desc' associated with 'segment'. 
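 */

The predicates above slice the 5-bit sd_type field: bit 4 marks a code/data (as opposed to system) descriptor, bit 3 marks executable, and bit 1 means readable for code or writable for data. A self-contained restatement with sanity checks:

```c
#include <assert.h>
#include <stdbool.h>

static bool is_code(int t)  { return ((t & 0x18) == 0x18); }
static bool is_stack(int t) { return ((t & 0x1A) == 0x12); }
static bool is_data(int t)  { return ((t & 0x18) == 0x10 || (t & 0x1A) == 0x1A); }

int
main(void)
{
	assert(is_stack(0x12) && is_data(0x12) && !is_code(0x12));	/* rw data */
	assert(is_code(0x18) && !is_data(0x18));			/* execute-only code */
	assert(is_code(0x1A) && is_data(0x1A) && !is_stack(0x1A));	/* rx code */
	return (0);
}
```

/*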
+ */
+static int
+validate_seg_desc(int vcpu, struct vm_task_switch *ts, int segment,
+ struct seg_desc *seg_desc, int *faultptr)
+{
+ struct vm_guest_paging sup_paging;
+ struct user_segment_descriptor usd;
+ int error, idtvec;
+ int cpl, dpl, rpl;
+ uint16_t sel, cs;
+ bool ldtseg, codeseg, stackseg, dataseg, conforming;
+
+ ldtseg = codeseg = stackseg = dataseg = false;
+ switch (segment) {
+ case VM_REG_GUEST_LDTR:
+ ldtseg = true;
+ break;
+ case VM_REG_GUEST_CS:
+ codeseg = true;
+ break;
+ case VM_REG_GUEST_SS:
+ stackseg = true;
+ break;
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ dataseg = true;
+ break;
+ default:
+ assert(0);
+ }
+
+ /* Get the segment selector */
+ sel = (uint16_t) GETREG(vcpu, segment);
+
+ /* LDT selector must point into the GDT */
+ if (ldtseg && ISLDT(sel)) {
+ sel_exception(vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* Descriptor table limit check */
+ if (desc_table_limit_check(vcpu, sel)) {
+ sel_exception(vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* NULL selector */
+ if (IDXSEL(sel) == 0) {
+ /* Code and stack segment selectors cannot be NULL */
+ if (codeseg || stackseg) {
+ sel_exception(vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ seg_desc->base = 0;
+ seg_desc->limit = 0;
+ seg_desc->access = 0x10000; /* unusable */
+ return (0);
+ }
+
+ /* Read the descriptor from the GDT/LDT */
+ sup_paging = ts->paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+ error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
+ if (error || *faultptr)
+ return (error);
+
+ /* Verify that the descriptor type is compatible with the segment */
+ if ((ldtseg && !ldt_desc(usd.sd_type)) ||
+ (codeseg && !code_desc(usd.sd_type)) ||
+ (dataseg && !data_desc(usd.sd_type)) ||
+ (stackseg && !stack_desc(usd.sd_type))) {
+ sel_exception(vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* Segment must be marked present */
+ if (!usd.sd_p) {
+ if (ldtseg)
+ idtvec = IDT_TS;
+ else if (stackseg)
+ idtvec = IDT_SS;
+ else
+ idtvec = IDT_NP;
+ sel_exception(vcpu, idtvec, sel, ts->ext);
+ return (1);
+ }
+
+ cs = (uint16_t) GETREG(vcpu, VM_REG_GUEST_CS);
+ cpl = cs & SEL_RPL_MASK;
+ rpl = sel & SEL_RPL_MASK;
+ dpl = usd.sd_dpl;
+
+ if (stackseg && (rpl != cpl || dpl != cpl)) {
+ sel_exception(vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ if (codeseg) {
+ conforming = (usd.sd_type & 0x4) ? true : false;
+ if ((conforming && (cpl < dpl)) ||
+ (!conforming && (cpl != dpl))) {
+ sel_exception(vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ }
+
+ if (dataseg) {
+ /*
+ * A data segment is always non-conforming except when its
+ * descriptor is a readable, conforming code segment.
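+ *
+ * For example, a readable conforming code descriptor (type 0x1E
+ * or 0x1F) may be loaded with an RPL or CPL numerically greater
+ * than its DPL, whereas plain data types (0x10-0x17) and
+ * readable non-conforming code (0x1A/0x1B) must satisfy
+ * rpl <= dpl and cpl <= dpl in the check below.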
+ */
+ if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
+ conforming = true;
+ else
+ conforming = false;
+
+ if (!conforming && (rpl > dpl || cpl > dpl)) {
+ sel_exception(vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ }
+ *seg_desc = usd_to_seg_desc(&usd);
+ return (0);
+}
+
+static void
+tss32_save(int vcpu, struct vm_task_switch *task_switch,
+ uint32_t eip, struct tss32 *tss, struct iovec *iov)
+{
+
+ /* General purpose registers */
+ tss->tss_eax = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RAX);
+ tss->tss_ecx = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RCX);
+ tss->tss_edx = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RDX);
+ tss->tss_ebx = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RBX);
+ tss->tss_esp = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RSP);
+ tss->tss_ebp = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RBP);
+ tss->tss_esi = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RSI);
+ tss->tss_edi = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RDI);
+
+ /* Segment selectors */
+ tss->tss_es = (uint16_t) GETREG(vcpu, VM_REG_GUEST_ES);
+ tss->tss_cs = (uint16_t) GETREG(vcpu, VM_REG_GUEST_CS);
+ tss->tss_ss = (uint16_t) GETREG(vcpu, VM_REG_GUEST_SS);
+ tss->tss_ds = (uint16_t) GETREG(vcpu, VM_REG_GUEST_DS);
+ tss->tss_fs = (uint16_t) GETREG(vcpu, VM_REG_GUEST_FS);
+ tss->tss_gs = (uint16_t) GETREG(vcpu, VM_REG_GUEST_GS);
+
+ /* eflags and eip */
+ tss->tss_eflags = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RFLAGS);
+ if (task_switch->reason == TSR_IRET)
+ tss->tss_eflags &= ~((unsigned) PSL_NT);
+ tss->tss_eip = eip;
+
+ /* Copy updated old TSS into guest memory */
+ xh_vm_copyout(tss, iov, sizeof(struct tss32));
+}
+
+static void
+update_seg_desc(int vcpu, int reg, struct seg_desc *sd)
+{
+ int error;
+
+ error = xh_vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
+ assert(error == 0);
+}
+
+/*
+ * Update the vcpu registers to reflect the state of the new task.
+ */
+static int
+tss32_restore(int vcpu, struct vm_task_switch *ts, uint16_t ot_sel,
+ struct tss32 *tss, struct iovec *iov, int *faultptr)
+{
+ struct seg_desc seg_desc, seg_desc2;
+ uint64_t *pdpte, maxphyaddr, reserved;
+ uint32_t eflags;
+ int error, i;
+ bool nested;
+
+ nested = false;
+ if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
+ tss->tss_link = ot_sel;
+ nested = true;
+ }
+
+ eflags = tss->tss_eflags;
+ if (nested)
+ eflags |= PSL_NT;
+
+ /* LDTR */
+ SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
+
+ /* PDBR */
+ if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
+ if (ts->paging.paging_mode == PAGING_MODE_PAE) {
+ /*
+ * XXX Assuming 36-bit MAXPHYADDR.
+ */
+ maxphyaddr = (1UL << 36) - 1;
+ pdpte = paddr_guest2host(tss->tss_cr3 & ~((unsigned) 0x1f), 32);
+ for (i = 0; i < 4; i++) {
+ /* Check reserved bits if the PDPTE is valid */
+ if (!(pdpte[i] & 0x1))
+ continue;
+ /*
+ * Bits 2:1, 8:5 and bits above the processor's
+ * maximum physical address are reserved.
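+ *
+ * Bits 2:1 contribute 0x6 and bits 8:5 contribute 0x1E0,
+ * hence the constant 0x1E6 in the mask computed below.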
+ */ + reserved = ~maxphyaddr | 0x1E6; + if (pdpte[i] & reserved) { + vm_inject_gp(vcpu); + return (1); + } + } + SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); + SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); + SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); + SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); + } + SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); + ts->paging.cr3 = tss->tss_cr3; + } + + /* eflags and eip */ + SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags); + SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + + /* General purpose registers */ + SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax); + SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); + SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx); + SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); + SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp); + SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); + SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi); + SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + + /* Segment selectors */ + SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es); + SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs); + SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss); + SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds); + SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs); + SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs); + + /* + * If this is a nested task then write out the new TSS to update + * the previous link field. + */ + if (nested) + xh_vm_copyout(tss, iov, sizeof(*tss)); + + /* Validate segment descriptors */ + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc); + + /* + * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. + * + * The SS and CS attribute checks on VM-entry are inter-dependent so + * we need to make sure that both segments are valid before updating + * either of them. This ensures that the VMCS state can pass the + * VM-entry checks so the guest can handle any exception injected + * during task switch emulation. + */ + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc); + update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2); + ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; + + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc); + + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc); + + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc); + + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc); + + return (0); +} + +/* + * Push an error code on the stack of the new task. This is needed if the + * task switch was triggered by a hardware exception that causes an error + * code to be saved (e.g. #PF). 
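+ *
+ * For example, a #PF delivered through a task gate to a 32-bit TSS
+ * (task_type SDT_SYS386BSY) pushes a 4-byte error code at ESP - 4 on
+ * the new task's stack, subject to the segment-limit and alignment
+ * checks performed below.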
+ */ +static int +push_errcode(int vcpu, struct vm_guest_paging *paging, int task_type, + uint32_t errcode, int *faultptr) +{ + struct iovec iov[2]; + struct seg_desc seg_desc; + int stacksize, bytes, error; + uint64_t gla, cr0, rflags; + uint32_t esp; + uint16_t stacksel; + + *faultptr = 0; + + cr0 = GETREG(vcpu, VM_REG_GUEST_CR0); + rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS); + stacksel = (uint16_t) GETREG(vcpu, VM_REG_GUEST_SS); + + error = xh_vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base, + &seg_desc.limit, &seg_desc.access); + assert(error == 0); + + /* + * Section "Error Code" in the Intel SDM vol 3: the error code is + * pushed on the stack as a doubleword or word (depending on the + * default interrupt, trap or task gate size). + */ + if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) + bytes = 4; + else + bytes = 2; + + /* + * PUSH instruction from Intel SDM vol 2: the 'B' flag in the + * stack-segment descriptor determines the size of the stack + * pointer outside of 64-bit mode. + */ + if (SEG_DESC_DEF32(seg_desc.access)) + stacksize = 4; + else + stacksize = 2; + + esp = (uint32_t) GETREG(vcpu, VM_REG_GUEST_RSP); + esp -= (uint32_t) bytes; + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &seg_desc, esp, + bytes, stacksize, XHYVE_PROT_WRITE, &gla)) + { + sel_exception(vcpu, IDT_SS, stacksel, 1); + *faultptr = 1; + return (0); + } + + if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { + vm_inject_ac(vcpu, 1); + *faultptr = 1; + return (0); + } + + error = xh_vm_copy_setup(vcpu, paging, gla, ((size_t) bytes), + XHYVE_PROT_WRITE, iov, nitems(iov), faultptr); + if (error || *faultptr) + return (error); + + xh_vm_copyout(&errcode, iov, ((size_t) bytes)); + SETREG(vcpu, VM_REG_GUEST_RSP, esp); + return (0); +} + +/* + * Evaluate return value from helper functions and potentially return to + * the VM run loop. + */ +#define CHKERR(error,fault) \ + do { \ + assert((error == 0) || (error == EFAULT)); \ + if (error) \ + return (VMEXIT_ABORT); \ + else if (fault) \ + return (VMEXIT_CONTINUE); \ + } while (0) + +int vmexit_task_switch(struct vm_exit *vmexit, int *pvcpu); + +int +vmexit_task_switch(struct vm_exit *vmexit, int *pvcpu) +{ + struct seg_desc nt; + struct tss32 oldtss, newtss; + struct vm_task_switch *task_switch; + struct vm_guest_paging *paging, sup_paging; + struct user_segment_descriptor nt_desc, ot_desc; + struct iovec nt_iov[2], ot_iov[2]; + uint64_t cr0, ot_base; + uint32_t eip, ot_lim, access; + int error, ext, fault, minlimit, nt_type, ot_type, vcpu; + enum task_switch_reason reason; + uint16_t nt_sel, ot_sel; + + task_switch = &vmexit->u.task_switch; + nt_sel = task_switch->tsssel; + ext = vmexit->u.task_switch.ext; + reason = vmexit->u.task_switch.reason; + paging = &vmexit->u.task_switch.paging; + vcpu = *pvcpu; + + assert(paging->cpu_mode == CPU_MODE_PROTECTED); + + /* + * Calculate the instruction pointer to store in the old TSS. + */ + eip = (uint32_t) (vmexit->rip + ((uint64_t) vmexit->inst_length)); + + /* + * Section 4.6, "Access Rights" in Intel SDM Vol 3. 
+ * The following page table accesses are implicitly supervisor mode:
+ * - accesses to GDT or LDT to load segment descriptors
+ * - accesses to the task state segment during task switch
+ */
+ sup_paging = *paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+
+ /* Fetch the new TSS descriptor */
+ error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
+ &fault);
+ CHKERR(error, fault);
+
+ nt = usd_to_seg_desc(&nt_desc);
+
+ /* Verify the type of the new TSS */
+ nt_type = SEG_DESC_TYPE(nt.access);
+ if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
+ nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
+ sel_exception(vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /* TSS descriptor must have present bit set */
+ if (!SEG_DESC_PRESENT(nt.access)) {
+ sel_exception(vcpu, IDT_NP, nt_sel, ext);
+ goto done;
+ }
+
+ /*
+ * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
+ * 44 bytes for a 16-bit TSS.
+ */
+ if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
+ minlimit = 104 - 1;
+ else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
+ minlimit = 44 - 1;
+ else
+ minlimit = 0;
+
+ assert(minlimit > 0);
+ if (nt.limit < ((uint32_t) minlimit)) {
+ sel_exception(vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /* TSS must be busy if task switch is due to IRET */
+ if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
+ sel_exception(vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /*
+ * TSS must be available (not busy) if task switch reason is
+ * CALL, JMP, exception or interrupt.
+ */
+ if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
+ sel_exception(vcpu, IDT_GP, nt_sel, ext);
+ goto done;
+ }
+
+ /* Fetch the new TSS */
+ error = xh_vm_copy_setup(vcpu, &sup_paging, nt.base,
+ ((size_t) (minlimit + 1)), (XHYVE_PROT_READ | XHYVE_PROT_WRITE), nt_iov,
+ nitems(nt_iov), &fault);
+ CHKERR(error, fault);
+ xh_vm_copyin(nt_iov, &newtss, ((size_t) (minlimit + 1)));
+
+ /* Get the old TSS selector from the guest's task register */
+ ot_sel = (uint16_t) GETREG(vcpu, VM_REG_GUEST_TR);
+ if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
+ /*
+ * This might happen if a task switch was attempted without
+ * ever loading the task register with LTR. In this case the
+ * TR would contain the values from power-on:
+ * (sel = 0, base = 0, limit = 0xffff).
+ */
+ sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
+ goto done;
+ }
+
+ /* Get the old TSS base and limit from the guest's task register */
+ error = xh_vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
+ &access);
+ assert(error == 0);
+ assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
+ ot_type = SEG_DESC_TYPE(access);
+ assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
+
+ /* Fetch the old TSS descriptor */
+ error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
+ &fault);
+ CHKERR(error, fault);
+
+ /* Get the old TSS */
+ error = xh_vm_copy_setup(vcpu, &sup_paging, ot_base,
+ ((size_t) (minlimit + 1)), (XHYVE_PROT_READ | XHYVE_PROT_WRITE),
+ ot_iov, nitems(ot_iov), &fault);
+ CHKERR(error, fault);
+ xh_vm_copyin(ot_iov, &oldtss, ((size_t) (minlimit + 1)));
+
+ /*
+ * Clear the busy bit in the old TSS descriptor if the task switch
+ * is due to an IRET or JMP instruction.
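+ *
+ * The busy bit is bit 1 of the descriptor type: an available
+ * 32-bit TSS (SDT_SYS386TSS, type 0x9) becomes busy
+ * (SDT_SYS386BSY, type 0xB) when it is set, which is what the
+ * '&= ~0x2' below and the '|= 0x2' further down toggle.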
+ */ + if (reason == TSR_IRET || reason == TSR_JMP) { + ot_desc.sd_type &= ~0x2; + error = desc_table_write(vcpu, &sup_paging, ot_sel, + &ot_desc, &fault); + CHKERR(error, fault); + } + + if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { + fprintf(stderr, "Task switch to 16-bit TSS not supported\n"); + return (VMEXIT_ABORT); + } + + /* Save processor state in old TSS */ + tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov); + + /* + * If the task switch was triggered for any reason other than IRET + * then set the busy bit in the new TSS descriptor. + */ + if (reason != TSR_IRET) { + nt_desc.sd_type |= 0x2; + error = desc_table_write(vcpu, &sup_paging, nt_sel, + &nt_desc, &fault); + CHKERR(error, fault); + } + + /* Update task register to point at the new TSS */ + SETREG(vcpu, VM_REG_GUEST_TR, nt_sel); + + /* Update the hidden descriptor state of the task register */ + nt = usd_to_seg_desc(&nt_desc); + update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt); + + /* Set CR0.TS */ + cr0 = GETREG(vcpu, VM_REG_GUEST_CR0); + SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); + + /* + * We are now committed to the task switch. Any exceptions encountered + * after this point will be handled in the context of the new task and + * the saved instruction pointer will belong to the new task. + */ + error = xh_vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); + assert(error == 0); + + /* Load processor state from new TSS */ + error = tss32_restore(vcpu, task_switch, ot_sel, &newtss, nt_iov, + &fault); + CHKERR(error, fault); + + /* + * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception + * caused an error code to be generated, this error code is copied + * to the stack of the new task. + */ + if (task_switch->errcode_valid) { + assert(task_switch->ext); + assert(task_switch->reason == TSR_IDT_GATE); + error = push_errcode(vcpu, &task_switch->paging, nt_type, + task_switch->errcode, &fault); + CHKERR(error, fault); + } + + /* + * Treatment of virtual-NMI blocking if NMI is delivered through + * a task gate. + * + * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: + * If the virtual NMIs VM-execution control is 1, VM entry injects + * an NMI, and delivery of the NMI causes a task switch that causes + * a VM exit, virtual-NMI blocking is in effect before the VM exit + * commences. + * + * Thus, virtual-NMI blocking is in effect at the time of the task + * switch VM exit. + */ + + /* + * Treatment of virtual-NMI unblocking on IRET from NMI handler task. + * + * Section "Changes to Instruction Behavior in VMX Non-Root Operation" + * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. + * This unblocking of virtual-NMI occurs even if IRET causes a fault. + * + * Thus, virtual-NMI blocking is cleared at the time of the task switch + * VM exit. + */ + + /* + * If the task switch was triggered by an event delivered through + * the IDT then extinguish the pending event from the vcpu's + * exitintinfo. + */ + if (task_switch->reason == TSR_IDT_GATE) { + error = xh_vm_set_intinfo(vcpu, 0); + assert(error == 0); + } + + /* + * XXX should inject debug exception if 'T' bit is 1 + */ +done: + return (VMEXIT_CONTINUE); +} diff --git a/vendor/github.com/hooklift/xhyve/uart_emul.c b/vendor/github.com/hooklift/xhyve/uart_emul.c new file mode 100644 index 0000000..bb2dbb0 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/uart_emul.c @@ -0,0 +1,684 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. 
+ * Copyright (c) 2013 Neel Natu + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define COM1_BASE 0x3F8 +#define COM1_IRQ 4 +#define COM2_BASE 0x2F8 +#define COM2_IRQ 3 + +#define DEFAULT_RCLK 1843200 +#define DEFAULT_BAUD 9600 + +#define FCR_RX_MASK 0xC0 + +#define MCR_OUT1 0x04 +#define MCR_OUT2 0x08 + +#define MSR_DELTA_MASK 0x0f + +#ifndef REG_SCR +#define REG_SCR com_scr +#endif + +#define FIFOSZ 16 + +static bool uart_stdio; /* stdio in use for i/o */ +static struct termios tio_stdio_orig; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +static struct { + int baseaddr; + int irq; + bool inuse; +} uart_lres[] = { + { COM1_BASE, COM1_IRQ, false}, + { COM2_BASE, COM2_IRQ, false}, +}; + +#define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) + +struct fifo { + uint8_t buf[FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of characters in the fifo */ + int size; /* size of the fifo */ +}; + +struct ttyfd { + bool opened; + int fd; /* tty device file descriptor */ + char *name; /* tty filesystem device path in case of using autopty*/ + struct termios tio_orig, tio_new; /* I/O Terminals */ +}; + +struct uart_softc { + pthread_mutex_t mtx; /* protects all softc elements */ + uint8_t data; /* Data register (R/W) */ + uint8_t ier; /* Interrupt enable register (R/W) */ + uint8_t lcr; /* Line control register (R/W) */ + uint8_t mcr; /* Modem control register (R/W) */ + uint8_t lsr; /* Line status register (R/W) */ + uint8_t msr; /* Modem status register (R/W) */ + uint8_t fcr; /* FIFO control register (W) */ + uint8_t scr; /* Scratch register (R/W) */ + + uint8_t dll; /* Baudrate divisor latch LSB */ + uint8_t dlh; /* Baudrate divisor latch MSB */ + + struct fifo rxfifo; + struct mevent *mev; + + struct ttyfd tty; + bool thre_int_pending; /* THRE interrupt pending */ + + void *arg; + uart_intr_func_t intr_assert; + uart_intr_func_t intr_deassert; +}; +#pragma clang diagnostic pop + +static void uart_drain(int fd, enum ev_type ev, void *arg); + +static void +ttyclose(void) 
+{ + + tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); +} + +static void +ttyopen(struct ttyfd *tf) +{ + + tcgetattr(tf->fd, &tf->tio_orig); + + tf->tio_new = tf->tio_orig; + cfmakeraw(&tf->tio_new); + tf->tio_new.c_cflag |= CLOCAL; + tcsetattr(tf->fd, TCSANOW, &tf->tio_new); + + if (tf->fd == STDIN_FILENO) { + tio_stdio_orig = tf->tio_orig; + atexit(ttyclose); + } +} + +static int +ttyread(struct ttyfd *tf) +{ + unsigned char rb; + + if (read(tf->fd, &rb, 1) == 1) + return (rb); + else + return (-1); +} + +static void +ttywrite(struct ttyfd *tf, unsigned char wb) +{ + + (void)write(tf->fd, &wb, 1); +} + +static void +rxfifo_reset(struct uart_softc *sc, int size) +{ + char flushbuf[32]; + struct fifo *fifo; + ssize_t nread; + int error; + + fifo = &sc->rxfifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = size; + + if (sc->tty.opened) { + /* + * Flush any unread input from the tty buffer. + */ + while (1) { + nread = read(sc->tty.fd, flushbuf, sizeof(flushbuf)); + if (nread != sizeof(flushbuf)) + break; + } + + /* + * Enable mevent to trigger when new characters are available + * on the tty fd. + */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +} + +static int +rxfifo_available(struct uart_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->rxfifo; + return (fifo->num < fifo->size); +} + +static int +rxfifo_putchar(struct uart_softc *sc, uint8_t ch) +{ + struct fifo *fifo; + int error; + + fifo = &sc->rxfifo; + + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = ch; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + if (!rxfifo_available(sc)) { + if (sc->tty.opened) { + /* + * Disable mevent callback if the FIFO is full. + */ + error = mevent_disable(sc->mev); + assert(error == 0); + } + } + return (0); + } else + return (-1); +} + +static int +rxfifo_getchar(struct uart_softc *sc) +{ + struct fifo *fifo; + int c, error, wasfull; + + wasfull = 0; + fifo = &sc->rxfifo; + if (fifo->num > 0) { + if (!rxfifo_available(sc)) + wasfull = 1; + c = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + if (wasfull) { + if (sc->tty.opened) { + error = mevent_enable(sc->mev); + assert(error == 0); + } + } + return (c); + } else + return (-1); +} + +static int +rxfifo_numchars(struct uart_softc *sc) +{ + struct fifo *fifo = &sc->rxfifo; + + return (fifo->num); +} + +static void +uart_opentty(struct uart_softc *sc) +{ + + ttyopen(&sc->tty); + sc->mev = mevent_add(sc->tty.fd, EVF_READ, uart_drain, sc); + assert(sc->mev != NULL); +} + +/* + * The IIR returns a prioritized interrupt reason: + * - receive data available + * - transmit holding register empty + * - modem status change + * + * Return an interrupt reason if one is available. 
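+ *
+ * Receiver line status (overrun) outranks all of these and is checked
+ * first below, mirroring the 16550's interrupt priority order.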
+ */ +static int +uart_intr_reason(struct uart_softc *sc) +{ + + if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) + return (IIR_RLS); + else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) + return (IIR_RXTOUT); + else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) + return (IIR_TXRDY); + else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0) + return (IIR_MLSC); + else + return (IIR_NOPEND); +} + +static void +uart_reset(struct uart_softc *sc) +{ + uint16_t divisor; + + divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; + sc->dll = (uint8_t) divisor; + sc->dlh = (uint8_t) (divisor >> 16); + + rxfifo_reset(sc, 1); /* no fifo until enabled by software */ +} + +/* + * Toggle the COM port's intr pin depending on whether or not we have an + * interrupt condition to report to the processor. + */ +static void +uart_toggle_intr(struct uart_softc *sc) +{ + uint8_t intr_reason; + + intr_reason = (uint8_t) uart_intr_reason(sc); + + if (intr_reason == IIR_NOPEND) + (*sc->intr_deassert)(sc->arg); + else + (*sc->intr_assert)(sc->arg); +} + +static void +uart_drain(int fd, enum ev_type ev, void *arg) +{ + struct uart_softc *sc; + int ch; + + sc = arg; + + assert(fd == sc->tty.fd); + assert(ev == EVF_READ); + + /* + * This routine is called in the context of the mevent thread + * to take out the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) ttyread(&sc->tty); + } else { + while (rxfifo_available(sc) && + ((ch = ttyread(&sc->tty)) != -1)) { + rxfifo_putchar(sc, ((uint8_t) ch)); + } + uart_toggle_intr(sc); + } + + pthread_mutex_unlock(&sc->mtx); +} + +void +uart_write(struct uart_softc *sc, int offset, uint8_t value) +{ + int fifosz; + uint8_t msr; + + pthread_mutex_lock(&sc->mtx); + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + sc->dll = value; + goto done; + } + + if (offset == REG_DLH) { + sc->dlh = value; + goto done; + } + } + + switch (offset) { + case REG_DATA: + if (sc->mcr & MCR_LOOPBACK) { + if (rxfifo_putchar(sc, value) != 0) + sc->lsr |= LSR_OE; + } else if (sc->tty.opened) { + ttywrite(&sc->tty, value); + } /* else drop on floor */ + sc->thre_int_pending = true; + break; + case REG_IER: + /* + * Apply mask so that bits 4-7 are 0 + * Also enables bits 0-3 only if they're 1 + */ + sc->ier = value & 0x0F; + break; + case REG_FCR: + /* + * When moving from FIFO and 16450 mode and vice versa, + * the FIFO contents are reset. + */ + if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { + fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; + rxfifo_reset(sc, fifosz); + } + + /* + * The FCR_ENABLE bit must be '1' for the programming + * of other FCR bits to be effective. 
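+ *
+ * For example, a write with FCR_ENABLE|FCR_RCV_RST switches
+ * from 16450 mode to the 16-byte FIFO and flushes it, while a
+ * subsequent write of 0 falls back to a single-byte holding
+ * register, resetting the FIFO contents again via
+ * rxfifo_reset().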
+ */ + if ((value & FCR_ENABLE) == 0) { + sc->fcr = 0; + } else { + if ((value & FCR_RCV_RST) != 0) + rxfifo_reset(sc, FIFOSZ); + + sc->fcr = value & + (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); + } + break; + case REG_LCR: + sc->lcr = value; + break; + case REG_MCR: + /* Apply mask so that bits 5-7 are 0 */ + sc->mcr = value & 0x1F; + + msr = 0; + if (sc->mcr & MCR_LOOPBACK) { + /* + * In the loopback mode certain bits from the + * MCR are reflected back into MSR + */ + if (sc->mcr & MCR_RTS) + msr |= MSR_CTS; + if (sc->mcr & MCR_DTR) + msr |= MSR_DSR; + if (sc->mcr & MCR_OUT1) + msr |= MSR_RI; + if (sc->mcr & MCR_OUT2) + msr |= MSR_DCD; + } + + /* + * Detect if there has been any change between the + * previous and the new value of MSR. If there is + * then assert the appropriate MSR delta bit. + */ + if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) + sc->msr |= MSR_DCTS; + if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) + sc->msr |= MSR_DDSR; + if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) + sc->msr |= MSR_DDCD; + if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) + sc->msr |= MSR_TERI; + + /* + * Update the value of MSR while retaining the delta + * bits. + */ + sc->msr &= MSR_DELTA_MASK; + sc->msr |= msr; + break; + case REG_LSR: + /* + * Line status register is not meant to be written to + * during normal operation. + */ + break; + case REG_MSR: + /* + * As far as I can tell MSR is a read-only register. + */ + break; + case REG_SCR: + sc->scr = value; + break; + default: + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); +} + +uint8_t +uart_read(struct uart_softc *sc, int offset) +{ + uint8_t iir, intr_reason, reg; + + pthread_mutex_lock(&sc->mtx); + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + reg = sc->dll; + goto done; + } + + if (offset == REG_DLH) { + reg = sc->dlh; + goto done; + } + } + + switch (offset) { + case REG_DATA: + reg = (uint8_t) rxfifo_getchar(sc); + break; + case REG_IER: + reg = sc->ier; + break; + case REG_IIR: + iir = (sc->fcr & FCR_ENABLE) ? 
IIR_FIFO_MASK : 0; + + intr_reason = (uint8_t) uart_intr_reason(sc); + + /* + * Deal with side effects of reading the IIR register + */ + if (intr_reason == IIR_TXRDY) + sc->thre_int_pending = false; + + iir |= intr_reason; + + reg = iir; + break; + case REG_LCR: + reg = sc->lcr; + break; + case REG_MCR: + reg = sc->mcr; + break; + case REG_LSR: + /* Transmitter is always ready for more data */ + sc->lsr |= LSR_TEMT | LSR_THRE; + + /* Check for new receive data */ + if (rxfifo_numchars(sc) > 0) + sc->lsr |= LSR_RXRDY; + else + sc->lsr &= ~LSR_RXRDY; + + reg = sc->lsr; + + /* The LSR_OE bit is cleared on LSR read */ + sc->lsr &= ~LSR_OE; + break; + case REG_MSR: + /* + * MSR delta bits are cleared on read + */ + reg = sc->msr; + sc->msr &= ~MSR_DELTA_MASK; + break; + case REG_SCR: + reg = sc->scr; + break; + default: + reg = 0xFF; + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); + + return (reg); +} + +int +uart_legacy_alloc(int which, int *baseaddr, int *irq) +{ + + if ((which < 0) || (((unsigned) which) >= UART_NLDEVS) || + uart_lres[which].inuse) + { + return (-1); + } + + uart_lres[which].inuse = true; + *baseaddr = uart_lres[which].baseaddr; + *irq = uart_lres[which].irq; + + return (0); +} + +struct uart_softc * +uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, + void *arg) +{ + struct uart_softc *sc; + + sc = calloc(1, sizeof(struct uart_softc)); + + sc->arg = arg; + sc->intr_assert = intr_assert; + sc->intr_deassert = intr_deassert; + + pthread_mutex_init(&sc->mtx, NULL); + + uart_reset(sc); + + return (sc); +} + +static int +uart_tty_backend(struct uart_softc *sc, const char *opts) +{ + int fd; + int retval; + + retval = -1; + + fd = open(opts, O_RDWR | O_NONBLOCK); + if (fd > 0 && isatty(fd)) { + sc->tty.fd = fd; + sc->tty.opened = true; + retval = 0; + } + + return (retval); +} + +int +uart_set_backend(struct uart_softc *sc, const char *opts) +{ + int retval; + int ptyfd; + char *ptyname; + + retval = -1; + + if (opts == NULL) + return (0); + + if (strcmp("stdio", opts) == 0 && !uart_stdio) { + sc->tty.fd = STDIN_FILENO; + sc->tty.opened = true; + uart_stdio = true; + retval = fcntl(sc->tty.fd, F_SETFL, O_NONBLOCK); + } else if (strcmp("autopty", opts) == 0) { + if ((ptyfd = open("/dev/ptmx", O_RDWR | O_NONBLOCK)) == -1) { + fprintf(stderr, "error opening /dev/ptmx char device"); + return retval; + } + + if ((ptyname = ptsname(ptyfd)) == NULL) { + perror("ptsname: error getting name for slave pseudo terminal"); + return retval; + } + + if ((retval = grantpt(ptyfd)) == -1) { + perror("error setting up ownership and permissions on slave pseudo terminal"); + return retval; + } + + if ((retval = unlockpt(ptyfd)) == -1) { + perror("error unlocking slave pseudo terminal to allow its usage"); + return retval; + } + + // Sends to Go land the device path name for the slave pseudo-terminal. + go_set_pty_name(ptyname); + + sc->tty.fd = ptyfd; + sc->tty.name = ptyname; + sc->tty.opened = true; + retval = 0; + } else if (uart_tty_backend(sc, opts) == 0) { + retval = 0; + } + + if (retval == 0) + uart_opentty(sc); + + return (retval); +} diff --git a/vendor/github.com/hooklift/xhyve/vatpic.c b/vendor/github.com/hooklift/xhyve/vatpic.c new file mode 100644 index 0000000..0b0bb25 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vatpic.c @@ -0,0 +1,792 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2015 xhyve developers + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define VATPIC_LOCK_INIT(v) (v)->lock = OS_SPINLOCK_INIT;
+#define VATPIC_LOCK(v) OSSpinLockLock(&(v)->lock)
+#define VATPIC_UNLOCK(v) OSSpinLockUnlock(&(v)->lock)
+
+enum irqstate {
+ IRQSTATE_ASSERT,
+ IRQSTATE_DEASSERT,
+ IRQSTATE_PULSE
+};
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+struct atpic {
+ bool ready;
+ int icw_num;
+ int rd_cmd_reg;
+ bool aeoi;
+ bool poll;
+ bool rotate;
+ bool sfn; /* special fully-nested mode */
+ int irq_base;
+ uint8_t request; /* Interrupt Request Register (IRR) */
+ uint8_t service; /* Interrupt Service Register (ISR) */
+ uint8_t mask; /* Interrupt Mask Register (IMR) */
+ uint8_t smm; /* special mask mode */
+ int acnt[8]; /* sum of pin asserts and deasserts */
+ int lowprio; /* lowest priority irq */
+ bool intr_raised;
+};
+
+struct vatpic {
+ struct vm *vm;
+ OSSpinLock lock;
+ struct atpic atpic[2];
+ uint8_t elc[2];
+};
+#pragma clang diagnostic pop
+
+#define VATPIC_CTR0(vatpic, fmt) \
+ VM_CTR0((vatpic)->vm, fmt)
+
+#define VATPIC_CTR1(vatpic, fmt, a1) \
+ VM_CTR1((vatpic)->vm, fmt, a1)
+
+#define VATPIC_CTR2(vatpic, fmt, a1, a2) \
+ VM_CTR2((vatpic)->vm, fmt, a1, a2)
+
+#define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \
+ VM_CTR3((vatpic)->vm, fmt, a1, a2, a3)
+
+#define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \
+ VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4)
+
+/*
+ * Loop over all the pins in priority order from highest to lowest.
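+ *
+ * For example, with the reset value lowprio == 7 (programmed by ICW1)
+ * the scan order is 0,1,...,7; after a rotating EOI that sets
+ * lowprio = 2 the order becomes 3,4,5,6,7,0,1,2.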
+ */ +#define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ + for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ + tmpvar < 8; \ + tmpvar++, pinvar = (pinvar + 1) & 0x7) + +static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); + +static __inline bool +master_atpic(struct vatpic *vatpic, struct atpic *atpic) +{ + + if (atpic == &vatpic->atpic[0]) + return (true); + else + return (false); +} + +static __inline int +vatpic_get_highest_isrpin(struct atpic *atpic) +{ + int bit, pin; + int i; + + ATPIC_PIN_FOREACH(pin, atpic, i) { + bit = (1 << pin); + + if (atpic->service & bit) { + /* + * An IS bit that is masked by an IMR bit will not be + * cleared by a non-specific EOI in Special Mask Mode. + */ + if (atpic->smm && (atpic->mask & bit) != 0) + continue; + else + return (pin); + } + } + + return (-1); +} + +static __inline int +vatpic_get_highest_irrpin(struct atpic *atpic) +{ + int serviced; + int bit, pin, tmp; + + /* + * In 'Special Fully-Nested Mode' when an interrupt request from + * a slave is in service, the slave is not locked out from the + * master's priority logic. + */ + serviced = atpic->service; + if (atpic->sfn) + serviced &= ~(1 << 2); + + /* + * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits + * further interrupts at that level and enables interrupts from all + * other levels that are not masked. In other words the ISR has no + * bearing on the levels that can generate interrupts. + */ + if (atpic->smm) + serviced = 0; + + ATPIC_PIN_FOREACH(pin, atpic, tmp) { + bit = 1 << pin; + + /* + * If there is already an interrupt in service at the same + * or higher priority then bail. + */ + if ((serviced & bit) != 0) + break; + + /* + * If an interrupt is asserted and not masked then return + * the corresponding 'pin' to the caller. + */ + if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) + return (pin); + } + + return (-1); +} + +static void +vatpic_notify_intr(struct vatpic *vatpic) +{ + struct atpic *atpic; + int pin; + + /* + * First check the slave. + */ + atpic = &vatpic->atpic[1]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * Cascade the request from the slave to the master. + */ + atpic->intr_raised = true; + vatpic_set_pinstate(vatpic, 2, true); + vatpic_set_pinstate(vatpic, 2, false); + } else { + VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } + + /* + * Then check the master. + */ + atpic = &vatpic->atpic[0]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic master notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * From Section 3.6.2, "Interrupt Modes", in the + * MPtable Specification, Version 1.4 + * + * PIC interrupts are routed to both the Local APIC + * and the I/O APIC to support operation in 1 of 3 + * modes. + * + * 1. Legacy PIC Mode: the PIC effectively bypasses + * all APIC components. In this mode the local APIC is + * disabled and LINT0 is reconfigured as INTR to + * deliver the PIC interrupt directly to the CPU. + * + * 2. Virtual Wire Mode: the APIC is treated as a + * virtual wire which delivers interrupts from the PIC + * to the CPU. 
In this mode LINT0 is programmed as + * ExtINT to indicate that the PIC is the source of + * the interrupt. + * + * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are + * fielded by the I/O APIC and delivered to the appropriate + * CPU. In this mode the I/O APIC input 0 is programmed + * as ExtINT to indicate that the PIC is the source of the + * interrupt. + */ + atpic->intr_raised = true; + lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); + vioapic_pulse_irq(vatpic->vm, 0); + } else { + VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } +} + +static int +vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); + + atpic->ready = false; + + atpic->icw_num = 1; + atpic->request = 0; + atpic->mask = 0; + atpic->lowprio = 7; + atpic->rd_cmd_reg = 0; + atpic->poll = 0; + atpic->smm = 0; + + if ((val & ICW1_SNGL) != 0) { + VATPIC_CTR0(vatpic, "vatpic cascade mode required"); + return (-1); + } + + if ((val & ICW1_IC4) == 0) { + VATPIC_CTR0(vatpic, "vatpic icw4 required"); + return (-1); + } + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); + + atpic->irq_base = val & 0xf8; + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); + + if ((val & ICW4_8086) == 0) { + VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); + return (-1); + } + + if ((val & ICW4_AEOI) != 0) + atpic->aeoi = true; + + if ((val & ICW4_SFNM) != 0) { + if (master_atpic(vatpic, atpic)) { + atpic->sfn = true; + } else { + VATPIC_CTR1(vatpic, "Ignoring special fully nested " + "mode on slave atpic: %#x", val); + } + } + + atpic->icw_num = 0; + atpic->ready = true; + + return (0); +} + +static int +vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); + + atpic->mask = val & 0xff; + + return (0); +} + +static int +vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); + + atpic->rotate = ((val & OCW2_R) != 0); + + if ((val & OCW2_EOI) != 0) { + int isr_bit; + + if ((val & OCW2_SL) != 0) { + /* specific EOI */ + isr_bit = val & 0x7; + } else { + /* non-specific EOI */ + isr_bit = vatpic_get_highest_isrpin(atpic); + } + + if (isr_bit != -1) { + atpic->service &= ~(1 << isr_bit); + + if (atpic->rotate) + atpic->lowprio = isr_bit; + } + } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { + /* specific priority */ + atpic->lowprio = val & 0x7; + } + + return (0); +} + +static int +vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); + + if (val & OCW3_ESMM) { + atpic->smm = val & OCW3_SMM ? 1 : 0; + VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", + master_atpic(vatpic, atpic) ? "master" : "slave", + atpic->smm ? 
"enabled" : "disabled"); + } + + if (val & OCW3_RR) { + /* read register command */ + atpic->rd_cmd_reg = val & OCW3_RIS; + + /* Polling mode */ + atpic->poll = ((val & OCW3_P) != 0); + } + + return (0); +} + +static void +vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) +{ + struct atpic *atpic; + int oldcnt, newcnt; + bool level; + + KASSERT(pin >= 0 && pin < 16, + ("vatpic_set_pinstate: invalid pin number %d", pin)); + + atpic = &vatpic->atpic[pin >> 3]; + + oldcnt = atpic->acnt[pin & 0x7]; + if (newstate) + atpic->acnt[pin & 0x7]++; + else + atpic->acnt[pin & 0x7]--; + newcnt = atpic->acnt[pin & 0x7]; + + if (newcnt < 0) { + VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); + } + + level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); + + if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { + /* rising edge or level */ + VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); + atpic->request |= (1 << (pin & 0x7)); + } else if (oldcnt == 1 && newcnt == 0) { + /* falling edge */ + VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); + if (level) + atpic->request &= ~(1 << (pin & 0x7)); + } else { + VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", + pin, newstate ? "asserted" : "deasserted", newcnt); + } + + vatpic_notify_intr(vatpic); +} + +static int +vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[irq >> 3]; + + if (atpic->ready == false) + return (0); + + VATPIC_LOCK(vatpic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vatpic_set_pinstate(vatpic, irq, true); + break; + case IRQSTATE_DEASSERT: + vatpic_set_pinstate(vatpic, irq, false); + break; + case IRQSTATE_PULSE: + vatpic_set_pinstate(vatpic, irq, true); + vatpic_set_pinstate(vatpic, irq, false); + break; + } + VATPIC_UNLOCK(vatpic); + + return (0); +} + +int +vatpic_assert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vatpic_deassert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vatpic_pulse_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +int +vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) +{ + struct vatpic *vatpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + /* + * See comment in vatpic_elc_handler. These IRQs must be + * edge triggered. + */ + if (trigger == LEVEL_TRIGGER) { + switch (irq) { + case 0: + case 1: + case 2: + case 8: + case 13: + return (EINVAL); + } + } + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + if (trigger == LEVEL_TRIGGER) + vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); + else + vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +void +vatpic_pending_intr(struct vm *vm, int *vecptr) +{ + struct vatpic *vatpic; + struct atpic *atpic; + int pin; + + vatpic = vm_atpic(vm); + + atpic = &vatpic->atpic[0]; + + VATPIC_LOCK(vatpic); + + pin = vatpic_get_highest_irrpin(atpic); + if (pin == 2) { + atpic = &vatpic->atpic[1]; + pin = vatpic_get_highest_irrpin(atpic); + } + + /* + * If there are no pins active at this moment then return the spurious + * interrupt vector instead. 
+ */
+ if (pin == -1)
+ pin = 7;
+
+ KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin));
+ *vecptr = atpic->irq_base + pin;
+
+ VATPIC_UNLOCK(vatpic);
+}
+
+static void
+vatpic_pin_accepted(struct atpic *atpic, int pin)
+{
+ atpic->intr_raised = false;
+
+ if (atpic->acnt[pin] == 0)
+ atpic->request &= ~(1 << pin);
+
+ if (atpic->aeoi == true) {
+ if (atpic->rotate == true)
+ atpic->lowprio = pin;
+ } else {
+ atpic->service |= (1 << pin);
+ }
+}
+
+void
+vatpic_intr_accepted(struct vm *vm, int vector)
+{
+ struct vatpic *vatpic;
+ int pin;
+
+ vatpic = vm_atpic(vm);
+
+ VATPIC_LOCK(vatpic);
+
+ pin = vector & 0x7;
+
+ if ((vector & ~0x7) == vatpic->atpic[1].irq_base) {
+ vatpic_pin_accepted(&vatpic->atpic[1], pin);
+ /*
+ * If this vector originated from the slave,
+ * accept the cascaded interrupt too.
+ */
+ vatpic_pin_accepted(&vatpic->atpic[0], 2);
+ } else {
+ vatpic_pin_accepted(&vatpic->atpic[0], pin);
+ }
+
+ vatpic_notify_intr(vatpic);
+
+ VATPIC_UNLOCK(vatpic);
+}
+
+static int
+vatpic_read(struct vatpic *vatpic, struct atpic *atpic, UNUSED bool in,
+ int port, UNUSED int bytes, uint32_t *eax)
+{
+ int pin;
+ VATPIC_LOCK(vatpic);
+
+ if (atpic->poll) {
+ atpic->poll = 0;
+ pin = vatpic_get_highest_irrpin(atpic);
+ if (pin >= 0) {
+ vatpic_pin_accepted(atpic, pin);
+ *eax = 0x80 | ((uint32_t) pin);
+ } else {
+ *eax = 0;
+ }
+ } else {
+ if (port & ICU_IMR_OFFSET) {
+ /* read interrupt mask register */
+ *eax = atpic->mask;
+ } else {
+ if (atpic->rd_cmd_reg == OCW3_RIS) {
+ /* read interrupt service register */
+ *eax = atpic->service;
+ } else {
+ /* read interrupt request register */
+ *eax = atpic->request;
+ }
+ }
+ }
+
+ VATPIC_UNLOCK(vatpic);
+//printf("vatpic_read 0x%04x 0x%02x\n", port, (uint8_t)*eax);
+ return (0);
+}
+
+static int
+vatpic_write(struct vatpic *vatpic, struct atpic *atpic, UNUSED bool in,
+ int port, UNUSED int bytes, uint32_t *eax)
+{
+ int error;
+ uint8_t val;
+
+ error = 0;
+ val = (uint8_t) *eax;
+//printf("vatpic_write 0x%04x 0x%02x %d\n", port, val, atpic->icw_num);
+ VATPIC_LOCK(vatpic);
+
+ if (port & ICU_IMR_OFFSET) {
+ switch (atpic->icw_num) {
+ case 2:
+ error = vatpic_icw2(vatpic, atpic, val);
+ break;
+ case 3:
+ error = vatpic_icw3(vatpic, atpic, val);
+ break;
+ case 4:
+ error = vatpic_icw4(vatpic, atpic, val);
+ break;
+ default:
+ error = vatpic_ocw1(vatpic, atpic, val);
+ break;
+ }
+ } else {
+ if (val & (1 << 4))
+ error = vatpic_icw1(vatpic, atpic, val);
+
+ if (atpic->ready) {
+ if (val & (1 << 3))
+ error = vatpic_ocw3(vatpic, atpic, val);
+ else
+ error = vatpic_ocw2(vatpic, atpic, val);
+ }
+ }
+
+ if (atpic->ready)
+ vatpic_notify_intr(vatpic);
+
+ VATPIC_UNLOCK(vatpic);
+
+ return (error);
+}
+
+int
+vatpic_master_handler(struct vm *vm, UNUSED int vcpuid, bool in, int port,
+ int bytes, uint32_t *eax)
+{
+ struct vatpic *vatpic;
+ struct atpic *atpic;
+
+ vatpic = vm_atpic(vm);
+ atpic = &vatpic->atpic[0];
+
+ if (bytes != 1)
+ return (-1);
+
+ if (in) {
+ return (vatpic_read(vatpic, atpic, in, port, bytes, eax));
+ }
+
+ return (vatpic_write(vatpic, atpic, in, port, bytes, eax));
+}
+
+int
+vatpic_slave_handler(struct vm *vm, UNUSED int vcpuid, bool in, int port,
+ int bytes, uint32_t *eax)
+{
+ struct vatpic *vatpic;
+ struct atpic *atpic;
+
+ vatpic = vm_atpic(vm);
+ atpic = &vatpic->atpic[1];
+
+ if (bytes != 1)
+ return (-1);
+
+ if (in) {
+ return (vatpic_read(vatpic, atpic, in, port, bytes, eax));
+ }
+
+ return (vatpic_write(vatpic, atpic, in, port, bytes, eax));
+}
+
+int
+vatpic_elc_handler(struct vm *vm, UNUSED int vcpuid, bool in, int port, + int bytes, uint32_t *eax) +{ + struct vatpic *vatpic; + bool is_master; + + vatpic = vm_atpic(vm); + is_master = (port == IO_ELCR1); + + if (bytes != 1) + return (-1); + + VATPIC_LOCK(vatpic); + + if (in) { + if (is_master) + *eax = vatpic->elc[0]; + else + *eax = vatpic->elc[1]; + } else { + /* + * For the master PIC the cascade channel (IRQ2), the + * heart beat timer (IRQ0), and the keyboard + * controller (IRQ1) cannot be programmed for level + * mode. + * + * For the slave PIC the real time clock (IRQ8) and + * the floating point error interrupt (IRQ13) cannot + * be programmed for level mode. + */ + if (is_master) + vatpic->elc[0] = (*eax & 0xf8); + else + vatpic->elc[1] = (*eax & 0xde); + } + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +struct vatpic * +vatpic_init(struct vm *vm) +{ + struct vatpic *vatpic; + + vatpic = malloc(sizeof(struct vatpic)); + assert(vatpic); + bzero(vatpic, sizeof(struct vatpic)); + vatpic->vm = vm; + + VATPIC_LOCK_INIT(vatpic); + + return (vatpic); +} + +void +vatpic_cleanup(struct vatpic *vatpic) +{ + free(vatpic); +} diff --git a/vendor/github.com/hooklift/xhyve/vatpit.c b/vendor/github.com/hooklift/xhyve/vatpit.c new file mode 100644 index 0000000..5c59425 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vatpit.c @@ -0,0 +1,450 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VATPIT_LOCK_INIT(v) (v)->lock = OS_SPINLOCK_INIT; +#define VATPIT_LOCK(v) OSSpinLockLock(&(v)->lock) +#define VATPIT_UNLOCK(v) OSSpinLockUnlock(&(v)->lock) + +#define TIMER_SEL_MASK 0xc0 +#define TIMER_RW_MASK 0x30 +#define TIMER_MODE_MASK 0x0f +#define TIMER_SEL_READBACK 0xc0 + +#define TIMER_STS_OUT 0x80 +#define TIMER_STS_NULLCNT 0x40 + +#define TIMER_RB_LCTR 0x20 +#define TIMER_RB_LSTATUS 0x10 +#define TIMER_RB_CTR_2 0x08 +#define TIMER_RB_CTR_1 0x04 +#define TIMER_RB_CTR_0 0x02 + +#define TMR2_OUT_STS 0x20 + +#define PIT_8254_FREQ 1193182 +#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct vatpit_callout_arg { + struct vatpit *vatpit; + int channel_num; +}; + +struct channel { + int mode; + uint16_t initial; /* initial counter value */ + sbintime_t now_sbt; /* uptime when counter was loaded */ + uint8_t cr[2]; + uint8_t ol[2]; + bool slatched; /* status latched */ + uint8_t status; + int crbyte; + int olbyte; + int frbyte; + struct callout callout; + sbintime_t callout_sbt; /* target time */ + struct vatpit_callout_arg callout_arg; +}; + +struct vatpit { + struct vm *vm; + OSSpinLock lock; + sbintime_t freq_sbt; + struct channel channel[3]; +}; +#pragma clang diagnostic pop + +static void pit_timer_start_cntr0(struct vatpit *vatpit); + +static int +vatpit_get_out(struct vatpit *vatpit, int channel) +{ + struct channel *c; + sbintime_t delta_ticks; + int out; + + c = &vatpit->channel[channel]; + + switch (c->mode) { + case TIMER_INTTC: + delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; + out = ((c->initial - delta_ticks) <= 0); + break; + default: + out = 0; + break; + } + + return (out); +} + +static void +vatpit_callout_handler(void *a) +{ + struct vatpit_callout_arg *arg = a; + struct vatpit *vatpit; + struct callout *callout; + struct channel *c; + + vatpit = arg->vatpit; + c = &vatpit->channel[arg->channel_num]; + callout = &c->callout; + + VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); + + VATPIT_LOCK(vatpit); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (c->mode == TIMER_RATEGEN) { + pit_timer_start_cntr0(vatpit); + } + + vatpic_pulse_irq(vatpit->vm, 0); + vioapic_pulse_irq(vatpit->vm, 2); + +done: + VATPIT_UNLOCK(vatpit); + return; +} + +static void +pit_timer_start_cntr0(struct vatpit *vatpit) +{ + struct channel *c; + sbintime_t now, delta; + + c = &vatpit->channel[0]; + if (c->initial != 0) { + delta = c->initial * vatpit->freq_sbt; + c->callout_sbt = c->callout_sbt + delta; + + /* + * Reset 'callout_sbt' if the time that the callout + * was supposed to fire is more than 'c->initial' + * ticks in the past. + */ + now = sbinuptime(); + if (c->callout_sbt < now) + c->callout_sbt = now + delta; + + callout_reset_sbt(&c->callout, c->callout_sbt, + 0, vatpit_callout_handler, &c->callout_arg, + C_ABSOLUTE); + } +} + +static uint16_t +pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) +{ + uint16_t lval; + sbintime_t delta_ticks; + + /* cannot latch a new value until the old one has been consumed */ + if (latch && c->olbyte != 0) + return (0); + + if (c->initial == 0) { + /* + * This is possibly an o/s bug - reading the value of + * the timer without having set up the initial value. 
+ * + * The original user-space version of this code set + * the timer to 100hz in this condition; do the same + * here. + */ + c->initial = TIMER_DIV(PIT_8254_FREQ, 100); + c->now_sbt = sbinuptime(); + c->status &= ~TIMER_STS_NULLCNT; + } + + delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; + + lval = c->initial - delta_ticks % c->initial; + + if (latch) { + c->olbyte = 2; + c->ol[1] = (uint8_t) lval; /* LSB */ + c->ol[0] = (uint8_t) (lval >> 8); /* MSB */ + } + + return (lval); +} + +static int +pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) +{ + struct channel *c; + + c = &vatpit->channel[channel]; + + /* + * Latch the count/status of the timer if not already latched. + * N.B. that the count/status latch-select bits are active-low. + */ + if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) { + (void) pit_update_counter(vatpit, c, true); + } + + if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) { + c->slatched = true; + /* + * For mode 0, see if the elapsed time is greater + * than the initial value - this results in the + * output pin being set to 1 in the status byte. + */ + if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) + c->status |= TIMER_STS_OUT; + else + c->status &= ~TIMER_STS_OUT; + } + + return (0); +} + +static int +pit_readback(struct vatpit *vatpit, uint8_t cmd) +{ + int error; + + /* + * The readback command can apply to all timers. + */ + error = 0; + if (cmd & TIMER_RB_CTR_0) + error = pit_readback1(vatpit, 0, cmd); + if (!error && cmd & TIMER_RB_CTR_1) + error = pit_readback1(vatpit, 1, cmd); + if (!error && cmd & TIMER_RB_CTR_2) + error = pit_readback1(vatpit, 2, cmd); + + return (error); +} + + +static int +vatpit_update_mode(struct vatpit *vatpit, uint8_t val) +{ + struct channel *c; + int sel, rw, mode; + + sel = val & TIMER_SEL_MASK; + rw = val & TIMER_RW_MASK; + mode = val & TIMER_MODE_MASK; + + if (sel == TIMER_SEL_READBACK) + return (pit_readback(vatpit, val)); + + if (rw != TIMER_LATCH && rw != TIMER_16BIT) + return (-1); + + if (rw != TIMER_LATCH) { + /* + * Counter mode is not affected when issuing a + * latch command. + */ + if (mode != TIMER_INTTC && + mode != TIMER_RATEGEN && + mode != TIMER_SQWAVE && + mode != TIMER_SWSTROBE) + return (-1); + } + + c = &vatpit->channel[sel >> 6]; + if (rw == TIMER_LATCH) + pit_update_counter(vatpit, c, true); + else { + c->mode = mode; + c->olbyte = 0; /* reset latch after reprogramming */ + c->status |= TIMER_STS_NULLCNT; + } + + return (0); +} + +int +vatpit_handler(struct vm *vm, UNUSED int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + struct channel *c; + uint8_t val; + int error; + + vatpit = vm_atpit(vm); + + if (bytes != 1) + return (-1); + + val = (uint8_t) *eax; + + if (port == TIMER_MODE) { + if (in) { + VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); + return (-1); + } + + VATPIT_LOCK(vatpit); + error = vatpit_update_mode(vatpit, val); + VATPIT_UNLOCK(vatpit); + + return (error); + } + + /* counter ports */ + KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, + ("invalid port 0x%x\n", port)); + c = &vatpit->channel[port - TIMER_CNTR0]; + + VATPIT_LOCK(vatpit); + if (in && c->slatched) { + /* + * Return the status byte if latched + */ + *eax = c->status; + c->slatched = false; + c->status = 0; + } else if (in) { + /* + * The spec says that once the output latch is completely + * read it should revert to "following" the counter. Use + * the free running counter for this case (i.e. Linux + * TSC calibration). 
Assuming the access mode is 16-bit, + * toggle the MSB/LSB bit on each read. + */ + if (c->olbyte == 0) { + uint16_t tmp; + + tmp = pit_update_counter(vatpit, c, false); + if (c->frbyte) + tmp >>= 8; + tmp &= 0xff; + *eax = tmp; + c->frbyte ^= 1; + } else + *eax = c->ol[--c->olbyte]; + } else { + c->cr[c->crbyte++] = (uint8_t) *eax; + if (c->crbyte == 2) { + c->status &= ~TIMER_STS_NULLCNT; + c->frbyte = 0; + c->crbyte = 0; + c->initial = (uint16_t) c->cr[0]; + c->initial |= (((uint16_t) c->cr[1]) << 8); + c->now_sbt = sbinuptime(); + /* Start an interval timer for channel 0 */ + if (port == TIMER_CNTR0) { + c->callout_sbt = c->now_sbt; + pit_timer_start_cntr0(vatpit); + } + if (c->initial == 0) + c->initial = 0xffff; + } + } + VATPIT_UNLOCK(vatpit); + + return (0); +} + +int +vatpit_nmisc_handler(struct vm *vm, UNUSED int vcpuid, bool in, UNUSED int port, + UNUSED int bytes, uint32_t *eax) +{ + struct vatpit *vatpit; + + vatpit = vm_atpit(vm); + + if (in) { + VATPIT_LOCK(vatpit); + if (vatpit_get_out(vatpit, 2)) + *eax = TMR2_OUT_STS; + else + *eax = 0; + + VATPIT_UNLOCK(vatpit); + } + + return (0); +} + +struct vatpit * +vatpit_init(struct vm *vm) +{ + struct vatpit *vatpit; + struct bintime bt; + struct vatpit_callout_arg *arg; + int i; + + vatpit = malloc(sizeof(struct vatpit)); + assert(vatpit); + bzero(vatpit, sizeof(struct vatpit)); + vatpit->vm = vm; + + VATPIT_LOCK_INIT(vatpit) + + FREQ2BT(PIT_8254_FREQ, &bt); + vatpit->freq_sbt = bttosbt(bt); + + for (i = 0; i < 3; i++) { + callout_init(&vatpit->channel[i].callout, true); + arg = &vatpit->channel[i].callout_arg; + arg->vatpit = vatpit; + arg->channel_num = i; + } + + return (vatpit); +} + +void +vatpit_cleanup(struct vatpit *vatpit) +{ + int i; + + for (i = 0; i < 3; i++) + callout_drain(&vatpit->channel[i].callout); + + free(vatpit); +} diff --git a/vendor/github.com/hooklift/xhyve/vhpet.c b/vendor/github.com/hooklift/xhyve/vhpet.c new file mode 100644 index 0000000..1d81861 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vhpet.c @@ -0,0 +1,752 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define HPET_FREQ 10000000 /* 10.0 MHz */
+#define FS_PER_S 1000000000000000ul
+
+/* Timer N Configuration and Capabilities Register */
+#define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \
+ HPET_TCAP_FSB_INT_DEL | \
+ HPET_TCAP_SIZE | \
+ HPET_TCAP_PER_INT)
+/*
+ * HPET requires at least 3 timers and up to 32 timers per block.
+ */
+#define VHPET_NUM_TIMERS 8
+CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32);
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+struct vhpet_callout_arg {
+ struct vhpet *vhpet;
+ int timer_num;
+};
+
+struct vhpet {
+ struct vm *vm;
+ pthread_mutex_t mtx;
+ sbintime_t freq_sbt;
+ uint64_t config; /* Configuration */
+ uint64_t isr; /* Interrupt Status */
+ uint32_t countbase; /* HPET counter base value */
+ sbintime_t countbase_sbt; /* uptime corresponding to base value */
+ struct {
+ uint64_t cap_config; /* Configuration */
+ uint64_t msireg; /* FSB interrupt routing */
+ uint32_t compval; /* Comparator */
+ uint32_t comprate;
+ struct callout callout;
+ sbintime_t callout_sbt; /* time when counter==compval */
+ struct vhpet_callout_arg arg;
+ } timer[VHPET_NUM_TIMERS];
+};
+#pragma clang diagnostic pop
+
+#define VHPET_LOCK(vhp) pthread_mutex_lock(&((vhp)->mtx))
+#define VHPET_UNLOCK(vhp) pthread_mutex_unlock(&((vhp)->mtx))
+
+static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter,
+ sbintime_t now);
+
+static uint64_t
+vhpet_capabilities(void)
+{
+ uint64_t cap = 0;
+
+ cap |= ((uint64_t) 0x8086) << 16; /* vendor id */
+ cap |= ((uint64_t) (VHPET_NUM_TIMERS - 1)) << 8; /* number of timers */
+ cap |= (uint64_t) 1; /* revision */
+ cap &= ~((uint64_t) HPET_CAP_COUNT_SIZE); /* 32-bit timer */
+ cap &= (uint64_t) 0xffffffff;
+ cap |= ((uint64_t) (FS_PER_S / HPET_FREQ)) << 32; /* tick period in fs */
+
+ return (cap);
+}
+
+static __inline bool
+vhpet_counter_enabled(struct vhpet *vhpet)
+{
+
+ return ((vhpet->config & HPET_CNF_ENABLE) ? true : false);
+}
+
+static __inline bool
+vhpet_timer_msi_enabled(struct vhpet *vhpet, int n)
+{
+ const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN;
+
+ if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable)
+ return (true);
+ else
+ return (false);
+}
+
+static __inline int
+vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n)
+{
+ /*
+ * If the timer is configured to use MSI then treat it as if the
+ * timer is not connected to the ioapic.
+ */
+ if (vhpet_timer_msi_enabled(vhpet, n))
+ return (0);
+
+ return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9);
+}
+
+static uint32_t
+vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr)
+{
+ uint32_t val;
+ sbintime_t now, delta;
+
+ val = vhpet->countbase;
+ if (vhpet_counter_enabled(vhpet)) {
+ now = sbinuptime();
+ delta = now - vhpet->countbase_sbt;
+ KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: "
+ "%#llx to %#llx", vhpet->countbase_sbt, now));
+ val += delta / vhpet->freq_sbt;
+ if (nowptr != NULL)
+ *nowptr = now;
+ } else {
+ /*
+ * The sbinuptime corresponding to the 'countbase' is
+ * meaningless when the counter is disabled. Make sure
+ * that the caller doesn't want to use it.
+ */
+ KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL"));
+ }
+ return (val);
+}
+
+static void
+vhpet_timer_clear_isr(struct vhpet *vhpet, int n)
+{
+ int pin;
+
+ if (vhpet->isr & (1 << n)) {
+ pin = vhpet_timer_ioapic_pin(vhpet, n);
+ KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n));
+ vioapic_deassert_irq(vhpet->vm, pin);
+ vhpet->isr &= ~(1 << n);
+ }
+}
+
+static __inline bool
+vhpet_periodic_timer(struct vhpet *vhpet, int n)
+{
+
+ return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0);
+}
+
+static __inline bool
+vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n)
+{
+
+ return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0);
+}
+
+static __inline bool
+vhpet_timer_edge_trig(struct vhpet *vhpet, int n)
+{
+
+ KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: "
+ "timer %d is using MSI", n));
+
+ if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0)
+ return (true);
+ else
+ return (false);
+}
+
+static void
+vhpet_timer_interrupt(struct vhpet *vhpet, int n)
+{
+ int pin;
+
+ /* If interrupts are not enabled for this timer then just return. */
+ if (!vhpet_timer_interrupt_enabled(vhpet, n))
+ return;
+
+ /*
+ * If a level triggered interrupt is already asserted then just return.
+ */
+ if ((vhpet->isr & (1 << n)) != 0) {
+ VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n);
+ return;
+ }
+
+ if (vhpet_timer_msi_enabled(vhpet, n)) {
+ lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32,
+ vhpet->timer[n].msireg & 0xffffffff);
+ return;
+ }
+
+ pin = vhpet_timer_ioapic_pin(vhpet, n);
+ if (pin == 0) {
+ VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n);
+ return;
+ }
+
+ if (vhpet_timer_edge_trig(vhpet, n)) {
+ vioapic_pulse_irq(vhpet->vm, pin);
+ } else {
+ vhpet->isr |= 1 << n;
+ vioapic_assert_irq(vhpet->vm, pin);
+ }
+}
+
+static void
+vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter)
+{
+ uint32_t compval, comprate, compnext;
+
+ KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n));
+
+ compval = vhpet->timer[n].compval;
+ comprate = vhpet->timer[n].comprate;
+
+ /*
+ * Calculate the comparator value to be used for the next periodic
+ * interrupt.
+ *
+ * This function is commonly called from the callout handler.
+ * In this scenario the 'counter' is ahead of 'compval'. To find
+ * the next value to program into the accumulator we divide the
+ * number space between 'compval' and 'counter' into 'comprate'
+ * sized units. The 'compval' is rounded up such that it is "ahead"
+ * of 'counter'.
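The rounding described here is a single unsigned expression in the code that follows. A compilable model of it, an editor's sketch rather than part of the vendored file; the wraparound behaviour relies on uint32_t arithmetic exactly as in the source:

```c
/* Standalone model of vhpet_adjust_compval()'s rounding. */
#include <stdio.h>
#include <stdint.h>

static uint32_t
next_compval(uint32_t compval, uint32_t comprate, uint32_t counter)
{
	/* Round 'compval' up in 'comprate'-sized steps until it is
	 * strictly ahead of 'counter'; unsigned wrap is intentional. */
	return compval + ((counter - compval) / comprate + 1) * comprate;
}

int main(void)
{
	/* The callout fired late: counter 1050 is past compval 1000.
	 * With a period of 30 the next interrupt target is 1060. */
	printf("%u\n", (unsigned)next_compval(1000, 30, 1050));
	return 0;
}
```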
+ */ + compnext = compval + ((counter - compval) / comprate + 1) * comprate; + + vhpet->timer[n].compval = compnext; +} + +static void +vhpet_handler(void *a) +{ + int n; + uint32_t counter; + sbintime_t now; + struct vhpet *vhpet; + struct callout *callout; + struct vhpet_callout_arg *arg; + + arg = a; + vhpet = arg->vhpet; + n = arg->timer_num; + callout = &vhpet->timer[n].callout; + + VM_CTR1(vhpet->vm, "hpet t%d fired", n); + + VHPET_LOCK(vhpet); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (!vhpet_counter_enabled(vhpet)) + xhyve_abort("vhpet(%p) callout with counter disabled\n", vhpet); + + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, n, counter, now); + vhpet_timer_interrupt(vhpet, n); +done: + VHPET_UNLOCK(vhpet); + return; +} + +static void +vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now) +{ + + VM_CTR1(vhpet->vm, "hpet t%d stopped", n); + callout_stop(&vhpet->timer[n].callout); + + /* + * If the callout was scheduled to expire in the past but hasn't + * had a chance to execute yet then trigger the timer interrupt + * here. Failing to do so will result in a missed timer interrupt + * in the guest. This is especially bad in one-shot mode because + * the next interrupt has to wait for the counter to wrap around. + */ + if (vhpet->timer[n].callout_sbt < now) { + VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " + "stopping timer", n); + vhpet_timer_interrupt(vhpet, n); + } +} + +static void +vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) +{ + sbintime_t delta; + + if (vhpet->timer[n].comprate != 0) + vhpet_adjust_compval(vhpet, n, counter); + else { + /* + * In one-shot mode it is the guest's responsibility to make + * sure that the comparator value is not in the "past". The + * hardware doesn't have any belt-and-suspenders to deal with + * this so we don't either. + */ + } + + delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt; + vhpet->timer[n].callout_sbt = now + delta; + callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt, + 0, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE); +} + +static void +vhpet_start_counting(struct vhpet *vhpet) +{ + int i; + + vhpet->countbase_sbt = sbinuptime(); + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + /* + * Restart the timers based on the value of the main counter + * when it stopped counting. 
+ */ + vhpet_start_timer(vhpet, i, vhpet->countbase, + vhpet->countbase_sbt); + } +} + +static void +vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now) +{ + int i; + + vhpet->countbase = counter; + for (i = 0; i < VHPET_NUM_TIMERS; i++) + vhpet_stop_timer(vhpet, i, now); +} + +static __inline void +update_register(uint64_t *regptr, uint64_t data, uint64_t mask) +{ + + *regptr &= ~mask; + *regptr |= (data & mask); +} + +static void +vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, + uint64_t mask) +{ + bool clear_isr; + int old_pin, new_pin; + uint32_t allowed_irqs; + uint64_t oldval, newval; + + if (vhpet_timer_msi_enabled(vhpet, n) || + vhpet_timer_edge_trig(vhpet, n)) { + if (vhpet->isr & (1 << n)) + xhyve_abort("vhpet timer %d isr should not be asserted\n", n); + } + old_pin = vhpet_timer_ioapic_pin(vhpet, n); + oldval = vhpet->timer[n].cap_config; + + newval = oldval; + update_register(&newval, data, mask); + newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); + newval |= oldval & HPET_TCAP_RO_MASK; + + if (newval == oldval) + return; + + vhpet->timer[n].cap_config = newval; + VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016llx", n, newval); + + /* + * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. + * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set + * it to the default value of 0. + */ + allowed_irqs = vhpet->timer[n].cap_config >> 32; + new_pin = vhpet_timer_ioapic_pin(vhpet, n); + if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { + VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " + "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); + new_pin = 0; + vhpet->timer[n].cap_config &= ~((uint64_t) HPET_TCNF_INT_ROUTE); + } + + if (!vhpet_periodic_timer(vhpet, n)) + vhpet->timer[n].comprate = 0; + + /* + * If the timer's ISR bit is set then clear it in the following cases: + * - interrupt is disabled + * - interrupt type is changed from level to edge or fsb. + * - interrupt routing is changed + * + * This is to ensure that this timer's level triggered interrupt does + * not remain asserted forever. 
+ */ + if (vhpet->isr & (1 << n)) { + KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", + n, old_pin)); + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_msi_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_edge_trig(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) + clear_isr = true; + else + clear_isr = false; + + if (clear_isr) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " + "configuration change", n); + vioapic_deassert_irq(vhpet->vm, old_pin); + vhpet->isr &= ~(1 << n); + } + } +} + +int +vhpet_mmio_write(void *vm, UNUSED int vcpuid, uint64_t gpa, uint64_t val, int size, + UNUSED void *arg) +{ + struct vhpet *vhpet; + uint64_t data, mask, oldval, val64; + uint32_t isr_clear_mask, old_compval, old_comprate, counter; + sbintime_t now, *nowptr; + int i, offset; + + now = 0; + vhpet = vm_hpet(vm); + offset = (int) (gpa - VHPET_BASE); + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + switch (size) { + case 8: + mask = 0xffffffffffffffff; + data = val; + break; + case 4: + mask = 0xffffffff; + data = val; + if ((offset & 0x4) != 0) { + mask <<= 32; + data <<= 32; + } + break; + default: + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + /* + * Get the most recent value of the counter before updating + * the 'config' register. If the HPET is going to be disabled + * then we need to update 'countbase' with the value right + * before it is disabled. + */ + nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL; + counter = vhpet_counter(vhpet, nowptr); + oldval = vhpet->config; + update_register(&vhpet->config, data, mask); + + /* + * LegacyReplacement Routing is not supported so clear the + * bit explicitly. 
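The size-4 case above widens a 32-bit access into a 64-bit masked update. Combined with the `update_register()` helper defined earlier in this file, the effect on the high half of a register looks like this (standalone editor's sketch; the register value is invented for illustration):

```c
/* How a 4-byte write to the high half of a 64-bit HPET register is
 * widened, as in vhpet_mmio_write(). */
#include <stdio.h>
#include <stdint.h>

static void
update_register(uint64_t *regptr, uint64_t data, uint64_t mask)
{
	*regptr &= ~mask;
	*regptr |= (data & mask);
}

int main(void)
{
	uint64_t reg = 0x1111111122222222ULL;
	uint64_t data = 0xdeadbeefULL;
	uint64_t mask = 0xffffffffULL;

	/* offset & 0x4 set: the access targets bits 63:32. */
	data <<= 32;
	mask <<= 32;
	update_register(&reg, data, mask);
	printf("%#llx\n", (unsigned long long)reg); /* 0xdeadbeef22222222 */
	return 0;
}
```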
+ */ + vhpet->config &= ~((uint64_t) HPET_CNF_LEG_RT); + + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { + if (vhpet_counter_enabled(vhpet)) { + vhpet_start_counting(vhpet); + VM_CTR0(vhpet->vm, "hpet enabled"); + } else { + vhpet_stop_counting(vhpet, counter, now); + VM_CTR0(vhpet->vm, "hpet disabled"); + } + } + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + isr_clear_mask = (uint32_t) (vhpet->isr & data); + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if ((isr_clear_mask & (1 << i)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); + vhpet_timer_clear_isr(vhpet, i); + } + } + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + /* Zero-extend the counter to 64-bits before updating it */ + val64 = vhpet_counter(vhpet, NULL); + update_register(&val64, data, mask); + vhpet->countbase = (uint32_t) val64; + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + vhpet_timer_update_config(vhpet, i, data, mask); + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + old_compval = vhpet->timer[i].compval; + old_comprate = vhpet->timer[i].comprate; + if (vhpet_periodic_timer(vhpet, i)) { + /* + * In periodic mode writes to the comparator + * change the 'compval' register only if the + * HPET_TCNF_VAL_SET bit is set in the config + * register. + */ + val64 = vhpet->timer[i].comprate; + update_register(&val64, data, mask); + vhpet->timer[i].comprate = (uint32_t) val64; + if ((vhpet->timer[i].cap_config & + HPET_TCNF_VAL_SET) != 0) { + vhpet->timer[i].compval = (uint32_t) val64; + } + } else { + KASSERT(vhpet->timer[i].comprate == 0, + ("vhpet one-shot timer %d has invalid " + "rate %u", i, vhpet->timer[i].comprate)); + val64 = vhpet->timer[i].compval; + update_register(&val64, data, mask); + vhpet->timer[i].compval = (uint32_t) val64; + } + vhpet->timer[i].cap_config &= ~((uint64_t) HPET_TCNF_VAL_SET); + + if (vhpet->timer[i].compval != old_compval || + vhpet->timer[i].comprate != old_comprate) { + if (vhpet_counter_enabled(vhpet)) { + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, i, counter, + now); + } + } + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + update_register(&vhpet->timer[i].msireg, data, mask); + break; + } + } +done: + VHPET_UNLOCK(vhpet); + return (0); +} + +int +vhpet_mmio_read(void *vm, UNUSED int vcpuid, uint64_t gpa, uint64_t *rval, int size, + UNUSED void *arg) +{ + int i, offset; + struct vhpet *vhpet; + uint64_t data; + + data = 0; + vhpet = vm_hpet(vm); + offset = (int) (gpa - VHPET_BASE); + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + if (size != 4 && size != 8) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { + data = vhpet_capabilities(); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + data = vhpet->config; + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + data = vhpet->isr; + 
goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + data = vhpet_counter(vhpet, NULL); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + data = vhpet->timer[i].cap_config; + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + data = vhpet->timer[i].compval; + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + data = vhpet->timer[i].msireg; + break; + } + } + + if (i >= VHPET_NUM_TIMERS) + data = 0; +done: + VHPET_UNLOCK(vhpet); + + if (size == 4) { + if (offset & 0x4) + data >>= 32; + } + *rval = data; + return (0); +} + +struct vhpet * +vhpet_init(struct vm *vm) +{ + int i, pincount; + struct vhpet *vhpet; + uint64_t allowed_irqs; + struct vhpet_callout_arg *arg; + struct bintime bt; + + vhpet = malloc(sizeof(struct vhpet)); + assert(vhpet); + bzero(vhpet, sizeof(struct vhpet)); + vhpet->vm = vm; + + pthread_mutex_init(&vhpet->mtx, NULL); + + FREQ2BT(HPET_FREQ, &bt); + vhpet->freq_sbt = bttosbt(bt); + + pincount = vioapic_pincount(vm); + if (pincount >= 24) + allowed_irqs = 0x00f00000; /* irqs 20, 21, 22 and 23 */ + else + allowed_irqs = 0; + + /* + * Initialize HPET timer hardware state. + */ + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + vhpet->timer[i].cap_config = allowed_irqs << 32; + vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; + vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; + + vhpet->timer[i].compval = 0xffffffff; + callout_init(&vhpet->timer[i].callout, 1); + + arg = &vhpet->timer[i].arg; + arg->vhpet = vhpet; + arg->timer_num = i; + } + + return (vhpet); +} + +void +vhpet_cleanup(struct vhpet *vhpet) +{ + int i; + + for (i = 0; i < VHPET_NUM_TIMERS; i++) + callout_drain(&vhpet->timer[i].callout); + + free(vhpet); +} + +int +vhpet_getcap(uint32_t *cap) +{ + *cap = (uint32_t) vhpet_capabilities(); + return (0); +} diff --git a/vendor/github.com/hooklift/xhyve/vioapic.c b/vendor/github.com/hooklift/xhyve/vioapic.c new file mode 100644 index 0000000..55005c4 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vioapic.c @@ -0,0 +1,483 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IOREGSEL 0x00 +#define IOWIN 0x10 + +#define REDIR_ENTRIES 24 +#define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct vioapic { + struct vm *vm; + OSSpinLock lock; + uint32_t id; + uint32_t ioregsel; + struct { + uint64_t reg; + int acnt; /* sum of pin asserts (+1) and deasserts (-1) */ + } rtbl[REDIR_ENTRIES]; +}; +#pragma clang diagnostic pop + +#define VIOAPIC_LOCK_INIT(v) (v)->lock = OS_SPINLOCK_INIT; +#define VIOAPIC_LOCK(v) OSSpinLockLock(&(v)->lock) +#define VIOAPIC_UNLOCK(v) OSSpinLockUnlock(&(v)->lock) + +#define VIOAPIC_CTR1(vioapic, fmt, a1) \ + VM_CTR1((vioapic)->vm, fmt, a1) + +#define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ + VM_CTR2((vioapic)->vm, fmt, a1, a2) + +#ifdef XHYVE_CONFIG_TRACE +#define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ + VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) +#endif + +#ifdef XHYVE_CONFIG_TRACE +static const char * +pinstate_str(bool asserted) +{ + + if (asserted) + return ("asserted"); + else + return ("deasserted"); +} +#endif + +static void +vioapic_send_intr(struct vioapic *vioapic, int pin) +{ + int vector, delmode; + uint32_t low, high, dest; + bool level, phys; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + low = (uint32_t) vioapic->rtbl[pin].reg; + high = (uint32_t) (vioapic->rtbl[pin].reg >> 32); + + if ((low & IOART_INTMASK) == IOART_INTMSET) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); + return; + } + + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + level = low & IOART_TRGRLVL ? 
true : false; + if (level) + vioapic->rtbl[pin].reg |= IOART_REM_IRR; + + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); +} + +static void +vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) +{ + int oldcnt, newcnt; + bool needintr; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + oldcnt = vioapic->rtbl[pin].acnt; + if (newstate) + vioapic->rtbl[pin].acnt++; + else + vioapic->rtbl[pin].acnt--; + newcnt = vioapic->rtbl[pin].acnt; + + if (newcnt < 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d", + pin, newcnt); + } + + needintr = false; + if (oldcnt == 0 && newcnt == 1) { + needintr = true; + VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin); + } else if (oldcnt == 1 && newcnt == 0) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin); + } else { +#ifdef XHYVE_CONFIG_TRACE + VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d", + pin, pinstate_str(newstate), newcnt); +#endif + } + if (needintr) + vioapic_send_intr(vioapic, pin); +} + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +static int +vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vioapic *vioapic; + + if (irq < 0 || irq >= REDIR_ENTRIES) + return (EINVAL); + + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vioapic_set_pinstate(vioapic, irq, true); + break; + case IRQSTATE_DEASSERT: + vioapic_set_pinstate(vioapic, irq, false); + break; + case IRQSTATE_PULSE: + vioapic_set_pinstate(vioapic, irq, true); + vioapic_set_pinstate(vioapic, irq, false); + break; + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_assert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vioapic_deassert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vioapic_pulse_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +/* + * Reset the vlapic's trigger-mode register to reflect the ioapic pin + * configuration. + */ +static void +vioapic_update_tmr(struct vm *vm, int vcpuid, UNUSED void *arg) +{ + struct vioapic *vioapic; + struct vlapic *vlapic; + uint32_t low, high, dest; + int delmode, pin, vector; + bool level, phys; + + vlapic = vm_lapic(vm, vcpuid); + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + /* + * Reset all vectors to be edge-triggered. + */ + vlapic_reset_tmr(vlapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + low = (uint32_t) vioapic->rtbl[pin].reg; + high = (uint32_t) (vioapic->rtbl[pin].reg >> 32); + + level = low & IOART_TRGRLVL ? true : false; + if (!level) + continue; + + /* + * For a level-triggered 'pin' let the vlapic figure out if + * an assertion on this 'pin' would result in an interrupt + * being delivered to it. If yes, then it will modify the + * TMR bit associated with this vector to level-triggered. 
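The send path above splits each 64-bit redirection table entry into two 32-bit halves before picking out the vector, destination, and trigger mode. A standalone decode (editor's sketch; the field constants are the ones FreeBSD's apicreg.h defines, which this vendored code relies on):

```c
/* Decoding a 64-bit ioapic redirection table entry, mirroring
 * vioapic_send_intr(). */
#include <stdio.h>
#include <stdint.h>

#define IOART_INTVEC  0x000000ff /* interrupt vector */
#define IOART_TRGRLVL 0x00008000 /* level trigger mode */
#define APIC_ID_SHIFT 24

int main(void)
{
	uint64_t reg = 0xff00000000008030ULL; /* dest 0xff, level, vec 0x30 */
	uint32_t low = (uint32_t)reg;
	uint32_t high = (uint32_t)(reg >> 32);

	printf("vector %#x dest %#x level %d\n",
	    (unsigned)(low & IOART_INTVEC),
	    (unsigned)(high >> APIC_ID_SHIFT),
	    (low & IOART_TRGRLVL) ? 1 : 0);
	return 0;
}
```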
+ */ + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + } + VIOAPIC_UNLOCK(vioapic); +} + +static uint32_t +vioapic_read(struct vioapic *vioapic, UNUSED int vcpuid, uint32_t addr) +{ + int regnum, pin, rshift; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + return (vioapic->id); + case IOAPIC_VER: + return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); + case IOAPIC_ARB: + return (vioapic->id); + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + rshift = 32; + else + rshift = 0; + + return ((uint32_t) (vioapic->rtbl[pin].reg >> rshift)); + } + + return (0); +} + +static void +vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) +{ + uint64_t data64, mask64; + uint64_t last, changed; + int regnum, pin, lshift; + cpuset_t allvcpus; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + vioapic->id = data & APIC_ID_MASK; + break; + case IOAPIC_VER: + case IOAPIC_ARB: + /* readonly */ + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + lshift = 32; + else + lshift = 0; + + last = vioapic->rtbl[pin].reg; + + data64 = (uint64_t)data << lshift; + mask64 = (uint64_t)0xffffffff << lshift; + vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; + vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; + + VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#llx", + pin, vioapic->rtbl[pin].reg); + + /* + * If any fields in the redirection table entry (except mask + * or polarity) have changed then rendezvous all the vcpus + * to update their vlapic trigger-mode registers. + */ + changed = last ^ vioapic->rtbl[pin].reg; + if (changed & ~((uint64_t) (IOART_INTMASK | IOART_INTPOL))) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " + "vlapic trigger-mode register", pin); + VIOAPIC_UNLOCK(vioapic); + allvcpus = vm_active_cpus(vioapic->vm); + vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, + vioapic_update_tmr, NULL); + VIOAPIC_LOCK(vioapic); + } + + /* + * Generate an interrupt if the following conditions are met: + * - pin is not masked + * - previous interrupt has been EOIed + * - pin level is asserted + */ + if ((vioapic->rtbl[pin].reg & IOART_INTMASK) == IOART_INTMCLR && + (vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0 && + (vioapic->rtbl[pin].acnt > 0)) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " + "write, acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } +} + +static int +vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa, + uint64_t *data, int size, bool doread) +{ + uint64_t offset; + + offset = gpa - VIOAPIC_BASE; + + /* + * The IOAPIC specification allows 32-bit wide accesses to the + * IOREGSEL (offset 0) and IOWIN (offset 16) registers. 
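vioapic_read()/vioapic_write() implement the classic index/data pair: the guest writes a register number to IOREGSEL, then reads or writes the selected register through IOWIN. A toy model of that access pattern, an editor's sketch with an invented register file (assuming MAXREDIRSHIFT of 16, as in apicreg.h):

```c
/* Minimal model of the IOREGSEL/IOWIN indexed-access protocol. */
#include <stdio.h>
#include <stdint.h>

#define IOREGSEL   0x00
#define IOWIN      0x10
#define IOAPIC_VER 0x01

static uint32_t ioregsel; /* toy state standing in for the emulated chip */

static uint32_t
ioapic_reg_read(uint32_t regnum)
{
	if (regnum == IOAPIC_VER)
		return ((24 - 1) << 16) | 0x11; /* 24 pins, version 0x11 */
	return 0;
}

static void
mmio_write32(uint32_t offset, uint32_t val)
{
	if (offset == IOREGSEL)
		ioregsel = val; /* select which register IOWIN exposes */
}

static uint32_t
mmio_read32(uint32_t offset)
{
	return (offset == IOWIN) ? ioapic_reg_read(ioregsel) : 0;
}

int main(void)
{
	mmio_write32(IOREGSEL, IOAPIC_VER); /* select version register */
	printf("IOAPIC_VER = %#x\n", (unsigned)mmio_read32(IOWIN));
	return 0;
}
```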
+ */ + if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { + if (doread) + *data = 0; + return (0); + } + + VIOAPIC_LOCK(vioapic); + if (offset == IOREGSEL) { + if (doread) + *data = vioapic->ioregsel; + else + vioapic->ioregsel = (uint32_t) *data; + } else { + if (doread) { + *data = vioapic_read(vioapic, vcpuid, + vioapic->ioregsel); + } else { + vioapic_write(vioapic, vcpuid, vioapic->ioregsel, + ((uint32_t) *data)); + } + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, + int size, UNUSED void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true); + + return (error); +} + +int +vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, + int size, UNUSED void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false); + return (error); +} + +void +vioapic_process_eoi(struct vm *vm, UNUSED int vcpuid, int vector) +{ + struct vioapic *vioapic; + int pin; + + KASSERT(vector >= 0 && vector < 256, + ("vioapic_process_eoi: invalid vector %d", vector)); + + vioapic = vm_ioapic(vm); + VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); + + /* + * XXX keep track of the pins associated with this vector instead + * of iterating on every single pin each time. + */ + VIOAPIC_LOCK(vioapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) + continue; + if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) + continue; + vioapic->rtbl[pin].reg &= ~((uint64_t) IOART_REM_IRR); + if (vioapic->rtbl[pin].acnt > 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " + "acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } + VIOAPIC_UNLOCK(vioapic); +} + +struct vioapic * +vioapic_init(struct vm *vm) +{ + int i; + struct vioapic *vioapic; + + vioapic = malloc(sizeof(struct vioapic)); + assert(vioapic); + bzero(vioapic, sizeof(struct vioapic)); + vioapic->vm = vm; + + VIOAPIC_LOCK_INIT(vioapic); + + /* Initialize all redirection entries to mask all interrupts */ + for (i = 0; i < REDIR_ENTRIES; i++) + vioapic->rtbl[i].reg = 0x0001000000010000UL; + + return (vioapic); +} + +void +vioapic_cleanup(struct vioapic *vioapic) +{ + free(vioapic); +} + +int +vioapic_pincount(UNUSED struct vm *vm) +{ + return (REDIR_ENTRIES); +} diff --git a/vendor/github.com/hooklift/xhyve/virtio.c b/vendor/github.com/hooklift/xhyve/virtio.c new file mode 100644 index 0000000..59ad266 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/virtio.c @@ -0,0 +1,771 @@ +/*- + * Copyright (c) 2013 Chris Torek + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Functions for dealing with generalized "virtual devices" as + * defined by + */ + +/* + * In case we decide to relax the "virtio softc comes at the + * front of virtio-based device softc" constraint, let's use + * this to convert. + */ +#define DEV_SOFTC(vs) ((void *)(vs)) + +/* + * Link a virtio_softc to its constants, the device softc, and + * the PCI emulation. + */ +void +vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues) +{ + int i; + + /* vs and dev_softc addresses must match */ + assert((void *)vs == dev_softc); + vs->vs_vc = vc; + vs->vs_pi = pi; + pi->pi_arg = vs; + + vs->vs_queues = queues; + for (i = 0; i < vc->vc_nvq; i++) { + queues[i].vq_vs = vs; + queues[i].vq_num = (uint16_t) i; + } +} + +/* + * Reset device (device-wide). This erases all queues, i.e., + * all the queues become invalid (though we don't wipe out the + * internal pointers, we just clear the VQ_ALLOC flag). + * + * It resets negotiated features to "none". + * + * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. + */ +void +vi_reset_dev(struct virtio_softc *vs) +{ + struct vqueue_info *vq; + int i, nvq; + + nvq = vs->vs_vc->vc_nvq; + for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { + vq->vq_flags = 0; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; + vq->vq_pfn = 0; + vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; + } + vs->vs_negotiated_caps = 0; + vs->vs_curq = 0; + /* vs->vs_status = 0; -- redundant */ + if (vs->vs_isr) + pci_lintr_deassert(vs->vs_pi); + vs->vs_isr = 0; + vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; +} + +/* + * Set I/O BAR (usually 0) to map PCI config registers. + */ +void +vi_set_io_bar(struct virtio_softc *vs, int barnum) +{ + size_t size; + + /* + * ??? should we use CFG0 if MSI-X is disabled? + * Existing code did not... + */ + size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; + pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); +} + +/* + * Initialize MSI-X vector capabilities if we're to use MSI-X, + * or MSI capabilities if not. + * + * We assume we want one MSI-X vector per queue, here, plus one + * for the config vec. 
+ */ +int +vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) +{ + int nvec; + + if (use_msix) { + vs->vs_flags |= VIRTIO_USE_MSIX; + VS_LOCK(vs); + vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ + VS_UNLOCK(vs); + nvec = vs->vs_vc->vc_nvq + 1; + if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) + return (1); + } else + vs->vs_flags &= ~VIRTIO_USE_MSIX; + + /* Only 1 MSI vector for bhyve */ + pci_emul_add_msicap(vs->vs_pi, 1); + + /* Legacy interrupts are mandatory for virtio devices */ + pci_lintr_request(vs->vs_pi); + + return (0); +} + +/* + * Initialize the currently-selected virtio queue (vs->vs_curq). + * The guest just gave us a page frame number, from which we can + * calculate the addresses of the queue. + */ +static void +vi_vq_init(struct virtio_softc *vs, uint32_t pfn) +{ + struct vqueue_info *vq; + uint64_t phys; + size_t size; + char *base; + + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_pfn = pfn; + phys = (uint64_t)pfn << VRING_PFN; + size = vring_size(vq->vq_qsize); + base = paddr_guest2host(phys, size); + + /* First page(s) are descriptors... */ + vq->vq_desc = (struct virtio_desc *)base; + base += vq->vq_qsize * sizeof(struct virtio_desc); + + /* ... immediately followed by "avail" ring (entirely uint16_t's) */ + vq->vq_avail = (struct vring_avail *)base; + base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + + /* Then it's rounded up to the next page... */ + base = (char *) roundup2(((uintptr_t) base), ((uintptr_t) VRING_ALIGN)); + + /* ... and the last page(s) are the used ring. */ + vq->vq_used = (struct vring_used *)base; + + /* Mark queue as allocated, and start at 0 when we use it. */ + vq->vq_flags = VQ_ALLOC; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; +} + +/* + * Helper inline for vq_getchain(): record the i'th "real" + * descriptor. + */ +static inline void +_vq_record(int i, volatile struct virtio_desc *vd, struct iovec *iov, int n_iov, + uint16_t *flags) +{ + if (i >= n_iov) + return; + iov[i].iov_base = paddr_guest2host(vd->vd_addr, vd->vd_len); + iov[i].iov_len = vd->vd_len; + if (flags != NULL) + flags[i] = vd->vd_flags; +} +#define VQ_MAX_DESCRIPTORS 512 /* see below */ + +/* + * Examine the chain of descriptors starting at the "next one" to + * make sure that they describe a sensible request. If so, return + * the number of "real" descriptors that would be needed/used in + * acting on this request. This may be smaller than the number of + * available descriptors, e.g., if there are two available but + * they are two separate requests, this just returns 1. Or, it + * may be larger: if there are indirect descriptors involved, + * there may only be one descriptor available but it may be an + * indirect pointing to eight more. We return 8 in this case, + * i.e., we do not count the indirect descriptors, only the "real" + * ones. + * + * Basically, this vets the vd_flags and vd_next field of each + * descriptor and tells you how many are involved. Since some may + * be indirect, this also needs the vmctx (in the pci_devinst + * at vs->vs_pi) so that it can find indirect descriptors. + * + * As we process each descriptor, we copy and adjust it (guest to + * host address wise, also using the vmtctx) into the given iov[] + * array (of the given size). If the array overflows, we stop + * placing values into the array but keep processing descriptors, + * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. 
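vi_vq_init() lays the ring out as descriptors, then the avail ring, then the used ring on the next VRING_ALIGN boundary. The offsets for a 256-entry queue, assuming the standard 16-byte legacy descriptor and 4096-byte alignment (editor's sketch, not part of the vendored file):

```c
/* Worked layout of a legacy virtio ring, following vi_vq_init(). */
#include <stdio.h>
#include <stddef.h>

#define VRING_ALIGN 4096
#define DESC_SIZE   16 /* sizeof(struct virtio_desc) */

static size_t
roundup2(size_t x, size_t a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	size_t qsize = 256;
	size_t desc_bytes  = qsize * DESC_SIZE;
	/* avail: flags + idx + ring[qsize] + used_event, all uint16_t */
	size_t avail_bytes = (2 + qsize + 1) * 2;
	size_t used_off    = roundup2(desc_bytes + avail_bytes, VRING_ALIGN);

	/* prints: desc at 0, avail at 4096, used at 8192 */
	printf("desc at 0, avail at %zu, used at %zu\n",
	    desc_bytes, used_off);
	return 0;
}
```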
+ * So you, the caller, must not assume that iov[] is as big as the + * return value (you can process the same thing twice to allocate + * a larger iov array if needed, or supply a zero length to find + * out how much space is needed). + * + * If you want to verify the WRITE flag on each descriptor, pass a + * non-NULL "flags" pointer to an array of "uint16_t" of the same size + * as n_iov and we'll copy each vd_flags field after unwinding any + * indirects. + * + * If some descriptor(s) are invalid, this prints a diagnostic message + * and returns -1. If no descriptors are ready now it simply returns 0. + * + * You are assumed to have done a vq_ring_ready() if needed (note + * that vq_has_descs() does one). + */ +int +vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, + int n_iov, uint16_t *flags) +{ + int i; + u_int ndesc, n_indir; + u_int idx, next; + volatile struct virtio_desc *vdir, *vindir, *vp; + struct virtio_softc *vs; + const char *name; + + vs = vq->vq_vs; + name = vs->vs_vc->vc_name; + + /* + * Note: it's the responsibility of the guest not to + * update vq->vq_avail->va_idx until all of the descriptors + * the guest has written are valid (including all their + * vd_next fields and vd_flags). + * + * Compute (last_avail - va_idx) in integers mod 2**16. This is + * the number of descriptors the device has made available + * since the last time we updated vq->vq_last_avail. + * + * We just need to do the subtraction as an unsigned int, + * then trim off excess bits. + */ + idx = vq->vq_last_avail; + ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); + if (ndesc == 0) + return (0); + if (ndesc > vq->vq_qsize) { + /* XXX need better way to diagnose issues */ + fprintf(stderr, + "%s: ndesc (%u) out of range, driver confused?\r\n", + name, (u_int)ndesc); + return (-1); + } + + /* + * Now count/parse "involved" descriptors starting from + * the head of the chain. + * + * To prevent loops, we could be more complicated and + * check whether we're re-visiting a previously visited + * index, but we just abort if the count gets excessive. + */ + *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + vq->vq_last_avail++; + for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { + if (next >= vq->vq_qsize) { + fprintf(stderr, + "%s: descriptor index %u out of range, " + "driver confused?\r\n", + name, next); + return (-1); + } + vdir = &vq->vq_desc[next]; + if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { + _vq_record(i, vdir, iov, n_iov, flags); + i++; + } else if ((vs->vs_vc->vc_hv_caps & + VIRTIO_RING_F_INDIRECT_DESC) == 0) { + fprintf(stderr, + "%s: descriptor has forbidden INDIRECT flag, " + "driver confused?\r\n", + name); + return (-1); + } else { + n_indir = vdir->vd_len / 16; + if ((vdir->vd_len & 0xf) || n_indir == 0) { + fprintf(stderr, + "%s: invalid indir len 0x%x, " + "driver confused?\r\n", + name, (u_int)vdir->vd_len); + return (-1); + } + vindir = paddr_guest2host(vdir->vd_addr, vdir->vd_len); + /* + * Indirects start at the 0th, then follow + * their own embedded "next"s until those run + * out. Each one's indirect flag must be off + * (we don't really have to check, could just + * ignore errors...). 
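The ndesc computation above is plain mod-2^16 arithmetic on free-running indices; a compilable check (editor's sketch):

```c
/* The available-index arithmetic from vq_getchain(): both indices
 * increment forever and only their difference mod 2^16 matters. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t va_idx = 3;         /* guest has published up to here */
	uint16_t last_avail = 65534; /* device has consumed up to here */

	/* unsigned subtraction, then trim to 16 bits: 5 new chains */
	uint16_t ndesc = (uint16_t)((unsigned)va_idx - last_avail);
	printf("%u\n", (unsigned)ndesc);
	return 0;
}
```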
+ */ + next = 0; + for (;;) { + vp = &vindir[next]; + if (vp->vd_flags & VRING_DESC_F_INDIRECT) { + fprintf(stderr, + "%s: indirect desc has INDIR flag," + " driver confused?\r\n", + name); + return (-1); + } + _vq_record(i, vp, iov, n_iov, flags); + if (++i > VQ_MAX_DESCRIPTORS) + goto loopy; + if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + next = vp->vd_next; + if (next >= n_indir) { + fprintf(stderr, + "%s: invalid next %u > %u, " + "driver confused?\r\n", + name, (u_int)next, n_indir); + return (-1); + } + } + } + if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) + return (i); + } +loopy: + fprintf(stderr, + "%s: descriptor loop? count > %d - driver confused?\r\n", + name, i); + return (-1); +} + +/* + * Return the currently-first request chain back to the available queue. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_retchain(struct vqueue_info *vq) +{ + + vq->vq_last_avail--; +} + +/* + * Return specified request chain to the guest, setting its I/O length + * to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + uint16_t uidx, mask; + volatile struct vring_used *vuh; + volatile struct virtio_used *vue; + + /* + * Notes: + * - mask is N-1 where N is a power of 2 so computes x % N + * - vuh points to the "used" data shared with guest + * - vue points to the "used" ring entry we want to update + * - head is the same value we compute in vq_iovecs(). + * + * (I apologize for the two fields named vu_idx; the + * virtio spec calls the one that vue points to, "id"...) + */ + mask = vq->vq_qsize - 1; + vuh = vq->vq_used; + + uidx = vuh->vu_idx; + vue = &vuh->vu_ring[uidx++ & mask]; + vue->vu_idx = idx; + vue->vu_tlen = iolen; + vuh->vu_idx = uidx; +} + +/* + * Driver has finished processing "available" chains and calling + * vq_relchain on each one. If driver used all the available + * chains, used_all should be set. + * + * If the "used" index moved we may need to inform the guest, i.e., + * deliver an interrupt. Even if the used index did NOT move we + * may need to deliver an interrupt, if the avail ring is empty and + * we are supposed to interrupt on empty. + * + * Note that used_all_avail is provided by the caller because it's + * a snapshot of the ring state when he decided to finish interrupt + * processing -- it's possible that descriptors became available after + * that point. (It's also typically a constant 1/True as well.) + */ +void +vq_endchains(struct vqueue_info *vq, int used_all_avail) +{ + struct virtio_softc *vs; + uint16_t event_idx, new_idx, old_idx; + int intr; + + /* + * Interrupt generation: if we're using EVENT_IDX, + * interrupt if we've crossed the event threshold. + * Otherwise interrupt is generated if we added "used" entries, + * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. + * + * In any case, though, if NOTIFY_ON_EMPTY is set and the + * entire avail was processed, we need to interrupt always. + */ + vs = vq->vq_vs; + old_idx = vq->vq_save_used; + vq->vq_save_used = new_idx = vq->vq_used->vu_idx; + if (used_all_avail && + (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) + intr = 1; + else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { + event_idx = VQ_USED_EVENT_IDX(vq); + /* + * This calculation is per docs and the kernel + * (see src/sys/dev/virtio/virtio_ring.h). 
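The EVENT_IDX test that follows is easy to get backwards, so here is a standalone version with two concrete cases, an editor's sketch mirroring the expression in the code:

```c
/* The EVENT_IDX suppression test used in vq_endchains(): interrupt
 * only if used->idx crossed the guest's event index this round. */
#include <stdio.h>
#include <stdint.h>

static int
crossed_event(uint16_t new_idx, uint16_t old_idx, uint16_t event_idx)
{
	return (uint16_t)(new_idx - event_idx - 1) <
	    (uint16_t)(new_idx - old_idx);
}

int main(void)
{
	/* used idx moved 10..15; guest asked to be told at 12: fire. */
	printf("%d\n", crossed_event(15, 10, 12)); /* 1 */
	/* guest asked for 20, which is not reached yet: suppress. */
	printf("%d\n", crossed_event(15, 10, 20)); /* 0 */
	return 0;
}
```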
+ */ + intr = (uint16_t)(new_idx - event_idx - 1) < + (uint16_t)(new_idx - old_idx); + } else { + intr = new_idx != old_idx && + !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); + } + if (intr) + vq_interrupt(vs, vq); +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +/* Note: these are in sorted order to make for a fast search */ +static struct config_reg { + uint16_t cr_offset; /* register offset */ + uint8_t cr_size; /* size (bytes) */ + uint8_t cr_ro; /* true => reg is read only */ + const char *cr_name; /* name of reg */ +} config_regs[] = { + { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" }, + { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" }, + { VTCFG_R_PFN, 4, 0, "PFN" }, + { VTCFG_R_QNUM, 2, 1, "QNUM" }, + { VTCFG_R_QSEL, 2, 0, "QSEL" }, + { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" }, + { VTCFG_R_STATUS, 1, 0, "STATUS" }, + { VTCFG_R_ISR, 1, 0, "ISR" }, + { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" }, + { VTCFG_R_QVEC, 2, 0, "QVEC" }, +}; +#pragma clang diagnostic pop + +static inline struct config_reg * +vi_find_cr(int offset) { + u_int hi, lo, mid; + struct config_reg *cr; + + lo = 0; + hi = sizeof(config_regs) / sizeof(*config_regs) - 1; + while (hi >= lo) { + mid = (hi + lo) >> 1; + cr = &config_regs[mid]; + if (cr->cr_offset == offset) + return (cr); + if (cr->cr_offset < offset) + lo = mid + 1; + else + hi = mid - 1; + } + return (NULL); +} + +/* + * Handle pci config space reads. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +uint64_t +vi_pci_read(UNUSED int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct virtio_softc *vs = pi->pi_arg; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + uint32_t value; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + return (pci_emul_msix_tread(pi, offset, size)); + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + * If that fails, fall into general code. + */ + newoff = (uint32_t) (offset - virtio_config_size); + max = vc->vc_cfgsize ? 
vc->vc_cfgsize : 0x100000000; + if ((newoff + ((unsigned) size)) > max) + goto bad; + error = (*vc->vc_cfgread)(DEV_SOFTC(vs), ((int) newoff), size, &value); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr((int) offset); + if (cr == NULL || cr->cr_size != size) { + if (cr != NULL) { + /* offset must be OK, so size must be bad */ + fprintf(stderr, + "%s: read from %s: bad size %d\r\n", + name, cr->cr_name, size); + } else { + fprintf(stderr, + "%s: read from bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_HOSTCAP: + value = (uint32_t) vc->vc_hv_caps; + break; + case VTCFG_R_GUESTCAP: + value = vs->vs_negotiated_caps; + break; + case VTCFG_R_PFN: + if (vs->vs_curq < vc->vc_nvq) + value = vs->vs_queues[vs->vs_curq].vq_pfn; + break; + case VTCFG_R_QNUM: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_qsize : 0; + break; + case VTCFG_R_QSEL: + value = (uint32_t) (vs->vs_curq); + break; + case VTCFG_R_QNOTIFY: + value = 0; /* XXX */ + break; + case VTCFG_R_STATUS: + value = vs->vs_status; + break; + case VTCFG_R_ISR: + value = vs->vs_isr; + vs->vs_isr = 0; /* a read clears this flag */ + if (value) + pci_lintr_deassert(pi); + break; + case VTCFG_R_CFGVEC: + value = vs->vs_msix_cfg_idx; + break; + case VTCFG_R_QVEC: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_msix_idx : + VIRTIO_MSI_NO_VECTOR; + break; + } +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); + return (value); +} + +/* + * Handle pci config space writes. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +void +vi_pci_write(UNUSED int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct virtio_softc *vs = pi->pi_arg; + struct vqueue_info *vq; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + */ + newoff = (uint32_t) (offset - virtio_config_size); + max = vc->vc_cfgsize ? 
vc->vc_cfgsize : 0x100000000; + if ((newoff + ((unsigned) size)) > max) + goto bad; + error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), ((int) newoff), size, + ((uint32_t) value)); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr((int) offset); + if (cr == NULL || cr->cr_size != size || cr->cr_ro) { + if (cr != NULL) { + /* offset must be OK, wrong size and/or reg is R/O */ + if (cr->cr_size != size) + fprintf(stderr, + "%s: write to %s: bad size %d\r\n", + name, cr->cr_name, size); + if (cr->cr_ro) + fprintf(stderr, + "%s: write to read-only reg %s\r\n", + name, cr->cr_name); + } else { + fprintf(stderr, + "%s: write to bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_GUESTCAP: + vs->vs_negotiated_caps = (uint32_t) (value & vc->vc_hv_caps); + if (vc->vc_apply_features) + (*vc->vc_apply_features)(DEV_SOFTC(vs), + vs->vs_negotiated_caps); + break; + case VTCFG_R_PFN: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vi_vq_init(vs, ((uint32_t) value)); + break; + case VTCFG_R_QSEL: + /* + * Note that the guest is allowed to select an + * invalid queue; we just need to return a QNUM + * of 0 while the bad queue is selected. + */ + vs->vs_curq = (int) value; + break; + case VTCFG_R_QNOTIFY: + if (value >= ((uint64_t) vc->vc_nvq)) { + fprintf(stderr, "%s: queue %d notify out of range\r\n", + name, (int)value); + goto done; + } + vq = &vs->vs_queues[value]; + if (vq->vq_notify) + (*vq->vq_notify)(DEV_SOFTC(vs), vq); + else if (vc->vc_qnotify) + (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); + else + fprintf(stderr, + "%s: qnotify queue %d: missing vq/vc notify\r\n", + name, (int)value); + break; + case VTCFG_R_STATUS: + vs->vs_status = (uint8_t) value; + if (value == 0) + (*vc->vc_reset)(DEV_SOFTC(vs)); + break; + case VTCFG_R_CFGVEC: + vs->vs_msix_cfg_idx = (uint16_t) value; + break; + case VTCFG_R_QVEC: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_msix_idx = (uint16_t) value; + break; + } + goto done; + +bad_qindex: + fprintf(stderr, + "%s: write config reg %s: curq %d >= max %d\r\n", + name, cr->cr_name, vs->vs_curq, vc->vc_nvq); +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); +} diff --git a/vendor/github.com/hooklift/xhyve/vlapic.c b/vendor/github.com/hooklift/xhyve/vlapic.c new file mode 100644 index 0000000..62cf730 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vlapic.c @@ -0,0 +1,1706 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (16) + +#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) + +/* + * The 'vlapic->timer_lock' is used to provide mutual exclusion between the + * vlapic_callout_handler() and vcpu accesses to: + * - timer_freq_bt, timer_period_bt, timer_fire_bt + * - timer LVT register + */ +#define VLAPIC_TIMER_LOCK_INIT(v) (v)->timer_lock = OS_SPINLOCK_INIT; +#define VLAPIC_TIMER_LOCK(v) OSSpinLockLock(&(v)->timer_lock) +#define VLAPIC_TIMER_UNLOCK(v) OSSpinLockUnlock(&(v)->timer_lock) + +/* + * APIC timer frequency: + * - arbitrary but chosen to be in the ballpark of contemporary hardware. + * - power-of-two to avoid loss of precision when converted to a bintime. + */ +#define VLAPIC_BUS_FREQ (128 * 1024 * 1024) + +static __inline uint32_t +vlapic_get_id(struct vlapic *vlapic) +{ + + if (x2apic(vlapic)) + return ((uint32_t) vlapic->vcpuid); + else + return ((uint32_t) (vlapic->vcpuid << 24)); +} + +static uint32_t +x2apic_ldr(struct vlapic *vlapic) +{ + int apicid; + uint32_t ldr; + + apicid = (int) vlapic_get_id(vlapic); + ldr = 1 << (apicid & 0xf); + ldr |= (uint32_t) ((apicid & 0xffff0) << 12); + return (ldr); +} + +void +vlapic_dfr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + if (x2apic(vlapic)) { + VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", + lapic->dfr); + lapic->dfr = 0; + return; + } + + lapic->dfr &= APIC_DFR_MODEL_MASK; + lapic->dfr |= APIC_DFR_RESERVED; + + if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) { + VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); + } else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) { + VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); + } else { + VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); + } +} + +void +vlapic_ldr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + + /* LDR is read-only in x2apic mode */ + if (x2apic(vlapic)) { + VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", + lapic->ldr); + lapic->ldr = x2apic_ldr(vlapic); + } else { + lapic->ldr &= ~((unsigned) APIC_LDR_RESERVED); + VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); + } +} + +void +vlapic_id_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + /* + * We don't allow the ID register to be modified so reset it back to + * its default value. 
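 + * A guest write therefore appears to complete, but the register + * simply reads back the fixed per-vcpu ID again.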
+ */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); +} + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_1: + return (1); + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + xhyve_abort("vlapic_timer_divisor: invalid dcr 0x%08x\n", dcr); + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint32_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct bintime bt_now, bt_rem; + struct LAPIC *lapic; + uint32_t ccr; + + ccr = 0; + lapic = vlapic->apic_page; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_active(&vlapic->callout)) { + /* + * If the timer is scheduled to expire in the future then + * compute the value of 'ccr' based on the remaining time. + */ + binuptime(&bt_now); + if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { + bt_rem = vlapic->timer_fire_bt; + bintime_sub(&bt_rem, &bt_now); + ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); + ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; + } + } + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " + "icr_timer is %#x", ccr, lapic->icr_timer)); + VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", + ccr, lapic->icr_timer); + VLAPIC_TIMER_UNLOCK(vlapic); + return (ccr); +} + +void +vlapic_dcr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + int divisor; + + lapic = vlapic->apic_page; + VLAPIC_TIMER_LOCK(vlapic); + + divisor = vlapic_timer_divisor(lapic->dcr_timer); + VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", + lapic->dcr_timer, divisor); + + /* + * Update the timer frequency and the timer period. + * + * XXX changes to the frequency divider will not take effect until + * the timer is reloaded. + */ + FREQ2BT(((unsigned) (VLAPIC_BUS_FREQ / divisor)), &vlapic->timer_freq_bt); + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +void +vlapic_esr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->esr = vlapic->esr_pending; + vlapic->esr_pending = 0; +} + +int +vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *irrptr, *tmrptr, mask; + int idx; + + KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); + + lapic = vlapic->apic_page; + if (!(lapic->svr & APIC_SVR_ENABLE)) { + VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " + "interrupt %d", vector); + return (0); + } + + if (vector < 16) { + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); + VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", + vector); + return (1); + } + + if (vlapic->ops.set_intr_ready) + return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); + + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + + irrptr = &lapic->irr0; + atomic_set_int(&irrptr[idx], mask); + + /* + * Verify that the trigger-mode of the interrupt matches with + * the vlapic TMR registers. + */ + tmrptr = &lapic->tmr0; + if ((tmrptr[idx] & mask) != (level ? 
mask : 0)) { + VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " + "interrupt is %s-triggered", idx / 4, tmrptr[idx], + level ? "level" : "edge"); + } + + VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); + return (1); +} + +static __inline uint32_t * +vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = vlapic->apic_page; + int i; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + return (&lapic->lvt_cmci); + case APIC_OFFSET_TIMER_LVT: + case APIC_OFFSET_THERM_LVT: + case APIC_OFFSET_PERF_LVT: + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + case APIC_OFFSET_ERROR_LVT: + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i); + default: + xhyve_abort("vlapic_get_lvt: invalid LVT\n"); + } +} + +static __inline int +lvt_off_to_idx(uint32_t offset) +{ + int index; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + index = APIC_LVT_CMCI; + break; + case APIC_OFFSET_TIMER_LVT: + index = APIC_LVT_TIMER; + break; + case APIC_OFFSET_THERM_LVT: + index = APIC_LVT_THERMAL; + break; + case APIC_OFFSET_PERF_LVT: + index = APIC_LVT_PMC; + break; + case APIC_OFFSET_LINT0_LVT: + index = APIC_LVT_LINT0; + break; + case APIC_OFFSET_LINT1_LVT: + index = APIC_LVT_LINT1; + break; + case APIC_OFFSET_ERROR_LVT: + index = APIC_LVT_ERROR; + break; + default: + index = -1; + break; + } + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %#x", index, offset)); + + return (index); +} + +static __inline uint32_t +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + int idx; + uint32_t val; + + idx = lvt_off_to_idx(offset); + val = atomic_load_acq_32(&vlapic->lvt_last[idx]); + return (val); +} + +void +vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) +{ + uint32_t *lvtptr, mask, val; + struct LAPIC *lapic; + int idx; + + lapic = vlapic->apic_page; + lvtptr = vlapic_get_lvtptr(vlapic, offset); + val = *lvtptr; + idx = lvt_off_to_idx(offset); + + if (!(lapic->svr & APIC_SVR_ENABLE)) + val |= APIC_LVT_M; + mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; + switch (offset) { + case APIC_OFFSET_TIMER_LVT: + mask |= APIC_LVTT_TM; + break; + case APIC_OFFSET_ERROR_LVT: + break; + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; + /* FALLTHROUGH */ + default: + mask |= APIC_LVT_DM; + break; + } + val &= mask; + *lvtptr = val; + atomic_store_rel_32(&vlapic->lvt_last[idx], val); +} + +static void +vlapic_mask_lvts(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + lapic->lvt_cmci |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); + + lapic->lvt_timer |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); + + lapic->lvt_thermal |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); + + lapic->lvt_pcint |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); + + lapic->lvt_lint0 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); + + lapic->lvt_lint1 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); + + lapic->lvt_error |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); +} + +static int +vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) +{ + uint32_t vec, mode; + + if (lvt & APIC_LVT_M) + return (0); + + vec = lvt & APIC_LVT_VECTOR; + mode = lvt & APIC_LVT_DM; + + switch (mode) { + case APIC_LVT_DM_FIXED: + if (vec < 16) {
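+ /* + * Vectors 0-15 are architecturally reserved, so the attempt + * is recorded as an illegal-vector send error and dropped. + */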
vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + return (0); + } + if (vlapic_set_intr_ready(vlapic, ((int) vec), false)) + vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true); + break; + case APIC_LVT_DM_NMI: + vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + break; + case APIC_LVT_DM_EXTINT: + vm_inject_extint(vlapic->vm, vlapic->vcpuid); + break; + default: + // Other modes ignored + return (0); + } + return (1); +} + +#if 1 +static void +dump_isrvec_stk(struct vlapic *vlapic) +{ + int i; + uint32_t *isrptr; + + isrptr = &vlapic->apic_page->isr0; + for (i = 0; i < 8; i++) + printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); + + for (i = 0; i <= vlapic->isrvec_stk_top; i++) + printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); +} +#endif + +/* + * Algorithm adopted from section "Interrupt, Task and Processor Priority" + * in Intel Architecture Manual Vol 3a. + */ +static void +vlapic_update_ppr(struct vlapic *vlapic) +{ + int isrvec, tpr, ppr; + + /* + * Note that the value on the stack at index 0 is always 0. + * + * This is a placeholder for the value of ISRV when none of the + * bits is set in the ISRx registers. + */ + isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; + tpr = (int) vlapic->apic_page->tpr; + +#if 1 + { + int i, lastprio, curprio, vector, idx; + uint32_t *isrptr; + + if (vlapic->isrvec_stk_top == 0 && isrvec != 0) + xhyve_abort("isrvec_stk is corrupted: %d\n", isrvec); + + /* + * Make sure that the priority of the nested interrupts is + * always increasing. + */ + lastprio = -1; + for (i = 1; i <= vlapic->isrvec_stk_top; i++) { + curprio = PRIO(vlapic->isrvec_stk[i]); + if (curprio <= lastprio) { + dump_isrvec_stk(vlapic); + xhyve_abort("isrvec_stk does not satisfy invariant\n"); + } + lastprio = curprio; + } + + /* + * Make sure that each bit set in the ISRx registers has a + * corresponding entry on the isrvec stack. 
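 + * For example, with vectors 0x21 and 0x45 in service the stack + * must be { 0, 0x21, 0x45 } with isrvec_stk_top == 2, and the + * ascending walk below expects to find 0x21 before 0x45.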
+ */ + i = 1; + isrptr = &vlapic->apic_page->isr0; + for (vector = 0; vector < 256; vector++) { + idx = (vector / 32) * 4; + if (isrptr[idx] & (1 << (vector % 32))) { + if (i > vlapic->isrvec_stk_top || + vlapic->isrvec_stk[i] != vector) { + dump_isrvec_stk(vlapic); + xhyve_abort("ISR and isrvec_stk out of sync\n"); + } + i++; + } + } + } +#endif + + if (PRIO(tpr) >= PRIO(isrvec)) + ppr = tpr; + else + ppr = isrvec & 0xf0; + + vlapic->apic_page->ppr = (uint32_t) ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); + +static void +vlapic_process_eoi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *isrptr, *tmrptr; + int i, idx, bitpos, vector; + + isrptr = &lapic->isr0; + tmrptr = &lapic->tmr0; + + for (i = 7; i >= 0; i--) { + idx = i * 4; + bitpos = fls((int) isrptr[idx]); + if (bitpos-- != 0) { + if (vlapic->isrvec_stk_top <= 0) { + xhyve_abort("invalid vlapic isrvec_stk_top %d\n", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << bitpos); + vector = i * 32 + bitpos; + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", + vector); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + if ((tmrptr[idx] & (1 << bitpos)) != 0) { + vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, + vector); + } + return; + } + } + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); +} + +static __inline int +vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) +{ + + return ((int) (lvt & mask)); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); + +void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask) +{ + uint32_t lvt; + + vlapic->esr_pending |= mask; + if (vlapic->esr_firing) + return; + vlapic->esr_firing = 1; + + // The error LVT always uses the fixed delivery mode. + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); + if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); + } + vlapic->esr_firing = 0; +} + +static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + uint32_t lvt; + + // The timer LVT always uses the fixed delivery mode. + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + VLAPIC_CTR0(vlapic, "vlapic timer fired"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); + } +} + +static VMM_STAT(VLAPIC_INTR_CMC, + "corrected machine check interrupts generated by vlapic"); + +void +vlapic_fire_cmci(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + if (vlapic_fire_lvt(vlapic, lvt)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); + } +} + +static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, + "lvts triggered"); + +int +vlapic_trigger_lvt(struct vlapic *vlapic, int vector) +{ + uint32_t lvt; + + if (vlapic_enabled(vlapic) == false) { + /* + * When the local APIC is global/hardware disabled, + * LINT[1:0] pins are configured as INTR and NMI pins, + * respectively. 
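 + * The switch below therefore bypasses the LVTs entirely and + * injects LINT0 as an ExtINT and LINT1 as an NMI.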
+ */ + switch (vector) { + case APIC_LVT_LINT0: + vm_inject_extint(vlapic->vm, vlapic->vcpuid); + break; + case APIC_LVT_LINT1: + vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + break; + default: + break; + } + return (0); + } + + switch (vector) { + case APIC_LVT_LINT0: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT); + break; + case APIC_LVT_LINT1: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT); + break; + case APIC_LVT_TIMER: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + lvt |= APIC_LVT_DM_FIXED; + break; + case APIC_LVT_ERROR: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); + lvt |= APIC_LVT_DM_FIXED; + break; + case APIC_LVT_PMC: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT); + break; + case APIC_LVT_THERMAL: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT); + break; + case APIC_LVT_CMCI: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + break; + default: + return (EINVAL); + } + if (vlapic_fire_lvt(vlapic, lvt)) { + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + LVTS_TRIGGERRED, vector, 1); + } + return (0); +} + +static void +vlapic_callout_handler(void *arg) +{ + struct vlapic *vlapic; + struct bintime bt, btnow; + sbintime_t rem_sbt; + + vlapic = arg; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_pending(&vlapic->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vlapic->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vlapic->callout); + + vlapic_fire_timer(vlapic); + + if (vlapic_periodic_timer(vlapic)) { + binuptime(&btnow); + /* FIXME */ + KWARN(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), + ("XHYVE: vlapic callout at %#llx.%#llx, expected at " + "%#llx.#%llx\r\n", + btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, + vlapic->timer_fire_bt.frac)); + + /* + * Compute the delta between when the timer was supposed to + * fire and the present time. + */ + bt = btnow; + bintime_sub(&bt, &vlapic->timer_fire_bt); + + rem_sbt = bttosbt(vlapic->timer_period_bt); + if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { + /* + * Adjust the time until the next countdown downward + * to account for the lost time. + */ + rem_sbt -= bttosbt(bt); + } else { + /* + * If the delta is greater than the timer period then + * just reset our time base instead of trying to catch + * up. + */ + vlapic->timer_fire_bt = btnow; + VLAPIC_CTR2(vlapic, "vlapic timer lagging by %llu " + "usecs, period is %llu usecs - resetting time base", + bttosbt(bt) / SBT_1US, + bttosbt(vlapic->timer_period_bt) / SBT_1US); + } + + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, rem_sbt, 0, + vlapic_callout_handler, vlapic, 0); + } +done: + VLAPIC_TIMER_UNLOCK(vlapic); +} + +void +vlapic_icrtmr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + sbintime_t sbt; + uint32_t icr_timer; + + VLAPIC_TIMER_LOCK(vlapic); + + lapic = vlapic->apic_page; + icr_timer = lapic->icr_timer; + + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, icr_timer); + + if (icr_timer != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + + sbt = bttosbt(vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } else + callout_stop(&vlapic->callout); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +/* + * This function populates 'dmask' with the set of vcpus that match the + * addressing specified by the (dest, phys, lowprio) tuple. 
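 + * + * For example, in xAPIC flat logical mode a 'dest' of 0x05 selects + * every active vcpu whose logical ID byte (LDR[31:24]) has bit 0 or + * bit 2 set.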
+ * + * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) + * or xAPIC (8-bit) destination field. + */ +static void +vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest) +{ + struct vlapic *vlapic; + uint32_t dfr, ldr, ldest, cluster; + uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; + cpuset_t amask; + int vcpuid; + + if ((x2apic_dest && dest == 0xffffffff) || + (!x2apic_dest && dest == 0xff)) { + /* + * Broadcast in both logical and physical modes. + */ + *dmask = vm_active_cpus(vm); + return; + } + + if (phys) { + /* + * Physical mode: destination is APIC ID. + */ + CPU_ZERO(dmask); + vcpuid = vm_apicid2vcpuid(vm, ((int) dest)); + if (vcpuid < VM_MAXCPU) + CPU_SET(((unsigned) vcpuid), dmask); + } else { + /* + * In the "Flat Model" the MDA is interpreted as an 8-bit wide + * bitmask. This model is only available in the xAPIC mode. + */ + mda_flat_ldest = dest & 0xff; + + /* + * In the "Cluster Model" the MDA is used to identify a + * specific cluster and a set of APICs in that cluster. + */ + if (x2apic_dest) { + mda_cluster_id = dest >> 16; + mda_cluster_ldest = dest & 0xffff; + } else { + mda_cluster_id = (dest >> 4) & 0xf; + mda_cluster_ldest = dest & 0xf; + } + + /* + * Logical mode: match each APIC that has a bit set + * in its LDR that matches a bit in the ldest. + */ + CPU_ZERO(dmask); + amask = vm_active_cpus(vm); + while ((vcpuid = CPU_FFS(&amask)) != 0) { + vcpuid--; + CPU_CLR(((unsigned) vcpuid), &amask); + + vlapic = vm_lapic(vm, vcpuid); + dfr = vlapic->apic_page->dfr; + ldr = vlapic->apic_page->ldr; + + if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_FLAT) { + ldest = ldr >> 24; + mda_ldest = mda_flat_ldest; + } else if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_CLUSTER) { + if (x2apic(vlapic)) { + cluster = ldr >> 16; + ldest = ldr & 0xffff; + } else { + cluster = ldr >> 28; + ldest = (ldr >> 24) & 0xf; + } + if (cluster != mda_cluster_id) + continue; + mda_ldest = mda_cluster_ldest; + } else { + /* + * Guest has configured a bad logical + * model for this vcpu - skip it.
+ */ + VLAPIC_CTR1(vlapic, "vlapic has bad logical " + "model %x - cannot deliver interrupt", dfr); + continue; + } + + if ((mda_ldest & ldest) != 0) { + CPU_SET(((unsigned) vcpuid), dmask); + if (lowprio) + break; + } + } + } +} + +static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); + +static void +vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if (lapic->tpr != val) { + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed " + "from %#x to %#x", lapic->tpr, val); + lapic->tpr = val; + vlapic_update_ppr(vlapic); + } +} + +static uint8_t +vlapic_get_tpr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + return ((uint8_t) lapic->tpr); +} + +void +vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) +{ + uint8_t tpr; + + if (val & ~((uint64_t) 0xf)) { + vm_inject_gp(vlapic->vm, vlapic->vcpuid); + return; + } + + tpr = (uint8_t) (val << 4); + vlapic_set_tpr(vlapic, tpr); +} + +uint64_t +vlapic_get_cr8(struct vlapic *vlapic) +{ + uint8_t tpr; + + tpr = vlapic_get_tpr(vlapic); + return (tpr >> 4); +} + +int +vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) +{ + int i; + bool phys; + cpuset_t dmask; + uint64_t icrval; + uint32_t dest, vec, mode; + struct vlapic *vlapic2; + struct vm_exit *vmexit; + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->icr_lo &= ~((unsigned) APIC_DELSTAT_PEND); + icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; + + if (x2apic(vlapic)) + dest = icrval >> 32; + else + dest = icrval >> (32 + 24); + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + + if (mode == APIC_DELMODE_FIXED && vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); + return (0); + } + + VLAPIC_CTR2(vlapic, "icrlo 0x%016llx triggered ipi %d", icrval, vec); + + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { + switch (icrval & APIC_DEST_MASK) { + case APIC_DEST_DESTFLD: + phys = ((icrval & APIC_DESTMODE_LOG) == 0); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, + x2apic(vlapic)); + break; + case APIC_DEST_SELF: + CPU_SETOF(((unsigned) vlapic->vcpuid), &dmask); + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm); + CPU_CLR(((unsigned) vlapic->vcpuid), &dmask); + break; + default: + CPU_ZERO(&dmask); /* satisfy gcc */ + break; + } + + while ((i = CPU_FFS(&dmask)) != 0) { + i--; + CPU_CLR(((unsigned) i), &dmask); + if (mode == APIC_DELMODE_FIXED) { + lapic_intr_edge(vlapic->vm, i, ((int) vec)); + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + IPIS_SENT, i, 1); + VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " + "to vcpuid %d", vec, i); + } else { + vm_inject_nmi(vlapic->vm, i); + VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " + "to vcpuid %d", i); + } + } + + return (0); /* handled completely in the kernel */ + } + + if (mode == APIC_DELMODE_INIT) { + if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) + return (0); + + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, ((int) dest)); + + /* move from INIT to waiting-for-SIPI state */ + if (vlapic2->boot_state == BS_INIT) { + vlapic2->boot_state = BS_SIPI; + } + + return (0); + } + } + + if (mode == APIC_DELMODE_STARTUP) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, ((int) dest)); + + /* + * Ignore SIPIs in any state other than 
wait-for-SIPI + */ + if (vlapic2->boot_state != BS_SIPI) + return (0); + + vlapic2->boot_state = BS_RUNNING; + + *retu = true; + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINUP_AP; + vmexit->u.spinup_ap.vcpu = (int) dest; + vmexit->u.spinup_ap.rip = vec << XHYVE_PAGE_SHIFT; + + return (0); + } + } + + /* + * This will cause a return to userland. + */ + return (1); +} + +void +vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) +{ + int vec; + + KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); + + vec = val & 0xff; + lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec); + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, + vlapic->vcpuid, 1); + VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); +} + +int +vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct LAPIC *lapic = vlapic->apic_page; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + if (vlapic->ops.pending_intr) + return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); + + irrptr = &lapic->irr0; + + for (i = 7; i >= 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls((int) val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (((unsigned) PRIO(vector)) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + if (vecptr != NULL) + *vecptr = vector; + return (1); + } else + break; + } + } + return (0); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *irrptr, *isrptr; + int idx, stk_top; + + if (vlapic->ops.intr_accepted) { + (*vlapic->ops.intr_accepted)(vlapic, vector); + return; + } + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. + */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * Update the PPR + */ + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + xhyve_abort("isrvec_stk_top overflow %d\n", stk_top); + + vlapic->isrvec_stk[stk_top] = (uint8_t) vector; + vlapic_update_ppr(vlapic); +} + +void +vlapic_svr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + uint32_t old, new, changed; + + lapic = vlapic->apic_page; + + new = lapic->svr; + old = vlapic->svr_last; + vlapic->svr_last = new; + + changed = old ^ new; + if ((changed & APIC_SVR_ENABLE) != 0) { + if ((new & APIC_SVR_ENABLE) == 0) { + /* + * The apic is now disabled so stop the apic timer + * and mask all the LVT entries. + */ + VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + VLAPIC_TIMER_UNLOCK(vlapic); + vlapic_mask_lvts(vlapic); + } else { + /* + * The apic is now enabled so restart the apic timer + * if it is configured in periodic mode. 
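 + * A one-shot timer that has already fired is not re-armed here; + * only a periodic timer is restarted, via + * vlapic_icrtmr_write_handler().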
+ */ + VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); + if (vlapic_periodic_timer(vlapic)) + vlapic_icrtmr_write_handler(vlapic); + } + } +} + +int +vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, UNUSED bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *reg; + int i; + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR1(vlapic, "MMIO read from offset %#llx in x2APIC mode", + offset); + *data = 0; + goto done; + } + + if (!x2apic(vlapic) && !mmio_access) { + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#llx in " + "xAPIC mode", offset); + *data = 0; + goto done; + } + + if (offset > sizeof(*lapic)) { + *data = 0; + goto done; + } + + offset &= ~((uint64_t) 3); + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = vlapic_get_tpr(vlapic); + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0: + case APIC_OFFSET_ISR1: + case APIC_OFFSET_ISR2: + case APIC_OFFSET_ISR3: + case APIC_OFFSET_ISR4: + case APIC_OFFSET_ISR5: + case APIC_OFFSET_ISR6: + case APIC_OFFSET_ISR7: + i = (int) ((offset - APIC_OFFSET_ISR0) >> 2); + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0: + case APIC_OFFSET_TMR1: + case APIC_OFFSET_TMR2: + case APIC_OFFSET_TMR3: + case APIC_OFFSET_TMR4: + case APIC_OFFSET_TMR5: + case APIC_OFFSET_TMR6: + case APIC_OFFSET_TMR7: + i = (int) ((offset - APIC_OFFSET_TMR0) >> 2); + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0: + case APIC_OFFSET_IRR1: + case APIC_OFFSET_IRR2: + case APIC_OFFSET_IRR3: + case APIC_OFFSET_IRR4: + case APIC_OFFSET_IRR5: + case APIC_OFFSET_IRR6: + case APIC_OFFSET_IRR7: + i = (int) ((offset - APIC_OFFSET_IRR0) >> 2); + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + if (x2apic(vlapic)) + *data |= (uint64_t)lapic->icr_hi << 32; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT: + case APIC_OFFSET_THERM_LVT: + case APIC_OFFSET_PERF_LVT: + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + case APIC_OFFSET_ERROR_LVT: + *data = vlapic_get_lvt(vlapic, ((uint32_t) offset)); +#ifdef INVARIANTS + reg = vlapic_get_lvtptr(vlapic, offset); + KASSERT(*data == *reg, ("inconsistent lvt value at " + "offset %#lx: %#lx/%#x", offset, *data, *reg)); +#endif + break; + case APIC_OFFSET_TIMER_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_TIMER_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_SELF_IPI: + /* + * XXX generate a GP fault if vlapic is in x2apic mode + */ + *data = 0; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } +done: + VLAPIC_CTR2(vlapic, "vlapic read offset %#llx, data %#llx", offset, *data); + return 0; +} + +int +vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + 
uint64_t data, bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *regptr; + int retval; + + KASSERT((offset & 0xf) == 0 && offset < XHYVE_PAGE_SIZE, + ("vlapic_write: invalid offset %#llx", offset)); + + VLAPIC_CTR2(vlapic, "vlapic write offset %#llx, data %#llx", + offset, data); + + if (offset > sizeof(*lapic)) + return (0); + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR2(vlapic, "MMIO write of %#llx to offset %#llx " + "in x2APIC mode", data, offset); + return (0); + } + + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + if (!x2apic(vlapic) && !mmio_access) { + VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#llx to offset %#llx " + "in xAPIC mode", data, offset); + return (0); + } + + retval = 0; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = (uint32_t) data; + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_TPR: + vlapic_set_tpr(vlapic, data & 0xff); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + lapic->ldr = (uint32_t) data; + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + lapic->dfr = (uint32_t) data; + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + lapic->svr = (uint32_t) data; + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + lapic->icr_lo = (uint32_t) data; + if (x2apic(vlapic)) + lapic->icr_hi = data >> 32; + retval = vlapic_icrlo_write_handler(vlapic, retu); + break; + case APIC_OFFSET_ICR_HI: + lapic->icr_hi = (uint32_t) data; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT: + case APIC_OFFSET_THERM_LVT: + case APIC_OFFSET_PERF_LVT: + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + case APIC_OFFSET_ERROR_LVT: + regptr = vlapic_get_lvtptr(vlapic, ((uint32_t) offset)); + *regptr = (uint32_t) data; + vlapic_lvt_write_handler(vlapic, ((uint32_t) offset)); + break; + case APIC_OFFSET_TIMER_ICR: + lapic->icr_timer = (uint32_t) data; + vlapic_icrtmr_write_handler(vlapic); + break; + + case APIC_OFFSET_TIMER_DCR: + lapic->dcr_timer = (uint32_t) data; + vlapic_dcr_write_handler(vlapic); + break; + + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + + case APIC_OFFSET_SELF_IPI: + if (x2apic(vlapic)) + vlapic_self_ipi_handler(vlapic, data); + break; + + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0: + case APIC_OFFSET_ISR1: + case APIC_OFFSET_ISR2: + case APIC_OFFSET_ISR3: + case APIC_OFFSET_ISR4: + case APIC_OFFSET_ISR5: + case APIC_OFFSET_ISR6: + case APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0: + case APIC_OFFSET_TMR1: + case APIC_OFFSET_TMR2: + case APIC_OFFSET_TMR3: + case APIC_OFFSET_TMR4: + case APIC_OFFSET_TMR5: + case APIC_OFFSET_TMR6: + case APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0: + case APIC_OFFSET_IRR1: + case APIC_OFFSET_IRR2: + case APIC_OFFSET_IRR3: + case APIC_OFFSET_IRR4: + case APIC_OFFSET_IRR5: + case APIC_OFFSET_IRR6: + case APIC_OFFSET_IRR7: + case APIC_OFFSET_TIMER_CCR: + default: + // Read only. 
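+ // Writes to the version, APR, PPR, RRR, ISR/TMR/IRR banks and + // the timer CCR are silently dropped.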
+ break; + } + + return (retval); +} + +static void +vlapic_reset(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + bzero(lapic, sizeof(struct LAPIC)); + + lapic->id = vlapic_get_id(vlapic); + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(vlapic); + vlapic_reset_tmr(vlapic); + + lapic->dcr_timer = 0; + vlapic_dcr_write_handler(vlapic); + + if (vlapic->vcpuid == 0) + vlapic->boot_state = BS_RUNNING; /* BSP */ + else + vlapic->boot_state = BS_INIT; /* AP */ + + vlapic->svr_last = lapic->svr; +} + +void +vlapic_init(struct vlapic *vlapic) +{ + KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); + KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU, + ("vlapic_init: vcpuid is not initialized")); + KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " + "initialized")); + + /* + * If the vlapic is configured in x2apic mode then it will be + * accessed in the critical section via the MSR emulation code. + * + * Therefore the timer mutex must be a spinlock because blockable + * mutexes cannot be acquired in a critical section. + */ + VLAPIC_TIMER_LOCK_INIT(vlapic); + callout_init(&vlapic->callout, 1); + + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; + + if (vlapic->vcpuid == 0) + vlapic->msr_apicbase |= APICBASE_BSP; + + vlapic_reset(vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + + callout_drain(&vlapic->callout); +} + +uint64_t +vlapic_get_apicbase(struct vlapic *vlapic) +{ + + return (vlapic->msr_apicbase); +} + +int +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) +{ + + if (vlapic->msr_apicbase != new) { + VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#llx to %#llx " + "not supported", vlapic->msr_apicbase, new); + return (-1); + } + + return (0); +} + +void +vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + struct vlapic *vlapic; + struct LAPIC *lapic; + + vlapic = vm_lapic(vm, vcpuid); + + if (state == X2APIC_DISABLED) + vlapic->msr_apicbase &= ~((uint64_t) APICBASE_X2APIC); + else + vlapic->msr_apicbase |= APICBASE_X2APIC; + + /* + * Reset the local APIC registers whose values are mode-dependent. + * + * XXX this works because the APIC mode can be changed only at vcpu + * initialization time. + */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); + if (x2apic(vlapic)) { + lapic->ldr = x2apic_ldr(vlapic); + lapic->dfr = 0; + } else { + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + } + + if (state == X2APIC_ENABLED) { + if (vlapic->ops.enable_x2apic_mode) + (*vlapic->ops.enable_x2apic_mode)(vlapic); + } +} + +void +vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec) +{ + bool lowprio; + int vcpuid; + cpuset_t dmask; + + if (delmode != IOART_DELFIXED && + delmode != IOART_DELLOPRI && + delmode != IOART_DELEXINT) { + VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); + return; + } + lowprio = (delmode == IOART_DELLOPRI); + + /* + * We don't provide any virtual interrupt redirection hardware so + * all interrupts originating from the ioapic or MSI specify the + * 'dest' in the legacy xAPIC format. 
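 + * That is also why 'x2apic_dest' is hard-wired to false in the + * vlapic_calcdest() call below.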
+ */ + vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); + + while ((vcpuid = CPU_FFS(&dmask)) != 0) { + vcpuid--; + CPU_CLR(((unsigned) vcpuid), &dmask); + if (delmode == IOART_DELEXINT) { + vm_inject_extint(vm, vcpuid); + } else { + lapic_set_intr(vm, vcpuid, vec, level); + } + } +} + +void +vlapic_post_intr(struct vlapic *vlapic, int hostcpu, UNUSED int ipinum) +{ + /* + * Post an interrupt to the vcpu currently running on 'hostcpu'. + * + * This is done by leveraging features like Posted Interrupts (Intel) + * and Doorbell MSR (AMD AVIC) that avoid a VM exit. + * + * If neither of these features is available then fall back to + * sending an IPI to 'hostcpu'. + */ + if (vlapic->ops.post_intr) + (*vlapic->ops.post_intr)(vlapic, hostcpu); + else + xhyve_abort("hv_vcpu_interrupt\n"); /* FIXME */ + //ipi_cpu(hostcpu, ipinum); +} + +bool +vlapic_enabled(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && + (lapic->svr & APIC_SVR_ENABLE) != 0) + return (true); + else + return (false); +} + +static void +vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *tmrptr, mask; + int idx; + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + if (level) + tmrptr[idx] |= mask; + else + tmrptr[idx] &= ~mask; + + if (vlapic->ops.set_tmr != NULL) + (*vlapic->ops.set_tmr)(vlapic, vector, level); +} + +void +vlapic_reset_tmr(struct vlapic *vlapic) +{ + int vector; + + VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); + + for (vector = 0; vector <= 255; vector++) + vlapic_set_tmr(vlapic, vector, false); +} + +void +vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector) +{ + cpuset_t dmask; + bool lowprio; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + + /* + * A level trigger is valid only for fixed and lowprio delivery modes. + */ + if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { + VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " + "delivery-mode %d", delmode); + return; + } + + lowprio = (delmode == APIC_DELMODE_LOWPRIO); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); + + if (!CPU_ISSET(((unsigned) vlapic->vcpuid), &dmask)) + return; + + VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); + vlapic_set_tmr(vlapic, vector, true); +} diff --git a/vendor/github.com/hooklift/xhyve/vmcs.c b/vendor/github.com/hooklift/xhyve/vmcs.c new file mode 100644 index 0000000..250c948 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vmcs.c @@ -0,0 +1,245 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + case VM_REG_GUEST_PDPTE0: + return (VMCS_GUEST_PDPTE0); + case VM_REG_GUEST_PDPTE1: + return (VMCS_GUEST_PDPTE1); + case VM_REG_GUEST_PDPTE2: + return (VMCS_GUEST_PDPTE2); + case VM_REG_GUEST_PDPTE3: + return (VMCS_GUEST_PDPTE3); + default: + return ((uint32_t) -1); + } + +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = 
VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(int vcpuid, int ident, uint64_t *retval) +{ + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e. + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. + */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + *retval = vmcs_read(vcpuid, encoding); + + return (0); +} + +int +vmcs_setreg(int vcpuid, int ident, uint64_t val) +{ + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + vmcs_write(vcpuid, encoding, val); + + return (0); +} + +int +vmcs_setdesc(int vcpuid, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + xhyve_abort("vmcs_setdesc: invalid segment register %d\n", seg); + + vmcs_write(vcpuid, base, desc->base); + vmcs_write(vcpuid, limit, desc->limit); + if (access != VMCS_INVALID_ENCODING) { + vmcs_write(vcpuid, access, desc->access); + } + + return (0); +} + +int +vmcs_getdesc(int vcpuid, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + xhyve_abort("vmcs_getdesc: invalid segment register %d\n", seg); + + desc->base = vmcs_read(vcpuid, base); + desc->limit = (uint32_t) vmcs_read(vcpuid, limit); + if (access != VMCS_INVALID_ENCODING) { + desc->access = (uint32_t) vmcs_read(vcpuid, access); + } + + return (0); +} diff --git a/vendor/github.com/hooklift/xhyve/vmm.c b/vendor/github.com/hooklift/xhyve/vmm.c new file mode 100644 index 0000000..bf25c3b --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vmm.c @@ -0,0 +1,1908 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct vlapic; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +/* + * Initialization: + * (a) allocated when vcpu is created + * (i) initialized when vcpu is created and when it is reinitialized + * (o) initialized the first time the vcpu is created + * (x) initialized before use + */ +struct vcpu { + OSSpinLock lock; /* (o) protects 'state' */ + pthread_mutex_t state_sleep_mtx; + pthread_cond_t state_sleep_cnd; + pthread_mutex_t vcpu_sleep_mtx; + pthread_cond_t vcpu_sleep_cnd; + enum vcpu_state state; /* (o) vcpu state */ + struct vlapic *vlapic; /* (i) APIC device model */ + enum x2apic_state x2apic_state; /* (i) APIC mode */ + uint64_t exitintinfo; /* (i) events pending at VM exit */ + int nmi_pending; /* (i) NMI pending */ + int extint_pending; /* (i) INTR pending */ + int exception_pending; /* (i) exception pending */ + int exc_vector; /* (x) exception collateral */ + int exc_errcode_valid; + uint32_t exc_errcode; + uint64_t guest_xcr0; /* (i) guest %xcr0 register */ + void *stats; /* (a,i) statistics */ + struct vm_exit exitinfo; /* (x) exit reason and collateral */ + uint64_t nextrip; /* (x) next instruction to execute */ +}; + +#define vcpu_lock_init(v) (v)->lock = OS_SPINLOCK_INIT; +#define vcpu_lock(v) OSSpinLockLock(&(v)->lock) +#define vcpu_unlock(v) OSSpinLockUnlock(&(v)->lock) + +struct mem_seg { + uint64_t gpa; + size_t len; + void *object; +}; + +#define VM_MAX_MEMORY_SEGMENTS 2 + +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + struct vhpet *vhpet; /* (i) virtual HPET */ + struct vioapic *vioapic; /* (i) virtual ioapic */ + struct vatpic *vatpic; /* (i) virtual atpic */ + struct vatpit *vatpit; /* (i) virtual atpit */ + struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ + struct vrtc *vrtc; /* (o) virtual RTC */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + int suspend; /* (i) stop VM execution */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ + cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ + void *rendezvous_arg; /* (x) rendezvous func/arg */ + vm_rendezvous_func_t rendezvous_func; + pthread_mutex_t rendezvous_mtx; /* (o) rendezvous lock */ + pthread_cond_t rendezvous_sleep_cnd; + int num_mem_segs; /* (o) guest memory segments */ + struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS]; + struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ +}; +#pragma clang 
diagnostic pop + +static int vmm_initialized; + +static struct vmm_ops *ops; + +#define VMM_INIT() \ + (*ops->init)() +#define VMM_CLEANUP() \ + (*ops->cleanup)() +#define VM_INIT(vmi) \ + (*ops->vm_init)(vmi) +#define VCPU_INIT(vmi, vcpu) \ + (*ops->vcpu_init)(vmi, vcpu) +#define VMRUN(vmi, vcpu, rip, rptr, sptr) \ + (*ops->vmrun)(vmi, vcpu, rip, rptr, sptr) +#define VM_CLEANUP(vmi) \ + (*ops->vm_cleanup)(vmi) +#define VCPU_CLEANUP(vmi, vcpu) \ + (*ops->vcpu_cleanup)(vmi, vcpu) +#define VMGETREG(vmi, vcpu, num, retval) \ + (*ops->vmgetreg)(vmi, vcpu, num, retval) +#define VMSETREG(vmi, vcpu, num, val) \ + (*ops->vmsetreg)(vmi, vcpu, num, val) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (*ops->vmgetdesc)(vmi, vcpu, num, desc) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (*ops->vmsetdesc)(vmi, vcpu, num, desc) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (*ops->vmgetcap)(vmi, vcpu, num, retval) +#define VMSETCAP(vmi, vcpu, num, val) \ + (*ops->vmsetcap)(vmi, vcpu, num, val) +#define VLAPIC_INIT(vmi, vcpu) \ + (*ops->vlapic_init)(vmi, vcpu) +#define VLAPIC_CLEANUP(vmi, vlapic) \ + (*ops->vlapic_cleanup)(vmi, vlapic) +#define VCPU_INTERRUPT(vcpu) \ + (*ops->vcpu_interrupt)(vcpu) + +/* statistics */ +//static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. + */ +static int halt_detection_enabled = 1; +static int trace_guest_exceptions = 0; + +static void +vcpu_cleanup(struct vm *vm, int i, bool destroy) +{ + struct vcpu *vcpu = &vm->vcpu[i]; + + VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); + if (destroy) { + vmm_stat_free(vcpu->stats); + } +} + +static void +vcpu_init(struct vm *vm, int vcpu_id, bool create) +{ + struct vcpu *vcpu; + + KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU, + ("vcpu_init: invalid vcpu %d", vcpu_id)); + + vcpu = &vm->vcpu[vcpu_id]; + + if (create) { + vcpu_lock_init(vcpu); + pthread_mutex_init(&vcpu->state_sleep_mtx, NULL); + pthread_cond_init(&vcpu->state_sleep_cnd, NULL); + pthread_mutex_init(&vcpu->vcpu_sleep_mtx, NULL); + pthread_cond_init(&vcpu->vcpu_sleep_cnd, NULL); + vcpu->state = VCPU_IDLE; + vcpu->stats = vmm_stat_alloc(); + } + + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); + vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->exitintinfo = 0; + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + vcpu->exception_pending = 0; + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; + vmm_stat_init(vcpu->stats); +} + +int vcpu_create(struct vm *vm, int vcpu) { + if (vcpu < 0 || vcpu >= VM_MAXCPU) + xhyve_abort("vcpu_create: invalid cpuid %d\n", vcpu); + + return VCPU_INIT(vm->cookie, vcpu); +} + +void vcpu_destroy(struct vm *vm, int vcpu) { + if (vcpu < 0 || vcpu >= VM_MAXCPU) + xhyve_abort("vcpu_destroy: invalid cpuid %d\n", vcpu); + + VCPU_CLEANUP(vm, vcpu); +} + +int +vcpu_trace_exceptions(void) +{ + return (trace_guest_exceptions); +} + +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= VM_MAXCPU) + xhyve_abort("vm_exitinfo: invalid cpuid %d\n", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + +int +vmm_init(void) +{ + int error; + + vmm_host_state_init(); + + error = vmm_mem_init(); + if (error) + return (error); + + ops = &vmm_ops_intel; + + error = VMM_INIT(); + + if (error == 0) + vmm_initialized = 1; + + return (error); +} + + +int +vmm_cleanup(void) { + int error; + + error = VMM_CLEANUP(); + + if (error == 0) + vmm_initialized = 0; + + return error; +} + +static void 
+vm_init(struct vm *vm, bool create) +{ + int vcpu; + + if (create) { + callout_system_init(); + } + + vm->cookie = VM_INIT(vm); + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + vm->vpmtmr = vpmtmr_init(vm); + + if (create) { + vm->vrtc = vrtc_init(vm); + } + + CPU_ZERO(&vm->active_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + vcpu_init(vm, vcpu, create); + } +} + +int +vm_create(struct vm **retvm) +{ + struct vm *vm; + + if (!vmm_initialized) + return (ENXIO); + + vm = malloc(sizeof(struct vm)); + assert(vm); + bzero(vm, sizeof(struct vm)); + vm->num_mem_segs = 0; + pthread_mutex_init(&vm->rendezvous_mtx, NULL); + pthread_cond_init(&vm->rendezvous_sleep_cnd, NULL); + + vm_init(vm, true); + + *retvm = vm; + return (0); +} + +static void +vm_free_mem_seg(struct mem_seg *seg) +{ + if (seg->object != NULL) { + vmm_mem_free(seg->gpa, seg->len, seg->object); + } + + bzero(seg, sizeof(*seg)); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + int i, vcpu; + + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + vcpu_cleanup(vm, vcpu, destroy); + } + + if (destroy) { + vrtc_cleanup(vm->vrtc); + } else { + vrtc_reset(vm->vrtc); + } + vpmtmr_cleanup(vm->vpmtmr); + vatpit_cleanup(vm->vatpit); + vhpet_cleanup(vm->vhpet); + vatpic_cleanup(vm->vatpic); + vioapic_cleanup(vm->vioapic); + + VM_CLEANUP(vm->cookie); + + if (destroy) { + for (i = 0; i < vm->num_mem_segs; i++) { + vm_free_mem_seg(&vm->mem_segs[i]); + } + + vm->num_mem_segs = 0; + } +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm); +} + +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. + */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + +const char * +vm_name(UNUSED struct vm *vm) +{ + return "VM"; +} + +bool +vm_mem_allocated(struct vm *vm, uint64_t gpa) +{ + int i; + uint64_t gpabase, gpalimit; + + for (i = 0; i < vm->num_mem_segs; i++) { + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa < gpalimit) + return (TRUE); /* 'gpa' is regular memory */ + } + + return (FALSE); +} + +int +vm_malloc(struct vm *vm, uint64_t gpa, size_t len) +{ + int available, allocated; + struct mem_seg *seg; + void *object; + uint64_t g; + + if ((gpa & XHYVE_PAGE_MASK) || (len & XHYVE_PAGE_MASK) || len == 0) + return (EINVAL); + + available = allocated = 0; + g = gpa; + while (g < gpa + len) { + if (vm_mem_allocated(vm, g)) + allocated++; + else + available++; + + g += XHYVE_PAGE_SIZE; + } + + /* + * If there are some allocated and some available pages in the address + * range then it is an error. + */ + if (allocated && available) + return (EINVAL); + + /* + * If the entire address range being requested has already been + * allocated then there isn't anything more to do. 
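 + * A repeated vm_malloc() over an already-mapped range is therefore + * an idempotent success rather than an error.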
+ */ + if (allocated && available == 0) + return (0); + + if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + + seg = &vm->mem_segs[vm->num_mem_segs]; + + if ((object = vmm_mem_alloc(gpa, len)) == NULL) + return (ENOMEM); + + seg->gpa = gpa; + seg->len = len; + seg->object = object; + + vm->num_mem_segs++; + + return (0); +} + +void * +vm_gpa2hva(struct vm *vm, uint64_t gpa, uint64_t len) { + void *base; + uint64_t offset; + + if (vm_get_memobj(vm, gpa, len, &offset, &base)) { + return NULL; + } + + return (void *) (((uintptr_t) base) + offset); +} + +int +vm_gpabase2memseg(struct vm *vm, uint64_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + seg->gpa = vm->mem_segs[i].gpa; + seg->len = vm->mem_segs[i].len; + return (0); + } + } + return (-1); +} + +int +vm_get_memobj(struct vm *vm, uint64_t gpa, size_t len, + uint64_t *offset, void **object) +{ + int i; + size_t seg_len; + uint64_t seg_gpa; + void *seg_obj; + + for (i = 0; i < vm->num_mem_segs; i++) { + if ((seg_obj = vm->mem_segs[i].object) == NULL) + continue; + + seg_gpa = vm->mem_segs[i].gpa; + seg_len = vm->mem_segs[i].len; + + if ((gpa >= seg_gpa) && ((gpa + len) <= (seg_gpa + seg_len))) { + *offset = gpa - seg_gpa; + *object = seg_obj; + return (0); + } + } + + return (EINVAL); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error || reg != VM_REG_GUEST_RIP) + return (error); + + /* Set 'nextrip' to match the value of %rip */ + VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#llx", val); + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextrip = val; + return (0); +} + +static bool +is_descriptor_table(int reg) +{ + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static bool +is_segment_register(int reg) +{ + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +// static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); + +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, + bool from_idle) +{ + int error; + const struct timespec ts = {.tv_sec = 1, .tv_nsec = 0}; /* 1 second */ + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. 
This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. + */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) { + pthread_mutex_lock(&vcpu->state_sleep_mtx); + vcpu_unlock(vcpu); + pthread_cond_timedwait_relative_np(&vcpu->state_sleep_cnd, + &vcpu->state_sleep_mtx, &ts); + vcpu_lock(vcpu); + pthread_mutex_unlock(&vcpu->state_sleep_mtx); + //msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); + } + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + } + + if (error) + return (EBUSY); + + vcpu->state = newstate; + + if (newstate == VCPU_IDLE) + pthread_cond_broadcast(&vcpu->state_sleep_cnd); + //wakeup(&vcpu->state); + + return (0); +} + +static void +vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) + xhyve_abort("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + xhyve_abort("Error %d setting state to %d", error, newstate); +} + +static void +vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) +{ + /* + * Update 'rendezvous_func' and execute a write memory barrier to + * ensure that it is visible across all host cpus. This is not needed + * for correctness but it does ensure that all the vcpus will notice + * that the rendezvous is requested immediately. + */ + vm->rendezvous_func = func; + wmb(); +} + +#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ + do { \ + if (vcpuid >= 0) {\ + VCPU_CTR0(vm, vcpuid, fmt); \ + } else {\ + VM_CTR0(vm, fmt); \ + } \ + } while (0) + +static void +vm_handle_rendezvous(struct vm *vm, int vcpuid) +{ + + KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), + ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); + + pthread_mutex_lock(&vm->rendezvous_mtx); + while (vm->rendezvous_func != NULL) { + /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ + CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); + + if (vcpuid != -1 && + CPU_ISSET(((unsigned) vcpuid), &vm->rendezvous_req_cpus) && + !CPU_ISSET(((unsigned) vcpuid), &vm->rendezvous_done_cpus)) { + VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); + (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); + CPU_SET(((unsigned) vcpuid), &vm->rendezvous_done_cpus); + } + if (CPU_CMP(&vm->rendezvous_req_cpus, + &vm->rendezvous_done_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); + vm_set_rendezvous_func(vm, NULL); + pthread_cond_broadcast(&vm->rendezvous_sleep_cnd); + //wakeup(&vm->rendezvous_func); + break; + } + RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); + pthread_cond_wait(&vm->rendezvous_sleep_cnd, &vm->rendezvous_mtx); + //mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", 0); + } + pthread_mutex_unlock(&vm->rendezvous_mtx); +} + +/* + * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 
+ */ +static int +vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) +{ + struct vcpu *vcpu; + const char *wmesg; + int vcpu_halted, vm_halted; + const struct timespec ts = {.tv_sec = 1, .tv_nsec = 0}; /* 1 second */ + + KASSERT(!CPU_ISSET(((unsigned) vcpuid), &vm->halted_cpus), + ("vcpu already halted")); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_halted = 0; + vm_halted = 0; + + vcpu_lock(vcpu); + while (1) { + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. Also check for + * software events that would cause this vcpu to wakeup. + * + * These interrupts/events could have happened after the + * vcpu returned from VMRUN() and before it acquired the + * vcpu lock above. + */ + if (vm->rendezvous_func != NULL || vm->suspend) + break; + if (vm_nmi_pending(vm, vcpuid)) + break; + if (!intr_disabled) { + if (vm_extint_pending(vm, vcpuid) || + vlapic_pending_intr(vcpu->vlapic, NULL)) { + break; + } + } + + /* + * Some Linux guests implement "halt" by having all vcpus + * execute HLT with interrupts disabled. 'halted_cpus' keeps + * track of the vcpus that have entered this state. When all + * vcpus enter the halted state the virtual machine is halted. + */ + if (intr_disabled) { + wmesg = "vmhalt"; + VCPU_CTR0(vm, vcpuid, "Halted"); + if (!vcpu_halted && halt_detection_enabled) { + vcpu_halted = 1; + CPU_SET_ATOMIC(((unsigned) vcpuid), &vm->halted_cpus); + } + if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { + vm_halted = 1; + break; + } + } else { + wmesg = "vmidle"; + } + + //t = ticks; + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + pthread_mutex_lock(&vcpu->vcpu_sleep_mtx); + vcpu_unlock(vcpu); + pthread_cond_timedwait_relative_np(&vcpu->vcpu_sleep_cnd, + &vcpu->vcpu_sleep_mtx, &ts); + vcpu_lock(vcpu); + pthread_mutex_unlock(&vcpu->vcpu_sleep_mtx); + //msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + //vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); + } + + if (vcpu_halted) + CPU_CLR_ATOMIC(((unsigned) vcpuid), &vm->halted_cpus); + + vcpu_unlock(vcpu); + + if (vm_halted) + vm_suspend(vm, VM_SUSPEND_HALT); + + return (0); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct vie *vie; + struct vcpu *vcpu; + struct vm_exit *vme; + uint64_t gla, gpa, cs_base; + struct vm_guest_paging *paging; + mem_region_read_t mread; + mem_region_write_t mwrite; + enum vm_cpu_mode cpu_mode; + int cs_d, error, fault, length; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + gla = vme->u.inst_emul.gla; + gpa = vme->u.inst_emul.gpa; + cs_base = vme->u.inst_emul.cs_base; + cs_d = vme->u.inst_emul.cs_d; + vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; + cpu_mode = paging->cpu_mode; + + VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#llx", gpa); + + /* Fetch, decode and emulate the faulting instruction */ + if (vie->num_valid == 0) { + /* + * If the instruction length is not known then assume a + * maximum size instruction. + */ + length = vme->inst_length ? 
vme->inst_length : VIE_INST_SIZE; + error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + + cs_base, length, vie, &fault); + } else { + /* + * The instruction bytes have already been copied into 'vie' + */ + error = fault = 0; + } + if (error || fault) + return (error); + + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { + VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#llx", + vme->rip + cs_base); + *retu = true; /* dump instruction bytes in userspace */ + return (0); + } + + /* + * If the instruction length was not specified then update it now + * along with 'nextrip'. + */ + if (vme->inst_length == 0) { + vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + } + + /* return to userland unless this is an in-kernel emulated device */ + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + XHYVE_PAGE_SIZE) { + mread = lapic_mmio_read; + mwrite = lapic_mmio_write; + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + mread = vioapic_mmio_read; + mwrite = vioapic_mmio_write; + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + mread = vhpet_mmio_read; + mwrite = vhpet_mmio_write; + } else { + *retu = true; + return (0); + } + + error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, + mread, mwrite, retu); + + return (error); +} + +static int +vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) +{ + int i, done; + struct vcpu *vcpu; + const struct timespec ts = {.tv_sec = 1, .tv_nsec = 0}; /* 1 second */ + + done = 0; + vcpu = &vm->vcpu[vcpuid]; + + CPU_SET_ATOMIC(((unsigned) vcpuid), &vm->suspended_cpus); + + /* + * Wait until all 'active_cpus' have suspended themselves. + * + * Since a VM may be suspended at any time including when one or + * more vcpus are doing a rendezvous we need to call the rendezvous + * handler while we are waiting to prevent a deadlock. + */ + vcpu_lock(vcpu); + while (1) { + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); + break; + } + + if (vm->rendezvous_func == NULL) { + VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + + pthread_mutex_lock(&vcpu->vcpu_sleep_mtx); + vcpu_unlock(vcpu); + pthread_cond_timedwait_relative_np(&vcpu->vcpu_sleep_cnd, + &vcpu->vcpu_sleep_mtx, &ts); + vcpu_lock(vcpu); + pthread_mutex_unlock(&vcpu->vcpu_sleep_mtx); + //msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); + + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + } else { + VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); + vcpu_unlock(vcpu); + vm_handle_rendezvous(vm, vcpuid); + vcpu_lock(vcpu); + } + } + vcpu_unlock(vcpu); + + /* + * Wakeup the other sleeping vcpus and return to userspace. + */ + for (i = 0; i < VM_MAXCPU; i++) { + if (CPU_ISSET(((unsigned) i), &vm->suspended_cpus)) { + vcpu_notify_event(vm, i, false); + } + } + + *retu = true; + return (0); +} + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int(((volatile u_int *) &vm->suspend), 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. 
+ */ + for (i = 0; i < VM_MAXCPU; i++) { + if (CPU_ISSET(((unsigned) i), &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + + return (0); +} + +void +vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = (enum vm_suspend_how) vm->suspend; +} + +void +vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); +} + +void pittest(struct vm *thevm); + +int +vm_run(struct vm *vm, int vcpuid, struct vm_exit *vm_exit) +{ + int error; + struct vcpu *vcpu; + // uint64_t tscval; + struct vm_exit *vme; + bool retu, intr_disabled; + void *rptr, *sptr; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (!CPU_ISSET(((unsigned) vcpuid), &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(((unsigned) vcpuid), &vm->suspended_cpus)) + return (EINVAL); + + rptr = &vm->rendezvous_func; + sptr = &vm->suspend; + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + retu = false; + +restart: + // tscval = rdtsc(); + + vcpu_require_state(vm, vcpuid, VCPU_RUNNING); + error = VMRUN(vm->cookie, vcpuid, (register_t) vcpu->nextrip, rptr, sptr); + vcpu_require_state(vm, vcpuid, VCPU_FROZEN); + + + // vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + if (error == 0) { + retu = false; + vcpu->nextrip = vme->rip + ((unsigned) vme->inst_length); + switch (((int) (vme->exitcode))) { + case VM_EXITCODE_SUSPENDED: + error = vm_handle_suspend(vm, vcpuid, &retu); + break; + case VM_EXITCODE_IOAPIC_EOI: + vioapic_process_eoi(vm, vcpuid, + vme->u.ioapic_eoi.vector); + break; + case VM_EXITCODE_RENDEZVOUS: + vm_handle_rendezvous(vm, vcpuid); + error = 0; + break; + case VM_EXITCODE_HLT: + intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); + error = vm_handle_hlt(vm, vcpuid, intr_disabled); + break; + case VM_EXITCODE_PAGING: + error = 0; + break; + case VM_EXITCODE_INST_EMUL: + error = vm_handle_inst_emul(vm, vcpuid, &retu); + break; + case VM_EXITCODE_INOUT: + case VM_EXITCODE_INOUT_STR: + error = vm_handle_inout(vm, vcpuid, vme, &retu); + break; + case VM_EXITCODE_MONITOR: + case VM_EXITCODE_MWAIT: + vm_inject_ud(vm, vcpuid); + break; + default: + retu = true; /* handled in userland */ + break; + } + } + + if (error == 0 && retu == false) + goto restart; + + /* copy the exit information (FIXME: zero copy) */ + bcopy(vme, vm_exit, sizeof(struct vm_exit)); + return (error); +} + +int +vm_restart_instruction(void *arg, int vcpuid) +{ + struct vm *vm; + struct vcpu *vcpu; + enum vcpu_state state; + uint64_t rip; + int error; + + vm = arg; + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + state = vcpu_get_state(vm, vcpuid); + if (state == VCPU_RUNNING) { + /* + * When a vcpu is "running" the next instruction is determined + * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. + * Thus setting 'inst_length' to zero will cause the current + * instruction to be restarted. 
+ */ + vcpu->exitinfo.inst_length = 0; + VCPU_CTR1(vm, vcpuid, "restarting instruction at %#llx by " + "setting inst_length to zero", vcpu->exitinfo.rip); + } else if (state == VCPU_FROZEN) { + /* + * When a vcpu is "frozen" it is outside the critical section + * around VMRUN() and 'nextrip' points to the next instruction. + * Thus instruction restart is achieved by setting 'nextrip' + * to the vcpu's %rip. + */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); + KASSERT(!error, ("%s: error %d getting rip", __func__, error)); + VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " + "nextrip from %#llx to %#llx", vcpu->nextrip, rip); + vcpu->nextrip = rip; + } else { + xhyve_abort("%s: invalid state %d\n", __func__, state); + } + return (0); +} + +int +vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) +{ + struct vcpu *vcpu; + int type, vector; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + if (type == VM_INTINFO_NMI && vector != IDT_NMI) + return (EINVAL); + if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) + return (EINVAL); + if (info & VM_INTINFO_RSVD) + return (EINVAL); + } else { + info = 0; + } + VCPU_CTR2(vm, vcpuid, "%s: info1(%#llx)", __func__, info); + vcpu->exitintinfo = info; + return (0); +} + +enum exc_class { + EXC_BENIGN, + EXC_CONTRIBUTORY, + EXC_PAGEFAULT +}; + +#define IDT_VE 20 /* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ + int type, vector; + + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#llx", info)); + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + + /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ + switch (type) { + case VM_INTINFO_HWINTR: + case VM_INTINFO_SWINTR: + case VM_INTINFO_NMI: + return (EXC_BENIGN); + default: + /* + * Hardware exception. + * + * SVM and VT-x use identical type values to represent NMI, + * hardware interrupt and software interrupt. + * + * SVM uses type '3' for all exceptions. VT-x uses type '3' + * for exceptions except #BP and #OF. #BP and #OF use a type + * value of '5' or '6'. Therefore we don't check for explicit + * values of 'type' to classify 'intinfo' into a hardware + * exception. + */ + break; + } + + switch (vector) { + case IDT_PF: + case IDT_VE: + return (EXC_PAGEFAULT); + case IDT_DE: + case IDT_TS: + case IDT_NP: + case IDT_SS: + case IDT_GP: + return (EXC_CONTRIBUTORY); + default: + return (EXC_BENIGN); + } +} + +static int +nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, + uint64_t *retinfo) +{ + enum exc_class exc1, exc2; + int type1, vector1; + + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#llx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#llx is not valid", info2)); + + /* + * If an exception occurs while attempting to call the double-fault + * handler the processor enters shutdown mode (aka triple fault). 
+ */ + type1 = info1 & VM_INTINFO_TYPE; + vector1 = info1 & 0xff; + if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { + VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#llx), info2(%#llx)", + info1, info2); + vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); + *retinfo = 0; + return (0); + } + + /* + * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 + */ + exc1 = exception_class(info1); + exc2 = exception_class(info2); + if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || + (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { + /* Convert nested fault into a double fault. */ + *retinfo = IDT_DF; + *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + *retinfo |= VM_INTINFO_DEL_ERRCODE; + } else { + /* Handle exceptions serially */ + *retinfo = info2; + } + return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ + uint64_t info = 0; + + if (vcpu->exception_pending) { + info = vcpu->exc_vector & 0xff; + info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + if (vcpu->exc_errcode_valid) { + info |= VM_INTINFO_DEL_ERRCODE; + info |= (uint64_t)vcpu->exc_errcode << 32; + } + } + return (info); +} + +int +vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) +{ + struct vcpu *vcpu; + uint64_t info1, info2; + int valid; + + KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + + vcpu = &vm->vcpu[vcpuid]; + + info1 = vcpu->exitintinfo; + vcpu->exitintinfo = 0; + + info2 = 0; + if (vcpu->exception_pending) { + info2 = vcpu_exception_intinfo(vcpu); + vcpu->exception_pending = 0; + VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#llx", + vcpu->exc_vector, info2); + } + + if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { + valid = nested_fault(vm, vcpuid, info1, info2, retinfo); + } else if (info1 & VM_INTINFO_VALID) { + *retinfo = info1; + valid = 1; + } else if (info2 & VM_INTINFO_VALID) { + *retinfo = info2; + valid = 1; + } else { + valid = 0; + } + + if (valid) { + VCPU_CTR4(vm, vcpuid, "%s: info1(%#llx), info2(%#llx), " + "retinfo(%#llx)", __func__, info1, info2, *retinfo); + } + + return (valid); +} + +int +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + *info1 = vcpu->exitintinfo; + *info2 = vcpu_exception_intinfo(vcpu); + return (0); +} + +int +vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (vector < 0 || vector >= 32) + return (EINVAL); + + /* + * A double fault exception should never be injected directly into + * the guest. It is a derived exception that results from specific + * combinations of nested faults. + */ + if (vector == IDT_DF) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->exception_pending) { + VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " + "pending exception %d", vector, vcpu->exc_vector); + return (EBUSY); + } + + /* + * From section 26.6.1 "Interruptibility State" in Intel SDM: + * + * Event blocking by "STI" or "MOV SS" is cleared after guest executes + * one instruction or incurs an exception. 
+ */ + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); + KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", + __func__, error)); + + if (restart_instruction) + vm_restart_instruction(vm, vcpuid); + + vcpu->exception_pending = 1; + vcpu->exc_vector = vector; + vcpu->exc_errcode = errcode; + vcpu->exc_errcode_valid = errcode_valid; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); + return (0); +} + +void +vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, + int errcode) +{ + struct vm *vm; + int error, restart_instruction; + + vm = vmarg; + restart_instruction = 1; + + error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, + ((uint32_t) errcode), restart_instruction); + KASSERT(error == 0, ("vm_inject_exception error %d", error)); +} + +void +vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) +{ + struct vm *vm; + int error; + + vm = vmarg; + VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#llx", + error_code, cr2); + + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); + KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); + + vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); +} + +static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); + +int +vm_inject_nmi(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->nmi_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + return (0); +} + +int +vm_nmi_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + xhyve_abort("vm_nmi_pending: invalid vcpuid %d\n", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->nmi_pending); +} + +void +vm_nmi_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + xhyve_abort("vm_nmi_pending: invalid vcpuid %d\n", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->nmi_pending == 0) + xhyve_abort("vm_nmi_clear: inconsistent nmi_pending state\n"); + + vcpu->nmi_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); +} + +static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); + +int +vm_inject_extint(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->extint_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + return (0); +} + +int +vm_extint_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + xhyve_abort("vm_extint_pending: invalid vcpuid %d\n", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->extint_pending); +} + +void +vm_extint_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + xhyve_abort("vm_extint_pending: invalid vcpuid %d\n", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->extint_pending == 0) + xhyve_abort("vm_extint_clear: inconsistent extint_pending state\n"); + + vcpu->extint_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || 
type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +struct vioapic * +vm_ioapic(struct vm *vm) +{ + + return (vm->vioapic); +} + +struct vhpet * +vm_hpet(struct vm *vm) +{ + + return (vm->vhpet); +} + +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + int error; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + xhyve_abort("vm_set_run_state: invalid vcpuid %d\n", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); + vcpu_unlock(vcpu); + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + enum vcpu_state state; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + xhyve_abort("vm_get_run_state: invalid vcpuid %d\n", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + state = vcpu->state; + vcpu_unlock(vcpu); + + return (state); +} + +int +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (CPU_ISSET(((unsigned) vcpuid), &vm->active_cpus)) + return (EBUSY); + + VCPU_CTR0(vm, vcpuid, "activated"); + CPU_SET_ATOMIC(((unsigned) vcpuid), &vm->active_cpus); + return (0); +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} + +int +vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *state = vm->vcpu[vcpuid].x2apic_state; + + return (0); +} + +int +vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (state >= X2APIC_STATE_LAST) + return (EINVAL); + + vm->vcpu[vcpuid].x2apic_state = state; + + vlapic_set_x2apic_state(vm, vcpuid, state); + + return (0); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. + */ +void +vcpu_notify_event(struct vm *vm, int vcpuid, UNUSED bool lapic_intr) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpuid]; + vcpu_lock(vcpu); + if (vcpu->state == VCPU_RUNNING) { + VCPU_INTERRUPT(vcpuid); + /* FIXME */ + // if (hostcpu != curcpu) { + // if (lapic_intr) { + // vlapic_post_intr(vcpu->vlapic, hostcpu, + // vmm_ipinum); + // } else { + // ipi_cpu(hostcpu, vmm_ipinum); + // } + // } else { + // /* + // * If the 'vcpu' is running on 'curcpu' then it must + // * be sending a notification to itself (e.g. SELF_IPI). + // * The pending event will be picked up when the vcpu + // * transitions back to guest context. 
+ // */ + // } + } else { + if (vcpu->state == VCPU_SLEEPING) + pthread_cond_signal(&vcpu->vcpu_sleep_cnd); + //wakeup_one(vcpu); + } + vcpu_unlock(vcpu); +} + +int +vm_apicid2vcpuid(UNUSED struct vm *vm, int apicid) +{ + /* + * XXX apic id is assumed to be numerically identical to vcpu id + */ + return (apicid); +} + +void +vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, + vm_rendezvous_func_t func, void *arg) +{ + int i; + + KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), + ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); + +restart: + pthread_mutex_lock(&vm->rendezvous_mtx); + if (vm->rendezvous_func != NULL) { + /* + * If a rendezvous is already in progress then we need to + * call the rendezvous handler in case this 'vcpuid' is one + * of the targets of the rendezvous. + */ + RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); + pthread_mutex_unlock(&vm->rendezvous_mtx); + vm_handle_rendezvous(vm, vcpuid); + goto restart; + } + KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " + "rendezvous is still in progress")); + + RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); + vm->rendezvous_req_cpus = dest; + CPU_ZERO(&vm->rendezvous_done_cpus); + vm->rendezvous_arg = arg; + vm_set_rendezvous_func(vm, func); + pthread_mutex_unlock(&vm->rendezvous_mtx); + + /* + * Wake up any sleeping vcpus and trigger a VM-exit in any running + * vcpus so they handle the rendezvous as soon as possible. + */ + for (i = 0; i < VM_MAXCPU; i++) { + if (CPU_ISSET(((unsigned) i), &dest)) + vcpu_notify_event(vm, i, false); + } + + vm_handle_rendezvous(vm, vcpuid); +} + +struct vatpic * +vm_atpic(struct vm *vm) +{ + return (vm->vatpic); +} + +struct vatpit * +vm_atpit(struct vm *vm) +{ + return (vm->vatpit); +} + +struct vpmtmr * +vm_pmtmr(struct vm *vm) +{ + + return (vm->vpmtmr); +} + +struct vrtc * +vm_rtc(struct vm *vm) +{ + + return (vm->vrtc); +} + +enum vm_reg_name +vm_segment_name(int seg) +{ + static enum vm_reg_name seg_names[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS + }; + + KASSERT(seg >= 0 && seg < ((int) nitems(seg_names)), + ("%s: invalid segment encoding %d", __func__, seg)); + return (seg_names[seg]); +} + +void +vm_copy_teardown(UNUSED struct vm *vm, UNUSED int vcpuid, + struct vm_copyinfo *copyinfo, int num_copyinfo) +{ + bzero(copyinfo, ((unsigned) num_copyinfo) * sizeof(struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo, int *fault) +{ + int error, idx, nused; + size_t n, off, remaining; + void *hva; + uint64_t gpa; + + bzero(copyinfo, sizeof(struct vm_copyinfo) * ((unsigned) num_copyinfo)); + + nused = 0; + remaining = len; + while (remaining > 0) { + KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); + off = gpa & XHYVE_PAGE_MASK; + n = min(remaining, XHYVE_PAGE_SIZE - off); + copyinfo[nused].gpa = gpa; + copyinfo[nused].len = n; + remaining -= n; + gla += n; + nused++; + } + + for (idx = 0; idx < nused; idx++) { + hva = vm_gpa2hva(vm, copyinfo[idx].gpa, copyinfo[idx].len); + if (hva == NULL) + break; + copyinfo[idx].hva = hva; + } + + if (idx != nused) { + vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); + return (EFAULT); + } else { + *fault = 0; + return (0); + } +} + +void +vm_copyin(UNUSED 
struct vm *vm, UNUSED int vcpuid, struct vm_copyinfo *copyinfo, + void *kaddr, size_t len) +{ + char *dst; + int idx; + + dst = kaddr; + idx = 0; + while (len > 0) { + bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); + len -= copyinfo[idx].len; + dst += copyinfo[idx].len; + idx++; + } +} + +void +vm_copyout(UNUSED struct vm *vm, UNUSED int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len) +{ + const char *src; + int idx; + + src = kaddr; + idx = 0; + while (len > 0) { + bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); + len -= copyinfo[idx].len; + src += copyinfo[idx].len; + idx++; + } +} diff --git a/vendor/github.com/hooklift/xhyve/vmm_api.c b/vendor/github.com/hooklift/xhyve/vmm_api.c new file mode 100644 index 0000000..062473e --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vmm_api.c @@ -0,0 +1,831 @@ +/*- + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct vm *vm; +static int memflags; +static uint32_t lowmem_limit; +static enum vm_mmap_style mmap_style; +static size_t lowmem; +static void *lowmem_addr; +static size_t highmem; +static void *highmem_addr; + +static void +vcpu_freeze(int vcpu, bool freeze) +{ + enum vcpu_state state; + + state = (freeze) ? VCPU_FROZEN : VCPU_IDLE; + + if (vcpu_set_state(vm, vcpu, state, freeze)) { + xhyve_abort("vcpu_set_state failed\n"); + } +} + +static void +vcpu_freeze_all(bool freeze) +{ + enum vcpu_state state; + int vcpu; + + state = (freeze) ? 
VCPU_FROZEN : VCPU_IDLE; + + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + if (vcpu_set_state(vm, vcpu, state, freeze)) { + xhyve_abort("vcpu_set_state failed\n"); + } + } +} + +int +xh_vm_create(void) +{ + int error; + + if (vm != NULL) { + return (EEXIST); + } + + error = vmm_init(); + + if (error != 0) { + return (error); + } + + memflags = 0; + lowmem_limit = (3ull << 30); + + return (vm_create(&vm)); +} + +void +xh_vm_destroy(void) +{ + assert(vm != NULL); + + vm_destroy(vm); + + if (vmm_cleanup() == 0) { + vm = NULL; + } +} + +int +xh_vcpu_create(int vcpu) +{ + assert(vm != NULL); + return (vcpu_create(vm, vcpu)); +} + +void +xh_vcpu_destroy(int vcpu) +{ + assert(vm != NULL); + vcpu_destroy(vm, vcpu); +} + +int +xh_vm_get_memory_seg(uint64_t gpa, size_t *ret_len) +{ + int error; + + struct vm_memory_segment seg; + + error = vm_gpabase2memseg(vm, gpa, &seg); + + if (error == 0) { + *ret_len = seg.len; + } + + return (error); +} + +static int +setup_memory_segment(uint64_t gpa, size_t len, void **addr) +{ + void *object; + uint64_t offset; + int error; + + vcpu_freeze_all(true); + error = vm_malloc(vm, gpa, len); + if (error == 0) { + error = vm_get_memobj(vm, gpa, len, &offset, &object); + if (error == 0) { + *addr = (void *) (((uintptr_t) object) + offset); + } + } + vcpu_freeze_all(false); + return (error); +} + +int +xh_vm_setup_memory(size_t len, enum vm_mmap_style vms) +{ + void **addr; + int error; + + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL); + + mmap_style = vms; + + /* + * If 'len' cannot fit entirely in the 'lowmem' segment then + * create another 'highmem' segment above 4GB for the remainder. + */ + + lowmem = (len > lowmem_limit) ? lowmem_limit : len; + highmem = (len > lowmem_limit) ? (len - lowmem) : 0; + + if (lowmem > 0) { + addr = (vms == VM_MMAP_ALL) ? &lowmem_addr : NULL; + if ((error = setup_memory_segment(0, lowmem, addr))) { + return (error); + } + } + + if (highmem > 0) { + addr = (vms == VM_MMAP_ALL) ? 
&highmem_addr : NULL; + if ((error = setup_memory_segment((4ull << 30), highmem, addr))) { + return (error); + } + } + + return (0); +} + +void * +xh_vm_map_gpa(uint64_t gpa, size_t len) +{ + assert(mmap_style == VM_MMAP_ALL); + + if ((gpa < lowmem) && ((gpa + len) <= lowmem)) { + return ((void *) (((uintptr_t) lowmem_addr) + gpa)); + } + + if (gpa >= (4ull << 30)) { + gpa -= (4ull << 30); + if ((gpa < highmem) && ((gpa + len) <= highmem)) { + return ((void *) (((uintptr_t) highmem_addr) + gpa)); + } + } + + return (NULL); +} + +int +xh_vm_gla2gpa(int vcpu, struct vm_guest_paging *paging, uint64_t gla, + int prot, uint64_t *gpa, int *fault) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_gla2gpa(vm, vcpu, paging, gla, prot, gpa, fault); + vcpu_freeze(vcpu, false); + + return (error); +} + +uint32_t +xh_vm_get_lowmem_limit(void) +{ + return (lowmem_limit); +} + +void +xh_vm_set_lowmem_limit(uint32_t limit) +{ + lowmem_limit = limit; +} + +void +xh_vm_set_memflags(int flags) +{ + memflags = flags; +} + +size_t +xh_vm_get_lowmem_size(void) +{ + return (lowmem); +} + +size_t +xh_vm_get_highmem_size(void) +{ + return (highmem); +} + +int +xh_vm_set_desc(int vcpu, int reg, uint64_t base, uint32_t limit, + uint32_t access) +{ + struct seg_desc sd; + int error; + + sd.base = base; + sd.limit = limit; + sd.access = access; + vcpu_freeze(vcpu, true); + error = vm_set_seg_desc(vm, vcpu, reg, &sd); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_get_desc(int vcpu, int reg, uint64_t *base, uint32_t *limit, + uint32_t *access) +{ + struct seg_desc sd; + int error; + + vcpu_freeze(vcpu, true); + error = vm_get_seg_desc(vm, vcpu, reg, &sd); + if (error == 0) { + *base = sd.base; + *limit = sd.limit; + *access = sd.access; + } + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_get_seg_desc(int vcpu, int reg, struct seg_desc *seg_desc) +{ + int error; + + error = xh_vm_get_desc(vcpu, reg, &seg_desc->base, &seg_desc->limit, + &seg_desc->access); + + return (error); +} + +int +xh_vm_set_register(int vcpu, int reg, uint64_t val) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_set_register(vm, vcpu, reg, val); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_get_register(int vcpu, int reg, uint64_t *retval) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_get_register(vm, vcpu, reg, retval); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_run(int vcpu, struct vm_exit *ret_vmexit) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_run(vm, vcpu, ret_vmexit); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_suspend(enum vm_suspend_how how) +{ + return (vm_suspend(vm, how)); +} + +int +xh_vm_reinit(void) +{ + int error; + + vcpu_freeze_all(true); + error = vm_reinit(vm); + vcpu_freeze_all(false); + + return (error); +} + +int +xh_vm_apicid2vcpu(int apicid) +{ + return (apicid); +} + +int +xh_vm_inject_exception(int vcpu, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_inject_exception(vm, vcpu, vector, errcode_valid, errcode, + restart_instruction); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_lapic_irq(int vcpu, int vector) +{ + return (lapic_intr_edge(vm, vcpu, vector)); +} + +int +xh_vm_lapic_local_irq(int vcpu, int vector) +{ + return (lapic_set_local_intr(vm, vcpu, vector)); +} + +int +xh_vm_lapic_msi(uint64_t addr, uint64_t msg) +{ + return (lapic_intr_msi(vm, addr, msg)); +} + +int 
+xh_vm_ioapic_assert_irq(int irq) +{ + return (vioapic_assert_irq(vm, irq)); +} + +int +xh_vm_ioapic_deassert_irq(int irq) +{ + return (vioapic_deassert_irq(vm, irq)); +} + +int +xh_vm_ioapic_pulse_irq(int irq) +{ + return (vioapic_pulse_irq(vm, irq)); +} + +int +xh_vm_ioapic_pincount(int *pincount) +{ + *pincount = vioapic_pincount(vm); + return (0); +} + +int +xh_vm_isa_assert_irq(int atpic_irq, int ioapic_irq) +{ + int error; + + error = vatpic_assert_irq(vm, atpic_irq); + + if ((error == 0) && (ioapic_irq != -1)) { + error = vioapic_assert_irq(vm, ioapic_irq); + } + + return (error); +} + +int +xh_vm_isa_deassert_irq(int atpic_irq, int ioapic_irq) +{ + int error; + + error = vatpic_deassert_irq(vm, atpic_irq); + if ((error == 0) && (ioapic_irq != -1)) { + error = vioapic_deassert_irq(vm, ioapic_irq); + } + + return (error); +} + +int +xh_vm_isa_pulse_irq(int atpic_irq, int ioapic_irq) +{ + int error; + + error = vatpic_pulse_irq(vm, atpic_irq); + if ((error == 0) && (ioapic_irq != -1)) { + error = vioapic_pulse_irq(vm, ioapic_irq); + } + + return (error); +} + +int +xh_vm_isa_set_irq_trigger(int atpic_irq, enum vm_intr_trigger trigger) +{ + return (vatpic_set_irq_trigger(vm, atpic_irq, trigger)); +} + +int +xh_vm_inject_nmi(int vcpu) +{ + return (vm_inject_nmi(vm, vcpu)); +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +static struct { + const char *name; + int type; +} capstrmap[] = { + { "hlt_exit", VM_CAP_HALT_EXIT }, + { "mtrap_exit", VM_CAP_MTRAP_EXIT }, + { "pause_exit", VM_CAP_PAUSE_EXIT }, + { NULL, 0 } +}; +#pragma clang diagnostic pop + +int +xh_vm_capability_name2type(const char *capname) +{ + int i; + + for (i = 0; (capstrmap[i].name != NULL) && (capname != NULL); i++) { + if (strcmp(capstrmap[i].name, capname) == 0) { + return (capstrmap[i].type); + } + } + + return (-1); +} + +const char * +xh_vm_capability_type2name(int type) +{ + int i; + + for (i = 0; (capstrmap[i].name != NULL); i++) { + if (capstrmap[i].type == type) { + return (capstrmap[i].name); + } + } + + return (NULL); +} + +int +xh_vm_get_capability(int vcpu, enum vm_cap_type cap, int *retval) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_get_capability(vm, vcpu, cap, retval); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_set_capability(int vcpu, enum vm_cap_type cap, int val) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_set_capability(vm, vcpu, cap, val); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_get_intinfo(int vcpu, uint64_t *i1, uint64_t *i2) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_get_intinfo(vm, vcpu, i1, i2); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_set_intinfo(int vcpu, uint64_t exit_intinfo) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_exit_intinfo(vm, vcpu, exit_intinfo); + vcpu_freeze(vcpu, false); + + return (error); +} + +uint64_t * +xh_vm_get_stats(int vcpu, struct timeval *ret_tv, int *ret_entries) +{ + static uint64_t statbuf[64]; + struct timeval tv; + int re; + int error; + + getmicrotime(&tv); + error = vmm_stat_copy(vm, vcpu, &re, ((uint64_t *) &statbuf)); + + if (error == 0) { + if (ret_entries) { + *ret_entries = re; + } + if (ret_tv) { + *ret_tv = tv; + } + return (((uint64_t *) &statbuf)); + } else { + return (NULL); + } +} + +const char * +xh_vm_get_stat_desc(int index) +{ + static char desc[128]; + + if (vmm_stat_desc_copy(index, ((char *) &desc), sizeof(desc)) == 0) { + return (desc); + } else { + return (NULL); + } +} + +int 
+xh_vm_get_x2apic_state(int vcpu, enum x2apic_state *s) +{ + return (vm_get_x2apic_state(vm, vcpu, s)); +} + +int +xh_vm_set_x2apic_state(int vcpu, enum x2apic_state s) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_set_x2apic_state(vm, vcpu, s); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_get_hpet_capabilities(uint32_t *capabilities) +{ + return (vhpet_getcap(capabilities)); +} + +int +xh_vm_copy_setup(int vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, + int prot, struct iovec *iov, int iovcnt, int *fault) +{ + void *va; + uint64_t gpa; + size_t n, off; + int i, error; + + for (i = 0; i < iovcnt; i++) { + iov[i].iov_base = 0; + iov[i].iov_len = 0; + } + + while (len) { + assert(iovcnt > 0); + + error = xh_vm_gla2gpa(vcpu, pg, gla, prot, &gpa, fault); + if ((error) || *fault) { + return (error); + } + + off = gpa & XHYVE_PAGE_MASK; + n = min(len, XHYVE_PAGE_SIZE - off); + + va = xh_vm_map_gpa(gpa, n); + if (va == NULL) { + return (EFAULT); + } + + iov->iov_base = va; + iov->iov_len = n; + iov++; + iovcnt--; + + gla += n; + len -= n; + } + + return (0); +} + +void +xh_vm_copyin(struct iovec *iov, void *dst, size_t len) +{ + const char *src; + char *d; + size_t n; + + d = dst; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + src = iov->iov_base; + bcopy(src, d, n); + iov++; + d += n; + len -= n; + } +} + +void +xh_vm_copyout(const void *src, struct iovec *iov, size_t len) +{ + const char *s; + char *dst; + size_t n; + + s = src; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + dst = iov->iov_base; + bcopy(s, dst, n); + iov++; + s += n; + len -= n; + } +} + +int +xh_vm_rtc_write(int offset, uint8_t value) +{ + return (vrtc_nvram_write(vm, offset, value)); +} + +int +xh_vm_rtc_read(int offset, uint8_t *retval) +{ + return (vrtc_nvram_read(vm, offset, retval)); +} + +int +xh_vm_rtc_settime(time_t secs) +{ + return (vrtc_set_time(vm, secs)); +} + +int +xh_vm_rtc_gettime(time_t *secs) +{ + *secs = vrtc_get_time(vm); + return (0); +} + +int +xh_vcpu_reset(int vcpu) +{ + int error; + +#define SET_REG(r, v) (error = xh_vm_set_register(vcpu, (r), (v))) +#define SET_DESC(d, b, l, a) (error = xh_vm_set_desc(vcpu, (d), (b), (l), (a))) + + if (SET_REG(VM_REG_GUEST_RFLAGS, 0x2) || + SET_REG(VM_REG_GUEST_RIP, 0xfff0) || + SET_REG(VM_REG_GUEST_CR0, CR0_NE) || + SET_REG(VM_REG_GUEST_CR3, 0) || + SET_REG(VM_REG_GUEST_CR4, 0) || + SET_REG(VM_REG_GUEST_CS, 0xf000) || + SET_REG(VM_REG_GUEST_SS, 0) || + SET_REG(VM_REG_GUEST_DS, 0) || + SET_REG(VM_REG_GUEST_ES, 0) || + SET_REG(VM_REG_GUEST_FS, 0) || + SET_REG(VM_REG_GUEST_GS, 0) || + SET_REG(VM_REG_GUEST_RAX, 0) || + SET_REG(VM_REG_GUEST_RBX, 0) || + SET_REG(VM_REG_GUEST_RCX, 0) || + SET_REG(VM_REG_GUEST_RDX, 0xf00) || + SET_REG(VM_REG_GUEST_RSI, 0) || + SET_REG(VM_REG_GUEST_RDI, 0) || + SET_REG(VM_REG_GUEST_RBP, 0) || + SET_REG(VM_REG_GUEST_RSP, 0) || + SET_REG(VM_REG_GUEST_TR, 0) || + SET_REG(VM_REG_GUEST_LDTR, 0) || + SET_DESC(VM_REG_GUEST_CS, 0xffff0000, 0xffff, 0x0093) || + SET_DESC(VM_REG_GUEST_SS, 0, 0xffff, 0x0093) || + SET_DESC(VM_REG_GUEST_DS, 0, 0xffff, 0x0093) || + SET_DESC(VM_REG_GUEST_ES, 0, 0xffff, 0x0093) || + SET_DESC(VM_REG_GUEST_FS, 0, 0xffff, 0x0093) || + SET_DESC(VM_REG_GUEST_GS, 0, 0xffff, 0x0093) || + SET_DESC(VM_REG_GUEST_GDTR, 0, 0xffff, 0) || + SET_DESC(VM_REG_GUEST_IDTR, 0, 0xffff, 0) || + SET_DESC(VM_REG_GUEST_TR, 0, 0, 0x0000008b) || + SET_DESC(VM_REG_GUEST_LDTR, 0, 0xffff, 0x00000082)) + { + return (error); + } + + return (0); +} + +int 
+xh_vm_active_cpus(cpuset_t *cpus) +{ + *cpus = vm_active_cpus(vm); + return (0); +} + +int +xh_vm_suspended_cpus(cpuset_t *cpus) +{ + *cpus = vm_suspended_cpus(vm); + return (0); +} + +int +xh_vm_activate_cpu(int vcpu) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_activate_cpu(vm, vcpu); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_restart_instruction(int vcpu) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vm_restart_instruction(vm, vcpu); + vcpu_freeze(vcpu, false); + + return (error); +} + +int +xh_vm_emulate_instruction(int vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + vcpu_freeze(vcpu, true); + error = vmm_emulate_instruction(vm, vcpu, gpa, vie, paging, memread, + memwrite, memarg); + vcpu_freeze(vcpu, false); + + return (error); +} diff --git a/vendor/github.com/hooklift/xhyve/vmm_callout.c b/vendor/github.com/hooklift/xhyve/vmm_callout.c new file mode 100644 index 0000000..73011fe --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vmm_callout.c @@ -0,0 +1,379 @@ +/*- + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +/* makeshift callout implementation based on OSv and FreeBSD */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define callout_cmp(a, b) ((a)->timeout < (b)->timeout) + +static mach_timebase_info_data_t timebase_info; +static pthread_t callout_thread; +static pthread_mutex_t callout_mtx; +static pthread_cond_t callout_cnd; +static struct callout *callout_queue; +static bool work; +static bool initialized = false; + +static inline uint64_t nanos_to_abs(uint64_t nanos) { + return (nanos * timebase_info.denom) / timebase_info.numer; +} + +static inline uint64_t abs_to_nanos(uint64_t abs) { + return (abs * timebase_info.numer) / timebase_info.denom; +} + +static inline uint64_t sbt2mat(sbintime_t sbt) { + uint64_t s, ns; + + s = (((uint64_t) sbt) >> 32); + ns = (((uint64_t) 1000000000) * (uint32_t) sbt) >> 32; + + return (nanos_to_abs((s * 1000000000) + ns)); +} + +static inline void mat_to_ts(uint64_t mat, struct timespec *ts) { + uint64_t ns; + + ns = abs_to_nanos(mat); + + ts->tv_sec = (ns / 1000000000); + ts->tv_nsec = (ns % 1000000000); +} + +void binuptime(struct bintime *bt) { + uint64_t ns; + + ns = abs_to_nanos(mach_absolute_time()); + + bt->sec = (ns / 1000000000); + bt->frac = (((ns % 1000000000) * (((uint64_t) 1 << 63) / 500000000))); +} + +void getmicrotime(struct timeval *tv) { + uint64_t ns, sns; + + ns = abs_to_nanos(mach_absolute_time()); + + sns = (ns / 1000000000); + tv->tv_sec = (long) sns; + tv->tv_usec = (int) ((ns - sns) / 1000); +} + +static void callout_insert(struct callout *c) { + struct callout *node = callout_queue; + + if (!node) { + callout_queue = c; + c->prev = NULL; + c->next = NULL; + c->queued = 1; + return; + } + + if (callout_cmp(c, node)) { + node->prev = c; + c->prev = NULL; + c->next = node; + callout_queue = c; + c->queued = 1; + return; + } + + while (node->next) { + if (callout_cmp(c, node->next)) { + c->prev = node; + c->next = node->next; + node->next->prev = c; + node->next = c; + c->queued = 1; + return; + } + node = node->next; + } + + c->prev = node; + c->next = NULL; + node->next = c; + c->queued = 1; +} + +static void callout_remove(struct callout *c) { + if (!c->queued) { + return; + } + + if (c->prev) { + c->prev->next = c->next; + } else { + callout_queue = c->next; + } + + if (c->next) { + c->next->prev = c->prev; + } + + c->prev = NULL; + c->next = NULL; + c->queued = 0; +} + +static void *callout_thread_func(UNUSED void *arg) { + struct callout *c; + struct timespec ts; + uint64_t delta, mat; + int ret; + + pthread_mutex_lock(&callout_mtx); + + while (true) { + /* wait for work */ + while (!callout_queue) { + pthread_cond_wait(&callout_cnd, &callout_mtx); + }; + + /* get the callout with the nearest timout */ + c = callout_queue; + + if (!(c->flags & (CALLOUT_ACTIVE | CALLOUT_PENDING))) { + abort(); + } + + /* wait for timeout */ + ret = 0; + while ((ret != ETIMEDOUT) && !work) { + mat = mach_absolute_time(); + if (mat >= c->timeout) { + /* XXX: it might not be worth sleeping for very short timeouts */ + ret = ETIMEDOUT; + break; + } + + delta = c->timeout - mat; + mat_to_ts(delta, &ts); + ret = pthread_cond_timedwait_relative_np(&callout_cnd, &callout_mtx, &ts); + }; + + work = false; + + if (!(ret == ETIMEDOUT) || !c->queued) { + continue; + } + + /* dispatch */ + c->flags &= ~CALLOUT_PENDING; + + pthread_mutex_unlock(&callout_mtx); + c->callout(c->argument); + pthread_mutex_lock(&callout_mtx); + + /* note: after the handler has been invoked the callout 
structure can look + * much differently, the handler may have rescheduled the callout or + * even freed it. + * + * if the callout is still enqueued it means that it hasn't been + * freed by the user + * + * reset || drain || !stop + */ + + if (c->queued) { + /* if the callout hasn't been rescheduled, remove it */ + if (((c->flags & CALLOUT_PENDING) == 0) || (c->flags & CALLOUT_WAITING)) { + c->flags |= CALLOUT_COMPLETED; + callout_remove(c); + } + } + } + + return NULL; +} + +void callout_init(struct callout *c, int mpsafe) { + if (!mpsafe) { + abort(); + } + + memset(c, 0, sizeof(struct callout)); + + if (pthread_cond_init(&c->wait, NULL)) { + abort(); + } +} + +static int callout_stop_safe_locked(struct callout *c, int drain) { + int result = 0; + + if ((drain) && (pthread_self() != callout_thread) && (callout_pending(c) || + (callout_active(c) && !callout_completed(c)))) + { + if (c->flags & CALLOUT_WAITING) { + abort(); + } + + /* wait for callout */ + c->flags |= CALLOUT_WAITING; + work = true; + + pthread_cond_signal(&callout_cnd); + + while (!(c->flags & CALLOUT_COMPLETED)) { + pthread_cond_wait(&c->wait, &callout_mtx); + } + + c->flags &= ~CALLOUT_WAITING; + result = 1; + } + + callout_remove(c); + + /* clear flags */ + c->flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING | CALLOUT_COMPLETED | + CALLOUT_WAITING); + + return (result); +} + +int callout_stop_safe(struct callout *c, int drain) { + pthread_mutex_lock(&callout_mtx); + callout_stop_safe_locked(c, drain); + pthread_mutex_unlock(&callout_mtx); + return 0; +} + +int callout_reset_sbt(struct callout *c, sbintime_t sbt, + UNUSED sbintime_t precision, void (*ftn)(void *), void *arg, int flags) +{ + int result; + bool is_next_timeout; + + is_next_timeout = false; + + pthread_mutex_lock(&callout_mtx); + + if (!((flags == 0) || (flags == C_ABSOLUTE)) || (c->flags !=0)) { + /* FIXME */ + //printf("XHYVE: callout_reset_sbt 0x%08x 0x%08x\r\n", flags, c->flags); + //abort(); + } + + c->timeout = sbt2mat(sbt); + + if (flags != C_ABSOLUTE) { + c->timeout += mach_absolute_time(); + } + + result = callout_stop_safe_locked(c, 0); + + c->callout = ftn; + c->argument = arg; + c->flags |= (CALLOUT_PENDING | CALLOUT_ACTIVE); + + callout_insert(c); + + if (c == callout_queue) { + work = true; + is_next_timeout = true; + } + + pthread_mutex_unlock(&callout_mtx); + + if (is_next_timeout) { + pthread_cond_signal(&callout_cnd); + is_next_timeout = false; + } + + return (result); +} + +void callout_system_init(void) { + if (initialized) { + return; + } + + mach_timebase_info(&timebase_info); + + if (pthread_mutex_init(&callout_mtx, NULL)) { + abort(); + } + + if (pthread_cond_init(&callout_cnd, NULL)) { + abort(); + } + + callout_queue = NULL; + work = false; + + if (pthread_create(&callout_thread, /*&attr*/ NULL, &callout_thread_func, + NULL)) + { + abort(); + } + + initialized = true; +} + +//static void callout_queue_print(void) { +// struct callout *node; +// +// pthread_mutex_lock(&callout_mtx); +// for (node = callout_queue; node; node = node->next) { +// printf("t:%llu -> ", abs_to_nanos(node->timeout)); +// if (!node->next) { +// break; +// } +// } +// pthread_mutex_unlock(&callout_mtx); +// printf("NULL\n"); +//} + +//void fire (void *arg) { +// printf("fire!\n"); +//} +// +//int main(void) { +// struct callout a; +// sbintime_t sbt; +// printf("xhyve_timer\n"); +// callout_system_init(); +// callout_init(&a, 1); +// sbt = ((sbintime_t) (((uint64_t) 3) << 32)); +// callout_reset_sbt(&a, sbt, 0, &fire, NULL, 0); +// while (1); +// return 0; +//} 
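The `sbintime_t` values accepted by `callout_reset_sbt()` and converted by `sbt2mat()` above use FreeBSD's 32.32 fixed-point format: the high 32 bits hold whole seconds, the low 32 bits a binary fraction of a second. A minimal sketch of the conversion math — modeled on FreeBSD's `nstosbt()`/`sbttons()`, with those names assumed from FreeBSD's `sys/time.h` and not part of this patch:

```c
#include <stdint.h>

typedef int64_t sbintime_t; /* 32.32 fixed-point seconds */

/* Nanoseconds -> sbintime_t. The whole-second and fractional parts are
 * converted separately so the 32-bit shift cannot overflow int64_t. */
static inline sbintime_t
nstosbt(int64_t ns)
{
	return ((ns / 1000000000) << 32) +
	    (((ns % 1000000000) << 32) / 1000000000);
}

/* sbintime_t -> nanoseconds; the fractional part mirrors the math in
 * sbt2mat() above: (2^32-scaled fraction * 10^9) >> 32. */
static inline int64_t
sbttons(sbintime_t sbt)
{
	return ((sbt >> 32) * 1000000000) +
	    (int64_t) (((uint64_t) (uint32_t) sbt * 1000000000) >> 32);
}
```

With that encoding, arming a one-shot 250 ms callout means passing `nstosbt(250000000)` as the `sbt` argument of `callout_reset_sbt()`; the commented-out test code above builds a 3-second timeout the same way by shifting `3` left by 32 bits.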
diff --git a/vendor/github.com/hooklift/xhyve/vmm_host.c b/vendor/github.com/hooklift/xhyve/vmm_host.c
new file mode 100644
index 0000000..e11e3cf
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/vmm_host.c
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * Copyright (c) 2015 xhyve developers
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <xhyve/support/misc.h>
+#include <xhyve/support/cpuid.h>
+#include <xhyve/vmm/vmm_host.h>
+
+static struct xsave_limits vmm_xsave_limits;
+
+void
+vmm_host_state_init(void)
+{
+	uint32_t avx1_0, regs[4];
+	size_t ln;
+
+	vmm_xsave_limits.xsave_enabled = 0;
+
+	ln = sizeof(uint32_t);
+	if (!sysctlbyname("hw.optional.avx1_0", &avx1_0, &ln, NULL, 0) && avx1_0) {
+		cpuid_count(0xd, 0x0, regs);
+		vmm_xsave_limits.xsave_enabled = 1;
+		vmm_xsave_limits.xcr0_allowed = XFEATURE_AVX;
+		vmm_xsave_limits.xsave_max_size = regs[1];
+	}
+}
+
+const struct xsave_limits *
+vmm_get_xsave_limits(void)
+{
+	return (&vmm_xsave_limits);
+}
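`vmm_host_state_init()` above only enables guest XSAVE/AVX state when the host advertises AVX via the `hw.optional.avx1_0` sysctl. The same probe can be run standalone; this is a hedged usage sketch using plain `sysctlbyname(3)`, with no xhyve headers and no claim about how the driver itself consumes the result.

```c
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	uint32_t avx1_0 = 0;
	size_t len = sizeof(avx1_0);

	/* hw.optional.avx1_0 is 1 when the CPU and OS support the AVX state */
	if (sysctlbyname("hw.optional.avx1_0", &avx1_0, &len, NULL, 0) == 0 && avx1_0)
		printf("AVX available: XFEATURE_AVX state could be exposed to guests\n");
	else
		printf("no AVX reported: guest XSAVE stays disabled\n");
	return (0);
}
```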
diff --git a/vendor/github.com/hooklift/xhyve/vmm_instruction_emul.c b/vendor/github.com/hooklift/xhyve/vmm_instruction_emul.c
new file mode 100644
index 0000000..4fb07b2
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/vmm_instruction_emul.c
@@ -0,0 +1,2388 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * Copyright (c) 2012 NetApp, Inc.
+ * Copyright (c) 2015 xhyve developers
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#include <xhyve/support/misc.h>
+#include <xhyve/support/atomic.h>
+#include <xhyve/support/psl.h>
+#include <xhyve/support/specialreg.h>
+#include <xhyve/vmm/vmm.h>
+#include <xhyve/vmm/vmm_instruction_emul.h>
+
+#define	PG_V	0x001	/* P	Valid */
+#define	PG_RW	0x002	/* R/W	Read/Write */
+#define	PG_U	0x004	/* U/S	User/Supervisor */
+#define	PG_A	0x020	/* A	Accessed */
+#define	PG_M	0x040	/* D	Dirty */
+#define	PG_PS	0x080	/* PS	Page size (0=4k,1=4M) */
+#define	PGEX_P		0x01	/* Protection violation vs. not present */
+#define	PGEX_W		0x02	/* during a Write cycle */
+#define	PGEX_U		0x04	/* access from User mode (UPL) */
+#define	PGEX_RSV	0x08	/* reserved PTE field is non-zero */
+#define	PGEX_I		0x10	/* during an instruction fetch */
+
+/* struct vie_op.op_type */
+enum {
+	VIE_OP_TYPE_NONE = 0,
+	VIE_OP_TYPE_MOV,
+	VIE_OP_TYPE_MOVSX,
+	VIE_OP_TYPE_MOVZX,
+	VIE_OP_TYPE_AND,
+	VIE_OP_TYPE_OR,
+	VIE_OP_TYPE_SUB,
+	VIE_OP_TYPE_TWO_BYTE,
+	VIE_OP_TYPE_PUSH,
+	VIE_OP_TYPE_CMP,
+	VIE_OP_TYPE_POP,
+	VIE_OP_TYPE_MOVS,
+	VIE_OP_TYPE_GROUP1,
+	VIE_OP_TYPE_STOS,
+	VIE_OP_TYPE_BITTEST,
+	VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
+#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
+#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
+#define	VIE_OP_F_NO_MODRM	(1 << 3)
+#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
+
+static const struct vie_op two_byte_opcodes[256] = {
+	[0xB6] = {
+		.op_byte = 0xB6,
+		.op_type = VIE_OP_TYPE_MOVZX,
+	},
+	[0xB7] = {
+		.op_byte = 0xB7,
+		.op_type = VIE_OP_TYPE_MOVZX,
+	},
+	[0xBA] = {
+		.op_byte = 0xBA,
+		.op_type = VIE_OP_TYPE_BITTEST,
+		.op_flags = VIE_OP_F_IMM8,
+	},
+	[0xBE] = {
+		.op_byte = 0xBE,
+		.op_type = VIE_OP_TYPE_MOVSX,
+	},
+};
+
+static const struct vie_op one_byte_opcodes[256] = {
+	[0x0F] = {
+		.op_byte = 0x0F,
+		.op_type = VIE_OP_TYPE_TWO_BYTE
+	},
+	[0x2B] = {
+		.op_byte = 0x2B,
+		.op_type = VIE_OP_TYPE_SUB,
+	},
+	[0x3B] = {
+		.op_byte = 0x3B,
+		.op_type = VIE_OP_TYPE_CMP,
+	},
+	[0x88] = {
+		.op_byte = 0x88,
+		.op_type = VIE_OP_TYPE_MOV,
+	},
+	[0x89] = {
+		.op_byte = 0x89,
+		.op_type = VIE_OP_TYPE_MOV,
+	},
+	[0x8A] = {
+		.op_byte = 0x8A,
+		.op_type = VIE_OP_TYPE_MOV,
+	},
+	[0x8B] = {
+		.op_byte = 0x8B,
+		.op_type = VIE_OP_TYPE_MOV,
+	},
+	[0xA1] = {
+		.op_byte = 0xA1,
+		.op_type = VIE_OP_TYPE_MOV,
+		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+	},
+	[0xA3] = {
+		.op_byte = 0xA3,
+		.op_type = VIE_OP_TYPE_MOV,
+		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+	},
+	[0xA4] = {
+		.op_byte = 0xA4,
+		.op_type = VIE_OP_TYPE_MOVS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
+	[0xA5] = {
+		.op_byte = 0xA5,
+		.op_type = VIE_OP_TYPE_MOVS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
+	[0xAA] = {
+		.op_byte = 0xAA,
+		.op_type = VIE_OP_TYPE_STOS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
+	[0xAB] = {
+		.op_byte = 0xAB,
+		.op_type = VIE_OP_TYPE_STOS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
+	[0xC6] = {
+		/* XXX Group 11
extended opcode - not just MOV */ + .op_byte = 0xC6, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM8, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, + }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + }, + [0x80] = { + /* Group 1 extended opcode */ + .op_byte = 0x80, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x81] = { + /* Group 1 extended opcode */ + .op_byte = 0x81, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM, + }, + [0x83] = { + /* Group 1 extended opcode */ + .op_byte = 0x83, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x8F] = { + /* XXX Group 1A extended opcode - not just POP */ + .op_byte = 0x8F, + .op_type = VIE_OP_TYPE_POP, + }, + [0xFF] = { + /* XXX Group 5 extended opcode - not just PUSH */ + .op_byte = 0xFF, + .op_type = VIE_OP_TYPE_PUSH, + } +}; + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) + +static enum vm_reg_name gpr_map[16] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15 +}; + +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + +static int +vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) +{ + int error; + + error = vm_get_register(vm, vcpuid, reg, rval); + + return (error); +} + +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) +{ + *lhbr = 0; + *reg = gpr_map[vie->reg]; + + /* + * 64-bit mode imposes limitations on accessing legacy high byte + * registers (lhbr). + * + * The legacy high-byte registers cannot be addressed if the REX + * prefix is present. In this case the values 4, 5, 6 and 7 of the + * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. + * + * If the REX prefix is not present then the values 4, 5, 6 and 7 + * of the 'ModRM:reg' field address the legacy high-byte registers, + * %ah, %ch, %dh and %bh respectively. + */ + if (!vie->rex_present) { + if (vie->reg & 0x4) { + *lhbr = 1; + *reg = gpr_map[vie->reg & 0x3]; + } + } +} + +static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ + uint64_t val; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &val); + + /* + * To obtain the value of a legacy high byte register shift the + * base register right by 8 bits (%ah = %rax >> 8). + */ + if (lhbr) + *rval = (uint8_t) (val >> 8); + else + *rval = (uint8_t) val; + return (error); +} + +static int +vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) +{ + uint64_t origval, val, mask; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &origval); + if (error == 0) { + val = byte; + mask = 0xff; + if (lhbr) { + /* + * Shift left by 8 to store 'byte' in a legacy high + * byte register. 
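+	 * e.g. writing 0x12 into %ah: val becomes 0x1200 and mask 0xff00, so
+	 * %al and the upper bytes of %rax are preserved by the merge below.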
+ */ + val <<= 8; + mask <<= 8; + } + val |= origval & ~mask; + error = vm_set_register(vm, vcpuid, reg, val); + } + return (error); +} + +int +vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + switch (size) { + case 1: + case 2: + error = vie_read_register(vm, vcpuid, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vm, vcpuid, reg, val); + return (error); +} + +#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +/* + * Return the status flags that would result from doing (x - y). + */ +#define GETCC(sz) \ +static u_long \ +getcc##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("sub %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETCC(8); +GETCC(16); +GETCC(32); +GETCC(64); + +static u_long +getcc(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getcc: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getcc8(((uint8_t) x), ((uint8_t) y))); + else if (opsize == 2) + return (getcc16(((uint16_t) x), ((uint16_t) y))); + else if (opsize == 4) + return (getcc32(((uint32_t) x), ((uint32_t) y))); + else + return (getcc64(x, y)); +} + +static int +emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint8_t byte; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x88: + /* + * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) + * 88/r: mov r/m8, r8 + * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) + */ + size = 1; /* override for byte operation */ + error = vie_read_bytereg(vm, vcpuid, vie, &byte); + if (error == 0) + error = memwrite(vm, vcpuid, gpa, byte, size, arg); + break; + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m16, r16 + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0x8A: + /* + * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) + * 8A/r: mov r8, r/m8 + * REX + 8A/r: mov r8, r/m8 + */ + size = 1; /* override for byte operation */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) + error = vie_write_bytereg(vm, vcpuid, vie, ((uint8_t) val)); + break; + case 0x8B: + /* + * MOV from mem (ModRM:r/m) to reg (ModRM:reg) + * 8B/r: mov r16, r/m16 + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA1: + /* + * MOV from seg:moffset to AX/EAX/RAX + * A1: mov AX, moffs16 + * A1: mov EAX, moffs32 + * REX.W + A1: mov RAX, moffs64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = VM_REG_GUEST_RAX; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA3: + /* + * MOV from AX/EAX/RAX to seg:moffset + * A3: mov moffs16, AX + * A3: mov 
moffs32, EAX + * REX.W + A3: mov moffs64, RAX + */ + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0xC6: + /* + * MOV from imm8 to mem (ModRM:r/m) + * C6/0 mov r/m8, imm8 + * REX + C6/0 mov r/m8, imm8 + */ + size = 1; /* override for byte operation */ + error = memwrite(vm, vcpuid, gpa, ((uint64_t) vie->immediate), size, + arg); + break; + case 0xC7: + /* + * MOV from imm16/imm32 to mem (ModRM:r/m) + * C7/0 mov r/m16, imm16 + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = ((uint64_t) vie->immediate) & size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + break; + default: + break; + } + + return (error); +} + +static int +emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, UNUSED mem_region_write_t memwrite, + void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xB6: + /* + * MOV and zero extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B6/r movzx r16, r/m8 + * 0F B6/r movzx r32, r/m8 + * REX.W + 0F B6/r movzx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* zero-extend byte */ + val = (uint8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = memread(vm, vcpuid, gpa, &val, 2, arg); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xBE: + /* + * MOV and sign extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F BE/r movsx r16, r/m8 + * 0F BE/r movsx r32, r/m8 + * REX.W + 0F BE/r movsx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* sign extend byte */ + val = (uint64_t) ((int64_t) ((int8_t) val)); + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + default: + break; + } + return (error); +} + +/* + * Helper function to calculate and validate a linear address. 
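+ * The address is computed by vie_calculate_gla() from the segment
+ * descriptor and the register operand, then canonical- and
+ * alignment-checked; any failure injects #SS, #GP or #AC into the guest
+ * and reports a guest fault instead of an emulation error.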
+ */ +static int +get_gla(void *vm, int vcpuid, UNUSED struct vie *vie, + struct vm_guest_paging *paging, int opsize, int addrsize, int prot, + enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault) +{ + struct seg_desc desc; + uint64_t cr0, val, rflags; + int error; + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vm_get_seg_desc(vm, vcpuid, seg, &desc); + KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", + __func__, error, seg)); + + error = vie_read_register(vm, vcpuid, gpr, &val); + KASSERT(error == 0, ("%s: error %d getting register %d", __func__, + error, gpr)); + + if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, + addrsize, prot, gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + goto guest_fault; + } + + if (vie_canonical_check(paging->cpu_mode, *gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + goto guest_fault; + } + + if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { + vm_inject_ac(vm, vcpuid, 0); + goto guest_fault; + } + + *fault = 0; + return (0); + +guest_fault: + *fault = 1; + return (0); +} + +static int +emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + struct vm_copyinfo copyinfo[2]; + uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; + uint64_t rcx, rdi, rsi, rflags; + int error, fault, opsize, seg, repeat; + + opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; + val = 0; + rcx = 0; + error = 0; + + /* + * XXX although the MOVS instruction is only supposed to be used with + * the "rep" prefix some guests like FreeBSD will use "repnz" instead. + * + * Empirically the "repnz" prefix has identical behavior to "rep" + * and the zero flag does not make a difference. + */ + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) { + error = 0; + goto done; + } + } + + /* + * Source Destination Comments + * -------------------------------------------- + * (1) memory memory n/a + * (2) memory mmio emulated + * (3) mmio memory emulated + * (4) mmio mmio emulated + * + * At this point we don't have sufficient information to distinguish + * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this + * out because it will succeed only when operating on regular memory. + * + * XXX the emulation doesn't properly handle the case where 'gpa' + * is straddling the boundary between the normal memory and MMIO. + */ + + seg = vie->segment_override ? 
vie->segment_register : VM_REG_GUEST_DS; + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + XHYVE_PROT_READ, ((enum vm_reg_name) seg), VM_REG_GUEST_RSI, &srcaddr, + &fault); + if (error || fault) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, srcaddr, ((size_t) opsize), + XHYVE_PROT_READ, copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (2): read from system memory and write to mmio. + */ + vm_copyin(vm, vcpuid, copyinfo, &val, ((size_t) opsize)); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + goto done; + } else { + /* + * 'vm_copy_setup()' is expected to fail for cases (3) and (4) + * if 'srcaddr' is in the mmio space. + */ + + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + XHYVE_PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, + &fault); + if (error || fault) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, dstaddr, ((size_t) opsize), + XHYVE_PROT_WRITE, copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we + * commit to it only after vm_copy_setup() is + * successful. If a page-fault needs to be + * injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = memread(vm, vcpuid, gpa, &val, opsize, arg); + if (error) + goto done; + + vm_copyout(vm, vcpuid, &val, copyinfo, ((size_t) opsize)); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + } else { + /* + * Case (4): read from and write to mmio. + * + * Commit to the MMIO read/write (with potential + * side-effects) only after we are sure that the + * instruction is not going to be restarted due + * to address translation faults. + */ + error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, + XHYVE_PROT_READ, &srcgpa, &fault); + if (error || fault) + goto done; + + error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, + XHYVE_PROT_WRITE, &dstgpa, &fault); + if (error || fault) + goto done; + + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); + if (error) + goto done; + + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); + if (error) + goto done; + } + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); + KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) { + rsi -= ((uint64_t) opsize); + rdi -= ((uint64_t) opsize); + } else { + rsi += ((uint64_t) opsize); + rdi += ((uint64_t) opsize); + } + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. 
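+ * e.g. a 'rep movsb' with %rcx == 3 is emulated one byte per exit,
+ * decrementing %rcx and restarting until the count reaches zero.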
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } +done: + KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", + __func__, error)); + return (error); +} + +static int +emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + UNUSED struct vm_guest_paging *paging, UNUSED mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + rcx = 0; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= ((uint64_t) opsize); + else + rdi += ((uint64_t) opsize); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } + + return (0); +} + +static int +emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + result = 0; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 23/r and r16, r/m16 + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 & val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * AND mem (ModRM:r/m) with immediate and store the + * result in mem. 
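+ * (e.g. 'and dword ptr [mem], 0' stores 0 and, per the flag logic below,
+ * sets ZF and PF while clearing SF, CF and OF.)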
+ * + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 + * + * 83 /4 and r/m16, imm8 sign-extended to 16 + * 83 /4 and r/m32, imm8 sign-extended to 32 + * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 & ((uint64_t) vie->immediate); + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~((uint64_t) RFLAGS_STATUS_BITS); + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t val1, result, rflags, rflags2; + + size = vie->opsize; + result = 0; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x81: + case 0x83: + /* + * OR mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /1 or r/m16, imm16 + * 81 /1 or r/m32, imm32 + * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 + * + * 83 /1 or r/m16, imm8 sign-extended to 16 + * 83 /1 or r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 | ((uint64_t) vie->immediate); + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~((uint64_t) RFLAGS_STATUS_BITS); + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, UNUSED mem_region_write_t memwrite, + void *arg) +{ + int error, size; + uint64_t op1, op2, rflags, rflags2; + enum vm_reg_name reg; + + size = vie->opsize; + switch (vie->op.op_byte) { + case 0x3B: + /* + * 3B/r CMP r16, r/m16 + * 3B/r CMP r32, r/m32 + * REX.W + 3B/r CMP r64, r/m64 + * + * Compare first operand (reg) with second operand (r/m) and + * set status flags in EFLAGS register. The comparison is + * performed by subtracting the second operand from the first + * operand and then setting the status flags. 
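+ * e.g. 'cmp %eax, dword ptr [mem]' with equal operands subtracts to 0,
+ * setting ZF; CMP updates only %rflags and stores no result.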
+ */ + + /* Get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &op1); + if (error) + return (error); + + /* Get the second operand */ + error = memread(vm, vcpuid, gpa, &op2, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, op2); + break; + case 0x80: + case 0x81: + case 0x83: + /* + * 80 /7 cmp r/m8, imm8 + * REX + 80 /7 cmp r/m8, imm8 + * + * 81 /7 cmp r/m16, imm16 + * 81 /7 cmp r/m32, imm32 + * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 + * + * 83 /7 cmp r/m16, imm8 sign-extended to 16 + * 83 /7 cmp r/m32, imm8 sign-extended to 32 + * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 + * + * Compare mem (ModRM:r/m) with immediate and set + * status flags according to the results. The + * comparison is performed by subtracting the + * immediate from the first operand and then setting + * the status flags. + * + */ + if (vie->op.op_byte == 0x80) + size = 1; + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &op1, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, ((uint64_t) vie->immediate)); + break; + default: + return (EINVAL); + } + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + rflags &= ~((uint64_t) RFLAGS_STATUS_BITS); + rflags |= rflags2 & RFLAGS_STATUS_BITS; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, UNUSED mem_region_write_t memwrite, + void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + val1 = 0; + val2 = 0; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x2B: + /* + * SUB r/m from r and store the result in r + * + * 2B/r SUB r16, r/m16 + * 2B/r SUB r32, r/m32 + * REX.W + 2B/r SUB r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 - val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getcc(size, val1, val2); + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~((uint64_t) RFLAGS_STATUS_BITS); + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int +emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + struct vm_copyinfo copyinfo[2]; + struct seg_desc ss_desc; + uint64_t cr0, rflags, rsp, stack_gla, val; + int error, fault, size, stackaddrsize, pushop; + + val = 0; + size = vie->opsize; + pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; + + /* + * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 + */ + if (paging->cpu_mode == CPU_MODE_REAL) { + stackaddrsize = 2; + } else if (paging->cpu_mode == CPU_MODE_64BIT) { + /* + * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 + * - Stack pointer size is always 64-bits. + * - PUSH/POP of 32-bit values is not possible in 64-bit mode. 
+ * - 16-bit PUSH/POP is supported by using the operand size + * override prefix (66H). + */ + stackaddrsize = 8; + size = vie->opsize_override ? 2 : 8; + } else { + /* + * In protected or compability mode the 'B' flag in the + * stack-segment descriptor determines the size of the + * stack pointer. + */ + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); + KASSERT(error == 0, ("%s: error %d getting SS descriptor", + __func__, error)); + if (SEG_DESC_DEF32(ss_desc.access)) + stackaddrsize = 4; + else + stackaddrsize = 2; + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); + KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + if (pushop) { + rsp -= ((uint64_t) size); + } + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, + rsp, size, stackaddrsize, pushop ? XHYVE_PROT_WRITE : XHYVE_PROT_READ, + &stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_canonical_check(paging->cpu_mode, stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (0); + } + + error = vm_copy_setup(vm, vcpuid, paging, stack_gla, ((size_t) size), + pushop ? XHYVE_PROT_WRITE : XHYVE_PROT_READ, copyinfo, nitems(copyinfo), + &fault); + if (error || fault) + return (error); + + if (pushop) { + error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); + if (error == 0) + vm_copyout(vm, vcpuid, &val, copyinfo, ((size_t) size)); + } else { + vm_copyin(vm, vcpuid, copyinfo, &val, ((size_t) size)); + error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); + rsp += ((uint64_t) size); + } + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + + if (error == 0) { + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, + stackaddrsize); + KASSERT(error == 0, ("error %d updating rsp", error)); + } + return (error); +} + +static int +emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * PUSH is part of the group 5 extended opcodes and is identified + * by ModRM:reg = b110. + */ + if ((vie->reg & 7) != 6) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * POP is part of the group 1A extended opcodes and is identified + * by ModRM:reg = b000. 
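+ * (The other /reg encodings of opcode 8F are reserved, hence the
+ * EINVAL check below.)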
+ */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + UNUSED struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + switch (vie->reg & 7) { + case 0x1: /* OR */ + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x4: /* AND */ + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x7: /* CMP */ + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static int +emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, UNUSED mem_region_write_t memwrite, void *memarg) +{ + uint64_t val, rflags; + int error, bitmask, bitoff; + + /* + * 0F BA is a Group 8 extended opcode. + * + * Currently we only emulate the 'Bit Test' instruction which is + * identified by a ModR/M:reg encoding of 100b. + */ + if ((vie->reg & 7) != 4) + return (EINVAL); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); + if (error) + return (error); + + /* + * Intel SDM, Vol 2, Table 3-2: + * "Range of Bit Positions Specified by Bit Offset Operands" + */ + bitmask = vie->opsize * 8 - 1; + bitoff = vie->immediate & bitmask; + + /* Copy the bit into the Carry flag in %rflags */ + if (val & (1UL << bitoff)) + rflags |= PSL_C; + else + rflags &= ~((uint64_t) PSL_C); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); + + return (0); +} + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + if (!vie->decoded) + return (EINVAL); + + switch (vie->op.op_type) { + case VIE_OP_TYPE_GROUP1: + error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_POP: + error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_PUSH: + error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_CMP: + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOV: + error = emulate_mov(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVSX: + case VIE_OP_TYPE_MOVZX: + error = emulate_movx(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVS: + error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_STOS: + error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_AND: + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_OR: + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_SUB: + error = emulate_sub(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_BITTEST: + error = emulate_bittest(vm, vcpuid, 
gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("%s: invalid size %d", __func__, size)); + KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); + + if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) + return (0); + + return ((gla & ((uint64_t) (size - 1))) ? 1 : 0); +} + +int +vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) +{ + uint64_t mask; + + if (cpu_mode != CPU_MODE_64BIT) + return (0); + + /* + * The value of the bit 47 in the 'gla' should be replicated in the + * most significant 16 bits. + */ + mask = ~((1UL << 48) - 1); + if (gla & (1UL << 47)) + return ((gla & mask) != mask); + else + return ((gla & mask) != 0); +} + +uint64_t +vie_size2mask(int size) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("vie_size2mask: invalid size %d", size)); + return (size2mask[size]); +} + +int +vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla) +{ + uint64_t firstoff, low_limit, high_limit, segbase; + int glasize, type; + + KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, + ("%s: invalid segment %d", __func__, seg)); + KASSERT(length == 1 || length == 2 || length == 4 || length == 8, + ("%s: invalid operand size %d", __func__, length)); + KASSERT((prot & ~(XHYVE_PROT_READ | XHYVE_PROT_WRITE)) == 0, + ("%s: invalid prot %#x", __func__, prot)); + + firstoff = offset; + if (cpu_mode == CPU_MODE_64BIT) { + KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " + "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); + glasize = 8; + } else { + KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " + "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); + glasize = 4; + /* + * If the segment selector is loaded with a NULL selector + * then the descriptor is unusable and attempting to use + * it results in a #GP(0). + */ + if (SEG_DESC_UNUSABLE(desc->access)) + return (-1); + + /* + * The processor generates a #NP exception when a segment + * register is loaded with a selector that points to a + * descriptor that is not present. If this was the case then + * it would have been checked before the VM-exit. + */ + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %#x", seg, desc->access)); + + /* + * The descriptor type must indicate a code/data segment. + */ + type = SEG_DESC_TYPE(desc->access); + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %#x", seg, type)); + + if (prot & XHYVE_PROT_READ) { + /* #GP on a read access to a exec-only code segment */ + if ((type & 0xA) == 0x8) + return (-1); + } + + if (prot & XHYVE_PROT_WRITE) { + /* + * #GP on a write access to a code segment or a + * read-only data segment. + */ + if (type & 0x8) /* code segment */ + return (-1); + + if ((type & 0xA) == 0) /* read-only data seg */ + return (-1); + } + + /* + * 'desc->limit' is fully expanded taking granularity into + * account. + */ + if ((type & 0xC) == 0x4) { + /* expand-down data segment */ + low_limit = desc->limit + 1; + high_limit = SEG_DESC_DEF32(desc->access) ? 
+ 0xffffffff : 0xffff; + } else { + /* code segment or expand-up data segment */ + low_limit = 0; + high_limit = desc->limit; + } + + while (length > 0) { + offset &= vie_size2mask(addrsize); + if (offset < low_limit || offset > high_limit) + return (-1); + offset++; + length--; + } + } + + /* + * In 64-bit mode all segments except %fs and %gs have a segment + * base address of 0. + */ + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + segbase = desc->base; + } + + /* + * Truncate 'firstoff' to the effective address size before adding + * it to the segment base. + */ + firstoff &= vie_size2mask(addrsize); + *gla = (segbase + firstoff) & vie_size2mask(glasize); + return (0); +} + +void +vie_init(struct vie *vie, const char *inst_bytes, int inst_length) +{ + KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, + ("%s: invalid instruction length (%d)", __func__, inst_length)); + + bzero(vie, sizeof(struct vie)); + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->segment_register = VM_REG_LAST; + + if (inst_length) { + bcopy(inst_bytes, vie->inst, ((size_t) inst_length)); + vie->num_valid = ((uint8_t) inst_length); + } +} + +static int +pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) +{ + int error_code = 0; + + if (pte & PG_V) + error_code |= PGEX_P; + if (prot & XHYVE_PROT_WRITE) + error_code |= PGEX_W; + if (usermode) + error_code |= PGEX_U; + if (rsvd) + error_code |= PGEX_RSV; + if (prot & XHYVE_PROT_EXECUTE) + error_code |= PGEX_I; + + return (error_code); +} + +int +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; + u_int retries; + uint64_t *ptpbase, ptpphys, pte, pgsize; + uint32_t *ptpbase32, pte32; + void *cookie; + + *guest_fault = 0; + + usermode = (paging->cpl == 3 ? 1 : 0); + writable = prot & XHYVE_PROT_WRITE; + cookie = NULL; + retval = 0; + retries = 0; + pte = 0; + pte32 = 0; + ptpshift = 0; + pgsize = 0; + ptpindex = 0; + ptpbase = NULL; + ptpbase32 = NULL; + +restart: + ptpphys = paging->cr3; /* root of the page tables */ + + if (vie_canonical_check(paging->cpu_mode, gla)) { + /* + * XXX assuming a non-stack reference otherwise a stack fault + * should be generated. + */ + vm_inject_gp(vm, vcpuid); + goto fault; + } + + if (paging->paging_mode == PAGING_MODE_FLAT) { + *gpa = gla; + goto done; + } + + if (paging->paging_mode == PAGING_MODE_32) { + nlevels = 2; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits. */ + ptpphys &= ~((uint64_t) 0xfff); + + ptpbase32 = vm_gpa2hva(vm, ptpphys, XHYVE_PAGE_SIZE); + + if (ptpbase32 == NULL) + goto error; + + ptpshift = XHYVE_PAGE_SHIFT + nlevels * 10; + ptpindex = (gla >> ptpshift) & 0x3FF; + pgsize = 1UL << ptpshift; + + pte32 = ptpbase32[ptpindex]; + + if ((pte32 & PG_V) == 0 || + (usermode && (pte32 & PG_U) == 0) || + (writable && (pte32 & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + /* + * Emulate the x86 MMU's management of the accessed + * and dirty flags. While the accessed flag is set + * at every level of the page table, the dirty flag + * is only set at the last level providing the guest + * physical address. 
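+ * Both updates below use atomic compare-and-set and restart the walk on
+ * failure, so concurrent guest updates to the page tables are not lost.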
+ */ + if ((pte32 & PG_A) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_A) == 0) { + goto restart; + } + } + + /* XXX must be ignored if CR4.PSE=0 */ + if (nlevels > 0 && (pte32 & PG_PS) != 0) + break; + + ptpphys = pte32; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte32 & PG_M) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_M) == 0) { + goto restart; + } + } + + /* Zero out the lower 'ptpshift' bits */ + pte32 >>= ptpshift; pte32 <<= ptpshift; + *gpa = pte32 | (gla & (pgsize - 1)); + goto done; + } + + if (paging->paging_mode == PAGING_MODE_PAE) { + /* Zero out the lower 5 bits and the upper 32 bits */ + ptpphys &= 0xffffffe0UL; + + ptpbase = vm_gpa2hva(vm, ptpphys, (sizeof(*ptpbase) * 4)); + if (ptpbase == NULL) + goto error; + + ptpindex = (gla >> 30) & 0x3; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + ptpphys = pte; + + nlevels = 2; + } else + nlevels = 4; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits and the upper 12 bits */ + ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; + + ptpbase = vm_gpa2hva(vm, ptpphys, XHYVE_PAGE_SIZE); + if (ptpbase == NULL) + goto error; + + ptpshift = XHYVE_PAGE_SHIFT + nlevels * 9; + ptpindex = (gla >> ptpshift) & 0x1FF; + pgsize = 1UL << ptpshift; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0 || + (usermode && (pte & PG_U) == 0) || + (writable && (pte & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + /* Set the accessed bit in the page table entry */ + if ((pte & PG_A) == 0) { + if (atomic_cmpset_64(((volatile u_long *) &ptpbase[ptpindex]), + pte, pte | PG_A) == 0) { + goto restart; + } + } + + if (nlevels > 0 && (pte & PG_PS) != 0) { + if (pgsize > 1 * GB) { + pfcode = pf_error_code(usermode, prot, 1, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + break; + } + + ptpphys = pte; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte & PG_M) == 0) { + if (atomic_cmpset_64(((volatile u_long *) &ptpbase[ptpindex]), pte, + pte | PG_M) == 0) + { + goto restart; + } + } + + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ + pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; + *gpa = pte | (gla & (pgsize - 1)); +done: + KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", + __func__, retval)); + return (retval); +error: + retval = EFAULT; + goto done; +fault: + *guest_fault = 1; + goto done; +} + +int +vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t rip, int inst_length, struct vie *vie, int *faultptr) +{ + struct vm_copyinfo copyinfo[2]; + int error, prot; + + if (inst_length > VIE_INST_SIZE) + xhyve_abort("vmm_fetch_instruction: invalid length %d\n", inst_length); + + prot = XHYVE_PROT_READ | XHYVE_PROT_EXECUTE; + error = vm_copy_setup(vm, vcpuid, paging, rip, ((size_t) inst_length), prot, + copyinfo, nitems(copyinfo), faultptr); + if (error || *faultptr) + return (error); + + vm_copyin(vm, vcpuid, copyinfo, vie->inst, ((size_t) inst_length)); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = (uint8_t) inst_length; + return (0); +} + +static int +vie_peek(struct vie *vie, uint8_t *x) +{ + + if (vie->num_processed < vie->num_valid) { + *x = vie->inst[vie->num_processed]; + 
return (0); + } else + return (-1); +} + +static void +vie_advance(struct vie *vie) +{ + + vie->num_processed++; +} + +static bool +segment_override(uint8_t x, int *seg) +{ + + switch (x) { + case 0x2E: + *seg = VM_REG_GUEST_CS; + break; + case 0x36: + *seg = VM_REG_GUEST_SS; + break; + case 0x3E: + *seg = VM_REG_GUEST_DS; + break; + case 0x26: + *seg = VM_REG_GUEST_ES; + break; + case 0x64: + *seg = VM_REG_GUEST_FS; + break; + case 0x65: + *seg = VM_REG_GUEST_GS; + break; + default: + return (false); + } + return (true); +} + +static int +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) +{ + uint8_t x; + + while (1) { + if (vie_peek(vie, &x)) + return (-1); + + if (x == 0x66) + vie->opsize_override = 1; + else if (x == 0x67) + vie->addrsize_override = 1; + else if (x == 0xF3) + vie->repz_present = 1; + else if (x == 0xF2) + vie->repnz_present = 1; + else if (segment_override(x, &vie->segment_register)) + vie->segment_override = 1; + else + break; + + vie_advance(vie); + } + + /* + * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: + * - Only one REX prefix is allowed per instruction. + * - The REX prefix must immediately precede the opcode byte or the + * escape opcode byte. + * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) + * the mandatory prefix must come before the REX prefix. + */ + if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; + vie->rex_w = x & 0x8 ? 1 : 0; + vie->rex_r = x & 0x4 ? 1 : 0; + vie->rex_x = x & 0x2 ? 1 : 0; + vie->rex_b = x & 0x1 ? 1 : 0; + vie_advance(vie); + } + + /* + * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 + */ + if (cpu_mode == CPU_MODE_64BIT) { + /* + * Default address size is 64-bits and default operand size + * is 32-bits. + */ + vie->addrsize = vie->addrsize_override ? 4 : 8; + if (vie->rex_w) + vie->opsize = 8; + else if (vie->opsize_override) + vie->opsize = 2; + else + vie->opsize = 4; + } else if (cs_d) { + /* Default address and operand sizes are 32-bits */ + vie->addrsize = vie->addrsize_override ? 2 : 4; + vie->opsize = vie->opsize_override ? 2 : 4; + } else { + /* Default address and operand sizes are 16-bits */ + vie->addrsize = vie->addrsize_override ? 4 : 2; + vie->opsize = vie->opsize_override ? 4 : 2; + } + return (0); +} + +static int +decode_two_byte_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = two_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + return (0); +} + +static int +decode_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = one_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + + if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) + return (decode_two_byte_opcode(vie)); + + return (0); +} + +static int +decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) +{ + uint8_t x; + + if (vie->op.op_flags & VIE_OP_F_NO_MODRM) + return (0); + + if (cpu_mode == CPU_MODE_REAL) + return (-1); + + if (vie_peek(vie, &x)) + return (-1); + + vie->mod = (x >> 6) & 0x3; + vie->rm = (x >> 0) & 0x7; + vie->reg = (x >> 3) & 0x7; + + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. 
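+ * (mod == 3 encodes a register-only operand, which cannot generate a
+ * memory access.)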
+ */ + if (vie->mod == VIE_MOD_DIRECT) + return (-1); + + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || + (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. + */ + } else { + vie->rm |= (vie->rex_b << 3); + } + + vie->reg |= (vie->rex_r << 3); + + /* SIB */ + if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) + goto done; + + vie->base_register = gpr_map[vie->rm]; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + case VIE_MOD_INDIRECT: + if (vie->rm == VIE_RM_DISP32) { + vie->disp_bytes = 4; + /* + * Table 2-7. RIP-Relative Addressing + * + * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 + * whereas in compatibility mode it just implies disp32. + */ + + if (cpu_mode == CPU_MODE_64BIT) + vie->base_register = VM_REG_GUEST_RIP; + else + vie->base_register = VM_REG_LAST; + } + break; + } + +done: + vie_advance(vie); + + return (0); +} + +static int +decode_sib(struct vie *vie) +{ + uint8_t x; + + /* Proceed only if SIB byte is present */ + if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + /* De-construct the SIB byte */ + vie->ss = (x >> 6) & 0x3; + vie->index = (x >> 3) & 0x7; + vie->base = (x >> 0) & 0x7; + + /* Apply the REX prefix modifiers */ + vie->index |= vie->rex_x << 3; + vie->base |= vie->rex_b << 3; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + } + + if (vie->mod == VIE_MOD_INDIRECT && + (vie->base == 5 || vie->base == 13)) { + /* + * Special case when base register is unused if mod = 0 + * and base = %rbp or %r13. + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + vie->disp_bytes = 4; + } else { + vie->base_register = gpr_map[vie->base]; + } + + /* + * All encodings of 'index' are valid except for %rsp (4). 
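+ * (An index field of 4 encodes 'no index register', so %rsp can never be
+ * used as a scaled index.)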
+ * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + if (vie->index != 4) + vie->index_register = gpr_map[vie->index]; + + /* 'scale' makes sense only in the context of an index register */ + if (vie->index_register < VM_REG_LAST) + vie->scale = (uint8_t) (1 << vie->ss); + + vie_advance(vie); + + return (0); +} + +static int +decode_displacement(struct vie *vie) +{ + int n, i; + uint8_t x; + + union { + char buf[4]; + int8_t signed8; + int32_t signed32; + } u; + + if ((n = vie->disp_bytes) == 0) + return (0); + + if (n != 1 && n != 4) + xhyve_abort("decode_displacement: invalid disp_bytes %d\n", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = (char) x; + vie_advance(vie); + } + + if (n == 1) + vie->displacement = u.signed8; /* sign-extended */ + else + vie->displacement = u.signed32; /* sign-extended */ + + return (0); +} + +static int +decode_immediate(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[4]; + int8_t signed8; + int16_t signed16; + int32_t signed32; + } u; + + /* Figure out immediate operand size (if any) */ + if (vie->op.op_flags & VIE_OP_F_IMM) { + /* + * Section 2.2.1.5 "Immediates", Intel SDM: + * In 64-bit mode the typical size of immediate operands + * remains 32-bits. When the operand size if 64-bits, the + * processor sign-extends all immediates to 64-bits prior + * to their use. + */ + if (vie->opsize == 4 || vie->opsize == 8) + vie->imm_bytes = 4; + else + vie->imm_bytes = 2; + } else if (vie->op.op_flags & VIE_OP_F_IMM8) { + vie->imm_bytes = 1; + } + + if ((n = vie->imm_bytes) == 0) + return (0); + + KASSERT(n == 1 || n == 2 || n == 4, + ("%s: invalid number of immediate bytes: %d", __func__, n)); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = (char) x; + vie_advance(vie); + } + + /* sign-extend the immediate value before use */ + if (n == 1) + vie->immediate = u.signed8; + else if (n == 2) + vie->immediate = u.signed16; + else + vie->immediate = u.signed32; + + return (0); +} + +static int +decode_moffset(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[8]; + uint64_t u64; + } u; + + if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) + return (0); + + /* + * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: + * The memory offset size follows the address-size of the instruction. + */ + n = vie->addrsize; + KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + + u.u64 = 0; + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = (char) x; + vie_advance(vie); + } + vie->displacement = (int64_t) u.u64; + return (0); +} + +/* + * Verify that all the bytes in the instruction buffer were consumed. + */ +static int +verify_inst_length(struct vie *vie) +{ + + if (vie->num_processed) + return (0); + else + return (-1); +} + +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. 
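+ * i.e. recompute base + scale * index + displacement from the decoded
+ * fields and compare it with the address reported by the hardware.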
+ */ +static int +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +{ + int error; + uint64_t base, idx, gla2; + + /* Skip 'gla' verification */ + if (gla == VIE_INVALID_GLA) + return (0); + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + + /* + * RIP-relative addressing starts from the following + * instruction + */ + if (vie->base_register == VM_REG_GUEST_RIP) + base += vie->num_valid; + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + /* XXX assuming that the base address of the segment is 0 */ + gla2 = base + vie->scale * idx + ((uint64_t) vie->displacement); + gla2 &= size2mask[vie->addrsize]; + if (gla != gla2) { + printf("verify_gla mismatch: " + "base(0x%0llx), scale(%d), index(0x%0llx), " + "disp(0x%0llx), gla(0x%0llx), gla2(0x%0llx)\n", + base, vie->scale, idx, vie->displacement, gla, gla2); + return (-1); + } + + return (0); +} + +int +vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) +{ + + if (decode_prefixes(vie, cpu_mode, cs_d)) + return (-1); + + if (decode_opcode(vie)) + return (-1); + + if (decode_modrm(vie, cpu_mode)) + return (-1); + + if (decode_sib(vie)) + return (-1); + + if (decode_displacement(vie)) + return (-1); + + if (decode_immediate(vie)) + return (-1); + + if (decode_moffset(vie)) + return (-1); + + if (verify_inst_length(vie)) + return (-1); + + if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { + if (verify_gla(vm, cpuid, gla, vie)) + return (-1); + } + + vie->decoded = 1; /* success */ + + return (0); +} diff --git a/vendor/github.com/hooklift/xhyve/vmm_ioport.c b/vendor/github.com/hooklift/xhyve/vmm_ioport.c new file mode 100644 index 0000000..b935241 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vmm_ioport.c @@ -0,0 +1,176 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define MAX_IOPORTS 1280
+
+static const ioport_handler_func_t ioport_handler[MAX_IOPORTS] = {
+	[TIMER_MODE] = vatpit_handler,
+	[TIMER_CNTR0] = vatpit_handler,
+	[TIMER_CNTR1] = vatpit_handler,
+	[TIMER_CNTR2] = vatpit_handler,
+	[NMISC_PORT] = vatpit_nmisc_handler,
+	[IO_ICU1] = vatpic_master_handler,
+	[IO_ICU1 + ICU_IMR_OFFSET] = vatpic_master_handler,
+	[IO_ICU2] = vatpic_slave_handler,
+	[IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler,
+	[IO_ELCR1] = vatpic_elc_handler,
+	[IO_ELCR2] = vatpic_elc_handler,
+	[IO_PMTMR] = vpmtmr_handler,
+	[IO_RTC] = vrtc_addr_handler,
+	[IO_RTC + 1] = vrtc_data_handler,
+};
+
+#ifdef XHYVE_CONFIG_TRACE
+static const char *
+inout_instruction(struct vm_exit *vmexit)
+{
+	int index;
+
+	static const char *iodesc[] = {
+		"outb", "outw", "outl",
+		"inb", "inw", "inl",
+		"outsb", "outsw", "outsd",
+		"insb", "insw", "insd",
+	};
+
+	switch (vmexit->u.inout.bytes) {
+	case 1:
+		index = 0;
+		break;
+	case 2:
+		index = 1;
+		break;
+	default:
+		index = 2;
+		break;
+	}
+
+	if (vmexit->u.inout.in)
+		index += 3;
+
+	if (vmexit->u.inout.string)
+		index += 6;
+
+	KASSERT(((unsigned) index) < nitems(iodesc), ("%s: invalid index %d",
+	    __func__, index));
+
+	return (iodesc[index]);
+}
+#endif /* XHYVE_CONFIG_TRACE */
+
+static int
+emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit,
+	bool *retu)
+{
+	ioport_handler_func_t handler;
+	uint32_t mask, val;
+	int error;
+
+	/*
+	 * If there is no handler for the I/O port then punt to userspace.
+	 */
+	if (vmexit->u.inout.port >= MAX_IOPORTS ||
+	    (handler = ioport_handler[vmexit->u.inout.port]) == NULL) {
+		*retu = true;
+		return (0);
+	}
+
+	mask = (uint32_t) vie_size2mask(vmexit->u.inout.bytes);
+
+	if (!vmexit->u.inout.in) {
+		val = vmexit->u.inout.eax & mask;
+	}
+
+	error = (*handler)(vm, vcpuid, vmexit->u.inout.in,
+	    vmexit->u.inout.port, vmexit->u.inout.bytes, &val);
+	if (error) {
+		/*
+		 * The value returned by this function is also the return value
+		 * of vm_run(). This needs to be a positive number, otherwise it
+		 * can be interpreted as a "pseudo-error" like ERESTART.
+		 *
+		 * Enforce this by mapping all errors to EIO.
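+		 * (EIO is a positive errno value, so it can never be mistaken
+		 * for one of those pseudo-errors.)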
+ */ + return (EIO); + } + + if (vmexit->u.inout.in) { + vmexit->u.inout.eax &= ~mask; + vmexit->u.inout.eax |= val & mask; + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, + vmexit->u.inout.eax); + KASSERT(error == 0, ("emulate_ioport: error %d setting guest " + "rax register", error)); + } + *retu = false; + return (0); +} + +static int +emulate_inout_str(bool *retu) +{ + *retu = true; + return (0); /* Return to userspace to finish emulation */ +} + +int +vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + int bytes, error; + + bytes = vmexit->u.inout.bytes; + KASSERT(bytes == 1 || bytes == 2 || bytes == 4, + ("vm_handle_inout: invalid operand size %d", bytes)); + + if (vmexit->u.inout.string) + error = emulate_inout_str(retu); + else + error = emulate_inout_port(vm, vcpuid, vmexit, retu); + +#ifdef XHYVE_CONFIG_TRACE + VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s", + vmexit->u.inout.rep ? "rep " : "", + inout_instruction(vmexit), + vmexit->u.inout.port, + error ? "error" : (*retu ? "userspace" : "handled")); +#endif + + return (error); +} diff --git a/vendor/github.com/hooklift/xhyve/vmm_lapic.c b/vendor/github.com/hooklift/xhyve/vmm_lapic.c new file mode 100644 index 0000000..7f584db --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vmm_lapic.c @@ -0,0 +1,244 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Some MSI message definitions + */ +#define MSI_X86_ADDR_MASK 0xfff00000 +#define MSI_X86_ADDR_BASE 0xfee00000 +#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ +#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ + +int +lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= VM_MAXCPU) + return (EINVAL); + + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local APIC. 
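+	 * Vectors below 16 are an illegal vector range for APIC delivery,
+	 * hence the range check below.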
+ */ + if (vector < 16 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + if (vlapic_set_intr_ready(vlapic, vector, level)) + vcpu_notify_event(vm, cpu, true); + return (0); +} + +int +lapic_set_local_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + cpuset_t dmask; + int error; + + if (cpu < -1 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (cpu == -1) + dmask = vm_active_cpus(vm); + else + CPU_SETOF(((unsigned) cpu), &dmask); + error = 0; + while ((cpu = CPU_FFS(&dmask)) != 0) { + cpu--; + CPU_CLR(((unsigned) cpu), &dmask); + vlapic = vm_lapic(vm, cpu); + error = vlapic_trigger_lvt(vlapic, vector); + if (error) + break; + } + + return (error); +} + +int +lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) +{ + int delmode, vec; + uint32_t dest; + bool phys; + + VM_CTR2(vm, "lapic MSI addr: %#llx msg: %#llx", addr, msg); + + if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { + VM_CTR1(vm, "lapic MSI invalid addr %#llx", addr); + return (-1); + } + + /* + * Extract the x86-specific fields from the MSI addr/msg + * params according to the Intel Arch spec, Vol3 Ch 10. + * + * The PCI specification does not support level triggered + * MSI/MSI-X so ignore trigger level in 'msg'. + * + * The 'dest' is interpreted as a logical APIC ID if both + * the Redirection Hint and Destination Mode are '1' and + * physical otherwise. + */ + dest = (addr >> 12) & 0xff; + phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != + (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); + delmode = msg & APIC_DELMODE_MASK; + vec = msg & 0xff; + + VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", + phys ? "physical" : "logical", dest, vec); + + vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); + return (0); +} + +static bool +x2apic_msr(u_int msr) +{ + if (msr >= 0x800 && msr <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +bool +lapic_msr(u_int msr) +{ + + if (x2apic_msr(msr) || (msr == MSR_APICBASE)) + return (TRUE); + else + return (FALSE); +} + +int +lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + *rval = vlapic_get_apicbase(vlapic); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_read(vlapic, 0, offset, rval, retu); + } + + return (error); +} + +int +lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + error = vlapic_set_apicbase(vlapic, val); + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_write(vlapic, 0, offset, val, retu); + } + + return (error); +} + +int +lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; +//printf("lapic_mmio_write 0x%016llx 0x%016llx\n", gpa, wval); + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. 
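+	 * For example, a 4-byte write to offset 0xb0 (the EOI register) is
+	 * accepted, while one to offset 0xb2 fails the alignment check.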
+	 */
+	if (size != 4 || off & 0xf)
+		return (EINVAL);
+
+	vlapic = vm_lapic(vm, cpu);
+	error = vlapic_write(vlapic, 1, off, wval, arg);
+	return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval,
+	UNUSED int size, void *arg)
+{
+	int error;
+	uint64_t off;
+	struct vlapic *vlapic;
+
+	off = gpa - DEFAULT_APIC_BASE;
+
+	/*
+	 * Memory mapped local apic accesses should be aligned on a
+	 * 16-byte boundary. They are also suggested to be 4 bytes
+	 * wide, but alas not all OSes follow that suggestion.
+	 */
+	off &= ~((uint64_t) 3);
+	if (off & 0xf)
+		return (EINVAL);
+
+	vlapic = vm_lapic(vm, cpu);
+	error = vlapic_read(vlapic, 1, off, rval, arg);
+	//printf("lapic_mmio_read 0x%016llx (0x%016llx)\n", gpa, *rval);
+	return (error);
+}
diff --git a/vendor/github.com/hooklift/xhyve/vmm_mem.c b/vendor/github.com/hooklift/xhyve/vmm_mem.c
new file mode 100644
index 0000000..ccd80a6
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/vmm_mem.c
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * Copyright (c) 2015 xhyve developers
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+int
+vmm_mem_init(void)
+{
+	return (0);
+}
+
+
+void *
+vmm_mem_alloc(uint64_t gpa, size_t size)
+{
+	void *object;
+
+	object = valloc(size);
+
+	if (!object) {
+		xhyve_abort("vmm_mem_alloc failed\n");
+	}
+
+	if (hv_vm_map(object, gpa, size,
+		HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC))
+	{
+		xhyve_abort("hv_vm_map failed\n");
+	}
+
+	return object;
+}
+
+void
+vmm_mem_free(uint64_t gpa, size_t size, void *object)
+{
+	hv_vm_unmap(gpa, size);
+	free(object);
+}
diff --git a/vendor/github.com/hooklift/xhyve/vmm_stat.c b/vendor/github.com/hooklift/xhyve/vmm_stat.c
new file mode 100644
index 0000000..bb52097
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/vmm_stat.c
@@ -0,0 +1,156 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * Copyright (c) 2015 xhyve developers
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * 'vst_num_elems' is the total number of addressable statistic elements
+ * 'vst_num_types' is the number of unique statistic types
+ *
+ * It is always true that 'vst_num_elems' is greater than or equal to
+ * 'vst_num_types'. This is because a stat type may represent more than
+ * one element (e.g. VMM_STAT_ARRAY).
+ */
+static int vst_num_elems, vst_num_types;
+static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS];
+
+#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t))
+
+void
+vmm_stat_register(void *arg)
+{
+	struct vmm_stat_type *vst = arg;
+
+	/* We require all stats to identify themselves with a description */
+	if (vst->desc == NULL)
+		return;
+
+	if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) {
+		printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc);
+		return;
+	}
+
+	vst->index = vst_num_elems;
+	vst_num_elems += vst->nelems;
+
+	vsttab[vst_num_types++] = vst;
+}
+
+int
+vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
+{
+	struct vmm_stat_type *vst;
+	uint64_t *stats;
+	int i;
+
+	if (vcpu < 0 || vcpu >= VM_MAXCPU)
+		return (EINVAL);
+
+	/* Let stats functions update their counters */
+	for (i = 0; i < vst_num_types; i++) {
+		vst = vsttab[i];
+		if (vst->func != NULL)
+			(*vst->func)(vm, vcpu, vst);
+	}
+
+	/* Copy over the stats */
+	stats = vcpu_stats(vm, vcpu);
+	for (i = 0; i < vst_num_elems; i++)
+		buf[i] = stats[i];
+	*num_stats = vst_num_elems;
+	return (0);
+}
+
+void *
+vmm_stat_alloc(void)
+{
+
+	return (malloc(vst_size));
+}
+
+void
+vmm_stat_init(void *vp)
+{
+
+	bzero(vp, vst_size);
+}
+
+void
+vmm_stat_free(void *vp)
+{
+	free(vp);
+}
+
+int
+vmm_stat_desc_copy(int index, char *buf, size_t bufsize)
+{
+	int i;
+	struct vmm_stat_type *vst;
+
+	for (i = 0; i < vst_num_types; i++) {
+		vst = vsttab[i];
+		if (index >= vst->index && index < vst->index + vst->nelems) {
+			if (vst->nelems > 1) {
+				snprintf(buf, bufsize, "%s[%d]",
+				    vst->desc, index - vst->index);
+			} else {
+				strlcpy(buf, vst->desc, bufsize);
+			}
+			return (0);	/* found it */
+		}
+	}
+
+	return (EINVAL);
+}
+
+/* global statistics */
+VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
+VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt");
+VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted");
+VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); diff --git a/vendor/github.com/hooklift/xhyve/vmm_util.c b/vendor/github.com/hooklift/xhyve/vmm_util.c new file mode 100644 index 0000000..73aa849 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vmm_util.c @@ -0,0 +1,99 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include + +struct trapframe { + register_t tf_rdi; + register_t tf_rsi; + register_t tf_rdx; + register_t tf_rcx; + register_t tf_r8; + register_t tf_r9; + register_t tf_rax; + register_t tf_rbx; + register_t tf_rbp; + register_t tf_r10; + register_t tf_r11; + register_t tf_r12; + register_t tf_r13; + register_t tf_r14; + register_t tf_r15; + uint32_t tf_trapno; + uint16_t tf_fs; + uint16_t tf_gs; + register_t tf_addr; + uint32_t tf_flags; + uint16_t tf_es; + uint16_t tf_ds; + /* below portion defined in hardware */ + register_t tf_err; + register_t tf_rip; + register_t tf_cs; + register_t tf_rflags; + register_t tf_rsp; + register_t tf_ss; +}; + +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} diff --git a/vendor/github.com/hooklift/xhyve/vmx.c b/vendor/github.com/hooklift/xhyve/vmx.c new file mode 100644 index 0000000..f4f2bb0 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vmx.c @@ -0,0 +1,2752 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING | \ + PROCBASED_HLT_EXITING | \ + PROCBASED_TSC_OFFSET) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS | \ + PROCBASED_RDTSC_EXITING | \ + PROCBASED_USE_TPR_SHADOW | \ + PROCBASED_MOV_DR_EXITING | \ + PROCBASED_MTF | \ + PROCBASED_INVLPG_EXITING | \ + PROCBASED_PAUSE_EXITING) +#define PROCBASED_CTLS2_ONE_SETTING \ + (PROCBASED2_ENABLE_EPT | \ + PROCBASED2_UNRESTRICTED_GUEST | \ + PROCBASED2_ENABLE_VPID | \ + PROCBASED2_ENABLE_RDTSCP) +#define PROCBASED_CTLS2_ZERO_SETTING \ + (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | \ + PROCBASED2_DESC_TABLE_EXITING | \ + PROCBASED2_WBINVD_EXITING | \ + PROCBASED2_PAUSE_LOOP_EXITING /* FIXME */ | \ + PROCBASED2_RDRAND_EXITING | \ + PROCBASED2_ENABLE_INVPCID /* FIXME */ | \ + PROCBASED2_RDSEED_EXITING) +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING \ + (PINBASED_PREMPTION_TIMER) +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_EFER) +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR | \ + VM_ENTRY_GUEST_LMA) +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_HOST_LMA | \ + VM_EXIT_LOAD_EFER) +#define VM_EXIT_CTLS_ZERO_SETTING \ + (VM_EXIT_SAVE_PREEMPTION_TIMER) +#define NMI_BLOCKING \ + (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) +#define HWINTR_BLOCKING \ + (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) + +#define HANDLED 1 +#define UNHANDLED 0 + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; +static uint64_t cr0_ones_mask, cr0_zeros_mask; +static uint64_t cr4_ones_mask, cr4_zeros_mask; + +/* + * Optional capabilities + */ + +static int cap_halt_exit; +static int cap_pause_exit; +// static int cap_unrestricted_guest; +static int cap_monitor_trap; +// static int cap_invpcid; +// static int pirvec = -1; +// static struct unrhdr *vpid_unr; +// static u_int vpid_alloc_failed; + +/* + * Use the last page below 4GB as the APIC access address. This address is + * occupied by the boot firmware so it is guaranteed that it will not conflict + * with a page in system memory. 
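+ * (The definition below is left disabled in this port.)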
+ */ +// #define APIC_ACCESS_ADDRESS 0xFFFFF000 + +static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); +static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); + +static __inline uint64_t +reg_read(int vcpuid, hv_x86_reg_t reg) { + uint64_t val; + + hv_vcpu_read_register(((hv_vcpuid_t) vcpuid), reg, &val); + return val; +} + +static __inline void +reg_write(int vcpuid, hv_x86_reg_t reg, uint64_t val) { + hv_vcpu_write_register(((hv_vcpuid_t) vcpuid), reg, val); +} + +static void hvdump(int vcpu) { + printf("VMCS_PIN_BASED_CTLS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_PIN_BASED_CTLS)); + printf("VMCS_PRI_PROC_BASED_CTLS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_PRI_PROC_BASED_CTLS)); + printf("VMCS_SEC_PROC_BASED_CTLS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_SEC_PROC_BASED_CTLS)); + printf("VMCS_ENTRY_CTLS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_ENTRY_CTLS)); + printf("VMCS_EXCEPTION_BITMAP: 0x%016llx\n", + vmcs_read(vcpu, VMCS_EXCEPTION_BITMAP)); + printf("VMCS_CR0_MASK: 0x%016llx\n", + vmcs_read(vcpu, VMCS_CR0_MASK)); + printf("VMCS_CR0_SHADOW: 0x%016llx\n", + vmcs_read(vcpu, VMCS_CR0_SHADOW)); + printf("VMCS_CR4_MASK: 0x%016llx\n", + vmcs_read(vcpu, VMCS_CR4_MASK)); + printf("VMCS_CR4_SHADOW: 0x%016llx\n", + vmcs_read(vcpu, VMCS_CR4_SHADOW)); + printf("VMCS_GUEST_CS_SELECTOR: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_CS_SELECTOR)); + printf("VMCS_GUEST_CS_LIMIT: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_CS_LIMIT)); + printf("VMCS_GUEST_CS_ACCESS_RIGHTS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_CS_ACCESS_RIGHTS)); + printf("VMCS_GUEST_CS_BASE: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_CS_BASE)); + printf("VMCS_GUEST_DS_SELECTOR: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_DS_SELECTOR)); + printf("VMCS_GUEST_DS_LIMIT: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_DS_LIMIT)); + printf("VMCS_GUEST_DS_ACCESS_RIGHTS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_DS_ACCESS_RIGHTS)); + printf("VMCS_GUEST_DS_BASE: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_DS_BASE)); + printf("VMCS_GUEST_ES_SELECTOR: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_ES_SELECTOR)); + printf("VMCS_GUEST_ES_LIMIT: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_ES_LIMIT)); + printf("VMCS_GUEST_ES_ACCESS_RIGHTS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_ES_ACCESS_RIGHTS)); + printf("VMCS_GUEST_ES_BASE: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_ES_BASE)); + printf("VMCS_GUEST_FS_SELECTOR: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_FS_SELECTOR)); + printf("VMCS_GUEST_FS_LIMIT: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_FS_LIMIT)); + printf("VMCS_GUEST_FS_ACCESS_RIGHTS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_FS_ACCESS_RIGHTS)); + printf("VMCS_GUEST_FS_BASE: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_FS_BASE)); + printf("VMCS_GUEST_GS_SELECTOR: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_GS_SELECTOR)); + printf("VMCS_GUEST_GS_LIMIT: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_GS_LIMIT)); + printf("VMCS_GUEST_GS_ACCESS_RIGHTS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_GS_ACCESS_RIGHTS)); + printf("VMCS_GUEST_GS_BASE: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_GS_BASE)); + printf("VMCS_GUEST_SS_SELECTOR: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_SS_SELECTOR)); + printf("VMCS_GUEST_SS_LIMIT: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_SS_LIMIT)); + printf("VMCS_GUEST_SS_ACCESS_RIGHTS: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_SS_ACCESS_RIGHTS)); + printf("VMCS_GUEST_SS_BASE: 0x%016llx\n", + vmcs_read(vcpu, VMCS_GUEST_SS_BASE)); + printf("VMCS_GUEST_LDTR_SELECTOR: 0x%016llx\n", + vmcs_read(vcpu, 
VMCS_GUEST_LDTR_SELECTOR));
+	printf("VMCS_GUEST_LDTR_LIMIT: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_LDTR_LIMIT));
+	printf("VMCS_GUEST_LDTR_ACCESS_RIGHTS: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_LDTR_ACCESS_RIGHTS));
+	printf("VMCS_GUEST_LDTR_BASE: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_LDTR_BASE));
+	printf("VMCS_GUEST_TR_SELECTOR: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_TR_SELECTOR));
+	printf("VMCS_GUEST_TR_LIMIT: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_TR_LIMIT));
+	printf("VMCS_GUEST_TR_ACCESS_RIGHTS: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_TR_ACCESS_RIGHTS));
+	printf("VMCS_GUEST_TR_BASE: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_TR_BASE));
+	printf("VMCS_GUEST_GDTR_LIMIT: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_GDTR_LIMIT));
+	printf("VMCS_GUEST_GDTR_BASE: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_GDTR_BASE));
+	printf("VMCS_GUEST_IDTR_LIMIT: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_IDTR_LIMIT));
+	printf("VMCS_GUEST_IDTR_BASE: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_IDTR_BASE));
+	printf("VMCS_GUEST_CR0: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_CR0));
+	printf("VMCS_GUEST_CR3: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_CR3));
+	printf("VMCS_GUEST_CR4: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_CR4));
+	printf("VMCS_GUEST_IA32_EFER: 0x%016llx\n",
+		vmcs_read(vcpu, VMCS_GUEST_IA32_EFER));
+	printf("\n");
+	printf("rip: 0x%016llx rfl: 0x%016llx cr2: 0x%016llx\n",
+		reg_read(vcpu, HV_X86_RIP), reg_read(vcpu, HV_X86_RFLAGS),
+		reg_read(vcpu, HV_X86_CR2));
+	printf("rax: 0x%016llx rbx: 0x%016llx rcx: 0x%016llx rdx: 0x%016llx\n",
+		reg_read(vcpu, HV_X86_RAX), reg_read(vcpu, HV_X86_RBX),
+		reg_read(vcpu, HV_X86_RCX), reg_read(vcpu, HV_X86_RDX));
+	printf("rsi: 0x%016llx rdi: 0x%016llx rbp: 0x%016llx rsp: 0x%016llx\n",
+		reg_read(vcpu, HV_X86_RSI), reg_read(vcpu, HV_X86_RDI),
+		reg_read(vcpu, HV_X86_RBP), reg_read(vcpu, HV_X86_RSP));
+	printf("r8: 0x%016llx r9: 0x%016llx r10: 0x%016llx r11: 0x%016llx\n",
+		reg_read(vcpu, HV_X86_R8), reg_read(vcpu, HV_X86_R9),
+		reg_read(vcpu, HV_X86_R10), reg_read(vcpu, HV_X86_R11));
+	printf("r12: 0x%016llx r13: 0x%016llx r14: 0x%016llx r15: 0x%016llx\n",
+		reg_read(vcpu, HV_X86_R12), reg_read(vcpu, HV_X86_R13),
+		reg_read(vcpu, HV_X86_R14), reg_read(vcpu, HV_X86_R15));
+}
+
+#ifdef XHYVE_CONFIG_TRACE
+static const char *
+exit_reason_to_str(int reason)
+{
+	static char reasonbuf[32];
+
+	switch (reason) {
+	case EXIT_REASON_EXCEPTION:
+		return "exception";
+	case EXIT_REASON_EXT_INTR:
+		return "extint";
+	case EXIT_REASON_TRIPLE_FAULT:
+		return "triplefault";
+	case EXIT_REASON_INIT:
+		return "init";
+	case EXIT_REASON_SIPI:
+		return "sipi";
+	case EXIT_REASON_IO_SMI:
+		return "iosmi";
+	case EXIT_REASON_SMI:
+		return "smi";
+	case EXIT_REASON_INTR_WINDOW:
+		return "intrwindow";
+	case EXIT_REASON_NMI_WINDOW:
+		return "nmiwindow";
+	case EXIT_REASON_TASK_SWITCH:
+		return "taskswitch";
+	case EXIT_REASON_CPUID:
+		return "cpuid";
+	case EXIT_REASON_GETSEC:
+		return "getsec";
+	case EXIT_REASON_HLT:
+		return "hlt";
+	case EXIT_REASON_INVD:
+		return "invd";
+	case EXIT_REASON_INVLPG:
+		return "invlpg";
+	case EXIT_REASON_RDPMC:
+		return "rdpmc";
+	case EXIT_REASON_RDTSC:
+		return "rdtsc";
+	case EXIT_REASON_RSM:
+		return "rsm";
+	case EXIT_REASON_VMCALL:
+		return "vmcall";
+	case EXIT_REASON_VMCLEAR:
+		return "vmclear";
+	case EXIT_REASON_VMLAUNCH:
+		return "vmlaunch";
+	case EXIT_REASON_VMPTRLD:
+		return "vmptrld";
+	case EXIT_REASON_VMPTRST:
+		return "vmptrst";
+	case EXIT_REASON_VMREAD:
+		return "vmread";
+	case
EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE_DURING_ENTRY: + return "mce-during-entry"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC_ACCESS: + return "apic-access"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + case EXIT_REASON_APIC_WRITE: + return "apic-write"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} +#endif /* XHYVE_CONFIG_TRACE */ + +// static int +// vmx_allow_x2apic_msrs(struct vmx *vmx) +// { +// int i, error; + +// error = 0; + +// /* +// * Allow readonly access to the following x2APIC MSRs from the guest. +// */ +// error += guest_msr_ro(vmx, MSR_APIC_ID); +// error += guest_msr_ro(vmx, MSR_APIC_VERSION); +// error += guest_msr_ro(vmx, MSR_APIC_LDR); +// error += guest_msr_ro(vmx, MSR_APIC_SVR); + +// for (i = 0; i < 8; i++) +// error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); + +// for (i = 0; i < 8; i++) +// error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); + +// for (i = 0; i < 8; i++) +// error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); + +// error += guest_msr_ro(vmx, MSR_APIC_ESR); +// error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); +// error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); +// error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); +// error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); +// error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); +// error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); +// error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); +// error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); +// error += guest_msr_ro(vmx, MSR_APIC_ICR); + +// /* +// * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. +// * +// * These registers get special treatment described in the section +// * "Virtualizing MSR-Based APIC Accesses". 
+// */ +// error += guest_msr_rw(vmx, MSR_APIC_TPR); +// error += guest_msr_rw(vmx, MSR_APIC_EOI); +// error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); + +// return (error); +// } + +u_long +vmx_fix_cr0(u_long cr0) +{ + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static int +vmx_cleanup(void) +{ + return (0); +} + +static int +vmx_init(void) +{ + int error = hv_vm_create(HV_VM_DEFAULT); + if (error) { + if (error == HV_NO_DEVICE) { + printf("vmx_init: processor not supported by " + "Hypervisor.framework\n"); + return (error); + } + else + xhyve_abort("hv_vm_create failed\n"); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(HV_VMX_CAP_PROCBASED, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(HV_VMX_CAP_PROCBASED2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(HV_VMX_CAP_PINBASED, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(HV_VMX_CAP_EXIT, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(HV_VMX_CAP_ENTRY, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = 1; + cap_monitor_trap = 1; + cap_pause_exit = 1; + // cap_unrestricted_guest = 1; + // cap_invpcid = 1; + + /* FIXME */ + cr0_ones_mask = cr4_ones_mask = 0; + cr0_zeros_mask = cr4_zeros_mask = 0; + + cr0_ones_mask |= (CR0_NE | CR0_ET); + cr0_zeros_mask |= (CR0_NW | CR0_CD); + cr4_ones_mask = 0x2000; + + vmx_msr_init(); + + return (0); +} + +static int +vmx_setup_cr_shadow(int vcpuid, int which, uint32_t initial) +{ + int error, mask_ident, shadow_ident; + uint64_t mask_value; + + if (which != 0 && which != 4) + xhyve_abort("vmx_setup_cr_shadow: unknown cr%d", which); + + if (which == 0) { + mask_ident = VMCS_CR0_MASK; + mask_value = (cr0_ones_mask | cr0_zeros_mask) | (CR0_PG | CR0_PE); + shadow_ident = VMCS_CR0_SHADOW; + } else { + mask_ident = VMCS_CR4_MASK; + mask_value = cr4_ones_mask | cr4_zeros_mask; + shadow_ident = VMCS_CR4_SHADOW; + } + + error = vmcs_setreg(vcpuid, VMCS_IDENT(mask_ident), mask_value); + if (error) + return (error); + + error = vmcs_setreg(vcpuid, VMCS_IDENT(shadow_ident), initial); + if (error) + return (error); + + return (0); +} +#define 
vmx_setup_cr0_shadow(vcpuid,init) vmx_setup_cr_shadow(vcpuid, 0, (init)) +#define vmx_setup_cr4_shadow(vcpuid,init) vmx_setup_cr_shadow(vcpuid, 4, (init)) + +static void * +vmx_vm_init(struct vm *vm) +{ + struct vmx *vmx; + + vmx = malloc(sizeof(struct vmx)); + assert(vmx); + bzero(vmx, sizeof(struct vmx)); + vmx->vm = vm; + + return (vmx); +} + +static int +vmx_vcpu_init(void *arg, int vcpuid) { + uint32_t exc_bitmap; + struct vmx *vmx; + hv_vcpuid_t hvid; + int error; + + vmx = (struct vmx *) arg; + + if (hv_vcpu_create(&hvid, HV_VCPU_DEFAULT)) { + xhyve_abort("hv_vcpu_create failed\n"); + } + + if (hvid != ((hv_vcpuid_t) vcpuid)) { + /* FIXME */ + xhyve_abort("vcpu id mismatch\n"); + } + + if (hv_vcpu_enable_native_msr(hvid, MSR_GSBASE, 1) || + hv_vcpu_enable_native_msr(hvid, MSR_FSBASE, 1) || + hv_vcpu_enable_native_msr(hvid, MSR_SYSENTER_CS_MSR, 1) || + hv_vcpu_enable_native_msr(hvid, MSR_SYSENTER_ESP_MSR, 1) || + hv_vcpu_enable_native_msr(hvid, MSR_SYSENTER_EIP_MSR, 1) || + hv_vcpu_enable_native_msr(hvid, MSR_TSC, 1) || + hv_vcpu_enable_native_msr(hvid, MSR_IA32_TSC_AUX, 1)) + { + xhyve_abort("vmx_vcpu_init: error setting guest msr access\n"); + } + + vmx_msr_guest_init(vmx, vcpuid); + + vmcs_write(vcpuid, VMCS_PIN_BASED_CTLS, pinbased_ctls); + vmcs_write(vcpuid, VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); + vmcs_write(vcpuid, VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); + vmcs_write(vcpuid, VMCS_EXIT_CTLS, exit_ctls); + vmcs_write(vcpuid, VMCS_ENTRY_CTLS, entry_ctls); + + /* exception bitmap */ + if (vcpu_trace_exceptions()) + exc_bitmap = 0xffffffff; + else + exc_bitmap = 1 << IDT_MC; + + vmcs_write(vcpuid, VMCS_EXCEPTION_BITMAP, exc_bitmap); + + vmx->cap[vcpuid].set = 0; + vmx->cap[vcpuid].proc_ctls = procbased_ctls; + vmx->cap[vcpuid].proc_ctls2 = procbased_ctls2; + vmx->state[vcpuid].nextrip = ~(uint64_t) 0; + + /* + * Set up the CR0/4 shadows, and init the read shadow + * to the power-on register value from the Intel Sys Arch. + * CR0 - 0x60000010 + * CR4 - 0 + */ + error = vmx_setup_cr0_shadow(vcpuid, 0x60000010); + if (error != 0) + xhyve_abort("vmx_setup_cr0_shadow %d\n", error); + + error = vmx_setup_cr4_shadow(vcpuid, 0); + + if (error != 0) + xhyve_abort("vmx_setup_cr4_shadow %d\n", error); + + return (0); +} + +static int +vmx_handle_cpuid(struct vm *vm, int vcpuid) +{ + uint32_t eax, ebx, ecx, edx; + int error; + + eax = (uint32_t) reg_read(vcpuid, HV_X86_RAX); + ebx = (uint32_t) reg_read(vcpuid, HV_X86_RBX); + ecx = (uint32_t) reg_read(vcpuid, HV_X86_RCX); + edx = (uint32_t) reg_read(vcpuid, HV_X86_RDX); + + error = x86_emulate_cpuid(vm, vcpuid, &eax, &ebx, &ecx, &edx); + + reg_write(vcpuid, HV_X86_RAX, eax); + reg_write(vcpuid, HV_X86_RBX, ebx); + reg_write(vcpuid, HV_X86_RCX, ecx); + reg_write(vcpuid, HV_X86_RDX, edx); + + return (error); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef XHYVE_CONFIG_TRACE + VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#llx", vmcs_guest_rip(vcpu)); +#else + (void) vmx; + (void) vcpu; +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled) +{ +#ifdef XHYVE_CONFIG_TRACE + VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0llx", + handled ? "handled" : "unhandled", + exit_reason_to_str((int) exit_reason), rip); +#else + (void) vmx; + (void) vcpu; + (void) rip; + (void) exit_reason; + (void) handled; +#endif +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 
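+ * The CTASSERT below verifies this at compile time.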
+ */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + vmcs_write(vcpu, VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); + } +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, + ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + vmcs_write(vcpu, VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(vcpu, VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + } +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, + ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(vcpu, VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); +} + +static void +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + uint32_t gi, info; + + gi = (uint32_t) vmcs_read(vcpu, VMCS_GUEST_INTERRUPTIBILITY); + KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " + "interruptibility-state %#x", gi)); + + info = (uint32_t) vmcs_read(vcpu, VMCS_ENTRY_INTR_INFO); + KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " + "VM-entry interruption information %#x", info)); + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; + vmcs_write(vcpu, VMCS_ENTRY_INTR_INFO, info); + + VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vm_nmi_clear(vmx->vm, vcpu); +} + +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) +{ + int vector, need_nmi_exiting, extint_pending; + uint64_t rflags, entryinfo; + uint32_t gi, info; + + if (vmx->state[vcpu].nextrip != guestrip) { + gi = (uint32_t) vmcs_read(vcpu, VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#llx/%#llx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(vcpu, VMCS_GUEST_INTERRUPTIBILITY, gi); + } + } + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %#llx", __func__, entryinfo)); + + info = (uint32_t) vmcs_read(vcpu, VMCS_ENTRY_INTR_INFO); + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#llx/%#x", __func__, entryinfo, info)); + + info = (uint32_t) entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. 
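+			 * (#BP and #OF are generated by the INT3 and INTO
+			 * instructions, not by hardware interrupts.)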
+ */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(vcpu, VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(vcpu, VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. + * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + need_nmi_exiting = 1; + gi = (uint32_t) vmcs_read(vcpu, VMCS_GUEST_INTERRUPTIBILITY); + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + info = (uint32_t) vmcs_read(vcpu, VMCS_ENTRY_INTR_INFO); + if ((info & VMCS_INTR_VALID) == 0) { + vmx_inject_nmi(vmx, vcpu); + need_nmi_exiting = 0; + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) + vmx_set_nmi_window_exiting(vmx, vcpu); + } + + extint_pending = vm_extint_pending(vmx->vm, vcpu); + + /* + * If interrupt-window exiting is already in effect then don't bother + * checking for pending interrupts. This is just an optimization and + * not needed for correctness. + */ + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { + VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " + "pending int_window_exiting"); + return; + } + + if (!extint_pending) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + rflags = vmcs_read(vcpu, VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#llx", vector, rflags); + goto cantinject; + } + + gi = (uint32_t) vmcs_read(vcpu, VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + + info = (uint32_t) vmcs_read(vcpu, VMCS_ENTRY_INTR_INFO); + if (info & VMCS_INTR_VALID) { + /* + * This is expected and could happen for multiple reasons: + * - A vectoring VM-entry was aborted due to astpending + * - A VM-exit happened during event injection. + * - An exception was injected above. 
+		 * - An NMI was injected above or after "NMI window exiting"
+		 */
+		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
+		    "VM-entry intr info %#x", vector, info);
+		goto cantinject;
+	}
+
+	/* Inject the interrupt */
+	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
+	info |= (uint32_t) vector;
+	vmcs_write(vcpu, VMCS_ENTRY_INTR_INFO, info);
+
+	if (!extint_pending) {
+		/* Update the Local APIC ISR */
+		vlapic_intr_accepted(vlapic, vector);
+	} else {
+		vm_extint_clear(vmx->vm, vcpu);
+		vatpic_intr_accepted(vmx->vm, vector);
+
+		/*
+		 * After we accepted the current ExtINT the PIC may
+		 * have posted another one. If that is the case, set
+		 * the Interrupt Window Exiting execution control so
+		 * we can inject that one too.
+		 *
+		 * Also, interrupt window exiting allows us to inject any
+		 * pending APIC vector that was preempted by the ExtINT
+		 * as soon as possible. This applies both for the software
+		 * emulated vlapic and the hardware assisted virtual APIC.
+		 */
+		vmx_set_int_window_exiting(vmx, vcpu);
+	}
+
+	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
+
+	return;
+
+cantinject:
+	/*
+	 * Set the Interrupt Window Exiting execution control so we can inject
+	 * the interrupt as soon as the blocking condition goes away.
+	 */
+	vmx_set_int_window_exiting(vmx, vcpu);
+}
+
+/*
+ * If the Virtual NMIs execution control is '1' then the logical processor
+ * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
+ * the VMCS. An IRET instruction in VMX non-root operation will remove any
+ * virtual-NMI blocking.
+ *
+ * This unblocking occurs even if the IRET causes a fault. In this case the
+ * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
+ */
+static void
+vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+	uint32_t gi;
+
+	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
+	gi = (uint32_t) vmcs_read(vcpuid, VMCS_GUEST_INTERRUPTIBILITY);
+	gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
+	vmcs_write(vcpuid, VMCS_GUEST_INTERRUPTIBILITY, gi);
+}
+
+static void
+vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+	uint32_t gi;
+
+	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
+	gi = (uint32_t) vmcs_read(vcpuid, VMCS_GUEST_INTERRUPTIBILITY);
+	gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
+	vmcs_write(vcpuid, VMCS_GUEST_INTERRUPTIBILITY, gi);
+}
+
+static void
+vmx_assert_nmi_blocking(int vcpuid)
+{
+	uint32_t gi;
+
+	gi = (uint32_t) vmcs_read(vcpuid, VMCS_GUEST_INTERRUPTIBILITY);
+	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
+	    ("NMI blocking is not in effect %#x", gi));
+}
+
+static int
+vmx_emulate_xsetbv(struct vmx *vmx, int vcpu)
+{
+	uint64_t xcrval;
+	const struct xsave_limits *limits;
+
+	limits = vmm_get_xsave_limits();
+
+	/*
+	 * Note that the processor raises a #GP fault on its own if
+	 * xsetbv is executed for CPL != 0, so we do not have to
+	 * emulate that fault here.
+	 */
+
+	/* Only xcr0 is supported. */
+	if (reg_read(vcpu, HV_X86_RCX) != 0) {
+		vm_inject_gp(vmx->vm, vcpu);
+		return (HANDLED);
+	}
+
+	/* We only handle xcr0 if both the host and guest have XSAVE enabled.
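+	 * Otherwise the guest receives #UD, just as it would on real hardware.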
*/ + if (!limits->xsave_enabled || + !(vmcs_read(vcpu, VMCS_GUEST_CR4) & CR4_XSAVE)) + { + vm_inject_ud(vmx->vm, vcpu); + return (HANDLED); + } + + xcrval = reg_read(vcpu, HV_X86_RDX) << 32 + | (reg_read(vcpu, HV_X86_RAX) & 0xffffffff); + + if ((xcrval & ~limits->xcr0_allowed) != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + if (!(xcrval & XFEATURE_ENABLED_X87)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* AVX (YMM_Hi128) requires SSE. */ + if (xcrval & XFEATURE_ENABLED_AVX && + (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, + * ZMM_Hi256, and Hi16_ZMM. + */ + if (xcrval & XFEATURE_AVX512 && + (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != + (XFEATURE_AVX512 | XFEATURE_AVX)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * Intel MPX requires both bound register state flags to be + * set. + */ + if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != + ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + reg_write(vcpu, HV_X86_XCR0, xcrval); + return (HANDLED); +} + +static uint64_t +vmx_get_guest_reg(int vcpu, int ident) +{ + switch (ident) { + case 0: + return (reg_read(vcpu, HV_X86_RAX)); + case 1: + return (reg_read(vcpu, HV_X86_RCX)); + case 2: + return (reg_read(vcpu, HV_X86_RDX)); + case 3: + return (reg_read(vcpu, HV_X86_RBX)); + case 4: + return (vmcs_read(vcpu, VMCS_GUEST_RSP)); + case 5: + return (reg_read(vcpu, HV_X86_RBP)); + case 6: + return (reg_read(vcpu, HV_X86_RSI)); + case 7: + return (reg_read(vcpu, HV_X86_RDI)); + case 8: + return (reg_read(vcpu, HV_X86_R8)); + case 9: + return (reg_read(vcpu, HV_X86_R9)); + case 10: + return (reg_read(vcpu, HV_X86_R10)); + case 11: + return (reg_read(vcpu, HV_X86_R11)); + case 12: + return (reg_read(vcpu, HV_X86_R12)); + case 13: + return (reg_read(vcpu, HV_X86_R13)); + case 14: + return (reg_read(vcpu, HV_X86_R14)); + case 15: + return (reg_read(vcpu, HV_X86_R15)); + default: + xhyve_abort("invalid vmx register %d", ident); + } +} + +static void +vmx_set_guest_reg(int vcpu, int ident, uint64_t regval) +{ + switch (ident) { + case 0: + reg_write(vcpu, HV_X86_RAX, regval); + break; + case 1: + reg_write(vcpu, HV_X86_RCX, regval); + break; + case 2: + reg_write(vcpu, HV_X86_RDX, regval); + break; + case 3: + reg_write(vcpu, HV_X86_RBX, regval); + break; + case 4: + vmcs_write(vcpu, VMCS_GUEST_RSP, regval); + break; + case 5: + reg_write(vcpu, HV_X86_RBP, regval); + break; + case 6: + reg_write(vcpu, HV_X86_RSI, regval); + break; + case 7: + reg_write(vcpu, HV_X86_RDI, regval); + break; + case 8: + reg_write(vcpu, HV_X86_R8, regval); + break; + case 9: + reg_write(vcpu, HV_X86_R9, regval); + break; + case 10: + reg_write(vcpu, HV_X86_R10, regval); + break; + case 11: + reg_write(vcpu, HV_X86_R11, regval); + break; + case 12: + reg_write(vcpu, HV_X86_R12, regval); + break; + case 13: + reg_write(vcpu, HV_X86_R13, regval); + break; + case 14: + reg_write(vcpu, HV_X86_R14, regval); + break; + case 15: + reg_write(vcpu, HV_X86_R15, regval); + break; + default: + xhyve_abort("invalid vmx register %d", ident); + } +} + +static int +vmx_emulate_cr0_access(UNUSED struct vm *vm, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + // *pt; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(vcpu, 
VMCS_CR0_SHADOW, regval); + + crval = regval | cr0_ones_mask; + crval &= ~cr0_zeros_mask; + // printf("cr0: v:0x%016llx 1:0x%08llx 0:0x%08llx v:0x%016llx\n", + // regval, cr0_ones_mask, cr0_zeros_mask, crval); + vmcs_write(vcpu, VMCS_GUEST_CR0, crval); + + if (regval & CR0_PG) { + uint64_t efer, entryctls; + + /* + * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and + * the "IA-32e mode guest" bit in VM-entry control must be + * equal. + */ + efer = vmcs_read(vcpu, VMCS_GUEST_IA32_EFER); + if (efer & EFER_LME) { + efer |= EFER_LMA; + vmcs_write(vcpu, VMCS_GUEST_IA32_EFER, efer); + entryctls = vmcs_read(vcpu, VMCS_ENTRY_CTLS); + entryctls |= VM_ENTRY_GUEST_LMA; + vmcs_write(vcpu, VMCS_ENTRY_CTLS, entryctls); + } + + // if (vmcs_read(vcpu, VMCS_GUEST_CR4) & CR4_PAE) { + // if (!(pt = (uint64_t *) vm_gpa2hva(vm, + // vmcs_read(vcpu, VMCS_GUEST_CR3), sizeof(uint64_t) * 4))) + // { + // xhyve_abort("invalid cr3\n"); + // } + + // vmcs_write(vcpu, VMCS_GUEST_PDPTE0, pt[0]); + // vmcs_write(vcpu, VMCS_GUEST_PDPTE1, pt[1]); + // vmcs_write(vcpu, VMCS_GUEST_PDPTE2, pt[2]); + // vmcs_write(vcpu, VMCS_GUEST_PDPTE3, pt[3]); + // } + } + + return (HANDLED); +} + +static int +vmx_emulate_cr4_access(int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr4 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(vcpu, VMCS_CR4_SHADOW, regval); + + crval = regval | cr4_ones_mask; + crval &= ~cr4_zeros_mask; + vmcs_write(vcpu, VMCS_GUEST_CR4, crval); + + return (HANDLED); +} + +static int +vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + struct vlapic *vlapic; + uint64_t cr8; + int regnum; + + /* We only handle mov %cr8 to/from a register at this time. */ + if ((exitqual & 0xe0) != 0x00) { + return (UNHANDLED); + } + + vlapic = vm_lapic(vmx->vm, vcpu); + regnum = (exitqual >> 8) & 0xf; + if (exitqual & 0x10) { + cr8 = vlapic_get_cr8(vlapic); + vmx_set_guest_reg(vcpu, regnum, cr8); + } else { + cr8 = vmx_get_guest_reg(vcpu, regnum); + vlapic_set_cr8(vlapic, cr8); + } + + return (HANDLED); +} + +/* + * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL + */ +static int +vmx_cpl(int vcpu) +{ + uint32_t ssar; + + ssar = (uint32_t) vmcs_read(vcpu, VMCS_GUEST_SS_ACCESS_RIGHTS); + return ((ssar >> 5) & 0x3); +} + +static enum vm_cpu_mode +vmx_cpu_mode(int vcpu) +{ + uint32_t csar; + + if (vmcs_read(vcpu, VMCS_GUEST_IA32_EFER) & EFER_LMA) { + csar = (uint32_t) vmcs_read(vcpu, VMCS_GUEST_CS_ACCESS_RIGHTS); + if (csar & 0x2000) + return (CPU_MODE_64BIT); /* CS.L = 1 */ + else + return (CPU_MODE_COMPATIBILITY); + } else if (vmcs_read(vcpu, VMCS_GUEST_CR0) & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +vmx_paging_mode(int vcpu) +{ + + if (!(vmcs_read(vcpu, VMCS_GUEST_CR0) & CR0_PG)) + return (PAGING_MODE_FLAT); + if (!(vmcs_read(vcpu, VMCS_GUEST_CR4) & CR4_PAE)) + return (PAGING_MODE_32); + if (vmcs_read(vcpu, VMCS_GUEST_IA32_EFER) & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +static uint64_t +inout_str_index(struct vmx *vmx, int vcpuid, int in) +{ + uint64_t val; + int error; + enum vm_reg_name reg; + + reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + error = vmx_getreg(vmx, vcpuid, reg, &val); + KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); + return (val); +} + +static uint64_t +inout_str_count(struct vmx *vmx, int vcpuid, int rep) +{ + uint64_t val; + int error; + + if (rep) { + error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); + KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); + } else { + val = 1; + } + return (val); +} + +static int +inout_str_addrsize(uint32_t inst_info) +{ + uint32_t size; + + size = (inst_info >> 7) & 0x7; + switch (size) { + case 0: + return (2); /* 16 bit */ + case 1: + return (4); /* 32 bit */ + case 2: + return (8); /* 64 bit */ + default: + xhyve_abort("%s: invalid size encoding %d", __func__, size); + } +} + +static void +inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, + struct vm_inout_str *vis) +{ + int error, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + s = (inst_info >> 15) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); +} + +static void +vmx_paging_info(struct vm_guest_paging *paging, int vcpu) +{ + paging->cr3 = vmcs_guest_cr3(vcpu); + paging->cpl = vmx_cpl(vcpu); + paging->cpu_mode = vmx_cpu_mode(vcpu); + paging->paging_mode = vmx_paging_mode(vcpu); +} + +static void +vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla, int vcpu) +{ + struct vm_guest_paging *paging; + uint32_t csar; + + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = gla; + vmx_paging_info(paging, vcpu); + switch (paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = vmcs_read(vcpu, VMCS_GUEST_CS_BASE); + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = vmcs_read(vcpu, VMCS_GUEST_CS_BASE); + csar = (uint32_t) vmcs_read(vcpu, VMCS_GUEST_CS_ACCESS_RIGHTS); + vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); + break; + case CPU_MODE_64BIT: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + vie_init(&vmexit->u.inst_emul.vie, NULL, 0); +} + +static int +ept_fault_type(uint64_t ept_qual) +{ + int fault_type; + + if (ept_qual & EPT_VIOLATION_DATA_WRITE) + fault_type = XHYVE_PROT_WRITE; + else if (ept_qual & EPT_VIOLATION_INST_FETCH) + fault_type = XHYVE_PROT_EXECUTE; + else + fault_type= XHYVE_PROT_READ; + + return (fault_type); +} + +static bool +ept_emulation_fault(uint64_t ept_qual) +{ + int read, write; + + /* EPT fault on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_INST_FETCH) + return (FALSE); + + /* EPT fault must be a read fault or a write fault */ + read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; + write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; + if ((read | write) == 0) + return (FALSE); + + /* + * The EPT violation must have been caused by accessing a + * guest-physical address that is a translation of a guest-linear + * address. 
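+ *
+ * Both exit-qualification bits must therefore be set; the check
+ * below is equivalent to:
+ *
+ *   (ept_qual & (EPT_VIOLATION_GLA_VALID | EPT_VIOLATION_XLAT_VALID)) ==
+ *       (EPT_VIOLATION_GLA_VALID | EPT_VIOLATION_XLAT_VALID)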
+ */ + if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || + (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { + return (FALSE); + } + + return (TRUE); +} + +static __inline int +apic_access_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); +} + +static __inline int +x2apic_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); +} + +static int +vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, + uint64_t qual) +{ + int error, handled, offset; + uint32_t *apic_regs, vector; + bool retu; + + handled = HANDLED; + offset = APIC_WRITE_OFFSET(qual); + + if (!apic_access_virtualization(vmx, vcpuid)) { + /* + * In general there should not be any APIC write VM-exits + * unless APIC-access virtualization is enabled. + * + * However self-IPI virtualization can legitimately trigger + * an APIC-write VM-exit so treat it specially. + */ + if (x2apic_virtualization(vmx, vcpuid) && + offset == APIC_OFFSET_SELF_IPI) { + apic_regs = (uint32_t *)(vlapic->apic_page); + vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; + vlapic_self_ipi_handler(vlapic, vector); + return (HANDLED); + } else + return (UNHANDLED); + } + + switch (offset) { + case APIC_OFFSET_ID: + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_LDR: + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + retu = false; + error = vlapic_icrlo_write_handler(vlapic, &retu); + if (error != 0 || retu) + handled = UNHANDLED; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT: + case APIC_OFFSET_THERM_LVT: + case APIC_OFFSET_PERF_LVT: + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + case APIC_OFFSET_ERROR_LVT: + vlapic_lvt_write_handler(vlapic, ((uint32_t) offset)); + break; + case APIC_OFFSET_TIMER_ICR: + vlapic_icrtmr_write_handler(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + vlapic_dcr_write_handler(vlapic); + break; + default: + handled = UNHANDLED; + break; + } + return (handled); +} + +static bool +apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) +{ + + if (apic_access_virtualization(vmx, vcpuid) && + (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + XHYVE_PAGE_SIZE)) + return (true); + else + return (false); +} + +static int +vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint64_t qual; + int access_type, offset, allowed; + + if (!apic_access_virtualization(vmx, vcpuid)) + return (UNHANDLED); + + qual = vmexit->u.vmx.exit_qualification; + access_type = APIC_ACCESS_TYPE(qual); + offset = APIC_ACCESS_OFFSET(qual); + + allowed = 0; + if (access_type == 0) { + /* + * Read data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } else if (access_type == 1) { + /* + * Write data access to the following registers is expected. 
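+ * These are registers a guest may plausibly touch with
+ * read-modify-write instructions; such an access is emulated
+ * (via vmexit_inst_emul() below) rather than rejected.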
+ */ + switch (offset) { + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0: + case APIC_OFFSET_ISR1: + case APIC_OFFSET_ISR2: + case APIC_OFFSET_ISR3: + case APIC_OFFSET_ISR4: + case APIC_OFFSET_ISR5: + case APIC_OFFSET_ISR6: + case APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0: + case APIC_OFFSET_TMR1: + case APIC_OFFSET_TMR2: + case APIC_OFFSET_TMR3: + case APIC_OFFSET_TMR4: + case APIC_OFFSET_TMR5: + case APIC_OFFSET_TMR6: + case APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0: + case APIC_OFFSET_IRR1: + case APIC_OFFSET_IRR2: + case APIC_OFFSET_IRR3: + case APIC_OFFSET_IRR4: + case APIC_OFFSET_IRR5: + case APIC_OFFSET_IRR6: + case APIC_OFFSET_IRR7: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } + + if (allowed) { + vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + ((uint32_t) offset), + VIE_INVALID_GLA, vcpuid); + } + + /* + * Regardless of whether the APIC-access is allowed this handler + * always returns UNHANDLED: + * - if the access is allowed then it is handled by emulating the + * instruction that caused the VM-exit (outside the critical section) + * - if the access is not allowed then it will be converted to an + * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. + */ + return (UNHANDLED); +} + +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ + int reason; + + reason = (qual >> 30) & 0x3; + switch (reason) { + case 0: + return (TSR_CALL); + case 1: + return (TSR_IRET); + case 2: + return (TSR_JMP); + case 3: + return (TSR_IDT_GATE); + default: + xhyve_abort("%s: invalid reason %d", __func__, reason); + } +} + +static int +emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); + else + error = vmx_wrmsr(vmx, vcpuid, num, val); + + return (error); +} + +static int +emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) +{ + uint64_t result; + uint32_t eax, edx; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); + else + error = vmx_rdmsr(vmx, vcpuid, num, &result); + + if (error == 0) { + eax = (uint32_t) result; + reg_write(vcpuid, HV_X86_RAX, eax); + edx = (uint32_t) (result >> 32); + reg_write(vcpuid, HV_X86_RDX, edx); + } + + return (error); +} + +static int +vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + int error, errcode, errcode_valid, handled, in; + struct vlapic *vlapic; + struct vm_inout_str *vis; + struct vm_task_switch *ts; + uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; + uint32_t intr_type, intr_vec, reason; + uint64_t exitintinfo, qual, gpa; + bool retu; + + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); + + handled = UNHANDLED; + + qual = vmexit->u.vmx.exit_qualification; + reason = vmexit->u.vmx.exit_reason; + vmexit->exitcode = VM_EXITCODE_BOGUS; + + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); + + /* + * VM exits that can be triggered during event delivery need to + * be handled specially by re-injecting the event if the IDT + * vectoring information field's valid bit is set. + * + * See "Information for VM Exits During Event Delivery" in Intel SDM + * for details. 
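+ *
+ * For example, if delivering a timer interrupt through the IDT
+ * faults on a not-present stack page, the resulting exit carries
+ * the interrupt in the IDT-vectoring info field; vm_exit_intinfo()
+ * below queues it for re-injection so the interrupt is not lost.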
+ */ + idtvec_info = (uint32_t) vmcs_idt_vectoring_info(vcpu); + if (idtvec_info & VMCS_IDT_VEC_VALID) { + idtvec_info &= ~(1u << 12); /* clear undefined bit */ + exitintinfo = idtvec_info; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + idtvec_err = (uint32_t) vmcs_idt_vectoring_err(vcpu); + exitintinfo |= (uint64_t)idtvec_err << 32; + } + error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); + KASSERT(error == 0, ("%s: vm_set_intinfo error %d", + __func__, error)); + + /* + * If 'virtual NMIs' are being used and the VM-exit + * happened while injecting an NMI during the previous + * VM-entry, then clear "blocking by NMI" in the + * Guest Interruptibility-State so the NMI can be + * reinjected on the subsequent VM-entry. + * + * However, if the NMI was being delivered through a task + * gate, then the new task must start execution with NMIs + * blocked so don't clear NMI blocking in this case. + */ + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type == VMCS_INTR_T_NMI) { + if (reason != EXIT_REASON_TASK_SWITCH) + vmx_clear_nmi_blocking(vmx, vcpu); + else + vmx_assert_nmi_blocking(vcpu); + } + + /* + * Update VM-entry instruction length if the event being + * delivered was a software interrupt or software exception. + */ + if (intr_type == VMCS_INTR_T_SWINTR || + intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || + intr_type == VMCS_INTR_T_SWEXCEPTION) { + vmcs_write(vcpu, VMCS_ENTRY_INST_LENGTH, + ((uint64_t) vmexit->inst_length)); + } + } + + switch (reason) { + case EXIT_REASON_TASK_SWITCH: + ts = &vmexit->u.task_switch; + ts->tsssel = qual & 0xffff; + ts->reason = vmx_task_switch_reason(qual); + ts->ext = 0; + ts->errcode_valid = 0; + vmx_paging_info(&ts->paging, vcpu); + /* + * If the task switch was due to a CALL, JMP, IRET, software + * interrupt (INT n) or software exception (INT3, INTO), + * then the saved %rip references the instruction that caused + * the task switch. The instruction length field in the VMCS + * is valid in this case. + * + * In all other cases (e.g., NMI, hardware exception) the + * saved %rip is one that would have been saved in the old TSS + * had the task switch completed normally so the instruction + * length field is not needed in this case and is explicitly + * set to 0. + */ + if (ts->reason == TSR_IDT_GATE) { + KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, + ("invalid idtvec_info %#x for IDT task switch", + idtvec_info)); + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type != VMCS_INTR_T_SWINTR && + intr_type != VMCS_INTR_T_SWEXCEPTION && + intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { + /* Task switch triggered by external event */ + ts->ext = 1; + vmexit->inst_length = 0; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + ts->errcode_valid = 1; + ts->errcode = (uint32_t) vmcs_idt_vectoring_err(vcpu); + } + } + } + vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; + VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " + "%s errcode 0x%016llx", ts->reason, ts->tsssel, + ts->ext ? 
"external" : "internal", + ((uint64_t)ts->errcode << 32) | ((uint64_t) ts->errcode_valid)); + break; + case EXIT_REASON_CR_ACCESS: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); + switch (qual & 0xf) { + case 0: + handled = vmx_emulate_cr0_access(vmx->vm, vcpu, qual); + break; + case 4: + handled = vmx_emulate_cr4_access(vcpu, qual); + break; + case 8: + handled = vmx_emulate_cr8_access(vmx, vcpu, qual); + break; + } + break; + case EXIT_REASON_RDMSR: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); + retu = false; + ecx = (uint32_t) reg_read(vcpu, HV_X86_RCX); + VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); + // printf("EXIT_REASON_RDMSR 0x%08x\n", ecx); + error = emulate_rdmsr(vmx, vcpu, ecx, &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_WRMSR: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); + retu = false; + eax = (uint32_t) reg_read(vcpu, HV_X86_RAX); + ecx = (uint32_t) reg_read(vcpu, HV_X86_RCX); + edx = (uint32_t) reg_read(vcpu, HV_X86_RDX); + VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016llx", + ecx, (uint64_t)edx << 32 | eax); + // printf("EXIT_REASON_WRMSR 0x%08x value 0x%016llx\n", + // ecx, (uint64_t)edx << 32 | eax); + error = emulate_wrmsr(vmx, vcpu, ecx, + (uint64_t)edx << 32 | eax, &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_HLT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = vmcs_read(vcpu, VMCS_GUEST_RFLAGS); + break; + case EXIT_REASON_MTF: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); + vmexit->exitcode = VM_EXITCODE_MTRAP; + vmexit->inst_length = 0; + break; + case EXIT_REASON_PAUSE: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); + vmexit->exitcode = VM_EXITCODE_PAUSE; + break; + case EXIT_REASON_INTR_WINDOW: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); + vmx_clear_int_window_exiting(vmx, vcpu); + return (1); + case EXIT_REASON_EXT_INTR: + /* + * External interrupts serve only to cause VM exits and allow + * the host interrupt handler to run. + * + * If this external interrupt triggers a virtual interrupt + * to a VM, then that state will be recorded by the + * host interrupt handler in the VM's softc. We will inject + * this virtual interrupt during the subsequent VM enter. + */ + intr_info = (uint32_t) vmcs_read(vcpu, VMCS_EXIT_INTR_INFO); + + /* + * XXX: Ignore this exit if VMCS_INTR_VALID is not set. + * This appears to be a bug in VMware Fusion? + */ + if (!(intr_info & VMCS_INTR_VALID)) + return (1); + KASSERT((intr_info & VMCS_INTR_VALID) != 0 && + (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, + ("VM exit interruption info invalid: %#x", intr_info)); + + /* + * This is special. We want to treat this as an 'handled' + * VM-exit but not increment the instruction pointer. 
+ */ + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); + return (1); + case EXIT_REASON_NMI_WINDOW: + /* Exit to allow the pending virtual NMI to be injected */ + if (vm_nmi_pending(vmx->vm, vcpu)) + vmx_inject_nmi(vmx, vcpu); + vmx_clear_nmi_window_exiting(vmx, vcpu); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); + return (1); + case EXIT_REASON_INOUT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.bytes = (qual & 0x7) + 1; + vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; + vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; + vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t) reg_read(vcpu, HV_X86_RAX); + // if ((vmexit->u.inout.port == 0x0020) || + // (vmexit->u.inout.port == 0x0021) || + // (vmexit->u.inout.port == 0x00a0) || + // (vmexit->u.inout.port == 0x00a1)) + // { + // printf("EXIT_REASON_INOUT port 0x%03x in %d\n", + // vmexit->u.inout.port, vmexit->u.inout.in); + // } + if (vmexit->u.inout.string) { + inst_info = (uint32_t) vmcs_read(vcpu, VMCS_EXIT_INSTRUCTION_INFO); + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + vmx_paging_info(&vis->paging, vcpu); + vis->rflags = vmcs_read(vcpu, VMCS_GUEST_RFLAGS); + vis->cr0 = vmcs_read(vcpu, VMCS_GUEST_CR0); + vis->index = inout_str_index(vmx, vcpu, in); + vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); + vis->addrsize = inout_str_addrsize(inst_info); + inout_str_seginfo(vmx, vcpu, inst_info, in, vis); + } + break; + case EXIT_REASON_CPUID: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); + handled = vmx_handle_cpuid(vmx->vm, vcpu); + break; + case EXIT_REASON_EXCEPTION: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); + intr_info = (uint32_t) vmcs_read(vcpu, VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); + + intr_vec = intr_info & 0xff; + intr_type = intr_info & VMCS_INTR_T_MASK; + + /* + * If Virtual NMIs control is 1 and the VM-exit is due to a + * fault encountered during the execution of IRET then we must + * restore the state of "virtual-NMI blocking" before resuming + * the guest. + * + * See "Resuming Guest Software after Handling an Exception". + * See "Information for VM Exits Due to Vectored Events". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (intr_vec != IDT_DF) && + (intr_info & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + + /* + * The NMI has already been handled in vmx_exit_handle_nmi(). + */ + if (intr_type == VMCS_INTR_T_NMI) + return (1); + + if (intr_vec == IDT_PF) { + reg_write(vcpu, HV_X86_CR2, qual); + } + + /* + * Software exceptions exhibit trap-like behavior. This in + * turn requires populating the VM-entry instruction length + * so that the %rip in the trap frame is past the INT3/INTO + * instruction. 
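+ *
+ * E.g. for a one-byte INT3 at guest address G, the #BP handler
+ * must see a return %rip of G+1; the VM-entry instruction length
+ * written below makes the CPU build the trap frame that way when
+ * the exception is reflected.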
+ */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION) + vmcs_write(vcpu, VMCS_ENTRY_INST_LENGTH, + ((uint64_t) vmexit->inst_length)); + + /* Reflect all other exceptions back into the guest */ + errcode_valid = errcode = 0; + if (intr_info & VMCS_INTR_DEL_ERRCODE) { + errcode_valid = 1; + errcode = (int) vmcs_read(vcpu, VMCS_EXIT_INTR_ERRCODE); + } + VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " + "the guest", intr_vec, errcode); + error = vm_inject_exception(vmx->vm, vcpu, ((int) intr_vec), + errcode_valid, ((uint32_t) errcode), 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + return (1); + + case EXIT_REASON_EPT_FAULT: + /* + * If 'gpa' lies within the address space allocated to + * memory then this must be a nested page fault otherwise + * this must be an instruction that accesses MMIO space. + */ + gpa = vmcs_gpa(vcpu); + if (vm_mem_allocated(vmx->vm, gpa) || + apic_access_fault(vmx, vcpu, gpa)) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->inst_length = 0; + vmexit->u.paging.gpa = gpa; + vmexit->u.paging.fault_type = ept_fault_type(qual); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + } else if (ept_emulation_fault(qual)) { + vmexit_inst_emul(vmexit, gpa, vmcs_gla(vcpu), vcpu); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); + } + /* + * If Virtual NMIs control is 1 and the VM-exit is due to an + * EPT fault during the execution of IRET then we must restore + * the state of "virtual-NMI blocking" before resuming. + * + * See description of "NMI unblocking due to IRET" in + * "Exit Qualification for EPT Violations". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (qual & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + break; + case EXIT_REASON_VIRTUALIZED_EOI: + vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; + vmexit->u.ioapic_eoi.vector = qual & 0xFF; + vmexit->inst_length = 0; /* trap-like */ + break; + case EXIT_REASON_APIC_ACCESS: + handled = vmx_handle_apic_access(vmx, vcpu, vmexit); + break; + case EXIT_REASON_APIC_WRITE: + /* + * APIC-write VM exit is trap-like so the %rip is already + * pointing to the next instruction. + */ + vmexit->inst_length = 0; + vlapic = vm_lapic(vmx->vm, vcpu); + handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); + break; + case EXIT_REASON_XSETBV: + handled = vmx_emulate_xsetbv(vmx, vcpu); + break; + case EXIT_REASON_MONITOR: + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case EXIT_REASON_MWAIT: + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + default: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel. + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vmexit->rip += (uint64_t) vmexit->inst_length; + vmexit->inst_length = 0; + vmcs_write(vcpu, VMCS_GUEST_RIP, vmexit->rip); + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_SUCCESS; + vmexit->u.vmx.inst_type = 0; + vmexit->u.vmx.inst_error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. 
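+ *
+ * (For example, VM_EXITCODE_INOUT exits are completed by the
+ * device models before the vcpu re-enters the guest.)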
+ */ + } + } + return (handled); +} + +static int +vmx_run(void *arg, int vcpu, register_t rip, void *rendezvous_cookie, + void *suspend_cookie) +{ + int handled; + struct vmx *vmx; + struct vm *vm; + struct vm_exit *vmexit; + struct vlapic *vlapic; + uint32_t exit_reason; + hv_return_t hvr; + + vmx = arg; + vm = vmx->vm; + vlapic = vm_lapic(vm, vcpu); + vmexit = vm_exitinfo(vm, vcpu); + + vmcs_write(vcpu, VMCS_GUEST_RIP, ((uint64_t) rip)); + + do { + KASSERT(vmcs_guest_rip(vcpu) == ((uint64_t) rip), + ("%s: vmcs guest rip mismatch %#llx/%#llx", + __func__, vmcs_guest_rip(vcpu), ((uint64_t) rip))); + + handled = UNHANDLED; + + vmx_inject_interrupts(vmx, vcpu, vlapic, ((uint64_t) rip)); + + /* + * Check for vcpu suspension after injecting events because + * vmx_inject_interrupts() can suspend the vcpu due to a + * triple fault. + */ + if (vcpu_suspended(suspend_cookie)) { + vm_exit_suspended(vmx->vm, vcpu, ((uint64_t) rip)); + break; + } + + if (vcpu_rendezvous_pending(rendezvous_cookie)) { + vm_exit_rendezvous(vmx->vm, vcpu, ((uint64_t) rip)); + break; + } + + vmx_run_trace(vmx, vcpu); + hvr = hv_vcpu_run((hv_vcpuid_t) vcpu); + /* Collect some information for VM exit processing */ + rip = (register_t) vmcs_guest_rip(vcpu); + vmexit->rip = (uint64_t) rip; + vmexit->inst_length = (int) vmexit_instruction_length(vcpu); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(vcpu); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(vcpu); + /* Update 'nextrip' */ + vmx->state[vcpu].nextrip = (uint64_t) rip; + if (hvr == HV_SUCCESS) { + handled = vmx_exit_process(vmx, vcpu, vmexit); + } else { + hvdump(vcpu); + xhyve_abort("vmentry error\n"); + } + vmx_exit_trace(vmx, vcpu, ((uint64_t) rip), exit_reason, handled); + rip = (register_t) vmexit->rip; + } while (handled); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + xhyve_abort("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + if (!handled) + vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); + + VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", + vmexit->exitcode); + + return (0); +} + +static void +vmx_vm_cleanup(void *arg) +{ + struct vmx *vmx = arg; + + free(vmx); + + return; +} + +static void +vmx_vcpu_cleanup(void *arg, int vcpuid) { + if (arg || vcpuid) xhyve_abort("vmx_vcpu_cleanup\n"); +} + + +static int +vmx_get_intr_shadow(int vcpu, uint64_t *retval) +{ + uint64_t gi; + int error; + + error = vmcs_getreg(vcpu, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); + *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; + return (error); +} + +static int +vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, uint64_t val) +{ + uint64_t gi; + int error, ident; + + /* + * Forcing the vcpu into an interrupt shadow is not supported. + */ + if (val) { + error = EINVAL; + goto done; + } + + ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); + error = vmcs_getreg(vcpu, ident, &gi); + if (error == 0) { + gi &= ~HWINTR_BLOCKING; + error = vmcs_setreg(vcpu, ident, gi); + } +done: + VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#llx %s", val, + error ? 
"failed" : "succeeded"); + return (error); +} + +static int +vmx_shadow_reg(int reg) +{ + int shreg; + + shreg = -1; + + switch (reg) { + case VM_REG_GUEST_CR0: + shreg = VMCS_CR0_SHADOW; + break; + case VM_REG_GUEST_CR4: + shreg = VMCS_CR4_SHADOW; + break; + default: + break; + } + + return (shreg); +} + +static const hv_x86_reg_t hvregs[] = { + HV_X86_RAX, + HV_X86_RBX, + HV_X86_RCX, + HV_X86_RDX, + HV_X86_RSI, + HV_X86_RDI, + HV_X86_RBP, + HV_X86_R8, + HV_X86_R9, + HV_X86_R10, + HV_X86_R11, + HV_X86_R12, + HV_X86_R13, + HV_X86_R14, + HV_X86_R15, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_CR2, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX, + HV_X86_REGISTERS_MAX +}; + +static int +vmx_getreg(UNUSED void *arg, int vcpu, int reg, uint64_t *retval) +{ + hv_x86_reg_t hvreg; + + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_get_intr_shadow(vcpu, retval)); + + hvreg = hvregs[reg]; + if (hvreg != HV_X86_REGISTERS_MAX) { + *retval = reg_read(vcpu, hvreg); + return (0); + } + + return (vmcs_getreg(vcpu, reg, retval)); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int error, shadow; + uint64_t ctls; + hv_x86_reg_t hvreg; + struct vmx *vmx = arg; + + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_modify_intr_shadow(vmx, vcpu, val)); + + hvreg = hvregs[reg]; + if (hvreg != HV_X86_REGISTERS_MAX) { + reg_write(vcpu, hvreg, val); + return (0); + } + + error = vmcs_setreg(vcpu, reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. + */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(vcpu, VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(vcpu, VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + + shadow = vmx_shadow_reg(reg); + if (shadow > 0) { + /* + * Store the unmodified value in the shadow + */ + error = vmcs_setreg(vcpu, VMCS_IDENT(shadow), val); + } + + if (reg == VM_REG_GUEST_CR3) { + /* + * Invalidate the guest vcpu's TLB mappings to emulate + * the behavior of updating %cr3. + */ + hv_vcpu_invalidate_tlb((hv_vcpuid_t) vcpu); + } + } + + return (error); +} + +static int +vmx_getdesc(UNUSED void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + return (vmcs_getdesc(vcpu, reg, desc)); +} + +static int +vmx_setdesc(UNUSED void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + return (vmcs_setdesc(vcpu, reg, desc)); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 
1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + uint32_t baseval; + uint32_t *pptr; + uint32_t reg; + uint32_t flag; + int retval; + + retval = ENOENT; + pptr = NULL; + baseval = 0; + reg = 0; + flag = 0; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + default: + xhyve_abort("vmx_setcap\n"); + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + + vmcs_write(vcpu, reg, baseval); + + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + + } + + return (retval); +} + +struct vlapic_vtx { + struct vlapic vlapic; + struct pir_desc *pir_desc; + struct vmx *vmx; +}; + +// #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ +// do { \ +// VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ +// level ? "level" : "edge", vector); \ +// VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ +// VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ +// VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ +// VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ +// VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ +// } while (0) + +// /* +// * vlapic->ops handlers that utilize the APICv hardware assist described in +// * Chapter 29 of the Intel SDM. +// */ +// static int +// vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +// { +// struct vlapic_vtx *vlapic_vtx; +// struct pir_desc *pir_desc; +// uint64_t mask; +// int idx, notify; + +// vlapic_vtx = (struct vlapic_vtx *)vlapic; +// pir_desc = vlapic_vtx->pir_desc; + +// /* +// * Keep track of interrupt requests in the PIR descriptor. This is +// * because the virtual APIC page pointed to by the VMCS cannot be +// * modified if the vcpu is running. +// */ +// idx = vector / 64; +// mask = 1UL << (vector % 64); +// atomic_set_long(&pir_desc->pir[idx], mask); +// notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); + +// VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, +// level, "vmx_set_intr_ready"); +// return (notify); +// } + +// static int +// vmx_pending_intr(struct vlapic *vlapic, int *vecptr) +// { +// struct vlapic_vtx *vlapic_vtx; +// struct pir_desc *pir_desc; +// struct LAPIC *lapic; +// uint64_t pending, pirval; +// uint32_t ppr, vpr; +// int i; + +// /* +// * This function is only expected to be called from the 'HLT' exit +// * handler which does not care about the vector that is pending. 
+// */ +// KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); + +// vlapic_vtx = (struct vlapic_vtx *)vlapic; +// pir_desc = vlapic_vtx->pir_desc; + +// pending = atomic_load_acq_long(&pir_desc->pending); +// if (!pending) +// return (0); /* common case */ + +// /* +// * If there is an interrupt pending then it will be recognized only +// * if its priority is greater than the processor priority. +// * +// * Special case: if the processor priority is zero then any pending +// * interrupt will be recognized. +// */ +// lapic = vlapic->apic_page; +// ppr = lapic->ppr & 0xf0; +// if (ppr == 0) +// return (1); + +// VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", +// lapic->ppr); + +// for (i = 3; i >= 0; i--) { +// pirval = pir_desc->pir[i]; +// if (pirval != 0) { +// vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; +// return (vpr > ppr); +// } +// } +// return (0); +// } + +// static void +// vmx_intr_accepted(struct vlapic *vlapic, int vector) +// { + +// xhyve_abort("vmx_intr_accepted: not expected to be called"); +// } + +// static void +// vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +// { +// struct vlapic_vtx *vlapic_vtx; +// struct vmx *vmx; +// struct vmcs *vmcs; +// uint64_t mask, val; + +// KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); +// KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), +// ("vmx_set_tmr: vcpu cannot be running")); + +// vlapic_vtx = (struct vlapic_vtx *)vlapic; +// vmx = vlapic_vtx->vmx; +// vmcs = &vmx->vmcs[vlapic->vcpuid]; +// mask = 1UL << (vector % 64); + +// VMPTRLD(vmcs); +// val = vmcs_read(VMCS_EOI_EXIT(vector)); +// if (level) +// val |= mask; +// else +// val &= ~mask; +// vmcs_write(VMCS_EOI_EXIT(vector), val); +// VMCLEAR(vmcs); +// } + +// static void +// vmx_enable_x2apic_mode(struct vlapic *vlapic) +// { +// struct vmx *vmx; +// struct vmcs *vmcs; +// uint32_t proc_ctls2; +// int vcpuid, error; + +// vcpuid = vlapic->vcpuid; +// vmx = ((struct vlapic_vtx *)vlapic)->vmx; +// vmcs = &vmx->vmcs[vcpuid]; + +// proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; +// KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, +// ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); + +// proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; +// proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; +// vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; + +// VMPTRLD(vmcs); +// vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); +// VMCLEAR(vmcs); + +// if (vlapic->vcpuid == 0) { +// /* +// * The nested page table mappings are shared by all vcpus +// * so unmap the APIC access page just once. +// */ +// error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); +// KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", +// __func__, error)); + +// /* +// * The MSR bitmap is shared by all vcpus so modify it only +// * once in the context of vcpu 0. 
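+// *
+// * (vmx_allow_x2apic_msrs() would open up direct guest access to
+// * the x2APIC MSR range 0x800-0x8ff instead of trapping it.)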
+// */ +// error = vmx_allow_x2apic_msrs(vmx); +// KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", +// __func__, error)); +// } +// } + +static struct vlapic * +vmx_vlapic_init(void *arg, int vcpuid) +{ + struct vmx *vmx; + struct vlapic *vlapic; + struct vlapic_vtx *vlapic_vtx; + + vmx = arg; + + vlapic = malloc(sizeof(struct vlapic_vtx)); + assert(vlapic); + bzero(vlapic, sizeof(struct vlapic)); + vlapic->vm = vmx->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vlapic_vtx->vmx = vmx; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +vmx_vlapic_cleanup(UNUSED void *arg, struct vlapic *vlapic) +{ + vlapic_cleanup(vlapic); + free(vlapic); +} + +static void +vmx_vcpu_interrupt(int vcpu) { + hv_vcpuid_t hvvcpu; + + hvvcpu = (hv_vcpuid_t) vcpu; + + hv_vcpu_interrupt(&hvvcpu, 1); +} + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_vm_init, + vmx_vcpu_init, + vmx_run, + vmx_vm_cleanup, + vmx_vcpu_cleanup, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_getcap, + vmx_setcap, + vmx_vlapic_init, + vmx_vlapic_cleanup, + vmx_vcpu_interrupt +}; diff --git a/vendor/github.com/hooklift/xhyve/vmx_msr.c b/vendor/github.com/hooklift/xhyve/vmx_msr.c new file mode 100644 index 0000000..9612bee --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vmx_msr.c @@ -0,0 +1,353 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static bool +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +int vmx_set_ctlreg(hv_vmx_capability_t cap_field, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t cap; + bool one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) { + return EINVAL; + } + + if (hv_vmx_read_capability(cap_field, &cap)) { + return EINVAL; + } + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(cap, i); + zero_allowed = vmx_ctl_allows_zero_setting(cap, i); + + if (zero_allowed && !one_allowed) { + /* must be zero */ + if (ones_mask & (1 << i)) { + fprintf(stderr, + "vmx_set_ctlreg: cap_field: %d bit: %d must be zero\n", + cap_field, i); + return (EINVAL); + } + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { + /* must be one */ + if (zeros_mask & (1 << i)) { + fprintf(stderr, + "vmx_set_ctlreg: cap_field: %d bit: %d must be one\n", + cap_field, i); + return (EINVAL); + } + *retval |= 1 << i; + } else { + /* don't care */ + if (zeros_mask & (1 << i)){ + *retval &= ~(1 << i); + } else if (ones_mask & (1 << i)) { + *retval |= 1 << i; + } else { + /* XXX: don't allow unspecified don't cares */ + fprintf(stderr, + "vmx_set_ctlreg: cap_field: %d bit: %d unspecified " + "don't care\n", cap_field, i); + return (EINVAL); + } + } + } + + return (0); +} + +static uint64_t misc_enable; +static uint64_t platform_info; +static uint64_t turbo_ratio_limit; + +static bool +pat_valid(uint64_t val) +{ + int i, pa; + + /* + * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" + * + * Extract PA0 through PA7 and validate that each one encodes a + * valid memory type. + */ + for (i = 0; i < 8; i++) { + pa = (val >> (i * 8)) & 0xff; + if (pa == 2 || pa == 3 || pa >= 8) + return (false); + } + return (true); +} + +void +vmx_msr_init(void) { + uint64_t bus_freq, tsc_freq, ratio; + size_t length; + int i; + + length = sizeof(uint64_t); + + if (sysctlbyname("machdep.tsc.frequency", &tsc_freq, &length, NULL, 0)) { + xhyve_abort("machdep.tsc.frequency\n"); + } + + if (sysctlbyname("hw.busfrequency", &bus_freq, &length, NULL, 0)) { + xhyve_abort("hw.busfrequency\n"); + } + + /* Initialize emulated MSRs */ + /* FIXME */ + misc_enable = 1; + /* + * Set mandatory bits + * 11: branch trace disabled + * 12: PEBS unavailable + * Clear unsupported features + * 16: SpeedStep enable + * 18: enable MONITOR FSM + */ + misc_enable |= (1u << 12) | (1u << 11); + misc_enable &= ~((1u << 18) | (1u << 16)); + + /* + * XXXtime + * The ratio should really be based on the virtual TSC frequency as + * opposed to the host TSC. + */ + ratio = (tsc_freq / bus_freq) & 0xff; + + /* + * The register definition is based on the micro-architecture + * but the following bits are always the same: + * [15:8] Maximum Non-Turbo Ratio + * [28] Programmable Ratio Limit for Turbo Mode + * [29] Programmable TDC-TDP Limit for Turbo Mode + * [47:40] Maximum Efficiency Ratio + * + * The other bits can be safely set to 0 on all + * micro-architectures up to Haswell. 
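+ *
+ * For example, a derived ratio of 26 (0x1a) yields
+ * platform_info = 0x00001a0000001a00, i.e. the same value in the
+ * Maximum Non-Turbo Ratio and Maximum Efficiency Ratio fields.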
+ */ + platform_info = (ratio << 8) | (ratio << 40); + + /* + * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is + * dependent on the maximum cores per package supported by the micro- + * architecture. For e.g., Westmere supports 6 cores per package and + * uses the low 48 bits. Sandybridge support 8 cores per package and + * uses up all 64 bits. + * + * However, the unused bits are reserved so we pretend that all bits + * in this MSR are valid. + */ + for (i = 0; i < 8; i++) { + turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; + } +} + +void +vmx_msr_guest_init(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs; + + guest_msrs = vmx->guest_msrs[vcpuid]; + + + hv_vcpu_enable_native_msr(((hv_vcpuid_t) vcpuid), MSR_LSTAR, 1); + hv_vcpu_enable_native_msr(((hv_vcpuid_t) vcpuid), MSR_CSTAR, 1); + hv_vcpu_enable_native_msr(((hv_vcpuid_t) vcpuid), MSR_STAR, 1); + hv_vcpu_enable_native_msr(((hv_vcpuid_t) vcpuid), MSR_SF_MASK, 1); + hv_vcpu_enable_native_msr(((hv_vcpuid_t) vcpuid), MSR_KGSBASE, 1); + + /* + * Initialize guest IA32_PAT MSR with default value after reset. + */ + guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + return; +} + +int +vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val) +{ + const uint64_t *guest_msrs; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; + + switch (num) { + case MSR_EFER: + *val = vmcs_read(vcpuid, VMCS_GUEST_IA32_EFER); + break; + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *val = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase: + case MSR_MTRR4kBase + 1: + case MSR_MTRR4kBase + 2: + case MSR_MTRR4kBase + 3: + case MSR_MTRR4kBase + 4: + case MSR_MTRR4kBase + 5: + case MSR_MTRR4kBase + 6: + case MSR_MTRR4kBase + 7: + case MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase: + case MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + *val = 0; + break; + case MSR_IA32_MISC_ENABLE: + *val = misc_enable; + break; + case MSR_PLATFORM_INFO: + *val = platform_info; + break; + case MSR_TURBO_RATIO_LIMIT: + case MSR_TURBO_RATIO_LIMIT1: + *val = turbo_ratio_limit; + break; + case MSR_PAT: + *val = guest_msrs[IDX_MSR_PAT]; + break; + default: + error = EINVAL; + break; + } + return (error); +} + +int +vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val) +{ + uint64_t *guest_msrs; + uint64_t changed; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; + + switch (num) { + case MSR_EFER: + vmcs_write(vcpuid, VMCS_GUEST_IA32_EFER, val); + break; + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + vm_inject_gp(vmx->vm, vcpuid); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase: + case MSR_MTRR4kBase + 1: + case MSR_MTRR4kBase + 2: + case MSR_MTRR4kBase + 3: + case MSR_MTRR4kBase + 4: + case MSR_MTRR4kBase + 5: + case MSR_MTRR4kBase + 6: + case MSR_MTRR4kBase + 7: + case MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase: + case MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + break; /* Ignore writes */ + case MSR_IA32_MISC_ENABLE: + changed = val ^ misc_enable; + /* + * If the host has disabled the NX feature then the guest + * also cannot use it. However, a Linux guest will try to + * enable the NX feature by writing to the MISC_ENABLE MSR. 
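+ * (The write toggles MISC_ENABLE bit 34, the "XD Bit Disable"
+ * control, which is why that bit is masked out of 'changed' below.)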
+ * + * This can be safely ignored because the memory management + * code looks at CPUID.80000001H:EDX.NX to check if the + * functionality is actually enabled. + */ + changed &= ~(1UL << 34); + + /* + * Punt to userspace if any other bits are being modified. + */ + if (changed) + error = EINVAL; + + break; + case MSR_PAT: + if (pat_valid(val)) + guest_msrs[IDX_MSR_PAT] = val; + else + vm_inject_gp(vmx->vm, vcpuid); + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/vendor/github.com/hooklift/xhyve/vpmtmr.c b/vendor/github.com/hooklift/xhyve/vpmtmr.c new file mode 100644 index 0000000..4f237e4 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vpmtmr.c @@ -0,0 +1,102 @@ +/*- + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct vpmtmr { + sbintime_t freq_sbt; + sbintime_t baseuptime; + uint32_t baseval; +}; +#pragma clang diagnostic pop + +struct vpmtmr * +vpmtmr_init(UNUSED struct vm *vm) +{ + struct vpmtmr *vpmtmr; + struct bintime bt; + + vpmtmr = malloc(sizeof(struct vpmtmr)); + assert(vpmtmr); + bzero(vpmtmr, sizeof(struct vpmtmr)); + vpmtmr->baseuptime = sbinuptime(); + vpmtmr->baseval = 0; + + FREQ2BT(PMTMR_FREQ, &bt); + vpmtmr->freq_sbt = bttosbt(bt); + + return (vpmtmr); +} + +void +vpmtmr_cleanup(struct vpmtmr *vpmtmr) +{ + + free(vpmtmr); +} + +int +vpmtmr_handler(struct vm *vm, UNUSED int vcpuid, bool in, UNUSED int port, + int bytes, uint32_t *val) +{ + struct vpmtmr *vpmtmr; + sbintime_t now, delta; + + if (!in || bytes != 4) + return (-1); + + vpmtmr = vm_pmtmr(vm); + + /* + * No locking needed because 'baseuptime' and 'baseval' are + * written only during initialization. 
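+ *
+ * The value returned below is baseval plus the number of
+ * 3.579545 MHz ticks elapsed since initialization; the cast to
+ * uint32_t gives the free-running 32-bit rollover behavior.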
+ */ + now = sbinuptime(); + delta = now - vpmtmr->baseuptime; + KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " + "%#llx to %#llx", vpmtmr->baseuptime, now)); + *val = (uint32_t) (vpmtmr->baseval + (delta / vpmtmr->freq_sbt)); + + return (0); +} diff --git a/vendor/github.com/hooklift/xhyve/vrtc.c b/vendor/github.com/hooklift/xhyve/vrtc.c new file mode 100644 index 0000000..864fe22 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/vrtc.c @@ -0,0 +1,1124 @@ +/*- + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const u_char bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; + +/* Register layout of the RTC */ +struct rtcdev { + uint8_t sec; + uint8_t alarm_sec; + uint8_t min; + uint8_t alarm_min; + uint8_t hour; + uint8_t alarm_hour; + uint8_t day_of_week; + uint8_t day_of_month; + uint8_t month; + uint8_t year; + uint8_t reg_a; + uint8_t reg_b; + uint8_t reg_c; + uint8_t reg_d; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; +} __packed; +CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +struct vrtc { + struct vm *vm; + pthread_mutex_t mtx; + struct callout callout; + u_int addr; /* RTC register to read or write */ + sbintime_t base_uptime; + time_t base_rtctime; + struct rtcdev rtcdev; +}; + +struct clocktime { + int year; /* year (4 digit year) */ + int mon; /* month (1 - 12) */ + int day; /* day (1 - 31) */ + int hour; /* hour (0 - 23) */ + int min; /* minute (0 - 59) */ + int sec; /* second (0 - 59) */ + int dow; /* day of week (0 - 6; 0 = Sunday) */ + long nsec; /* nano seconds */ +}; +#pragma clang diagnostic pop + +#define VRTC_LOCK(vrtc) pthread_mutex_lock(&((vrtc)->mtx)) +#define VRTC_UNLOCK(vrtc) pthread_mutex_unlock(&((vrtc)->mtx)) +/* + * RTC time is considered "broken" if: + * - RTC updates are halted by the guest + * - RTC date/time fields have invalid values + */ +#define VRTC_BROKEN_TIME ((time_t)-1) + +#define RTC_IRQ 8 +#define RTCSB_BIN 0x04 +#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) +#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) +#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) +#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) +#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) + +static void vrtc_callout_handler(void *arg); +static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); + +static int rtc_flag_broken_time = 1; +static clock_serv_t mach_clock; + +#define POSIX_BASE_YEAR 1970 +#define FEBRUARY 2 +#define SECDAY (24 * 60 * 60) +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + +static __inline int +leapyear(int year) +{ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } + } + return (rv); +} + +static int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) +{ + int i, year, days; + + year = ct->year; + + /* Sanity checks. 
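+ * The year > 2037 clause rejects dates that cannot be represented
+ * in a 32-bit time_t, which overflows in January 2038.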
*/ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + (year > 2037 && sizeof(time_t) == 4)) { /* time_t overflow */ + return (EINVAL); + } + + /* + * Compute days since start of time + * First from years, then from months. + */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + ts->tv_sec = (((time_t)days * 24 + ct->hour) * 60 + ct->min) * 60 + + ct->sec; + ts->tv_nsec = ct->nsec; + + return (0); +} + +static void +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = (int) (secs / SECDAY); + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in i. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. */ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. */ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = (int) (rsec / 3600); + rsec = rsec % 3600; + ct->min = (int) (rsec / 60); + rsec = rsec % 60; + ct->sec = (int) rsec; + ct->nsec = ts->tv_nsec; +} + +static __inline bool +divider_enabled(int reg_a) +{ + /* + * The RTC is counting only when dividers are not held in reset. + */ + return ((reg_a & 0x70) == 0x20); +} + +static __inline bool +update_enabled(struct vrtc *vrtc) +{ + /* + * RTC date/time can be updated only if: + * - divider is not held in reset + * - guest has not disabled updates + * - the date/time fields have valid contents + */ + if (!divider_enabled(vrtc->rtcdev.reg_a)) + return (false); + + if (rtc_halted(vrtc)) + return (false); + + if (vrtc->base_rtctime == VRTC_BROKEN_TIME) + return (false); + + return (true); +} + +static time_t +vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) +{ + sbintime_t now, delta; + time_t t, secs; + + t = vrtc->base_rtctime; + *basetime = vrtc->base_uptime; + if (update_enabled(vrtc)) { + now = sbinuptime(); + delta = now - vrtc->base_uptime; + KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " + "%#llx to %#llx", vrtc->base_uptime, now)); + secs = delta / SBT_1S; + t += secs; + *basetime += secs * SBT_1S; + } + return (t); +} + +static __inline uint8_t +rtcset(struct rtcdev *rtc, int val) +{ + + KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", + __func__, val)); + + return ((uint8_t) ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val])); +} + +static void +secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) +{ + mach_timespec_t mts; + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int hour; + + if (rtctime < 0) { + KASSERT(rtctime == VRTC_BROKEN_TIME, + ("%s: invalid vrtc time %#lx", __func__, rtctime)); + return; + } + + /* + * If the RTC is halted then the guest has "ownership" of the + * date/time fields. Don't update the RTC date/time fields in + * this case (unless forced). 
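`clock_ct_to_ts()` above accumulates days year by year and month by month from the 1970 epoch before scaling to seconds. A reduced sketch of the same accumulation (hours, minutes and seconds omitted) that can be checked against the well-known epoch value for 2000-01-01:

```c
#include <assert.h>
#include <time.h>

#define POSIX_BASE_YEAR 1970
#define SECDAY (24 * 60 * 60)

static const int month_days[12] = {
	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
};

static int
leapyear(int y)
{
	return (y % 4 == 0 && (y % 100 != 0 || y % 400 == 0));
}

/* days-then-seconds accumulation, as in clock_ct_to_ts() above */
static time_t
ymd_to_secs(int year, int mon, int day)
{
	int days = 0;

	for (int y = POSIX_BASE_YEAR; y < year; y++)
		days += leapyear(y) ? 366 : 365;
	for (int m = 1; m < mon; m++)
		days += month_days[m - 1] + (m == 2 ? leapyear(year) : 0);
	days += day - 1;
	return ((time_t) days * SECDAY);
}

int
main(void)
{
	assert(ymd_to_secs(2000, 1, 1) == 946684800); /* known epoch value */
	return (0);
}
```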
+ */ + if (rtc_halted(vrtc) && !force_update) + return; + + clock_get_time(mach_clock, &mts); + ts.tv_sec = mts.tv_sec; + ts.tv_nsec = mts.tv_nsec; + + clock_ts_to_ct(&ts, &ct); + + KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", + ct.sec)); + KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", + ct.min)); + KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", + ct.hour)); + KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", + ct.dow)); + KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", + ct.day)); + KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", + ct.mon)); + KASSERT(ct.year >= 1900, ("invalid clocktime year %d", ct.year)); + + rtc = &vrtc->rtcdev; + rtc->sec = rtcset(rtc, ct.sec); + rtc->min = rtcset(rtc, ct.min); + + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. + */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } + + rtc->hour = rtcset(rtc, hour); + + if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) + rtc->hour |= 0x80; /* set MSB to indicate PM */ + + rtc->day_of_week = rtcset(rtc, ct.dow + 1); + rtc->day_of_month = rtcset(rtc, ct.day); + rtc->month = rtcset(rtc, ct.mon); + rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); +} + +static int +rtcget(struct rtcdev *rtc, int val, int *retval) +{ + uint8_t upper, lower; + + if (rtc->reg_b & RTCSB_BIN) { + *retval = val; + return (0); + } + + lower = val & 0xf; + upper = (val >> 4) & 0xf; + + if (lower > 9 || upper > 9) + return (-1); + + *retval = upper * 10 + lower; + return (0); +} + +static time_t +rtc_to_secs(struct vrtc *vrtc) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + struct vm *vm; + int century, error, hour, pm, year; + + vm = vrtc->vm; + rtc = &vrtc->rtcdev; + + bzero(&ct, sizeof(struct clocktime)); + + error = rtcget(rtc, rtc->sec, &ct.sec); + if (error || ct.sec < 0 || ct.sec > 59) { + VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); + goto fail; + } + + error = rtcget(rtc, rtc->min, &ct.min); + if (error || ct.min < 0 || ct.min > 59) { + VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); + goto fail; + } + + pm = 0; + hour = rtc->hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (hour & 0x80) { + hour &= ~0x80; + pm = 1; + } + } + error = rtcget(rtc, hour, &ct.hour); + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { + VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", + rtc->hour, ct.hour); + goto fail; + } + } + + if (error || ct.hour < 0 || ct.hour > 23) { + VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); + goto fail; + } + + /* + * Ignore 'rtc->dow' because some guests like Linux don't bother + * setting it at all while others like OpenBSD/i386 set it incorrectly. + * + * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. 
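The hour handling in `secs_to_rtc()` above maps the internal 24-hour value onto the RTC's 12-hour register format, where both 0 and 12 read as 12 and bit 7 flags PM. A sketch of that mapping, assuming binary mode (in BCD mode the value would additionally pass through `rtcset()` before the PM bit is set):

```c
#include <assert.h>
#include <stdint.h>

/* 24-hour value -> 12-hour RTC register with PM in bit 7 */
static uint8_t
hour24_to_rtc12(int hour)
{
	uint8_t h = (hour % 12 == 0) ? 12 : (uint8_t) (hour % 12);

	if (hour >= 12)
		h |= 0x80; /* PM */
	return (h);
}

int
main(void)
{
	assert(hour24_to_rtc12(0) == 12);           /* 12 AM */
	assert(hour24_to_rtc12(13) == (0x80 | 1));  /* 1 PM */
	assert(hour24_to_rtc12(12) == (0x80 | 12)); /* 12 PM */
	return (0);
}
```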
+ */ + ct.dow = -1; + + error = rtcget(rtc, rtc->day_of_month, &ct.day); + if (error || ct.day < 1 || ct.day > 31) { + VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, + ct.day); + goto fail; + } + + error = rtcget(rtc, rtc->month, &ct.mon); + if (error || ct.mon < 1 || ct.mon > 12) { + VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); + goto fail; + } + + error = rtcget(rtc, rtc->year, &year); + if (error || year < 0 || year > 99) { + VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); + goto fail; + } + + error = rtcget(rtc, rtc->century, ¢ury); + ct.year = century * 100 + year; + if (error || ct.year < 1900) { + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); + goto fail; + } + + error = clock_ct_to_ts(&ct, &ts); + if (error || ts.tv_sec < 0) { + VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", + ct.year, ct.mon, ct.day); + VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", + ct.hour, ct.min, ct.sec); + goto fail; + } + return (ts.tv_sec); /* success */ +fail: + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); + return (VRTC_BROKEN_TIME); +} + +static int +vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) +{ + struct rtcdev *rtc; + sbintime_t oldbase; + time_t oldtime; + uint8_t alarm_sec, alarm_min, alarm_hour; + + rtc = &vrtc->rtcdev; + alarm_sec = rtc->alarm_sec; + alarm_min = rtc->alarm_min; + alarm_hour = rtc->alarm_hour; + + oldtime = vrtc->base_rtctime; + VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", + oldtime, newtime); + + oldbase = vrtc->base_uptime; + VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#llx to %#llx", + oldbase, newbase); + vrtc->base_uptime = newbase; + + if (newtime == oldtime) + return (0); + + /* + * If 'newtime' indicates that RTC updates are disabled then just + * record that and return. There is no need to do alarm interrupt + * processing in this case. + */ + if (newtime == VRTC_BROKEN_TIME) { + vrtc->base_rtctime = VRTC_BROKEN_TIME; + return (0); + } + + /* + * Return an error if RTC updates are halted by the guest. + */ + if (rtc_halted(vrtc)) { + VM_CTR0(vrtc->vm, "RTC update halted by guest"); + return (EBUSY); + } + + do { + /* + * If the alarm interrupt is enabled and 'oldtime' is valid + * then visit all the seconds between 'oldtime' and 'newtime' + * to check for the alarm condition. + * + * Otherwise move the RTC time forward directly to 'newtime'. + */ + if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) + vrtc->base_rtctime++; + else + vrtc->base_rtctime = newtime; + + if (aintr_enabled(vrtc)) { + /* + * Update the RTC date/time fields before checking + * if the alarm conditions are satisfied. 
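`rtc_to_secs()` above performs the inverse mapping when the guest programs the clock: strip the PM flag, validate the 1-12 range, and fold 12 AM/PM back into 0-23. A sketch of just that hour decode, again assuming binary (non-BCD) mode:

```c
#include <assert.h>

/* 12-hour RTC register -> 0-23, or -1 if out of range */
static int
rtc12_to_hour24(int reg)
{
	int pm = (reg & 0x80) != 0;
	int hour = reg & 0x7f;

	if (hour < 1 || hour > 12)
		return (-1);  /* invalid, like the goto fail path above */
	if (hour == 12)
		hour = 0;     /* 12 AM/PM wraps to 0 before the PM shift */
	return (pm ? hour + 12 : hour);
}

int
main(void)
{
	assert(rtc12_to_hour24(12) == 0);          /* 12 AM */
	assert(rtc12_to_hour24(0x80 | 12) == 12);  /* 12 PM */
	assert(rtc12_to_hour24(0x80 | 11) == 23);  /* 11 PM */
	assert(rtc12_to_hour24(0) == -1);          /* rejected */
	return (0);
}
```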
+ */ + secs_to_rtc(vrtc->base_rtctime, vrtc, 0); + + if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && + (alarm_min >= 0xC0 || alarm_min == rtc->min) && + (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); + } + } + } while (vrtc->base_rtctime != newtime); + + if (uintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); + + return (0); +} + +static sbintime_t +vrtc_freq(struct vrtc *vrtc) +{ + int ratesel; + + static sbintime_t pf[16] = { + 0, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 8192, + SBT_1S / 4096, + SBT_1S / 2048, + SBT_1S / 1024, + SBT_1S / 512, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 64, + SBT_1S / 32, + SBT_1S / 16, + SBT_1S / 8, + SBT_1S / 4, + SBT_1S / 2, + }; + + /* + * If both periodic and alarm interrupts are enabled then use the + * periodic frequency to drive the callout. The minimum periodic + * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so + * piggyback the alarm on top of it. The same argument applies to + * the update interrupt. + */ + if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { + ratesel = vrtc->rtcdev.reg_a & 0xf; + return (pf[ratesel]); + } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else { + return (0); + } +} + +static void +vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) +{ + if (freqsbt == 0) { + if (callout_active(&vrtc->callout)) { + VM_CTR0(vrtc->vm, "RTC callout stopped"); + callout_stop(&vrtc->callout); + } + return; + } + VM_CTR1(vrtc->vm, "RTC callout frequency %lld hz", SBT_1S / freqsbt); + callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, + vrtc, 0); +} + +static void +vrtc_callout_handler(void *arg) +{ + struct vrtc *vrtc = arg; + sbintime_t freqsbt, basetime; + time_t rtctime; + int error; + + VM_CTR0(vrtc->vm, "vrtc callout fired"); + + VRTC_LOCK(vrtc); + if (callout_pending(&vrtc->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vrtc->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vrtc->callout); + + KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, + ("gratuitous vrtc callout")); + + if (pintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); + + if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("%s: vrtc_time_update error %d", + __func__, error)); + } + + freqsbt = vrtc_freq(vrtc); + KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); + vrtc_callout_reset(vrtc, freqsbt); +done: + VRTC_UNLOCK(vrtc); +} + +static __inline void +vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) +{ + int active; + + active = callout_active(&vrtc->callout) ? 1 : 0; + KASSERT((freq == 0 && !active) || (freq != 0 && active), + ("vrtc callout %s with frequency %#llx", + active ? 
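The `pf[]` table in `vrtc_freq()` above encodes the MC146818 periodic rates: rate-select values 3-15 give 65536 >> rate Hz, while 1 and 2 alias 256 Hz and 128 Hz. A compact check of that relationship:

```c
#include <assert.h>

/* periodic interrupt frequency for a reg_a rate select value, in Hz */
static int
rate_to_hz(int ratesel)
{
	if (ratesel == 0)
		return (0);                    /* periodic interrupt disabled */
	if (ratesel < 3)
		return (256 >> (ratesel - 1)); /* 1 and 2 alias 256/128 Hz */
	return (65536 >> ratesel);
}

int
main(void)
{
	assert(rate_to_hz(6) == 1024);  /* pf[6] == SBT_1S / 1024 */
	assert(rate_to_hz(15) == 2);    /* slowest rate, pf[15] */
	assert(rate_to_hz(1) == 256);   /* alias of rate 8 */
	return (0);
}
```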
"active" : "inactive", freq)); +} + +static void +vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + int oldirqf, newirqf; + uint8_t oldval, changed; + + rtc = &vrtc->rtcdev; + newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; + + oldirqf = rtc->reg_c & RTCIR_INT; + if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || + (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || + (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { + newirqf = RTCIR_INT; + } else { + newirqf = 0; + } + + oldval = rtc->reg_c; + rtc->reg_c = (uint8_t) (newirqf | newval); + changed = oldval ^ rtc->reg_c; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", + oldval, rtc->reg_c); + } + + if (!oldirqf && newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); + vatpic_pulse_irq(vrtc->vm, RTC_IRQ); + vioapic_pulse_irq(vrtc->vm, RTC_IRQ); + } else if (oldirqf && !newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); + } +} + +static int +vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + sbintime_t oldfreq, newfreq, basetime; + time_t curtime, rtctime; + int error; + uint8_t oldval, changed; + + rtc = &vrtc->rtcdev; + oldval = rtc->reg_b; + oldfreq = vrtc_freq(vrtc); + + rtc->reg_b = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", + oldval, newval); + } + + if (changed & RTCSB_HALT) { + if ((newval & RTCSB_HALT) == 0) { + rtctime = rtc_to_secs(vrtc); + basetime = sbinuptime(); + if (rtctime == VRTC_BROKEN_TIME) { + if (rtc_flag_broken_time) + return (-1); + } + } else { + curtime = vrtc_curtime(vrtc, &basetime); + KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " + "between vrtc basetime (%#lx) and curtime (%#lx)", + __func__, vrtc->base_rtctime, curtime)); + + /* + * Force a refresh of the RTC date/time fields so + * they reflect the time right before the guest set + * the HALT bit. + */ + secs_to_rtc(curtime, vrtc, 1); + + /* + * Updates are halted so mark 'base_rtctime' to denote + * that the RTC date/time is in flux. + */ + rtctime = VRTC_BROKEN_TIME; + rtc->reg_b &= ~RTCSB_UINTR; + } + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("vrtc_time_update error %d", error)); + } + + /* + * Side effect of changes to the interrupt enable bits. + */ + if (changed & RTCSB_ALL_INTRS) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); + + /* + * Change the callout frequency if it has changed. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); + + /* + * The side effect of bits that control the RTC date/time format + * is handled lazily when those fields are actually read. + */ + return (0); +} + +static void +vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) +{ + sbintime_t oldfreq, newfreq; + uint8_t oldval, changed; + + newval &= ~RTCSA_TUP; + oldval = vrtc->rtcdev.reg_a; + oldfreq = vrtc_freq(vrtc); + + if (divider_enabled(oldval) && !divider_enabled(newval)) { + VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#llx", + vrtc->base_rtctime, vrtc->base_uptime); + } else if (!divider_enabled(oldval) && divider_enabled(newval)) { + /* + * If the dividers are coming out of reset then update + * 'base_uptime' before this happens. This is done to + * maintain the illusion that the RTC date/time was frozen + * while the dividers were disabled. 
+ */ + vrtc->base_uptime = sbinuptime(); + VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#llx", + vrtc->base_rtctime, vrtc->base_uptime); + } else { + /* NOTHING */ + } + + vrtc->rtcdev.reg_a = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", + oldval, newval); + } + + /* + * Side effect of changes to rate select and divider enable bits. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); +} + +int +vrtc_set_time(struct vm *vm, time_t secs) +{ + struct vrtc *vrtc; + int error; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + error = vrtc_time_update(vrtc, secs, sbinuptime()); + VRTC_UNLOCK(vrtc); + + if (error) { + VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, + secs); + } else { + VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); + } + + return (error); +} + +time_t +vrtc_get_time(struct vm *vm) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t t; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + t = vrtc_curtime(vrtc, &basetime); + VRTC_UNLOCK(vrtc); + + return (t); +} + +int +vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) +{ + struct vrtc *vrtc; + uint8_t *ptr; + + vrtc = vm_rtc(vm); + + /* + * Don't allow writes to RTC control registers or the date/time fields. + */ + if (((unsigned long) offset) < offsetof(struct rtcdev, nvram) || + offset == RTC_CENTURY || + ((unsigned long) offset) >= sizeof(struct rtcdev)) + { + VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", + offset); + return (EINVAL); + } + + VRTC_LOCK(vrtc); + ptr = (uint8_t *)(&vrtc->rtcdev); + ptr[offset] = value; + VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t curtime; + uint8_t *ptr; + + /* + * Allow all offsets in the RTC to be read. + */ + if (offset < 0 || ((unsigned long) offset) >= sizeof(struct rtcdev)) + return (EINVAL); + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + + /* + * Update RTC date/time fields if necessary. + */ + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, &basetime); + secs_to_rtc(curtime, vrtc, 0); + } + + ptr = (uint8_t *)(&vrtc->rtcdev); + *retval = ptr[offset]; + + VRTC_UNLOCK(vrtc); + return (0); +} + +int +vrtc_addr_handler(struct vm *vm, UNUSED int vcpuid, bool in, UNUSED int port, + int bytes, uint32_t *val) +{ + struct vrtc *vrtc; + + vrtc = vm_rtc(vm); + + if (bytes != 1) + return (-1); + + if (in) { + *val = 0xff; + return (0); + } + + VRTC_LOCK(vrtc); + vrtc->addr = *val & 0x7f; + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_data_handler(struct vm *vm, int vcpuid, bool in, UNUSED int port, + int bytes, uint32_t *val) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + sbintime_t basetime; + time_t curtime; + int error, offset; + + vrtc = vm_rtc(vm); + rtc = &vrtc->rtcdev; + + if (bytes != 1) + return (-1); + + VRTC_LOCK(vrtc); + offset = (int) vrtc->addr; + if (((unsigned long) offset) >= sizeof(struct rtcdev)) { + VRTC_UNLOCK(vrtc); + return (-1); + } + + error = 0; + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); + + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. 
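`vrtc_nvram_write()` above admits only true NVRAM bytes: the ten date/time fields and four control registers at the front of the 128-byte map are rejected, as is the century byte at `RTC_CENTURY`. A predicate capturing that rule (the numeric offsets are read off the `struct rtcdev` layout defined earlier):

```c
#include <assert.h>
#include <stdbool.h>

#define RTC_SIZE 128
#define NVRAM_START 14   /* offsetof(struct rtcdev, nvram): 10 time + 4 regs */
#define RTC_CENTURY 0x32 /* century byte, offset 50 */

/* same acceptance test as vrtc_nvram_write() above */
static bool
nvram_writable(int off)
{
	return (off >= NVRAM_START && off != RTC_CENTURY && off < RTC_SIZE);
}

int
main(void)
{
	assert(!nvram_writable(0));           /* seconds register */
	assert(!nvram_writable(11));          /* reg_b */
	assert(nvram_writable(NVRAM_START));  /* first NVRAM byte */
	assert(!nvram_writable(RTC_CENTURY)); /* guarded century byte */
	assert(!nvram_writable(RTC_SIZE));    /* out of range */
	return (0);
}
```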
+ */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + + if (in) { + if (offset == 12) { + /* + * XXX + * reg_c interrupt flags are updated only if the + * corresponding interrupt enable bit in reg_b is set. + */ + *val = vrtc->rtcdev.reg_c; + vrtc_set_reg_c(vrtc, 0); + } else { + *val = *((uint8_t *)rtc + offset); + } + VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", + *val, offset); + } else { + switch (offset) { + case 10: + VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); + vrtc_set_reg_a(vrtc, ((uint8_t) *val)); + break; + case 11: + VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); + error = vrtc_set_reg_b(vrtc, ((uint8_t) *val)); + break; + case 12: + VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", + *val); + break; + case 13: + VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", + *val); + break; + case 0: + /* + * High order bit of 'seconds' is readonly. + */ + *val &= 0x7f; + /* FALLTHRU */ + default: + VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", + offset, *val); + *((uint8_t *)rtc + offset) = ((uint8_t) *val); + break; + } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime, sbinuptime()); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } + } + VRTC_UNLOCK(vrtc); + return (error); +} + +void +vrtc_reset(struct vrtc *vrtc) +{ + struct rtcdev *rtc; + + VRTC_LOCK(vrtc); + + rtc = &vrtc->rtcdev; + vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); + vrtc_set_reg_c(vrtc, 0); + KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); + + VRTC_UNLOCK(vrtc); +} + +struct vrtc * +vrtc_init(struct vm *vm) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + + vrtc = malloc(sizeof(struct vrtc)); + assert(vrtc); + bzero(vrtc, sizeof(struct vrtc)); + vrtc->vm = vm; + + pthread_mutex_init(&vrtc->mtx, NULL); + + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &mach_clock); + + callout_init(&vrtc->callout, 1); + + /* Allow dividers to keep time but disable everything else */ + rtc = &vrtc->rtcdev; + rtc->reg_a = 0x20; + rtc->reg_b = RTCSB_24HR; + rtc->reg_c = 0; + rtc->reg_d = RTCSD_PWR; + + /* Reset the index register to a safe value. */ + vrtc->addr = RTC_STATUSD; + + /* + * Initialize RTC time to 00:00:00 Jan 1, 1970. + */ + curtime = 0; + + VRTC_LOCK(vrtc); + vrtc->base_rtctime = VRTC_BROKEN_TIME; + vrtc_time_update(vrtc, curtime, sbinuptime()); + secs_to_rtc(curtime, vrtc, 0); + VRTC_UNLOCK(vrtc); + + return (vrtc); +} + +void +vrtc_cleanup(struct vrtc *vrtc) +{ + callout_drain(&vrtc->callout); + mach_port_deallocate(mach_task_self(), mach_clock); + free(vrtc); +} diff --git a/vendor/github.com/hooklift/xhyve/x86.c b/vendor/github.com/hooklift/xhyve/x86.c new file mode 100644 index 0000000..315db1e --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/x86.c @@ -0,0 +1,466 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CPUID_VM_HIGH 0x40000000 + +static const char bhyve_id[12] = "bhyve bhyve "; + +static volatile u_long bhyve_xcpuids; + +/* + * The default CPU topology is a single thread per package. + */ +static u_int threads_per_core = 1; +static u_int cores_per_package = 1; +static int cpuid_leaf_b = 1; + +/* + * Round up to the next power of two, if necessary, and then take log2. + * Returns -1 if argument is zero. + */ +static __inline int +log2(u_int x) +{ + + return (fls((int) (x << (1 - powerof2(x)))) - 1); +} + +int +x86_emulate_cpuid(struct vm *vm, int vcpu_id, + uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + const struct xsave_limits *limits; + uint64_t cr4; + int error, level, width, x2apic_id; + unsigned int func, regs[4], logical_cpus; + u_int cpu_feature, amd_feature, amd_feature2, cpu_high, cpu_exthigh; + u_int tsc_is_invariant, smp_tsc; + enum x2apic_state x2apic_state; + + VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx); + + tsc_is_invariant = 1; + smp_tsc = 1; + do_cpuid(0, regs); + cpu_high = regs[0]; + do_cpuid(1, regs); + cpu_feature = regs[3]; + do_cpuid(0x80000000, regs); + cpu_exthigh = regs[0]; + do_cpuid(0x80000001, regs); + amd_feature = regs[3] & ~(cpu_feature & 0x0183f3ff); + amd_feature2 = regs[2]; + + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. + */ + if (cpu_exthigh != 0 && *eax >= 0x80000000) { + if (*eax > cpu_exthigh) + *eax = cpu_exthigh; + } else if (*eax >= 0x40000000) { + if (*eax > CPUID_VM_HIGH) + *eax = CPUID_VM_HIGH; + } else if (*eax > cpu_high) { + *eax = cpu_high; + } + + func = *eax; + + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { + /* + * Pass these through to the guest + */ + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_8000_0000: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + cpuid_count(*eax, *ecx, regs); + break; + case CPUID_8000_0008: + cpuid_count(*eax, *ecx, regs); + break; + case CPUID_8000_0001: + cpuid_count(*eax, *ecx, regs); + + /* + * Hide SVM and Topology Extension features from guest. 
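The `log2()` helper above relies on a small trick: when `x` is not a power of two, shifting it left by one before `fls()` rounds the result up. A portable restatement (`fls_` is a stand-in for BSD's `fls()`, built here from `__builtin_clz`):

```c
#include <assert.h>

#define powerof2(x) ((((x) - 1) & (x)) == 0)

/* 1-based index of the highest set bit, like BSD fls() */
static int
fls_(int x)
{
	return (x == 0 ? 0 : 32 - __builtin_clz((unsigned) x));
}

/* round up to a power of two, then take log2, as in log2() above */
static int
log2_roundup(unsigned x)
{
	return (fls_((int) (x << (1 - powerof2(x)))) - 1);
}

int
main(void)
{
	assert(log2_roundup(1) == 0);
	assert(log2_roundup(2) == 1);
	assert(log2_roundup(3) == 2); /* 3 rounds up to 4 */
	assert(log2_roundup(6) == 3); /* 6 rounds up to 8 */
	return (0);
}
```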
+ */ + regs[2] &= ~((unsigned) (AMDID2_SVM | AMDID2_TOPOLOGY)); + + /* + * Don't advertise extended performance counter MSRs + * to the guest. + */ + regs[2] &= ~((unsigned) AMDID2_PCXC); + regs[2] &= ~((unsigned) AMDID2_PNXC); + regs[2] &= ~((unsigned) AMDID2_PTSCEL2I); + + /* + * Don't advertise Instruction Based Sampling feature. + */ + regs[2] &= ~((unsigned) AMDID2_IBS); + + /* NodeID MSR not available */ + regs[2] &= ~((unsigned) AMDID2_NODE_ID); + + /* Don't advertise the OS visible workaround feature */ + regs[2] &= ~((unsigned) AMDID2_OSVW); + + /* + * Hide rdtscp/ia32_tsc_aux until we know how + * to deal with them. + */ + regs[3] &= ~((unsigned) AMDID_RDTSCP); + break; + + case CPUID_8000_0007: + /* + * AMD uses this leaf to advertise the processor's + * power monitoring and RAS capabilities. These + * features are hardware-specific and exposing + * them to a guest doesn't make a lot of sense. + * + * Intel uses this leaf only to advertise the + * "Invariant TSC" feature with all other bits + * being reserved (set to zero). + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* + * "Invariant TSC" can be advertised to the guest if: + * - host TSC frequency is invariant + * - host TSCs are synchronized across physical cpus + * + * XXX This still falls short because the vcpu + * can observe the TSC moving backwards as it + * migrates across physical cpus. But at least + * it should discourage the guest from using the + * TSC to keep track of time. + */ + if (tsc_is_invariant && smp_tsc) + regs[3] |= AMDPM_TSC_INVARIANT; + break; + + case CPUID_0000_0001: + do_cpuid(1, regs); + + error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + if (error) { + xhyve_abort("x86_emulate_cpuid: error %d " + "fetching x2apic state\n", error); + } + + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~((unsigned) CPUID_LOCAL_APIC_ID); + regs[1] |= (((unsigned) vcpu_id) << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX, SpeedStep, TME or SMX capability. + * Advertise x2APIC capability and Hypervisor guest. + */ + regs[2] &= ~((unsigned) (CPUID2_VMX | CPUID2_EST | CPUID2_TM2)); + regs[2] &= ~((unsigned) CPUID2_SMX); + + regs[2] |= (unsigned) CPUID2_HV; + + if (x2apic_state != ((unsigned) X2APIC_DISABLED)) + regs[2] |= ((unsigned) CPUID2_X2APIC); + else + regs[2] &= ~((unsigned) CPUID2_X2APIC); + + /* + * Only advertise CPUID2_XSAVE in the guest if + * the host is using XSAVE. + */ + if (!(regs[2] & ((unsigned) CPUID2_OSXSAVE))) + regs[2] &= ~((unsigned) CPUID2_XSAVE); + + /* + * If CPUID2_XSAVE is being advertised and the + * guest has set CR4_XSAVE, set + * CPUID2_OSXSAVE. + */ + regs[2] &= ~((unsigned) CPUID2_OSXSAVE); + if (regs[2] & ((unsigned) CPUID2_XSAVE)) { + error = vm_get_register(vm, vcpu_id, VM_REG_GUEST_CR4, &cr4); + if (error) + xhyve_abort("x86_emulate_cpuid: error %d " + "fetching %%cr4\n", error); + if (cr4 & CR4_XSAVE) + regs[2] |= ((unsigned) CPUID2_OSXSAVE); + } + + /* + * Hide monitor/mwait until we know how to deal with + * these instructions. + */ + regs[2] &= ~((unsigned) CPUID2_MON); + + /* + * Hide the performance and debug features. + */ + regs[2] &= ~((unsigned) CPUID2_PDCM); + + /* + * No TSC deadline support in the APIC yet + */ + regs[2] &= ~((unsigned) CPUID2_TSCDLT); + + /* + * Hide thermal monitoring + */ + regs[3] &= ~((unsigned) (CPUID_ACPI | CPUID_TM)); + + /* + * Hide the debug store capability. + */ + regs[3] &= ~((unsigned) CPUID_DS); + + /* + * Advertise the Machine Check and MTRR capability. 
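For leaf 1 the code above rewrites only EBX[31:24], the initial APIC ID field, substituting the vCPU number while preserving the brand index and CLFLUSH fields below it. The mask-and-shift pattern in isolation (the sample EBX value is made up for the check):

```c
#include <assert.h>
#include <stdint.h>

#define CPUID_LOCAL_APIC_ID 0xff000000u
#define APICID_SHIFT 24 /* CPUID_0000_0001_APICID_SHIFT */

static uint32_t
patch_apic_id(uint32_t ebx, int vcpu_id)
{
	ebx &= ~CPUID_LOCAL_APIC_ID;
	ebx |= ((uint32_t) vcpu_id) << APICID_SHIFT;
	return (ebx);
}

int
main(void)
{
	uint32_t ebx = patch_apic_id(0x05010800u, 3);

	assert((ebx >> 24) == 3);                 /* new APIC ID */
	assert((ebx & 0x00ffffffu) == 0x010800u); /* lower fields intact */
	return (0);
}
```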
+ * + * Some guest OSes (e.g. Windows) will not boot if + * these features are absent. + */ + regs[3] |= (unsigned) (CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + logical_cpus = threads_per_core * cores_per_package; + regs[1] &= ~((unsigned) CPUID_HTT_CORES); + regs[1] |= (logical_cpus & 0xff) << 16; + regs[3] |= (unsigned) CPUID_HTT; + break; + + case CPUID_0000_0004: + cpuid_count(*eax, *ecx, regs); + + if (regs[0] || regs[1] || regs[2] || regs[3]) { + regs[0] &= 0x3ff; + regs[0] |= (cores_per_package - 1) << 26; + /* + * Cache topology: + * - L1 and L2 are shared only by the logical + * processors in a single core. + * - L3 and above are shared by all logical + * processors in the package. + */ + logical_cpus = threads_per_core; + level = (regs[0] >> 5) & 0x7; + if (level >= 3) + logical_cpus *= cores_per_package; + regs[0] |= (logical_cpus - 1) << 14; + } + break; + + case CPUID_0000_0007: + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* leaf 0 */ + if (*ecx == 0) { + cpuid_count(*eax, *ecx, regs); + + /* Only leaf 0 is supported */ + regs[0] = 0; + + /* + * Expose known-safe features. + */ + regs[1] &= (CPUID_STDEXT_FSGSBASE | + CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | + CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 | + CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | + CPUID_STDEXT_AVX512F | + CPUID_STDEXT_AVX512PF | + CPUID_STDEXT_AVX512ER | + CPUID_STDEXT_AVX512CD); + regs[2] = 0; + regs[3] = 0; + /* FIXME */ + // regs[1] |= CPUID_STDEXT_INVPCID; + } + break; + + case CPUID_0000_0006: + regs[0] = CPUTPM1_ARAT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000A: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000B: + /* + * Processor topology enumeration + */ + logical_cpus = 0; + width = 0; + level = 0; + x2apic_id = 0; + + if (*ecx == 0) { + logical_cpus = threads_per_core; + width = log2(logical_cpus); + level = CPUID_TYPE_SMT; + x2apic_id = vcpu_id; + } + + if (*ecx == 1) { + logical_cpus = threads_per_core * + cores_per_package; + width = log2(logical_cpus); + level = CPUID_TYPE_CORE; + x2apic_id = vcpu_id; + } + + if (!cpuid_leaf_b || *ecx >= 2) { + width = 0; + logical_cpus = 0; + level = 0; + x2apic_id = 0; + } + + regs[0] = width & 0x1f; + regs[1] = logical_cpus & 0xffff; + regs[2] = (((unsigned) level) << 8) | (*ecx & 0xff); + regs[3] = (unsigned) x2apic_id; + break; + + case CPUID_0000_000D: + limits = vmm_get_xsave_limits(); + if (!limits->xsave_enabled) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + } + + cpuid_count(*eax, *ecx, regs); + switch (*ecx) { + case 0: + /* + * Only permit the guest to use bits + * that are active in the host in + * %xcr0. Also, claim that the + * maximum save area size is + * equivalent to the host's current + * save area size. Since this runs + * "inside" of vmrun(), it runs with + * the guest's xcr0, so the current + * save area size is correct as-is. + */ + regs[0] &= limits->xcr0_allowed; + regs[2] = limits->xsave_max_size; + regs[3] &= (limits->xcr0_allowed >> 32); + break; + case 1: + /* Only permit XSAVEOPT. */ + regs[0] &= CPUID_EXTSTATE_XSAVEOPT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + default: + /* + * If the leaf is for a permitted feature, + * pass through as-is, otherwise return + * all zeroes. 
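The leaf 4 case above rewrites the topology fields of EAX: bits 25:14 carry the number of sharing threads minus one and bits 31:26 the cores per package minus one, with L3 and higher marked package-wide. A sketch of that rewrite under the flat 1x1 topology this file advertises (the sample descriptor value is made up):

```c
#include <assert.h>
#include <stdint.h>

/* patch CPUID.04H EAX the way the CPUID_0000_0004 case above does */
static uint32_t
patch_leaf4(uint32_t eax, unsigned threads_per_core,
    unsigned cores_per_package)
{
	unsigned level = (eax >> 5) & 0x7; /* cache level in bits 7:5 */
	unsigned sharing = threads_per_core;

	if (level >= 3) /* L3+ is shared by the whole package */
		sharing *= cores_per_package;

	eax &= 0x3ff; /* keep type/level/flag bits, drop topology bits */
	eax |= (cores_per_package - 1) << 26;
	eax |= (sharing - 1) << 14;
	return (eax);
}

int
main(void)
{
	/* a made-up L1 descriptor: with a 1x1 topology nothing changes */
	assert(patch_leaf4(0x121, 1, 1) == 0x121);
	return (0);
}
```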
+ */ + if (!(limits->xcr0_allowed & (1ul << *ecx))) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + } + break; + + case 0x40000000: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, ®s[1], 4); + bcopy(bhyve_id + 4, ®s[2], 4); + bcopy(bhyve_id + 8, ®s[3], 4); + break; + + default: + /* + * The leaf value has already been clamped so + * simply pass this through, keeping count of + * how many unhandled leaf values have been seen. + */ + atomic_add_long(&bhyve_xcpuids, 1); + cpuid_count(*eax, *ecx, regs); + break; + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + + return (1); +} diff --git a/vendor/github.com/hooklift/xhyve/xhyve.c b/vendor/github.com/hooklift/xhyve/xhyve.c new file mode 100644 index 0000000..a315eff --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/xhyve.c @@ -0,0 +1,959 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
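Leaf 0x40000000 above returns the hypervisor vendor string by copying the 12-byte `bhyve_id` into EBX, ECX and EDX four bytes at a time. The same packing with `memcpy` standing in for `bcopy`, assuming a little-endian x86 host:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main(void)
{
	const char id[12] = "bhyve bhyve "; /* exactly 12 bytes, NUL dropped */
	uint32_t ebx, ecx, edx;

	memcpy(&ebx, id, 4);
	memcpy(&ecx, id + 4, 4);
	memcpy(&edx, id + 8, 4);

	assert((ebx & 0xff) == 'b'); /* little endian: first byte lands low */
	assert((ecx & 0xff) == 'e'); /* fifth byte starts the second dword */
	assert((edx >> 24) == ' ');  /* last byte of the string lands high */
	return (0);
}
```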
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ + +#define MB (1024UL * 1024) + +typedef int (*vmexit_handler_t)(struct vm_exit *, int *vcpu); +extern int vmexit_task_switch(struct vm_exit *, int *vcpu); + +char *vmname = "vm"; +bool exit_mevent_dispatch_loop = FALSE; + +int guest_ncpus; +char *guest_uuid_str; + +static int guest_vmexit_on_hlt, guest_vmexit_on_pause; +static int virtio_msix = 1; +static int x2apic_mode = 0; /* default is xAPIC */ + +static int strictio; +static int strictmsr = 1; + +static int acpi; + +static char *progname; +static const int BSP = 0; + +static cpuset_t cpumask; + +static void vcpu_loop(int vcpu, uint64_t rip); + +static struct vm_exit vmexit[VM_MAXCPU]; + +static struct bhyvestats { + uint64_t vmexit_bogus; + uint64_t vmexit_bogus_switch; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t vmexit_inst_emul; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; +} stats; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpadded" +static struct mt_vmm_info { + pthread_t mt_thr; + int mt_vcpu; +} mt_vmm_info[VM_MAXCPU]; +#pragma clang diagnostic pop + +static uint64_t (*fw_func)(void); + +__attribute__ ((noreturn)) static void +usage(int code) +{ + + fprintf(stderr, + "Usage: %s [-behuwxACHPWY] [-c vcpus] [-g ] [-l ]\n" + " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] -f \n" + " -A: create ACPI tables\n" + " -c: # cpus (default 1)\n" + " -C: include guest memory in core file\n" + " -e: exit on unhandled I/O access\n" + " -f: firmware\n" + " -g: gdb port\n" + " -h: help\n" + " -H: vmexit from the guest on hlt\n" + " -l: LPC device configuration\n" + " -m: memory size in MB\n" + " -p: pin 'vcpu' to 'hostcpu'\n" + " -P: vmexit from the guest on pause\n" + " -s: PCI slot config\n" + " -u: RTC keeps UTC time\n" + " -U: uuid\n" + " -v: show build version\n" + " -w: ignore unimplemented MSRs\n" + " -W: force virtio to use single-vector MSI\n" + " -x: local apic is in x2APIC mode\n" + " -Y: disable MPtable generation\n", + progname, (int)strlen(progname), ""); + + exit(code); +} + +__attribute__ ((noreturn)) static void +show_version() +{ + fprintf(stderr, "%s: %s\n\n%s\n",progname, "VERSION", + "xhyve is a port of FreeBSD's bhyve hypervisor to OS X that\n" + "works entirely in userspace and has no other dependencies.\n\n" + "Homepage: https://github.com/mist64/xhyve\n" + "License: BSD\n"); + exit(0); +} + +void +xh_vm_inject_fault(int vcpu, int vector, int errcode_valid, + uint32_t errcode) +{ + int error, restart_instruction; + + restart_instruction = 1; + + error = xh_vm_inject_exception(vcpu, vector, errcode_valid, errcode, + restart_instruction); + assert(error == 0); +} + +void * +paddr_guest2host(uintptr_t gaddr, size_t len) +{ + return (xh_vm_map_gpa(gaddr, len)); +} + +int +fbsdrun_vmexit_on_pause(void) +{ + return (guest_vmexit_on_pause); +} + +int +fbsdrun_vmexit_on_hlt(void) +{ + return (guest_vmexit_on_hlt); +} + +int +fbsdrun_virtio_msix(void) +{ + return (virtio_msix); +} + +static void +spinup_ap_realmode(int newcpu, uint64_t *rip) +{ + int vector, error; + uint16_t cs; + 
uint64_t desc_base;
+	uint32_t desc_limit, desc_access;
+
+	vector = (int) (*rip >> XHYVE_PAGE_SHIFT);
+	*rip = 0;
+
+	/*
+	 * Update the %cs and %rip of the guest so that it starts
+	 * executing real mode code at 'vector << 12'.
+	 */
+	error = xh_vm_set_register(newcpu, VM_REG_GUEST_RIP, *rip);
+	assert(error == 0);
+
+	error = xh_vm_get_desc(newcpu, VM_REG_GUEST_CS, &desc_base, &desc_limit,
+	    &desc_access);
+	assert(error == 0);
+
+	desc_base = (uint64_t) (vector << XHYVE_PAGE_SHIFT);
+	error = xh_vm_set_desc(newcpu, VM_REG_GUEST_CS, desc_base, desc_limit,
+	    desc_access);
+	assert(error == 0);
+
+	cs = (uint16_t) ((vector << XHYVE_PAGE_SHIFT) >> 4);
+	error = xh_vm_set_register(newcpu, VM_REG_GUEST_CS, cs);
+	assert(error == 0);
+}
+
+static void *
+vcpu_thread(void *param)
+{
+	struct mt_vmm_info *mtp;
+	uint64_t rip_entry;
+	int vcpu;
+	int error;
+
+	mtp = param;
+	vcpu = mtp->mt_vcpu;
+	rip_entry = 0xfff0;
+
+	error = xh_vcpu_create(vcpu);
+	assert(error == 0);
+
+	vcpu_set_capabilities(vcpu);
+
+	error = xh_vcpu_reset(vcpu);
+	assert(error == 0);
+
+	if (vcpu == BSP) {
+		rip_entry = fw_func();
+	} else {
+		rip_entry = vmexit[vcpu].rip;
+		spinup_ap_realmode(vcpu, &rip_entry);
+	}
+
+	vmexit[vcpu].rip = rip_entry;
+	vmexit[vcpu].inst_length = 0;
+
+	vcpu_loop(vcpu, vmexit[vcpu].rip);
+
+	return (NULL);
+}
+
+void
+vcpu_add(int fromcpu, int newcpu, uint64_t rip)
+{
+	int error;
+
+	assert(fromcpu == BSP);
+
+	/*
+	 * The 'newcpu' must be activated in the context of 'fromcpu'. If
+	 * vm_activate_cpu() is delayed until newcpu's pthread starts running
+	 * then vmm.ko is out-of-sync with bhyve and this can create a race
+	 * with vm_suspend().
+	 */
+	error = xh_vm_activate_cpu(newcpu);
+	assert(error == 0);
+
+	CPU_SET_ATOMIC(((unsigned) newcpu), &cpumask);
+
+	mt_vmm_info[newcpu].mt_vcpu = newcpu;
+
+	vmexit[newcpu].rip = rip;
+
+	error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, vcpu_thread,
+	    &mt_vmm_info[newcpu]);
+
+	assert(error == 0);
+}
+
+static int
+vcpu_delete(int vcpu)
+{
+	if (!CPU_ISSET(((unsigned) vcpu), &cpumask)) {
+		fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
+		exit(1);
+	}
+
+	CPU_CLR_ATOMIC(((unsigned) vcpu), &cpumask);
+	return (CPU_EMPTY(&cpumask));
+}
+
+static int
+vmexit_handle_notify(UNUSED struct vm_exit *vme, UNUSED int *pvcpu,
+	UNUSED uint32_t eax)
+{
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_inout(struct vm_exit *vme, int *pvcpu)
+{
+	int error;
+	int bytes, port, in, out, string;
+	int vcpu;
+
+	vcpu = *pvcpu;
+
+	port = vme->u.inout.port;
+	bytes = vme->u.inout.bytes;
+	string = vme->u.inout.string;
+	in = vme->u.inout.in;
+	out = !in;
+
+	/* Extra-special case of host notifications */
+	if (out && port == GUEST_NIO_PORT) {
+		error = vmexit_handle_notify(vme, pvcpu, vme->u.inout.eax);
+		return (error);
+	}
+
+	error = emulate_inout(vcpu, vme, strictio);
+	if (error) {
+		fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%llx\n",
+		    in ? "in" : "out",
+		    bytes == 1 ? 'b' : (bytes == 2 ?
'w' : 'l'), + port, vmexit->rip); + return (VMEXIT_ABORT); + } else { + return (VMEXIT_CONTINUE); + } +} + +static int +vmexit_rdmsr(struct vm_exit *vme, int *pvcpu) +{ + uint64_t val; + uint32_t eax, edx; + int error; + + val = 0; + error = emulate_rdmsr(*pvcpu, vme->u.msr.code, &val); + if (error != 0) { + fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", + vme->u.msr.code, *pvcpu); + if (strictmsr) { + vm_inject_gp(*pvcpu); + return (VMEXIT_CONTINUE); + } + } + + eax = (uint32_t) val; + error = xh_vm_set_register(*pvcpu, VM_REG_GUEST_RAX, eax); + assert(error == 0); + + edx = val >> 32; + error = xh_vm_set_register(*pvcpu, VM_REG_GUEST_RDX, edx); + assert(error == 0); + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_wrmsr(struct vm_exit *vme, int *pvcpu) +{ + int error; + + error = emulate_wrmsr(*pvcpu, vme->u.msr.code, vme->u.msr.wval); + if (error != 0) { + fprintf(stderr, "wrmsr to register %#x(%#llx) on vcpu %d\n", + vme->u.msr.code, vme->u.msr.wval, *pvcpu); + if (strictmsr) { + vm_inject_gp(*pvcpu); + return (VMEXIT_CONTINUE); + } + } + return (VMEXIT_CONTINUE); +} + +static int +vmexit_spinup_ap(struct vm_exit *vme, int *pvcpu) +{ + assert(vme->u.spinup_ap.vcpu != 0); + assert(vme->u.spinup_ap.vcpu < guest_ncpus); + + vcpu_add(*pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_vmx(struct vm_exit *vme, int *pvcpu) +{ + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tVMX\n"); + fprintf(stderr, "\trip\t\t0x%016llx\n", vme->rip); + fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); + fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); + fprintf(stderr, "\texit_reason\t%u\n", vme->u.vmx.exit_reason); + fprintf(stderr, "\tqualification\t0x%016llx\n", + vme->u.vmx.exit_qualification); + fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); + fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); + return (VMEXIT_ABORT); +} + +static int +vmexit_bogus(struct vm_exit *vme, UNUSED int *pvcpu) +{ + assert(vme->inst_length == 0); + + stats.vmexit_bogus++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_hlt(UNUSED struct vm_exit *vme, UNUSED int *pvcpu) +{ + stats.vmexit_hlt++; + + /* + * Just continue execution with the next instruction. We use + * the HLT VM exit as a way to be friendly with the host + * scheduler. + */ + return (VMEXIT_CONTINUE); +} + +static int +vmexit_pause(UNUSED struct vm_exit *vme, UNUSED int *pvcpu) +{ + stats.vmexit_pause++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_mtrap(struct vm_exit *vme, UNUSED int *pvcpu) +{ + assert(vme->inst_length == 0); + + stats.vmexit_mtrap++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inst_emul(struct vm_exit *vme, int *pvcpu) +{ + int err, i; + struct vie *vie; + + stats.vmexit_inst_emul++; + + vie = &vme->u.inst_emul.vie; + err = emulate_mem(*pvcpu, vme->u.inst_emul.gpa, vie, + &vme->u.inst_emul.paging); + + if (err) { + if (err == ESRCH) { + fprintf(stderr, "Unhandled memory access to 0x%llx\n", + vme->u.inst_emul.gpa); + } + + fprintf(stderr, "Failed to emulate instruction ["); + for (i = 0; i < vie->num_valid; i++) { + fprintf(stderr, "0x%02x%s", vie->inst[i], + i != (vie->num_valid - 1) ? 
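`vmexit_rdmsr()` above completes a guest RDMSR by splitting the emulated 64-bit result across EDX:EAX, the register pair the instruction architecturally returns in. The split and its inverse:

```c
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t val = 0x123456789abcdef0ULL;
	uint32_t eax = (uint32_t) val;         /* low 32 bits */
	uint32_t edx = (uint32_t) (val >> 32); /* high 32 bits */

	assert(eax == 0x9abcdef0u);
	assert(edx == 0x12345678u);
	assert((((uint64_t) edx << 32) | eax) == val); /* reassembles */
	return (0);
}
```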
" " : ""); + } + fprintf(stderr, "] at 0x%llx\n", vme->rip); + return (VMEXIT_ABORT); + } + + return (VMEXIT_CONTINUE); +} + +static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; + +static int +vmexit_suspend(struct vm_exit *vme, int *pvcpu) +{ + enum vm_suspend_how how; + + how = vme->u.suspended.how; + + vcpu_delete(*pvcpu); + + if (*pvcpu != BSP) { + pthread_mutex_lock(&resetcpu_mtx); + pthread_cond_signal(&resetcpu_cond); + pthread_mutex_unlock(&resetcpu_mtx); + pthread_exit(NULL); + } + + pthread_mutex_lock(&resetcpu_mtx); + while (!CPU_EMPTY(&cpumask)) { + pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); + } + pthread_mutex_unlock(&resetcpu_mtx); + + switch ((int) (how)) { + case VM_SUSPEND_RESET: + go_callback_exit(0); + return 0; + case VM_SUSPEND_POWEROFF: + go_callback_exit(1); + return 1; + case VM_SUSPEND_HALT: + go_callback_exit(2); + return 2; + case VM_SUSPEND_TRIPLEFAULT: + go_callback_exit(3); + return 3; + default: + fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); + go_callback_exit(100); + return 100; + } +} + +static vmexit_handler_t handler[VM_EXITCODE_MAX] = { + [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_INOUT_STR] = vmexit_inout, + [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_RDMSR] = vmexit_rdmsr, + [VM_EXITCODE_WRMSR] = vmexit_wrmsr, + [VM_EXITCODE_MTRAP] = vmexit_mtrap, + [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, + [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, + [VM_EXITCODE_SUSPENDED] = vmexit_suspend, + [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, +}; + +void +vcpu_set_capabilities(int cpu) +{ + int err, tmp; + + if (fbsdrun_vmexit_on_hlt()) { + err = xh_vm_get_capability(cpu, VM_CAP_HALT_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, "VM exit on HLT not supported\n"); + exit(1); + } + xh_vm_set_capability(cpu, VM_CAP_HALT_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_HLT] = vmexit_hlt; + } + + if (fbsdrun_vmexit_on_pause()) { + /* + * pause exit support required for this mode + */ + err = xh_vm_get_capability(cpu, VM_CAP_PAUSE_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, + "SMP mux requested, no pause support\n"); + exit(1); + } + xh_vm_set_capability(cpu, VM_CAP_PAUSE_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_PAUSE] = vmexit_pause; + } + + if (x2apic_mode) + err = xh_vm_set_x2apic_state(cpu, X2APIC_ENABLED); + else + err = xh_vm_set_x2apic_state(cpu, X2APIC_DISABLED); + + if (err) { + fprintf(stderr, "Unable to set x2apic state (%d)\n", err); + exit(1); + } +} + +static void +vcpu_loop(int vcpu, uint64_t startrip) +{ + int error, rc, prevcpu; + enum vm_exitcode exitcode; + cpuset_t active_cpus; + + error = xh_vm_active_cpus(&active_cpus); + assert(CPU_ISSET(((unsigned) vcpu), &active_cpus)); + + error = xh_vm_set_register(vcpu, VM_REG_GUEST_RIP, startrip); + assert(error == 0); + + while (1) { + error = xh_vm_run(vcpu, &vmexit[vcpu]); + if (error != 0) + break; + + prevcpu = vcpu; + + exitcode = vmexit[vcpu].exitcode; + if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { + fprintf(stderr, "vcpu_loop: unexpected exitcode 0x%x\n", + exitcode); + exit(1); + } + + rc = (*handler[exitcode])(&vmexit[vcpu], &vcpu); + + switch (rc) { + case VMEXIT_CONTINUE: + break; + case VMEXIT_ABORT: + abort(); + case VM_SUSPEND_HALT: + case VM_SUSPEND_POWEROFF: + case VM_SUSPEND_RESET: + break; + default: + fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); + exit(1); + } + } +} + +static int 
+num_vcpus_allowed(void) +{ + return (VM_MAXCPU); +} + +static int +expand_number(const char *buf, uint64_t *num) +{ + char *endptr; + uintmax_t umaxval; + uint64_t number; + unsigned shift; + int serrno; + + serrno = errno; + errno = 0; + umaxval = strtoumax(buf, &endptr, 0); + if (umaxval > UINT64_MAX) + errno = ERANGE; + if (errno != 0) + return (-1); + errno = serrno; + number = umaxval; + + switch (tolower((unsigned char)*endptr)) { + case 'e': + shift = 60; + break; + case 'p': + shift = 50; + break; + case 't': + shift = 40; + break; + case 'g': + shift = 30; + break; + case 'm': + shift = 20; + break; + case 'k': + shift = 10; + break; + case 'b': + case '\0': /* No unit. */ + *num = number; + return (0); + default: + /* Unrecognized unit. */ + errno = EINVAL; + return (-1); + } + + if ((number << shift) >> shift != number) { + /* Overflow */ + errno = ERANGE; + return (-1); + } + *num = number << shift; + return (0); +} + +static int +parse_memsize(const char *opt, size_t *ret_memsize) +{ + char *endptr; + size_t optval; + int error; + + optval = strtoul(opt, &endptr, 0); + if (*opt != '\0' && *endptr == '\0') { + /* + * For the sake of backward compatibility if the memory size + * specified on the command line is less than a megabyte then + * it is interpreted as being in units of MB. + */ + if (optval < MB) + optval *= MB; + *ret_memsize = optval; + error = 0; + } else + error = expand_number(opt, ((uint64_t *) ret_memsize)); + + return (error); +} + +static int +firmware_parse(const char *opt) { + char *fw, *opt1, *opt2, *opt3, *cp; + + fw = strdup(opt); + + if (strncmp(fw, "kexec", strlen("kexec")) == 0) { + fw_func = kexec; + } else if (strncmp(fw, "fbsd", strlen("fbsd")) == 0) { + fw_func = fbsd_load; + } else { + goto fail; + } + + if ((cp = strchr(fw, ',')) != NULL) { + *cp = '\0'; + opt1 = cp + 1; + } else { + goto fail; + } + + if ((cp = strchr(opt1, ',')) != NULL) { + *cp = '\0'; + opt2 = cp + 1; + } else { + goto fail; + } + + if ((cp = strchr(opt2, ',')) != NULL) { + *cp = '\0'; + opt3 = cp + 1; + } else { + goto fail; + } + + opt2 = strlen(opt2) ? opt2 : NULL; + opt3 = strlen(opt3) ? 
opt3 : NULL;
+
+	if (fw_func == kexec) {
+		kexec_init(opt1, opt2, opt3);
+	} else if (fw_func == fbsd_load) {
+		/* FIXME: let user set boot-loader serial device */
+		fbsd_init(opt1, opt2, opt3, NULL);
+	} else {
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	fprintf(stderr, "Invalid firmware argument\n"
+		" -f kexec,'kernel','initrd','\"cmdline\"'\n"
+		" -f fbsd,'userboot','boot volume','\"kernel env\"'\n");
+
+	return -1;
+}
+
+int
+run_xhyve(int argc, char* argv[])
+{
+	int c, error, gdb_port, bvmcons, fw;
+	int dump_guest_memory, max_vcpus, mptgen;
+	int rtc_localtime;
+	uint64_t rip;
+	size_t memsize;
+
+	bvmcons = 0;
+	dump_guest_memory = 0;
+	progname = basename(argv[0]);
+	gdb_port = 0;
+	guest_ncpus = 1;
+	memsize = 256 * MB;
+	mptgen = 1;
+	rtc_localtime = 1;
+	fw = 0;
+
+	while ((c = getopt(argc, argv, "behvuwxACHPWY:f:g:c:s:m:l:U:")) != -1) {
+		switch (c) {
+		case 'A':
+			acpi = 1;
+			break;
+		case 'b':
+			bvmcons = 1;
+			break;
+		case 'c':
+			guest_ncpus = atoi(optarg);
+			break;
+		case 'C':
+			dump_guest_memory = 1;
+			break;
+		case 'f':
+			if (firmware_parse(optarg) != 0) {
+				exit (1);
+			} else {
+				fw = 1;
+				break;
+			}
+		case 'g':
+			gdb_port = atoi(optarg);
+			break;
+		case 'l':
+			if (lpc_device_parse(optarg) != 0) {
+				errx(EX_USAGE, "invalid lpc device "
+					"configuration '%s'", optarg);
+			}
+			break;
+		case 's':
+			if (pci_parse_slot(optarg) != 0)
+				exit(1);
+			else
+				break;
+		case 'm':
+			error = parse_memsize(optarg, &memsize);
+			if (error)
+				errx(EX_USAGE, "invalid memsize '%s'", optarg);
+			break;
+		case 'H':
+			guest_vmexit_on_hlt = 1;
+			break;
+		case 'P':
+			guest_vmexit_on_pause = 1;
+			break;
+		case 'e':
+			strictio = 1;
+			break;
+		case 'u':
+			rtc_localtime = 0;
+			break;
+		case 'U':
+			guest_uuid_str = optarg;
+			break;
+		case 'w':
+			strictmsr = 0;
+			break;
+		case 'W':
+			virtio_msix = 0;
+			break;
+		case 'x':
+			x2apic_mode = 1;
+			break;
+		case 'Y':
+			mptgen = 0;
+			break;
+		case 'v':
+			show_version();
+		case 'h':
+			usage(0);
+		default:
+			usage(1);
+		}
+	}
+
+	if (fw != 1)
+		usage(1);
+
+	error = xh_vm_create();
+	if (error) {
+		fprintf(stderr, "Unable to create VM (%d)\n", error);
+		exit(1);
+	}
+
+	if (guest_ncpus < 1) {
+		fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
+		exit(1);
+	}
+
+	max_vcpus = num_vcpus_allowed();
+	if (guest_ncpus > max_vcpus) {
+		fprintf(stderr, "%d vCPUs requested but only %d available\n",
+			guest_ncpus, max_vcpus);
+		exit(1);
+	}
+
+	error = xh_vm_setup_memory(memsize, VM_MMAP_ALL);
+	if (error) {
+		fprintf(stderr, "Unable to setup memory (%d)\n", error);
+		exit(1);
+	}
+
+	error = init_msr();
+	if (error) {
+		fprintf(stderr, "init_msr error %d\n", error);
+		exit(1);
+	}
+
+	init_mem();
+	init_inout();
+	pci_irq_init();
+	ioapic_init();
+
+	rtc_init(rtc_localtime);
+	sci_init();
+
+	/*
+	 * Exit if a device emulation finds an error in its initialization
+	 */
+	if (init_pci() != 0)
+		exit(1);
+
+	if (gdb_port != 0)
+		init_dbgport(gdb_port);
+
+	if (bvmcons)
+		init_bvmcons();
+
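`parse_memsize()` above keeps backward compatibility for `-m`: a bare number smaller than one MiB is taken to be megabytes, anything else is bytes, and suffixed values go through the `expand_number()` shift table (k/m/g/t/p/e map to shifts of 10/20/30/40/50/60). A condensed model of the resulting behavior (`memsize_of` is a hypothetical helper, not the real parser):

```c
#include <assert.h>
#include <stddef.h>

#define MB (1024UL * 1024)

/* plain == unsuffixed number; shift == expand_number() suffix shift */
static size_t
memsize_of(unsigned long plain, unsigned shift)
{
	if (shift != 0)
		return ((size_t) plain << shift); /* e.g. 'g' => shift 30 */
	return ((size_t) (plain < MB ? plain * MB : plain));
}

int
main(void)
{
	assert(memsize_of(1024, 0) == 1024 * MB);    /* "-m 1024" is 1 GiB */
	assert(memsize_of(2, 30) == 2048 * MB);      /* "-m 2g" */
	assert(memsize_of(512 * MB, 0) == 512 * MB); /* large plain = bytes */
	return (0);
}
```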
+	/*
+	 * build the guest tables, MP etc.
+	 */
+	if (mptgen) {
+		error = mptable_build(guest_ncpus);
+		if (error)
+			exit(1);
+	}
+
+	error = smbios_build();
+	assert(error == 0);
+
+	if (acpi) {
+		error = acpi_build(guest_ncpus);
+		assert(error == 0);
+	}
+
+	rip = 0;
+
+	vcpu_add(BSP, BSP, rip);
+
+	/*
+	 * Head off to the main event dispatch loop
+	 */
+	error = mevent_dispatch();
+
+	return error;
+}
diff --git a/vendor/github.com/hooklift/xhyve/xhyve.go b/vendor/github.com/hooklift/xhyve/xhyve.go
new file mode 100644
index 0000000..3349e34
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/xhyve.go
@@ -0,0 +1,110 @@
+// +build darwin
+
+package xhyve
+
+// #cgo CFLAGS: -I${SRCDIR}/include -x c -std=c11 -fno-common -arch x86_64 -DXHYVE_CONFIG_ASSERT -DVERSION=v0.2.0 -Os -fstrict-aliasing -Wno-unknown-warning-option -Wno-reserved-id-macro -pedantic -fmessage-length=152 -fdiagnostics-show-note-include-stack -fmacro-backtrace-limit=0
+// #cgo LDFLAGS: -L${SRCDIR} -arch x86_64 -framework Hypervisor -framework vmnet
+// #include
+// #include
+// #include
+import "C"
+import (
+	"fmt"
+	"os"
+	"runtime"
+	"syscall"
+	"unsafe"
+)
+
+var termios syscall.Termios
+
+// getTermios gets the current settings for the terminal.
+func getTermios() syscall.Termios {
+	var state syscall.Termios
+	if _, _, err := syscall.Syscall6(syscall.SYS_IOCTL, uintptr(0), uintptr(syscall.TIOCGETA), uintptr(unsafe.Pointer(&state)), 0, 0, 0); err != 0 {
+		fmt.Fprintln(os.Stderr, err)
+	}
+	return state
+}
+
+// setTermios restores terminal settings.
+func setTermios(state syscall.Termios) {
+	syscall.Syscall6(syscall.SYS_IOCTL, uintptr(0), uintptr(syscall.TIOCSETA), uintptr(unsafe.Pointer(&state)), 0, 0, 0)
+}
+
+// go_callback_exit gets invoked from within xhyve.c whenever a trap
+// suspending the VM is triggered. This is so we can clean up resources
+// in Go land, restore terminal settings and allow the goroutine to be scheduled
+// on multiple OS threads again by Go's scheduler.
+//export go_callback_exit
+func go_callback_exit(status C.int) {
+	// Restores stty settings to the values that existed before running xhyve.
+	setTermios(termios)
+
+	fmt.Printf("Exiting with status code %d\n", status)
+	fmt.Printf("Releasing allocated memory from Go land... ")
+	for _, arg := range argv {
+		C.free(unsafe.Pointer(arg))
+	}
+	fmt.Println("done")
+
+	// Turns exit flag On for mevent busy loop so that the next time kevent
+	// receives an event, mevent handles it and exits the loop.
+	C.exit_mevent_dispatch_loop = true
+
+	// Forces kevent() to exit by using the self-pipe trick.
+	C.mevent_exit()
+
+	// Allows Go's scheduler to move the goroutine to a different OS thread.
+	runtime.UnlockOSThread()
+}
+
+// go_set_pty_name is called by xhyve whenever a master/slave pseudo-terminal is set up in
+// COM1 or COM2.
+//export go_set_pty_name
+func go_set_pty_name(name *C.char) {
+	if newPty == nil {
+		return
+	}
+	newPty <- C.GoString(name)
+}
+
+func init() {
+	// Saves stty settings
+	termios = getTermios()
+
+	// We need to stick the goroutine to its current OS thread so Go's scheduler
+	// does not move it to a different thread while xhyve is running. By doing this
+	// we make sure that once go_callback_exit is invoked from C land, it in turn
+	// invokes C functions from the same OS thread and thread context.
+	runtime.LockOSThread()
+}
+
+// newPty is a channel used to send the path names of the pseudo-terminal
+// devices created when an LPC device is added with the
+// option: autopty.
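The `getTermios`/`setTermios` pair above snapshots the terminal state before xhyve runs and restores it from `go_callback_exit`, issuing the macOS `TIOCGETA`/`TIOCSETA` ioctls directly through `syscall.Syscall6`. A sketch of the same save/restore bracket in C (the real restore happens in the Go exit callback, not inline like this):

```c
#include <sys/ioctl.h>
#include <termios.h>
#include <unistd.h>

int
main(void)
{
	struct termios saved;

	/* getTermios(): snapshot the current tty state */
	if (ioctl(STDIN_FILENO, TIOCGETA, &saved) == -1)
		return (1);

	/* ... run code that may put the terminal into raw mode ... */

	/* setTermios(): put the snapshot back on exit */
	if (ioctl(STDIN_FILENO, TIOCSETA, &saved) == -1)
		return (1);
	return (0);
}
```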
diff --git a/vendor/github.com/hooklift/xhyve/xhyve.mk b/vendor/github.com/hooklift/xhyve/xhyve.mk
new file mode 100644
index 0000000..28b8dc0
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/xhyve.mk
@@ -0,0 +1,65 @@
+XHYVE_VERSION := $(shell cd vendor/xhyve; git describe --abbrev=6 --dirty --always --tags)
+
+VMM_SRC := \
+ vendor/xhyve/src/vmm/x86.c \
+ vendor/xhyve/src/vmm/vmm.c \
+ vendor/xhyve/src/vmm/vmm_host.c \
+ vendor/xhyve/src/vmm/vmm_mem.c \
+ vendor/xhyve/src/vmm/vmm_lapic.c \
+ vendor/xhyve/src/vmm/vmm_instruction_emul.c \
+ vendor/xhyve/src/vmm/vmm_ioport.c \
+ vendor/xhyve/src/vmm/vmm_callout.c \
+ vendor/xhyve/src/vmm/vmm_stat.c \
+ vendor/xhyve/src/vmm/vmm_util.c \
+ vendor/xhyve/src/vmm/vmm_api.c \
+ vendor/xhyve/src/vmm/intel/vmx.c \
+ vendor/xhyve/src/vmm/intel/vmx_msr.c \
+ vendor/xhyve/src/vmm/intel/vmcs.c \
+ vendor/xhyve/src/vmm/io/vatpic.c \
+ vendor/xhyve/src/vmm/io/vatpit.c \
+ vendor/xhyve/src/vmm/io/vhpet.c \
+ vendor/xhyve/src/vmm/io/vioapic.c \
+ vendor/xhyve/src/vmm/io/vlapic.c \
+ vendor/xhyve/src/vmm/io/vpmtmr.c \
+ vendor/xhyve/src/vmm/io/vrtc.c
+
+XHYVE_SRC := \
+ vendor/xhyve/src/acpitbl.c \
+ vendor/xhyve/src/atkbdc.c \
+ vendor/xhyve/src/block_if.c \
+ vendor/xhyve/src/consport.c \
+ vendor/xhyve/src/dbgport.c \
+ vendor/xhyve/src/inout.c \
+ vendor/xhyve/src/ioapic.c \
+ vendor/xhyve/src/md5c.c \
+ vendor/xhyve/src/mem.c \
+ vendor/xhyve/src/mevent.c \
+ vendor/xhyve/src/mptbl.c \
+ vendor/xhyve/src/pci_ahci.c \
+ vendor/xhyve/src/pci_emul.c \
+ vendor/xhyve/src/pci_hostbridge.c \
+ vendor/xhyve/src/pci_irq.c \
+ vendor/xhyve/src/pci_lpc.c \
+ vendor/xhyve/src/pci_uart.c \
+ vendor/xhyve/src/pci_virtio_block.c \
+ vendor/xhyve/src/pci_virtio_net_tap.c \
+ vendor/xhyve/src/pci_virtio_net_vmnet.c \
+ vendor/xhyve/src/pci_virtio_rnd.c \
+ vendor/xhyve/src/pm.c \
+ vendor/xhyve/src/post.c \
+ vendor/xhyve/src/rtc.c \
+ vendor/xhyve/src/smbiostbl.c \
+ vendor/xhyve/src/task_switch.c \
+ vendor/xhyve/src/uart_emul.c \
+ vendor/xhyve/src/xhyve.c \
+ vendor/xhyve/src/virtio.c \
+ vendor/xhyve/src/xmsr.c
+
+FIRMWARE_SRC := \
+ vendor/xhyve/src/firmware/kexec.c \
+ vendor/xhyve/src/firmware/fbsd.c
+
+SRC := \
+ $(VMM_SRC) \
+ $(XHYVE_SRC) \
+ $(FIRMWARE_SRC)
diff --git a/vendor/github.com/hooklift/xhyve/xhyve.patch b/vendor/github.com/hooklift/xhyve/xhyve.patch
new file mode 100644
index 0000000..ce84275
--- /dev/null
+++ b/vendor/github.com/hooklift/xhyve/xhyve.patch
@@ -0,0 +1,248 @@
+diff --git a/include/xhyve/mevent.h b/include/xhyve/mevent.h
+index 48866bd..d09ed2e 100644
+--- a/include/xhyve/mevent.h
++++ b/include/xhyve/mevent.h
+@@ -43,5 +43,5 @@ int mevent_enable(struct mevent *evp);
+ int mevent_disable(struct mevent *evp);
+ int mevent_delete(struct mevent *evp);
+ int
mevent_delete_close(struct mevent *evp); +- +-void mevent_dispatch(void); ++int mevent_dispatch(void); ++void mevent_exit(void); +diff --git a/include/xhyve/xhyve.h b/include/xhyve/xhyve.h +index 4f85664..f9ae309 100644 +--- a/include/xhyve/xhyve.h ++++ b/include/xhyve/xhyve.h +@@ -29,6 +29,7 @@ + #pragma once + + #include ++#include + #include + + #ifndef CTASSERT /* Allow lint to override */ +@@ -43,6 +44,7 @@ + extern int guest_ncpus; + extern char *guest_uuid_str; + extern char *vmname; ++extern bool exit_mevent_dispatch_loop; + + void xh_vm_inject_fault(int vcpu, int vector, int errcode_valid, + uint32_t errcode); +@@ -78,3 +80,5 @@ void vcpu_add(int fromcpu, int newcpu, uint64_t rip); + int fbsdrun_vmexit_on_hlt(void); + int fbsdrun_vmexit_on_pause(void); + int fbsdrun_virtio_msix(void); ++int run_xhyve(int argc, char *argv[]); ++extern void go_callback_exit(int status); +diff --git a/src/mevent.c b/src/mevent.c +index 34a7d0e..977abcb 100644 +--- a/src/mevent.c ++++ b/src/mevent.c +@@ -27,7 +27,7 @@ + */ + + /* +- * Micro event library for FreeBSD, designed for a single i/o thread ++ * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. + */ + +@@ -35,6 +35,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -52,6 +53,7 @@ + #define MEV_DEL_PENDING 4 + + extern char *vmname; ++extern bool exit_mevent_dispatch_loop; + + static pthread_t mevent_tid; + static int mevent_timid = 43; +@@ -103,11 +105,11 @@ mevent_pipe_read(int fd, UNUSED enum ev_type type, UNUSED void *param) + } while (status == MEVENT_MAX); + } + +-static void ++void + mevent_notify(void) + { + char c; +- ++ + /* + * If calling from outside the i/o thread, write a byte on the + * pipe to force the i/o thread to exit the blocking kevent call. +@@ -117,6 +119,12 @@ mevent_notify(void) + } + } + ++void mevent_exit(void) ++{ ++ char c; ++ write(mevent_pipefd[1], &c, 1); ++} ++ + static int + mevent_kq_filter(struct mevent *mevp) + { +@@ -309,7 +317,7 @@ mevent_update(struct mevent *evp, int newstate) + */ + if (evp->me_state == newstate) + return (0); +- ++ + mevent_qlock(); + + evp->me_state = newstate; +@@ -387,8 +395,7 @@ mevent_set_name(void) + { + } + +-__attribute__ ((noreturn)) void +-mevent_dispatch(void) ++int mevent_dispatch(void) + { + struct kevent changelist[MEVENT_MAX]; + struct kevent eventlist[MEVENT_MAX]; +@@ -411,7 +418,8 @@ mevent_dispatch(void) + ret = pipe(mevent_pipefd); + if (ret < 0) { + perror("pipe"); +- exit(0); ++ return EPIPE; ++ //exit(0); + } + + /* +@@ -421,6 +429,9 @@ mevent_dispatch(void) + assert(pipev != NULL); + + for (;;) { ++ if (exit_mevent_dispatch_loop) { ++ break; ++ } + /* + * Build changelist if required. 
+ * XXX the changelist can be put into the blocking call +@@ -442,10 +453,11 @@ mevent_dispatch(void) + if (ret == -1 && errno != EINTR) { + perror("Error return from kevent monitor"); + } +- ++ + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); +- } ++ } ++ return 0; + } +diff --git a/src/xhyve.c b/src/xhyve.c +index 97fd41f..1ea4006 100644 +--- a/src/xhyve.c ++++ b/src/xhyve.c +@@ -76,6 +76,7 @@ typedef int (*vmexit_handler_t)(struct vm_exit *, int *vcpu); + extern int vmexit_task_switch(struct vm_exit *, int *vcpu); + + char *vmname = "vm"; ++bool exit_mevent_dispatch_loop = FALSE; + + int guest_ncpus; + char *guest_uuid_str; +@@ -154,7 +155,7 @@ usage(int code) + __attribute__ ((noreturn)) static void + show_version() + { +- fprintf(stderr, "%s: %s\n\n%s\n",progname, VERSION, ++ fprintf(stderr, "%s: %s\n\n%s\n",progname, "VERSION", + "xhyve is a port of FreeBSD's bhyve hypervisor to OS X that\n" + "works entirely in userspace and has no other dependencies.\n\n" + "Homepage: https://github.com/mist64/xhyve\n" +@@ -263,8 +264,6 @@ vcpu_thread(void *param) + + vcpu_loop(vcpu, vmexit[vcpu].rip); + +- /* not reached */ +- exit(1); + return (NULL); + } + +@@ -519,16 +518,21 @@ vmexit_suspend(struct vm_exit *vme, int *pvcpu) + + switch ((int) (how)) { + case VM_SUSPEND_RESET: +- exit(0); ++ go_callback_exit(0); ++ return 0; + case VM_SUSPEND_POWEROFF: +- exit(1); ++ go_callback_exit(1); ++ return 1; + case VM_SUSPEND_HALT: +- exit(2); ++ go_callback_exit(2); ++ return 2; + case VM_SUSPEND_TRIPLEFAULT: +- exit(3); ++ go_callback_exit(3); ++ return 3; + default: + fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); +- exit(100); ++ go_callback_exit(100); ++ return 100; + } + } + +@@ -615,18 +619,22 @@ vcpu_loop(int vcpu, uint64_t startrip) + exit(1); + } + +- rc = (*handler[exitcode])(&vmexit[vcpu], &vcpu); ++ rc = (*handler[exitcode])(&vmexit[vcpu], &vcpu); + + switch (rc) { + case VMEXIT_CONTINUE: + break; + case VMEXIT_ABORT: + abort(); ++ case VM_SUSPEND_HALT: ++ case VM_SUSPEND_POWEROFF: ++ case VM_SUSPEND_RESET: ++ break; + default: ++ fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); + exit(1); + } + } +- fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); + } + + static int +@@ -774,7 +782,7 @@ fail: + } + + int +-main(int argc, char *argv[]) ++run_xhyve(int argc, char* argv[]) + { + int c, error, gdb_port, bvmcons, fw; + int dump_guest_memory, max_vcpus, mptgen; +@@ -945,7 +953,7 @@ main(int argc, char *argv[]) + /* + * Head off to the main event dispatch loop + */ +- mevent_dispatch(); ++ error = mevent_dispatch(); + +- exit(1); ++ return error; + } diff --git a/vendor/github.com/hooklift/xhyve/xmsr.c b/vendor/github.com/hooklift/xhyve/xmsr.c new file mode 100644 index 0000000..79424e1 --- /dev/null +++ b/vendor/github.com/hooklift/xhyve/xmsr.c @@ -0,0 +1,104 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2015 xhyve developers + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +emulate_wrmsr(UNUSED int vcpu, uint32_t num, UNUSED uint64_t val) +{ + switch (num) { + case 0xd04: /* Sandy Bridge uncore PMCs */ + case 0xc24: + return (0); + case MSR_BIOS_UPDT_TRIG: + return (0); + case MSR_BIOS_SIGN: + return (0); + default: + break; + } + + return (-1); +} + +int +emulate_rdmsr(UNUSED int vcpu, uint32_t num, uint64_t *val) +{ + int error = 0; + + switch (num) { + case MSR_BIOS_SIGN: + case MSR_IA32_PLATFORM_ID: + case MSR_PKG_ENERGY_STATUS: + case MSR_PP0_ENERGY_STATUS: + case MSR_PP1_ENERGY_STATUS: + case MSR_DRAM_ENERGY_STATUS: + *val = 0; + break; + case MSR_RAPL_POWER_UNIT: + /* + * Use the default value documented in section + * "RAPL Interfaces" in Intel SDM vol3. + */ + *val = 0x000a1003; + break; + default: + error = -1; + break; + } + + return (error); +} + +int +init_msr(void) +{ + u_int regs[4]; + u_int cpu_vendor[4]; + + do_cpuid(0, regs); + cpu_vendor[0] = regs[1]; + cpu_vendor[1] = regs[3]; + cpu_vendor[2] = regs[2]; + cpu_vendor[3] = 0; + + if (strcmp(((char *) cpu_vendor), "GenuineIntel") == 0) { + return 0; + } else { + fprintf(stderr, "Unknown cpu vendor \"%s\"\n", ((char *) cpu_vendor)); + return (-1); + } +} diff --git a/xhyve.go b/xhyve.go index dc96101..15445cf 100644 --- a/xhyve.go +++ b/xhyve.go @@ -318,17 +318,23 @@ func (d *Driver) Start() error { img := d.ResolveStorePath(d.MachineName + ".dmg") bootcmd := d.BootCmd - cmd := exec.Command("goxhyve", - fmt.Sprintf("%s", uuid), - fmt.Sprintf("%d", d.CPU), - fmt.Sprintf("%d", d.Memory), - fmt.Sprintf("%s", iso), - fmt.Sprintf("%s", img), - fmt.Sprintf("kexec,%s,%s,%s", vmlinuz, initrd, bootcmd), - "-d", //TODO fix daemonize flag - ) - log.Debug(cmd) - + args := []string{ + "xhyve", + "-A", + "-U", fmt.Sprintf("%s", uuid), + "-c", fmt.Sprintf("%d", d.CPU), + "-m", fmt.Sprintf("%dM", d.Memory), + "-l", "com1", + "-s", "0:0,hostbridge", + "-s", "31,lpc", + "-s", "2:0,virtio-net", + "-s", fmt.Sprintf("3,ahci-cd,%s", iso), + "-s", fmt.Sprintf("4,virtio-blk,%s", img), + "-f", fmt.Sprintf("kexec,%s,%s,%s", vmlinuz, initrd, bootcmd)} + + log.Debug(args) + + cmd := exec.Command(os.Args[0], args...) cmd.Stdout = &bytes.Buffer{} cmd.Stderr = &bytes.Buffer{}
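With the external goxhyve binary gone, `Start()` now re-executes the driver's own executable (`os.Args[0]`) with the literal `"xhyve"` as the first argument, so a single binary serves as both the docker-machine plugin and the embedded hypervisor. A stripped-down sketch of that launch path; the `launchVM` helper and its flag list are hypothetical, the real values come from the driver's config:

```go
package xhyvedriver

import (
	"bytes"
	"fmt"
	"os"
	"os/exec"
)

// launchVM is a hypothetical helper mirroring the Start() change above:
// it re-executes the current binary with the "xhyve" subcommand so the
// child process runs the embedded hypervisor instead of the plugin.
func launchVM(args []string) (*exec.Cmd, error) {
	cmd := exec.Command(os.Args[0], append([]string{"xhyve"}, args...)...)
	cmd.Stdout = &bytes.Buffer{} // capture hypervisor output for debugging
	cmd.Stderr = &bytes.Buffer{}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("starting embedded xhyve: %v", err)
	}
	return cmd, nil
}
```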